UTF8转UCS——被微软折磨的日子( 二 )


/**---------------------------------------------------* ucs2: UTF8* utf321 Bytes 0xxxxxxx * utf322 Bytes 110xxxxx 10xxxxxx * utf323 Bytes 1110xxxx 10xxxxxx 10xxxxxx * utf324 Bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */static intutf8toutf32(const unsigned char **pp, uint32_t *out){const unsigned char *p = *pp;unsigned c = *p;if (c & 0x80)// c & 0b10000000,返回值非0表明是超过1个字节的编码{if ((c & 0xE0) == 0xC0)// 2字节编码场景,(c & 0b11100000)==0b11000000,符合 110xxxxx {const unsigned c2 = *++p;// 判断下一个字节,符合10xxxxxx if ((c2 & 0xC0) == 0x80)//(c2 & 0b11000000)==0b10000000 {*out =((c& 0x1F) << 6) | (c2 & 0x3F);} else // 不符合110xxxxx 10xxxxxx {return WIND_ERR_INVALID_UTF8;}} else if ((c & 0xF0) == 0xE0)//3字节场景 (c & 0b11110000)==0b11100000,满足1110xxxx{const unsigned c2 = *++p;if ((c2 & 0xC0) == 0x80)//判断下一字节 (c2 & 0b11000000)==0b10000000,满足10xxxxxx{const unsigned c3 = *++p;if ((c3 & 0xC0) == 0x80)//判断下一字节 (c3 & 0b11000000)==0b10000000,满足10xxxxxx{*out =((c& 0x0F) << 12)| ((c2 & 0x3F) << 6)|(c3 & 0x3F);} else {return WIND_ERR_INVALID_UTF8;}} else {return WIND_ERR_INVALID_UTF8;}} else if ((c & 0xF8) == 0xF0)// (c & 0b11111000)==0b111100004字节场景{const unsigned c2 = *++p;if ((c2 & 0xC0) == 0x80)// 3字节, (c2 & 0b11000000)==0b10000000,满足10xxxxxx{const unsigned c3 = *++p;if ((c3 & 0xC0) == 0x80)// 2字节, (c2 & 0b11000000)==0b10000000,满足10xxxxxx{const unsigned c4 = *++p;if ((c4 & 0xC0) == 0x80)// 1字节, (c2 & 0b11000000)==0b10000000,满足10xxxxxx{*out =((c& 0x07) << 18)| ((c2 & 0x3F) << 12)| ((c3 & 0x3F) <<6)|(c4 & 0x3F);} else {return WIND_ERR_INVALID_UTF8;}} else {return WIND_ERR_INVALID_UTF8;}} else {return WIND_ERR_INVALID_UTF8;}} else {return WIND_ERR_INVALID_UTF8;}} else {*out = c;//单个字节场景}*pp = p;return 0;}
参考资料
《请问UTF-8与UCS-2之间有何区别与联系》
《搞懂编码 GBK 和 UTF8》
《软件中的字符串编码, UCS2, UTF8哪个更优》
《UTF8和UCS2》