代码
#ifndef UNICONVERT_H
#define UNICONVERT_H

#include <string>

// Kept at file scope because existing callers use the unqualified name `string`.
using std::string;

/// Conversions between one Unicode code point and its UTF-8 byte sequence.
///
/// Byte layout used in both directions:
///   U+0000  .. U+007F   -> 0xxxxxxx
///   U+0080  .. U+07FF   -> 110xxxxx 10xxxxxx
///   U+0800  .. U+FFFF   -> 1110xxxx 10xxxxxx 10xxxxxx
///   U+10000 .. U+10FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
///
/// Every function is `inline` so the header can be included from several
/// translation units without multiple-definition link errors.
namespace unicodeCvt
{
typedef unsigned int uint;

/// Appends `num + 1` UTF-8 continuation bytes (10xxxxxx) to `str`.
///
/// @param unic code point whose low bits are being emitted
/// @param num  index of the highest 6-bit group to emit; groups are written
///             from group `num` down to group 0 (most significant first)
/// @param str  string that receives the bytes
inline void func(uint unic, int num, string &str)
{
    for (int group = num; group >= 0; --group) {
        // Six payload bits per continuation byte, tagged with the 10xxxxxx marker.
        const uint payload = (unic >> (6 * group)) & 0x3F;
        str.append(1, static_cast<char>(0x80 | payload));
    }
}

/// Encodes a single Unicode code point as UTF-8.
///
/// The result is returned by value, so each call yields an independent string
/// (results of earlier calls are not overwritten by later ones).
///
/// @param unic code point in the range 0 .. 0x10FFFF (inclusive)
/// @return the 1- to 4-byte UTF-8 sequence, or an empty string when `unic`
///         is greater than 0x10FFFF. Surrogate values (0xD800 .. 0xDFFF) are
///         not rejected; they are encoded as ordinary three-byte sequences.
inline string unicode2Utf8(uint unic)
{
    string utf8str;
    if (unic < 0x80) {
        utf8str.append(1, static_cast<char>(unic));
    } else if (unic < 0x0800) {
        utf8str.append(1, static_cast<char>(0xC0 | ((unic >> 6) & 0x1F)));
        func(unic, 0, utf8str);
    } else if (unic < 0x010000) {
        utf8str.append(1, static_cast<char>(0xE0 | ((unic >> 12) & 0x0F)));
        func(unic, 1, utf8str);
    } else if (unic <= 0x10FFFF) {
        // Upper bound is inclusive: U+10FFFF is the last valid code point.
        utf8str.append(1, static_cast<char>(0xF0 | ((unic >> 18) & 0x07)));
        func(unic, 2, utf8str);
    }
    return utf8str;
}

/// Decodes a string holding exactly one UTF-8 encoded character.
///
/// The decoding is done with shifts on individual bytes, so the result does
/// not depend on the byte order of the host.
///
/// @param str the 1- to 4-byte UTF-8 sequence of a single character
/// @return the decoded code point, or 0 when `str` is empty, longer than four
///         bytes, starts with a lead byte that does not match its length, or
///         contains a byte after the first that is not of the form 10xxxxxx.
///         Overlong forms, surrogates and values above 0x10FFFF are not
///         rejected. Note that 0 is also the legitimate result for U+0000.
inline uint utf82Unicode(const string &str)
{
    const string::size_type len = str.size();
    if (len == 0 || len > 4)
        return 0;

    // Indexed by (len - 1): the marker bits of the lead byte and their value.
    static const unsigned char kLeadMask[4] = {0x80, 0xE0, 0xF0, 0xF8};
    static const unsigned char kLeadMark[4] = {0x00, 0xC0, 0xE0, 0xF0};

    const unsigned char lead = static_cast<unsigned char>(str[0]);
    const unsigned char mask = kLeadMask[len - 1];
    if ((lead & mask) != kLeadMark[len - 1])
        return 0;

    // Payload bits of the lead byte are the ones outside the marker mask.
    uint unicode = static_cast<uint>(lead & (0xFF ^ mask));
    for (string::size_type i = 1; i < len; ++i) {
        const unsigned char byte = static_cast<unsigned char>(str[i]);
        if ((byte & 0xC0) != 0x80)
            return 0;
        unicode = (unicode << 6) | static_cast<uint>(byte & 0x3F);
    }
    return unicode;
}
} // namespace unicodeCvt

#endif // UNICONVERT_H
调用如下:
#define UNIC2UTF8
void preformanceTest()
{
#ifdef ABC//右值能提升很多倍速度for (uint i = 0; i < 0x10FFFF; i++){string &&str = unicode2Utf8(i);}
#elsefor (uint i = 0; i < 0x10FFFF; i++){string &&str = unicode2Utf8(i);uint unic = utf82Unicode(str);}
#endif // UNIC2UTF8}
说明
- `unicode2Utf8` 函数将 Unicode 值转为对应的 UTF-8 编码的字符串。
- `utf82Unicode` 函数将 UTF-8 编码的字符串转为 Unicode 值。
- 两个函数性能都经过测试验证,目前是我能够优化的极限。
- 两者的转换原理则依据下表(详情参考字符编码):
| Unicode 码位范围 | UTF-8 编码二进制 | 内存空间 |
|---|---|---|
| 0x00-0x7F | 0xxxxxxx | 一字节 |
| 0x80-0x07FF | 110xxxxx 10xxxxxx | 两字节 |
| 0x0800-0xFFFF | 1110xxxx 10xxxxxx 10xxxxxx | 三字节 |
| 0x010000-0x10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 四字节 |