Unicode与UTF-8编码转换(一)

Unicode是一个符号集合,规定了符号的二进制代码,而UTF-8是Unicode的一种实现,具体Unicode和UTF-8的联系如下所示:

序号 | Unicode符号范围(十六进制) | UTF-8编码规则(二进制)
1 | 00000000-0000007F | 0xxxxxxx
2 | 00000080-000007FF | 110xxxxx 10xxxxxx
3 | 00000800-0000FFFF | 1110xxxx 10xxxxxx 10xxxxxx
4 | 00010000-0010FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5 | 00200000-03FFFFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6 | 04000000-7FFFFFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

注:第5、6行是早期UTF-8定义(RFC 2279)中的扩展形式;现行标准(RFC 3629)只使用前4行,码点上限为0x10FFFF。

由上表可以清晰地看出Unicode和UTF-8之间的联系。其中UTF-8编码规则中的x就是需要填入Unicode码二进制位的地方(从低位到高位依次填入,不足的高位补0)。接下来逐一举例说明各段的编码:

1,范围0x00-0x7F:给定的用例Unicode码为0x41,对应的二进制位:0100 0001,而UTF-8编码规则为:0xxxxxxx。故有:

0xxx xxxx

+ 100 0001


0100 0001

所以Unicode编码0x41转换为UTF-8后为:0x41。

所以对于区间段0x00-0x7F之间的Unicode和UTF-8编码是一致的。即与ASCII码一致(ASCII共规定了128个字符的编码)

2,范围0x80-0x7FF:给定的用例Unicode码为0x123,对应的二进制为:0001 0010 0011,而UTF-8编码规则为:110xxxxx 10xxxxxx。故有:

110x xxxx 10xx xxxx

+ 0 0100 10 0011


1100 0100 1010 0011

所以Unicode编码0x123转换为UTF-8后为:0xC4A3

3,范围0x800-0xFFFF:给定的用例Unicode码为0x4E25,对应的二进制为:0100 1110 0010 0101,而UTF-8编码规则为:1110xxxx 10xxxxxx 10xxxxxx,故有:

1110 xxxx 10xx xxxx 10xx xxxx

+ 0100 11 1000 10 0101


1110 0100 1011 1000 1010 0101

所以Unicode编码为0x4E25转换为UTF-8后为:0xE4B8A5

4,范围0x10000-0x10FFFF:给定的Unicode码为0x23456,对应的二进制为:0010 0011 0100 0101 0110,而UTF-8编码规则为:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx。故有:

1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx

+ 000 10 0011 01 0001 01 0110


1111 0000 1010 0011 1001 0001 1001 0110

所以Unicode编码为0x23456转换UTF-8后为:0xF0A39196

5,范围0x200000-0x3FFFFFF:给定的Unicode码为0x234567,对应的二进制为:0010 0011 0100 0101 0110 0111,UTF-8编码规则为:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。故有:

1111 10xx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx

+ 00 00 1000 11 0100 01 0101 10 0111


1111 1000 1000 1000 1011 0100 1001 0101 1010 0111

所以Unicode编码为0x234567转换UTF-8后为:0xF888B495A7

6,范围0x4000000-0x7FFFFFFF:给定的Unicode码为0x34561234,对应的二进制为:0011 0100 0101 0110 0001 0010 0011 0100,UTF-8编码规则为:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。故有:

1111 110x 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx

+ 0 11 0100 01 0101 10 0001 00 1000 11 0100


1111 1100 1011 0100 1001 0101 1010 0001 1000 1000 1011 0100

所以Unicode编码为0x34561234转换UTF-8后为:0xFCB495A188B4


1,通过以上案例分析可得如下单字符Unicode编码转UTF-8程序为:

1)输出的字节按大头方式(Big endian)排列,即先写入、先打印的是高位字节。由于各字节是通过移位和掩码运算得到的,结果与主机自身的字节序无关。

2)实现思路:通过移位使待处理的位处于易于操作的位置,或使运算结果到达指定的位置;使用与运算取得指定位上的值,使用或运算实现相加(拼接)的效果。

/**
 * Encode a single code point as UTF-8.
 *
 * Supports the original 1- to 6-byte UTF-8 scheme, i.e. every value in
 * 0x0 .. 0x7FFFFFFF.  The 4-byte form covers 0x10000 .. 0x1FFFFF (21 payload
 * bits), so there is no unencodable gap between the 4-byte and 5-byte forms.
 *
 * @param unicode  Code point to encode.
 * @param utf      Output buffer; must not be NULL and must have room for the
 *                 encoded bytes plus a trailing '\0' (at most 7 bytes).
 * @return Number of encoded bytes written (1..6, terminator not counted), or
 *         -1 after printing an error message when the value is above
 *         0x7FFFFFFF.  Nothing is written to utf in the error case.
 */
int unicode_to_utf(unsigned long unicode, unsigned char *utf)
{
    assert(utf);

    int size = 0;

    /*
     * The branches are tested in ascending order, so each one only needs an
     * upper bound.  Lead byte = length marker | top payload bits; every
     * following byte = 0x80 | next 6 payload bits.
     */
    if (unicode <= 0x7F) {
        utf[size++] = (unsigned char)(unicode & 0x7F);
    } else if (unicode <= 0x7FF) {
        utf[size++] = (unsigned char)(((unicode >> 6) & 0x1F) | 0xC0);
        utf[size++] = (unsigned char)((unicode & 0x3F) | 0x80);
    } else if (unicode <= 0xFFFF) {
        utf[size++] = (unsigned char)(((unicode >> 12) & 0x0F) | 0xE0);
        utf[size++] = (unsigned char)(((unicode >> 6) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)((unicode & 0x3F) | 0x80);
    } else if (unicode <= 0x1FFFFF) {
        /* Upper bound is 0x1FFFFF, not 0x10FFFF: 3 + 6 + 6 + 6 payload bits. */
        utf[size++] = (unsigned char)(((unicode >> 18) & 0x07) | 0xF0);
        utf[size++] = (unsigned char)(((unicode >> 12) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)(((unicode >> 6) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)((unicode & 0x3F) | 0x80);
    } else if (unicode <= 0x3FFFFFF) {
        utf[size++] = (unsigned char)(((unicode >> 24) & 0x03) | 0xF8);
        utf[size++] = (unsigned char)(((unicode >> 18) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)(((unicode >> 12) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)(((unicode >> 6) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)((unicode & 0x3F) | 0x80);
    } else if (unicode <= 0x7FFFFFFF) {
        utf[size++] = (unsigned char)(((unicode >> 30) & 0x01) | 0xFC);
        utf[size++] = (unsigned char)(((unicode >> 24) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)(((unicode >> 18) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)(((unicode >> 12) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)(((unicode >> 6) & 0x3F) | 0x80);
        utf[size++] = (unsigned char)((unicode & 0x3F) | 0x80);
    } else {
        printf("Error:unknowscope\n");
        return -1;
    }

    utf[size] = '\0';
    return size;
}

测试用例如下:

/*
 * Demo driver for unicode_to_utf(): encodes one sample code point per UTF-8
 * length class, plus one out-of-range value, and prints each result.
 */
int main(int argc, char *argv[])
{
    static const unsigned long samples[] = {
        0x55,       /* 1 byte  */
        0x123,      /* 2 bytes */
        0x4E25,     /* 3 bytes */
        0x23456,    /* 4 bytes */
        0x234567,   /* 5 bytes */
        0x34561234, /* 6 bytes */
        0x8FFFFFFF, /* error: above the encodable range */
    };
    unsigned char utf[7] = {0};

    (void)argc;
    (void)argv;

    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        /* Start every conversion from a clean buffer. */
        memset(utf, 0x00, sizeof(utf));
        int size = unicode_to_utf(samples[i], utf);
        utf_print(utf, size);
    }

    return 0;
}

打印函数如下:

/**
 * Print an encoded UTF-8 byte sequence as upper-case hexadecimal, followed by
 * a newline.
 *
 * Each byte is printed as exactly two hex digits so that bytes below 0x10
 * keep their leading zero and the dump stays unambiguous.
 *
 * @param utf   Bytes to print.
 * @param size  Number of bytes in utf, or -1 to report a failed conversion,
 *              in which case an error line is printed instead.
 */
void utf_print(unsigned char *utf, int size)
{
    if (size == -1) {
        printf("unknowscope\n");
        return;
    }

    for (int index = 0; index < size; index += 1) {
        printf("%02X", utf[index]);
    }
    printf("\n");
}

2,单字符UTF-8编码转Unicode编码:

/**
 * Decode one UTF-8 sequence into the bytes of its code point.
 *
 * The sequence is passed packed into an integer, first (lead) byte in the
 * most significant position, e.g. the bytes E4 B8 A5 are passed as 0xE4B8A5.
 * The code point is written to the output buffer most significant byte
 * first, using a fixed width per sequence length:
 *
 *   sequence length : 1  2  3  4  5  6
 *   bytes written   : 1  2  2  3  4  4
 *
 * A sequence is accepted only when its lead byte and every continuation byte
 * (10xxxxxx) carry the exact marker bits for its length.  Overlong forms such
 * as 0xC080 are not rejected.
 *
 * @param utf      Packed UTF-8 sequence (1 to 6 bytes; the 5- and 6-byte
 *                 forms need an unsigned long wider than 32 bits).
 * @param unicode  Output buffer; needs room for up to 4 bytes plus a trailing
 *                 '\0' (5 bytes).
 * @return Number of code-point bytes written (terminator not counted), or -1
 *         after printing an error message when utf is not a well-formed
 *         sequence.  Nothing is written to unicode in the error case.
 */
int utf_to_unicode(unsigned long utf, unsigned char *unicode)
{
    /* Widen first so the shifts by 32 and 40 below are always well-defined. */
    const unsigned long long v = utf;
    int size = 0;

    /*
     * Each test masks out the payload bits and compares everything else with
     * the fixed marker bits of that sequence length.  This also guarantees
     * that no stray bits are set above the lead byte.
     */
    if (v <= 0x7F) {
        unicode[size++] = (unsigned char)(v & 0x7F);
    } else if ((v & ~0x1F3FULL) == 0xC080ULL) {
        /* 110xxxxx 10xxxxxx: lead bytes C0..DF, 11 payload bits. */
        unicode[size++] = (unsigned char)((v >> 10) & 0x07);
        unicode[size++] = (unsigned char)((v & 0x3F) | (((v >> 8) & 0x03) << 6));
    } else if ((v & ~0x0F3F3FULL) == 0xE08080ULL) {
        /* 1110xxxx + 2 continuation bytes: 16 payload bits. */
        unicode[size++] = (unsigned char)(((v >> 10) & 0x0F) | (((v >> 16) & 0x0F) << 4));
        unicode[size++] = (unsigned char)((v & 0x3F) | (((v >> 8) & 0x03) << 6));
    } else if ((v & ~0x073F3F3FULL) == 0xF0808080ULL) {
        /* 11110xxx + 3 continuation bytes: 21 payload bits. */
        unicode[size++] = (unsigned char)(((v >> 20) & 0x03) | (((v >> 24) & 0x07) << 2));
        unicode[size++] = (unsigned char)(((v >> 10) & 0x0F) | (((v >> 16) & 0x0F) << 4));
        unicode[size++] = (unsigned char)((v & 0x3F) | (((v >> 8) & 0x03) << 6));
    } else if ((v & ~0x033F3F3F3FULL) == 0xF880808080ULL) {
        /* 111110xx + 4 continuation bytes: 26 payload bits. */
        unicode[size++] = (unsigned char)((v >> 32) & 0x03);
        unicode[size++] = (unsigned char)(((v >> 20) & 0x03) | (((v >> 24) & 0x3F) << 2));
        unicode[size++] = (unsigned char)(((v >> 10) & 0x0F) | (((v >> 16) & 0x0F) << 4));
        unicode[size++] = (unsigned char)((v & 0x3F) | (((v >> 8) & 0x03) << 6));
    } else if ((v & ~0x013F3F3F3F3FULL) == 0xFC8080808080ULL) {
        /* 1111110x + 5 continuation bytes: 31 payload bits. */
        unicode[size++] = (unsigned char)(((v >> 32) & 0x3F) | (((v >> 40) & 0x01) << 6));
        unicode[size++] = (unsigned char)(((v >> 20) & 0x03) | (((v >> 24) & 0x3F) << 2));
        unicode[size++] = (unsigned char)(((v >> 10) & 0x0F) | (((v >> 16) & 0x0F) << 4));
        unicode[size++] = (unsigned char)((v & 0x3F) | (((v >> 8) & 0x03) << 6));
    } else {
        printf("Error:unknowscope\n");
        return -1;
    }

    unicode[size] = '\0';
    return size;
}

测试用例:

/*
 * Demo driver for utf_to_unicode(): decodes one packed UTF-8 sequence per
 * length class, plus one value with an invalid lead byte, and prints each
 * result.
 */
int main(int argc, char *argv[])
{
    static const unsigned long samples[] = {
        0x55,           /* 1 byte  */
        0xC480,         /* 2 bytes */
        0xE4B8A5,       /* 3 bytes */
        0xF0A39196,     /* 4 bytes */
        0xF888B495A7,   /* 5 bytes */
        0xFCB495A188B4, /* 6 bytes */
        0xFEBFBFBFBFBF, /* lead byte 0xFE: not a valid sequence */
    };
    unsigned char unicode[9];
    int size = 0;

    (void)argc;
    (void)argv;

    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        /* Start every conversion from a clean buffer. */
        memset(unicode, 0x00, sizeof(unicode));
        size = utf_to_unicode(samples[i], unicode);
        unicode_print(unicode, size);
    }

    return 0;
}

打印函数如下:

/**
 * Print the bytes of a decoded code point as two-digit upper-case hex,
 * followed by a newline.
 *
 * @param unicode  Bytes to print, in the order they should appear.
 * @param size     Number of bytes in unicode, or -1 to report a failed
 *                 conversion, in which case an error line is printed instead.
 */
void unicode_print(unsigned char *unicode, int size)
{
    if (size == -1) {
        printf("Error:unknowscope\n");
        return;
    }

    const unsigned char *cursor = unicode;
    int remaining = size;

    /* Count down rather than index so a non-positive size prints no bytes. */
    while (remaining-- > 0) {
        printf("%02X", *cursor++);
    }
    printf("\n");
}

本文参考文献:http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html