[C语言]Unicode编码(二)-中文字符筛选
Unicode编码(二)-中文字符筛选
1,UTF-8编码中三字节中文字符的筛选方法如下(参数one、two分别为该字符Unicode码点的高字节和低字节):
/*
 * Decide whether a 16-bit code point (BMP) is treated as a Chinese character.
 *
 * one: high byte of the code point.
 * two: low byte of the code point.
 *
 * Returns 1 when the code point (one << 8 | two) lies inside one of the
 * inclusive ranges listed below, and -1 otherwise.
 *
 * NOTE(review): the range limits are the ones this file has always used
 * (e.g. 9FCB, 4DB5); newer Unicode versions extend some of these blocks.
 * Confirm against the Unicode version you target before widening them.
 */
int chinese_filter(unsigned char one, unsigned char two)
{
    /* Inclusive {first, last} code-point ranges. */
    static const unsigned int ranges[][2] = {
        { 0x4E00, 0x9FCB }, /* CJK Unified Ideographs */
        { 0x3400, 0x4DB5 }, /* CJK Unified Ideographs Extension A */
        { 0x2F00, 0x2FD5 }, /* Kangxi Radicals */
        { 0x2E80, 0x2EF3 }, /* CJK Radicals Supplement */
        { 0xF900, 0xFAD9 }, /* CJK Compatibility Ideographs */
        { 0xE815, 0xE86F }, /* Private Use Area range */
        { 0xE400, 0xE5E8 }, /* Private Use Area range */
        { 0xE600, 0xE6CF }, /* Private Use Area range */
        { 0x31C0, 0x31E3 }, /* CJK Strokes */
        { 0x2FF0, 0x2FFB }, /* Ideographic Description Characters */
        { 0x3105, 0x3120 }, /* Bopomofo */
        { 0x31A0, 0x31BA }  /* Bopomofo Extended */
    };
    const unsigned int code = ((unsigned int)one << 8) | (unsigned int)two;
    unsigned int i;

    for (i = 0; i < sizeof ranges / sizeof ranges[0]; i++) {
        if (code >= ranges[i][0] && code <= ranges[i][1]) {
            return 1;
        }
    }
    return -1;
}
2,UTF-8编码中四字节中文字符的筛选方法如下(参数one、two、thr依次为该字符Unicode码点从高到低的三个字节):
/*
 * Decide whether a supplementary-plane code point is treated as a Chinese
 * character.
 *
 * one: most significant byte of the code point (the plane number).
 * two: middle byte of the code point.
 * thr: least significant byte of the code point.
 *
 * Returns 1 when the code point (one << 16 | two << 8 | thr) lies inside one
 * of the inclusive ranges listed below, and -1 otherwise.
 */
int chinese_filter2(unsigned char one, unsigned char two, unsigned char thr)
{
    /* Inclusive {first, last} code-point ranges. */
    static const unsigned long ranges[][2] = {
        { 0x20000UL, 0x2A6D6UL }, /* CJK Unified Ideographs Extension B */
        { 0x2A700UL, 0x2B734UL }, /* CJK Unified Ideographs Extension C */
        { 0x2B740UL, 0x2B81DUL }, /* CJK Unified Ideographs Extension D */
        { 0x2F800UL, 0x2FA1DUL }  /* CJK Compatibility Ideographs Supplement */
    };
    const unsigned long code = ((unsigned long)one << 16)
                             | ((unsigned long)two << 8)
                             | (unsigned long)thr;
    unsigned int i;

    for (i = 0; i < sizeof ranges / sizeof ranges[0]; i++) {
        if (code >= ranges[i][0] && code <= ranges[i][1]) {
            return 1;
        }
    }
    return -1;
}
3,UTF-8字符转Unicode编码:
1)src为输入的UTF-8字符串
2)unicode为UTF-8字符串转换后输出的unicode编码串
3)chs为字符串中筛选出来的中文字符(仍为UTF-8编码)
/*
 * Convert a NUL-terminated UTF-8 string into a packed stream of code-point
 * bytes and collect the Chinese characters found in it.
 *
 * src:     input UTF-8 string (NUL-terminated).
 * unicode: output buffer. Each character is written most-significant byte
 *          first, using a width that depends on the UTF-8 sequence length:
 *            1-byte sequence      -> 1 byte
 *            2- or 3-byte sequence -> 2 bytes
 *            4-byte sequence      -> 3 bytes
 *            5- or 6-byte sequence -> 4 bytes (legacy UTF-8 forms)
 *          A 0x00 byte is appended after the last character (not counted in
 *          the return value). The output never exceeds the input length, so
 *          strlen(src) + 1 bytes are sufficient.
 * chs:     output buffer receiving the original UTF-8 bytes of every 3-byte
 *          character accepted by chinese_filter() and every 4-byte character
 *          accepted by chinese_filter2(). It is NUL-terminated on success and
 *          needs room for strlen(src) + 1 bytes.
 *
 * Returns the number of bytes written to `unicode`, or -1 (after printing an
 * error message) when the input contains an invalid lead byte (0x80-0xBF,
 * 0xFE, 0xFF) or a sequence whose trailing bytes are missing or are not of
 * the form 10xxxxxx. Overlong encodings and surrogates are not rejected.
 */
int utf_to_unicode(unsigned char *src, unsigned char *unicode, unsigned char *chs)
{
    int size = 0;
    int ch_len = 0;

    while (*src) {
        const unsigned char one = src[0];
        int seq_len;
        int is_chinese = 0;
        int i;

        /* Sequence length from the lead byte; 0 marks an invalid lead byte. */
        if (one < 0x80) {
            seq_len = 1;
        } else if (one >= 0xC0 && one < 0xE0) {
            seq_len = 2;
        } else if (one >= 0xE0 && one < 0xF0) {
            seq_len = 3;
        } else if (one >= 0xF0 && one < 0xF8) {
            seq_len = 4;
        } else if (one >= 0xF8 && one < 0xFC) {
            seq_len = 5;
        } else if (one >= 0xFC && one < 0xFE) {
            seq_len = 6;
        } else {
            seq_len = 0;
        }

        /*
         * Every trailing byte must look like 10xxxxxx. The terminating NUL
         * fails this check, so a truncated sequence is rejected before any
         * byte past the end of the string is read.
         */
        for (i = 1; seq_len != 0 && i < seq_len; i++) {
            if ((src[i] & 0xC0) != 0x80) {
                seq_len = 0;
            }
        }

        if (seq_len == 0) {
            printf("Error:unknoescope\n");
            return -1;
        }

        /* Bytes are emitted most-significant first. */
        switch (seq_len) {
        case 1:
            unicode[size++] = one;
            break;
        case 2:
            /* 110aaabb 10bbbbbb -> 00000aaa bbbbbbbb */
            unicode[size++] = (unsigned char)((one & 0x1C) >> 2);
            unicode[size++] = (unsigned char)(((one & 0x03) << 6) | (src[1] & 0x3F));
            break;
        case 3:
            /* 1110aaaa 10aaaabb 10bbbbbb -> aaaaaaaa bbbbbbbb */
            unicode[size++] = (unsigned char)(((one & 0x0F) << 4) | ((src[1] & 0x3C) >> 2));
            unicode[size++] = (unsigned char)(((src[1] & 0x03) << 6) | (src[2] & 0x3F));
            is_chinese = (chinese_filter(unicode[size - 2], unicode[size - 1]) == 1);
            break;
        case 4:
            /* 11110aaa 10aabbbb 10bbbbcc 10cccccc -> 000aaaaa bbbbbbbb cccccccc */
            unicode[size++] = (unsigned char)(((one & 0x07) << 2) | ((src[1] & 0x30) >> 4));
            unicode[size++] = (unsigned char)(((src[1] & 0x0F) << 4) | ((src[2] & 0x3C) >> 2));
            unicode[size++] = (unsigned char)(((src[2] & 0x03) << 6) | (src[3] & 0x3F));
            is_chinese = (chinese_filter2(unicode[size - 3], unicode[size - 2],
                                          unicode[size - 1]) == 1);
            break;
        case 5:
            unicode[size++] = (unsigned char)(one & 0x03);
            unicode[size++] = (unsigned char)(((src[1] & 0x3F) << 2) | ((src[2] & 0x30) >> 4));
            unicode[size++] = (unsigned char)(((src[2] & 0x0F) << 4) | ((src[3] & 0x3C) >> 2));
            unicode[size++] = (unsigned char)(((src[3] & 0x03) << 6) | (src[4] & 0x3F));
            break;
        default: /* seq_len == 6 */
            unicode[size++] = (unsigned char)(((one & 0x01) << 6) | (src[1] & 0x3F));
            unicode[size++] = (unsigned char)(((src[2] & 0x3F) << 2) | ((src[3] & 0x30) >> 4));
            unicode[size++] = (unsigned char)(((src[3] & 0x0F) << 4) | ((src[4] & 0x3C) >> 2));
            unicode[size++] = (unsigned char)(((src[4] & 0x03) << 6) | (src[5] & 0x3F));
            break;
        }

        /* Keep the untouched UTF-8 bytes of each accepted Chinese character. */
        if (is_chinese) {
            for (i = 0; i < seq_len; i++) {
                chs[ch_len++] = src[i];
            }
        }

        src += seq_len;
    }

    unicode[size] = '\0';
    chs[ch_len] = '\0';
    return size;
}
4,主函数测试程序和Unicode编码打印程序
/*
 * Print a byte buffer to stdout as consecutive two-digit uppercase hex values
 * followed by a newline.
 *
 * unicode: buffer to print.
 * size:    number of bytes to print; nothing but the newline is printed when
 *          size is zero or negative.
 */
void unicode_print(unsigned char *unicode, int size)
{
    int index;

    for (index = 0; index < size; index++) {
        printf("%02X", unicode[index]);
    }
    printf("\n");
}
/*
 * Test driver: converts a sample UTF-8 string with utf_to_unicode(), prints
 * the resulting code-point bytes in hex, then prints the Chinese characters
 * that were picked out of the sample.
 *
 * Returns 0 on success and 1 when the conversion reports an error.
 */
int main(int argc, char *argv[])
{
    unsigned char ch4[] = "一A严严·";
    /*
     * sizeof ch4 counts the terminating NUL, i.e. strlen + 1 bytes, which is
     * enough for both outputs; zero-initialised so both start out empty.
     */
    unsigned char unicode[sizeof ch4] = {0};
    unsigned char china[sizeof ch4] = {0};
    int size;

    (void)argc;
    (void)argv;

    size = utf_to_unicode(ch4, unicode, china);
    if (size < 0) {
        /* utf_to_unicode() has already printed its error message. */
        return 1;
    }

    unicode_print(unicode, size);
    printf("Chinese=%s\n", (char *)china);
    return 0;
}
本文参考文献:http://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php?zfj=kzb
声明:本站所有文章资源内容,如无特殊说明或标注,均为采集网络资源。如若本站内容侵犯了原著者的合法权益,可联系本站删除。