禁用字检测
utf8编码的数据可直接使用下面的代码
最关键的步骤就是把字符串拆成单个字。UTF-8编码的字,如果只有一个字节,则其最高二进制位为0;如果是多字节,则其第一个字节从最高位开始连续为1的二进制位的个数决定了该字编码所占的字节数,其余各字节均以10开头。
按最初的设计,UTF-8最多可用到6个字节(RFC 3629 之后的标准UTF-8最多只使用4个字节,5、6字节形式已不再合法)。
1字节 0xxxxxxx
2字节 110xxxxx 10xxxxxx
3字节 1110xxxx 10xxxxxx 10xxxxxx
4字节 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字节 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字节 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
其它就很简单了
1、禁用字处理 禁用字拆分后以第一个为key保存
2、待测试字符串
a)、拆分成单字
b)、大写转小写,字母和空格全角转半角,去掉多余空格(英文字母后最多只会有一个空格,中文后不应该有空格)
c)、遍历字符串的所有字 检测每个字对应的禁用字组是否在待测字符串中
#include <string>
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <sstream>
#include <string.h>
#include <stdio.h>

/**
 * Banned-word ("disabled word") detector for UTF-8 text.
 *
 * Banned words are indexed by their first character. A candidate string is
 * split into characters, normalised (ASCII upper case -> lower case,
 * full-width Latin letters and the full-width space -> their ASCII
 * counterparts) and then searched for every banned word whose first
 * character occurs in the candidate.
 */
class CDisableWord
{
    struct SDisableWord
    {
        std::string str; // normalised banned word
    };
    typedef std::vector<SDisableWord> VDW;

private:
    // First (normalised) character of a banned word -> all banned words starting with it.
    std::map<std::string, VDW> m_mapDisableWord;
    // Every normalised banned word added so far; used to reject duplicates.
    std::set<std::string> m_setAllDisableWord;
    // Character normalisation table (source character -> replacement).
    std::map<std::string, std::string> m_mapSpecialWord;

private:
    // Split a UTF-8 buffer into single characters, appended to output.
    size_t SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output);
    // Look up the replacement for a character; NULL when it needs no conversion.
    const std::string* GetSpecialWord(const std::string& str);
    // Replace every character that has an entry in the normalisation table.
    void NormalizeWords(std::vector<std::string>& words);

public:
    CDisableWord();
    // Register one banned word.
    void AddOneDisableWord(const std::string& str);
    // Return true when the text contains no banned word, false otherwise.
    bool CheckStr(const char* pSrc, unsigned int len);
    bool CheckStr(const std::string& str);
};

/**
 * Builds the normalisation table:
 *   - ASCII 'A'..'Z'                      -> 'a'..'z'
 *   - full-width 'A'..'Z' (U+FF21..FF3A)  -> 'a'..'z'
 *   - full-width 'a'..'z' (U+FF41..FF5A)  -> 'a'..'z'
 *   - ideographic space (U+3000)          -> ' '
 * The full-width keys are assembled from explicit UTF-8 bytes so the table
 * does not depend on the encoding this source file happens to be saved in.
 */
CDisableWord::CDisableWord()
{
    const std::string upperAscii = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const std::string lowerAscii = "abcdefghijklmnopqrstuvwxyz";

    for (int i = 0; i < 26; i++)
    {
        const std::string lower(1, lowerAscii[i]);
        const std::string upper(1, upperAscii[i]);

        // U+FF21 + i encodes as EF BC (A1 + i).
        std::string fullUpper("\xEF\xBC");
        fullUpper += static_cast<char>(static_cast<unsigned char>(0xA1 + i));

        // U+FF41 + i encodes as EF BD (81 + i).
        std::string fullLower("\xEF\xBD");
        fullLower += static_cast<char>(static_cast<unsigned char>(0x81 + i));

        m_mapSpecialWord[fullUpper] = lower;
        m_mapSpecialWord[fullLower] = lower;
        m_mapSpecialWord[upper] = lower;
    }

    // U+3000 IDEOGRAPHIC SPACE -> ASCII space.
    m_mapSpecialWord["\xE3\x80\x80"] = std::string(" ");
}

/**
 * Splits a UTF-8 buffer into single characters.
 *
 * The lead byte selects the sequence length (1 to 6 bytes, i.e. the legacy
 * 5/6-byte forms are still recognised). A multi-byte sequence is accepted
 * only when it fits in the buffer and every following byte has the 10xxxxxx
 * continuation pattern; otherwise the lead byte is emitted alone. Without
 * that check a stray lead byte would swallow the start of the next real
 * character and hide that character's lookup key from CheckStr.
 *
 * @param pSrc   buffer to split (may be NULL, in which case nothing is appended)
 * @param len    number of bytes in pSrc
 * @param output receives the characters; existing content is kept
 * @return output.size() after appending
 */
size_t CDisableWord::SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output)
{
    if (pSrc == NULL)
        return output.size();

    unsigned int i = 0;
    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(pSrc[i]);

        unsigned int wlen = 1;
        if (lead >= 0xFC)
            wlen = 6;
        else if (lead >= 0xF8)
            wlen = 5;
        else if (lead >= 0xF0)
            wlen = 4;
        else if (lead >= 0xE0)
            wlen = 3;
        else if (lead >= 0xC0)
            wlen = 2;

        if (wlen > 1)
        {
            // "len - i" cannot wrap because i < len, unlike "i + wlen".
            bool wellFormed = (len - i >= wlen);
            for (unsigned int j = 1; wellFormed && j < wlen; j++)
            {
                const unsigned char next = static_cast<unsigned char>(pSrc[i + j]);
                if ((next & 0xC0) != 0x80)
                    wellFormed = false;
            }
            if (!wellFormed)
                wlen = 1;
        }

        output.push_back(std::string(pSrc + i, wlen));
        i += wlen;
    }
    return output.size();
}

/**
 * Looks up the normalised form of a single character.
 *
 * @return pointer to the replacement stored in the table, or NULL when the
 *         character has no entry.
 */
const std::string* CDisableWord::GetSpecialWord(const std::string& str)
{
    std::map<std::string, std::string>::const_iterator it = m_mapSpecialWord.find(str);
    if (it == m_mapSpecialWord.end())
        return NULL;
    return &(it->second);
}

/**
 * Replaces, in place, every character that has an entry in the
 * normalisation table; other characters are left untouched.
 */
void CDisableWord::NormalizeWords(std::vector<std::string>& words)
{
    for (size_t i = 0; i < words.size(); ++i)
    {
        const std::string* pMapped = GetSpecialWord(words[i]);
        if (pMapped)
            words[i] = *pMapped;
    }
}

/**
 * Registers one banned word.
 *
 * The word is normalised with the same table CheckStr applies to its input,
 * so a banned word given in upper case or full-width form still matches.
 * Empty words and words already registered (after normalisation) are ignored.
 */
void CDisableWord::AddOneDisableWord(const std::string& str)
{
    std::vector<std::string> words;
    if (SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), words) == 0)
        return;

    NormalizeWords(words);

    std::string normalized;
    for (size_t i = 0; i < words.size(); ++i)
        normalized += words[i];

    // insert() reports whether the word was new; duplicates are dropped here.
    if (!m_setAllDisableWord.insert(normalized).second)
        return;

    SDisableWord sdw;
    sdw.str = normalized;
    m_mapDisableWord[words[0]].push_back(sdw);
}

/**
 * Checks a raw buffer. A NULL or empty buffer is treated as clean.
 *
 * @return true when no banned word is found, false otherwise.
 */
bool CDisableWord::CheckStr(const char* pSrc, unsigned int len)
{
    if (pSrc == NULL || len == 0)
        return true;
    return CheckStr(std::string(pSrc, len));
}

/**
 * Checks a string for banned words.
 *
 * Two normalised forms of the input are searched:
 *   - the plain normalised text;
 *   - the same text with redundant spaces removed: a space is kept when it
 *     is the very first character, or when it directly follows a single-byte
 *     non-space character. Every other space (repeated spaces, spaces after
 *     a multi-byte character, leading repeats) is dropped.
 *
 * @return true when no banned word is found (including for an empty string),
 *         false otherwise.
 */
bool CDisableWord::CheckStr(const std::string& str)
{
    if (str.empty())
        return true;

    std::vector<std::string> words;
    SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), words);
    NormalizeWords(words);

    std::string strSrc;               // normalised text
    std::string strDelSpace;          // normalised text without redundant spaces
    std::set<std::string> firstWords; // distinct characters = candidate lookup keys

    bool seenNonSpace = false;         // a non-space character occurred earlier
    bool lastNonSpaceIsMulti = false;  // nearest preceding non-space is multi-byte
    bool prevWasSpace = false;         // the directly preceding character is a space

    for (size_t i = 0; i < words.size(); ++i)
    {
        const std::string& word = words[i];
        firstWords.insert(word);
        strSrc += word;

        if (word == " ")
        {
            bool keep;
            if (i == 0)
                keep = true;
            else if (!seenNonSpace || lastNonSpaceIsMulti)
                keep = false;
            else
                keep = !prevWasSpace; // only the first space after a single-byte character
            if (keep)
                strDelSpace += word;
            prevWasSpace = true;
        }
        else
        {
            strDelSpace += word;
            seenNonSpace = true;
            lastNonSpaceIsMulti = (word.size() > 1);
            prevWasSpace = false;
        }
    }

    // Skip the second search when removing spaces changed nothing.
    const bool bSame = (strDelSpace == strSrc);

    for (std::set<std::string>::const_iterator key = firstWords.begin(); key != firstWords.end(); ++key)
    {
        std::map<std::string, VDW>::const_iterator group = m_mapDisableWord.find(*key);
        if (group == m_mapDisableWord.end())
            continue;

        const VDW& candidates = group->second;
        for (size_t j = 0; j < candidates.size(); ++j)
        {
            const std::string& banned = candidates[j].str;
            if (strSrc.find(banned) != std::string::npos)
                return false;
            if (!bSame && strDelSpace.find(banned) != std::string::npos)
                return false;
        }
    }
    return true;
}

// Define DISABLE_WORD_NO_MAIN to compile the class without the interactive
// demo below (for example when embedding it in another program or a test).
#ifndef DISABLE_WORD_NO_MAIN
/**
 * Interactive demo: registers a few banned words, then reads lines from
 * standard input until end of input and reports whether each one is clean.
 */
int main()
{
    CDisableWord cdw;

    // Register the banned words (the duplicate entry is ignored).
    const std::string strdw[] = {"中文", "英文", "测试", "aabb", "测试", "ccdd"};
    const size_t count = sizeof(strdw) / sizeof(strdw[0]);
    for (size_t i = 0; i < count; i++)
        cdw.AddOneDisableWord(strdw[i]);

    // std::getline has no fixed line limit and the loop ends on EOF or a
    // read error instead of spinning on a failed stream.
    std::string line;
    while (std::getline(std::cin, line))
    {
        if (cdw.CheckStr(line))
            printf("收到:%s没有敏感字\n", line.c_str());
        else
            printf("收到:%s敏感字敏感字敏感字\n", line.c_str());
    }
    return 0;
}
#endif // DISABLE_WORD_NO_MAIN

// Build: g++ -g -o DisableWord DisableWord.cpp
声明:本站所有文章资源内容,如无特殊说明或标注,均为采集网络资源。如若本站内容侵犯了原著者的合法权益,可联系本站删除。