开发语言:C++
功能描述:
Unicode内码转换器。用于UTF-8、UTF-16(UCS2)、UTF-32(UCS4)之间的编码转换。
下载地址:
UnicodeConverter.zip
版本历史:
V1.0 2010年03月12日
源代码:
UnicodeConverter.h
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
#pragmaonce
-
-
#include<windows.h>
-
#include<stdio.h>
-
#include<ostream>
-
-
usingnamespacestd;
-
-
classCUnicodeConverter
-
{
-
-
-
-
public:
-
-
-
-
-
-
-
-
-
-
staticINTUCS4_To_UTF8(DWORDdwUCS4,BYTE*pbUTF8);
-
-
-
-
-
-
-
-
-
-
-
staticINTUTF8_To_UCS4(constBYTE*pbUTF8,DWORD&dwUCS4);
-
-
-
-
-
-
-
-
-
-
-
-
staticINTUCS4_To_UTF16(DWORDdwUCS4,WORD*pwUTF16);
-
-
-
-
-
-
-
-
-
-
-
-
staticINTUTF16_To_UCS4(constWORD*pwUTF16,DWORD&dwUCS4);
-
-
-
-
-
-
-
-
-
-
-
staticINTUTF8Str_To_UTF16Str(constBYTE*pbszUTF8Str,WORD*pwszUTF16Str);
-
-
-
-
-
-
-
-
-
-
-
staticINTUTF16Str_To_UTF8Str(constWORD*pwszUTF16Str,BYTE*pbszUTF8Str);
-
-
-
-
-
public:
-
-
-
-
-
-
staticUINTPrint_UTF8_By_UCS4(FILE*out,DWORDdwUCS4);
-
-
-
-
-
-
-
staticUINTPrint_UTF16_By_UCS4(FILE*out,DWORDdwUCS4,BOOLisBigEndian=FALSE);
-
-
-
-
-
-
-
staticUINTPrint_UTF8Str_By_UTF16Str(FILE*out,constWORD*pwszUTF16Str);
-
-
-
-
-
-
-
staticUINTPrint_UTF16Str_By_UTF8Str(FILE*out,constBYTE*pbszUTF8Str,BOOLisBigEndian=FALSE);
-
-
-
-
-
-
-
staticUINTPrint_UTF8_BOM(FILE*out);
-
-
-
-
-
-
-
staticUINTPrint_UTF16_BOM(FILE*out,BOOLisBigEndian=FALSE);
-
-
-
-
-
public:
-
-
-
-
-
-
staticUINTPrint_UTF8_By_UCS4(ostream&os,DWORDdwUCS4);
-
-
-
-
-
-
-
staticUINTPrint_UTF16_By_UCS4(ostream&os,DWORDdwUCS4,BOOLisBigEndian=FALSE);
-
-
-
-
-
-
-
staticUINTPrint_UTF8Str_By_UTF16Str(ostream&os,constWORD*pwszUTF16Str);
-
-
-
-
-
-
-
staticUINTPrint_UTF16Str_By_UTF8Str(ostream&os,constBYTE*pbszUTF8Str,BOOLisBigEndian=FALSE);
-
-
-
-
-
-
-
staticUINTPrint_UTF8_BOM(ostream&os);
-
-
-
-
-
-
-
staticUINTPrint_UTF16_BOM(ostream&os,BOOLisBigEndian=FALSE);
-
};
-
-
-
-
UnicodeConverter.cpp
-
#include"UnicodeConverter.h"
-
-
-
-
-
-
-
INTCUnicodeConverter::UCS4_To_UTF8(DWORDdwUCS4,BYTE*pbUTF8)
-
{
-
constBYTEabPrefix[]={0,0xC0,0xE0,0xF0,0xF8,0xFC};
-
constDWORDadwCodeUp[]={
-
0x80,
-
0x800,
-
0x10000,
-
0x200000,
-
0x4000000,
-
0x80000000
-
};
-
-
INTi,iLen;
-
-
-
iLen=sizeof(adwCodeUp)/sizeof(DWORD);
-
for(i=0;i<iLen;i++)
-
{
-
if(dwUCS4<adwCodeUp[i])
-
{
-
break;
-
}
-
}
-
-
if(i==iLen)return0;
-
-
iLen=i+1;
-
if(pbUTF8!=NULL)
-
{
-
for(;i>0;i--)
-
{
-
pbUTF8[i]=static_cast<BYTE>((dwUCS4&0x3F)|0x80);
-
dwUCS4>>=6;
-
}
-
-
pbUTF8[0]=static_cast<BYTE>(dwUCS4|abPrefix[iLen-1]);
-
}
-
-
returniLen;
-
}
-
-
-
INTCUnicodeConverter::UTF8_To_UCS4(constBYTE*pbUTF8,DWORD&dwUCS4)
-
{
-
INTi,iLen;
-
BYTEb;
-
-
if(pbUTF8==NULL)
-
{
-
return0;
-
}
-
-
b=*pbUTF8++;
-
if(b<0x80)
-
{
-
dwUCS4=b;
-
return1;
-
}
-
-
if(b<0xC0||b>0xFD)
-
{
-
return0;
-
}
-
-
if(b<0xE0)
-
{
-
dwUCS4=b&0x1F;
-
iLen=2;
-
}
-
elseif(b<0xF0)
-
{
-
dwUCS4=b&0x0F;
-
iLen=3;
-
}
-
elseif(b<0xF8)
-
{
-
dwUCS4=b&7;
-
iLen=4;
-
}
-
elseif(b<0xFC)
-
{
-
dwUCS4=b&3;
-
iLen=5;
-
}
-
else
-
{
-
dwUCS4=b&1;
-
iLen=6;
-
}
-
-
for(i=1;i<iLen;i++)
-
{
-
b=*pbUTF8++;
-
if(b<0x80||b>0xBF)
-
{
-
break;
-
}
-
-
dwUCS4=(dwUCS4<<6)+(b&0x3F);
-
}
-
-
if(i<iLen)
-
{
-
return0;
-
}
-
else
-
{
-
returniLen;
-
}
-
}
-
-
-
INTCUnicodeConverter::UCS4_To_UTF16(DWORDdwUCS4,WORD*pwUTF16)
-
{
-
if(dwUCS4<=0xFFFF)
-
{
-
if(pwUTF16!=NULL)
-
{
-
*pwUTF16=static_cast<WORD>(dwUCS4);
-
}
-
-
return1;
-
}
-
elseif(dwUCS4<=0xEFFFF)
-
{
-
if(pwUTF16!=NULL)
-
{
-
pwUTF16[0]=static_cast<WORD>(0xD800+(dwUCS4>>10)-0x40);
-
pwUTF16[1]=static_cast<WORD>(0xDC00+(dwUCS4&0x03FF));
-
}
-
-
return2;
-
}
-
else
-
{
-
return0;
-
}
-
}
-
-
-
INTCUnicodeConverter::UTF16_To_UCS4(constWORD*pwUTF16,DWORD&dwUCS4)
-
{
-
WORDw1,w2;
-
-
if(pwUTF16==NULL)
-
{
-
return0;
-
}
-
-
w1=pwUTF16[0];
-
if(w1>=0xD800&&w1<=0xDFFF)
-
{
-
if(w1<0xDC00)
-
{
-
w2=pwUTF16[1];
-
if(w2>=0xDC00&&w2<=0xDFFF)
-
{
-
dwUCS4=(w2&0x03FF)+(((w1&0x03FF)+0x40)<<10);
-
return2;
-
}
-
}
-
-
return0;
-
}
-
else
-
{
-
dwUCS4=w1;
-
return1;
-
}
-
}
-
-
-
INTCUnicodeConverter::UTF8Str_To_UTF16Str(constBYTE*pbszUTF8Str,WORD*pwszUTF16Str)
-
{
-
INTiNum,iLen;
-
DWORDdwUCS4;
-
-
if(pbszUTF8Str==NULL)
-
{
-
return0;
-
}
-
-
iNum=0;
-
while(*pbszUTF8Str)
-
{
-
iLen=UTF8_To_UCS4(pbszUTF8Str,dwUCS4);
-
if(iLen==0)
-
{
-
return0;
-
}
-
-
pbszUTF8Str+=iLen;
-
-
-
iLen=UCS4_To_UTF16(dwUCS4,pwszUTF16Str);
-
if(iLen==0)
-
{
-
return0;
-
}
-
-
if(pwszUTF16Str!=NULL)
-
{
-
pwszUTF16Str+=iLen;
-
}
-
-
iNum+=iLen;
-
}
-
-
if(pwszUTF16Str!=NULL)
-
{
-
*pwszUTF16Str=0;
-
}
-
-
returniNum;
-
}
-
-
-
INTCUnicodeConverter::UTF16Str_To_UTF8Str(constWORD*pwszUTF16Str,BYTE*pbszUTF8Str)
-
{
-
INTiNum,iLen;
-
DWORDdwUCS4;
-
-
if(pwszUTF16Str==NULL)
-
{
-
return0;
-
}
-
-
iNum=0;
-
while(*pwszUTF16Str)
-
{
-
iLen=UTF16_To_UCS4(pwszUTF16Str,dwUCS4);
-
if(iLen==0)
-
{
-
return0;
-
}
-
-
pwszUTF16Str+=iLen;
-
-
-
iLen=UCS4_To_UTF8(dwUCS4,pbszUTF8Str);
-
if(iLen==0)
-
{
-
return0;
-
}
-
-
if(pbszUTF8Str!=NULL)
-
{
-
pbszUTF8Str+=iLen;
-
}
-
-
iNum+=iLen;
-
}
-
-
if(pbszUTF8Str!=NULL)
-
{
-
*pbszUTF8Str=0;
-
}
-
-
returniNum;
-
}
-
-
-
-
-
-
-
UINTCUnicodeConverter::Print_UTF8_By_UCS4(FILE*out,DWORDdwUCS4)
-
{
-
INTiLen;
-
BYTEabUTF8[8];
-
-
if(out==NULL)
-
{
-
return0;
-
}
-
-
iLen=UCS4_To_UTF8(dwUCS4,abUTF8);
-
if(iLen==0)return0;
-
-
fwrite(abUTF8,1,iLen,out);
-
-
returniLen;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF16_By_UCS4(FILE*out,DWORDdwUCS4,BOOLisBigEndian)
-
{
-
INTi,iLen;
-
WORDwCode,awUTF16[2];
-
-
if(out==NULL)
-
{
-
return0;
-
}
-
-
iLen=UCS4_To_UTF16(dwUCS4,awUTF16);
-
if(iLen==0)return0;
-
-
for(i=0;i<iLen;i++)
-
{
-
wCode=awUTF16[i];
-
if(isBigEndian)
-
{
-
fputc(wCode>>8,out);
-
fputc(wCode&0xFF,out);
-
}
-
else
-
{
-
fputc(wCode&0xFF,out);
-
fputc(wCode>>8,out);
-
}
-
}
-
-
return(iLen<<1);
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF8Str_By_UTF16Str(FILE*out,constWORD*pwszUTF16Str)
-
{
-
INTiCount,iLen;
-
DWORDdwUCS4;
-
-
if((out==NULL)||(pwszUTF16Str==NULL))
-
{
-
return0;
-
}
-
-
iCount=0;
-
while(*pwszUTF16Str)
-
{
-
iLen=UTF16_To_UCS4(pwszUTF16Str,dwUCS4);
-
if(iLen==0)
-
{
-
break;
-
}
-
-
pwszUTF16Str+=iLen;
-
-
-
iCount+=Print_UTF8_By_UCS4(out,dwUCS4);
-
}
-
-
returniCount;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF16Str_By_UTF8Str(FILE*out,constBYTE*pbszUTF8Str,BOOLisBigEndian)
-
{
-
INTiCount,iLen;
-
DWORDdwUCS4;
-
-
if((out==NULL)||(pbszUTF8Str==NULL))
-
{
-
return0;
-
}
-
-
iCount=0;
-
while(*pbszUTF8Str)
-
{
-
iLen=UTF8_To_UCS4(pbszUTF8Str,dwUCS4);
-
if(iLen==0)
-
{
-
break;
-
}
-
-
pbszUTF8Str+=iLen;
-
-
-
iCount+=Print_UTF16_By_UCS4(out,dwUCS4,isBigEndian);
-
}
-
-
returniCount;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF8_BOM(FILE*out)
-
{
-
if(out==NULL)
-
{
-
return0;
-
}
-
-
fputc(0xEF,out);
-
fputc(0xBB,out);
-
fputc(0xBF,out);
-
-
return3;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF16_BOM(FILE*out,BOOLisBigEndian)
-
{
-
if(out==NULL)
-
{
-
return0;
-
}
-
-
if(isBigEndian)
-
{
-
fputc(0xFE,out);
-
fputc(0xFF,out);
-
}
-
else
-
{
-
fputc(0xFF,out);
-
fputc(0xFE,out);
-
}
-
-
return2;
-
}
-
-
-
-
-
-
-
UINTCUnicodeConverter::Print_UTF8_By_UCS4(ostream&os,DWORDdwUCS4)
-
{
-
INTiLen;
-
BYTEabUTF8[8];
-
-
if(!os)return0;
-
-
iLen=UCS4_To_UTF8(dwUCS4,abUTF8);
-
if(iLen==0)return0;
-
-
os.write(reinterpret_cast<CHAR*>(abUTF8),iLen);
-
-
returniLen;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF16_By_UCS4(ostream&os,DWORDdwUCS4,BOOLisBigEndian)
-
{
-
INTi,iLen;
-
WORDwCode,awUTF16[2];
-
-
if(!os)return0;
-
-
iLen=UCS4_To_UTF16(dwUCS4,awUTF16);
-
if(iLen==0)return0;
-
-
for(i=0;i<iLen;i++)
-
{
-
wCode=awUTF16[i];
-
if(isBigEndian)
-
{
-
os.put(wCode>>8);
-
os.put(wCode&0xFF);
-
}
-
else
-
{
-
os.put(wCode&0xFF);
-
os.put(wCode>>8);
-
}
-
}
-
-
return(iLen<<1);
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF8Str_By_UTF16Str(ostream&os,constWORD*pwszUTF16Str)
-
{
-
INTiCount,iLen;
-
DWORDdwUCS4;
-
-
if(!os||(pwszUTF16Str==NULL))return0;
-
-
iCount=0;
-
while(*pwszUTF16Str)
-
{
-
iLen=UTF16_To_UCS4(pwszUTF16Str,dwUCS4);
-
if(iLen==0)
-
{
-
break;
-
}
-
-
pwszUTF16Str+=iLen;
-
-
-
iCount+=Print_UTF8_By_UCS4(os,dwUCS4);
-
}
-
-
returniCount;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF16Str_By_UTF8Str(ostream&os,constBYTE*pbszUTF8Str,BOOLisBigEndian)
-
{
-
INTiCount,iLen;
-
DWORDdwUCS4;
-
-
if(!os||(pbszUTF8Str==NULL))return0;
-
-
iCount=0;
-
while(*pbszUTF8Str)
-
{
-
iLen=UTF8_To_UCS4(pbszUTF8Str,dwUCS4);
-
if(iLen==0)
-
{
-
break;
-
}
-
-
pbszUTF8Str+=iLen;
-
-
-
iCount+=Print_UTF16_By_UCS4(os,dwUCS4,isBigEndian);
-
}
-
-
returniCount;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF8_BOM(ostream&os)
-
{
-
if(!os)return0;
-
-
os.put(0xEF);
-
os.put(0xBB);
-
os.put(0xBF);
-
-
return3;
-
}
-
-
-
UINTCUnicodeConverter::Print_UTF16_BOM(ostream&os,BOOLisBigEndian)
-
{
-
if(!os)return0;
-
-
if(isBigEndian)
-
{
-
os.put(0xFE);
-
os.put(0xFF);
-
}
-
else
-
{
-
os.put(0xFF);
-
os.put(0xFE);
-
}
-
-
return2;
-
}
-
-
-
-
转自:http://blog.csdn.net/jhqin/article/details/5687505
分享到:
相关推荐
很多人喜欢用CString 或std::string,但是它们的缺点是不能完成汉字各种类型之间的转换,提供三种类库ascString,ucsString,utfString以及工具utfCount,utf8_ucs2_t,tcf8_ucs4_t类库,用于各种字符串之间的直接转换`...
基于MFC CString的GBK与UTF-8编码转换。在网上找到的一些代码都存在错误,现在改好了,与大家分享一下。(MFC 非UNICODE)
utf-8、ANSI、Unicode相互转化c++实现 std::string ConverANSI2UTF8(const std::string & str); std::wstring ConverANSI2Unicode(const std::string str); std::wstring ConverUTF82Unicode(const std::string str)...
GB2312编码与utf-8编码的字符串的转换,主要使用windows api函数MultiByteToWideChar和WideCharToMultiByte,代码简洁,经测试可用
多字节与UTF-8、Unicode之间的转换 ,里面有相互转换的6个函数 ,稍微修改下可以加到自己的c++程序中,比较好用
linux C/c++ 源代码,将中文字串与UTF-8格式字串相互转化,我在项目中使用的代码,完全可用
GBK、GB2312等与UTF8之间都必须通过Unicode编码才能相互转换
汉字编码转换工具,实现了汉字与 utf-8 gb2312 unicode 互转,开发者多百多度
ustring, an wrapper of std::basic_string, provides supports in converting an external code string to an internal code string automatically. I think ustring's performance may be as effective as the ...
该工具是使用Qt5.9开发的。支持UTF-8与GB2312文件编码的相互批量转换。
C++写的Windows下GB2312与UTF-8相互转换代码,VC实现,调用了windows底层函数,不支持linux环境
C++各种编码转换 Unicode UTF8
用来映射Unicode字符串的WideCharToMultiByte函数经常被用来进行UTF-8编码的转换,以下我们将看到C++使用WideCharToMultiByte函数生成UTF-8编码文件的方法,首先先来对WideCharToMultiByte作一个详细的了解:
在VS2005下正常编译通过 UTF-8 TO UNICODE 相互转换C源码 UTF-8 TO GBK 相互转换C源码 GBK TO UNICODE 相互转换C源码
CString转UTF8,UTF8转CString。
ANSI转Unicode Unicode转ANSI UTF8转Unicode Unicode转UTF8 wchar_t* 转 char* char*转 char* UTF8转ANSI ANSI转UTF8
UTF-8 GBK 转化工具 C++ 源代码编码转化 主要用于跨平台的源代码编译,防止乱码
跨平台(windows Linux)是纯c实现 gbk/utf8互转,Ansi/Utf8互转,
提供完整的函数,将普通的字符串转换为UTF8编码格式。
下面小编就为大家带来一篇Linux下实现UTF-8和GB2312互相转换的方法。小编觉得挺不错的,现在就分享给大家,也给大家做个参考。一起跟随小编过来看看吧,祝大家游戏愉快哦