#include #include #include #include "CharacterConversion.h" using namespace std; float Similarity(string name, string Name) { float res = 0.0; int n = (int)name.size(), m = (int)Name.size(); vector>dp(n + 1, vector(m + 1, 0)); dp[0][0] = 0;//dp[x][y]代表将a字符串前x个字符修改成b字符串前y个字符 for (int i = 1; i <= m; ++i) dp[0][i] = i; for (int i = 1; i <= n; ++i) dp[i][0] = i; for (int i = 1; i <= n; ++i) { for (int j = 1; j <= m; ++j) { int one = dp[i - 1][j] + 1, two = dp[i][j - 1] + 1, three = dp[i - 1][j - 1]; if (name[i - 1] != Name[j - 1]) three += 1; dp[i][j] = min(min(one, two), three); } } res = 1 / (1 + dp[n][m]); //cout << "SimIlarity:" << res << endl; return res; } //UTF-8转Unicode std::wstring Utf82Unicode(const std::string& utf8string) { int widesize = MultiByteToWideChar(CP_UTF8, 0, utf8string.c_str(), -1, NULL, 0); if (widesize == ERROR_NO_UNICODE_TRANSLATION) { throw std::exception("Invalid UTF-8 sequence."); } if (widesize == 0) { throw std::exception("Error in conversion."); } std::vector resultstring(widesize); int convresult = MultiByteToWideChar(CP_UTF8, 0, utf8string.c_str(), -1, &resultstring[0], widesize); if (convresult != widesize) { throw std::exception("La falla!"); } return std::wstring(&resultstring[0]); } //unicode 转为 ascii std::string WideByte2Acsi(std::wstring& wstrcode) { int asciisize = WideCharToMultiByte(CP_OEMCP, 0, wstrcode.c_str(), -1, NULL, 0, NULL, NULL); if (asciisize == ERROR_NO_UNICODE_TRANSLATION) { throw std::exception("Invalid UTF-8 sequence."); } if (asciisize == 0) { throw std::exception("Error in conversion."); } std::vector resultstring(asciisize); int convresult = WideCharToMultiByte(CP_OEMCP, 0, wstrcode.c_str(), -1, &resultstring[0], asciisize, NULL, NULL); if (convresult != asciisize) { throw std::exception("La falla!"); } return std::string(&resultstring[0]); } //utf-8 转 ascii std::string UTF_82ASCII(std::string& strUtf8Code) { using namespace std; string strRet = ""; //先把 utf8 转为 unicode wstring wstr = Utf82Unicode(strUtf8Code); //最后把 unicode 转为 ascii strRet = WideByte2Acsi(wstr); return strRet; } //ascii 转 Unicode std::wstring Acsi2WideByte(std::string& strascii) { using namespace std; int widesize = MultiByteToWideChar(CP_ACP, 0, (char*)strascii.c_str(), -1, NULL, 0); if (widesize == ERROR_NO_UNICODE_TRANSLATION) { throw std::exception("Invalid UTF-8 sequence."); } if (widesize == 0) { throw std::exception("Error in conversion."); } std::vector resultstring(widesize); int convresult = MultiByteToWideChar(CP_ACP, 0, (char*)strascii.c_str(), -1, &resultstring[0], widesize); if (convresult != widesize) { throw std::exception("La falla!"); } return std::wstring(&resultstring[0]); } //Unicode 转 Utf8 std::string Unicode2Utf8(const std::wstring& widestring) { using namespace std; int utf8size = WideCharToMultiByte(CP_UTF8, 0, widestring.c_str(), -1, NULL, 0, NULL, NULL); if (utf8size == 0) { throw std::exception("Error in conversion."); } std::vector resultstring(utf8size); int convresult = WideCharToMultiByte(CP_UTF8, 0, widestring.c_str(), -1, &resultstring[0], utf8size, NULL, NULL); if (convresult != utf8size) { throw std::exception("La falla!"); } return std::string(&resultstring[0]); } //ascii 转 Utf8 std::string ASCII2UTF_8(std::string& strAsciiCode) { using namespace std; string strRet(""); //先把 ascii 转为 unicode wstring wstr = Acsi2WideByte(strAsciiCode); //最后把 unicode 转为 utf8 strRet = Unicode2Utf8(wstr); return strRet; } int utf8_to_unicode(char* pInput, char** ppOutput) { int outputSize = 0; //记录转换后的Unicode字符串的字节数 *ppOutput = (char*)malloc(strlen(pInput) * 2); //为输出字符串分配足够大的内存空间 memset(*ppOutput, 0, strlen(pInput) * 2); char* tmp = *ppOutput; //临时变量,用于遍历输出字符串 while (*pInput) { if (*pInput > 0x00 && *pInput <= 0x7F) //处理单字节UTF8字符(英文字母、数字) { * tmp = *pInput; tmp++; *tmp = 0; //小端法表示,在高地址填补0 } else if (((*pInput) & 0xE0) == 0xC0) //处理双字节UTF8字符 { char high = *pInput; pInput++; char low = *pInput; if ((low & 0xC0) != 0x80) //检查是否为合法的UTF8字符表示 { return -1; //如果不是则报错 } *tmp = (high << 6) + (low & 0x3F); tmp++; *tmp = (high >> 2) & 0x07; } else if (((*pInput) & 0xF0) == 0xE0) //处理三字节UTF8字符 { char high = *pInput; pInput++; char middle = *pInput; pInput++; char low = *pInput; if (((middle & 0xC0) != 0x80) || ((low & 0xC0) != 0x80)) { return -1; } *tmp = (middle << 6) + (low & 0x7F); tmp++; *tmp = (high << 4) + ((middle >> 2) & 0x0F); } else //对于其他字节数的UTF8字符不进行处理 { return -1; } pInput++; tmp++; outputSize += 2; } *tmp = 0; tmp++; *tmp = 0; return outputSize; }