CharacterConversion.cpp 4.92 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <windows.h>

#include "CharacterConversion.h"


using namespace std;
// Computes a similarity score in (0, 1] between two strings based on their
// Levenshtein (edit) distance d: the score is 1 / (1 + d).
//
// Identical strings yield 1.0; the score decreases towards 0 as the number of
// single-character insertions, deletions and substitutions needed to turn
// `name` into `Name` grows. Comparison is done per `char`, i.e. per byte.
//
// @param name  first string
// @param Name  second string
// @return      1 / (1 + editDistance(name, Name))
float Similarity(std::string name, std::string Name)
{
	const int n = static_cast<int>(name.size());
	const int m = static_cast<int>(Name.size());

	// dp[x][y] is the minimum number of edits needed to turn the first x
	// characters of `name` into the first y characters of `Name`.
	std::vector<std::vector<int>> dp(n + 1, std::vector<int>(m + 1, 0));

	// Turning an empty prefix into y characters takes y insertions, and
	// turning x characters into an empty prefix takes x deletions.
	for (int j = 1; j <= m; ++j)
		dp[0][j] = j;
	for (int i = 1; i <= n; ++i)
		dp[i][0] = i;

	for (int i = 1; i <= n; ++i)
	{
		for (int j = 1; j <= m; ++j)
		{
			const int deletion = dp[i - 1][j] + 1;
			const int insertion = dp[i][j - 1] + 1;
			const int substitution = dp[i - 1][j - 1] + (name[i - 1] != Name[j - 1] ? 1 : 0);
			dp[i][j] = std::min(std::min(deletion, insertion), substitution);
		}
	}

	// Divide in floating point: integer division would collapse every
	// non-zero distance to a score of 0.
	return 1.0f / static_cast<float>(1 + dp[n][m]);
}


// Converts a UTF-8 encoded string to a wide (wchar_t) string using
// MultiByteToWideChar with CP_UTF8.
//
// The input is passed as a NUL-terminated C string, so conversion stops at the
// first embedded '\0'.
//
// @param utf8string  UTF-8 encoded input
// @return            the converted wide string (without the terminator)
// @throws std::runtime_error (derived from std::exception) if the API reports
//         failure or the two conversion passes disagree on the length
std::wstring Utf82Unicode(const std::string& utf8string) {
	// First pass: query the required length in wchar_t units. Because the
	// input length is given as -1, the count includes the terminating NUL.
	const int widesize = MultiByteToWideChar(CP_UTF8, 0, utf8string.c_str(), -1, NULL, 0);
	// The return value is a character count; 0 is the only failure indicator.
	// It must not be compared against error codes such as
	// ERROR_NO_UNICODE_TRANSLATION, which would reject valid inputs whose
	// converted length happens to equal that constant.
	if (widesize == 0)
	{
		throw std::runtime_error("Error in conversion.");
	}

	// Second pass: perform the conversion into a buffer of exactly that size.
	std::vector<wchar_t> resultstring(widesize);
	const int convresult = MultiByteToWideChar(CP_UTF8, 0, utf8string.c_str(), -1, &resultstring[0], widesize);
	if (convresult != widesize)
	{
		throw std::runtime_error("La falla!");
	}
	// The buffer is NUL-terminated, so the C-string constructor drops the terminator.
	return std::wstring(&resultstring[0]);
}


// Converts a wide (wchar_t) string to a narrow string in the OEM code page
// using WideCharToMultiByte with CP_OEMCP.
//
// The input is passed as a NUL-terminated C string, so conversion stops at the
// first embedded L'\0'.
// NOTE(review): this uses CP_OEMCP while Acsi2WideByte uses CP_ACP, so a round
// trip through both may not be lossless — confirm which code page is intended.
//
// @param wstrcode  wide string to convert (not modified)
// @return          the converted narrow string (without the terminator)
// @throws std::runtime_error (derived from std::exception) if the API reports
//         failure or the two conversion passes disagree on the length
std::string WideByte2Acsi(std::wstring& wstrcode) {
	// First pass: query the required size in bytes, including the terminating NUL.
	const int asciisize = WideCharToMultiByte(CP_OEMCP, 0, wstrcode.c_str(), -1, NULL, 0, NULL, NULL);
	// The return value is a byte count; 0 is the only failure indicator. It
	// must not be compared against error codes such as
	// ERROR_NO_UNICODE_TRANSLATION, which would reject valid inputs whose
	// converted size happens to equal that constant.
	if (asciisize == 0)
	{
		throw std::runtime_error("Error in conversion.");
	}

	// Second pass: perform the conversion into a buffer of exactly that size.
	std::vector<char> resultstring(asciisize);
	const int convresult = WideCharToMultiByte(CP_OEMCP, 0, wstrcode.c_str(), -1, &resultstring[0], asciisize, NULL, NULL);
	if (convresult != asciisize)
	{
		throw std::runtime_error("La falla!");
	}
	// The buffer is NUL-terminated, so the C-string constructor drops the terminator.
	return std::string(&resultstring[0]);
}



// Converts a UTF-8 string to a narrow code-page string in two steps:
// UTF-8 -> wide string (Utf82Unicode), then wide string -> narrow string
// (WideByte2Acsi). Any exception thrown by either step propagates to the caller.
std::string UTF_82ASCII(std::string& strUtf8Code) {
	// Step 1: decode the UTF-8 bytes into a wide string.
	std::wstring wideText = Utf82Unicode(strUtf8Code);
	// Step 2: encode the wide string in the narrow code page.
	return WideByte2Acsi(wideText);
}



// Converts a narrow string in the system ANSI code page to a wide (wchar_t)
// string using MultiByteToWideChar with CP_ACP.
//
// The input is passed as a NUL-terminated C string, so conversion stops at the
// first embedded '\0'.
//
// @param strascii  narrow input string (not modified)
// @return          the converted wide string (without the terminator)
// @throws std::runtime_error (derived from std::exception) if the API reports
//         failure or the two conversion passes disagree on the length
std::wstring Acsi2WideByte(std::string& strascii) {
	// First pass: query the required length in wchar_t units, including the
	// terminating NUL.
	const int widesize = MultiByteToWideChar(CP_ACP, 0, strascii.c_str(), -1, NULL, 0);
	// The return value is a character count; 0 is the only failure indicator.
	// It must not be compared against error codes such as
	// ERROR_NO_UNICODE_TRANSLATION, which would reject valid inputs whose
	// converted length happens to equal that constant.
	if (widesize == 0)
	{
		throw std::runtime_error("Error in conversion.");
	}

	// Second pass: perform the conversion into a buffer of exactly that size.
	std::vector<wchar_t> resultstring(widesize);
	const int convresult = MultiByteToWideChar(CP_ACP, 0, strascii.c_str(), -1, &resultstring[0], widesize);
	if (convresult != widesize)
	{
		throw std::runtime_error("La falla!");
	}
	// The buffer is NUL-terminated, so the C-string constructor drops the terminator.
	return std::wstring(&resultstring[0]);
}


// Converts a wide (wchar_t) string to a UTF-8 encoded string using
// WideCharToMultiByte with CP_UTF8.
//
// The input is passed as a NUL-terminated C string, so conversion stops at the
// first embedded L'\0'.
//
// @param widestring  wide string to convert
// @return            the UTF-8 encoded result (without the terminator)
// @throws std::runtime_error (derived from std::exception) if the API reports
//         failure or the two conversion passes disagree on the length
std::string Unicode2Utf8(const std::wstring& widestring) {
	// First pass: query the required size in bytes, including the terminating NUL.
	const int utf8size = WideCharToMultiByte(CP_UTF8, 0, widestring.c_str(), -1, NULL, 0, NULL, NULL);
	if (utf8size == 0)
	{
		// std::runtime_error is used because constructing std::exception from
		// a message string is a compiler-specific extension.
		throw std::runtime_error("Error in conversion.");
	}

	// Second pass: perform the conversion into a buffer of exactly that size.
	std::vector<char> resultstring(utf8size);
	const int convresult = WideCharToMultiByte(CP_UTF8, 0, widestring.c_str(), -1, &resultstring[0], utf8size, NULL, NULL);
	if (convresult != utf8size)
	{
		throw std::runtime_error("La falla!");
	}
	// The buffer is NUL-terminated, so the C-string constructor drops the terminator.
	return std::string(&resultstring[0]);
}


// Converts a narrow code-page string to UTF-8 in two steps:
// narrow string -> wide string (Acsi2WideByte), then wide string -> UTF-8
// (Unicode2Utf8). Any exception thrown by either step propagates to the caller.
std::string ASCII2UTF_8(std::string& strAsciiCode) {
	// Step 1: decode the narrow string into a wide string.
	const std::wstring wideText = Acsi2WideByte(strAsciiCode);
	// Step 2: encode the wide string as UTF-8.
	return Unicode2Utf8(wideText);
}

// Converts a NUL-terminated UTF-8 string into little-endian 16-bit code units.
//
// Only 1-, 2- and 3-byte UTF-8 sequences (code points up to U+FFFF) are
// supported; any other lead byte or a malformed/truncated sequence is an error.
//
// @param pInput    NUL-terminated UTF-8 input (not modified)
// @param ppOutput  receives a malloc'ed buffer holding the converted code
//                  units followed by a two-byte zero terminator; the caller
//                  releases it with free()
// @return          number of bytes written, excluding the terminator, or -1 on
//                  error. On a decoding error *ppOutput still points to the
//                  allocated, zero-terminated buffer holding the code units
//                  converted so far; if an argument is null or the allocation
//                  fails, *ppOutput is set to null.
int utf8_to_unicode(char* pInput, char** ppOutput)
{
	if (ppOutput == NULL)
	{
		return -1;
	}
	*ppOutput = NULL;
	if (pInput == NULL)
	{
		return -1;
	}

	// Worst case is pure ASCII: every input byte becomes two output bytes.
	// Two extra bytes hold the terminator, which the data area alone cannot
	// accommodate (e.g. for empty or all-ASCII input).
	const size_t capacity = strlen(pInput) * 2 + 2;
	*ppOutput = static_cast<char*>(malloc(capacity));
	if (*ppOutput == NULL)
	{
		return -1;
	}
	memset(*ppOutput, 0, capacity);

	// Work on unsigned bytes so shifts and masks are not affected by the
	// signedness of plain char.
	const unsigned char* in = reinterpret_cast<const unsigned char*>(pInput);
	unsigned char* out = reinterpret_cast<unsigned char*>(*ppOutput);
	int outputSize = 0; // bytes written so far, excluding the terminator

	while (*in)
	{
		const unsigned char lead = *in;
		unsigned int codeUnit = 0;

		if (lead <= 0x7F) // single-byte sequence (ASCII)
		{
			codeUnit = lead;
			in += 1;
		}
		else if ((lead & 0xE0) == 0xC0) // two-byte sequence: 110xxxxx 10xxxxxx
		{
			const unsigned char low = in[1];
			if ((low & 0xC0) != 0x80) // also rejects a terminator here
			{
				return -1;
			}
			codeUnit = ((lead & 0x1Fu) << 6) | (low & 0x3Fu);
			in += 2;
		}
		else if ((lead & 0xF0) == 0xE0) // three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
		{
			// Validate the second byte before touching the third so that a
			// sequence truncated by the terminator is never read past the end.
			const unsigned char middle = in[1];
			if ((middle & 0xC0) != 0x80)
			{
				return -1;
			}
			const unsigned char low = in[2];
			if ((low & 0xC0) != 0x80)
			{
				return -1;
			}
			codeUnit = ((lead & 0x0Fu) << 12) | ((middle & 0x3Fu) << 6) | (low & 0x3Fu);
			in += 3;
		}
		else // stray continuation byte or a sequence longer than three bytes
		{
			return -1;
		}

		// Little-endian: low byte at the lower address.
		out[0] = static_cast<unsigned char>(codeUnit & 0xFF);
		out[1] = static_cast<unsigned char>((codeUnit >> 8) & 0xFF);
		out += 2;
		outputSize += 2;
	}

	// Two-byte zero terminator.
	out[0] = 0;
	out[1] = 0;
	return outputSize;
}