// CharacterConversion.cpp

#include <iostream>
#include <vector>
#include <windows.h>

#include "CharacterConversion.h"


using namespace std;
// Returns a similarity score in (0, 1] for two byte strings, computed as
// 1 / (1 + d) where d is the Levenshtein edit distance (insert, delete and
// substitute each cost 1). Identical strings score 1.0; the score shrinks as
// the strings diverge. Comparison is per char, so multi-byte encoded text is
// compared byte by byte.
float Similarity(std::string name, std::string Name)
{
	const std::size_t rows = name.size();
	const std::size_t cols = Name.size();

	// Only the previous and the current DP row are ever read, so two rolling
	// rows replace the full (rows + 1) x (cols + 1) table.
	// previous[j] = cost of turning the first i-1 chars of `name` into the
	// first j chars of `Name`.
	std::vector<int> previous(cols + 1);
	std::vector<int> current(cols + 1);

	// Row 0: building j characters from an empty prefix takes j insertions.
	for (std::size_t j = 0; j <= cols; ++j)
		previous[j] = static_cast<int>(j);

	for (std::size_t i = 1; i <= rows; ++i)
	{
		// Column 0: reducing i characters to an empty prefix takes i deletions.
		current[0] = static_cast<int>(i);
		for (std::size_t j = 1; j <= cols; ++j)
		{
			const int deletion = previous[j] + 1;
			const int insertion = current[j - 1] + 1;
			const int substitution = previous[j - 1] + (name[i - 1] != Name[j - 1] ? 1 : 0);

			// Spelled out by hand instead of min(): <windows.h> defines min as
			// a macro unless NOMINMAX is set, which breaks std::min(...) calls.
			int best = deletion < insertion ? deletion : insertion;
			if (substitution < best)
				best = substitution;
			current[j] = best;
		}
		previous.swap(current);
	}

	// After the final swap `previous` holds the last computed row (or row 0
	// when `name` is empty). The division must be done in floating point: the
	// integer form 1 / (1 + d) truncates to 0 for every d > 0.
	return 1.0f / (1.0f + static_cast<float>(previous[cols]));
}


// Converts a NUL-terminated UTF-8 string to a wide (UTF-16) string.
//
// The input is handed to the API via c_str() with length -1, so conversion
// stops at the first NUL byte; anything after an embedded NUL is ignored.
// Throws std::exception (MSVC const char* constructor) when the conversion
// fails.
std::wstring Utf82Unicode(const std::string& utf8string) {
	// Sizing pass: with a null output buffer the API returns the number of
	// wchar_t units required, including the terminating NUL.
	const int widesize = MultiByteToWideChar(CP_UTF8, 0, utf8string.c_str(), -1, nullptr, 0);
	if (widesize == 0)
	{
		// Failure is signalled by a return value of 0; the reason is only
		// available from GetLastError(). Comparing the returned length with
		// ERROR_NO_UNICODE_TRANSLATION would reject a perfectly valid string
		// whose converted length happens to equal that error code.
		if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
		{
			throw std::exception("Invalid UTF-8 sequence.");
		}
		throw std::exception("Error in conversion.");
	}

	std::vector<wchar_t> resultstring(widesize);
	const int convresult = MultiByteToWideChar(CP_UTF8, 0, utf8string.c_str(), -1, resultstring.data(), widesize);
	if (convresult != widesize)
	{
		throw std::exception("La falla!");
	}
	// The buffer ends with the NUL written by the API, so the C-string
	// constructor yields exactly the converted text.
	return std::wstring(resultstring.data());
}


// Converts a wide (UTF-16) string to a narrow string in the system OEM code
// page (CP_OEMCP). Despite the name, the result is not restricted to 7-bit
// ASCII.
// NOTE(review): Acsi2WideByte decodes with CP_ACP, so the two functions are
// not inverses on systems where the OEM and ANSI code pages differ - confirm
// this asymmetry is intended.
//
// Conversion stops at the first NUL in the input (length -1 is passed to the
// API). Throws std::exception (MSVC const char* constructor) on failure; the
// message texts are kept unchanged from the previous implementation.
std::string WideByte2Acsi(std::wstring& wstrcode) {
	// Sizing pass: returns the number of bytes required, including the
	// terminating NUL.
	const int asciisize = WideCharToMultiByte(CP_OEMCP, 0, wstrcode.c_str(), -1, nullptr, 0, nullptr, nullptr);
	if (asciisize == 0)
	{
		// A return value of 0 means failure; the error code must be fetched
		// with GetLastError(). The returned byte count must never be compared
		// with ERROR_NO_UNICODE_TRANSLATION, otherwise a valid string of that
		// exact length is rejected.
		if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
		{
			throw std::exception("Invalid UTF-8 sequence.");
		}
		throw std::exception("Error in conversion.");
	}

	std::vector<char> resultstring(asciisize);
	const int convresult = WideCharToMultiByte(CP_OEMCP, 0, wstrcode.c_str(), -1, resultstring.data(), asciisize, nullptr, nullptr);
	if (convresult != asciisize)
	{
		throw std::exception("La falla!");
	}
	// The buffer is NUL-terminated by the API.
	return std::string(resultstring.data());
}


// Re-encodes UTF-8 text as a narrow string by going through UTF-16:
// Utf82Unicode decodes the input, WideByte2Acsi encodes the result.
// Exceptions thrown by either step propagate to the caller.
std::string UTF_82ASCII(std::string& strUtf8Code) {
	// WideByte2Acsi takes a non-const reference, so the intermediate wide
	// string has to be a named object rather than a temporary.
	std::wstring wideText = Utf82Unicode(strUtf8Code);
	return WideByte2Acsi(wideText);
}


// Converts a narrow string in the system ANSI code page (CP_ACP) to a wide
// (UTF-16) string. Despite the name, the input is not restricted to 7-bit
// ASCII.
//
// Conversion stops at the first NUL in the input (length -1 is passed to the
// API). Throws std::exception (MSVC const char* constructor) on failure; the
// message texts are kept unchanged from the previous implementation.
std::wstring Acsi2WideByte(std::string& strascii) {
	// Sizing pass: returns the number of wchar_t units required, including the
	// terminating NUL. c_str() already yields the const char* the API expects,
	// so no cast is needed.
	const int widesize = MultiByteToWideChar(CP_ACP, 0, strascii.c_str(), -1, nullptr, 0);
	if (widesize == 0)
	{
		// A return value of 0 means failure; the reason comes from
		// GetLastError(). Comparing the returned length with
		// ERROR_NO_UNICODE_TRANSLATION would wrongly reject a valid string
		// whose converted length equals that error code.
		if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
		{
			throw std::exception("Invalid UTF-8 sequence.");
		}
		throw std::exception("Error in conversion.");
	}

	std::vector<wchar_t> resultstring(widesize);
	const int convresult = MultiByteToWideChar(CP_ACP, 0, strascii.c_str(), -1, resultstring.data(), widesize);
	if (convresult != widesize)
	{
		throw std::exception("La falla!");
	}
	// The buffer is NUL-terminated by the API.
	return std::wstring(resultstring.data());
}


// Encodes a wide (UTF-16) string as UTF-8.
//
// The input is passed to the API with length -1, so encoding stops at the
// first NUL character. Throws std::exception (MSVC const char* constructor)
// if the sizing call reports failure or the second call writes a different
// number of bytes than the first one announced.
std::string Unicode2Utf8(const std::wstring& widestring) {
	const wchar_t* const source = widestring.c_str();

	// First call only measures: number of bytes needed, terminator included.
	const int required = WideCharToMultiByte(CP_UTF8, 0, source, -1, nullptr, 0, nullptr, nullptr);
	if (required == 0)
	{
		throw std::exception("Error in conversion.");
	}

	// Second call performs the actual encoding into a scratch buffer.
	std::vector<char> buffer(required);
	const int written = WideCharToMultiByte(CP_UTF8, 0, source, -1, buffer.data(), required, nullptr, nullptr);
	if (written != required)
	{
		throw std::exception("La falla!");
	}

	// The scratch buffer is NUL-terminated, so it can be read as a C string.
	return std::string(buffer.data());
}


// Re-encodes a narrow string as UTF-8 by going through UTF-16:
// Acsi2WideByte decodes the input, Unicode2Utf8 encodes the result.
// Exceptions thrown by either step propagate to the caller.
std::string ASCII2UTF_8(std::string& strAsciiCode) {
	const std::wstring wideText = Acsi2WideByte(strAsciiCode);
	return Unicode2Utf8(wideText);
}

// Converts a NUL-terminated UTF-8 string into 16-bit code units stored as raw
// bytes, low byte first (little-endian), followed by a two-byte zero
// terminator.
//
// pInput   - UTF-8 text to convert. Only 1-, 2- and 3-byte sequences are
//            handled; 4-byte sequences and stray continuation bytes are
//            rejected.
// ppOutput - receives a malloc()-allocated buffer that the caller must
//            release with free(). It is set to a null pointer when an argument
//            is null or the allocation fails. On a malformed-input error the
//            buffer stays allocated and holds the zero-terminated prefix that
//            was converted before the error, so the caller still owns it.
//
// Returns the number of bytes written excluding the terminator, or -1 on
// error.
int utf8_to_unicode(char* pInput, char** ppOutput)
{
	if (ppOutput == nullptr)
	{
		return -1;
	}
	*ppOutput = nullptr;
	if (pInput == nullptr)
	{
		return -1;
	}

	// Worst case is pure ASCII, where every input byte becomes two output
	// bytes. Two more bytes are reserved for the terminator; without them the
	// terminator is written past the end of the allocation.
	const size_t bufferSize = strlen(pInput) * 2 + 2;
	char* buffer = static_cast<char*>(malloc(bufferSize));
	if (buffer == nullptr)
	{
		return -1;
	}
	// Zero-filling up front means the output is terminated at every point,
	// including when an error return cuts the conversion short.
	memset(buffer, 0, bufferSize);
	*ppOutput = buffer;

	// Work on unsigned bytes so that the shifts below never operate on
	// negative values.
	const unsigned char* in = reinterpret_cast<const unsigned char*>(pInput);
	unsigned char* out = reinterpret_cast<unsigned char*>(buffer);
	int outputSize = 0; // bytes written so far, terminator excluded

	while (*in)
	{
		const unsigned char lead = *in;

		if (lead <= 0x7F) // single byte: 0xxxxxxx
		{
			out[0] = lead;
			out[1] = 0; // little-endian: high byte is zero
			in += 1;
		}
		else if ((lead & 0xE0) == 0xC0) // two bytes: 110xxxxx 10yyyyyy
		{
			const unsigned char low = in[1];
			if ((low & 0xC0) != 0x80) // also rejects a premature NUL
			{
				return -1;
			}
			out[0] = static_cast<unsigned char>((lead << 6) | (low & 0x3F));
			out[1] = static_cast<unsigned char>((lead >> 2) & 0x07);
			in += 2;
		}
		else if ((lead & 0xF0) == 0xE0) // three bytes: 1110xxxx 10yyyyyy 10zzzzzz
		{
			// Validate the middle byte before touching the next one: if the
			// input ends right after the lead byte, in[2] lies beyond the
			// terminating NUL.
			const unsigned char middle = in[1];
			if ((middle & 0xC0) != 0x80)
			{
				return -1;
			}
			const unsigned char low = in[2];
			if ((low & 0xC0) != 0x80)
			{
				return -1;
			}
			out[0] = static_cast<unsigned char>((middle << 6) | (low & 0x3F));
			out[1] = static_cast<unsigned char>((lead << 4) | ((middle >> 2) & 0x0F));
			in += 3;
		}
		else // 4-byte sequences and stray continuation bytes are not supported
		{
			return -1;
		}

		out += 2;
		outputSize += 2;
	}

	// The two terminator bytes are already zero from the memset above.
	return outputSize;
}