// // Created on 2024-05-17. // #include "utf_convert.h" #include #include namespace utfcvt { std::u32string UTF8ToUTF32(std::string_view utf8) { if (utf8.empty()) { return {}; } std::vector codePoints; // source may have multiple code points, treat as UTF-8 size_t i = 0; while (i < utf8.size()) { if ((utf8[i] & 0b10000000) == 0) { // 1 byte code point, ASCII int c = (utf8[i] & 0b01111111); codePoints.push_back(c); i++; } else if ((utf8[i] & 0b11100000) == 0b11000000) { // 2 byte code point int c = (utf8[i] & 0b00011111) << 6 | (utf8[i + 1] & 0b00111111); codePoints.push_back(c); i += 2; } else if ((utf8[i] & 0b11110000) == 0b11100000) { // 3 byte code point int c = (utf8[i] & 0b00001111) << 12 | (utf8[i + 1] & 0b00111111) << 6 | (utf8[i + 2] & 0b00111111); codePoints.push_back(c); i += 3; } else { // 4 byte code point int c = (utf8[i] & 0b00000111) << 18 | (utf8[i + 1] & 0b00111111) << 12 | (utf8[i + 2] & 0b00111111) << 6 | (utf8[i + 3] & 0b00111111); codePoints.push_back(c); i += 4; } } return {codePoints.begin(), codePoints.end()}; } std::u32string UTF16ToUTF32(std::u16string_view utf16) { if (utf16.empty()) { return {}; } std::vector codePoints; for (size_t i = 0; i < utf16.size(); ++i) { if (utf16[i] < 0xD800u || utf16[i] > 0xDFFFu) { codePoints.push_back(utf16[i]); } else if (i + 1 < utf16.size()) { codePoints.push_back(0x10000u + ((utf16[i] - 0xD800u) << 10) + (utf16[i + 1] - 0xDC00u)); ++i; } else { codePoints.push_back(0xFFFDu); } } return {codePoints.begin(), codePoints.end()}; } std::u16string UTF32ToUTF16(std::u32string_view utf32) { std::vector utf16; for (auto c: utf32) { if (c <= 0xFFFFu) { utf16.push_back(static_cast(c)); } else if (c <= 0x10FFFFu) { utf16.push_back(static_cast(0xD800u + ((c - 0x10000u) >> 10))); utf16.push_back(static_cast(0xDC00u + ((c - 0x10000u) & 0x3FF))); } else { utf16.push_back(0xFFFDu); } } return {utf16.begin(), utf16.end()}; } std::string UTF32ToUTF8(std::u32string_view utf32) { std::string utf8; for (auto c: utf32) { if (c <= 0x7F) { utf8.push_back(static_cast(c)); } else if (c <= 0x7FF) { utf8.push_back(static_cast(0xC0 | (c >> 6))); utf8.push_back(static_cast(0x80 | (c & 0x3F))); } else if (c <= 0xFFFF) { utf8.push_back(static_cast(0xE0 | (c >> 12))); utf8.push_back(static_cast(0x80 | ((c >> 6) & 0x3F))); utf8.push_back(static_cast(0x80 | (c & 0x3F))); } else { utf8.push_back(static_cast(0xF0 | (c >> 18))); utf8.push_back(static_cast(0x80 | ((c >> 12) & 0x3F))); utf8.push_back(static_cast(0x80 | ((c >> 6) & 0x3F))); utf8.push_back(static_cast(0x80 | (c & 0x3F))); } } return utf8; } std::string UTF16ToMUTF8(std::u16string_view utf16) { std::string mutf8; for (size_t i = 0; i < utf16.size(); ++i) { if (utf16[i] < 0x80) { mutf8.push_back(static_cast(utf16[i])); } else if (utf16[i] < 0x800) { mutf8.push_back(static_cast(0xC0 | (utf16[i] >> 6))); mutf8.push_back(static_cast(0x80 | (utf16[i] & 0x3F))); } else { mutf8.push_back(static_cast(0xE0 | (utf16[i] >> 12))); mutf8.push_back(static_cast(0x80 | ((utf16[i] >> 6) & 0x3F))); mutf8.push_back(static_cast(0x80 | (utf16[i] & 0x3F))); } } return mutf8; } std::u16string MUTF8ToUTF16(std::string_view mutf8) { std::vector utf16; for (size_t i = 0; i < mutf8.size(); ++i) { if ((mutf8[i] & 0b10000000) == 0) { // 1 byte code point, ASCII utf16.push_back(mutf8[i]); } else if ((mutf8[i] & 0b11100000) == 0b11000000) { // 2 byte code point utf16.push_back(uint16_t((mutf8[i] & 0b00011111u) << 6) | uint16_t(mutf8[i + 1] & 0b00111111u)); i++; } else if ((mutf8[i] & 0b11110000) == 0b11100000) { // 3 byte code point utf16.push_back(uint16_t((mutf8[i] & 0b00001111u) << 12) | uint16_t((mutf8[i + 1] & 0b00111111u) << 6) | uint16_t(mutf8[i + 2] & 0b00111111u)); i += 2; } } return {utf16.begin(), utf16.end()}; } std::string UTF16ToUTF8(std::u16string_view utf16) { return UTF32ToUTF8(UTF16ToUTF32(utf16)); } std::u16string UTF8ToUTF16(std::string_view utf8) { return UTF32ToUTF16(UTF8ToUTF32(utf8)); } #if defined(_WIN32) || defined(_WIN64) // for windows, they are not implemented on linux static_assert(sizeof(wchar_t) == sizeof(char16_t)); std::wstring UTF16ToWide(std::u16string_view utf16) { return {reinterpret_cast(utf16.data()), utf16.size()}; } std::u16string WideToUTF16(std::wstring_view wide) { return {reinterpret_cast(wide.data()), wide.size()}; } std::wstring UTF8ToWide(std::string_view utf8) { auto utf16 = UTF8ToUTF16(utf8); return UTF16ToWide(utf16); } std::string WideToUTF8(std::wstring_view wide) { return UTF16ToUTF8(std::u16string_view{reinterpret_cast(wide.data()), wide.size()}); } #endif }