From f9e5b2074b187b57f4a935e0e66967cc3020e9622a53e88d411873cb8b9555fd Mon Sep 17 00:00:00 2001
From: ACh Sulfate
Date: Fri, 17 May 2024 22:55:06 +0800
Subject: [PATCH] add mutf8

---
Review notes (free text in this position is ignored by `git am`; no hunk
content has been changed):

* NOTE(review): this copy of the patch is missing every token that would be
  written between angle brackets: both `#include` lines have no header name,
  `std::vector` and `static_cast` have no template argument, and the author
  header has no address. Presumably they were lost in extraction - restore
  them from the original commit before applying.
* UTF16ToMUTF8 converts each UTF-16 code unit on its own into a 1-, 2- or
  3-byte sequence, so a surrogate pair comes out as two 3-byte sequences.
  Code unit 0 takes the `< 0x80` branch and is written as a single 0x00
  byte; the JVM definition of modified UTF-8 uses the two-byte form
  0xC0 0x80 for U+0000, so embedded NULs are not encoded that way yet.
* MUTF8ToUTF16 reads mutf8[i + 1] and mutf8[i + 2] without comparing the
  index against mutf8.size(); input that ends inside a 2- or 3-byte sequence
  indexes past the end of the view. Bytes that match none of the three
  lead-byte patterns produce no output at all.
* UTF16ToUTF32 combines any unit in 0xD800..0xDFFF with the unit that
  follows it and does not check that the two are a high surrogate followed
  by a low surrogate.

 utf_convert.cc | 60 +++++++++++++++++++++++++++++++++++++++++---------
 utf_convert.h  | 10 ++++++---
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/utf_convert.cc b/utf_convert.cc
index f5bdc42..8919860 100644
--- a/utf_convert.cc
+++ b/utf_convert.cc
@@ -5,6 +5,7 @@
 #include "utf_convert.h"

 #include
+#include

 namespace utfcvt {

@@ -46,30 +47,30 @@ std::u32string UTF16ToUTF32(std::u16string_view utf16) {
     if (utf16.empty()) {
         return {};
     }
-    std::vector codePoints;
+    std::vector codePoints;
     for (size_t i = 0; i < utf16.size(); ++i) {
-        if (utf16[i] < 0xD800 || utf16[i] > 0xDFFF) {
+        if (utf16[i] < 0xD800u || utf16[i] > 0xDFFFu) {
             codePoints.push_back(utf16[i]);
         } else if (i + 1 < utf16.size()) {
-            codePoints.push_back(0x10000 + ((utf16[i] - 0xD800) << 10) + (utf16[i + 1] - 0xDC00));
+            codePoints.push_back(0x10000u + ((utf16[i] - 0xD800u) << 10) + (utf16[i + 1] - 0xDC00u));
             ++i;
         } else {
-            codePoints.push_back(0xFFFD);
+            codePoints.push_back(0xFFFDu);
         }
     }
     return {codePoints.begin(), codePoints.end()};
 }

-std::u16string UTF32ToUTF16(std::u32string_view utf32){
-    std::vector utf16;
+std::u16string UTF32ToUTF16(std::u32string_view utf32) {
+    std::vector utf16;
     for (auto c: utf32) {
-        if (c <= 0xFFFF) {
+        if (c <= 0xFFFFu) {
             utf16.push_back(static_cast(c));
-        } else if (c <= 0x10FFFF) {
-            utf16.push_back(static_cast(0xD800 + ((c - 0x10000) >> 10)));
-            utf16.push_back(static_cast(0xDC00 + ((c - 0x10000) & 0x3FF)));
+        } else if (c <= 0x10FFFFu) {
+            utf16.push_back(static_cast(0xD800u + ((c - 0x10000u) >> 10)));
+            utf16.push_back(static_cast(0xDC00u + ((c - 0x10000u) & 0x3FF)));
         } else {
-            utf16.push_back(0xFFFD);
+            utf16.push_back(0xFFFDu);
         }
     }
     return {utf16.begin(), utf16.end()};
@@ -97,6 +98,43 @@ std::string UTF32ToUTF8(std::u32string_view utf32) {
     return utf8;
 }

+std::string UTF16ToMUTF8(std::u16string_view utf16) {
+    std::string mutf8;
+    for (size_t i = 0; i < utf16.size(); ++i) {
+        if (utf16[i] < 0x80) {
+            mutf8.push_back(static_cast(utf16[i]));
+        } else if (utf16[i] < 0x800) {
+            mutf8.push_back(static_cast(0xC0 | (utf16[i] >> 6)));
+            mutf8.push_back(static_cast(0x80 | (utf16[i] & 0x3F)));
+        } else {
+            mutf8.push_back(static_cast(0xE0 | (utf16[i] >> 12)));
+            mutf8.push_back(static_cast(0x80 | ((utf16[i] >> 6) & 0x3F)));
+            mutf8.push_back(static_cast(0x80 | (utf16[i] & 0x3F)));
+        }
+    }
+    return mutf8;
+}
+
+std::u16string MUTF8ToUTF16(std::string_view mutf8) {
+    std::vector utf16;
+    for (size_t i = 0; i < mutf8.size(); ++i) {
+        if ((mutf8[i] & 0b10000000) == 0) {
+            // 1 byte code point, ASCII
+            utf16.push_back(mutf8[i]);
+        } else if ((mutf8[i] & 0b11100000) == 0b11000000) {
+            // 2 byte code point
+            utf16.push_back(uint16_t((mutf8[i] & 0b00011111u) << 6) | uint16_t(mutf8[i + 1] & 0b00111111u));
+            i++;
+        } else if ((mutf8[i] & 0b11110000) == 0b11100000) {
+            // 3 byte code point
+            utf16.push_back(uint16_t((mutf8[i] & 0b00001111u) << 12) |
+                            uint16_t((mutf8[i + 1] & 0b00111111u) << 6) | uint16_t(mutf8[i + 2] & 0b00111111u));
+            i += 2;
+        }
+    }
+    return {utf16.begin(), utf16.end()};
+}
+
 std::string UTF16ToUTF8(std::u16string_view utf16) {
     return UTF32ToUTF8(UTF16ToUTF32(utf16));
 }
diff --git a/utf_convert.h b/utf_convert.h
index dae8bca..164da10 100644
--- a/utf_convert.h
+++ b/utf_convert.h
@@ -10,18 +10,22 @@

 namespace utfcvt {

-std::string UTF16ToUTF8(std::u16string_view utf16);
-
 std::u16string UTF8ToUTF16(std::string_view utf8);

 std::u32string UTF8ToUTF32(std::string_view utf8);

-std::string UTF32ToUTF8(std::u32string_view utf32);
+std::string UTF16ToUTF8(std::u16string_view utf16);

 std::u32string UTF16ToUTF32(std::u16string_view utf16);

+std::string UTF32ToUTF8(std::u32string_view utf32);
+
 std::u16string UTF32ToUTF16(std::u32string_view utf32);

+std::string UTF16ToMUTF8(std::u16string_view utf16);
+
+std::u16string MUTF8ToUTF16(std::string_view mutf8);
+
 // for windows, they are not implemented on linux
 std::wstring UTF8ToWide(std::string_view utf8);