add mutf8
This commit is contained in:
@@ -5,6 +5,7 @@
|
|||||||
#include "utf_convert.h"
|
#include "utf_convert.h"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
namespace utfcvt {
|
namespace utfcvt {
|
||||||
|
|
||||||
@@ -46,30 +47,30 @@ std::u32string UTF16ToUTF32(std::u16string_view utf16) {
|
|||||||
if (utf16.empty()) {
|
if (utf16.empty()) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
std::vector<int> codePoints;
|
std::vector<uint32_t> codePoints;
|
||||||
for (size_t i = 0; i < utf16.size(); ++i) {
|
for (size_t i = 0; i < utf16.size(); ++i) {
|
||||||
if (utf16[i] < 0xD800 || utf16[i] > 0xDFFF) {
|
if (utf16[i] < 0xD800u || utf16[i] > 0xDFFFu) {
|
||||||
codePoints.push_back(utf16[i]);
|
codePoints.push_back(utf16[i]);
|
||||||
} else if (i + 1 < utf16.size()) {
|
} else if (i + 1 < utf16.size()) {
|
||||||
codePoints.push_back(0x10000 + ((utf16[i] - 0xD800) << 10) + (utf16[i + 1] - 0xDC00));
|
codePoints.push_back(0x10000u + ((utf16[i] - 0xD800u) << 10) + (utf16[i + 1] - 0xDC00u));
|
||||||
++i;
|
++i;
|
||||||
} else {
|
} else {
|
||||||
codePoints.push_back(0xFFFD);
|
codePoints.push_back(0xFFFDu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return {codePoints.begin(), codePoints.end()};
|
return {codePoints.begin(), codePoints.end()};
|
||||||
}
|
}
|
||||||
|
|
||||||
std::u16string UTF32ToUTF16(std::u32string_view utf32){
|
std::u16string UTF32ToUTF16(std::u32string_view utf32) {
|
||||||
std::vector<char16_t> utf16;
|
std::vector<uint16_t> utf16;
|
||||||
for (auto c: utf32) {
|
for (auto c: utf32) {
|
||||||
if (c <= 0xFFFF) {
|
if (c <= 0xFFFFu) {
|
||||||
utf16.push_back(static_cast<char16_t>(c));
|
utf16.push_back(static_cast<char16_t>(c));
|
||||||
} else if (c <= 0x10FFFF) {
|
} else if (c <= 0x10FFFFu) {
|
||||||
utf16.push_back(static_cast<char16_t>(0xD800 + ((c - 0x10000) >> 10)));
|
utf16.push_back(static_cast<char16_t>(0xD800u + ((c - 0x10000u) >> 10)));
|
||||||
utf16.push_back(static_cast<char16_t>(0xDC00 + ((c - 0x10000) & 0x3FF)));
|
utf16.push_back(static_cast<char16_t>(0xDC00u + ((c - 0x10000u) & 0x3FF)));
|
||||||
} else {
|
} else {
|
||||||
utf16.push_back(0xFFFD);
|
utf16.push_back(0xFFFDu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return {utf16.begin(), utf16.end()};
|
return {utf16.begin(), utf16.end()};
|
||||||
@@ -97,6 +98,43 @@ std::string UTF32ToUTF8(std::u32string_view utf32) {
|
|||||||
return utf8;
|
return utf8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Encodes a UTF-16 string as Modified UTF-8 (the string encoding used by the
/// JVM class-file format and JNI).
///
/// Differences from standard UTF-8:
///  - U+0000 is written as the two-byte sequence C0 80, so the result never
///    contains an embedded NUL byte.
///  - Every UTF-16 code unit is encoded on its own; a surrogate pair therefore
///    becomes two three-byte sequences instead of one four-byte sequence.
///
/// @param utf16 UTF-16 code units to encode; surrogates are not validated.
/// @return The Modified UTF-8 byte sequence.
std::string UTF16ToMUTF8(std::u16string_view utf16) {
    std::string mutf8;
    // Every code unit produces at least one byte; this avoids the first few regrowths.
    mutf8.reserve(utf16.size());
    for (const char16_t unit : utf16) {
        if (unit != 0 && unit < 0x80) {
            // Plain ASCII (U+0001..U+007F): single byte.
            mutf8.push_back(static_cast<char>(unit));
        } else if (unit < 0x800) {
            // Two-byte form. U+0000 lands here on purpose and yields C0 80.
            mutf8.push_back(static_cast<char>(0xC0 | (unit >> 6)));
            mutf8.push_back(static_cast<char>(0x80 | (unit & 0x3F)));
        } else {
            // Three-byte form; also used for each half of a surrogate pair.
            mutf8.push_back(static_cast<char>(0xE0 | (unit >> 12)));
            mutf8.push_back(static_cast<char>(0x80 | ((unit >> 6) & 0x3F)));
            mutf8.push_back(static_cast<char>(0x80 | (unit & 0x3F)));
        }
    }
    return mutf8;
}
|
||||||
|
|
||||||
|
/// Decodes a Modified UTF-8 byte sequence into UTF-16 code units.
///
/// One-, two- and three-byte sequences are decoded to a single UTF-16 code
/// unit each (so C0 80 yields U+0000 and encoded surrogate halves come back
/// as the original surrogate code units).
///
/// Malformed input is mapped to U+FFFD instead of being read out of bounds or
/// dropped:
///  - a two-/three-byte lead whose continuation bytes are cut off by the end
///    of the input produces one U+FFFD and only the lead byte is skipped;
///  - a byte that cannot start a sequence here (a stray continuation byte or
///    a four-byte lead) produces one U+FFFD.
/// Continuation bytes that are present are not validated; only their low six
/// bits are used.
///
/// @param mutf8 Modified UTF-8 bytes to decode.
/// @return The decoded UTF-16 string.
std::u16string MUTF8ToUTF16(std::string_view mutf8) {
    constexpr char16_t kReplacement = 0xFFFDu;
    const size_t size = mutf8.size();

    std::u16string utf16;
    // Each output unit consumes at least one input byte.
    utf16.reserve(size);

    // Low six payload bits of the byte at `pos`; caller guarantees pos < size.
    const auto payload = [&mutf8](size_t pos) -> unsigned {
        return static_cast<unsigned char>(mutf8[pos]) & 0x3Fu;
    };

    for (size_t i = 0; i < size; ++i) {
        // Go through unsigned char so a signed `char` never sign-extends.
        const unsigned lead = static_cast<unsigned char>(mutf8[i]);
        if ((lead & 0x80u) == 0) {
            // 1 byte code point, ASCII
            utf16.push_back(static_cast<char16_t>(lead));
        } else if ((lead & 0xE0u) == 0xC0u) {
            // 2 byte code point
            if (size - i < 2) {
                utf16.push_back(kReplacement);
                continue;
            }
            utf16.push_back(static_cast<char16_t>(((lead & 0x1Fu) << 6) | payload(i + 1)));
            i += 1;
        } else if ((lead & 0xF0u) == 0xE0u) {
            // 3 byte code point
            if (size - i < 3) {
                utf16.push_back(kReplacement);
                continue;
            }
            utf16.push_back(static_cast<char16_t>(
                ((lead & 0x0Fu) << 12) | (payload(i + 1) << 6) | payload(i + 2)));
            i += 2;
        } else {
            // Stray continuation byte or a lead byte Modified UTF-8 never uses.
            utf16.push_back(kReplacement);
        }
    }
    return utf16;
}
|
||||||
|
|
||||||
// Converts UTF-16 to UTF-8 by chaining the two existing converters:
// the input is first turned into UTF-32 with UTF16ToUTF32, and that
// result is then encoded with UTF32ToUTF8.
std::string UTF16ToUTF8(std::u16string_view utf16) {
|
std::string UTF16ToUTF8(std::u16string_view utf16) {
|
||||||
return UTF32ToUTF8(UTF16ToUTF32(utf16));
|
return UTF32ToUTF8(UTF16ToUTF32(utf16));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,18 +10,22 @@
|
|||||||
|
|
||||||
namespace utfcvt {
|
namespace utfcvt {
|
||||||
|
|
||||||
std::string UTF16ToUTF8(std::u16string_view utf16);
|
|
||||||
|
|
||||||
std::u16string UTF8ToUTF16(std::string_view utf8);
|
std::u16string UTF8ToUTF16(std::string_view utf8);
|
||||||
|
|
||||||
std::u32string UTF8ToUTF32(std::string_view utf8);
|
std::u32string UTF8ToUTF32(std::string_view utf8);
|
||||||
|
|
||||||
std::string UTF32ToUTF8(std::u32string_view utf32);
|
std::string UTF16ToUTF8(std::u16string_view utf16);
|
||||||
|
|
||||||
std::u32string UTF16ToUTF32(std::u16string_view utf16);
|
std::u32string UTF16ToUTF32(std::u16string_view utf16);
|
||||||
|
|
||||||
|
std::string UTF32ToUTF8(std::u32string_view utf32);
|
||||||
|
|
||||||
std::u16string UTF32ToUTF16(std::u32string_view utf32);
|
std::u16string UTF32ToUTF16(std::u32string_view utf32);
|
||||||
|
|
||||||
|
std::string UTF16ToMUTF8(std::u16string_view utf16);
|
||||||
|
|
||||||
|
std::u16string MUTF8ToUTF16(std::string_view mutf8);
|
||||||
|
|
||||||
// for windows, they are not implemented on linux
|
// for windows, they are not implemented on linux
|
||||||
|
|
||||||
std::wstring UTF8ToWide(std::string_view utf8);
|
std::wstring UTF8ToWide(std::string_view utf8);
|
||||||
|
|||||||
Reference in New Issue
Block a user