initial commit
This commit is contained in:
133
utf_convert.cc
Normal file
133
utf_convert.cc
Normal file
@@ -0,0 +1,133 @@
|
||||
//
|
||||
// Created on 2024-05-17.
|
||||
//
|
||||
|
||||
#include "utf_convert.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace utfcvt {
|
||||
|
||||
// Decodes a UTF-8 byte sequence into UTF-32 code points.
//
// Well-formed 1- to 4-byte sequences are decoded as before. Malformed input
// never causes a read past the end of `utf8`: an invalid lead byte, a stray
// continuation byte, a sequence cut short by the end of the input, or a
// sequence whose trailing bytes are not continuation bytes yields one U+FFFD
// for the offending lead byte, and decoding resumes at the following byte.
//
// Overlong encodings, surrogate code points and values above U+10FFFF that
// are structurally valid are NOT rejected here; they are passed through.
//
// @param utf8  Bytes to decode; may be empty.
// @return      The decoded code points (empty for empty input).
std::u32string UTF8ToUTF32(std::string_view utf8) {
    constexpr char32_t kReplacement = 0xFFFD;

    std::u32string codePoints;
    // Every code point consumes at least one byte, so this is an upper bound.
    codePoints.reserve(utf8.size());

    size_t i = 0;
    while (i < utf8.size()) {
        // Work on unsigned bytes so the bit tests do not depend on whether
        // plain `char` is signed.
        const auto lead = static_cast<unsigned char>(utf8[i]);

        // Classify the lead byte: sequence length and its payload bits.
        // `length` stays 0 for bytes that cannot start a sequence.
        size_t length = 0;
        char32_t c = 0;
        if ((lead & 0b10000000) == 0) {
            length = 1;  // ASCII
            c = lead;
        } else if ((lead & 0b11100000) == 0b11000000) {
            length = 2;
            c = static_cast<char32_t>(lead & 0b00011111);
        } else if ((lead & 0b11110000) == 0b11100000) {
            length = 3;
            c = static_cast<char32_t>(lead & 0b00001111);
        } else if ((lead & 0b11111000) == 0b11110000) {
            length = 4;
            c = static_cast<char32_t>(lead & 0b00000111);
        }

        // The whole sequence must fit inside the remaining input.
        bool valid = length != 0 && length <= utf8.size() - i;

        // Fold in the continuation bytes, each of which must look like 10xxxxxx.
        for (size_t k = 1; valid && k < length; ++k) {
            const auto next = static_cast<unsigned char>(utf8[i + k]);
            if ((next & 0b11000000) != 0b10000000) {
                valid = false;
            } else {
                c = (c << 6) | static_cast<char32_t>(next & 0b00111111);
            }
        }

        if (valid) {
            codePoints.push_back(c);
            i += length;
        } else {
            // Skip only the offending byte so decoding can resynchronise.
            codePoints.push_back(kReplacement);
            ++i;
        }
    }
    return codePoints;
}
|
||||
|
||||
// Decodes UTF-16 code units into UTF-32 code points.
//
// Code units outside the surrogate range are copied through. A high
// surrogate (D800..DBFF) immediately followed by a low surrogate
// (DC00..DFFF) is combined into one supplementary code point. Any other
// surrogate (a lone low surrogate, a high surrogate at the end of input, or
// a high surrogate followed by a non-low unit) is replaced by U+FFFD, and
// the unit after it is decoded on its own rather than being swallowed.
//
// @param utf16  Code units to decode; may be empty.
// @return       The decoded code points (empty for empty input).
std::u32string UTF16ToUTF32(std::u16string_view utf16) {
    constexpr char32_t kReplacement = 0xFFFD;

    std::u32string codePoints;
    // Each code point consumes at least one code unit, so this is an upper bound.
    codePoints.reserve(utf16.size());

    for (size_t i = 0; i < utf16.size(); ++i) {
        const char32_t unit = utf16[i];

        if (unit < 0xD800 || unit > 0xDFFF) {
            codePoints.push_back(unit);
            continue;
        }

        // Only a high surrogate may open a pair, and only a low surrogate
        // may close it.
        const bool isHigh = unit <= 0xDBFF;
        if (isHigh && i + 1 < utf16.size()) {
            const char32_t low = utf16[i + 1];
            if (low >= 0xDC00 && low <= 0xDFFF) {
                codePoints.push_back(0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00));
                ++i;  // the low surrogate has been consumed as well
                continue;
            }
        }

        codePoints.push_back(kReplacement);
    }
    return codePoints;
}
|
||||
|
||||
// Encodes UTF-32 code points as UTF-16.
//
// Code points up to U+FFFF become a single code unit and code points up to
// U+10FFFF become a surrogate pair. Values that are not Unicode scalar
// values (surrogate code points D800..DFFF and anything above U+10FFFF)
// are replaced by U+FFFD so the result is always well-formed UTF-16.
//
// @param utf32  Code points to encode; may be empty.
// @return       The UTF-16 encoding (empty for empty input).
std::u16string UTF32ToUTF16(std::u32string_view utf32) {
    constexpr char16_t kReplacement = 0xFFFD;

    std::u16string utf16;
    // Lower bound on the output size; supplementary code points need two units.
    utf16.reserve(utf32.size());

    for (const char32_t c : utf32) {
        const bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
        if (isSurrogate || c > 0x10FFFF) {
            utf16.push_back(kReplacement);
        } else if (c <= 0xFFFF) {
            utf16.push_back(static_cast<char16_t>(c));
        } else {
            // Split the 20-bit offset from U+10000 into two 10-bit halves.
            const char32_t offset = c - 0x10000;
            utf16.push_back(static_cast<char16_t>(0xD800 + (offset >> 10)));
            utf16.push_back(static_cast<char16_t>(0xDC00 + (offset & 0x3FF)));
        }
    }
    return utf16;
}
|
||||
|
||||
// Encodes UTF-32 code points as UTF-8.
//
// Each code point is written as a 1- to 4-byte sequence. Values that are
// not Unicode scalar values (surrogate code points D800..DFFF and anything
// above U+10FFFF) are replaced by U+FFFD, matching what UTF32ToUTF16 does
// for out-of-range input, so the result is always well-formed UTF-8.
//
// @param utf32  Code points to encode; may be empty.
// @return       The UTF-8 encoding (empty for empty input).
std::string UTF32ToUTF8(std::u32string_view utf32) {
    constexpr char32_t kReplacement = 0xFFFD;

    std::string utf8;
    // Lower bound on the output size; non-ASCII code points need more bytes.
    utf8.reserve(utf32.size());

    for (char32_t c : utf32) {
        const bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
        if (isSurrogate || c > 0x10FFFF) {
            // Without this, a value above 0x1FFFFF would overflow the 3
            // payload bits of the 4-byte lead and emit a garbage byte.
            c = kReplacement;
        }

        if (c <= 0x7F) {
            utf8.push_back(static_cast<char>(c));
        } else if (c <= 0x7FF) {
            utf8.push_back(static_cast<char>(0xC0 | (c >> 6)));
            utf8.push_back(static_cast<char>(0x80 | (c & 0x3F)));
        } else if (c <= 0xFFFF) {
            utf8.push_back(static_cast<char>(0xE0 | (c >> 12)));
            utf8.push_back(static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | (c & 0x3F)));
        } else {
            utf8.push_back(static_cast<char>(0xF0 | (c >> 18)));
            utf8.push_back(static_cast<char>(0x80 | ((c >> 12) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | (c & 0x3F)));
        }
    }
    return utf8;
}
|
||||
|
||||
std::string UTF16ToUTF8(std::u16string_view utf16) {
|
||||
return UTF32ToUTF8(UTF16ToUTF32(utf16));
|
||||
}
|
||||
|
||||
std::u16string UTF8ToUTF16(std::string_view utf8) {
|
||||
return UTF32ToUTF16(UTF8ToUTF32(utf8));
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
||||
// Windows-only helpers; they are not implemented on Linux.
|
||||
|
||||
// The wide-string helpers below treat wchar_t as a UTF-16 code unit, so
// wchar_t and char16_t must have the same size for them to be correct.
static_assert(sizeof(wchar_t) == sizeof(char16_t));
|
||||
|
||||
// Copies UTF-16 code units into a wide string, one wchar_t per code unit.
//
// The units are converted element by element instead of reinterpreting the
// char16_t buffer as wchar_t: reading char16_t objects through a wchar_t
// pointer is not permitted by the aliasing rules, whereas a per-element
// conversion is always well-defined and yields the same values.
//
// @param utf16  Code units to copy; may be empty.
// @return       A wide string holding the same code unit values.
std::wstring UTF16ToWide(std::u16string_view utf16) {
    return std::wstring(utf16.begin(), utf16.end());
}
|
||||
|
||||
// Copies a wide string into UTF-16 code units, one char16_t per wchar_t.
//
// The characters are converted element by element instead of reinterpreting
// the wchar_t buffer as char16_t: reading wchar_t objects through a char16_t
// pointer is not permitted by the aliasing rules, whereas a per-element
// conversion is always well-defined and yields the same values.
//
// @param wide  Wide characters to copy; may be empty.
// @return      A UTF-16 string holding the same code unit values.
std::u16string WideToUTF16(std::wstring_view wide) {
    return std::u16string(wide.begin(), wide.end());
}
|
||||
|
||||
std::wstring UTF8ToWide(std::string_view utf8) {
|
||||
auto utf16 = UTF8ToUTF16(utf8);
|
||||
return UTF16ToWide(utf16);
|
||||
}
|
||||
|
||||
std::string WideToUTF8(std::wstring_view wide) {
|
||||
return UTF16ToUTF8(std::u16string_view{reinterpret_cast<const char16_t*>(wide.data()), wide.size()});
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user