// NandPartitionTableConvert/utf_convert.cc

//
// Created on 2024-05-17.
//

#include "utf_convert.h"

#include <vector>
#include <cstdint>

namespace utfcvt {

// Decodes a UTF-8 byte string into a sequence of UTF-32 code points.
//
// Malformed input never causes a read past the end of `utf8`. Each of the
// following is replaced by a single U+FFFD (REPLACEMENT CHARACTER):
//   - a byte that cannot start a sequence (stray continuation byte, 0xF8..0xFF),
//   - a multi-byte sequence that is cut short by the end of the input,
//   - a multi-byte sequence interrupted by a non-continuation byte (decoding
//     resumes at that byte, so it is not lost).
// Overlong forms and encoded surrogates are decoded as-is (lenient decoding).
//
// Returns an empty string for empty input.
std::u32string UTF8ToUTF32(std::string_view utf8) {
    constexpr char32_t kReplacement = 0xFFFDu;

    std::u32string codePoints;
    codePoints.reserve(utf8.size());  // upper bound: one code point per byte

    size_t i = 0;
    while (i < utf8.size()) {
        // Work on unsigned bytes so the bit tests do not depend on char signedness.
        const auto lead = static_cast<unsigned char>(utf8[i]);

        size_t length = 0;  // total bytes announced by the lead byte
        char32_t c = 0;     // payload bits accumulated so far
        if ((lead & 0b10000000u) == 0) {
            // 1 byte code point, ASCII
            length = 1;
            c = lead;
        } else if ((lead & 0b11100000u) == 0b11000000u) {
            // 2 byte code point
            length = 2;
            c = lead & 0b00011111u;
        } else if ((lead & 0b11110000u) == 0b11100000u) {
            // 3 byte code point
            length = 3;
            c = lead & 0b00001111u;
        } else if ((lead & 0b11111000u) == 0b11110000u) {
            // 4 byte code point
            length = 4;
            c = lead & 0b00000111u;
        } else {
            // Not a valid lead byte: substitute and resynchronise on the next byte.
            codePoints.push_back(kReplacement);
            ++i;
            continue;
        }

        // Consume continuation bytes, stopping at the first missing or invalid one.
        size_t consumed = 1;
        bool wellFormed = true;
        while (consumed < length) {
            if (i + consumed >= utf8.size()) {
                wellFormed = false;  // sequence truncated by end of input
                break;
            }
            const auto next = static_cast<unsigned char>(utf8[i + consumed]);
            if ((next & 0b11000000u) != 0b10000000u) {
                wellFormed = false;  // not a 10xxxxxx continuation byte
                break;
            }
            c = (c << 6) | (next & 0b00111111u);
            ++consumed;
        }

        codePoints.push_back(wellFormed ? c : kReplacement);
        // Skip only the bytes that belonged to this (possibly broken) sequence.
        i += consumed;
    }
    return codePoints;
}

// Decodes UTF-16 code units into UTF-32 code points.
//
// A high surrogate (0xD800..0xDBFF) immediately followed by a low surrogate
// (0xDC00..0xDFFF) is combined into one supplementary code point. Any unpaired
// surrogate -- a high surrogate at the end of input or followed by something
// else, or a low surrogate with no preceding high surrogate -- is replaced by
// U+FFFD, and the unit after it is decoded normally.
//
// Returns an empty string for empty input.
std::u32string UTF16ToUTF32(std::u16string_view utf16) {
    constexpr char32_t kReplacement = 0xFFFDu;

    std::u32string codePoints;
    codePoints.reserve(utf16.size());  // upper bound: one code point per unit

    for (size_t i = 0; i < utf16.size(); ++i) {
        const char32_t unit = utf16[i];
        if (unit < 0xD800u || unit > 0xDFFFu) {
            // Plain BMP code unit: maps 1:1 to a code point.
            codePoints.push_back(unit);
            continue;
        }

        const bool isHighSurrogate = unit <= 0xDBFFu;
        if (isHighSurrogate && i + 1 < utf16.size()) {
            const char32_t next = utf16[i + 1];
            if (next >= 0xDC00u && next <= 0xDFFFu) {
                codePoints.push_back(0x10000u + ((unit - 0xD800u) << 10) + (next - 0xDC00u));
                ++i;  // the low surrogate has been consumed as well
                continue;
            }
        }

        // Unpaired surrogate: substitute, and leave the following unit untouched.
        codePoints.push_back(kReplacement);
    }
    return codePoints;
}

// Encodes UTF-32 code points as UTF-16 code units.
//
// Values up to 0xFFFF are emitted as a single unit, values up to 0x10FFFF as a
// high/low surrogate pair, and anything larger as U+FFFD.
std::u16string UTF32ToUTF16(std::u32string_view utf32) {
    std::u16string units;
    for (const char32_t codePoint : utf32) {
        if (codePoint > 0x10FFFFu) {
            // Outside the Unicode range: substitute the replacement character.
            units.push_back(static_cast<char16_t>(0xFFFDu));
            continue;
        }
        if (codePoint <= 0xFFFFu) {
            units.push_back(static_cast<char16_t>(codePoint));
            continue;
        }
        // Supplementary plane: split the 20-bit offset into two 10-bit halves.
        const char32_t offset = codePoint - 0x10000u;
        const auto high = static_cast<char16_t>(0xD800u + (offset >> 10));
        const auto low = static_cast<char16_t>(0xDC00u + (offset & 0x3FF));
        units.push_back(high);
        units.push_back(low);
    }
    return units;
}

// Encodes UTF-32 code points as UTF-8 bytes.
//
// Code points are written using 1 to 4 bytes depending on their magnitude.
// A value above 0x10FFFF cannot be represented in UTF-8 and is encoded as
// U+FFFD (bytes EF BF BD), matching what UTF32ToUTF16 does for such values.
std::string UTF32ToUTF8(std::u32string_view utf32) {
    std::string utf8;
    utf8.reserve(utf32.size());  // lower bound: at least one byte per code point
    for (char32_t c : utf32) {
        if (c > 0x10FFFFu) {
            // Out of Unicode range: without this the lead byte below would overflow.
            c = 0xFFFDu;
        }
        if (c <= 0x7F) {
            utf8.push_back(static_cast<char>(c));
        } else if (c <= 0x7FF) {
            utf8.push_back(static_cast<char>(0xC0 | (c >> 6)));
            utf8.push_back(static_cast<char>(0x80 | (c & 0x3F)));
        } else if (c <= 0xFFFF) {
            utf8.push_back(static_cast<char>(0xE0 | (c >> 12)));
            utf8.push_back(static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | (c & 0x3F)));
        } else {
            utf8.push_back(static_cast<char>(0xF0 | (c >> 18)));
            utf8.push_back(static_cast<char>(0x80 | ((c >> 12) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | (c & 0x3F)));
        }
    }
    return utf8;
}

// Encodes UTF-16 code units as Modified UTF-8 (the variant used by Java/JNI).
//
// Differences from standard UTF-8:
//   - U+0000 is written as the two-byte form C0 80, so the output never
//     contains an embedded zero byte;
//   - every UTF-16 code unit is encoded on its own, so a surrogate pair
//     becomes two 3-byte sequences rather than one 4-byte sequence.
std::string UTF16ToMUTF8(std::u16string_view utf16) {
    std::string mutf8;
    mutf8.reserve(utf16.size());  // lower bound: at least one byte per unit
    for (const char16_t unit : utf16) {
        if (unit != 0 && unit < 0x80) {
            // 1 byte: ASCII except NUL
            mutf8.push_back(static_cast<char>(unit));
        } else if (unit < 0x800) {
            // 2 bytes: also covers NUL, which yields C0 80
            mutf8.push_back(static_cast<char>(0xC0 | (unit >> 6)));
            mutf8.push_back(static_cast<char>(0x80 | (unit & 0x3F)));
        } else {
            // 3 bytes: rest of the BMP, including individual surrogates
            mutf8.push_back(static_cast<char>(0xE0 | (unit >> 12)));
            mutf8.push_back(static_cast<char>(0x80 | ((unit >> 6) & 0x3F)));
            mutf8.push_back(static_cast<char>(0x80 | (unit & 0x3F)));
        }
    }
    return mutf8;
}

// Decodes Modified UTF-8 into UTF-16 code units.
//
// Only 1-, 2- and 3-byte sequences are recognised; each one yields exactly one
// UTF-16 code unit (so C0 80 yields U+0000 and an encoded surrogate yields
// that surrogate). Malformed input never causes a read past the end of
// `mutf8`. Each of the following is replaced by a single U+FFFD:
//   - a byte that cannot start a 1/2/3-byte sequence (stray continuation byte,
//     4-byte UTF-8 lead, 0xF8..0xFF),
//   - a sequence cut short by the end of the input,
//   - a sequence interrupted by a non-continuation byte (decoding resumes at
//     that byte).
std::u16string MUTF8ToUTF16(std::string_view mutf8) {
    constexpr char16_t kReplacement = 0xFFFDu;

    std::u16string utf16;
    utf16.reserve(mutf8.size());  // upper bound: one unit per byte

    size_t i = 0;
    while (i < mutf8.size()) {
        // Work on unsigned bytes so the bit tests do not depend on char signedness.
        const auto lead = static_cast<unsigned char>(mutf8[i]);

        size_t length = 0;   // total bytes announced by the lead byte
        unsigned value = 0;  // payload bits accumulated so far
        if ((lead & 0b10000000u) == 0) {
            // 1 byte code point, ASCII
            length = 1;
            value = lead;
        } else if ((lead & 0b11100000u) == 0b11000000u) {
            // 2 byte code point
            length = 2;
            value = lead & 0b00011111u;
        } else if ((lead & 0b11110000u) == 0b11100000u) {
            // 3 byte code point
            length = 3;
            value = lead & 0b00001111u;
        } else {
            // Not a lead byte Modified UTF-8 can produce: substitute and move on.
            utf16.push_back(kReplacement);
            ++i;
            continue;
        }

        // Consume continuation bytes, stopping at the first missing or invalid one.
        size_t consumed = 1;
        bool wellFormed = true;
        while (consumed < length) {
            if (i + consumed >= mutf8.size()) {
                wellFormed = false;  // sequence truncated by end of input
                break;
            }
            const auto next = static_cast<unsigned char>(mutf8[i + consumed]);
            if ((next & 0b11000000u) != 0b10000000u) {
                wellFormed = false;  // not a 10xxxxxx continuation byte
                break;
            }
            value = (value << 6) | (next & 0b00111111u);
            ++consumed;
        }

        // A 3-byte sequence carries at most 16 payload bits, so the cast is lossless.
        utf16.push_back(wellFormed ? static_cast<char16_t>(value) : kReplacement);
        i += consumed;
    }
    return utf16;
}

// Converts UTF-16 to UTF-8 by going through UTF-32 code points.
std::string UTF16ToUTF8(std::u16string_view utf16) {
    const std::u32string codePoints = UTF16ToUTF32(utf16);
    return UTF32ToUTF8(codePoints);
}

// Converts UTF-8 to UTF-16 by going through UTF-32 code points.
std::u16string UTF8ToUTF16(std::string_view utf8) {
    const std::u32string codePoints = UTF8ToUTF32(utf8);
    return UTF32ToUTF16(codePoints);
}

#if defined(_WIN32) || defined(_WIN64)

// Windows-only helpers; they are not implemented on Linux.

static_assert(sizeof(wchar_t) == sizeof(char16_t));

// Copies UTF-16 code units into a wide string, one wchar_t per code unit.
//
// The units are converted element by element instead of reinterpreting the
// char16_t buffer as wchar_t: the two are distinct types, so reading one
// through a pointer to the other is a type-aliasing violation. No transcoding
// is performed; the values are carried over unchanged.
std::wstring UTF16ToWide(std::u16string_view utf16) {
    return std::wstring(utf16.begin(), utf16.end());
}

// Copies wide characters into a UTF-16 string, one char16_t per wchar_t.
//
// The characters are converted element by element instead of reinterpreting
// the wchar_t buffer as char16_t: the two are distinct types, so reading one
// through a pointer to the other is a type-aliasing violation. No transcoding
// is performed; each value is converted to char16_t as-is.
std::u16string WideToUTF16(std::wstring_view wide) {
    return std::u16string(wide.begin(), wide.end());
}

// Converts UTF-8 to a wide string: UTF-8 -> UTF-16, then UTF-16 -> wide.
std::wstring UTF8ToWide(std::string_view utf8) {
    return UTF16ToWide(UTF8ToUTF16(utf8));
}

std::string WideToUTF8(std::wstring_view wide) {
    return UTF16ToUTF8(std::u16string_view{reinterpret_cast<const char16_t*>(wide.data()), wide.size()});
}

#endif

}