Monday, September 12, 2016

std::codecvt and vs2015

Converting between string, u16string, and u32string.

vs2015 has an amusing bug that has been left unfixed, so you end up having to write some rather joyless code.
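
For reference, the straightforward, standard-conforming version looks like the sketch below (my own minimal example, not part of the original listing). On vs2015 it compiles but fails at link time, because the char16_t and char32_t specializations of std::codecvt are missing from the runtime; that is the bug the _MSC_VER branches below work around by substituting std::uint16_t / std::uint32_t.

#include <codecvt>
#include <locale>
#include <string>

// Compiles everywhere, but on vs2015 linking fails with LNK2001
// (unresolved external: std::codecvt<char16_t, char, mbstate_t>::id).
static std::u16string utf8_to_utf16_naive(const std::string& s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
    return conv.from_bytes(s);
}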

#include <codecvt>
#include <cstdint>
#include <locale>
#include <string>

static constexpr std::codecvt_mode mode = std::codecvt_mode::little_endian;
//static constexpr std::codecvt_mode mode = (std::codecvt_mode)0;

// utf8 to utf16
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::u16string utf8_to_utf16(const std::string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto temp = conv.from_bytes(s);
    return std::u16string(temp.cbegin(), temp.cend());
#else
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.from_bytes(s);
#endif
}

// utf16 to utf8
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::string utf16_to_utf8(const std::u16string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto p = reinterpret_cast<const std::uint16_t*>(s.c_str());
    return conv.to_bytes(p, p + s.length());
#else
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.to_bytes(s);
#endif
}

// utf8 to utf32
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::u32string utf8_to_utf32(const std::string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    auto temp = conv.from_bytes(s);
    return std::u32string(temp.cbegin(), temp.cend());
#else
    std::wstring_convert<std::codecvt_utf8<char32_t, 0x10ffff, mode>, char32_t> conv;
    return conv.from_bytes(s);
#endif
}

// utf32 to utf8
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::string utf32_to_utf8(const std::u32string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    auto p = reinterpret_cast<const std::uint32_t*>(s.c_str());
    return conv.to_bytes(p, p + s.length());
#else
    std::wstring_convert<std::codecvt_utf8<char32_t, 0x10ffff, mode>, char32_t> conv;
    return conv.to_bytes(s);
#endif
}

// utf16 to utf32
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... need endianness.
// clang4.0.0 . need endianness.
static std::u32string utf16_to_utf32(const std::u16string &s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    const char16_t* data = s.c_str();
    auto bytes = conv.from_bytes(reinterpret_cast<const char*>(data), reinterpret_cast<const char*>(data + s.length()));
    return std::u32string(bytes.cbegin(), bytes.cend());
#else
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, mode>, char32_t> conv;
    const char16_t* data = s.c_str();
    return conv.from_bytes(reinterpret_cast<const char*>(data), reinterpret_cast<const char*>(data + s.length()));
#endif
}

// utf32 to utf16
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... ---
// clang4.0.0 . need endianness.
static std::u16string utf32_to_utf16(const std::u32string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    auto p = reinterpret_cast<const std::uint32_t*>(s.c_str());
    auto bytes = conv.to_bytes(p, p + s.length());
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#else
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, mode>, char32_t> conv;
    auto bytes = conv.to_bytes(s);
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#endif
}

// ucs2 to utf8
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::string ucs2_to_utf8(const std::u16string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto p = reinterpret_cast<const std::uint16_t*>(s.c_str());
    return conv.to_bytes(p, p + s.length());
#else
    std::wstring_convert<std::codecvt_utf8<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.to_bytes(s);
#endif
}

// utf8 to ucs2
// gcc7.0.0 ... ---
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::u16string utf8_to_ucs2(const std::string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto temp = conv.from_bytes(s);
    return std::u16string(temp.cbegin(), temp.cend());
#else
    std::wstring_convert<std::codecvt_utf8<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.from_bytes(s);
#endif
}

// ucs2 to utf16
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... need endianness.
// clang4.0.0 . need endianness.
static std::u16string ucs2_to_utf16(const std::u16string &s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto p = reinterpret_cast<const std::uint16_t*>(s.c_str());
    auto bytes = conv.to_bytes(p, p + s.length());
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#else
    std::wstring_convert<std::codecvt_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    auto bytes = conv.to_bytes(s);
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#endif
}

// utf16 to ucs2
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... need endianness.
// clang4.0.0 . need endianness.
static std::u16string utf16_to_ucs2(const std::u16string &s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    const char16_t* data = s.c_str();
    auto bytes = conv.from_bytes(reinterpret_cast<const char*>(data), reinterpret_cast<const char*>(data + s.length()));
    return std::u16string(bytes.cbegin(), bytes.cend());
#else
    std::wstring_convert<std::codecvt_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    const char16_t* data = s.c_str();
    auto temp = conv.from_bytes(reinterpret_cast<const char*>(data), reinterpret_cast<const char*>(data + s.length()));
    return std::u16string(temp.cbegin(), temp.cend());
#endif
}

The comments above each function record, for each gcc and clang version, whether the endianness flag had to be specified to get correct results. In principle nothing other than std::codecvt_utf16 should care, but on some versions the behavior changed depending on whether the flag was given, which was confusing enough that I noted the results anyway. Every test environment was little-endian.
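
To illustrate what the flag actually changes: std::codecvt_utf16 converts to and from UTF-16 byte sequences, and it writes big-endian by default unless std::little_endian is set in the mode. A minimal standalone sketch (char32_t version; on vs2015 it would need the same std::uint32_t workaround as above):

#include <codecvt>
#include <cstdio>
#include <locale>
#include <string>

int main()
{
    // Default mode: big-endian UTF-16 bytes, so U+0041 ('A') -> 00 41.
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff>, char32_t> be;
    // With std::little_endian: 41 00 instead.
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, std::little_endian>, char32_t> le;
    for(unsigned char c : be.to_bytes(U"A")) std::printf("%02X ", c); // 00 41
    std::printf("/ ");
    for(unsigned char c : le.to_bytes(U"A")) std::printf("%02X ", c); // 41 00
    std::printf("\n");
    return 0;
}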

#include <iostream>
#include <string>
#include <type_traits>
#include <vector>

class TestItem
{
public:
    TestItem(std::string utf8, std::u16string utf16, std::u32string utf32)
        : utf8(std::move(utf8)), utf16(std::move(utf16)), utf32(std::move(utf32))
    {
    }
    std::string utf8;
    std::u16string utf16;
    std::u32string utf32;
};

void Test(const TestItem& item)
{
    // u8 <-> u16
    if(utf8_to_utf16(item.utf8) != item.utf16)
        std::cout << "Failed to convert: utf8_to_utf16" << std::endl;
    if(utf16_to_utf8(item.utf16) != item.utf8)
        std::cout << "Failed to convert: utf16_to_utf8" << std::endl;
    // u8 <-> u32
    if(utf8_to_utf32(item.utf8) != item.utf32)
        std::cout << "Failed to convert: utf8_to_utf32" << std::endl;
    if(utf32_to_utf8(item.utf32) != item.utf8)
        std::cout << "Failed to convert: utf32_to_utf8" << std::endl;
    // u16 <-> u32
    if(utf16_to_utf32(item.utf16) != item.utf32)
        std::cout << "Failed to convert: utf16_to_utf32" << std::endl;
    if(utf32_to_utf16(item.utf32) != item.utf16)
        std::cout << "Failed to convert: utf32_to_utf16" << std::endl;
    // ucs2 <-> utf8
    try{
        if(ucs2_to_utf8(item.utf16) != item.utf8)
            std::cout << "Failed to convert: ucs2_to_utf8" << std::endl;
    }
    catch(const std::range_error&){
        std::cout << "range_error: ucs2_to_utf8" << std::endl;
    }
    try{
        if(utf8_to_ucs2(item.utf8) != item.utf16)
            std::cout << "Failed to convert: utf8_to_ucs2" << std::endl;
    }
    catch(const std::range_error&){
        std::cout << "range_error: utf8_to_ucs2" << std::endl;
    }
    // ucs2 <-> utf16
    try{
        if(ucs2_to_utf16(item.utf16) != item.utf16)
            std::cout << "Failed to convert: ucs2_to_utf16" << std::endl;
    }
    catch(const std::range_error&){
        std::cout << "range_error: ucs2_to_utf16" << std::endl;
    }
    try{
        if(utf16_to_ucs2(item.utf16) != item.utf16)
            std::cout << "Failed to convert: utf16_to_ucs2" << std::endl;
    }
    catch (const std::range_error&){
        std::cout << "range_error: utf16_to_ucs2" << std::endl;
    }
}

int main()
{
#if defined(_MSC_VER)
    std::cout << "_MSC_VER=" << _MSC_VER << std::endl;
    std::cout << "_MSC_FULL_VER=" << _MSC_FULL_VER << std::endl;
#endif
    std::cout << "char16_t is signed? " << std::boolalpha << std::is_signed<char16_t>::value << std::endl;
    std::cout << "char16_t size = " << sizeof(char16_t) << std::endl;
    std::cout << "char32_t is signed? " << std::boolalpha << std::is_signed<char32_t>::value << std::endl;
    std::cout << "char32_t size = " << sizeof(char32_t) << std::endl;
    std::cout << "wchar_t is signed? " << std::boolalpha << std::is_signed<wchar_t>::value << std::endl;
    std::cout << "wchar_t size = " << sizeof(wchar_t) << std::endl;

    std::vector<TestItem> items = {
        TestItem(u8"abcABCあいうえお", u"abcABCあいうえお", U"abcABCあいうえお"),
        TestItem("\x61", u"\x0061", U"\x00000061"),                                // a 
        TestItem("\xEF\xBD\x81", u"\xFF41", U"\x0000FF41"),                        // a 
        TestItem("\xC4\x8D", u"\x010D", U"\x010D"),                                // č̌
        TestItem("\x63\xCC\x8C", u"\x0063\x030C", U"\x00000063\x0000030C"),        // c 
        TestItem("\xC4\xB3", u"\x0133", U"\x00000133"),                            // ij 
        TestItem("\x69\x6A", u"\x0069\x006A", U"\x00000069\x0000006A"),            // ij 
        TestItem("\xCE\xA9", u"\x03A9", U"\x000003A9"),                            // Ω 
        TestItem("\xE2\x84\xA6", u"\x2126", U"\x00002126"),                        // Ω 
        TestItem("\xF0\x9D\x93\x83", u"\xD835\xDCC3", U"\x0001D4C3")            // 𝓃 
    };
    for(const auto& item : items){
        std::cout << "* " << item.utf8 << std::endl;
        Test(item);
    }
    return 0;
}

That's the test code.

"𝓃"の ucs2 との相互変換が各種コンパイラで動作が異なる。

On vs2015:
    ucs2_to_utf8 -> Failed to convert
    utf8_to_ucs2 -> Failed to convert
    ucs2_to_utf16 -> range_error
    utf16_to_ucs2 -> Failed to convert

On gcc 7.0.0:
    ucs2_to_utf8 -> N/A
    utf8_to_ucs2 -> N/A
    ucs2_to_utf16 -> range_error
    utf16_to_ucs2 -> Failed to convert

On clang 4.0.0:
    ucs2_to_utf8 -> range_error
    utf8_to_ucs2 -> range_error
    ucs2_to_utf16 -> range_error
    utf16_to_ucs2 -> range_error
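
These results are expected in principle: U+1D4C3 lies outside the BMP, so in UTF-16 it needs the surrogate pair D835 DCC3 and simply has no UCS-2 representation; the compilers only differ in how they report that. The surrogate arithmetic, as a standalone check of my own (not part of the conversion code above):

#include <cassert>
#include <cstdint>

int main()
{
    // Encode U+1D4C3 (𝓃) as a UTF-16 surrogate pair by hand.
    const std::uint32_t v = 0x1D4C3 - 0x10000;                                 // 0x0D4C3
    const std::uint16_t hi = static_cast<std::uint16_t>(0xD800 + (v >> 10));   // 0xD835
    const std::uint16_t lo = static_cast<std::uint16_t>(0xDC00 + (v & 0x3FF)); // 0xDCC3
    assert(hi == 0xD835 && lo == 0xDCC3); // matches u"\xD835\xDCC3" in the test data
    return 0;
}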

And one more thing: I wish they would stop using names like char16_t and char32_t; they hit you right in a blind spot...
