VS2015 ships with an amusing bug that has been left unfixed, so we have to write some not-so-amusing code.
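To spell out the symptom (my summary; the original post just calls it an amusing bug): on VS2015 the straightforward char16_t/char32_t version reportedly fails at link time because the library lacks the corresponding std::codecvt specializations, which is why everything below falls back to std::uint16_t / std::uint32_t when _MSC_VER <= 1900.

#include <codecvt>
#include <locale>
#include <string>

// Valid C++11, but on VS2015 this is said to die at link time with an
// unresolved external for the char16_t codecvt specialization; hence the
// std::uint16_t fallback used in the real code below.
static std::u16string naive_utf8_to_utf16(const std::string& s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
    return conv.from_bytes(s);
}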
#include <codecvt>
#include <cstdint>
#include <locale>
#include <string>

static constexpr std::codecvt_mode mode = std::codecvt_mode::little_endian;
//static constexpr std::codecvt_mode mode = (std::codecvt_mode)0; // default: big-endian

// utf8 to utf16
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::u16string utf8_to_utf16(const std::string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    // VS2015 workaround: std::uint16_t instead of char16_t.
    std::wstring_convert<std::codecvt_utf8_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto temp = conv.from_bytes(s);
    return std::u16string(temp.cbegin(), temp.cend());
#else
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.from_bytes(s);
#endif
}

// utf16 to utf8
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::string utf16_to_utf8(const std::u16string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto p = reinterpret_cast<const std::uint16_t*>(s.c_str());
    return conv.to_bytes(p, p + s.length());
#else
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.to_bytes(s);
#endif
}

// utf8 to utf32
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::u32string utf8_to_utf32(const std::string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    auto temp = conv.from_bytes(s);
    return std::u32string(temp.cbegin(), temp.cend());
#else
    std::wstring_convert<std::codecvt_utf8<char32_t, 0x10ffff, mode>, char32_t> conv;
    return conv.from_bytes(s);
#endif
}

// utf32 to utf8
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::string utf32_to_utf8(const std::u32string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    auto p = reinterpret_cast<const std::uint32_t*>(s.c_str());
    return conv.to_bytes(p, p + s.length());
#else
    std::wstring_convert<std::codecvt_utf8<char32_t, 0x10ffff, mode>, char32_t> conv;
    return conv.to_bytes(s);
#endif
}

// utf16 to utf32
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... need endianness.
// clang4.0.0 . need endianness.
static std::u32string utf16_to_utf32(const std::u16string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    // Reinterpret the native-endian UTF-16 buffer as raw bytes for from_bytes.
    const char16_t* data = s.c_str();
    auto bytes = conv.from_bytes(reinterpret_cast<const char*>(data),
                                 reinterpret_cast<const char*>(data + s.length()));
    return std::u32string(bytes.cbegin(), bytes.cend());
#else
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, mode>, char32_t> conv;
    const char16_t* data = s.c_str();
    return conv.from_bytes(reinterpret_cast<const char*>(data),
                           reinterpret_cast<const char*>(data + s.length()));
#endif
}

// utf32 to utf16
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... ---
// clang4.0.0 . need endianness.
static std::u16string utf32_to_utf16(const std::u32string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint32_t, 0x10ffff, mode>, std::uint32_t> conv;
    auto p = reinterpret_cast<const std::uint32_t*>(s.c_str());
    auto bytes = conv.to_bytes(p, p + s.length());
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#else
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, mode>, char32_t> conv;
    auto bytes = conv.to_bytes(s);
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#endif
}

// ucs2 to utf8
// gcc7.0.0 ... ---
// gcc6.1.0 ... ---
// gcc5.3.0 ... ---
// gcc5.2.0 ... ---
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::string ucs2_to_utf8(const std::u16string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto p = reinterpret_cast<const std::uint16_t*>(s.c_str());
    return conv.to_bytes(p, p + s.length());
#else
    std::wstring_convert<std::codecvt_utf8<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.to_bytes(s);
#endif
}

// utf8 to ucs2
// gcc7.0.0 ... ---
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... ---
// clang4.0.0 . ---
static std::u16string utf8_to_ucs2(const std::string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf8<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto temp = conv.from_bytes(s);
    return std::u16string(temp.cbegin(), temp.cend());
#else
    std::wstring_convert<std::codecvt_utf8<char16_t, 0x10ffff, mode>, char16_t> conv;
    return conv.from_bytes(s);
#endif
}

// ucs2 to utf16
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... need endianness.
// clang4.0.0 . need endianness.
static std::u16string ucs2_to_utf16(const std::u16string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    auto p = reinterpret_cast<const std::uint16_t*>(s.c_str());
    auto bytes = conv.to_bytes(p, p + s.length());
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#else
    std::wstring_convert<std::codecvt_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    auto bytes = conv.to_bytes(s);
    return std::u16string(reinterpret_cast<const char16_t*>(bytes.c_str()), bytes.length() / sizeof(char16_t));
#endif
}

// utf16 to ucs2
// gcc7.0.0 ... need endianness.
// gcc6.1.0 ... need endianness.
// gcc5.3.0 ... need endianness.
// gcc5.2.0 ... need endianness.
// gcc5.1.0 ... need endianness.
// clang4.0.0 . need endianness.
static std::u16string utf16_to_ucs2(const std::u16string& s)
{
#if defined(_MSC_VER) && (_MSC_VER <= 1900)
    std::wstring_convert<std::codecvt_utf16<std::uint16_t, 0x10ffff, mode>, std::uint16_t> conv;
    const char16_t* data = s.c_str();
    auto bytes = conv.from_bytes(reinterpret_cast<const char*>(data),
                                 reinterpret_cast<const char*>(data + s.length()));
    return std::u16string(bytes.cbegin(), bytes.cend());
#else
    std::wstring_convert<std::codecvt_utf16<char16_t, 0x10ffff, mode>, char16_t> conv;
    const char16_t* data = s.c_str();
    auto temp = conv.from_bytes(reinterpret_cast<const char*>(data),
                                reinterpret_cast<const char*>(data + s.length()));
    return std::u16string(temp.cbegin(), temp.cend());
#endif
}
The comments on each function record whether an endianness specification was needed, for each gcc and clang version I checked.
It should only ever matter for std::codecvt_utf16, yet on some versions the behavior changed depending on whether it was specified, which confused me, so I noted it on everything.
All of the test environments are little-endian.
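To make the endianness point concrete, here is a minimal standalone sketch (my addition, not from the original post). Per the standard, std::codecvt_utf16 writes big-endian bytes unless std::codecvt_mode::little_endian is set; the deviations noted above are gcc/clang versions that got this wrong. (These facets are deprecated in C++17 but still available; on VS2015 substitute std::uint32_t for char32_t as in the code above.)

#include <codecvt>
#include <iomanip>
#include <iostream>
#include <locale>

int main()
{
    // Two converters that differ only in the endianness flag.
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff,
        std::codecvt_mode::little_endian>, char32_t> le;
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff,
        (std::codecvt_mode)0>, char32_t> be;

    // On a conforming implementation this prints "61 00" then "00 61"
    // for U+0061.
    for (unsigned char c : le.to_bytes(U"a"))
        std::cout << std::hex << std::setw(2) << std::setfill('0') << int(c) << ' ';
    std::cout << '\n';
    for (unsigned char c : be.to_bytes(U"a"))
        std::cout << std::hex << std::setw(2) << std::setfill('0') << int(c) << ' ';
    std::cout << '\n';
}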
And the test code:

#include <iostream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>

class TestItem
{
public:
    TestItem(std::string utf8, std::u16string utf16, std::u32string utf32)
        : utf8(std::move(utf8)), utf16(std::move(utf16)), utf32(std::move(utf32))
    {
    }
    std::string    utf8;
    std::u16string utf16;
    std::u32string utf32;
};

void Test(const TestItem& item)
{
    // u8 <-> u16
    if(utf8_to_utf16(item.utf8) != item.utf16)
        std::cout << "Failed to convert: utf8_to_utf16" << std::endl;
    if(utf16_to_utf8(item.utf16) != item.utf8)
        std::cout << "Failed to convert: utf16_to_utf8" << std::endl;

    // u8 <-> u32
    if(utf8_to_utf32(item.utf8) != item.utf32)
        std::cout << "Failed to convert: utf8_to_utf32" << std::endl;
    if(utf32_to_utf8(item.utf32) != item.utf8)
        std::cout << "Failed to convert: utf32_to_utf8" << std::endl;

    // u16 <-> u32
    if(utf16_to_utf32(item.utf16) != item.utf32)
        std::cout << "Failed to convert: utf16_to_utf32" << std::endl;
    if(utf32_to_utf16(item.utf32) != item.utf16)
        std::cout << "Failed to convert: utf32_to_utf16" << std::endl;

    // ucs2 <-> utf8
    try{
        if(ucs2_to_utf8(item.utf16) != item.utf8)
            std::cout << "Failed to convert: ucs2_to_utf8" << std::endl;
    } catch(const std::range_error&){
        std::cout << "range_error: ucs2_to_utf8" << std::endl;
    }
    try{
        if(utf8_to_ucs2(item.utf8) != item.utf16)
            std::cout << "Failed to convert: utf8_to_ucs2" << std::endl;
    } catch(const std::range_error&){
        std::cout << "range_error: utf8_to_ucs2" << std::endl;
    }

    // ucs2 <-> utf16
    try{
        if(ucs2_to_utf16(item.utf16) != item.utf16)
            std::cout << "Failed to convert: ucs2_to_utf16" << std::endl;
    } catch(const std::range_error&){
        std::cout << "range_error: ucs2_to_utf16" << std::endl;
    }
    try{
        if(utf16_to_ucs2(item.utf16) != item.utf16)
            std::cout << "Failed to convert: utf16_to_ucs2" << std::endl;
    } catch(const std::range_error&){
        std::cout << "range_error: utf16_to_ucs2" << std::endl;
    }
}

int main()
{
#if defined(_MSC_VER)
    std::cout << "_MSC_VER=" << _MSC_VER << std::endl;
    std::cout << "_MSC_FULL_VER=" << _MSC_FULL_VER << std::endl;
#endif
    std::cout << "char16_t is signed? " << std::boolalpha << std::is_signed<char16_t>::value << std::endl;
    std::cout << "char16_t size = " << sizeof(char16_t) << std::endl;
    std::cout << "char32_t is signed? " << std::boolalpha << std::is_signed<char32_t>::value << std::endl;
    std::cout << "char32_t size = " << sizeof(char32_t) << std::endl;
    std::cout << "wchar_t is signed? " << std::boolalpha << std::is_signed<wchar_t>::value << std::endl;
    std::cout << "wchar_t size = " << sizeof(wchar_t) << std::endl;

    std::vector<TestItem> items = {
        TestItem(u8"abcABCあいうえお", u"abcABCあいうえお", U"abcABCあいうえお"),
        TestItem("\x61",             u"\x0061",       U"\x00000061"),           // a
        TestItem("\xEF\xBD\x81",     u"\xFF41",       U"\x0000FF41"),           // ａ (fullwidth a)
        TestItem("\xC4\x8D",         u"\x010D",       U"\x0000010D"),           // č (precomposed)
        TestItem("\x63\xCC\x8C",     u"\x0063\x030C", U"\x00000063\x0000030C"), // č (c + combining caron)
        TestItem("\xC4\xB3",         u"\x0133",       U"\x00000133"),           // ĳ (ligature)
        TestItem("\x69\x6A",         u"\x0069\x006A", U"\x00000069\x0000006A"), // ij (two code points)
        TestItem("\xCE\xA9",         u"\x03A9",       U"\x000003A9"),           // Ω (U+03A9 greek omega)
        TestItem("\xE2\x84\xA6",     u"\x2126",       U"\x00002126"),           // Ω (U+2126 ohm sign)
        TestItem("\xF0\x9D\x93\x83", u"\xD835\xDCC3", U"\x0001D4C3")            // 𝓃 (outside the BMP)
    };
    for(const auto& item : items){
        std::cout << "* " << item.utf8 << std::endl;
        Test(item);
    }
    return 0;
}
"𝓃"の ucs2 との相互変換が各種コンパイラで動作が異なる。
On vs2015:
ucs2_to_utf8 -> Failed to convert
utf8_to_ucs2 -> Failed to convert
ucs2_to_utf16 -> range_error
utf16_to_ucs2 -> Failed to convert
On gcc 7.0.0:
ucs2_to_utf8 -> N/A
utf8_to_ucs2 -> N/A
ucs2_to_utf16 -> range_error
utf16_to_ucs2 -> Failed to convert
On clang 4.0.0:
ucs2_to_utf8 -> range_error
utf8_to_ucs2 -> range_error
ucs2_to_utf16 -> range_error
utf16_to_ucs2 -> range_error
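Because the failure is inevitable for non-BMP input, one way to get a uniform failure mode across compilers (a defensive sketch of my own, not from the original post) is to reject surrogate code units before calling the ucs2_* helpers:

#include <string>

// True when the string contains no UTF-16 surrogate code units,
// i.e. every element is a valid UCS-2 value on its own.
static bool is_ucs2(const std::u16string& s)
{
    for (char16_t c : s) {
        if (c >= 0xD800 && c <= 0xDFFF) return false;
    }
    return true;
}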
And one last thing: the names char16_t and char32_t keep attacking me from a blind spot, so I really wish they'd been called something else...