TextDecoderTest.cpp 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. /*
  2. * Copyright 2021 gitlost
  3. */
  4. // SPDX-License-Identifier: Apache-2.0
  5. #include "CharacterSet.h"
  6. #include "TextDecoder.h"
  7. #include "Utf.h"
  8. #include "gtest/gtest.h"
  9. #include "gmock/gmock.h"
  10. using namespace ZXing;
  11. using namespace testing;
  12. namespace ZXing {
  13. int Utf32ToUtf8(char32_t utf32, char* out);
  14. }
  15. // Encode Unicode codepoint `utf32` as UTF-8
  16. std::string Utf32ToUtf8(const char32_t utf32)
  17. {
  18. char buf[4];
  19. int len = Utf32ToUtf8(utf32, buf);
  20. return std::string(buf, len);
  21. }
  22. TEST(TextDecoderTest, AppendBINARY_ASCII)
  23. {
  24. uint8_t data[256];
  25. for (int i = 0; i < 256; i++) {
  26. data[i] = (uint8_t)i;
  27. }
  28. {
  29. std::wstring str;
  30. TextDecoder::Append(str, data, sizeof(data), CharacterSet::BINARY);
  31. EXPECT_THAT(str, ElementsAreArray(data, sizeof(data)));
  32. }
  33. {
  34. // Accepts non-ASCII
  35. std::wstring str;
  36. TextDecoder::Append(str, data, sizeof(data), CharacterSet::ASCII);
  37. EXPECT_THAT(str, ElementsAreArray(data, sizeof(data)));
  38. }
  39. }
  40. TEST(TextDecoderTest, AppendAllASCIIRange00_7F)
  41. {
  42. std::string expected;
  43. uint8_t data[0x80];
  44. uint8_t dataUTF16BE[0x80 * 2];
  45. uint8_t dataUTF16LE[0x80 * 2];
  46. uint8_t dataUTF32BE[0x80 * 4];
  47. uint8_t dataUTF32LE[0x80 * 4];
  48. for (int i = 0; i < 0x80; i++) {
  49. uint8_t ch = static_cast<uint8_t>(i);
  50. data[i] = ch;
  51. expected.append(Utf32ToUtf8(i));
  52. int j = i << 1;
  53. int k = j << 1;
  54. dataUTF16BE[j] = 0;
  55. dataUTF16BE[j + 1] = ch;
  56. dataUTF16LE[j] = ch;
  57. dataUTF16LE[j + 1] = 0;
  58. dataUTF32BE[k] = dataUTF32BE[k + 1] = dataUTF32BE[k + 2] = 0;
  59. dataUTF32BE[k + 3] = ch;
  60. dataUTF32LE[k] = ch;
  61. dataUTF32LE[k + 1] = dataUTF32LE[k + 2] = dataUTF32LE[k + 3] = 0;
  62. }
  63. EXPECT_EQ(expected.size(), 128);
  64. for (int i = 0; i < static_cast<int>(CharacterSet::CharsetCount); i++) {
  65. std::string str;
  66. CharacterSet cs = static_cast<CharacterSet>(i);
  67. switch(cs) {
  68. case CharacterSet::UTF16BE: TextDecoder::Append(str, dataUTF16BE, sizeof(dataUTF16BE), cs); break;
  69. case CharacterSet::UTF16LE: TextDecoder::Append(str, dataUTF16LE, sizeof(dataUTF16LE), cs); break;
  70. case CharacterSet::UTF32BE: TextDecoder::Append(str, dataUTF32BE, sizeof(dataUTF32BE), cs); break;
  71. case CharacterSet::UTF32LE: TextDecoder::Append(str, dataUTF32LE, sizeof(dataUTF32LE), cs); break;
  72. default: TextDecoder::Append(str, data, sizeof(data), cs); break;
  73. }
  74. EXPECT_EQ(str, expected) << " charset: " << ToString(cs);
  75. }
  76. }
  77. TEST(TextDecoderTest, AppendISO8859Range80_9F)
  78. {
  79. uint8_t data[0xA0 - 0x80];
  80. for (int i = 0x80; i < 0xA0; i++) {
  81. data[i - 0x80] = (uint8_t)i;
  82. }
  83. static const CharacterSet isos[] = {
  84. CharacterSet::ISO8859_1, CharacterSet::ISO8859_2, CharacterSet::ISO8859_3, CharacterSet::ISO8859_4,
  85. CharacterSet::ISO8859_5, CharacterSet::ISO8859_6, CharacterSet::ISO8859_7, CharacterSet::ISO8859_8,
  86. CharacterSet::ISO8859_7, CharacterSet::ISO8859_8, CharacterSet::ISO8859_9, CharacterSet::ISO8859_10,
  87. CharacterSet::ISO8859_11, // extended with 9 CP874 codepoints in 0x80-9F range
  88. CharacterSet::ISO8859_13, CharacterSet::ISO8859_14, CharacterSet::ISO8859_15, CharacterSet::ISO8859_16
  89. };
  90. for (CharacterSet iso : isos) {
  91. std::wstring str;
  92. TextDecoder::Append(str, data, sizeof(data), iso);
  93. EXPECT_THAT(str, ElementsAreArray(data, sizeof(data))) << "iso: " << static_cast<int>(iso);
  94. }
  95. }
  96. TEST(TextDecoderTest, AppendShift_JIS)
  97. {
  98. {
  99. // Shift JIS 0x5C (backslash in ASCII) normally mapped to U+00A5 (Yen sign), but direct ASCII mapping used
  100. static const uint8_t data[] = { 0x5C };
  101. std::wstring str;
  102. TextDecoder::Append(str, data, sizeof(data), CharacterSet::Shift_JIS);
  103. EXPECT_EQ(str, L"\u005C"); // Would normally be "\u00A5"
  104. EXPECT_EQ(ToUtf8(str), "\\"); // "¥" ditto
  105. }
  106. // {
  107. // // Shift JIS 0x815F goes to U+FF3C (full width reverse solidus i.e. backslash)
  108. // static const uint8_t data[] = { 0x81, 0x5F };
  109. // std::wstring str;
  110. // TextDecoder::Append(str, data, sizeof(data), CharacterSet::Shift_JIS);
  111. // EXPECT_EQ(str, L"\uFF3C");
  112. // EXPECT_EQ(ToUtf8(str), "\");
  113. // }
  114. {
  115. // Shift JIS 0xA5 (Yen sign in ISO/IEC 8859-1) goes to U+FF65 (half-width katakana middle dot)
  116. static const uint8_t data[] = { 0xA5 };
  117. std::wstring str;
  118. TextDecoder::Append(str, data, sizeof(data), CharacterSet::Shift_JIS);
  119. EXPECT_EQ(str, L"\uFF65");
  120. EXPECT_EQ(ToUtf8(str), "・");
  121. }
  122. {
  123. // Shift JIS 0x7E (tilde in ASCII) normally mapped to U+203E (overline), but direct ASCII mapping used
  124. static const uint8_t data[] = { 0x7E };
  125. std::wstring str;
  126. TextDecoder::Append(str, data, sizeof(data), CharacterSet::Shift_JIS);
  127. EXPECT_EQ(str, L"~"); // Would normally be "\u203E"
  128. EXPECT_EQ(ToUtf8(str), "~"); // "‾" ditto
  129. }
  130. {
  131. static const uint8_t data[] = { 'a', 0x83, 0xC0, 'c', 0x84, 0x47, 0xA5, 0xBF, 0x93, 0x5F,
  132. 0xE4, 0xAA, 0x83, 0x65 };
  133. std::wstring str;
  134. TextDecoder::Append(str, data, sizeof(data), CharacterSet::Shift_JIS);
  135. EXPECT_EQ(str, L"a\u03B2c\u0416\uFF65\uFF7F\u70B9\u8317\u30C6");
  136. EXPECT_EQ(ToUtf8(str), "aβcЖ・ソ点茗テ");
  137. }
  138. }
  139. TEST(TextDecoderTest, AppendBig5)
  140. {
  141. // {
  142. // static const uint8_t data[] = { 0xA1, 0x5A }; // Drawings box light left in Big5-2003; not in original Big5
  143. // std::wstring str;
  144. // TextDecoder::Append(str, data, sizeof(data), CharacterSet::Big5);
  145. // EXPECT_EQ(str, L"\u2574");
  146. // EXPECT_EQ(ToUtf8(str), "╴");
  147. // }
  148. {
  149. static const uint8_t data[] = { 0xA1, 0x56 }; // En dash U+2013 in Big5, horizontal bar U+2015 in Big5-2003
  150. std::wstring str;
  151. TextDecoder::Append(str, data, sizeof(data), CharacterSet::Big5);
  152. EXPECT_EQ(str, L"\u2013");
  153. EXPECT_EQ(ToUtf8(str), "–");
  154. }
  155. {
  156. static const uint8_t data[] = { 0x1, ' ', 0xA1, 0x71, '@', 0xC0, 0x40, 0xF9, 0xD5, 0x7F };
  157. std::wstring str;
  158. TextDecoder::Append(str, data, sizeof(data), CharacterSet::Big5);
  159. EXPECT_EQ(str, L"\u0001 \u3008@\u9310\u9F98\u007F");
  160. EXPECT_EQ(ToUtf8(str), "\x01 〈@錐龘\x7F");
  161. }
  162. }
  163. TEST(TextDecoderTest, AppendGB2312)
  164. {
  165. {
  166. static const uint8_t data[] = { 'a', 0xB0, 0xA1 };
  167. std::wstring str;
  168. TextDecoder::Append(str, data, sizeof(data), CharacterSet::GB2312);
  169. EXPECT_EQ(str, L"a\u554a");
  170. EXPECT_EQ(ToUtf8(str), "a啊");
  171. }
  172. }
  173. TEST(TextDecoderTest, AppendGB18030)
  174. {
  175. {
  176. static const uint8_t data[] = { 'a', 0xA6, 0xC2, 'c', 0x81, 0x39, 0xA7, 0x39, 0xA1, 0xA4, 0xA1, 0xAA,
  177. 0xA8, 0xA6, 'Z' };
  178. std::wstring str;
  179. TextDecoder::Append(str, data, sizeof(data), CharacterSet::GB18030);
  180. EXPECT_EQ(str, L"a\u03B2c\u30FB\u00B7\u2014\u00E9Z");
  181. EXPECT_EQ(ToUtf8(str), "aβc・·—éZ");
  182. }
  183. }
  184. TEST(TextDecoderTest, AppendEUC_KR)
  185. {
  186. {
  187. static const uint8_t data[] = { 0xA2, 0xE6 }; // Euro sign U+20AC added KS X 1001:1998
  188. std::wstring str;
  189. TextDecoder::Append(str, data, sizeof(data), CharacterSet::EUC_KR);
  190. EXPECT_EQ(str, L"\u20AC");
  191. EXPECT_EQ(ToUtf8(str), "€");
  192. }
  193. {
  194. static const uint8_t data[] = { 'a', 0xA4, 0xA1, 'Z' };
  195. std::wstring str;
  196. TextDecoder::Append(str, data, sizeof(data), CharacterSet::EUC_KR);
  197. EXPECT_EQ(str, L"a\u3131Z");
  198. EXPECT_EQ(ToUtf8(str), "aㄱZ");
  199. }
  200. }
  201. TEST(TextDecoderTest, AppendUTF16BE)
  202. {
  203. {
  204. std::wstring str;
  205. static const uint8_t data[] = { 0x00, 0x01, 0x00, 0x7F, 0x00, 0x80, 0x00, 0xFF, 0x01, 0xFF, 0x10, 0xFF,
  206. 0xFF, 0xFD };
  207. TextDecoder::Append(str, data, sizeof(data), CharacterSet::UTF16BE);
  208. EXPECT_EQ(str, L"\u0001\u007F\u0080\u00FF\u01FF\u10FF\uFFFD");
  209. EXPECT_EQ(ToUtf8(str), "\x01\x7F\xC2\x80ÿǿჿ\xEF\xBF\xBD");
  210. }
  211. {
  212. std::wstring str;
  213. static const uint8_t data[] = { 0xD8, 0x00, 0xDC, 0x00 }; // Surrogate pair U+10000
  214. TextDecoder::Append(str, data, sizeof(data), CharacterSet::UTF16BE);
  215. EXPECT_EQ(str, L"\U00010000");
  216. EXPECT_EQ(ToUtf8(str), "𐀀");
  217. }
  218. }