test_gb18030.c 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641
  1. /*
  2. libzint - the open source barcode library
  3. Copyright (C) 2019-2023 Robin Stuart <rstuart114@gmail.com>
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions
  6. are met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in the
  11. documentation and/or other materials provided with the distribution.
  12. 3. Neither the name of the project nor the names of its contributors
  13. may be used to endorse or promote products derived from this software
  14. without specific prior written permission.
  15. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  16. ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  19. FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21. OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22. HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23. LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24. OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25. SUCH DAMAGE.
  26. */
  27. /* SPDX-License-Identifier: BSD-3-Clause */
  28. #include "testcommon.h"
  29. #include "test_gb18030_tab.h"
  30. #include "test_gbk_tab.h"
  31. #include "../eci.h"
  32. /* For local "private" testing using previous libiconv adaptation, not included for licensing reasons */
  33. #if 0
  34. #define TEST_JUST_SAY_GNO */
  35. #endif
  36. #ifdef TEST_JUST_SAY_GNO
  37. #include "../just_say_gno/gb18030_gnu.c"
  38. #include "../just_say_gno/gb2312_gnu.c"
  39. #endif
  40. INTERNAL int u_gb18030_int_test(const unsigned int u, unsigned int *dest1, unsigned int *dest2);
  41. /* As control convert to GB 18030 using table generated from GB18030.TXT plus simple processing.
  42. The version of GB18030.TXT is jdk-1.4.2/GB18030.TXT taken from
  43. https://haible.de/bruno/charsets/conversion-tables/GB18030.html
  44. The generated file backend/tests/test_gb18030_tab.h does not include U+10000..10FFFF codepoints to save space.
  45. See also backend/tests/tools/data/GB18030.TXT.README and backend/tests/tools/gen_test_tab.php.
  46. */
  47. static int u_gb18030_int2(unsigned int u, unsigned int *dest1, unsigned int *dest2) {
  48. unsigned int c;
  49. int tab_length, start_i, end_i;
  50. int i;
  51. /* GB18030 two-byte extension */
  52. if (u == 0x1E3F) { /* GB 18030-2005 change, was PUA U+E7C7 below, see Table 3-39, p.111, Lunde 2nd ed. */
  53. *dest1 = 0xA8BC;
  54. return 2;
  55. }
  56. /* GB18030 four-byte extension */
  57. if (u == 0xE7C7) { /* PUA */
  58. *dest1 = 0x8135;
  59. *dest2 = 0xF437;
  60. return 4;
  61. }
  62. /* GB18030 two-byte extension */
  63. if (u >= 0x9FB4 && u <= 0x9FBB) { /* GB 18030-2005 change, were PUA, see Table 3-37, p.108, Lunde 2nd ed. */
  64. if (u == 0x9FB4) {
  65. *dest1 = 0xFE59;
  66. } else if (u == 0x9FB5) {
  67. *dest1 = 0xFE61;
  68. } else if (u == 0x9FB6 || u == 0x9FB7) {
  69. *dest1 = 0xFE66 + (u - 0x9FB6);
  70. } else if (u == 0x9FB8) {
  71. *dest1 = 0xFE6D;
  72. } else if (u == 0x9FB9) {
  73. *dest1 = 0xFE7E;
  74. } else if (u == 0x9FBA) {
  75. *dest1 = 0xFE90;
  76. } else {
  77. *dest1 = 0xFEA0;
  78. }
  79. return 2;
  80. }
  81. /* GB18030 two-byte extension */
  82. if (u >= 0xFE10 && u <= 0xFE19) { /* GB 18030-2005 change, were PUA, see Table 3-37, p.108, Lunde 2nd ed. */
  83. if (u == 0xFE10) {
  84. *dest1 = 0xA6D9;
  85. } else if (u == 0xFE11) {
  86. *dest1 = 0xA6DB;
  87. } else if (u == 0xFE12) {
  88. *dest1 = 0xA6DA;
  89. } else if (u >= 0xFE13 && u <= 0xFE16) {
  90. *dest1 = 0xA6DC + (u - 0xFE13);
  91. } else if (u == 0xFE17 || u == 0xFE18) {
  92. *dest1 = 0xA6EC + (u - 0xFE17);
  93. } else {
  94. *dest1 = 0xA6F3;
  95. }
  96. return 2;
  97. }
  98. /* GB18030 four-byte extension */
  99. if (u >= 0xFE1A && u <= 0xFE2F) { /* These are Vertical Forms (U+FE1A..FE1F unassigned) and Combining Half Marks (U+FE20..FE2F) */
  100. if (u >= 0xFE1A && u <= 0xFE1D) {
  101. c = 0x84318336 + (u - 0xFE1A);
  102. } else if (u >= 0xFE1E && u <= 0xFE27) {
  103. c = 0x84318430 + (u - 0xFE1E);
  104. } else {
  105. c = 0x84318530 + (u - 0xFE28);
  106. }
  107. *dest1 = c >> 16;
  108. *dest2 = c & 0xFFFF;
  109. return 4;
  110. }
  111. /* GB18030 */
  112. /* Code set 3 (Unicode U+10000..U+10FFFF) */
  113. if (u >= 0x10000 /*&& u < 0x10400*/) { /* Not being called for U+10400..U+10FFFF */
  114. c = u - 0x10000;
  115. *dest1 = 0x9030;
  116. *dest2 = 0x8130 + (c % 10) + 0x100 * (c / 10);
  117. return 4;
  118. }
  119. tab_length = ARRAY_SIZE(test_gb18030_tab);
  120. start_i = test_gb18030_tab_ind[u >> 10];
  121. end_i = start_i + 0x800 > tab_length ? tab_length : start_i + 0x800;
  122. for (i = start_i; i < end_i; i += 2) {
  123. if (test_gb18030_tab[i + 1] == u) {
  124. c = test_gb18030_tab[i];
  125. if (c <= 0xFFFF) {
  126. *dest1 = c;
  127. return c <= 0xFF ? 1 : 2;
  128. }
  129. *dest1 = c >> 16;
  130. *dest2 = c & 0xFFFF;
  131. return 4;
  132. }
  133. }
  134. return 0;
  135. }
  136. #include <time.h>
  137. #define TEST_PERF_TIME(arg) (((arg) * 1000.0) / CLOCKS_PER_SEC)
  138. #define TEST_PERF_RATIO(a1, a2) (a2 ? TEST_PERF_TIME(a1) / TEST_PERF_TIME(a2) : 0)
  139. #ifdef TEST_JUST_SAY_GNO
  140. #define TEST_INT_PERF_ITERATIONS 250
  141. #endif
  142. static void test_u_gb18030_int(const testCtx *const p_ctx) {
  143. int debug = p_ctx->debug;
  144. int ret, ret2;
  145. unsigned int val1_1, val1_2, val2_1, val2_2;
  146. unsigned int i;
  147. /* See: https://file.allitebooks.com/20160708/CJKV%20Information%20Processing.pdf (table 3-37, p.109, 2nd ed.) */
  148. static const int nonpua_nonbmp[] = {
  149. 0x20087, 0x20089, 0x200CC, 0x215D7, 0x2298F, 0x241FE
  150. };
  151. static const unsigned int nonpua_nonbmp_vals[] = {
  152. 0xFE51, 0xFE52, 0xFE53, 0xFE6C, 0xFE76, 0xFE91
  153. };
  154. #ifdef TEST_JUST_SAY_GNO
  155. int j;
  156. clock_t start;
  157. clock_t total = 0, total_gno = 0;
  158. #else
  159. (void)debug;
  160. #endif
  161. testStart("test_u_gb18030_int");
  162. #ifdef TEST_JUST_SAY_GNO
  163. if ((debug & ZINT_DEBUG_TEST_PERFORMANCE)) { /* -d 256 */
  164. printf("test_u_gb18030_int perf iterations: %d\n", TEST_INT_PERF_ITERATIONS);
  165. }
  166. #endif
  167. for (i = 0; i < 0x10400; i++) { /* Don't bother with U+10400..U+10FFFF, programmatically filled */
  168. if (i >= 0xD800 && i <= 0xDFFF) { /* UTF-16 surrogates */
  169. continue;
  170. }
  171. if (testContinue(p_ctx, i)) continue;
  172. val1_1 = val1_2 = val2_1 = val2_2 = 0;
  173. ret = u_gb18030_int_test(i, &val1_1, &val1_2);
  174. ret2 = u_gb18030_int2(i, &val2_1, &val2_2);
  175. assert_equal(ret, ret2, "i:%d 0x%04X ret %d != ret2 %d, val1_1 0x%04X, val2_1 0x%04X, val1_2 0x%04X, val2_2 0x%04X\n", (int) i, i, ret, ret2, val1_1, val2_1, val1_2, val2_2);
  176. if (ret2) {
  177. assert_equal(val1_1, val2_1, "i:%d 0x%04X val1_1 0x%04X != val2_1 0x%04X\n", (int) i, i, val1_1, val2_1);
  178. assert_equal(val1_2, val2_2, "i:%d 0x%04X val1_2 0x%04X != val2_2 0x%04X\n", (int) i, i, val1_2, val2_2);
  179. }
  180. #ifdef TEST_JUST_SAY_GNO
  181. if (!(debug & ZINT_DEBUG_TEST_PERFORMANCE)) { /* -d 256 */
  182. val2_1 = val2_2 = 0;
  183. ret2 = gb18030_wctomb_zint(&val2_1, &val2_2, i);
  184. } else {
  185. for (j = 0; j < TEST_INT_PERF_ITERATIONS; j++) {
  186. val1_1 = val1_2 = val2_1 = val2_2 = 0;
  187. start = clock();
  188. ret = u_gb18030_int_test(i, &val1_1, &val1_2);
  189. total += clock() - start;
  190. start = clock();
  191. ret2 = gb18030_wctomb_zint(&val2_1, &val2_2, i);
  192. total_gno += clock() - start;
  193. }
  194. }
  195. assert_equal(ret, ret2, "i:%d 0x%04X ret %d != ret2 %d, val1_1 0x%04X, val2_1 0x%04X, val1_2 0x%04X, val2_2 0x%04X\n", (int) i, i, ret, ret2, val1_1, val2_1, val1_2, val2_2);
  196. if (ret2) {
  197. assert_equal(val1_1, val2_1, "i:%d 0x%04X val1_1 0x%04X != val2_1 0x%04X\n", (int) i, i, val1_1, val2_1);
  198. assert_equal(val1_2, val2_2, "i:%d 0x%04X val1_2 0x%04X != val2_2 0x%04X\n", (int) i, i, val1_2, val2_2);
  199. }
  200. #endif
  201. }
  202. /* u_gb18030() assumes valid Unicode so now returns a nonsense value here */
  203. val1_1 = val1_2 = 0;
  204. ret = u_gb18030_int_test(0x110000, &val1_1, &val1_2); /* Invalid Unicode codepoint */
  205. assert_equal(ret, 4, "0x110000 ret %d != 4, val1_1 0x%04X, val1_2 0x%04X\n", ret, val1_1, val1_2);
  206. for (i = 0; i < ARRAY_SIZE(nonpua_nonbmp); i++) {
  207. val1_1 = val1_2 = 0;
  208. ret = u_gb18030_int_test(nonpua_nonbmp[i], &val1_1, &val1_2);
  209. assert_equal(ret, 2, "i:%d 0x%04X ret %d != 2, val1_1 0x%04X, val1_2 0x%04X\n", (int) i, nonpua_nonbmp[i], ret, val1_1, val1_2);
  210. assert_equal(val1_1, nonpua_nonbmp_vals[i], "i:%d 0x%04X val1_1 0x%04X != 0x%04X\n", (int) i, nonpua_nonbmp[i], val1_1, nonpua_nonbmp_vals[i]);
  211. assert_zero(val1_2, "i:%d 0x%04X val1_2 0x%04X != 0\n", (int) i, nonpua_nonbmp[i], val1_2);
  212. }
  213. #ifdef TEST_JUST_SAY_GNO
  214. if ((debug & ZINT_DEBUG_TEST_PERFORMANCE)) { /* -d 256 */
  215. printf("test_u_gb18030_int perf totals: new % 8gms, gno % 8gms ratio %g\n",
  216. TEST_PERF_TIME(total), TEST_PERF_TIME(total_gno), TEST_PERF_RATIO(total, total_gno));
  217. }
  218. #endif
  219. testFinish();
  220. }
  221. static void test_gb18030_utf8(const testCtx *const p_ctx) {
  222. struct item {
  223. char *data;
  224. int length;
  225. int ret;
  226. int ret_length;
  227. unsigned int expected_gbdata[30];
  228. char *comment;
  229. };
  230. /*
  231. é U+00E9 in ISO 8859-1 plus other ISO 8859 (but not in ISO 8859-7 or ISO 8859-11), Win 1250 plus other Win, in GB 18030 0xA8A6, UTF-8 C3A9
  232. β U+03B2 in ISO 8859-7 Greek (but not other ISO 8859 or Win page), in GB 18030 0xA6C2, UTF-8 CEB2
  233. ¤ U+00A4 in ISO 8859-1 plus other ISO 8859 (but not in ISO 8859-7 or ISO 8859-11), Win 1250 plus other Win, in GB 18030 0xA1E8, UTF-8 C2A4
  234. ¥ U+00A5 in ISO 8859-1 0xA5, in GB 18030 4-byte 0x81308436, UTF-8 C2A5
  235. ・ U+30FB katakana middle dot, not in any ISO or Win page, in GB 18030 0xA1A4, duplicate of mapping of U+00B7, UTF-8 E383BB
  236. · U+00B7 middle dot in ISO 8859-1 0xB7, in GB 18030 "GB 18030 subset" 0xA1A4, duplicate of mapping of U+30FB, UTF-8 C2B7
  237. ― U+2015 horizontal bar in ISO 8859-7 Greek and ISO 8859-10 Nordic, not in any Win page, in GB 18030 "GB18030.TXT" 0xA1AA, duplicate of mapping of U+2014, UTF-8 E28095
  238. — U+2014 em dash, not in any ISO, in Win 1250 and other Win, in GB 18030 "GB 18030 subset" 0xA1AA, duplicate of mapping of U+2015, UTF-8 E28094
  239. */
  240. /* s/\/\*[ 0-9]*\*\//\=printf("\/\*%3d*\/", line(".") - line("'<")): */
  241. struct item data[] = {
  242. /* 0*/ { "é", -1, 0, 1, { 0xA8A6 }, "" },
  243. /* 1*/ { "β", -1, 0, 1, { 0xA6C2 }, "" },
  244. /* 2*/ { "¤", -1, 0, 1, { 0xA1E8 }, "" },
  245. /* 3*/ { "¥", -1, 0, 2, { 0x8130, 0x8436 }, "0x81308436" },
  246. /* 4*/ { "・", -1, 0, 2, { 0x8139, 0xA739 }, "" },
  247. /* 5*/ { "·", -1, 0, 1, { 0xA1A4 }, "GB 18030 subset mapping" },
  248. /* 6*/ { "―", -1, 0, 1, { 0xA844 }, "GB18030.TXT mapping" },
  249. /* 7*/ { "—", -1, 0, 1, { 0xA1AA }, "GB 18030 subset mapping" },
  250. /* 8*/ { "aβc・·—é—Z", -1, 0, 10, { 'a', 0xA6C2, 'c', 0x8139, 0xA739, 0xA1A4, 0xA1AA, 0xA8A6, 0xA1AA, 'Z' }, "" },
  251. /* 9*/ { "\200", -1, ZINT_ERROR_INVALID_DATA, -1, {0}, "Invalid UTF-8" },
  252. /* 10*/ { "\357\277\276", -1, 0, 2, { 0x8431, 0xA438 }, "U+FFFE (reversed BOM)" },
  253. /* 11*/ { "\357\277\277", -1, 0, 2, { 0x8431, 0xA439 }, "U+FFFF" },
  254. };
  255. int data_size = ARRAY_SIZE(data);
  256. int i, length, ret;
  257. struct zint_symbol symbol = {0};
  258. unsigned int gbdata[30];
  259. testStart("test_gb18030_utf8");
  260. for (i = 0; i < data_size; i++) {
  261. int ret_length;
  262. if (testContinue(p_ctx, i)) continue;
  263. length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length;
  264. ret_length = length;
  265. ret = gb18030_utf8(&symbol, (unsigned char *) data[i].data, &ret_length, gbdata);
  266. assert_equal(ret, data[i].ret, "i:%d ret %d != %d (%s)\n", i, ret, data[i].ret, symbol.errtxt);
  267. if (ret == 0) {
  268. int j;
  269. assert_equal(ret_length, data[i].ret_length, "i:%d ret_length %d != %d\n", i, ret_length, data[i].ret_length);
  270. for (j = 0; j < (int) ret_length; j++) {
  271. assert_equal(gbdata[j], data[i].expected_gbdata[j], "i:%d gbdata[%d] 0x%04X != 0x%04X\n", i, j, gbdata[j], data[i].expected_gbdata[j]);
  272. }
  273. }
  274. }
  275. testFinish();
  276. }
  277. static void test_gb18030_utf8_to_eci(const testCtx *const p_ctx) {
  278. struct item {
  279. int eci;
  280. int full_multibyte;
  281. char *data;
  282. int length;
  283. int ret;
  284. int ret_length;
  285. unsigned int expected_gbdata[30];
  286. char *comment;
  287. };
  288. /*
  289. é U+00E9 in ISO 8859-1 0xE9, Win 1250 plus other Win, in HANXIN Chinese mode first byte range 0x81..FE
  290. β U+03B2 in ISO 8859-7 Greek 0xE2 (but not other ISO 8859 or Win page)
  291. ¥ U+00A5 in ISO 8859-1 0xA5, in first byte range 0x81..FE
  292. ÿ U+00FF in ISO 8859-1 0xFF, outside first byte and second/third/fourth byte ranges
  293. @ U+0040 in ASCII 0x40, outside first byte range, in double-byte second byte range, outside quad-byte second/third/fourth byte ranges
  294. 9 U+0039 in ASCII 0x39, outside first byte range, outside double-byte second byte range and quad-byte third byte range, in quad-byte second/fourth byte ranges
  295. */
  296. /* s/\/\*[ 0-9]*\*\//\=printf("\/\*%3d*\/", line(".") - line("'<")): */
  297. struct item data[] = {
  298. /* 0*/ { 3, 0, "é", -1, 0, 1, { 0xE9 }, "Not full multibyte" },
  299. /* 1*/ { 3, 1, "é", -1, 0, 1, { 0xE9 }, "First byte in range but only one byte" },
  300. /* 2*/ { 3, 0, "β", -1, ZINT_ERROR_INVALID_DATA, -1, {0}, "Not full multibyte" },
  301. /* 3*/ { 3, 1, "β", -1, ZINT_ERROR_INVALID_DATA, -1, {0}, "Not in ECI 3 (ISO 8859-1)" },
  302. /* 4*/ { 9, 0, "β", -1, 0, 1, { 0xE2 }, "Not full multibyte" },
  303. /* 5*/ { 9, 1, "β", -1, 0, 1, { 0xE2 }, "In ECI 9 (ISO 8859-7)" },
  304. /* 6*/ { 3, 0, "¥", -1, 0, 1, { 0xA5 }, "Not full multibyte" },
  305. /* 7*/ { 3, 1, "¥", -1, 0, 1, { 0xA5 }, "First byte in range but only one byte" },
  306. /* 8*/ { 3, 0, "¥é", -1, 0, 2, { 0xA5, 0xE9 }, "Not full multibyte" },
  307. /* 9*/ { 3, 1, "¥é", -1, 0, 1, { 0xA5E9 }, "In double-byte range" },
  308. /* 10*/ { 3, 0, "¥ÿ", -1, 0, 2, { 0xA5, 0xFF }, "Not full multibyte" },
  309. /* 11*/ { 3, 1, "¥ÿ", -1, 0, 2, { 0xA5, 0xFF }, "First byte in range but not second" },
  310. /* 12*/ { 3, 0, "¥9é9", -1, 0, 4, { 0xA5, 0x39, 0xE9, 0x39 }, "Not full multibyte" },
  311. /* 13*/ { 3, 1, "¥9é9", -1, 0, 2, { 0xA539, 0xE939 }, "In quad-byte range" },
  312. /* 14*/ { 3, 0, "¥9", -1, 0, 2, { 0xA5, 0x39 }, "Not full multibyte" },
  313. /* 15*/ { 3, 1, "¥9", -1, 0, 2, { 0xA5, 0x39 }, "In quad-byte first/second range but only 2 bytes, not in double-byte range" },
  314. /* 16*/ { 3, 0, "¥9é", -1, 0, 3, { 0xA5, 0x39, 0xE9 }, "Not full multibyte" },
  315. /* 17*/ { 3, 1, "¥9é", -1, 0, 3, { 0xA5, 0x39, 0xE9 }, "In quad-byte first/second/third range but only 3 bytes, no bytes in double-byte range" },
  316. /* 18*/ { 3, 0, "¥9é@", -1, 0, 4, { 0xA5, 0x39, 0xE9, 0x40 }, "Not full multibyte" },
  317. /* 19*/ { 3, 1, "¥9é@", -1, 0, 3, { 0xA5, 0x39, 0xE940 }, "In quad-byte first/second/third range but not fourth, second 2 bytes in double-byte range" },
  318. /* 20*/ { 3, 0, "¥@é9", -1, 0, 4, { 0xA5, 0x40, 0xE9, 0x39 }, "Not full multibyte" },
  319. /* 21*/ { 3, 1, "¥@é9", -1, 0, 3, { 0xA540, 0xE9, 0x39 }, "In quad-byte first/third/fourth range but not second, first 2 bytes in double-byte range" },
  320. /* 22*/ { 3, 0, "¥9@9", -1, 0, 4, { 0xA5, 0x39, 0x40, 0x39 }, "Not full multibyte" },
  321. /* 23*/ { 3, 1, "¥9@9", -1, 0, 4, { 0xA5, 0x39, 0x40, 0x39 }, "In quad-byte first/second/fourth range but not third, no bytes in double-byte range" },
  322. /* 24*/ { 3, 0, "é9éé¥9é@¥9é9¥9é0é@@¥¥é0é1", -1, 0, 25, { 0xE9, 0x39, 0xE9, 0xE9, 0xA5, 0x39, 0xE9, 0x40, 0xA5, 0x39, 0xE9, 0x39, 0xA5, 0x39, 0xE9, 0x30, 0xE9, 0x40, 0x40, 0xA5, 0xA5, 0xE9, 0x30, 0xE9, 0x31 }, "" },
  323. /* 25*/ { 3, 1, "é9éé¥9é@¥9é9¥9é0é@@¥¥é0é1", -1, 0, 15, { 0xE9, 0x39, 0xE9E9, 0xA5, 0x39, 0xE940, 0xA539, 0xE939, 0xA539, 0xE930, 0xE940, 0x40, 0xA5A5, 0xE930, 0xE931 }, "" },
  324. /* 26*/ { 20, 0, "\\\\", -1, 0, 4, { 0x81, 0x5F, 0x81, 0x5F }, "Shift JIS reverse solidus (backslash) mapping from ASCII to double byte" },
  325. /* 27*/ { 20, 1, "\\\\", -1, 0, 2, { 0x815F, 0x815F }, "Shift JIS in GB 18030 Hanzi mode range" },
  326. /* 28*/ { 20, 0, "爍", -1, 0, 2, { 0xE0, 0xA1 }, "Shift JIS U+720D" },
  327. /* 29*/ { 20, 1, "爍", -1, 0, 1, { 0xE0A1 }, "Shift JIS in GB 18030 Hanzi mode range" },
  328. /* 30*/ { 25, 0, "12", -1, 0, 4, { 0x00, 0x31, 0x00, 0x32 }, "UCS-2BE ASCII" },
  329. /* 31*/ { 25, 0, "", -1, 0, 4, { 0x00, 0x81, 0x00, 0x81 }, "UCS-2BE U+0081" },
  330. /* 32*/ { 25, 1, "", -1, 0, 4, { 0x00, 0x81, 0x00, 0x81 }, "UCS-2BE outside GB 18030 Hanzi mode range" },
  331. /* 33*/ { 25, 0, "ꆩꆩ", -1, 0, 4, { 0xA1, 0xA9, 0xA1, 0xA9 }, "UCS-2BE U+A1A9" },
  332. /* 34*/ { 25, 1, "ꆩꆩ", -1, 0, 2, { 0xA1A9, 0xA1A9 }, "UCS-2BE in GB 18030 Hanzi mode range" },
  333. /* 35*/ { 25, 0, "膀膀", -1, 0, 4, { 0x81, 0x80, 0x81, 0x80 }, "UCS-2BE U+8180" },
  334. /* 36*/ { 25, 1, "膀膀", -1, 0, 2, { 0x8180, 0x8180 }, "UCS-2BE in GB 18030 Hanzi mode range (but outside GB 2312 range)" },
  335. /* 37*/ { 28, 0, "¢¢", -1, 0, 4, { 0xA2, 0x46, 0xA2, 0x46 }, "Big5 U+00A2" },
  336. /* 38*/ { 28, 1, "¢¢", -1, 0, 2, { 0xA246, 0xA246 }, "Big5 in GB 18030 Hanzi mode range (but outside GB 2312 range)" },
  337. /* 39*/ { 28, 0, "陛", -1, 0, 2, { 0xB0, 0xA1 }, "Big5 U+965B" },
  338. /* 40*/ { 28, 1, "陛", -1, 0, 1, { 0xB0A1 }, "Big5 in GB 18030 Hanzi mode range" },
  339. /* 41*/ { 29, 0, "¨¨", -1, 0, 2, { 0xA1A7, 0xA1A7 }, "GB 2312 U+00A8" },
  340. /* 42*/ { 29, 1, "¨¨", -1, 0, 2, { 0xA1A7, 0xA1A7 }, "GB 2312" },
  341. /* 43*/ { 29, 0, "崂", -1, 0, 1, { 0xE1C0 }, "GB 2312 U+5D02" },
  342. /* 44*/ { 29, 1, "崂", -1, 0, 1, { 0xE1C0 }, "GB 2312" },
  343. /* 45*/ { 29, 0, "・", -1, 0, 1, { 0xA1A4 }, "GB 2312 U+30FB" },
  344. /* 46*/ { 29, 1, "・", -1, 0, 1, { 0xA1A4 }, "GB 2312" },
  345. /* 47*/ { 29, 0, "釦", -1, ZINT_ERROR_INVALID_DATA, -1, {0}, "GB 18030 U+91E6 not in GB 2312" },
  346. /* 48*/ { 30, 0, "¡¡", -1, 0, 4, { 0x22 + 0x80, 0x2E + 0x80, 0x22 + 0x80, 0x2E + 0x80 }, "EUC-KR U+00A1 (0xA2AE)" },
  347. /* 49*/ { 30, 1, "¡¡", -1, 0, 2, { 0x222E + 0x8080, 0x222E + 0x8080 }, "All EUC-KR in GB 18030 Hanzi mode range" },
  348. /* 50*/ { 30, 0, "詰", -1, 0, 2, { 0x7D + 0x80, 0x7E + 0x80 }, "EUC-KR U+8A70 (0xFDFE)" },
  349. /* 51*/ { 30, 1, "詰", -1, 0, 1, { 0x7D7E + 0x8080 }, "All EUC-KR in GB 18030 Hanzi mode range" },
  350. /* 52*/ { 31, 0, "條", -1, 0, 1, { 0x976C }, "GBK U+689D" },
  351. /* 53*/ { 31, 1, "條", -1, 0, 1, { 0x976C }, "GBK U+689D" },
  352. /* 54*/ { 31, 0, "條碼", -1, 0, 2, { 0x976C, 0xB461 }, "GBK U+689D" },
  353. /* 55*/ { 31, 1, "條碼", -1, 0, 2, { 0x976C, 0xB461 }, "GBK U+689D" },
  354. /* 56*/ { 31, 0, "釦", -1, 0, 1, { 0xE240 }, "GB 18030 U+91E6 in GBK" },
  355. /* 57*/ { 31, 0, "€", -1, ZINT_ERROR_INVALID_DATA, -1, {0}, "GB 18030 U+20AC not in GBK" },
  356. /* 58*/ { 32, 0, "¨¨", -1, 0, 2, { 0xA1A7, 0xA1A7 }, "GB 18030 U+00A8" },
  357. /* 59*/ { 32, 1, "¨¨", -1, 0, 2, { 0xA1A7, 0xA1A7 }, "GB 18030" },
  358. /* 60*/ { 32, 0, "崂", -1, 0, 1, { 0xE1C0 }, "GB 18030 U+5D02" },
  359. /* 61*/ { 32, 1, "崂", -1, 0, 1, { 0xE1C0 }, "GB 18030" },
  360. /* 62*/ { 32, 0, "・", -1, 0, 2, { 0x8139, 0xA739 }, "GB 18030 U+30FB" },
  361. /* 63*/ { 32, 1, "・", -1, 0, 2, { 0x8139, 0xA739 }, "GB 18030" },
  362. /* 64*/ { 32, 0, "€", -1, 0, 1, { 0xA2E3 }, "GB 18030 U+20AC " },
  363. };
  364. int data_size = ARRAY_SIZE(data);
  365. int i, length, ret;
  366. unsigned int gbdata[30];
  367. testStart("test_gb18030_utf8_to_eci");
  368. for (i = 0; i < data_size; i++) {
  369. int ret_length;
  370. if (testContinue(p_ctx, i)) continue;
  371. length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length;
  372. ret_length = length;
  373. ret = gb18030_utf8_to_eci(data[i].eci, (unsigned char *) data[i].data, &ret_length, gbdata, data[i].full_multibyte);
  374. assert_equal(ret, data[i].ret, "i:%d ret %d != %d\n", i, ret, data[i].ret);
  375. if (ret == 0) {
  376. int j;
  377. assert_equal(ret_length, data[i].ret_length, "i:%d ret_length %d != %d\n", i, ret_length, data[i].ret_length);
  378. for (j = 0; j < (int) ret_length; j++) {
  379. assert_equal(gbdata[j], data[i].expected_gbdata[j], "i:%d gbdata[%d] 0x%04X != 0x%04X\n", i, j, gbdata[j], data[i].expected_gbdata[j]);
  380. }
  381. }
  382. }
  383. testFinish();
  384. }
  385. INTERNAL void gb18030_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
  386. const int full_multibyte);
  387. static void test_gb18030_cpy(const testCtx *const p_ctx) {
  388. struct item {
  389. int full_multibyte;
  390. char *data;
  391. int length;
  392. int ret;
  393. int ret_length;
  394. unsigned int expected_gbdata[30];
  395. char *comment;
  396. };
  397. /* s/\/\*[ 0-9]*\*\//\=printf("\/\*%3d*\/", line(".") - line("'<")): */
  398. struct item data[] = {
  399. /* 0*/ { 0, "\351", -1, 0, 1, { 0xE9 }, "Not full multibyte" },
  400. /* 1*/ { 1, "\351", -1, 0, 1, { 0xE9 }, "In HANXIN Chinese mode first-byte range but only one byte" },
  401. /* 2*/ { 0, "\351\241", -1, 0, 2, { 0xE9, 0xA1 }, "Not full multibyte" },
  402. /* 3*/ { 1, "\351\241", -1, 0, 1, { 0xE9A1 }, "In HANXIN Chinese range" },
  403. /* 4*/ { 0, "\241", -1, 0, 1, { 0xA1 }, "Not full multibyte" },
  404. /* 5*/ { 1, "\241", -1, 0, 1, { 0xA1 }, "In first-byte range but only one byte" },
  405. /* 6*/ { 0, "\241\241", -1, 0, 2, { 0xA1, 0xA1 }, "Not full multibyte" },
  406. /* 7*/ { 1, "\241\241", -1, 0, 1, { 0xA1A1 }, "In range" },
  407. /* 8*/ { 0, "\241\240\241\376\367\376\367\377\2012\2013", -1, 0, 12, { 0xA1, 0xA0, 0xA1, 0xFE, 0xF7, 0xFE, 0xF7, 0xFF, 0x81, 0x32, 0x81, 0x33 }, "" },
  408. /* 9*/ { 1, "\241\240\241\376\367\376\367\377\2012\2013", -1, 0, 7, { 0xA1A0, 0xA1FE, 0xF7FE, 0xF7, 0xFF, 0x8132, 0x8133 }, "" },
  409. };
  410. int data_size = ARRAY_SIZE(data);
  411. int i, length;
  412. unsigned int gbdata[30];
  413. testStart("test_gb18030_cpy");
  414. for (i = 0; i < data_size; i++) {
  415. int ret_length;
  416. int j;
  417. if (testContinue(p_ctx, i)) continue;
  418. length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length;
  419. ret_length = length;
  420. gb18030_cpy_test((unsigned char *) data[i].data, &ret_length, gbdata, data[i].full_multibyte);
  421. assert_equal(ret_length, data[i].ret_length, "i:%d ret_length %d != %d\n", i, ret_length, data[i].ret_length);
  422. for (j = 0; j < (int) ret_length; j++) {
  423. assert_equal(gbdata[j], data[i].expected_gbdata[j], "i:%d gbdata[%d] %04X != %04X\n", i, j, gbdata[j], data[i].expected_gbdata[j]);
  424. }
  425. }
  426. testFinish();
  427. }
  428. INTERNAL int u_gbk_int_test(const unsigned int u, unsigned int *dest);
  429. /* Control for GBK */
  430. static int u_gbk_int2(unsigned int u, unsigned int *dest) {
  431. unsigned int c;
  432. int tab_length, start_i, end_i;
  433. int i;
  434. tab_length = ARRAY_SIZE(test_gbk_tab);
  435. start_i = test_gbk_tab_ind[u >> 10];
  436. end_i = start_i + 0x800 > tab_length ? tab_length : start_i + 0x800;
  437. for (i = start_i; i < end_i; i += 2) {
  438. if (test_gbk_tab[i + 1] == u) {
  439. c = test_gbk_tab[i];
  440. if (c <= 0xFFFF) {
  441. *dest = c;
  442. return c <= 0xFF ? 1 : 2;
  443. }
  444. return 0;
  445. }
  446. }
  447. return 0;
  448. }
  449. static void test_u_gbk_int(const testCtx *const p_ctx) {
  450. int ret, ret2;
  451. unsigned int val, val2;
  452. unsigned int i;
  453. testStart("test_u_gbk_int");
  454. for (i = 0; i < 0xFFFE; i++) {
  455. if (i >= 0xD800 && i <= 0xDFFF) { /* UTF-16 surrogates */
  456. continue;
  457. }
  458. if (testContinue(p_ctx, i)) continue;
  459. val = val2 = 0;
  460. ret = u_gbk_int_test(i, &val);
  461. ret2 = u_gbk_int2(i, &val2);
  462. assert_equal(ret, ret2, "i:%d 0x%04X ret %d != ret2 %d, val 0x%04X, val2 0x%04X\n", (int) i, i, ret, ret2, val, val2);
  463. if (ret2) {
  464. assert_equal(val, val2, "i:%d 0x%04X val 0x%04X != val2 0x%04X\n", (int) i, i, val, val2);
  465. }
  466. }
  467. testFinish();
  468. }
  469. #define TEST_PERF_ITER_MILLES 100
  470. #define TEST_PERF_ITERATIONS (TEST_PERF_ITER_MILLES * 1000)
  471. /* Not a real test, just performance indicator */
  472. static void test_perf(const testCtx *const p_ctx) {
  473. int debug = p_ctx->debug;
  474. struct item {
  475. char *data;
  476. int ret;
  477. char *comment;
  478. };
  479. struct item data[] = {
  480. /* 0*/ { "1234567890", 0, "10 numerics" },
  481. /* 1*/ { "条码北京條碼པེ་ཅིང།バーコード바코드", 0, "Small various code pages" },
  482. /* 2*/ { "Summer Palace Ticket for 6 June 2015 13:00;2015年6月6日夜01時00分PM頤和園のチケット;2015년6월6일13시오후여름궁전티켓.2015年6月6号下午13:00的颐和园门票;", 0, "Small mixed ASCII/Hanzi" },
  483. /* 3*/ { "汉信码标准\015\012中国物品编码中心\015\012北京网路畅想科技发展有限公司\015\012张成海、赵楠、黄燕滨、罗秋科、王毅、张铎、王越\015\012施煜、边峥、修兴强\015\012汉信码标准\015\012中国物品编码中心\015\012北京网路畅想科技发展有限公司", 0, "Bigger mixed" },
  484. /* 4*/ { "本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方案、信息编码方法、纠错编译码算法、信息排布方法、参考译码算法等内容进行了详细的描述,汉信码可高效表示《GB 18030—2000 信息技术 信息交换用汉字编码字符集基本集的扩充》中的汉字信息,并具有数据容量大、抗畸变和抗污损能力强、外观美观等特点,适合于在我国各行业的广泛应用。 测试文本,测试人:施煜,边峥,修兴强,袁娲,测试目的:汉字表示,测试版本:40\015\012", 0, "Bigger mixed" },
  485. /* 5*/ { "本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方案、信息编码方法、纠错编译码算法、信息排布方法、参考译码算法等内容进行了详细的描述,汉信码可高效表示《GB 18030—2000 信息技术 信息交换用汉字编码字符集基本集的扩充》中的汉字信息,并具有数据容量大、抗畸变和抗污损能力强、外观美观等特点,适合于在我国各行业的广泛应用。 测试文本,测试人:施煜,边峥,修兴强,袁娲,测试目的:汉字表示,测试版本:40\015\012本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方案、信息编码方法、纠错编译码算法、信息排布方法、参考译码算法等内容进行了详细的描述,汉信码可高效表示《GB 18030—2000 信息技术 信息交换用汉字编码字符集基本集的扩充》中的汉字信息,并具有数据容量大、抗畸变和抗污损能力强、外观美观等特点,适合于在我国各行业的广泛应用。 测试文本,测试人:施煜,边峥,修兴强,袁娲,测试目的:汉字表示,测试版本:40\015\012本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方案、信息编码方法、纠错编译码算法RS、信息排布方法、参考译码算法等内容进行了详细的描述,汉信码可高效表示《GB 18030—2000 信息技术 122", 0, "Medium mixed" },
  486. /* 6*/ { "本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方案、信息编码方法、纠错编译码算法、信息排布方法、参考译码算法等内容进行了详细的描述,汉信码可高效表示《GB 18030—2000 信息技术 信息交换用汉字编码字符集基本集的扩充》中的汉字信息,并具有数据容量大、抗畸变和抗污损能力强、外观美观等特点,适合于在我国各行业的广泛应用。 测试文本,测试人:施煜,边峥,修兴强,袁娲,测试目的:汉字表示,测试版本:84\015\012本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方案、信息编码方法、纠错编译码算法、信息排布方法、参考译码算法等内容进行了详细的描述,汉信码可高效表示《GB 18030—2000 信息技术 信息交换用汉字编码字符集基本集的扩充》中的汉字信息,并具有数据容量大、抗畸变和抗污损能力强、外观美观等特点,适合于在我国各行业的广泛应用。 测试文本,测试人:施煜,边峥,修兴强,袁娲,测试目的:汉字表示,测试版本:84\015\012本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方案、信息编码方法、纠错编译码算法、信息排布方法、参考译码算法等内容进行了详细的描述,汉信码可高效表示《GB 18030—2000 信息技术 信息交换用汉字编码字符集基本集的扩充》中的汉字信息,并具有数据容量大、抗畸变和抗污损能力强、外观美观等特点,适合于在我国各行业的广泛应用。 测试文本,测试人:施煜,边峥,修兴强,袁娲,测试目的:汉字表示,测试版本:40本标准规定了一种矩阵式二维条码——汉信码的码制以及编译码方法。本标准中对汉信码的码图方", 0, "Bigger mixed" },
  487. };
  488. int data_size = ARRAY_SIZE(data);
  489. int i, length, ret;
  490. struct zint_symbol symbol = {0};
  491. int ret_length;
  492. #ifdef TEST_JUST_SAY_GNO
  493. int ret_length2;
  494. #endif
  495. unsigned int ddata[8192];
  496. unsigned char dest[8192];
  497. int ret2 = 0;
  498. #ifdef TEST_JUST_SAY_GNO
  499. unsigned int ddata2[8192];
  500. #endif
  501. clock_t start;
  502. clock_t total = 0, total_gno = 0, total_eci = 0;
  503. clock_t diff, diff_gno, diff_eci;
  504. int comment_max = 0;
  505. if (!(debug & ZINT_DEBUG_TEST_PERFORMANCE)) { /* -d 256 */
  506. return;
  507. }
  508. for (i = 0; i < data_size; i++) if ((int) strlen(data[i].comment) > comment_max) comment_max = (int) strlen(data[i].comment);
  509. printf("Iterations %d\n", TEST_PERF_ITERATIONS);
  510. for (i = 0; i < data_size; i++) {
  511. int j;
  512. if (testContinue(p_ctx, i)) continue;
  513. length = (int) strlen(data[i].data);
  514. diff = diff_gno = diff_eci = 0;
  515. for (j = 0; j < TEST_PERF_ITERATIONS; j++) {
  516. ret_length = length;
  517. start = clock();
  518. ret = gb18030_utf8(&symbol, (unsigned char *) data[i].data, &ret_length, ddata);
  519. diff += clock() - start;
  520. #ifdef TEST_JUST_SAY_GNO
  521. ret_length2 = length;
  522. start = clock();
  523. ret2 = gb18030_utf8_wctomb(&symbol, (unsigned char *) data[i].data, &ret_length2, ddata2);
  524. diff_gno += clock() - start;
  525. #endif
  526. ret_length = length;
  527. start = clock();
  528. (void)utf8_to_eci(32, (unsigned char *) data[i].data, dest, &ret_length);
  529. diff_eci += clock() - start;
  530. }
  531. assert_equal(ret, ret2, "i:%d ret %d != ret2 %d\n", (int) i, ret, ret2);
  532. printf("%*s: new % 8gms, gno % 8gms ratio % 9g, eci %gms\n", comment_max, data[i].comment,
  533. TEST_PERF_TIME(diff), TEST_PERF_TIME(diff_gno), TEST_PERF_RATIO(diff, diff_gno), TEST_PERF_TIME(diff_eci));
  534. total += diff;
  535. total_gno += diff_gno;
  536. }
  537. if (p_ctx->index == -1) {
  538. printf("%*s: new % 8gms, gno % 8gms ratio % 9g, eci %gms\n", comment_max, "totals",
  539. TEST_PERF_TIME(total), TEST_PERF_TIME(total_gno), TEST_PERF_RATIO(total, total_gno), TEST_PERF_TIME(total_eci));
  540. }
  541. }
  542. int main(int argc, char *argv[]) {
  543. testFunction funcs[] = { /* name, func */
  544. { "test_u_gb18030_int", test_u_gb18030_int },
  545. { "test_gb18030_utf8", test_gb18030_utf8 },
  546. { "test_gb18030_utf8_to_eci", test_gb18030_utf8_to_eci },
  547. { "test_gb18030_cpy", test_gb18030_cpy },
  548. { "test_u_gbk_int", test_u_gbk_int },
  549. { "test_perf", test_perf },
  550. };
  551. testRun(argc, argv, funcs, ARRAY_SIZE(funcs));
  552. testReport();
  553. return 0;
  554. }
  555. /* vim: set ts=4 sw=4 et : */