gen_eci_mb_h.php 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720
  1. <?php
  2. /* Generate ECI multibyte tables from unicode.org mapping files */
  3. /*
  4. libzint - the open source barcode library
  5. Copyright (C) 2022-2023 Robin Stuart <rstuart114@gmail.com>
  6. */
  7. /* SPDX-License-Identifier: BSD-3-Clause */
  8. /*
  9. * To create "backend/eci_big5/gb18030/gb2312/gbk/ksx1001/sjis.h" (from project root directory):
  10. *
  11. * php backend/tools/gen_eci_mb_h.php
  12. *
  13. * NOTE: backend/tools/data/GB18030.TXT will have to be downloaded first from the tarball
  14. * https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2
  15. * using the version jdk-1.4.2/GB18030.TXT
  16. *
  * NOTE: backend/tools/data/GB2312.TXT will have to be downloaded first from the tarball
  18. * https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2
  19. * using the version unicode.org-mappings/EASTASIA/GB/GB2312.TXT
  20. */
  // 'zend.assertions' should be set to 1 in php.ini
  22. $copyright_text = <<<'EOD'
  23. Redistribution and use in source and binary forms, with or without
  24. modification, are permitted provided that the following conditions
  25. are met:
  26. 1. Redistributions of source code must retain the above copyright
  27. notice, this list of conditions and the following disclaimer.
  28. 2. Redistributions in binary form must reproduce the above copyright
  29. notice, this list of conditions and the following disclaimer in the
  30. documentation and/or other materials provided with the distribution.
  31. 3. Neither the name of the project nor the names of its contributors
  32. may be used to endorse or promote products derived from this software
  33. without specific prior written permission.
  34. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  35. ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  36. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  37. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  38. FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  39. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  40. OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  41. HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  42. LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  43. OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  44. SUCH DAMAGE.
  45. */
  46. /* SPDX-License-Identifier: BSD-3-Clause */
  47. EOD;
  // Name and directory of this script; the name is used in generated headers and error messages
  $basename = basename(__FILE__);
  $dirname = __DIR__;

  // Command line options: `-d <dir>` and `-o <dir>`
  $opts = getopt('d:o:');

  // Where to load file from.
  $data_dirname = $dirname . '/data';
  if (isset($opts['d'])) {
      $data_dirname = $opts['d'];
  }

  // Where to put output.
  $out_dirname = $dirname . '/..';
  if (isset($opts['o'])) {
      $out_dirname = $opts['o'];
  }

  // End year used in the copyright line of the generated headers
  $year = 2022;
  /*
   * Append the opening of a generated header file to `$out`: the banner comment naming the header,
   * its generator and its source mapping file, then the copyright/licence text and the include guard.
   *
   * `$out`           array of output lines, appended to
   * `$name`          base name of the header (also upper-cased for the include guard)
   * `$descr`         description of the target character set
   * `$file`          mapping file the tables were generated from
   * `$start_year`    first copyright year; if zero or equal to the global `$year` a single year is output
   * `$extra_comment` optional extra line for the banner comment
   *
   * Uses globals `$copyright_text`, `$basename` and `$year`.
   */
  function out_header(&$out, $name, $descr, $file, $start_year = 0, $extra_comment = '') {
      global $copyright_text, $basename, $year;

      // Banner comment, closed on the "from" line unless there's an extra comment to follow
      $out[] = '/* ' . $name . '.h - tables for Unicode to ' . $descr . ', generated by "backend/tools/' . $basename . '"';
      $from = ' from "' . $file . '"';
      if ($extra_comment === '') {
          $out[] = $from . ' */';
      } else {
          $out[] = $from;
          $out[] = ' ' . $extra_comment . ' */';
      }

      // Copyright comment, using a year range only if the start year differs from the current one
      $years = ($start_year && $start_year != $year) ? ($start_year . '-' . $year) : $year;
      $out[] = '/*';
      $out[] = ' libzint - the open source barcode library';
      $out[] = ' Copyright (C) ' . $years . ' Robin Stuart <rstuart114@gmail.com>';

      // Licence text (which also closes the copyright comment)
      $out = array_merge($out, explode("\n", $copyright_text));

      // Include guard
      $guard = 'Z_' . strtoupper($name) . '_H';
      $out[] = '#ifndef ' . $guard;
      $out[] = '#define ' . $guard;
  }
  /*
   * Output a block of table entries to `$out` array, 8 entries per line.
   *
   * `$out`     array of output lines, appended to
   * `$arr`     values to output, indexed from zero
   * `$cnt`     number of values of `$arr` to output
   * `$not_hex` if set values are output as right-aligned decimals, otherwise as 4-digit hex
   */
  function out_tab_entries(&$out, $arr, $cnt, $not_hex = false) {
      $fmt = $not_hex ? ' %5d,' : ' 0x%04X,';
      $row = ' ';
      for ($pos = 0; $pos < $cnt; $pos++) {
          // Row is full once a non-zero multiple of 8 is reached, so flush and begin a new one
          if ($pos > 0 && $pos % 8 === 0) {
              $out[] = $row;
              $row = ' ';
          }
          $row .= sprintf($fmt, $arr[$pos]);
      }
      // Flush any final (possibly partial) row
      if ($row !== ' ') {
          $out[] = $row;
      }
  }
  /*
   * Output tables to `$out` array.
   *
   * Always outputs the Unicode table `<name>_u[]`. If `$mb` is not empty also outputs the multibyte table
   * `<name>_mb[]`, and unless `$no_u_ind` is set the index table `<name>_u_ind[]`.
   *
   * `$out`        array of output lines, appended to (expected to be a zero-based list)
   * `$name`       prefix of the C array names
   * `$sort`       Unicode codepoints in ascending order, indexed from zero
   * `$mb`         multibyte values
   * `$no_u_ind`   if set the `<name>_u_ind[]` table is not output
   * `$u_comment`  comment for the Unicode table, defaulted if empty
   * `$mb_comment` comment for the multibyte table, defaulted if empty
   */
  function out_tabs(&$out, $name, $sort, $mb, $no_u_ind = false, $u_comment = '', $mb_comment = '') {
      if ($u_comment == '') $u_comment = 'Unicode codepoints sorted';
      $cnt_sort = count($sort);
      $out[] = '';
      $out[] = '/* ' . $u_comment . ' */';
      $out[] = 'static const unsigned short ' . $name . '_u[' . $cnt_sort . '] = {';
      out_tab_entries($out, $sort, $cnt_sort);
      $out[] = '};';

      if (!empty($mb)) {
          if ($mb_comment == '') $mb_comment = 'Multibyte values sorted in Unicode order';
          $cnt = count($mb);
          $out[] = '';
          $out[] = '/* ' . $mb_comment . ' */';
          $out[] = 'static const unsigned short ' . $name . '_mb[' . $cnt . '] = {';
          out_tab_entries($out, $mb, $cnt);
          $out[] = '};';
      }

      if (!$no_u_ind) {
          $out[] = '';
          $out[] = '/* Indexes into Unicode `' . $name . '_u[]` array in blocks of 0x100 */';
          // Array size not known until the entries have been output, so remember where the declaration is
          $ind_idx = count($out);
          $out[] = 'static const unsigned short ' . $name . '_u_ind[] = {';
          $line = ' ';
          $i = 0; // Number of index entries output so far, i.e. next 0x100 block needing an entry
          foreach ($sort as $ind => $u) {
              // 0x100 block of this codepoint relative to the first codepoint
              $div = ($u - $sort[0]) >> 8;
              // Every block up to and including this one that has no entry yet gets this codepoint's index
              while ($div >= $i) {
                  if ($i && $i % 8 === 0) {
                      $out[] = $line;
                      $line = ' ';
                  }
                  $line .= sprintf(' %5d,', $ind);
                  $i++;
              }
          }
          if ($line !== ' ') {
              $out[] = $line;
          }
          $out[] = '};';
          // Fill in the array size now that it's known
          $out[$ind_idx] = 'static const unsigned short ' . $name . '_u_ind[' . $i . '] = {';
      }
  }
  /*
   * Helper to output special-case URO (Unified Repertoire and Ordering) block (U+4E00-U+9FFF) tables.
   *
   * Outputs `<name>_uro_u[]` (as hex) followed by `<name>_uro_mb_ind[]` (as decimal).
   *
   * `$out`            array of output lines, appended to
   * `$name`           prefix of the C array names
   * `$tab_uro_u`      usage bit-flags
   * `$tab_uro_mb_ind` multibyte indexes
   */
  function out_uro_tabs(&$out, $name, $tab_uro_u, $tab_uro_mb_ind) {
      $flags_cnt = count($tab_uro_u);
      $inds_cnt = count($tab_uro_mb_ind);

      // Usage flags table
      $out[] = '';
      $out[] = '/* Unicode usage bit-flags for URO (U+4E00-U+9FFF) block */';
      $out[] = 'static const unsigned short ' . $name . '_uro_u[' . $flags_cnt . '] = {';
      out_tab_entries($out, $tab_uro_u, $flags_cnt);
      $out[] = '};';

      // Multibyte indexes table
      $out[] = '';
      $out[] = '/* Multibyte indexes for URO (U+4E00-U+9FFF) block */';
      $out[] = 'static const unsigned short ' . $name . '_uro_mb_ind[' . $inds_cnt . '] = {';
      out_tab_entries($out, $tab_uro_mb_ind, $inds_cnt, true /*not_hex*/);
      $out[] = '};';
  }
  // BIG5

  $out = array();
  $file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT';
  out_header($out, 'big5', 'Big5', $file, 2021);

  // Read the file.
  if (($get = file_get_contents($file)) === false) {
      error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
      echo $error, PHP_EOL;
      exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
  }

  $lines = explode("\n", $get);

  // Parse the file.
  $sort = array(); // Unicode codepoints
  $mb = array(); // Corresponding multibyte values
  foreach ($lines as $line) {
      $line = trim($line);
      // Only lines beginning "0x" that have a mapping are of interest
      if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
          continue;
      }
      $matches = array();
      if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
          $d = hexdec($matches[1]);
          $u = hexdec($matches[2]);
          $sort[] = $u;
          $mb[] = $d;
      }
  }

  // Sort into Unicode order, keeping the multibyte values paired with their codepoints
  array_multisort($sort, $mb);

  // Calculate URO (U+4E00-U+9FFF) table

  // Find index of first codepoint at or above U+4E00
  for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

  $start_u_i = $u_i;
  // Index of last URO codepoint found; initialized so that nothing is removed below if none are found
  // (and so that a value left over from a previously processed character set can't be used)
  $end_u_i = $start_u_i - 1;
  $big5_uro_u = $big5_uro_mb_ind = array();
  $sort_search = array_flip($sort); // Codepoint => index lookup
  // Process in rows of 16 codepoints, one bit-flag word and one index per row
  for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
      $used = 0;
      $next_u_i = $u_i;
      for ($j = 0; $j < 16; $j++) {
          if (isset($sort_search[$u + $j])) {
              $i = $sort_search[$u + $j];
              $used |= 1 << $j; // Bit `j` set if codepoint `u + j` is mapped
              $next_u_i = $i + 1;
              $end_u_i = $i;
          }
      }
      $big5_uro_u[] = $used;
      $big5_uro_mb_ind[] = $u_i; // Index reached on entry to this row
      $u_i = $next_u_i;
  }

  // Output URO tables
  out_uro_tabs($out, 'big5', $big5_uro_u, $big5_uro_mb_ind);

  // Remove URO block from Unicode table (the multibyte table `$mb` is left whole)
  array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

  // Output Big5 tables
  out_tabs($out, 'big5', $sort, $mb, true /*no_ind*/);

  $out[] = '';
  $out[] = '#endif /* Z_BIG5_H */';

  $out_file = $out_dirname . '/big5.h';
  if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
      error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
      echo $error, PHP_EOL;
      exit(1);
  }
  // EUC-KR (KS X 1001)

  $out = array();
  $file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT';
  out_header($out, 'ksx1001', 'EUC-KR (KS X 1001)', $file, 2021);

  // Read the file.
  if (($get = file_get_contents($file)) === false) {
      error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
      echo $error, PHP_EOL;
      exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
  }

  $lines = explode("\n", $get);

  // Parse the file.
  $sort = array(); // Unicode codepoints
  $mb = array(); // Corresponding multibyte values
  foreach ($lines as $line) {
      $line = trim($line);
      // Only lines beginning "0x" that have a mapping are of interest
      if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
          continue;
      }
      $matches = array();
      if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
          $d = hexdec($matches[1]) + 0x8080; // Convert to EUC-KR
          $u = hexdec($matches[2]);
          $sort[] = $u;
          $mb[] = $d;
      }
  }

  // Add some characters defined later than in KSX1001.TXT
  $sort[] = 0x20AC; // Euro sign added KS X 1001:1998
  $mb[] = 0x2266 + 0x8080;
  $sort[] = 0xAE; // Registered trademark added KS X 1001:1998
  $mb[] = 0x2267 + 0x8080;
  $sort[] = 0x327E; // Korean postal code symbol added KS X 1001:2002
  $mb[] = 0x2268 + 0x8080;

  // Sort into Unicode order, keeping the multibyte values paired with their codepoints
  array_multisort($sort, $mb);

  // Calculate URO (U+4E00-U+9FFF) table

  // Find index of first codepoint at or above U+4E00
  for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

  $start_u_i = $u_i;
  // Index of last URO codepoint found; initialized so that nothing is removed below if none are found
  // (and so that a value left over from a previously processed character set can't be used)
  $end_u_i = $start_u_i - 1;
  $ksx1001_uro_u = $ksx1001_uro_mb_ind = array();
  $sort_search = array_flip($sort); // Codepoint => index lookup
  // Process in rows of 16 codepoints, one bit-flag word and one index per row
  for ($u = 0x4E00; $u <= 0x9F9F; $u += 16) {
      $used = 0;
      $next_u_i = $u_i;
      for ($j = 0; $j < 16; $j++) {
          if (isset($sort_search[$u + $j])) {
              $i = $sort_search[$u + $j];
              $used |= 1 << $j; // Bit `j` set if codepoint `u + j` is mapped
              $next_u_i = $i + 1;
              $end_u_i = $i;
          }
      }
      $ksx1001_uro_u[] = $used;
      $ksx1001_uro_mb_ind[] = $u_i; // Index reached on entry to this row
      $u_i = $next_u_i;
  }

  // Output URO tables
  out_uro_tabs($out, 'ksx1001', $ksx1001_uro_u, $ksx1001_uro_mb_ind);

  // Remove URO block from Unicode table (the multibyte table `$mb` is left whole)
  array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

  // Output KS X 1001 tables
  out_tabs($out, 'ksx1001', $sort, $mb);

  $out[] = '';
  $out[] = '#endif /* Z_KSX1001_H */';

  $out_file = $out_dirname . '/ksx1001.h';
  if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
      error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
      echo $error, PHP_EOL;
      exit(1);
  }
  // Shift JIS

  $out = array();
  $file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT';
  out_header($out, 'sjis', 'Shift JIS', $file, 2009);

  // Read the file.
  if (($get = file_get_contents($file)) === false) {
      error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
      echo $error, PHP_EOL;
      exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
  }

  $lines = explode("\n", $get);

  // Parse the file.
  $sort = array(); // Unicode codepoints
  $mb = array(); // Corresponding multibyte values
  foreach ($lines as $line) {
      $line = trim($line);
      // Only lines beginning "0x" that have a mapping are of interest
      if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
          continue;
      }
      $matches = array();
      if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
          $d = hexdec($matches[1]);
          // Skip values below 0x80 apart from 0x5C and 0x7E
          // NOTE(review): presumably as these two are the only ones not mapping to ASCII - confirm
          if ($d < 0x80 && $d != 0x5C && $d != 0x7E) {
              continue;
          }
          $u = hexdec($matches[2]);
          // PUA characters (user-defined range), dealt with programatically by `u_sjis()`
          // See CJKV Information Processing by Ken Lunde, 2nd ed., Table 4-86, p.286
          // https://file.allitebooks.com/20160708/CJKV%20Information%20Processing.pdf
          if ($u >= 0xE000 && $u <= 0xE757) {
              continue;
          }
          $sort[] = $u;
          $mb[] = $d;
      }
  }

  // Sort into Unicode order, keeping the multibyte values paired with their codepoints
  array_multisort($sort, $mb);

  // Calculate URO (U+4E00-U+9FFF) table

  // Find index of first codepoint at or above U+4E00
  for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

  $start_u_i = $u_i;
  // Index of last URO codepoint found; initialized so that nothing is removed below if none are found
  // (and so that a value left over from a previously processed character set can't be used)
  $end_u_i = $start_u_i - 1;
  $sjis_uro_u = $sjis_uro_mb_ind = array();
  $sort_search = array_flip($sort); // Codepoint => index lookup
  // Process in rows of 16 codepoints, one bit-flag word and one index per row
  for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
      $used = 0;
      $next_u_i = $u_i;
      for ($j = 0; $j < 16; $j++) {
          if (isset($sort_search[$u + $j])) {
              $i = $sort_search[$u + $j];
              $used |= 1 << $j; // Bit `j` set if codepoint `u + j` is mapped
              $next_u_i = $i + 1;
              $end_u_i = $i;
          }
      }
      $sjis_uro_u[] = $used;
      $sjis_uro_mb_ind[] = $u_i; // Index reached on entry to this row
      $u_i = $next_u_i;
  }

  // Output URO tables
  out_uro_tabs($out, 'sjis', $sjis_uro_u, $sjis_uro_mb_ind);

  // Remove URO block from Unicode table (the multibyte table `$mb` is left whole)
  array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

  // Output Shift JIS tables
  out_tabs($out, 'sjis', $sort, $mb, true /*no_ind*/);

  $out[] = '';
  $out[] = '#endif /* Z_SJIS_H */';

  $out_file = $out_dirname . '/sjis.h';
  if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
      error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
      echo $error, PHP_EOL;
      exit(1);
  }
  // GB 2312

  $out = array();
  out_header($out, 'gb2312', 'GB 2312-1980 (EUC-CN)',
      'unicode.org-mappings/EASTASIA/GB/GB2312.TXT', 2009,
      '(see https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2)');

  $file = $data_dirname . '/' . 'GB2312.TXT';

  // Read the file.
  if (($get = file_get_contents($file)) === false) {
      error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
      echo $error, PHP_EOL;
      exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
  }

  $lines = explode("\n", $get);

  // Parse the file.
  $sort = array(); // Unicode codepoints
  $mb = array(); // Corresponding multibyte values
  $in_gb2312 = array(); // Codepoints mapped by GB 2312, consulted later when processing GBK
  foreach ($lines as $line) {
      $line = trim($line);
      // Only lines beginning "0x" that have a mapping are of interest
      if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
          continue;
      }
      $matches = array();
      if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
          $d = hexdec($matches[1]);
          if ($d < 0x80) {
              continue;
          }
          $u = hexdec($matches[2]);
          $sort[] = $u;
          $mb[] = $d + 0x8080; // Convert to EUC-CN
          $in_gb2312[$u] = true;
      }
  }

  // Sort into Unicode order, keeping the multibyte values paired with their codepoints
  array_multisort($sort, $mb);

  // Calculate URO (U+4E00-U+9FFF) table

  // Find index of first codepoint at or above U+4E00
  for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

  $start_u_i = $u_i;
  // Index of last URO codepoint found; initialized so that nothing is removed below if none are found
  // (and so that a value left over from a previously processed character set can't be used)
  $end_u_i = $start_u_i - 1;
  $gb2312_uro_u = $gb2312_uro_mb_ind = array();
  $sort_search = array_flip($sort); // Codepoint => index lookup
  // Process in rows of 16 codepoints, one bit-flag word and one index per row
  for ($u = 0x4E00; $u <= 0x9CEF; $u += 16) {
      $used = 0;
      $next_u_i = $u_i;
      for ($j = 0; $j < 16; $j++) {
          if (isset($sort_search[$u + $j])) {
              $i = $sort_search[$u + $j];
              $used |= 1 << $j; // Bit `j` set if codepoint `u + j` is mapped
              $next_u_i = $i + 1;
              $end_u_i = $i;
          }
      }
      $gb2312_uro_u[] = $used;
      $gb2312_uro_mb_ind[] = $u_i; // Index reached on entry to this row
      $u_i = $next_u_i;
  }

  // Output URO tables
  out_uro_tabs($out, 'gb2312', $gb2312_uro_u, $gb2312_uro_mb_ind);

  // Remove URO block from Unicode table (the multibyte table `$mb` is left whole)
  array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

  // Output GB 2312 tables
  out_tabs($out, 'gb2312', $sort, $mb);

  $out[] = '';
  $out[] = '#endif /* Z_GB2312_H */';

  $out_file = $out_dirname . '/gb2312.h';
  if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
      error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
      echo $error, PHP_EOL;
      exit(1);
  }
  // GBK

  $out = array();
  // Note this has weird 0x80 mapping to U+20AC (EURO SIGN) which needs to be ignored
  $file = 'https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT';
  out_header($out, 'gbk', 'GBK, excluding mappings in GB 2312', $file);

  // Read the file.
  if (($get = file_get_contents($file)) === false) {
      error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
      echo $error, PHP_EOL;
      exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
  }

  $lines = explode("\n", $get);

  // Parse the file.
  $sort = array(); // Unicode codepoints
  $mb = array(); // Corresponding multibyte values
  $in_gbk = array(); // Codepoints mapped by GBK, consulted later when processing GB 18030
  foreach ($lines as $line) {
      $line = trim($line);
      // Only lines beginning "0x" that have a mapping are of interest
      if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
          continue;
      }
      $matches = array();
      if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
          $d = hexdec($matches[1]);
          if ($d <= 0x80) { // Ignore weird 0x80 mapping to U+20AC (EURO SIGN) if any (present in Unicode Public mapping file)
              continue;
          }
          $u = hexdec($matches[2]);
          $in_gbk[$u] = true;
          // Exclude mappings already in GB 2312 (`$in_gb2312` is set when processing GB 2312 above)
          if ($u != 0x2015 && isset($in_gb2312[$u])) { // U+2015 mapped differently by GBK
              continue;
          }
          $sort[] = $u;
          $mb[] = $d;
      }
  }

  // Sort into Unicode order, keeping the multibyte values paired with their codepoints
  array_multisort($sort, $mb);

  // Calculate URO (U+4E00-U+9FFF) table

  // Find index of first codepoint at or above U+4E00
  for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

  $start_u_i = $u_i;
  // Index of last URO codepoint found; initialized so that nothing is removed below if none are found
  // (and so that a value left over from a previously processed character set can't be used)
  $end_u_i = $start_u_i - 1;
  $gbk_uro_u = $gbk_uro_mb_ind = array();
  $sort_search = array_flip($sort); // Codepoint => index lookup
  // Process in rows of 16 codepoints, one bit-flag word and one index per row
  for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
      $used = 0;
      $next_u_i = $u_i;
      for ($j = 0; $j < 16; $j++) {
          if (isset($sort_search[$u + $j])) {
              $i = $sort_search[$u + $j];
              $used |= 1 << $j; // Bit `j` set if codepoint `u + j` is mapped
              $next_u_i = $i + 1;
              $end_u_i = $i;
          }
      }
      $gbk_uro_u[] = $used;
      $gbk_uro_mb_ind[] = $u_i; // Index reached on entry to this row
      $u_i = $next_u_i;
  }

  // Output URO tables
  out_uro_tabs($out, 'gbk', $gbk_uro_u, $gbk_uro_mb_ind);

  // Remove URO block from Unicode table (the multibyte table `$mb` is left whole)
  array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

  // Output GBK tables
  out_tabs($out, 'gbk', $sort, $mb, true /*no_ind*/);

  $out[] = '';
  $out[] = '#endif /* Z_GBK_H */';

  $out_file = $out_dirname . '/gbk.h';
  if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
      error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
      echo $error, PHP_EOL;
      exit(1);
  }
  // GB 18030

  $out = array();
  out_header($out, 'gb18030', 'GB 18030-2005', 'jdk-1.4.2/GB18030.TXT', 2016,
      '(see https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2)');

  $file = $data_dirname . '/' . 'GB18030.TXT';

  // Read the file.
  if (($get = file_get_contents($file)) === false) {
      error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
      echo $error, PHP_EOL;
      exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
  }

  $lines = explode("\n", $get);

  // Parse the file.
  $sort2 = array(); // Unicode codepoints with multibyte values below 0x10000 (2-byte)
  $mb2 = array();
  $sort4 = array(); // Unicode codepoints with multibyte values of 0x10000 or above (4-byte)
  $mb4 = array();
  foreach ($lines as $line) {
      $line = trim($line);
      // Only lines beginning "0x" that have a mapping are of interest
      if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
          continue;
      }
      if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{5})/', $line)) { // Exclude U+10000..10FFFF to save space
          continue;
      }
      $matches = array();
      if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{4}).*$/', $line, $matches)) {
          $d = hexdec($matches[1]);
          if ($d < 0x80) {
              continue;
          }
          $u = hexdec($matches[2]);
          // Note the 2-byte extension GB 18030-2005 changes U+9FB4..9FBB and U+FE10..FE19 (were PUA, see
          // Table 3-37, p.109, Lunde 2nd ed.) are not skipped here; their 2-byte mappings are added below
          // 4-byte extension change, PUA
          if ($u == 0xE7C7) {
              continue;
          }
          if ($d < 0x10000) {
              // Exclude mappings already in GBK (`$in_gbk` is set when processing GBK above)
              if (isset($in_gbk[$u])) {
                  continue;
              }
              // User-defined, dealt with programatically by `u_gb18030()`
              if ($u >= 0xE000 && $u <= 0xE765) {
                  continue;
              }
              $sort2[] = $u;
              $mb2[] = $d;
          } else if ($u < 0x10000) {
              $sort4[] = $u;
              $mb4[] = $d;
          }
      }
  }

  /* 2-byte extension GB 18030-2005 change, was PUA U+E7C7 below, see Table 3-39, p.111, Lunde 2nd ed. */
  $sort2[] = 0x1E3F; $mb2[] = 0xA8BC;
  /* 2-byte extension GB 18030-2005 change, were PUA, see Table 3-37, p.109, Lunde 2nd ed. */
  $sort2[] = 0x9FB4; $mb2[] = 0xFE59;
  $sort2[] = 0x9FB5; $mb2[] = 0xFE61;
  $sort2[] = 0x9FB6; $mb2[] = 0xFE66;
  $sort2[] = 0x9FB7; $mb2[] = 0xFE67;
  $sort2[] = 0x9FB8; $mb2[] = 0xFE6D;
  $sort2[] = 0x9FB9; $mb2[] = 0xFE7E;
  $sort2[] = 0x9FBA; $mb2[] = 0xFE90;
  $sort2[] = 0x9FBB; $mb2[] = 0xFEA0;
  $sort2[] = 0xFE10; $mb2[] = 0xA6D9;
  $sort2[] = 0xFE11; $mb2[] = 0xA6DB;
  $sort2[] = 0xFE12; $mb2[] = 0xA6DA;
  $sort2[] = 0xFE13; $mb2[] = 0xA6DC;
  $sort2[] = 0xFE14; $mb2[] = 0xA6DD;
  $sort2[] = 0xFE15; $mb2[] = 0xA6DE;
  $sort2[] = 0xFE16; $mb2[] = 0xA6DF;
  $sort2[] = 0xFE17; $mb2[] = 0xA6EC;
  $sort2[] = 0xFE18; $mb2[] = 0xA6ED;
  $sort2[] = 0xFE19; $mb2[] = 0xA6F3;

  /* 4-byte extension PUA */
  // Dealt with by `u_gb18030()`
  //$sort4[] = 0xE7C7;
  //$mb4[] = 0x8135F437;

  // The block calculation below needs at least one 4-byte mapping to start from
  if (empty($sort4)) {
      error_log($error = "$basename: ERROR: No 4-byte mappings found in mapping file \"$file\"");
      echo $error, PHP_EOL;
      exit(1);
  }

  // Calculate Unicode start/end codepoints mapping to consecutive 4-byte blocks
  array_multisort($sort4, $mb4);
  $gb18030_4_u_b = array(); // Begin codepoint of each block
  $gb18030_4_u_e = array(); // End codepoint of each block
  $gb18030_4_mb_o = array(); // Cumulative gap preceding each block

  // Start/end points
  $prev_u = $begin_u = $sort4[0];
  for ($i = 1, $cnt = count($sort4); $i < $cnt; $i++) {
      $u = $sort4[$i];
      // Still in the same block if consecutive with the previous codepoint
      if ($u === $prev_u + 1) {
          $prev_u++;
          continue;
      }
      // Otherwise close off the current block and begin a new one
      $gb18030_4_u_b[] = $begin_u;
      $gb18030_4_u_e[] = $prev_u;
      $begin_u = $prev_u = $u;
  }
  // Close off the final block
  $gb18030_4_u_b[] = $begin_u;
  $gb18030_4_u_e[] = $prev_u;

  // Gaps between blocks
  $gb18030_4_mb_o[] = 0;
  for ($i = 1, $cnt = count($gb18030_4_u_b); $i < $cnt; $i++) {
      // Number of codepoints skipped between the previous block and this one, added to the running total
      $gb18030_4_mb_o[] = $gb18030_4_u_b[$i] - ($gb18030_4_u_e[$i - 1] + 1) + $gb18030_4_mb_o[$i - 1];
  }

  // Output GB 18030 tables
  array_multisort($sort2, $mb2);
  out_tabs($out, 'gb18030_2', $sort2, $mb2, true /*no_ind*/);

  // Start codepoints `gb18030_4_u_b` array not needed by `u_gb18030()`
  $cnt = count($gb18030_4_u_e);
  $out[] = '';
  $out[] = '/* End Unicode codepoints of blocks mapping consecutively to 4-byte multibyte blocks */';
  $out[] = 'static const unsigned short gb18030_4_u_e[' . $cnt .'] = {';
  out_tab_entries($out, $gb18030_4_u_e, $cnt);
  $out[] = '};';

  $cnt = count($gb18030_4_mb_o);
  $out[] = '';
  $out[] = '/* Cumulative gaps between Unicode blocks mapping consecutively to 4-byte multibyte blocks,';
  $out[] = ' used to adjust multibyte offsets */';
  $out[] = 'static const unsigned short gb18030_4_mb_o[' . $cnt .'] = {';
  out_tab_entries($out, $gb18030_4_mb_o, $cnt, true /*not_hex*/);
  $out[] = '};';

  $out[] = '';
  $out[] = '#endif /* Z_GB18030_H */';

  $out_file = $out_dirname . '/gb18030.h';
  if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
      error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
      echo $error, PHP_EOL;
      exit(1);
  }
  592. /* vim: set ts=4 sw=4 et : */