| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134 |
- /* eci.c - Extended Channel Interpretations */
- /*
- libzint - the open source barcode library
- Copyright (C) 2009-2024 Robin Stuart <rstuart114@gmail.com>
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- 3. Neither the name of the project nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- SUCH DAMAGE.
- */
- /* SPDX-License-Identifier: BSD-3-Clause */
- #include <assert.h>
- #include "common.h"
- #include "eci.h"
- #include "eci_sb.h"
- #include "big5.h"
- #include "gb18030.h"
- #include "gb2312.h"
- #include "gbk.h"
- #include "ksx1001.h"
- #include "sjis.h"
- /* Single-byte stuff */
/* Shared ISO/IEC 8859-x converter for Unicode codepoint `u`.
   `tab_s` holds one 16-bit mask per 16 codepoints starting at U+00A0, a set bit meaning the codepoint maps to the
   byte of the same value. `tab_u` is searched by bisection (so must be in ascending order) for all other
   codepoints, `tab_sb` holding the byte for each `tab_u` entry and `e` being the last index to search.
   Returns 1 with the byte stored in `*dest`, or 0 (with `*dest` untouched) if `u` has no mapping */
static int u_iso8859(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
            const unsigned char *tab_sb, int e, unsigned char *dest) {
    int lo, hi;

    if (u < 0x80) { /* ASCII range is passed through as is */
        *dest = (unsigned char) u;
        return 1;
    }
    if (u < 0xA0) { /* U+0080-9F never map */
        return 0;
    }
    if (u <= 0xFF) {
        const unsigned int off = u - 0xA0;
        const unsigned int mask = tab_s[off >> 4];
        if ((mask >> (off & 0xF)) & 1) { /* Same value in both Unicode and the target charset */
            *dest = (unsigned char) u;
            return 1;
        }
    }

    /* Bisect the remaining mappings */
    for (lo = 0, hi = e; lo <= hi;) {
        const int mid = (lo + hi) >> 1;
        const unsigned int cand = tab_u[mid];
        if (cand == u) {
            *dest = tab_sb[mid];
            return 1;
        }
        if (cand < u) {
            lo = mid + 1;
        } else {
            hi = mid - 1;
        }
    }
    return 0;
}
/* Shared Windows-125x converter for Unicode codepoint `u`.
   Codepoints below U+0080 pass straight through. For U+00A0-FF `tab_s` (one 16-bit mask per 16 codepoints from
   U+00A0) flags those that map to the byte of the same value. Everything else, including U+0080-9F, is looked up
   by bisection in `tab_u` (ascending), with `tab_sb` giving the byte for each entry and `e` the last index.
   Returns 1 with the byte stored in `*dest`, or 0 (with `*dest` untouched) if `u` has no mapping */
static int u_cp125x(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
            const unsigned char *tab_sb, int e, unsigned char *dest) {
    int first = 0;

    if (u < 0x80) {
        *dest = (unsigned char) u;
        return 1;
    }
    if (u >= 0xA0 && u <= 0xFF) {
        const unsigned int idx = u - 0xA0;
        if ((tab_s[idx >> 4] >> (idx & 0xF)) & 1) { /* Same value in both */
            *dest = (unsigned char) u;
            return 1;
        }
    }

    while (first <= e) {
        const int mid = (first + e) >> 1;
        if (tab_u[mid] == u) {
            *dest = tab_sb[mid];
            return 1;
        }
        if (tab_u[mid] > u) {
            e = mid - 1;
        } else {
            first = mid + 1;
        }
    }
    return 0;
}
/* ECI 27 ASCII (ISO/IEC 646:1991 IRV (US)): stores `u` in `*dest` and returns 1 if `u` is below U+0080,
   otherwise returns 0 leaving `*dest` untouched */
static int u_ascii(const unsigned int u, unsigned char *dest) {
    if (u >= 0x80) {
        return 0;
    }
    *dest = (unsigned char) u;
    return 1;
}
/* ECI 170 ASCII subset (ISO/IEC 646:1991 Invariant), i.e. ASCII without the 12 chars that historically had
   national variants, namely "#$@[\]^`{|}~". Returns 1 with `u` stored in `*dest` if accepted, else 0 */
static int u_ascii_inv(const unsigned int u, unsigned char *dest) {
    int accepted;

    if (u == 0x7F) { /* DEL lies above the excluded "{|}~" run but is itself invariant */
        accepted = 1;
    } else if (u > 'z') {
        accepted = 0;
    } else if (u == '#' || u == '$' || u == '@') {
        accepted = 0;
    } else {
        /* Only "[\]^" and "`" left to exclude; underscore sits between them */
        accepted = u <= 'Z' || u == '_' || u >= 'a';
    }
    if (!accepted) {
        return 0;
    }
    *dest = (unsigned char) u;
    return 1;
}
/* ECI 25 UTF-16 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
   Writes 2 bytes to `dest` for a codepoint below U+10000, otherwise a 4-byte surrogate pair, returning the number
   of bytes written */
static int u_utf16be(const unsigned int u, unsigned char *dest) {
    unsigned int offset, high, low;

    if (u < 0x10000) {
        dest[0] = (unsigned char) (u >> 8);
        dest[1] = (unsigned char) u;
        return 2;
    }

    offset = u - 0x10000;
    high = 0xD800 + (offset >> 10); /* Top bits go in the leading surrogate */
    low = 0xDC00 + (offset & 0x3FF); /* Bottom 10 bits in the trailing surrogate */

    dest[0] = (unsigned char) (high >> 8);
    dest[1] = (unsigned char) high;
    dest[2] = (unsigned char) (low >> 8);
    dest[3] = (unsigned char) low;
    return 4;
}
/* ECI 33 UTF-16 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
   Writes 2 bytes to `dest` for a codepoint below U+10000, otherwise a 4-byte surrogate pair (each 16-bit unit low
   byte first), returning the number of bytes written */
static int u_utf16le(const unsigned int u, unsigned char *dest) {
    unsigned int offset, high, low;

    if (u < 0x10000) {
        dest[0] = (unsigned char) u;
        dest[1] = (unsigned char) (u >> 8);
        return 2;
    }

    offset = u - 0x10000;
    high = 0xD800 + (offset >> 10); /* Top bits go in the leading surrogate */
    low = 0xDC00 + (offset & 0x3FF); /* Bottom 10 bits in the trailing surrogate */

    dest[0] = (unsigned char) high;
    dest[1] = (unsigned char) (high >> 8);
    dest[2] = (unsigned char) low;
    dest[3] = (unsigned char) (low >> 8);
    return 4;
}
/* ECI 34 UTF-32 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
   Always writes 4 bytes to `dest`, most significant first, the top byte being fixed at zero. Returns 4 */
static int u_utf32be(const unsigned int u, unsigned char *dest) {
    unsigned int rest = u;
    int i;

    /* Fill from the least significant end backwards */
    for (i = 3; i > 0; i--) {
        dest[i] = (unsigned char) rest;
        rest >>= 8;
    }
    dest[0] = 0;
    return 4;
}
/* ECI 35 UTF-32 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
   Always writes 4 bytes to `dest`, least significant first, the final (top) byte being fixed at zero. Returns 4 */
static int u_utf32le(const unsigned int u, unsigned char *dest) {
    unsigned int rest = u;
    int i;

    for (i = 0; i < 3; i++) {
        dest[i] = (unsigned char) rest;
        rest >>= 8;
    }
    dest[3] = 0;
    return 4;
}
- /* Multibyte stuff */
- /* Acknowledgements to Bruno Haible <bruno@clisp.org> for a no. of techniques used here */
/* Helper to lookup Unicode codepoint `u` in the URO (Unified Repertoire and Ordering) block (U+4E00-9FFF).
   `tab_u` has one 16-bit presence mask per block of 16 codepoints from U+4E00, and `tab_mb_ind` the index into
   `tab_mb` of each block's first mapped codepoint; the multibyte value for `u` is found by adding the number of
   mapped codepoints preceding it in its block. Returns 2 with the value in `*d`, or 0 if `u` is not mapped.
   Caller must ensure `u` is within the range the tables cover */
static int eci_u_lookup_uro_int(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
            const unsigned short *tab_mb, unsigned int *d) {
    const unsigned int block = (u - 0x4E00) >> 4; /* Blocks of 16 */
    const unsigned int bit = u & 0xF;
    const unsigned int present = tab_u[block];
    unsigned int before, rank;

    if (((present >> bit) & 1) == 0) {
        return 0;
    }

    /* Count the mapped codepoints in this block that come before `u` */
    before = present & (((unsigned int) 1 << bit) - 1);
    for (rank = 0; before != 0; rank++) {
        before &= before - 1; /* Clear lowest set bit */
    }

    *d = tab_mb[tab_mb_ind[block] + rank];
    return 2;
}
/* Byte-output variant of `eci_u_lookup_uro_int()`: on a hit the value found is written to `dest` high byte first.
   Passes on the return value of `eci_u_lookup_uro_int()`, `dest` being untouched if that is zero */
static int eci_u_lookup_uro(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
            const unsigned short *tab_mb, unsigned char *dest) {
    unsigned int mb;
    const int len = eci_u_lookup_uro_int(u, tab_u, tab_mb_ind, tab_mb, &mb);

    if (len == 0) {
        return 0;
    }
    dest[0] = (unsigned char) (mb >> 8);
    dest[1] = (unsigned char) mb;
    return len;
}
- /* ECI 20 Shift JIS */
- static int u_sjis_int(const unsigned int u, unsigned int *d) {
- unsigned int u2, dv, md;
- int s, e;
- if (u < 0x80 && u != 0x5C && u != 0x7E) { /* Backslash & tilde re-mapped according to JIS X 0201 Roman */
- *d = u;
- return 1;
- }
- /* Special case URO block sequential mappings (considerably lessens size of `sjis_u[]` array) */
- if (u >= 0x4E00 && u <= 0xDFFF) { /* 0xE000 next used value >= 0x4E00 */
- if (u >= 0x9FB0) {
- return 0;
- }
- return eci_u_lookup_uro_int(u, sjis_uro_u, sjis_uro_mb_ind, sjis_mb, d);
- }
- /* PUA to user-defined (Table 4-86, Lunde, 2nd ed.) */
- if (u >= 0xE000 && u <= 0xE757) {
- u2 = u - 0xE000;
- dv = u2 / 188;
- md = u2 - dv * 188;
- *d = ((dv + 0xF0) << 8) | (md + 0x40 + (md >= 0x3F));
- return 2;
- }
- if (u >= sjis_u[0] && u <= sjis_u[ARRAY_SIZE(sjis_u) - 1]) {
- s = 0;
- e = ARRAY_SIZE(sjis_u) - 1;
- while (s <= e) {
- const int m = (s + e) >> 1;
- if (sjis_u[m] < u) {
- s = m + 1;
- } else if (sjis_u[m] > u) {
- e = m - 1;
- } else {
- *d = sjis_mb[u >= 0x4E00 ? m + 6356 : m]; /* Adjust for URO block */
- return 1 + (*d > 0xFF);
- }
- }
- }
- return 0;
- }
#ifdef ZINT_TEST
/* Test-build-only entry point forwarding to the file-local `u_sjis_int()` so that it can be called directly */
INTERNAL int u_sjis_int_test(const unsigned int u, unsigned int *d) {
    const int ret = u_sjis_int(u, d);

    return ret;
}
#endif
/* Byte-output variant of `u_sjis_int()`, for use by `utf8_to_eci()`: writes 1 byte, or 2 bytes high byte first,
   to `dest` according to the length returned by `u_sjis_int()`, which is passed on (0 meaning no mapping) */
static int u_sjis(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int len = u_sjis_int(u, &mb);

    if (len == 1) {
        dest[0] = (unsigned char) mb;
    } else if (len != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return len;
}
- /* ECI 28 Big5 Chinese (Taiwan) */
- static int u_big5(const unsigned int u, unsigned char *dest) {
- int s, e;
- if (u < 0x80) {
- *dest = (unsigned char) u;
- return 1;
- }
- /* Special case URO block sequential mappings (considerably lessens size of `big5_u[]` array) */
- if (u >= 0x4E00 && u <= 0xFA0B) { /* 0xFA0C next used value >= 0x4E00 */
- if (u >= 0x9FB0) {
- return 0;
- }
- return eci_u_lookup_uro(u, big5_uro_u, big5_uro_mb_ind, big5_mb, dest);
- }
- if (u >= big5_u[0] && u <= big5_u[ARRAY_SIZE(big5_u) - 1]) {
- s = 0;
- e = ARRAY_SIZE(big5_u) - 1;
- while (s <= e) {
- const int m = (s + e) >> 1;
- if (big5_u[m] < u) {
- s = m + 1;
- } else if (big5_u[m] > u) {
- e = m - 1;
- } else {
- const unsigned short mb = big5_mb[u >= 0x4E00 ? m + 13061 : m]; /* Adjust for URO block */
- dest[0] = (unsigned char) (mb >> 8);
- dest[1] = (unsigned char) mb;
- return 2;
- }
- }
- }
- return 0;
- }
#ifdef ZINT_TEST
/* Test-build-only entry point forwarding to the file-local `u_big5()` so that it can be called directly */
INTERNAL int u_big5_test(const unsigned int u, unsigned char *dest) {
    const int ret = u_big5(u, dest);

    return ret;
}
#endif
- /* ECI 30 EUC-KR (KS X 1001, formerly KS C 5601) Korean */
- static int u_ksx1001(const unsigned int u, unsigned char *dest) {
- int s, e;
- if (u < 0x80) {
- *dest = (unsigned char) u;
- return 1;
- }
- /* Special case URO block sequential mappings (considerably lessens size of `ksx1001_u[]` array) */
- if (u >= 0x4E00 && u <= 0xABFF) { /* 0xAC00 next used value >= 0x4E00 */
- if (u >= 0x9FA0) {
- return 0;
- }
- return eci_u_lookup_uro(u, ksx1001_uro_u, ksx1001_uro_mb_ind, ksx1001_mb, dest);
- }
- if (u >= ksx1001_u[0] && u <= ksx1001_u[ARRAY_SIZE(ksx1001_u) - 1]) {
- s = ksx1001_u_ind[(u - ksx1001_u[0]) >> 8];
- e = s + 0x100 > ARRAY_SIZE(ksx1001_u) ? ARRAY_SIZE(ksx1001_u) - 1 : s + 0x100 - 1;
- while (s <= e) {
- const int m = (s + e) >> 1;
- if (ksx1001_u[m] < u) {
- s = m + 1;
- } else if (ksx1001_u[m] > u) {
- e = m - 1;
- } else {
- const unsigned short mb = ksx1001_mb[u >= 0x4E00 ? m + 4620 : m]; /* Adjust for URO block */
- dest[0] = (unsigned char) (mb >> 8);
- dest[1] = (unsigned char) mb;
- return 2;
- }
- }
- }
- return 0;
- }
#ifdef ZINT_TEST
/* Test-build-only entry point forwarding to the file-local `u_ksx1001()` so that it can be called directly */
INTERNAL int u_ksx1001_test(const unsigned int u, unsigned char *dest) {
    const int ret = u_ksx1001(u, dest);

    return ret;
}
#endif
- /* ECI 29 GB 2312 Chinese (PRC) */
- static int u_gb2312_int(const unsigned int u, unsigned int *d) {
- int s, e;
- if (u < 0x80) {
- *d = u;
- return 1;
- }
- /* Special case URO block sequential mappings (considerably lessens size of `gb2312_u[]` array) */
- if (u >= 0x4E00 && u <= 0x9E1E) { /* 0x9E1F next used value >= 0x4E00 */
- if (u >= 0x9CF0) {
- return 0;
- }
- return eci_u_lookup_uro_int(u, gb2312_uro_u, gb2312_uro_mb_ind, gb2312_mb, d);
- }
- if (u >= gb2312_u[0] && u <= gb2312_u[ARRAY_SIZE(gb2312_u) - 1]) {
- s = gb2312_u_ind[(u - gb2312_u[0]) >> 8];
- e = s + 0x100 > ARRAY_SIZE(gb2312_u) ? ARRAY_SIZE(gb2312_u) - 1 : s + 0x100 - 1;
- while (s <= e) {
- const int m = (s + e) >> 1;
- if (gb2312_u[m] < u) {
- s = m + 1;
- } else if (gb2312_u[m] > u) {
- e = m - 1;
- } else {
- *d = gb2312_mb[u > 0x4E00 ? m + 6627 : m]; /* Adjust for URO block */
- return 2;
- }
- }
- }
- return 0;
- }
#ifdef ZINT_TEST
/* Test-build-only entry point forwarding to the file-local `u_gb2312_int()` so that it can be called directly */
INTERNAL int u_gb2312_int_test(const unsigned int u, unsigned int *d) {
    const int ret = u_gb2312_int(u, d);

    return ret;
}
#endif
/* Byte-output variant of `u_gb2312_int()`, for use by `utf8_to_eci()`: writes 1 byte, or 2 bytes high byte first,
   to `dest` according to the length returned by `u_gb2312_int()`, which is passed on (0 meaning no mapping) */
static int u_gb2312(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int len = u_gb2312_int(u, &mb);

    if (len == 1) {
        dest[0] = (unsigned char) mb;
    } else if (len != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return len;
}
- /* ECI 31 GBK Chinese */
- static int u_gbk_int(const unsigned int u, unsigned int *d) {
- int s, e;
- if (u < 0x80) {
- *d = u;
- return 1;
- }
- /* Check GB 2312 first */
- if (u == 0x30FB) {
- /* KATAKANA MIDDLE DOT, mapped by GB 2312 but not by GBK (U+00B7 MIDDLE DOT mapped to 0xA1A4 instead) */
- return 0;
- }
- if (u == 0x2015) {
- /* HORIZONTAL BAR, mapped to 0xA844 by GBK rather than 0xA1AA (U+2014 EM DASH mapped there instead) */
- *d = 0xA844;
- return 2;
- }
- if (u_gb2312_int(u, d)) { /* Includes the 2 GB 6345.1-86 corrections given in Table 3-22, Lunde, 2nd ed. */
- return 2;
- }
- /* Special case URO block sequential mappings (considerably lessens size of `gbk_u[]` array) */
- if (u >= 0x4E00 && u <= 0xF92B) { /* 0xF92C next used value >= 0x4E00 */
- if (u >= 0x9FB0) {
- return 0;
- }
- return eci_u_lookup_uro_int(u, gbk_uro_u, gbk_uro_mb_ind, gbk_mb, d);
- }
- if (u >= gbk_u[0] && u <= gbk_u[ARRAY_SIZE(gbk_u) - 1]) {
- s = 0;
- e = ARRAY_SIZE(gbk_u) - 1;
- while (s <= e) {
- const int m = (s + e) >> 1;
- if (gbk_u[m] < u) {
- s = m + 1;
- } else if (gbk_u[m] > u) {
- e = m - 1;
- } else {
- *d = gbk_mb[u >= 0x4E00 ? m + 14139 : m]; /* Adjust for URO block */
- return 2;
- }
- }
- }
- return 0;
- }
#ifdef ZINT_TEST
/* Test-build-only entry point forwarding to the file-local `u_gbk_int()` so that it can be called directly */
INTERNAL int u_gbk_int_test(const unsigned int u, unsigned int *d) {
    const int ret = u_gbk_int(u, d);

    return ret;
}
#endif
/* Byte-output variant of `u_gbk_int()`, for use by `utf8_to_eci()`: writes 1 byte, or 2 bytes high byte first, to
   `dest` according to the length returned by `u_gbk_int()`, which is passed on (0 meaning no mapping) */
static int u_gbk(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int len = u_gbk_int(u, &mb);

    if (len == 1) {
        dest[0] = (unsigned char) mb;
    } else if (len != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return len;
}
/* Helper for `u_gb18030_int()` to output 4-byte sequential blocks.
   Splits linear offset `u2` into 4 bytes using radices 10, 126, 10 from the least significant end, the bytes
   being based at 0x30, 0x81, 0x30 and `mb_lead` respectively. The leading 2 bytes are placed in `*d1` and the
   trailing 2 in `*d2`. Always returns 4 */
static int u_gb18030_4_sequential_int(unsigned int u2, unsigned int mb_lead, unsigned int *d1, unsigned int *d2) {
    const unsigned int byte4 = u2 % 10 + 0x30;
    const unsigned int byte3 = (u2 / 10) % 126 + 0x81;
    const unsigned int byte2 = (u2 / (10 * 126)) % 10 + 0x30;
    const unsigned int byte1 = u2 / (10 * 126 * 10) + mb_lead;

    *d2 = (byte3 << 8) | byte4;
    *d1 = (byte1 << 8) | byte2;
    return 4;
}
- /* ECI 32 GB 18030 Chinese - assumes valid Unicode */
- static int u_gb18030_int(const unsigned int u, unsigned int *d1, unsigned int *d2) {
- unsigned int u2, dv;
- int s, e;
- if (u < 0x80) {
- *d1 = u;
- return 1;
- }
- /* Check GBK first */
- if (u_gbk_int(u, d1)) {
- return 2;
- }
- if (u >= 0x10000) {
- /* Non-PUA, non-BMP, see Table 3-37, Lunde, 2nd ed. */
- if (u == 0x20087) {
- *d1 = 0xFE51;
- return 2;
- }
- if (u == 0x20089) {
- *d1 = 0xFE52;
- return 2;
- }
- if (u == 0x200CC) {
- *d1 = 0xFE53;
- return 2;
- }
- if (u == 0x215D7) {
- *d1 = 0xFE6C;
- return 2;
- }
- if (u == 0x2298F) {
- *d1 = 0xFE76;
- return 2;
- }
- if (u == 0x241FE) {
- *d1 = 0xFE91;
- return 2;
- }
- /* All other non-BMP U+10000-10FFFF */
- return u_gb18030_4_sequential_int(u - 0x10000, 0x90, d1, d2);
- }
- if (u >= 0xE000 && u <= 0xE765) { /* PUA to user-defined */
- if (u <= 0xE4C5) {
- u2 = u - 0xE000;
- dv = u2 / 94;
- *d1 = ((dv + (dv < 6 ? 0xAA : 0xF2)) << 8) | (u2 - dv * 94 + 0xA1);
- } else {
- unsigned int md;
- u2 = u - 0xE4C6;
- dv = u2 / 96;
- md = u2 - dv * 96;
- *d1 = ((dv + 0xA1) << 8) | (md + 0x40 + (md >= 0x3F));
- }
- return 2;
- }
- if (u >= gb18030_2_u[0] && u <= gb18030_2_u[ARRAY_SIZE(gb18030_2_u) - 1]) {
- s = 0;
- e = ARRAY_SIZE(gb18030_2_u) - 1;
- while (s <= e) {
- const int m = (s + e) >> 1;
- if (gb18030_2_u[m] < u) {
- s = m + 1;
- } else if (gb18030_2_u[m] > u) {
- e = m - 1;
- } else {
- *d1 = gb18030_2_mb[m];
- return 2;
- }
- }
- }
- /* All other BMP U+0080-FFFF */
- if (u == 0xE7C7) { /* PUA change to non-PUA, see Table 3-39, Lunde, 2nd ed. */
- *d1 = 0x8135;
- *d2 = 0xF437;
- return 4;
- }
- s = 0;
- e = ARRAY_SIZE(gb18030_4_u_e) - 1;
- while (s < e) { /* Lower bound */
- const int m = (s + e) >> 1;
- if (gb18030_4_u_e[m] < u) {
- s = m + 1;
- } else {
- e = m;
- }
- }
- assert(s < ARRAY_SIZE(gb18030_4_u_e));
- return u_gb18030_4_sequential_int(u - gb18030_4_mb_o[s] - 0x80, 0x81, d1, d2);
- }
#ifdef ZINT_TEST
/* Test-build-only entry point forwarding to the file-local `u_gb18030_int()` so that it can be called directly */
INTERNAL int u_gb18030_int_test(const unsigned int u, unsigned int *d1, unsigned int *d2) {
    const int ret = u_gb18030_int(u, d1, d2);

    return ret;
}
#endif
/* Byte-output variant of `u_gb18030_int()`, for use by `utf8_to_eci()`: writes 1, 2 or 4 bytes to `dest`
   (each pair high byte first) according to the length returned by `u_gb18030_int()`, which is passed on */
static int u_gb18030(const unsigned int u, unsigned char *dest) {
    unsigned int lead, trail;
    const int len = u_gb18030_int(u, &lead, &trail);

    if (len == 0) {
        return 0;
    }
    if (len == 1) {
        dest[0] = (unsigned char) lead;
        return len;
    }
    dest[0] = (unsigned char) (lead >> 8);
    dest[1] = (unsigned char) lead;
    if (len == 4) { /* `trail` only set for 4-byters */
        dest[2] = (unsigned char) (trail >> 8);
        dest[3] = (unsigned char) trail;
    }
    return len;
}
- /* Main ECI stuff */
/* Helper returning how many of the first `length` bytes of `string` lie in the inclusive range `c1` to `c2` */
static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1,
            const unsigned char c2) {
    int total = 0;
    int posn = 0;

    /* Note a zero `c1` needs no special handling as the lower bound test is then always true for unsigned bytes */
    while (posn < length) {
        const unsigned char ch = string[posn++];
        total += ch >= c1 && ch <= c2;
    }
    return total;
}
- /* Is ECI convertible from UTF-8? */
- INTERNAL int is_eci_convertible(const int eci) {
- if (eci == 26 || (eci > 35 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */
- /* UTF-8 (26) or 8-bit binary data (899) or undefined (> 35 and < 899) or not character set (> 899) */
- return 0;
- }
- return 1;
- }
- /* Are any of the ECIs in the segments convertible from UTF-8?
- Sets `convertible[]` for each, which must be at least `seg_count` in size */
- INTERNAL int is_eci_convertible_segs(const struct zint_seg segs[], const int seg_count, int convertible[]) {
- int ret = 0;
- int i;
- for (i = 0; i < seg_count; i++) {
- convertible[i] = is_eci_convertible(segs[i].eci);
- ret |= convertible[i];
- }
- return ret;
- }
- /* Calculate length required to convert UTF-8 to (double-byte) encoding */
- INTERNAL int get_eci_length(const int eci, const unsigned char source[], int length) {
- if (eci == 20) { /* Shift JIS */
- /* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */
- length += chr_cnt(source, length, '\\');
- } else if (eci == 25 || eci == 33) { /* UTF-16 */
- /* All ASCII chars take 2 bytes */
- length += chr_range_cnt(source, length, 0, 0x7F);
- /* Surrogate pairs are 4 UTF-8 bytes long so fit */
- } else if (eci == 32) { /* GB 18030 */
- /* Allow for GB 18030 4 byters */
- length *= 2;
- } else if (eci == 34 || eci == 35) { /* UTF-32 */
- /* Quadruple-up ASCII and double-up non-ASCII */
- length += chr_range_cnt(source, length, 0, 0x7F) * 2 + length;
- }
- /* Big5, GB 2312, EUC-KR and GBK fit in UTF-8 length */
- return length;
- }
- /* Call `get_eci_length()` for each segment, returning total */
- INTERNAL int get_eci_length_segs(const struct zint_seg segs[], const int seg_count) {
- int length = 0;
- int i;
- for (i = 0; i < seg_count; i++) {
- length += get_eci_length(segs[i].eci, segs[i].source, segs[i].length);
- }
- return length;
- }
- /* Convert UTF-8 to other character encodings */
- typedef int (*eci_func_t)(const unsigned int u, unsigned char *dest);
- INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) {
- static const eci_func_t eci_funcs[36] = {
- NULL, NULL, NULL, NULL, u_iso8859_2, /*0-4*/
- u_iso8859_3, u_iso8859_4, u_iso8859_5, u_iso8859_6, u_iso8859_7, /*5-9*/
- u_iso8859_8, u_iso8859_9, u_iso8859_10, u_iso8859_11, NULL, /*10-14*/
- u_iso8859_13, u_iso8859_14, u_iso8859_15, u_iso8859_16, NULL, /*15-19*/
- u_sjis, u_cp1250, u_cp1251, u_cp1252, u_cp1256, /*20-24*/
- u_utf16be, NULL, u_ascii, u_big5, u_gb2312, /*25-29*/
- u_ksx1001, u_gbk, u_gb18030, u_utf16le, u_utf32be, /*30-34*/
- u_utf32le,
- };
- eci_func_t eci_func;
- unsigned int codepoint, state = 0;
- int in_posn = 0;
- int out_posn = 0;
- int length = *p_length;
- /* Special case ISO/IEC 8859-1 */
- if (eci == 0 || eci == 3) { /* Default ECI 0 to ISO/IEC 8859-1 */
- while (in_posn < length) {
- do {
- decode_utf8(&state, &codepoint, source[in_posn++]);
- } while (in_posn < length && state != 0 && state != 12);
- if (state != 0) {
- return ZINT_ERROR_INVALID_DATA;
- }
- if (codepoint >= 0x80 && (codepoint < 0xA0 || codepoint >= 0x100)) {
- return ZINT_ERROR_INVALID_DATA;
- }
- dest[out_posn++] = (unsigned char) codepoint;
- }
- dest[out_posn] = '\0';
- *p_length = out_posn;
- return 0;
- }
- if (eci == 170) { /* ASCII Invariant (archaic subset) */
- eci_func = u_ascii_inv;
- } else {
- eci_func = eci_funcs[eci];
- if (eci_func == NULL) {
- return ZINT_ERROR_INVALID_DATA;
- }
- }
- while (in_posn < length) {
- int incr;
- do {
- decode_utf8(&state, &codepoint, source[in_posn++]);
- } while (in_posn < length && state != 0 && state != 12);
- if (state != 0) {
- return ZINT_ERROR_INVALID_DATA;
- }
- incr = (*eci_func)(codepoint, dest + out_posn);
- if (incr == 0) {
- return ZINT_ERROR_INVALID_DATA;
- }
- out_posn += incr;
- }
- dest[out_posn] = '\0';
- *p_length = out_posn;
- return 0;
- }
- /* Find the lowest single-byte ECI mode which will encode a given set of Unicode text, assuming valid UTF-8 */
- INTERNAL int get_best_eci(const unsigned char source[], int length) {
- int eci = 3;
- /* Note: attempting single-byte conversions only, so get_eci_length() unnecessary */
- unsigned char *local_source = (unsigned char *) z_alloca(length + 1);
- do {
- if (eci == 14) { /* Reserved */
- eci = 15;
- } else if (eci == 19) { /* Reserved */
- eci = 21; /* Skip 20 Shift JIS */
- }
- if (utf8_to_eci(eci, source, local_source, &length) == 0) {
- return eci;
- }
- eci++;
- } while (eci < 25);
- assert(is_valid_utf8(source, length));
- return 26; /* If all of these fail, use UTF-8! */
- }
- /* Call `get_best_eci()` for each segment, assuming valid UTF-8. Returns 0 on failure, first ECI set on success */
- INTERNAL int get_best_eci_segs(struct zint_symbol *symbol, struct zint_seg segs[], const int seg_count) {
- const int default_eci = symbol->symbology == BARCODE_GRIDMATRIX ? 29 : symbol->symbology == BARCODE_UPNQR ? 4 : 3;
- int first_eci_set = 0;
- int i;
- for (i = 0; i < seg_count; i++) {
- if (segs[i].eci == 0) {
- const int eci = get_best_eci(segs[i].source, segs[i].length);
- if (eci == default_eci) {
- if (i != 0 && segs[i - 1].eci != 0 && segs[i - 1].eci != default_eci) {
- segs[i].eci = eci;
- if (first_eci_set == 0) {
- first_eci_set = eci;
- }
- }
- } else {
- segs[i].eci = eci;
- if (first_eci_set == 0) {
- first_eci_set = eci;
- if (i == 0) {
- symbol->eci = eci;
- }
- }
- }
- }
- }
- return first_eci_set;
- }
- /* QRCODE Shift JIS helpers */
- /* Convert UTF-8 string to Shift JIS and place in array of ints */
- INTERNAL int sjis_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
- unsigned int *ddata) {
- int error_number;
- unsigned int i, length;
- unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
- error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/);
- if (error_number != 0) {
- return error_number;
- }
- for (i = 0, length = *p_length; i < length; i++) {
- if (!u_sjis_int(utfdata[i], ddata + i)) {
- return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 800, "Invalid character in input");
- }
- }
- return 0;
- }
- /* If `full_multibyte` set, copy byte input stream to array of ints, putting double-bytes that match QR Kanji mode in
- * a single entry. If `full_multibyte` not set, do a straight copy */
- INTERNAL void sjis_cpy(const unsigned char source[], int *p_length, unsigned int *ddata, const int full_multibyte) {
- unsigned int i, j, length;
- unsigned char c1, c2;
- if (full_multibyte) {
- for (i = 0, j = 0, length = *p_length; i < length; i++, j++) {
- c1 = source[i];
- /* Now using stricter interpretation of standard, and excluding certain trailing bytes */
- if (((c1 >= 0x81 && c1 <= 0x9F) || (c1 >= 0xE0 && c1 <= 0xEB)) && length - i >= 2) {
- c2 = source[i + 1];
- if ((c2 >= 0x40 && c2 <= 0xFC) && c2 != 0x7F && (c1 != 0xEB || c2 <= 0xBF)) {
- /* This may or may not be valid Shift JIS, but don't care as long as it can be encoded in
- * QR Kanji mode */
- ddata[j] = (c1 << 8) | c2;
- i++;
- } else {
- ddata[j] = c1;
- }
- } else {
- ddata[j] = c1;
- }
- }
- *p_length = j;
- } else {
- /* Straight copy */
- for (i = 0, length = *p_length; i < length; i++) {
- ddata[i] = source[i];
- }
- }
- }
- /* Call `sjis_cpy()` for each segment */
- INTERNAL void sjis_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
- const int full_multibyte) {
- int i;
- unsigned int *dd = ddata;
- for (i = 0; i < seg_count; i++) {
- sjis_cpy(segs[i].source, &segs[i].length, dd, full_multibyte);
- dd += segs[i].length;
- }
- }
- /* Convert UTF-8 string to ECI and place in array of ints using `sjis_cpy()` */
- INTERNAL int sjis_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
- const int full_multibyte) {
- if (is_eci_convertible(eci)) {
- int error_number;
- const int eci_length = get_eci_length(eci, source, *p_length);
- unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1);
- error_number = utf8_to_eci(eci, source, converted, p_length);
- if (error_number != 0) {
- /* Note not setting `symbol->errtxt`, up to caller */
- return error_number;
- }
- sjis_cpy(converted, p_length, ddata, full_multibyte || eci == 20);
- } else {
- sjis_cpy(source, p_length, ddata, full_multibyte);
- }
- return 0;
- }
- /* GRIDMATRIX GB 2312 helpers */
- /* Convert UTF-8 string to GB 2312 (EUC-CN) and place in array of ints */
- INTERNAL int gb2312_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
- unsigned int *ddata) {
- int error_number;
- unsigned int i, length;
- unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
- error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/);
- if (error_number != 0) {
- return error_number;
- }
- for (i = 0, length = *p_length; i < length; i++) {
- if (utfdata[i] < 0x80) {
- ddata[i] = utfdata[i];
- } else {
- if (!u_gb2312_int(utfdata[i], ddata + i)) {
- return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 810, "Invalid character in input");
- }
- }
- }
- return 0;
- }
/* If `full_multibyte` set, copy byte input stream to array of ints, putting double-bytes that match GRIDMATRIX
 * Chinese mode in a single entry, and set `*p_length` to the number of entries. If `full_multibyte` not set, do a
 * straight copy, one byte per entry, leaving `*p_length` as is */
static void gb2312_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    const unsigned int length = *p_length;
    unsigned int in_posn = 0, out_posn = 0;

    if (!full_multibyte) {
        /* Straight copy */
        for (; in_posn < length; in_posn++) {
            ddata[in_posn] = source[in_posn];
        }
        return;
    }

    while (in_posn < length) {
        const unsigned char c1 = source[in_posn++];
        unsigned int entry = c1;

        if (in_posn < length) { /* Have a following byte */
            const unsigned char c2 = source[in_posn];
            const int lead_ok = (c1 >= 0xA1 && c1 <= 0xA9) || (c1 >= 0xB0 && c1 <= 0xF7);
            if (lead_ok && c2 >= 0xA1 && c2 <= 0xFE) {
                /* This may or may not be valid GB 2312 (EUC-CN), but don't care as long as it can be encoded in
                 * GRIDMATRIX Chinese mode */
                entry = (c1 << 8) | c2;
                in_posn++;
            }
        }
        ddata[out_posn++] = entry;
    }
    *p_length = out_posn;
}
#ifdef ZINT_TEST
/* Test-build-only entry point forwarding to the file-local `gb2312_cpy()` so that it can be called directly */
INTERNAL void gb2312_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    gb2312_cpy(source, p_length, ddata, full_multibyte);
    return;
}
#endif
- /* Call `gb2312_cpy()` for each segment */
- INTERNAL void gb2312_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
- const int full_multibyte) {
- int i;
- unsigned int *dd = ddata;
- for (i = 0; i < seg_count; i++) {
- gb2312_cpy(segs[i].source, &segs[i].length, dd, full_multibyte);
- dd += segs[i].length;
- }
- }
- /* Convert UTF-8 string to ECI and place in array of ints using `gb2312_cpy()` */
- INTERNAL int gb2312_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
- const int full_multibyte) {
- if (is_eci_convertible(eci)) {
- int error_number;
- const int eci_length = get_eci_length(eci, source, *p_length);
- unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1);
- error_number = utf8_to_eci(eci, source, converted, p_length);
- if (error_number != 0) {
- /* Note not setting `symbol->errtxt`, up to caller */
- return error_number;
- }
- gb2312_cpy(converted, p_length, ddata, full_multibyte || eci == 29);
- } else {
- gb2312_cpy(source, p_length, ddata, full_multibyte);
- }
- return 0;
- }
- /* HANXIN GB 18030 helpers */
- /* Convert UTF-8 string to GB 18030 and place in array of ints */
- INTERNAL int gb18030_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
- unsigned int *ddata) {
- int error_number, ret;
- unsigned int i, j, length;
- unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
- error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 0 /*disallow_4byte*/);
- if (error_number != 0) {
- return error_number;
- }
- for (i = 0, j = 0, length = *p_length; i < length; i++, j++) {
- if (utfdata[i] < 0x80) {
- ddata[j] = utfdata[i];
- } else {
- ret = u_gb18030_int(utfdata[i], ddata + j, ddata + j + 1);
- if (ret == 0) { /* Should never happen, as GB 18030 is a UTF i.e. maps all Unicode codepoints */
- return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 820, "Invalid character in input"); /* Not reached */
- }
- if (ret == 4) {
- j++;
- }
- }
- }
- *p_length = j;
- return 0;
- }
/* Copy a byte stream into an array of ints for HANXIN.
 * With `full_multibyte` set, double-bytes that fit HANXIN Chinese mode are packed into one entry and
 * quad-bytes into two (first byte pair, then second byte pair), and `*p_length` is updated to the number of
 * entries written. Without it, each byte gets its own entry and `*p_length` is left as is */
static void gb18030_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
            const int full_multibyte) {
    const unsigned int in_len = *p_length;
    unsigned int in_pos = 0;
    unsigned int out_pos = 0;

    if (!full_multibyte) {
        /* Byte-for-byte copy */
        while (in_pos < in_len) {
            ddata[in_pos] = source[in_pos];
            in_pos++;
        }
        return;
    }

    while (in_pos < in_len) {
        const unsigned int remaining = in_len - in_pos;
        const unsigned char b1 = source[in_pos];

        if (remaining >= 2 && b1 >= 0x81 && b1 <= 0xFE) {
            const unsigned char b2 = source[in_pos + 1];

            if ((b2 >= 0x40 && b2 <= 0x7E) || (b2 >= 0x80 && b2 <= 0xFE)) {
                /* Double-byte: one entry */
                ddata[out_pos++] = (b1 << 8) | b2;
                in_pos += 2;
                continue;
            }
            if (remaining >= 4 && b2 >= 0x30 && b2 <= 0x39) {
                const unsigned char b3 = source[in_pos + 2];
                const unsigned char b4 = source[in_pos + 3];

                if (b3 >= 0x81 && b3 <= 0xFE && b4 >= 0x30 && b4 <= 0x39) {
                    /* Quad-byte: two entries */
                    ddata[out_pos++] = (b1 << 8) | b2;
                    ddata[out_pos++] = (b3 << 8) | b4;
                    in_pos += 4;
                    continue;
                }
            }
        }
        /* Nothing multibyte matched at this position: pass the byte through on its own */
        ddata[out_pos++] = b1;
        in_pos++;
    }
    *p_length = out_pos;
}
#if defined(ZINT_TEST)
/* Test-only entry point: forwards its arguments unchanged to the file-local `gb18030_cpy()` */
INTERNAL void gb18030_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
            const int full_multibyte) {
    gb18030_cpy(source, p_length, ddata, full_multibyte);
}
#endif /* ZINT_TEST */
- /* Call `gb18030_cpy()` for each segment */
- INTERNAL void gb18030_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
- const int full_multibyte) {
- int i;
- unsigned int *dd = ddata;
- for (i = 0; i < seg_count; i++) {
- gb18030_cpy(segs[i].source, &segs[i].length, dd, full_multibyte);
- dd += segs[i].length;
- }
- }
- /* Convert UTF-8 string to ECI and place in array of ints using `gb18030_cpy()` */
- INTERNAL int gb18030_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
- const int full_multibyte) {
- if (is_eci_convertible(eci)) {
- int error_number;
- const int eci_length = get_eci_length(eci, source, *p_length);
- unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1);
- error_number = utf8_to_eci(eci, source, converted, p_length);
- if (error_number != 0) {
- /* Note not setting `symbol->errtxt`, up to caller */
- return error_number;
- }
- /* GB 18030 (ECI 32) superset of GB 2312 (ECI 29) and GBK (ECI 31) */
- gb18030_cpy(converted, p_length, ddata, full_multibyte || eci == 32 || eci == 29 || eci == 31);
- } else {
- gb18030_cpy(source, p_length, ddata, full_multibyte);
- }
- return 0;
- }
- /* vim: set ts=4 sw=4 et : */
|