text-decoder.c 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. // Copyright (C) 2024 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "mupdf/pdf.h"
  24. static int simple_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
  25. {
  26. return n * 4 + 1;
  27. }
  28. static int simple_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
  29. {
  30. const unsigned short *table = dec->table1;
  31. unsigned char *e = s + n;
  32. int len = 1;
  33. while (s < e)
  34. len += fz_runelen(table[*s++]);
  35. return len;
  36. }
  37. static void simple_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
  38. {
  39. const unsigned short *table = dec->table1;
  40. unsigned char *e = s + n;
  41. while (s < e)
  42. p += fz_runetochar(p, table[*s++]);
  43. *p = 0;
  44. }
  45. static int utf16be_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
  46. {
  47. return n * 2 + 1;
  48. }
  49. static int utf16le_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
  50. {
  51. return n * 2 + 1;
  52. }
  53. static int utf16be_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
  54. {
  55. unsigned char *e = s + n;
  56. int len = 1;
  57. while (s + 1 < e) {
  58. len += fz_runelen(s[0] << 8 | s[1]);
  59. s += 2;
  60. }
  61. return len;
  62. }
  63. static int utf16le_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
  64. {
  65. unsigned char *e = s + n;
  66. int len = 1;
  67. while (s + 1 < e) {
  68. len += fz_runelen(s[0] | s[1] << 8);
  69. s += 2;
  70. }
  71. return len;
  72. }
  73. static void utf16be_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
  74. {
  75. unsigned char *e = s + n;
  76. while (s + 1 < e) {
  77. p += fz_runetochar(p, s[0] << 8 | s[1]);
  78. s += 2;
  79. }
  80. *p = 0;
  81. }
  82. static void utf16le_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
  83. {
  84. unsigned char *e = s + n;
  85. while (s + 1 < e) {
  86. p += fz_runetochar(p, s[0] | s[1] << 8);
  87. s += 2;
  88. }
  89. *p = 0;
  90. }
  91. static int cjk_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
  92. {
  93. return n * 4 + 1;
  94. }
  95. static int cjk_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
  96. {
  97. unsigned char *e = s + n;
  98. pdf_cmap *to_cid = dec->table1;
  99. pdf_cmap *to_uni = dec->table2;
  100. unsigned int raw;
  101. int cid, uni;
  102. int len = 1;
  103. while (s < e) {
  104. s += pdf_decode_cmap(to_cid, s, e, &raw);
  105. cid = pdf_lookup_cmap(to_cid, raw);
  106. uni = pdf_lookup_cmap(to_uni, cid);
  107. if (uni < 0) {
  108. // ASCII control characters are missing in the CMaps
  109. if (raw < 32)
  110. uni = raw;
  111. else
  112. uni = FZ_REPLACEMENT_CHARACTER;
  113. }
  114. len += fz_runelen(uni);
  115. }
  116. return len;
  117. }
  118. static void cjk_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
  119. {
  120. unsigned char *e = s + n;
  121. pdf_cmap *to_cid = dec->table1;
  122. pdf_cmap *to_uni = dec->table2;
  123. unsigned int raw;
  124. int cid, uni;
  125. while (s < e) {
  126. s += pdf_decode_cmap(to_cid, s, e, &raw);
  127. cid = pdf_lookup_cmap(to_cid, raw);
  128. uni = pdf_lookup_cmap(to_uni, cid);
  129. if (uni < 0) {
  130. // ASCII control characters are missing in the CMaps
  131. if (raw < 32)
  132. uni = raw;
  133. else
  134. uni = FZ_REPLACEMENT_CHARACTER;
  135. }
  136. p += fz_runetochar(p, uni);
  137. }
  138. *p = 0;
  139. }
  140. static void fz_init_simple_text_decoder(fz_context *ctx, fz_text_decoder *dec, const unsigned short *table)
  141. {
  142. dec->decode_bound = simple_text_decode_bound;
  143. dec->decode_size = simple_text_decode_size;
  144. dec->decode = simple_text_decode;
  145. dec->table1 = (void*)table;
  146. }
  147. static void fz_init_utf16be_text_decoder(fz_context *ctx, fz_text_decoder *dec)
  148. {
  149. dec->decode_bound = utf16be_text_decode_bound;
  150. dec->decode_size = utf16be_text_decode_size;
  151. dec->decode = utf16be_text_decode;
  152. }
  153. static void fz_init_utf16le_text_decoder(fz_context *ctx, fz_text_decoder *dec)
  154. {
  155. dec->decode_bound = utf16le_text_decode_bound;
  156. dec->decode_size = utf16le_text_decode_size;
  157. dec->decode = utf16le_text_decode;
  158. }
  159. static void fz_init_cjk_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *to_cid, const char *to_uni)
  160. {
  161. dec->decode_bound = cjk_text_decode_bound;
  162. dec->decode_size = cjk_text_decode_size;
  163. dec->decode = cjk_text_decode;
  164. dec->table1 = pdf_load_builtin_cmap(ctx, to_cid);
  165. if (!dec->table1)
  166. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_cid);
  167. dec->table2 = pdf_load_builtin_cmap(ctx, to_uni);
  168. if (!dec->table2)
  169. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_uni);
  170. }
  171. void fz_init_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *enc)
  172. {
  173. // Recognize IANA character set identifiers (case insensitive).
  174. // https://www.iana.org/assignments/character-sets/character-sets.xhtml
  175. if (!fz_strcasecmp(enc, "utf-16"))
  176. fz_init_utf16le_text_decoder(ctx, dec);
  177. else if (!fz_strcasecmp(enc, "utf-16be"))
  178. fz_init_utf16be_text_decoder(ctx, dec);
  179. else if (!fz_strcasecmp(enc, "utf-16le"))
  180. fz_init_utf16le_text_decoder(ctx, dec);
  181. else if (!fz_strcasecmp(enc, "euc-jp"))
  182. fz_init_cjk_text_decoder(ctx, dec, "EUC-H", "Adobe-Japan1-UCS2");
  183. else if (!fz_strcasecmp(enc, "shift_jis") || !fz_strcasecmp(enc, "sjis"))
  184. fz_init_cjk_text_decoder(ctx, dec, "90msp-H", "Adobe-Japan1-UCS2");
  185. else if (!fz_strcasecmp(enc, "euc-kr"))
  186. fz_init_cjk_text_decoder(ctx, dec, "KSCms-UHC-H", "Adobe-Korea1-UCS2");
  187. else if (!fz_strcasecmp(enc, "euc-cn"))
  188. fz_init_cjk_text_decoder(ctx, dec, "GB-EUC-H", "Adobe-GB1-UCS2");
  189. else if (!fz_strcasecmp(enc, "gbk") || !fz_strcasecmp(enc, "gb2312") || !fz_strcasecmp(enc, "gb18030"))
  190. fz_init_cjk_text_decoder(ctx, dec, "GBK2K-H", "Adobe-GB1-UCS2");
  191. else if (!fz_strcasecmp(enc, "euc-tw"))
  192. fz_init_cjk_text_decoder(ctx, dec, "CNS-EUC-H", "Adobe-CNS1-UCS2");
  193. else if (!fz_strcasecmp(enc, "big5"))
  194. fz_init_cjk_text_decoder(ctx, dec, "ETen-B5-H", "Adobe-CNS1-UCS2");
  195. else if (!fz_strcasecmp(enc, "big5-hkscs"))
  196. fz_init_cjk_text_decoder(ctx, dec, "HKscs-B5-H", "Adobe-CNS1-UCS2");
  197. else if (!fz_strcasecmp(enc, "iso-8859-1"))
  198. fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_1);
  199. else if (!fz_strcasecmp(enc, "iso-8859-7"))
  200. fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_7);
  201. else if (!fz_strcasecmp(enc, "koi8-r"))
  202. fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_koi8u);
  203. else if (!fz_strcasecmp(enc, "windows-1250"))
  204. fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1250);
  205. else if (!fz_strcasecmp(enc, "windows-1251"))
  206. fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1251);
  207. else if (!fz_strcasecmp(enc, "windows-1252"))
  208. fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1252);
  209. else
  210. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown text encoding: %s", enc);
  211. }