pdf-cmap-parse.c 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. // Copyright (C) 2004-2021 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "mupdf/pdf.h"
  24. #include <string.h>
  25. /*
  26. * CMap parser
  27. */
  28. static int
  29. is_keyword(pdf_token tok, pdf_lexbuf *buf, const char *word)
  30. {
  31. /* Ignore trailing garbage when matching keywords */
  32. return (tok == PDF_TOK_KEYWORD && !strncmp(buf->scratch, word, strlen(word)));
  33. }
  34. static void
  35. skip_to_keyword(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, const char *end, const char *warn)
  36. {
  37. fz_warn(ctx, "%s", warn);
  38. for (;;)
  39. {
  40. pdf_token tok = pdf_lex(ctx, file, buf);
  41. if (is_keyword(tok, buf, end))
  42. return;
  43. if (tok == PDF_TOK_ERROR)
  44. return;
  45. if (tok == PDF_TOK_EOF)
  46. return;
  47. }
  48. }
  49. static void
  50. skip_to_token(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, pdf_token end, const char *warn)
  51. {
  52. fz_warn(ctx, "%s", warn);
  53. for (;;)
  54. {
  55. pdf_token tok = pdf_lex(ctx, file, buf);
  56. if (tok == end)
  57. return;
  58. if (tok == PDF_TOK_ERROR)
  59. return;
  60. if (tok == PDF_TOK_EOF)
  61. return;
  62. }
  63. }
  64. static int
  65. pdf_code_from_string(char *buf, size_t len)
  66. {
  67. unsigned int a = 0;
  68. while (len--)
  69. a = (a << 8) | *(unsigned char *)buf++;
  70. return a;
  71. }
  72. static void
  73. pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
  74. {
  75. pdf_token tok;
  76. tok = pdf_lex(ctx, file, buf);
  77. if (tok == PDF_TOK_NAME)
  78. fz_strlcpy(cmap->cmap_name, buf->scratch, sizeof(cmap->cmap_name));
  79. else
  80. fz_warn(ctx, "expected name after CMapName in cmap");
  81. }
  82. static void
  83. pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
  84. {
  85. pdf_token tok;
  86. tok = pdf_lex(ctx, file, buf);
  87. if (tok == PDF_TOK_INT)
  88. pdf_set_cmap_wmode(ctx, cmap, buf->i);
  89. else
  90. fz_warn(ctx, "expected integer after WMode in cmap");
  91. }
  92. static void
  93. pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
  94. {
  95. pdf_token tok;
  96. int lo, hi;
  97. while (1)
  98. {
  99. tok = pdf_lex(ctx, file, buf);
  100. if (is_keyword(tok, buf, "endcodespacerange"))
  101. return;
  102. else if (tok == PDF_TOK_STRING)
  103. {
  104. lo = pdf_code_from_string(buf->scratch, buf->len);
  105. tok = pdf_lex(ctx, file, buf);
  106. if (tok == PDF_TOK_STRING)
  107. {
  108. hi = pdf_code_from_string(buf->scratch, buf->len);
  109. pdf_add_codespace(ctx, cmap, lo, hi, buf->len);
  110. }
  111. else
  112. {
  113. skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
  114. return;
  115. }
  116. }
  117. else
  118. {
  119. skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
  120. return;
  121. }
  122. }
  123. }
  124. static void
  125. pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
  126. {
  127. pdf_token tok;
  128. int lo, hi, dst;
  129. while (1)
  130. {
  131. tok = pdf_lex(ctx, file, buf);
  132. if (is_keyword(tok, buf, "endcidrange"))
  133. return;
  134. else if (tok != PDF_TOK_STRING)
  135. {
  136. skip_to_keyword(ctx, file, buf, "endcidrange", "expected string or endcidrange");
  137. return;
  138. }
  139. lo = pdf_code_from_string(buf->scratch, buf->len);
  140. tok = pdf_lex(ctx, file, buf);
  141. if (tok != PDF_TOK_STRING)
  142. {
  143. skip_to_keyword(ctx, file, buf, "endcidrange", "expected string");
  144. return;
  145. }
  146. hi = pdf_code_from_string(buf->scratch, buf->len);
  147. tok = pdf_lex(ctx, file, buf);
  148. if (tok != PDF_TOK_INT)
  149. {
  150. skip_to_keyword(ctx, file, buf, "endcidrange", "expected integer");
  151. return;
  152. }
  153. dst = buf->i;
  154. pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
  155. }
  156. }
  157. static void
  158. pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
  159. {
  160. pdf_token tok;
  161. int src, dst;
  162. while (1)
  163. {
  164. tok = pdf_lex(ctx, file, buf);
  165. if (is_keyword(tok, buf, "endcidchar"))
  166. return;
  167. else if (tok != PDF_TOK_STRING)
  168. {
  169. skip_to_keyword(ctx, file, buf, "endcidchar", "expected string or endcidchar");
  170. return;
  171. }
  172. src = pdf_code_from_string(buf->scratch, buf->len);
  173. tok = pdf_lex(ctx, file, buf);
  174. if (tok != PDF_TOK_INT)
  175. {
  176. skip_to_keyword(ctx, file, buf, "endcidchar", "expected integer");
  177. return;
  178. }
  179. dst = buf->i;
  180. pdf_map_range_to_range(ctx, cmap, src, src, dst);
  181. }
  182. }
  183. static void
  184. pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf, int lo, int hi)
  185. {
  186. pdf_token tok;
  187. int dst[256];
  188. while (1)
  189. {
  190. tok = pdf_lex(ctx, file, buf);
  191. if (tok == PDF_TOK_CLOSE_ARRAY)
  192. return;
  193. /* Note: does not handle [ /Name /Name ... ] */
  194. else if (tok != PDF_TOK_STRING)
  195. {
  196. skip_to_token(ctx, file, buf, PDF_TOK_CLOSE_ARRAY, "expected string or ]");
  197. return;
  198. }
  199. if (buf->len / 2)
  200. {
  201. size_t i;
  202. size_t len = fz_minz(buf->len / 2, nelem(dst));
  203. for (i = 0; i < len; i++)
  204. dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
  205. pdf_map_one_to_many(ctx, cmap, lo, dst, i);
  206. }
  207. lo ++;
  208. }
  209. }
  210. static void
  211. pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
  212. {
  213. pdf_token tok;
  214. int lo, hi, dst;
  215. while (1)
  216. {
  217. tok = pdf_lex(ctx, file, buf);
  218. if (is_keyword(tok, buf, "endbfrange"))
  219. return;
  220. else if (tok != PDF_TOK_STRING)
  221. {
  222. skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or endbfrange");
  223. return;
  224. }
  225. lo = pdf_code_from_string(buf->scratch, buf->len);
  226. tok = pdf_lex(ctx, file, buf);
  227. if (tok != PDF_TOK_STRING)
  228. {
  229. skip_to_keyword(ctx, file, buf, "endbfrange", "expected string");
  230. return;
  231. }
  232. hi = pdf_code_from_string(buf->scratch, buf->len);
  233. if (lo < 0 || lo > 65535 || hi < 0 || hi > 65535 || lo > hi)
  234. {
  235. skip_to_keyword(ctx, file, buf, "endbfrange", "bfrange limits out of range");
  236. return;
  237. }
  238. tok = pdf_lex(ctx, file, buf);
  239. if (tok == PDF_TOK_STRING)
  240. {
  241. if (buf->len == 2)
  242. {
  243. dst = pdf_code_from_string(buf->scratch, buf->len);
  244. pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
  245. }
  246. else
  247. {
  248. int dststr[256];
  249. size_t i;
  250. if (buf->len / 2)
  251. {
  252. size_t len = fz_minz(buf->len / 2, nelem(dststr));
  253. for (i = 0; i < len; i++)
  254. dststr[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
  255. while (lo <= hi)
  256. {
  257. pdf_map_one_to_many(ctx, cmap, lo, dststr, i);
  258. dststr[i-1] ++;
  259. lo ++;
  260. }
  261. }
  262. }
  263. }
  264. else if (tok == PDF_TOK_OPEN_ARRAY)
  265. {
  266. pdf_parse_bf_range_array(ctx, cmap, file, buf, lo, hi);
  267. }
  268. else
  269. {
  270. skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or array or endbfrange");
  271. return;
  272. }
  273. }
  274. }
  275. static void
  276. pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
  277. {
  278. pdf_token tok;
  279. int dst[256];
  280. int src;
  281. while (1)
  282. {
  283. tok = pdf_lex(ctx, file, buf);
  284. if (is_keyword(tok, buf, "endbfchar"))
  285. return;
  286. else if (tok != PDF_TOK_STRING)
  287. {
  288. skip_to_keyword(ctx, file, buf, "endbfchar", "expected string or endbfchar");
  289. return;
  290. }
  291. src = pdf_code_from_string(buf->scratch, buf->len);
  292. tok = pdf_lex(ctx, file, buf);
  293. /* Note: does not handle /dstName */
  294. if (tok != PDF_TOK_STRING)
  295. {
  296. skip_to_keyword(ctx, file, buf, "endbfchar", "expected string");
  297. return;
  298. }
  299. if (buf->len / 2)
  300. {
  301. size_t i;
  302. size_t len = fz_minz(buf->len / 2, nelem(dst));
  303. for (i = 0; i < len; i++)
  304. dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
  305. pdf_map_one_to_many(ctx, cmap, src, dst, i);
  306. }
  307. }
  308. }
  309. pdf_cmap *
  310. pdf_load_cmap(fz_context *ctx, fz_stream *file)
  311. {
  312. pdf_cmap *cmap;
  313. char key[64];
  314. pdf_lexbuf buf;
  315. pdf_token tok;
  316. pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
  317. cmap = pdf_new_cmap(ctx);
  318. strcpy(key, ".notdef");
  319. fz_try(ctx)
  320. {
  321. while (1)
  322. {
  323. tok = pdf_lex(ctx, file, &buf);
  324. if (tok == PDF_TOK_EOF)
  325. break;
  326. else if (tok == PDF_TOK_NAME)
  327. {
  328. if (!strcmp(buf.scratch, "CMapName"))
  329. pdf_parse_cmap_name(ctx, cmap, file, &buf);
  330. else if (!strcmp(buf.scratch, "WMode"))
  331. pdf_parse_wmode(ctx, cmap, file, &buf);
  332. else
  333. fz_strlcpy(key, buf.scratch, sizeof key);
  334. }
  335. else if (tok == PDF_TOK_KEYWORD)
  336. {
  337. if (is_keyword(tok, &buf, "endcmap"))
  338. break;
  339. else if (is_keyword(tok, &buf, "usecmap"))
  340. fz_strlcpy(cmap->usecmap_name, key, sizeof(cmap->usecmap_name));
  341. else if (is_keyword(tok, &buf, "begincodespacerange"))
  342. pdf_parse_codespace_range(ctx, cmap, file, &buf);
  343. else if (is_keyword(tok, &buf, "beginbfchar"))
  344. pdf_parse_bf_char(ctx, cmap, file, &buf);
  345. else if (is_keyword(tok, &buf, "begincidchar"))
  346. pdf_parse_cid_char(ctx, cmap, file, &buf);
  347. else if (is_keyword(tok, &buf, "beginbfrange"))
  348. pdf_parse_bf_range(ctx, cmap, file, &buf);
  349. else if (is_keyword(tok, &buf, "begincidrange"))
  350. pdf_parse_cid_range(ctx, cmap, file, &buf);
  351. }
  352. /* ignore everything else */
  353. }
  354. pdf_sort_cmap(ctx, cmap);
  355. }
  356. fz_always(ctx)
  357. {
  358. pdf_lexbuf_fin(ctx, &buf);
  359. }
  360. fz_catch(ctx)
  361. {
  362. pdf_drop_cmap(ctx, cmap);
  363. fz_rethrow(ctx);
  364. }
  365. return cmap;
  366. }