pdf-parse.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979
  1. // Copyright (C) 2004-2021 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "mupdf/pdf.h"
  24. #include <string.h>
  25. #include <time.h>
  26. #ifdef _WIN32
  27. #define timegm _mkgmtime
  28. #endif
  29. #define isdigit(c) (c >= '0' && c <= '9')
  30. fz_rect
  31. pdf_to_rect(fz_context *ctx, pdf_obj *array)
  32. {
  33. if (!pdf_is_array(ctx, array))
  34. return fz_empty_rect;
  35. else
  36. {
  37. float a = pdf_array_get_real(ctx, array, 0);
  38. float b = pdf_array_get_real(ctx, array, 1);
  39. float c = pdf_array_get_real(ctx, array, 2);
  40. float d = pdf_array_get_real(ctx, array, 3);
  41. fz_rect r;
  42. r.x0 = fz_min(a, c);
  43. r.y0 = fz_min(b, d);
  44. r.x1 = fz_max(a, c);
  45. r.y1 = fz_max(b, d);
  46. return r;
  47. }
  48. }
  49. fz_quad
  50. pdf_to_quad(fz_context *ctx, pdf_obj *array, int offset)
  51. {
  52. fz_quad q;
  53. q.ul.x = pdf_array_get_real(ctx, array, offset+0);
  54. q.ul.y = pdf_array_get_real(ctx, array, offset+1);
  55. q.ur.x = pdf_array_get_real(ctx, array, offset+2);
  56. q.ur.y = pdf_array_get_real(ctx, array, offset+3);
  57. q.ll.x = pdf_array_get_real(ctx, array, offset+4);
  58. q.ll.y = pdf_array_get_real(ctx, array, offset+5);
  59. q.lr.x = pdf_array_get_real(ctx, array, offset+6);
  60. q.lr.y = pdf_array_get_real(ctx, array, offset+7);
  61. return q;
  62. }
  63. fz_point
  64. pdf_to_point(fz_context *ctx, pdf_obj *array, int offset)
  65. {
  66. fz_point p;
  67. p.x = pdf_array_get_real(ctx, array, offset+0);
  68. p.y = pdf_array_get_real(ctx, array, offset+1);
  69. return p;
  70. }
  71. fz_matrix
  72. pdf_to_matrix(fz_context *ctx, pdf_obj *array)
  73. {
  74. if (!pdf_is_array(ctx, array))
  75. return fz_identity;
  76. else
  77. {
  78. fz_matrix m;
  79. m.a = pdf_array_get_real(ctx, array, 0);
  80. m.b = pdf_array_get_real(ctx, array, 1);
  81. m.c = pdf_array_get_real(ctx, array, 2);
  82. m.d = pdf_array_get_real(ctx, array, 3);
  83. m.e = pdf_array_get_real(ctx, array, 4);
  84. m.f = pdf_array_get_real(ctx, array, 5);
  85. return m;
  86. }
  87. }
  88. char *
  89. pdf_format_date(fz_context *ctx, int64_t time, char *s, size_t n)
  90. {
  91. time_t secs = time;
  92. #ifdef _POSIX_SOURCE
  93. struct tm tmbuf, *tm = gmtime_r(&secs, &tmbuf);
  94. #else
  95. struct tm *tm = gmtime(&secs);
  96. #endif
  97. if (time < 0 || !tm || !strftime(s, n, "D:%Y%m%d%H%M%SZ", tm))
  98. return NULL;
  99. return s;
  100. }
  101. int64_t
  102. pdf_parse_date(fz_context *ctx, const char *s)
  103. {
  104. int tz_sign, tz_hour, tz_min, tz_adj;
  105. struct tm tm;
  106. time_t utc;
  107. if (!s[0])
  108. return -1;
  109. memset(&tm, 0, sizeof tm);
  110. tm.tm_mday = 1;
  111. tz_sign = 1;
  112. tz_hour = 0;
  113. tz_min = 0;
  114. if (s[0] == 'D' && s[1] == ':')
  115. s += 2;
  116. if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2]) || !isdigit(s[3]))
  117. {
  118. fz_warn(ctx, "invalid date format (missing year)");
  119. return -1;
  120. }
  121. tm.tm_year = (s[0]-'0')*1000 + (s[1]-'0')*100 + (s[2]-'0')*10 + (s[3]-'0') - 1900;
  122. s += 4;
  123. if (tm.tm_year < 70)
  124. {
  125. fz_warn(ctx, "invalid date (year out of range)");
  126. return -1;
  127. }
  128. if (isdigit(s[0]) && isdigit(s[1]))
  129. {
  130. tm.tm_mon = (s[0]-'0')*10 + (s[1]-'0') - 1; /* month is 0-11 in struct tm */
  131. s += 2;
  132. if (isdigit(s[0]) && isdigit(s[1]))
  133. {
  134. tm.tm_mday = (s[0]-'0')*10 + (s[1]-'0');
  135. s += 2;
  136. if (isdigit(s[0]) && isdigit(s[1]))
  137. {
  138. tm.tm_hour = (s[0]-'0')*10 + (s[1]-'0');
  139. s += 2;
  140. if (isdigit(s[0]) && isdigit(s[1]))
  141. {
  142. tm.tm_min = (s[0]-'0')*10 + (s[1]-'0');
  143. s += 2;
  144. if (isdigit(s[0]) && isdigit(s[1]))
  145. {
  146. tm.tm_sec = (s[0]-'0')*10 + (s[1]-'0');
  147. s += 2;
  148. }
  149. }
  150. }
  151. }
  152. }
  153. if (tm.tm_sec > 60 || tm.tm_min > 59 || tm.tm_hour > 23 || tm.tm_mday > 31 || tm.tm_mon > 11)
  154. {
  155. fz_warn(ctx, "invalid date (a field is out of range)");
  156. return -1;
  157. }
  158. if (s[0] == 'Z')
  159. {
  160. if (s[1] == '0' && s[2] == '0')
  161. {
  162. s += 3;
  163. if (s[0] == '\'' && s[1] == '0' && s[2] == '0')
  164. {
  165. s += 3;
  166. if (s[0] == '\'')
  167. s += 1;
  168. }
  169. }
  170. else
  171. {
  172. s += 1;
  173. }
  174. }
  175. else if ((s[0] == '-' || s[0] == '+') && isdigit(s[1]) && isdigit(s[2]))
  176. {
  177. tz_sign = (s[0] == '-') ? -1 : 1;
  178. tz_hour = (s[1]-'0')*10 + (s[2]-'0');
  179. s += 3;
  180. if (s[0] == '\'' && isdigit(s[1]) && isdigit(s[2]))
  181. {
  182. tz_min = (s[1]-'0')*10 + (s[2]-'0');
  183. s += 3;
  184. if (s[0] == '\'')
  185. s += 1;
  186. }
  187. }
  188. /* PDF is based on ISO/IEC 8824 which limits time zones from -15 to +16. */
  189. if (tz_sign < 0 && (tz_hour > 15 || (tz_hour == 15 && tz_min > 0)))
  190. {
  191. fz_warn(ctx, "invalid date format (time zone out of range)");
  192. return -1;
  193. }
  194. if (tz_sign > 0 && (tz_hour > 16 || (tz_hour == 16 && tz_min > 0)))
  195. {
  196. fz_warn(ctx, "invalid date format (time zone out of range)");
  197. return -1;
  198. }
  199. if (s[0] != 0)
  200. fz_warn(ctx, "invalid date format (garbage at end)");
  201. utc = timegm(&tm);
  202. if (utc == (time_t)-1)
  203. {
  204. fz_warn(ctx, "date overflow error");
  205. return -1;
  206. }
  207. tz_adj = tz_sign * (tz_hour * 3600 + tz_min * 60);
  208. return utc - tz_adj;
  209. }
  210. int64_t
  211. pdf_to_date(fz_context *ctx, pdf_obj *time)
  212. {
  213. return pdf_parse_date(ctx, pdf_to_str_buf(ctx, time));
  214. }
  215. static int
  216. rune_from_utf16be(int *out, const unsigned char *s, const unsigned char *end)
  217. {
  218. if (s + 2 <= end)
  219. {
  220. int a = s[0] << 8 | s[1];
  221. if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
  222. {
  223. int b = s[2] << 8 | s[3];
  224. *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
  225. return 4;
  226. }
  227. *out = a;
  228. return 2;
  229. }
  230. *out = FZ_REPLACEMENT_CHARACTER;
  231. return 1;
  232. }
  233. static int
  234. rune_from_utf16le(int *out, const unsigned char *s, const unsigned char *end)
  235. {
  236. if (s + 2 <= end)
  237. {
  238. int a = s[1] << 8 | s[0];
  239. if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
  240. {
  241. int b = s[3] << 8 | s[2];
  242. *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
  243. return 4;
  244. }
  245. *out = a;
  246. return 2;
  247. }
  248. *out = FZ_REPLACEMENT_CHARACTER;
  249. return 1;
  250. }
  251. static size_t
  252. skip_language_code_utf16le(const unsigned char *s, size_t n, size_t i)
  253. {
  254. /* skip language escape codes */
  255. if (i + 6 <= n && s[i+1] == 0 && s[i+0] == 27 && s[i+5] == 0 && s[i+4] == 27)
  256. return 6;
  257. else if (i + 8 <= n && s[i+1] == 0 && s[i+0] == 27 && s[i+7] == 0 && s[i+6] == 27)
  258. return 8;
  259. return 0;
  260. }
  261. static size_t
  262. skip_language_code_utf16be(const unsigned char *s, size_t n, size_t i)
  263. {
  264. /* skip language escape codes */
  265. if (i + 6 <= n && s[i+0] == 0 && s[i+1] == 27 && s[i+4] == 0 && s[i+5] == 27)
  266. return 6;
  267. else if (i + 8 <= n && s[i+0] == 0 && s[i+1] == 27 && s[i+6] == 0 && s[i+7] == 27)
  268. return 8;
  269. return 0;
  270. }
  271. static size_t
  272. skip_language_code_utf8(const unsigned char *s, size_t n, size_t i)
  273. {
  274. /* skip language escape codes */
  275. if (i + 3 <= n && s[i] == 27 && s[i+3])
  276. return 3;
  277. else if (i + 5 <= n && s[i] == 27 && s[i+5] == 27)
  278. return 5;
  279. return 0;
  280. }
  281. static int
  282. is_valid_utf8(const unsigned char *s, const unsigned char *end)
  283. {
  284. for (; s < end; ++s)
  285. {
  286. int skip = *s < 0x80 ? 0 : *s < 0xC0 ? -1 : *s < 0xE0 ? 1 : *s < 0xF0 ? 2 : *s < 0xF5 ? 3 : -1;
  287. if (skip == -1)
  288. return 0;
  289. while (skip-- > 0)
  290. if (++s >= end || (*s & 0xC0) != 0x80)
  291. return 0;
  292. }
  293. return 1;
  294. }
  295. char *
  296. pdf_new_utf8_from_pdf_string(fz_context *ctx, const char *ssrcptr, size_t srclen)
  297. {
  298. const unsigned char *srcptr = (const unsigned char*)ssrcptr;
  299. char *dstptr, *dst;
  300. size_t dstlen = 0;
  301. int ucs;
  302. size_t i, n;
  303. /* UTF-16BE */
  304. if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
  305. {
  306. i = 2;
  307. while (i + 2 <= srclen)
  308. {
  309. n = skip_language_code_utf16be(srcptr, srclen, i);
  310. if (n)
  311. i += n;
  312. else
  313. {
  314. i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
  315. dstlen += fz_runelen(ucs);
  316. }
  317. }
  318. dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16be");
  319. i = 2;
  320. while (i + 2 <= srclen)
  321. {
  322. n = skip_language_code_utf16be(srcptr, srclen, i);
  323. if (n)
  324. i += n;
  325. else
  326. {
  327. i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
  328. dstptr += fz_runetochar(dstptr, ucs);
  329. }
  330. }
  331. }
  332. /* UTF-16LE */
  333. else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
  334. {
  335. i = 2;
  336. while (i + 2 <= srclen)
  337. {
  338. n = skip_language_code_utf16le(srcptr, srclen, i);
  339. if (n)
  340. i += n;
  341. else
  342. {
  343. i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
  344. dstlen += fz_runelen(ucs);
  345. }
  346. }
  347. dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16le");
  348. i = 2;
  349. while (i + 2 <= srclen)
  350. {
  351. n = skip_language_code_utf16le(srcptr, srclen, i);
  352. if (n)
  353. i += n;
  354. else
  355. {
  356. i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
  357. dstptr += fz_runetochar(dstptr, ucs);
  358. }
  359. }
  360. }
  361. /* UTF-8 */
  362. else if (srclen >= 3 && srcptr[0] == 239 && srcptr[1] == 187 && srcptr[2] == 191)
  363. {
  364. i = 3;
  365. while (i < srclen)
  366. {
  367. n = skip_language_code_utf8(srcptr, srclen, i);
  368. if (n)
  369. i += n;
  370. else
  371. {
  372. i += 1;
  373. dstlen += 1;
  374. }
  375. }
  376. dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf8");
  377. i = 3;
  378. while (i < srclen)
  379. {
  380. n = skip_language_code_utf8(srcptr, srclen, i);
  381. if (n)
  382. i += n;
  383. else
  384. *dstptr++ = srcptr[i++];
  385. }
  386. }
  387. /* Detect UTF-8 strings that aren't marked with a BOM */
  388. else if (is_valid_utf8(srcptr, srcptr + srclen))
  389. {
  390. dst = Memento_label(fz_malloc(ctx, srclen + 1), "utf8_from_guess");
  391. memcpy(dst, srcptr, srclen);
  392. dstptr = dst + srclen;
  393. }
  394. /* PDFDocEncoding */
  395. else
  396. {
  397. for (i = 0; i < srclen; i++)
  398. dstlen += fz_runelen(fz_unicode_from_pdf_doc_encoding[srcptr[i]]);
  399. dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_pdfdocenc");
  400. for (i = 0; i < srclen; i++)
  401. {
  402. ucs = fz_unicode_from_pdf_doc_encoding[srcptr[i]];
  403. dstptr += fz_runetochar(dstptr, ucs);
  404. }
  405. }
  406. *dstptr = 0;
  407. return dst;
  408. }
  409. char *
  410. pdf_new_utf8_from_pdf_string_obj(fz_context *ctx, pdf_obj *src)
  411. {
  412. const char *srcptr;
  413. size_t srclen;
  414. srcptr = pdf_to_string(ctx, src, &srclen);
  415. return pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
  416. }
  417. char *
  418. pdf_new_utf8_from_pdf_stream_obj(fz_context *ctx, pdf_obj *src)
  419. {
  420. fz_buffer *stmbuf;
  421. char *srcptr;
  422. size_t srclen;
  423. char *dst = NULL;
  424. stmbuf = pdf_load_stream(ctx, src);
  425. srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr);
  426. fz_try(ctx)
  427. dst = pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
  428. fz_always(ctx)
  429. fz_drop_buffer(ctx, stmbuf);
  430. fz_catch(ctx)
  431. fz_rethrow(ctx);
  432. return dst;
  433. }
  434. char *
  435. pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src)
  436. {
  437. if (pdf_is_stream(ctx, src))
  438. return pdf_new_utf8_from_pdf_stream_obj(ctx, src);
  439. return pdf_new_utf8_from_pdf_string_obj(ctx, src);
  440. }
  441. static pdf_obj *
  442. pdf_new_text_string_utf16be(fz_context *ctx, const char *s)
  443. {
  444. const char *ss;
  445. int c, i, n, a, b;
  446. unsigned char *p;
  447. pdf_obj *obj;
  448. ss = s;
  449. n = 0;
  450. while (*ss)
  451. {
  452. ss += fz_chartorune(&c, ss);
  453. n += (c >= 0x10000) ? 2 : 1;
  454. }
  455. p = fz_malloc(ctx, n * 2 + 2);
  456. i = 0;
  457. p[i++] = 254;
  458. p[i++] = 255;
  459. while (*s)
  460. {
  461. s += fz_chartorune(&c, s);
  462. if (c >= 0x10000)
  463. {
  464. a = (((c - 0x10000) >> 10) & 0x3ff) + 0xD800;
  465. p[i++] = (a>>8) & 0xff;
  466. p[i++] = (a) & 0xff;
  467. b = (((c - 0x10000)) & 0x3ff) + 0xDC00;
  468. p[i++] = (b>>8) & 0xff;
  469. p[i++] = (b) & 0xff;
  470. }
  471. else
  472. {
  473. p[i++] = (c>>8) & 0xff;
  474. p[i++] = (c) & 0xff;
  475. }
  476. }
  477. fz_try(ctx)
  478. obj = pdf_new_string(ctx, (char*)p, i);
  479. fz_always(ctx)
  480. fz_free(ctx, p);
  481. fz_catch(ctx)
  482. fz_rethrow(ctx);
  483. return obj;
  484. }
  485. pdf_obj *
  486. pdf_new_text_string(fz_context *ctx, const char *s)
  487. {
  488. int i = 0;
  489. while (s[i] != 0)
  490. {
  491. if (((unsigned char)s[i]) >= 128)
  492. return pdf_new_text_string_utf16be(ctx, s);
  493. ++i;
  494. }
  495. return pdf_new_string(ctx, s, i);
  496. }
  497. pdf_obj *
  498. pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
  499. {
  500. pdf_obj *ary = NULL;
  501. pdf_obj *obj = NULL;
  502. int64_t a = 0, b = 0, n = 0;
  503. pdf_token tok;
  504. pdf_obj *op = NULL;
  505. fz_var(obj);
  506. ary = pdf_new_array(ctx, doc, 4);
  507. fz_try(ctx)
  508. {
  509. while (1)
  510. {
  511. tok = pdf_lex(ctx, file, buf);
  512. if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
  513. {
  514. if (n > 0)
  515. pdf_array_push_int(ctx, ary, a);
  516. if (n > 1)
  517. pdf_array_push_int(ctx, ary, b);
  518. n = 0;
  519. }
  520. if (tok == PDF_TOK_INT && n == 2)
  521. {
  522. pdf_array_push_int(ctx, ary, a);
  523. a = b;
  524. n --;
  525. }
  526. switch (tok)
  527. {
  528. case PDF_TOK_EOF:
  529. fz_throw(ctx, FZ_ERROR_SYNTAX, "array not closed before end of file");
  530. case PDF_TOK_CLOSE_ARRAY:
  531. op = ary;
  532. goto end;
  533. case PDF_TOK_INT:
  534. if (n == 0)
  535. a = buf->i;
  536. if (n == 1)
  537. b = buf->i;
  538. n ++;
  539. break;
  540. case PDF_TOK_R:
  541. if (n != 2)
  542. fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse indirect reference in array");
  543. pdf_array_push_drop(ctx, ary, pdf_new_indirect(ctx, doc, a, b));
  544. n = 0;
  545. break;
  546. case PDF_TOK_OPEN_ARRAY:
  547. obj = pdf_parse_array(ctx, doc, file, buf);
  548. pdf_array_push_drop(ctx, ary, obj);
  549. break;
  550. case PDF_TOK_OPEN_DICT:
  551. obj = pdf_parse_dict(ctx, doc, file, buf);
  552. pdf_array_push_drop(ctx, ary, obj);
  553. break;
  554. case PDF_TOK_NAME:
  555. pdf_array_push_name(ctx, ary, buf->scratch);
  556. break;
  557. case PDF_TOK_REAL:
  558. pdf_array_push_real(ctx, ary, buf->f);
  559. break;
  560. case PDF_TOK_STRING:
  561. pdf_array_push_string(ctx, ary, buf->scratch, buf->len);
  562. break;
  563. case PDF_TOK_TRUE:
  564. pdf_array_push_bool(ctx, ary, 1);
  565. break;
  566. case PDF_TOK_FALSE:
  567. pdf_array_push_bool(ctx, ary, 0);
  568. break;
  569. case PDF_TOK_NULL:
  570. pdf_array_push(ctx, ary, PDF_NULL);
  571. break;
  572. default:
  573. pdf_array_push(ctx, ary, PDF_NULL);
  574. break;
  575. }
  576. }
  577. end:
  578. {}
  579. }
  580. fz_catch(ctx)
  581. {
  582. pdf_drop_obj(ctx, ary);
  583. fz_rethrow(ctx);
  584. }
  585. return op;
  586. }
  587. pdf_obj *
  588. pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
  589. {
  590. pdf_obj *dict;
  591. pdf_obj *key = NULL;
  592. pdf_obj *val = NULL;
  593. pdf_token tok;
  594. int64_t a, b;
  595. dict = pdf_new_dict(ctx, doc, 8);
  596. fz_var(key);
  597. fz_var(val);
  598. fz_try(ctx)
  599. {
  600. while (1)
  601. {
  602. tok = pdf_lex(ctx, file, buf);
  603. skip:
  604. if (tok == PDF_TOK_CLOSE_DICT)
  605. break;
  606. /* for BI .. ID .. EI in content streams */
  607. if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
  608. break;
  609. if (tok != PDF_TOK_NAME)
  610. fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid key in dict");
  611. key = pdf_new_name(ctx, buf->scratch);
  612. tok = pdf_lex(ctx, file, buf);
  613. switch (tok)
  614. {
  615. case PDF_TOK_OPEN_ARRAY:
  616. val = pdf_parse_array(ctx, doc, file, buf);
  617. break;
  618. case PDF_TOK_OPEN_DICT:
  619. val = pdf_parse_dict(ctx, doc, file, buf);
  620. break;
  621. case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break;
  622. case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
  623. case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
  624. case PDF_TOK_TRUE: val = PDF_TRUE; break;
  625. case PDF_TOK_FALSE: val = PDF_FALSE; break;
  626. case PDF_TOK_NULL: val = PDF_NULL; break;
  627. case PDF_TOK_INT:
  628. /* 64-bit to allow for numbers > INT_MAX and overflow */
  629. a = buf->i;
  630. tok = pdf_lex(ctx, file, buf);
  631. if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
  632. (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
  633. {
  634. pdf_dict_put_int(ctx, dict, key, a);
  635. pdf_drop_obj(ctx, key);
  636. key = NULL;
  637. goto skip;
  638. }
  639. if (tok == PDF_TOK_INT)
  640. {
  641. b = buf->i;
  642. tok = pdf_lex(ctx, file, buf);
  643. if (tok == PDF_TOK_R)
  644. {
  645. val = pdf_new_indirect(ctx, doc, a, b);
  646. break;
  647. }
  648. }
  649. fz_warn(ctx, "invalid indirect reference in dict");
  650. val = PDF_NULL;
  651. break;
  652. default:
  653. val = PDF_NULL;
  654. break;
  655. }
  656. pdf_dict_put(ctx, dict, key, val);
  657. pdf_drop_obj(ctx, val);
  658. val = NULL;
  659. pdf_drop_obj(ctx, key);
  660. key = NULL;
  661. }
  662. }
  663. fz_catch(ctx)
  664. {
  665. pdf_drop_obj(ctx, dict);
  666. pdf_drop_obj(ctx, key);
  667. pdf_drop_obj(ctx, val);
  668. fz_rethrow(ctx);
  669. }
  670. return dict;
  671. }
  672. pdf_obj *
  673. pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
  674. {
  675. pdf_token tok;
  676. tok = pdf_lex(ctx, file, buf);
  677. switch (tok)
  678. {
  679. case PDF_TOK_OPEN_ARRAY:
  680. return pdf_parse_array(ctx, doc, file, buf);
  681. case PDF_TOK_OPEN_DICT:
  682. return pdf_parse_dict(ctx, doc, file, buf);
  683. case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch);
  684. case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f);
  685. case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len);
  686. case PDF_TOK_TRUE: return PDF_TRUE;
  687. case PDF_TOK_FALSE: return PDF_FALSE;
  688. case PDF_TOK_NULL: return PDF_NULL;
  689. case PDF_TOK_INT: return pdf_new_int(ctx, buf->i);
  690. default: fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown token in object stream");
  691. }
  692. }
  693. pdf_obj *
  694. pdf_parse_ind_obj_or_newobj(fz_context *ctx, pdf_document *doc, fz_stream *file,
  695. int *onum, int *ogen, int64_t *ostmofs, int *try_repair, int *newobj)
  696. {
  697. pdf_obj *obj = NULL;
  698. int num = 0, gen = 0;
  699. int64_t stm_ofs;
  700. pdf_token tok;
  701. pdf_lexbuf *buf = &doc->lexbuf.base;
  702. int64_t a, b;
  703. int read_next_token = 1;
  704. fz_var(obj);
  705. tok = pdf_lex(ctx, file, buf);
  706. if (tok != PDF_TOK_INT)
  707. {
  708. if (try_repair)
  709. *try_repair = 1;
  710. fz_throw(ctx, FZ_ERROR_SYNTAX, "expected object number");
  711. }
  712. num = buf->i;
  713. if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
  714. fz_throw(ctx, FZ_ERROR_SYNTAX, "object number out of range");
  715. tok = pdf_lex(ctx, file, buf);
  716. if (tok != PDF_TOK_INT)
  717. {
  718. if (try_repair)
  719. *try_repair = 1;
  720. fz_throw(ctx, FZ_ERROR_SYNTAX, "expected generation number (%d ? obj)", num);
  721. }
  722. gen = buf->i;
  723. if (gen < 0 || gen >= 65536)
  724. {
  725. if (try_repair)
  726. *try_repair = 1;
  727. fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid generation number (%d)", gen);
  728. }
  729. tok = pdf_lex(ctx, file, buf);
  730. if (tok == PDF_TOK_NEWOBJ && newobj)
  731. {
  732. *newobj = 1;
  733. if (onum) *onum = num;
  734. if (ogen) *ogen = gen;
  735. if (ostmofs) *ostmofs = 0;
  736. return NULL;
  737. }
  738. if (tok != PDF_TOK_OBJ)
  739. {
  740. if (try_repair)
  741. *try_repair = 1;
  742. fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'obj' keyword (%d %d ?)", num, gen);
  743. }
  744. tok = pdf_lex(ctx, file, buf);
  745. switch (tok)
  746. {
  747. case PDF_TOK_OPEN_ARRAY:
  748. obj = pdf_parse_array(ctx, doc, file, buf);
  749. break;
  750. case PDF_TOK_OPEN_DICT:
  751. obj = pdf_parse_dict(ctx, doc, file, buf);
  752. break;
  753. case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break;
  754. case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break;
  755. case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break;
  756. case PDF_TOK_TRUE: obj = PDF_TRUE; break;
  757. case PDF_TOK_FALSE: obj = PDF_FALSE; break;
  758. case PDF_TOK_NULL: obj = PDF_NULL; break;
  759. case PDF_TOK_INT:
  760. a = buf->i;
  761. tok = pdf_lex(ctx, file, buf);
  762. if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
  763. {
  764. obj = pdf_new_int(ctx, a);
  765. read_next_token = 0;
  766. break;
  767. }
  768. else if (tok == PDF_TOK_INT)
  769. {
  770. b = buf->i;
  771. tok = pdf_lex(ctx, file, buf);
  772. if (tok == PDF_TOK_R)
  773. {
  774. obj = pdf_new_indirect(ctx, doc, a, b);
  775. break;
  776. }
  777. }
  778. fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'R' keyword (%d %d R)", num, gen);
  779. case PDF_TOK_ENDOBJ:
  780. obj = PDF_NULL;
  781. read_next_token = 0;
  782. break;
  783. default:
  784. fz_throw(ctx, FZ_ERROR_SYNTAX, "syntax error in object (%d %d R)", num, gen);
  785. }
  786. fz_try(ctx)
  787. {
  788. if (read_next_token)
  789. tok = pdf_lex(ctx, file, buf);
  790. if (tok == PDF_TOK_STREAM)
  791. {
  792. int c = fz_read_byte(ctx, file);
  793. while (c == ' ')
  794. c = fz_read_byte(ctx, file);
  795. if (c == '\r')
  796. {
  797. c = fz_peek_byte(ctx, file);
  798. if (c != '\n')
  799. fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
  800. else
  801. fz_read_byte(ctx, file);
  802. }
  803. stm_ofs = fz_tell(ctx, file);
  804. }
  805. else if (tok == PDF_TOK_ENDOBJ)
  806. {
  807. stm_ofs = 0;
  808. }
  809. else
  810. {
  811. fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
  812. stm_ofs = 0;
  813. }
  814. }
  815. fz_catch(ctx)
  816. {
  817. pdf_drop_obj(ctx, obj);
  818. fz_rethrow(ctx);
  819. }
  820. if (onum) *onum = num;
  821. if (ogen) *ogen = gen;
  822. if (ostmofs) *ostmofs = stm_ofs;
  823. return obj;
  824. }
  825. pdf_obj *
  826. pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, fz_stream *file,
  827. int *onum, int *ogen, int64_t *ostmofs, int *try_repair)
  828. {
  829. return pdf_parse_ind_obj_or_newobj(ctx, doc, file, onum, ogen, ostmofs, try_repair, NULL);
  830. }
  831. pdf_obj *
  832. pdf_parse_journal_obj(fz_context *ctx, pdf_document *doc, fz_stream *stm,
  833. int *onum, fz_buffer **ostm, int *newobj)
  834. {
  835. pdf_obj *obj = NULL;
  836. pdf_token tok;
  837. pdf_lexbuf *buf = &doc->lexbuf.base;
  838. int64_t stmofs;
  839. *newobj = 0;
  840. obj = pdf_parse_ind_obj_or_newobj(ctx, doc, stm, onum, NULL, &stmofs, NULL, newobj);
  841. /* This will have consumed either the stream or the endobj keywords. */
  842. *ostm = NULL;
  843. if (stmofs)
  844. {
  845. fz_stream *stream = NULL;
  846. fz_var(stream);
  847. fz_try(ctx)
  848. {
  849. stream = fz_open_endstream_filter(ctx, stm, 0, stmofs);
  850. *ostm = fz_read_all(ctx, stream, 32);
  851. fz_drop_stream(ctx, stream);
  852. stream = NULL;
  853. fz_seek(ctx, stm, stmofs + (*ostm ? (*ostm)->len : 0), SEEK_SET);
  854. tok = pdf_lex(ctx, stm, buf);
  855. if (tok != PDF_TOK_ENDSTREAM)
  856. fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'endstream' keyword");
  857. tok = pdf_lex(ctx, stm, buf);
  858. if (tok != PDF_TOK_ENDOBJ)
  859. fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'endobj' keyword");
  860. }
  861. fz_always(ctx)
  862. fz_drop_stream(ctx, stream);
  863. fz_catch(ctx)
  864. {
  865. pdf_drop_obj(ctx, obj);
  866. fz_rethrow(ctx);
  867. }
  868. }
  869. return obj;
  870. }