// pdf-stream.c
// Copyright (C) 2004-2025 Artifex Software, Inc.
//
// This file is part of MuPDF.
//
// MuPDF is free software: you can redistribute it and/or modify it under the
// terms of the GNU Affero General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
// details.
//
// You should have received a copy of the GNU Affero General Public License
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
//
// Alternative licensing terms are available from the licensor.
// For commercial licensing, see <https://www.artifex.com/> or contact
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
// CA 94129, USA, for further information.
#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#include <stdint.h>
#include <string.h>
  25. int
  26. pdf_obj_num_is_stream(fz_context *ctx, pdf_document *doc, int num)
  27. {
  28. pdf_xref_entry *entry;
  29. if (num <= 0 || num >= pdf_xref_len(ctx, doc))
  30. return 0;
  31. fz_try(ctx)
  32. entry = pdf_cache_object(ctx, doc, num);
  33. fz_catch(ctx)
  34. {
  35. fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
  36. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  37. fz_report_error(ctx);
  38. return 0;
  39. }
  40. return entry->stm_ofs != 0 || entry->stm_buf;
  41. }
  42. int
  43. pdf_is_stream(fz_context *ctx, pdf_obj *ref)
  44. {
  45. pdf_document *doc = pdf_get_indirect_document(ctx, ref);
  46. if (doc)
  47. return pdf_obj_num_is_stream(ctx, doc, pdf_to_num(ctx, ref));
  48. return 0;
  49. }
  50. /*
  51. * Scan stream dictionary for an explicit /Crypt filter
  52. */
  53. static int
  54. pdf_stream_has_crypt(fz_context *ctx, pdf_obj *stm)
  55. {
  56. pdf_obj *filters;
  57. pdf_obj *obj;
  58. int i;
  59. filters = pdf_dict_geta(ctx, stm, PDF_NAME(Filter), PDF_NAME(F));
  60. if (filters)
  61. {
  62. if (pdf_name_eq(ctx, filters, PDF_NAME(Crypt)))
  63. return 1;
  64. if (pdf_is_array(ctx, filters))
  65. {
  66. int n = pdf_array_len(ctx, filters);
  67. for (i = 0; i < n; i++)
  68. {
  69. obj = pdf_array_get(ctx, filters, i);
  70. if (pdf_name_eq(ctx, obj, PDF_NAME(Crypt)))
  71. return 1;
  72. }
  73. }
  74. }
  75. return 0;
  76. }
  77. static fz_jbig2_globals *
  78. pdf_load_jbig2_globals(fz_context *ctx, pdf_obj *dict)
  79. {
  80. fz_jbig2_globals *globals;
  81. fz_buffer *buf = NULL;
  82. fz_var(buf);
  83. if ((globals = pdf_find_item(ctx, fz_drop_jbig2_globals_imp, dict)) != NULL)
  84. return globals;
  85. if (pdf_mark_obj(ctx, dict))
  86. fz_throw(ctx, FZ_ERROR_FORMAT, "cyclic reference when loading JBIG2 globals");
  87. fz_try(ctx)
  88. {
  89. buf = pdf_load_stream(ctx, dict);
  90. globals = fz_load_jbig2_globals(ctx, buf);
  91. if (globals)
  92. pdf_store_item(ctx, dict, globals, fz_buffer_storage(ctx, buf, NULL));
  93. }
  94. fz_always(ctx)
  95. {
  96. fz_drop_buffer(ctx, buf);
  97. pdf_unmark_obj(ctx, dict);
  98. }
  99. fz_catch(ctx)
  100. {
  101. fz_rethrow(ctx);
  102. }
  103. return globals;
  104. }
  105. static void
  106. build_compression_params(fz_context *ctx, pdf_obj *f, pdf_obj *p, fz_compression_params *params)
  107. {
  108. params->type = FZ_IMAGE_RAW;
  109. if (pdf_name_eq(ctx, f, PDF_NAME(CCITTFaxDecode)) || pdf_name_eq(ctx, f, PDF_NAME(CCF)))
  110. {
  111. params->type = FZ_IMAGE_FAX;
  112. params->u.fax.k = pdf_dict_get_int_default(ctx, p, PDF_NAME(K), 0);
  113. params->u.fax.end_of_line = pdf_dict_get_bool_default(ctx, p, PDF_NAME(EndOfLine), 0);
  114. params->u.fax.encoded_byte_align = pdf_dict_get_bool_default(ctx, p, PDF_NAME(EncodedByteAlign), 0);
  115. params->u.fax.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1728);
  116. params->u.fax.rows = pdf_dict_get_int_default(ctx, p, PDF_NAME(Rows), 0);
  117. params->u.fax.end_of_block = pdf_dict_get_bool_default(ctx, p, PDF_NAME(EndOfBlock), 1);
  118. params->u.fax.black_is_1 = pdf_dict_get_bool_default(ctx, p, PDF_NAME(BlackIs1), 0);
  119. }
  120. else if (pdf_name_eq(ctx, f, PDF_NAME(DCTDecode)) || pdf_name_eq(ctx, f, PDF_NAME(DCT)))
  121. {
  122. params->type = FZ_IMAGE_JPEG;
  123. params->u.jpeg.color_transform = pdf_dict_get_int_default(ctx, p, PDF_NAME(ColorTransform), -1);
  124. params->u.jpeg.invert_cmyk = 0;
  125. }
  126. else if (pdf_name_eq(ctx, f, PDF_NAME(RunLengthDecode)) || pdf_name_eq(ctx, f, PDF_NAME(RL)))
  127. {
  128. params->type = FZ_IMAGE_RLD;
  129. }
  130. else if (pdf_name_eq(ctx, f, PDF_NAME(FlateDecode)) || pdf_name_eq(ctx, f, PDF_NAME(Fl)))
  131. {
  132. params->type = FZ_IMAGE_FLATE;
  133. params->u.flate.predictor = pdf_dict_get_int_default(ctx, p, PDF_NAME(Predictor), 1);
  134. params->u.flate.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1);
  135. params->u.flate.colors = pdf_dict_get_int_default(ctx, p, PDF_NAME(Colors), 1);
  136. params->u.flate.bpc = pdf_dict_get_int_default(ctx, p, PDF_NAME(BitsPerComponent), 8);
  137. }
  138. else if (pdf_name_eq(ctx, f, PDF_NAME(BrotliDecode)) || pdf_name_eq(ctx, f, PDF_NAME(Br)))
  139. {
  140. params->type = FZ_IMAGE_BROTLI;
  141. params->u.brotli.predictor = pdf_dict_get_int_default(ctx, p, PDF_NAME(Predictor), 1);
  142. params->u.brotli.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1);
  143. params->u.brotli.colors = pdf_dict_get_int_default(ctx, p, PDF_NAME(Colors), 1);
  144. params->u.brotli.bpc = pdf_dict_get_int_default(ctx, p, PDF_NAME(BitsPerComponent), 8);
  145. }
  146. else if (pdf_name_eq(ctx, f, PDF_NAME(LZWDecode)) || pdf_name_eq(ctx, f, PDF_NAME(LZW)))
  147. {
  148. params->type = FZ_IMAGE_LZW;
  149. params->u.lzw.predictor = pdf_dict_get_int_default(ctx, p, PDF_NAME(Predictor), 1);
  150. params->u.lzw.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1);
  151. params->u.lzw.colors = pdf_dict_get_int_default(ctx, p, PDF_NAME(Colors), 1);
  152. params->u.lzw.bpc = pdf_dict_get_int_default(ctx, p, PDF_NAME(BitsPerComponent), 8);
  153. params->u.lzw.early_change = pdf_dict_get_int_default(ctx, p, PDF_NAME(EarlyChange), 1);
  154. }
  155. else if (pdf_name_eq(ctx, f, PDF_NAME(JBIG2Decode)))
  156. {
  157. pdf_obj *g = pdf_dict_get(ctx, p, PDF_NAME(JBIG2Globals));
  158. params->type = FZ_IMAGE_JBIG2;
  159. params->u.jbig2.globals = NULL;
  160. params->u.jbig2.embedded = 1; /* jbig2 streams are always embedded without file headers */
  161. if (g)
  162. {
  163. if (!pdf_is_stream(ctx, g))
  164. fz_warn(ctx, "jbig2 globals is not a stream, skipping globals");
  165. else
  166. params->u.jbig2.globals = pdf_load_jbig2_globals(ctx, g);
  167. }
  168. }
  169. }
  170. /*
  171. * Create a filter given a name and param dictionary.
  172. */
  173. static fz_stream *
  174. build_filter(fz_context *ctx, fz_stream *chain, pdf_document *doc, pdf_obj *f, pdf_obj *p, int num, int gen, fz_compression_params *params, int might_be_image)
  175. {
  176. fz_compression_params local_params;
  177. local_params.u.jbig2.globals = NULL;
  178. if (params == NULL)
  179. params = &local_params;
  180. if (!might_be_image &&
  181. (pdf_name_eq(ctx, f, PDF_NAME(CCITTFaxDecode)) ||
  182. pdf_name_eq(ctx, f, PDF_NAME(CCF)) ||
  183. pdf_name_eq(ctx, f, PDF_NAME(DCTDecode)) ||
  184. pdf_name_eq(ctx, f, PDF_NAME(DCT)) ||
  185. pdf_name_eq(ctx, f, PDF_NAME(JBIG2Decode)) ||
  186. pdf_name_eq(ctx, f, PDF_NAME(JPXDecode))))
  187. {
  188. fz_warn(ctx, "Can't open image only stream for non-image purposes");
  189. return fz_open_memory(ctx, (unsigned char *)"", 0);
  190. }
  191. build_compression_params(ctx, f, p, params);
  192. /* If we were using params we were passed in, and we successfully
  193. * recognised the image type, we can use the existing filter and
  194. * shortstop here. */
  195. if (params != &local_params && params->type != FZ_IMAGE_RAW)
  196. return fz_keep_stream(ctx, chain); /* nothing to do */
  197. else if (params->type == FZ_IMAGE_JBIG2)
  198. {
  199. fz_stream *stm;
  200. fz_try(ctx)
  201. stm = fz_open_image_decomp_stream(ctx, chain, params, NULL);
  202. fz_always(ctx)
  203. fz_drop_jbig2_globals(ctx, local_params.u.jbig2.globals);
  204. fz_catch(ctx)
  205. fz_rethrow(ctx);
  206. return stm;
  207. }
  208. else if (params->type != FZ_IMAGE_RAW)
  209. return fz_open_image_decomp_stream(ctx, chain, params, NULL);
  210. else if (pdf_name_eq(ctx, f, PDF_NAME(ASCIIHexDecode)) || pdf_name_eq(ctx, f, PDF_NAME(AHx)))
  211. return fz_open_ahxd(ctx, chain);
  212. else if (pdf_name_eq(ctx, f, PDF_NAME(ASCII85Decode)) || pdf_name_eq(ctx, f, PDF_NAME(A85)))
  213. return fz_open_a85d(ctx, chain);
  214. else if (pdf_name_eq(ctx, f, PDF_NAME(JPXDecode)))
  215. return fz_keep_stream(ctx, chain); /* JPX decoding is special cased in the image loading code */
  216. else if (pdf_name_eq(ctx, f, PDF_NAME(Crypt)))
  217. {
  218. if (!doc->crypt)
  219. fz_warn(ctx, "crypt filter in unencrypted document");
  220. else
  221. {
  222. pdf_obj *name = pdf_dict_get(ctx, p, PDF_NAME(Name));
  223. if (pdf_is_name(ctx, name))
  224. return pdf_open_crypt_with_filter(ctx, chain, doc->crypt, name, num, gen);
  225. }
  226. }
  227. else
  228. fz_warn(ctx, "unknown filter name (%s)", pdf_to_name(ctx, f));
  229. return fz_keep_stream(ctx, chain);
  230. }
  231. /* Build filter, and assume ownership of chain */
  232. static fz_stream *
  233. build_filter_drop(fz_context *ctx, fz_stream *tail, pdf_document *doc, pdf_obj *f, pdf_obj *p, int num, int gen, fz_compression_params *params, int might_be_image)
  234. {
  235. fz_stream *head;
  236. fz_try(ctx)
  237. head = build_filter(ctx, tail, doc, f, p, num, gen, params, might_be_image);
  238. fz_always(ctx)
  239. fz_drop_stream(ctx, tail);
  240. fz_catch(ctx)
  241. fz_rethrow(ctx);
  242. return head;
  243. }
  244. /*
  245. * Build a chain of filters given filter names and param dicts.
  246. * If chain is given, start filter chain with it.
  247. * Assume ownership of chain.
  248. */
  249. static fz_stream *
  250. build_filter_chain_drop(fz_context *ctx, fz_stream *chain, pdf_document *doc, pdf_obj *fs, pdf_obj *ps, int num, int gen, fz_compression_params *params, int might_be_image)
  251. {
  252. fz_var(chain);
  253. fz_try(ctx)
  254. {
  255. int i, n = pdf_array_len(ctx, fs);
  256. for (i = 0; i < n; i++)
  257. {
  258. pdf_obj *f = pdf_array_get(ctx, fs, i);
  259. pdf_obj *p = pdf_array_get(ctx, ps, i);
  260. chain = build_filter_drop(ctx, chain, doc, f, p, num, gen, (i == n-1 ? params : NULL), might_be_image);
  261. }
  262. }
  263. fz_catch(ctx)
  264. fz_rethrow(ctx);
  265. return chain;
  266. }
  267. static fz_stream *
  268. build_filter_chain(fz_context *ctx, fz_stream *chain, pdf_document *doc, pdf_obj *fs, pdf_obj *ps, int num, int gen, fz_compression_params *params, int might_be_image)
  269. {
  270. return build_filter_chain_drop(ctx, fz_keep_stream(ctx, chain), doc, fs, ps, num, gen, params, might_be_image);
  271. }
  272. /*
  273. * Build a filter for reading raw stream data.
  274. * This is a null filter to constrain reading to the stream length (and to
  275. * allow for other people accessing the file), followed by a decryption
  276. * filter.
  277. *
  278. * orig_num and orig_gen are used purely to seed the encryption.
  279. */
  280. static fz_stream *
  281. pdf_open_raw_filter(fz_context *ctx, fz_stream *file_stm, pdf_document *doc, pdf_obj *stmobj, int num, int *orig_num, int *orig_gen, int64_t offset)
  282. {
  283. pdf_xref_entry *x = NULL;
  284. fz_stream *null_stm, *crypt_stm;
  285. int hascrypt;
  286. int64_t len;
  287. if (num > 0 && num < pdf_xref_len(ctx, doc))
  288. {
  289. x = pdf_get_xref_entry(ctx, doc, num);
  290. }
  291. if (x == NULL)
  292. {
  293. /* We only end up here when called from pdf_open_stream_with_offset to parse new format XRef sections. */
  294. /* New style XRef sections must have generation number 0. */
  295. *orig_num = num;
  296. *orig_gen = 0;
  297. }
  298. else
  299. {
  300. *orig_num = x->num;
  301. *orig_gen = x->gen;
  302. if (x->stm_buf)
  303. return fz_open_buffer(ctx, x->stm_buf);
  304. }
  305. hascrypt = pdf_stream_has_crypt(ctx, stmobj);
  306. len = pdf_dict_get_int64(ctx, stmobj, PDF_NAME(Length));
  307. if (len < 0)
  308. len = 0;
  309. null_stm = fz_open_endstream_filter(ctx, file_stm, (uint64_t)len, offset);
  310. if (doc->crypt && !hascrypt)
  311. {
  312. fz_try(ctx)
  313. crypt_stm = pdf_open_crypt(ctx, null_stm, doc->crypt, *orig_num, *orig_gen);
  314. fz_always(ctx)
  315. fz_drop_stream(ctx, null_stm);
  316. fz_catch(ctx)
  317. fz_rethrow(ctx);
  318. return crypt_stm;
  319. }
  320. return null_stm;
  321. }
  322. /*
  323. * Construct a filter to decode a stream, constraining
  324. * to stream length and decrypting.
  325. */
  326. static fz_stream *
  327. pdf_open_filter(fz_context *ctx, pdf_document *doc, fz_stream *file_stm, pdf_obj *stmobj, int num, int64_t offset, fz_compression_params *imparams, int might_be_image)
  328. {
  329. pdf_obj *filters = pdf_dict_geta(ctx, stmobj, PDF_NAME(Filter), PDF_NAME(F));
  330. pdf_obj *params = pdf_dict_geta(ctx, stmobj, PDF_NAME(DecodeParms), PDF_NAME(DP));
  331. int orig_num, orig_gen;
  332. fz_stream *rstm, *fstm;
  333. rstm = pdf_open_raw_filter(ctx, file_stm, doc, stmobj, num, &orig_num, &orig_gen, offset);
  334. fz_try(ctx)
  335. {
  336. if (pdf_is_name(ctx, filters))
  337. fstm = build_filter(ctx, rstm, doc, filters, params, orig_num, orig_gen, imparams, might_be_image);
  338. else if (pdf_array_len(ctx, filters) > 0)
  339. fstm = build_filter_chain(ctx, rstm, doc, filters, params, orig_num, orig_gen, imparams, might_be_image);
  340. else
  341. {
  342. if (imparams)
  343. imparams->type = FZ_IMAGE_RAW;
  344. fstm = fz_keep_stream(ctx, rstm);
  345. }
  346. }
  347. fz_always(ctx)
  348. fz_drop_stream(ctx, rstm);
  349. fz_catch(ctx)
  350. fz_rethrow(ctx);
  351. return fstm;
  352. }
  353. fz_stream *
  354. pdf_open_inline_stream(fz_context *ctx, pdf_document *doc, pdf_obj *stmobj, int length, fz_stream *file_stm, fz_compression_params *imparams)
  355. {
  356. pdf_obj *filters = pdf_dict_geta(ctx, stmobj, PDF_NAME(Filter), PDF_NAME(F));
  357. pdf_obj *params = pdf_dict_geta(ctx, stmobj, PDF_NAME(DecodeParms), PDF_NAME(DP));
  358. if (pdf_is_name(ctx, filters))
  359. return build_filter(ctx, file_stm, doc, filters, params, 0, 0, imparams, 1);
  360. else if (pdf_array_len(ctx, filters) > 0)
  361. return build_filter_chain(ctx, file_stm, doc, filters, params, 0, 0, imparams, 1);
  362. if (imparams)
  363. imparams->type = FZ_IMAGE_RAW;
  364. return fz_open_null_filter(ctx, file_stm, length, fz_tell(ctx, file_stm));
  365. }
  366. void
  367. pdf_load_compressed_inline_image(fz_context *ctx, pdf_document *doc, pdf_obj *dict, int length, fz_stream *file_stm, int indexed, fz_compressed_image *image)
  368. {
  369. fz_stream *istm = NULL, *leech = NULL, *decomp = NULL;
  370. fz_pixmap *pixmap = NULL;
  371. fz_compressed_buffer *bc;
  372. int dummy_l2factor = 0;
  373. fz_var(istm);
  374. fz_var(leech);
  375. fz_var(decomp);
  376. fz_var(pixmap);
  377. bc = fz_new_compressed_buffer(ctx);
  378. fz_try(ctx)
  379. {
  380. bc->buffer = fz_new_buffer(ctx, 1024);
  381. istm = pdf_open_inline_stream(ctx, doc, dict, length, file_stm, &bc->params);
  382. leech = fz_open_leecher(ctx, istm, bc->buffer);
  383. decomp = fz_open_image_decomp_stream(ctx, leech, &bc->params, &dummy_l2factor);
  384. pixmap = fz_decomp_image_from_stream(ctx, decomp, image, NULL, indexed, 0, NULL);
  385. fz_set_compressed_image_buffer(ctx, image, bc);
  386. }
  387. fz_always(ctx)
  388. {
  389. fz_drop_stream(ctx, istm);
  390. fz_drop_stream(ctx, leech);
  391. fz_drop_stream(ctx, decomp);
  392. fz_drop_pixmap(ctx, pixmap);
  393. }
  394. fz_catch(ctx)
  395. {
  396. fz_drop_compressed_buffer(ctx, bc);
  397. fz_rethrow(ctx);
  398. }
  399. }
  400. fz_stream *
  401. pdf_open_raw_stream_number(fz_context *ctx, pdf_document *doc, int num)
  402. {
  403. pdf_xref_entry *x;
  404. int orig_num, orig_gen;
  405. x = pdf_cache_object(ctx, doc, num);
  406. if (x->stm_ofs == 0)
  407. fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
  408. return pdf_open_raw_filter(ctx, doc->file, doc, x->obj, num, &orig_num, &orig_gen, x->stm_ofs);
  409. }
  410. static fz_stream *
  411. pdf_open_image_stream(fz_context *ctx, pdf_document *doc, int num, fz_compression_params *params, int might_be_image)
  412. {
  413. pdf_xref_entry *x;
  414. x = pdf_cache_object(ctx, doc, num);
  415. if (x->stm_ofs == 0 && x->stm_buf == NULL)
  416. fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
  417. return pdf_open_filter(ctx, doc, doc->file, x->obj, num, x->stm_ofs, params, might_be_image);
  418. }
  419. fz_stream *
  420. pdf_open_stream_number(fz_context *ctx, pdf_document *doc, int num)
  421. {
  422. return pdf_open_image_stream(ctx, doc, num, NULL, 1);
  423. }
  424. fz_stream *
  425. pdf_open_stream_with_offset(fz_context *ctx, pdf_document *doc, int num, pdf_obj *dict, int64_t stm_ofs)
  426. {
  427. if (stm_ofs == 0)
  428. fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
  429. return pdf_open_filter(ctx, doc, doc->file, dict, num, stm_ofs, NULL, 1);
  430. }
  431. fz_buffer *
  432. pdf_load_raw_stream_number(fz_context *ctx, pdf_document *doc, int num)
  433. {
  434. fz_stream *stm;
  435. pdf_obj *dict;
  436. int64_t len;
  437. fz_buffer *buf = NULL;
  438. pdf_xref_entry *x;
  439. if (num > 0 && num < pdf_xref_len(ctx, doc))
  440. {
  441. x = pdf_get_xref_entry_no_null(ctx, doc, num);
  442. if (x->stm_buf)
  443. return fz_keep_buffer(ctx, x->stm_buf);
  444. }
  445. dict = pdf_load_object(ctx, doc, num);
  446. fz_try(ctx)
  447. len = pdf_dict_get_int64(ctx, dict, PDF_NAME(Length));
  448. fz_always(ctx)
  449. pdf_drop_obj(ctx, dict);
  450. fz_catch(ctx)
  451. fz_rethrow(ctx);
  452. stm = pdf_open_raw_stream_number(ctx, doc, num);
  453. if (len < 0)
  454. len = 1024;
  455. fz_try(ctx)
  456. buf = fz_read_all(ctx, stm, (size_t)len);
  457. fz_always(ctx)
  458. fz_drop_stream(ctx, stm);
  459. fz_catch(ctx)
  460. fz_rethrow(ctx);
  461. return buf;
  462. }
  463. static size_t
  464. pdf_guess_filter_length(size_t len, const char *filter)
  465. {
  466. size_t nlen = len;
  467. /* First ones get smaller, no overflow check required. */
  468. if (!strcmp(filter, "ASCIIHexDecode"))
  469. return len / 2;
  470. else if (!strcmp(filter, "ASCII85Decode"))
  471. return len * 4 / 5;
  472. if (!strcmp(filter, "FlateDecode"))
  473. nlen = len * 3;
  474. else if (!strcmp(filter, "BrotliDecode"))
  475. nlen = len * 4;
  476. else if (!strcmp(filter, "RunLengthDecode"))
  477. nlen = len * 3;
  478. else if (!strcmp(filter, "LZWDecode"))
  479. nlen = len * 2;
  480. /* Live with a bad estimate - we'll malloc up as we go, but
  481. * it's probably destined to fail anyway. */
  482. if (nlen < len)
  483. return len;
  484. return nlen;
  485. }
  486. /* Check if an entry has a cached stream and return whether it is directly
  487. * reusable. A buffer is directly reusable only if the stream is
  488. * uncompressed, or if it is compressed purely a compression method we can
  489. * return details of in fz_compression_params.
  490. *
  491. * If the stream is reusable return 1, and set params as required, otherwise
  492. * return 0. */
  493. static int
  494. can_reuse_buffer(fz_context *ctx, pdf_xref_entry *entry, fz_compression_params *params)
  495. {
  496. pdf_obj *f;
  497. pdf_obj *p;
  498. if (!entry || !entry->obj || !entry->stm_buf)
  499. return 0;
  500. if (params)
  501. params->type = FZ_IMAGE_RAW;
  502. f = pdf_dict_geta(ctx, entry->obj, PDF_NAME(Filter), PDF_NAME(F));
  503. /* If there are no filters, it's uncompressed, and we can use it */
  504. if (!f)
  505. return 1;
  506. p = pdf_dict_geta(ctx, entry->obj, PDF_NAME(DecodeParms), PDF_NAME(DP));
  507. if (pdf_is_array(ctx, f))
  508. {
  509. int len = pdf_array_len(ctx, f);
  510. /* Empty array of filters. Its uncompressed. We can cope. */
  511. if (len == 0)
  512. return 1;
  513. /* 1 filter is the most we can hope to cope with - if more,*/
  514. if (len != 1)
  515. return 0;
  516. p = pdf_array_get(ctx, p, 0);
  517. }
  518. if (pdf_is_null(ctx, f))
  519. return 1; /* Null filter is uncompressed */
  520. if (!pdf_is_name(ctx, f))
  521. return 0;
  522. /* There are filters, so unless we have the option of shortstopping,
  523. * we can't use the existing buffer. */
  524. if (!params)
  525. return 0;
  526. build_compression_params(ctx, f, p, params);
  527. return (params->type == FZ_IMAGE_RAW) ? 0 : 1;
  528. }
  529. static fz_buffer *
  530. pdf_load_image_stream(fz_context *ctx, pdf_document *doc, int num, fz_compression_params *params, int *truncated, size_t worst_case)
  531. {
  532. fz_stream *stm = NULL;
  533. pdf_obj *dict, *obj;
  534. int i, n;
  535. size_t len;
  536. fz_buffer *buf;
  537. fz_var(buf);
  538. if (num > 0 && num < pdf_xref_len(ctx, doc))
  539. {
  540. pdf_xref_entry *entry = pdf_get_xref_entry(ctx, doc, num);
  541. /* Return ref to existing buffer, but only if uncompressed,
  542. * or shortstoppable */
  543. if (can_reuse_buffer(ctx, entry, params))
  544. return fz_keep_buffer(ctx, entry->stm_buf);
  545. }
  546. dict = pdf_load_object(ctx, doc, num);
  547. fz_try(ctx)
  548. {
  549. int64_t ilen = pdf_dict_get_int64(ctx, dict, PDF_NAME(Length));
  550. if (ilen < 0)
  551. ilen = 0;
  552. len = (size_t)ilen;
  553. /* In 32 bit builds, we might find a length being too
  554. * large for a size_t. */
  555. if ((int64_t)len != ilen)
  556. fz_throw(ctx, FZ_ERROR_LIMIT, "Stream too large");
  557. obj = pdf_dict_get(ctx, dict, PDF_NAME(Filter));
  558. len = pdf_guess_filter_length(len, pdf_to_name(ctx, obj));
  559. n = pdf_array_len(ctx, obj);
  560. for (i = 0; i < n; i++)
  561. len = pdf_guess_filter_length(len, pdf_array_get_name(ctx, obj, i));
  562. }
  563. fz_always(ctx)
  564. {
  565. pdf_drop_obj(ctx, dict);
  566. }
  567. fz_catch(ctx)
  568. {
  569. fz_rethrow(ctx);
  570. }
  571. stm = pdf_open_image_stream(ctx, doc, num, params, 1);
  572. fz_try(ctx)
  573. {
  574. buf = fz_read_best(ctx, stm, len, truncated, worst_case);
  575. }
  576. fz_always(ctx)
  577. {
  578. fz_drop_stream(ctx, stm);
  579. }
  580. fz_catch(ctx)
  581. {
  582. fz_rethrow(ctx);
  583. }
  584. return buf;
  585. }
  586. fz_buffer *
  587. pdf_load_stream_number(fz_context *ctx, pdf_document *doc, int num)
  588. {
  589. return pdf_load_image_stream(ctx, doc, num, NULL, NULL, 0);
  590. }
  591. fz_compressed_buffer *
  592. pdf_load_compressed_stream(fz_context *ctx, pdf_document *doc, int num, size_t worst_case)
  593. {
  594. fz_compressed_buffer *bc = fz_new_compressed_buffer(ctx);
  595. fz_try(ctx)
  596. {
  597. bc->buffer = pdf_load_image_stream(ctx, doc, num, &bc->params, NULL, worst_case);
  598. }
  599. fz_catch(ctx)
  600. {
  601. fz_free(ctx, bc);
  602. fz_rethrow(ctx);
  603. }
  604. return bc;
  605. }
  606. static fz_stream *
  607. pdf_open_object_array(fz_context *ctx, pdf_document *doc, pdf_obj *list)
  608. {
  609. fz_stream *stm;
  610. int i, n;
  611. n = pdf_array_len(ctx, list);
  612. stm = fz_open_concat(ctx, n, 1);
  613. for (i = 0; i < n; i++)
  614. {
  615. pdf_obj *obj = pdf_array_get(ctx, list, i);
  616. fz_try(ctx)
  617. fz_concat_push_drop(ctx, stm, pdf_open_stream(ctx, obj));
  618. fz_catch(ctx)
  619. {
  620. if (fz_caught(ctx) == FZ_ERROR_TRYLATER || fz_caught(ctx) == FZ_ERROR_SYSTEM)
  621. {
  622. fz_drop_stream(ctx, stm);
  623. fz_rethrow(ctx);
  624. }
  625. fz_report_error(ctx);
  626. fz_warn(ctx, "cannot load content stream part %d/%d", i + 1, n);
  627. }
  628. }
  629. return stm;
  630. }
  631. fz_stream *
  632. pdf_open_contents_stream(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
  633. {
  634. int num;
  635. if (pdf_is_array(ctx, obj))
  636. return pdf_open_object_array(ctx, doc, obj);
  637. num = pdf_to_num(ctx, obj);
  638. if (pdf_is_stream(ctx, obj))
  639. return pdf_open_image_stream(ctx, doc, num, NULL, 0);
  640. fz_warn(ctx, "content stream is not a stream (%d 0 R)", num);
  641. return fz_open_memory(ctx, (unsigned char *)"", 0);
  642. }
  643. fz_buffer *pdf_load_raw_stream(fz_context *ctx, pdf_obj *ref)
  644. {
  645. if (pdf_is_stream(ctx, ref))
  646. return pdf_load_raw_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
  647. fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
  648. }
  649. fz_buffer *pdf_load_stream(fz_context *ctx, pdf_obj *ref)
  650. {
  651. if (pdf_is_stream(ctx, ref))
  652. return pdf_load_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
  653. fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
  654. }
  655. fz_stream *pdf_open_raw_stream(fz_context *ctx, pdf_obj *ref)
  656. {
  657. if (pdf_is_stream(ctx, ref))
  658. return pdf_open_raw_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
  659. fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
  660. }
  661. fz_stream *pdf_open_stream(fz_context *ctx, pdf_obj *ref)
  662. {
  663. if (pdf_is_stream(ctx, ref))
  664. return pdf_open_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
  665. fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
  666. }