// pdf-repair.c
  // Copyright (C) 2004-2025 Artifex Software, Inc.
  //
  // This file is part of MuPDF.
  //
  // MuPDF is free software: you can redistribute it and/or modify it under the
  // terms of the GNU Affero General Public License as published by the Free
  // Software Foundation, either version 3 of the License, or (at your option)
  // any later version.
  //
  // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  // details.
  //
  // You should have received a copy of the GNU Affero General Public License
  // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  //
  // Alternative licensing terms are available from the licensor.
  // For commercial licensing, see <https://www.artifex.com/> or contact
  // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "pdf-imp.h"
  24. #include <string.h>
  /* Scan file for objects and reconstruct xref table */
  /* One object found while scanning the file; collected in a list by
   * pdf_repair_xref_base and later copied into the xref table. */
  struct entry
  {
      int num;            /* object number */
      int gen;            /* generation number */
      int64_t ofs;        /* file offset recorded for the object */
      int64_t stm_ofs;    /* stream data offset as reported by pdf_repair_obj */
      int64_t stm_len;    /* stream length as reported by pdf_repair_obj */
  };
  34. typedef struct
  35. {
  36. int max;
  37. int len;
  38. pdf_obj **roots;
  39. } pdf_root_list;
  40. static void
  41. add_root(fz_context *ctx, pdf_root_list *roots, pdf_obj *obj)
  42. {
  43. if (roots->max == roots->len)
  44. {
  45. int new_max_roots = roots->max * 2;
  46. if (new_max_roots == 0)
  47. new_max_roots = 4;
  48. roots->roots = fz_realloc(ctx, roots->roots, new_max_roots * sizeof(roots->roots[0]));
  49. roots->max = new_max_roots;
  50. }
  51. roots->roots[roots->len] = pdf_keep_obj(ctx, obj);
  52. roots->len++;
  53. }
  54. static pdf_root_list *
  55. fz_new_root_list(fz_context *ctx)
  56. {
  57. return fz_malloc_struct(ctx, pdf_root_list);
  58. }
  59. static void
  60. pdf_drop_root_list(fz_context *ctx, pdf_root_list *roots)
  61. {
  62. int i, n;
  63. if (roots == NULL)
  64. return;
  65. n = roots->len;
  66. for (i = 0; i < n; i++)
  67. pdf_drop_obj(ctx, roots->roots[i]);
  68. fz_free(ctx, roots->roots);
  69. fz_free(ctx, roots);
  70. }
  71. int
  72. pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root)
  73. {
  74. fz_stream *file = doc->file;
  75. pdf_token tok;
  76. int64_t stm_len;
  77. int64_t local_ofs;
  78. if (tmpofs == NULL)
  79. tmpofs = &local_ofs;
  80. if (stmofsp == NULL)
  81. stmofsp = &local_ofs;
  82. *stmofsp = 0;
  83. if (stmlenp)
  84. *stmlenp = -1;
  85. stm_len = 0;
  86. *tmpofs = fz_tell(ctx, file);
  87. if (*tmpofs < 0)
  88. fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
  89. /* On entry to this function, we know that we've just seen
  90. * '<int> <int> obj'. We expect the next thing we see to be a
  91. * pdf object. Regardless of the type of thing we meet next
  92. * we only need to fully parse it if it is a dictionary. */
  93. tok = pdf_lex(ctx, file, buf);
  94. /* Don't let a truncated object at EOF overwrite a good one */
  95. if (tok == PDF_TOK_EOF)
  96. fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object");
  97. if (tok == PDF_TOK_OPEN_DICT)
  98. {
  99. pdf_obj *obj, *dict = NULL;
  100. fz_try(ctx)
  101. {
  102. dict = pdf_parse_dict(ctx, doc, file, buf);
  103. }
  104. fz_catch(ctx)
  105. {
  106. fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
  107. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  108. /* Don't let a broken object at EOF overwrite a good one */
  109. if (file->eof)
  110. fz_rethrow(ctx);
  111. /* Silently swallow the error */
  112. fz_report_error(ctx);
  113. dict = pdf_new_dict(ctx, doc, 2);
  114. }
  115. /* We must be careful not to try to resolve any indirections
  116. * here. We have just read dict, so we know it to be a non
  117. * indirected dictionary. Before we look at any values that
  118. * we get back from looking up in it, we need to check they
  119. * aren't indirected. */
  120. if (encrypt || id || root)
  121. {
  122. obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
  123. if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef)))
  124. {
  125. if (encrypt)
  126. {
  127. obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
  128. if (obj)
  129. {
  130. pdf_drop_obj(ctx, *encrypt);
  131. *encrypt = pdf_keep_obj(ctx, obj);
  132. }
  133. }
  134. if (id)
  135. {
  136. obj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
  137. if (obj)
  138. {
  139. pdf_drop_obj(ctx, *id);
  140. *id = pdf_keep_obj(ctx, obj);
  141. }
  142. }
  143. if (root)
  144. *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root)));
  145. }
  146. }
  147. obj = pdf_dict_get(ctx, dict, PDF_NAME(Length));
  148. if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
  149. stm_len = pdf_to_int64(ctx, obj);
  150. if (doc->file_reading_linearly && page)
  151. {
  152. obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
  153. if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page)))
  154. {
  155. pdf_drop_obj(ctx, *page);
  156. *page = pdf_keep_obj(ctx, dict);
  157. }
  158. }
  159. pdf_drop_obj(ctx, dict);
  160. }
  161. while ( tok != PDF_TOK_STREAM &&
  162. tok != PDF_TOK_ENDOBJ &&
  163. tok != PDF_TOK_ERROR &&
  164. tok != PDF_TOK_EOF &&
  165. tok != PDF_TOK_INT )
  166. {
  167. *tmpofs = fz_tell(ctx, file);
  168. if (*tmpofs < 0)
  169. fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
  170. tok = pdf_lex(ctx, file, buf);
  171. }
  172. if (tok == PDF_TOK_STREAM)
  173. {
  174. int c = fz_read_byte(ctx, file);
  175. if (c == '\r') {
  176. c = fz_peek_byte(ctx, file);
  177. if (c == '\n')
  178. fz_read_byte(ctx, file);
  179. }
  180. *stmofsp = fz_tell(ctx, file);
  181. if (*stmofsp < 0)
  182. fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
  183. if (stm_len > 0)
  184. {
  185. fz_seek(ctx, file, *stmofsp + stm_len, 0);
  186. fz_try(ctx)
  187. {
  188. tok = pdf_lex(ctx, file, buf);
  189. }
  190. fz_catch(ctx)
  191. {
  192. fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
  193. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  194. fz_report_error(ctx);
  195. fz_warn(ctx, "cannot find endstream token, falling back to scanning");
  196. }
  197. if (tok == PDF_TOK_ENDSTREAM)
  198. goto atobjend;
  199. fz_seek(ctx, file, *stmofsp, 0);
  200. }
  201. (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
  202. while (memcmp(buf->scratch, "endstream", 9) != 0)
  203. {
  204. c = fz_read_byte(ctx, file);
  205. if (c == EOF)
  206. break;
  207. memmove(&buf->scratch[0], &buf->scratch[1], 8);
  208. buf->scratch[8] = c;
  209. }
  210. if (stmlenp)
  211. *stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
  212. atobjend:
  213. *tmpofs = fz_tell(ctx, file);
  214. if (*tmpofs < 0)
  215. fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
  216. tok = pdf_lex(ctx, file, buf);
  217. if (tok != PDF_TOK_ENDOBJ)
  218. fz_warn(ctx, "object missing 'endobj' token");
  219. else
  220. {
  221. /* Read another token as we always return the next one */
  222. *tmpofs = fz_tell(ctx, file);
  223. if (*tmpofs < 0)
  224. fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
  225. tok = pdf_lex(ctx, file, buf);
  226. }
  227. }
  228. return tok;
  229. }
  230. static int64_t
  231. entry_offset(fz_context *ctx, pdf_document *doc, int num)
  232. {
  233. pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, num);
  234. if (entry->type == 0 || entry->type == 'f')
  235. return 0;
  236. if (entry->type == 'n')
  237. return entry->ofs;
  238. assert(entry->type == 'o');
  239. /* It must be in a stream. Return the entry of that stream. */
  240. entry = pdf_get_populating_xref_entry(ctx, doc, entry->ofs);
  241. /* If it's NOT in a stream, then we'll invalidate this entry in a moment.
  242. * For now, just return an illegal offset. */
  243. if (entry->type != 'n')
  244. return -1;
  245. return entry->ofs;
  246. }
  247. static void
  248. pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num)
  249. {
  250. pdf_obj *obj;
  251. fz_stream *stm = NULL;
  252. pdf_token tok;
  253. int i, n, count;
  254. pdf_lexbuf buf;
  255. fz_var(stm);
  256. pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
  257. fz_try(ctx)
  258. {
  259. obj = pdf_load_object(ctx, doc, stm_num);
  260. count = pdf_dict_get_int(ctx, obj, PDF_NAME(N));
  261. pdf_drop_obj(ctx, obj);
  262. stm = pdf_open_stream_number(ctx, doc, stm_num);
  263. for (i = 0; i < count; i++)
  264. {
  265. pdf_xref_entry *entry;
  266. int replace;
  267. tok = pdf_lex(ctx, stm, &buf);
  268. if (tok != PDF_TOK_INT)
  269. fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
  270. n = buf.i;
  271. if (n < 0)
  272. {
  273. fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
  274. continue;
  275. }
  276. else if (n >= PDF_MAX_OBJECT_NUMBER)
  277. {
  278. fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
  279. continue;
  280. }
  281. entry = pdf_get_populating_xref_entry(ctx, doc, n);
  282. /* Bug 708286: Do not allow an object from an ObjStm to override an object
  283. * that isn't in an ObjStm that we've already read, that occurs after it
  284. * in the file. */
  285. replace = 1;
  286. if (entry->type != 0 && entry->type != 'f')
  287. {
  288. int64_t existing_entry_offset = entry_offset(ctx, doc, n);
  289. if (existing_entry_offset < 0)
  290. {
  291. /* The existing entry is invalid. Anything must be better than that! */
  292. }
  293. else
  294. {
  295. int64_t this_entry_offset = entry_offset(ctx, doc, stm_num);
  296. if (existing_entry_offset > this_entry_offset)
  297. replace = 0;
  298. }
  299. }
  300. if (replace)
  301. {
  302. entry->ofs = stm_num;
  303. entry->gen = i;
  304. entry->num = n;
  305. entry->stm_ofs = 0;
  306. pdf_drop_obj(ctx, entry->obj);
  307. entry->obj = NULL;
  308. entry->type = 'o';
  309. }
  310. tok = pdf_lex(ctx, stm, &buf);
  311. if (tok != PDF_TOK_INT)
  312. fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
  313. }
  314. }
  315. fz_always(ctx)
  316. {
  317. fz_drop_stream(ctx, stm);
  318. pdf_lexbuf_fin(ctx, &buf);
  319. }
  320. fz_catch(ctx)
  321. {
  322. fz_rethrow(ctx);
  323. }
  324. }
  325. static void
  326. orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
  327. {
  328. if (doc->orphans_count == doc->orphans_max)
  329. {
  330. int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32);
  331. fz_try(ctx)
  332. {
  333. doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*);
  334. doc->orphans_max = new_max;
  335. }
  336. fz_catch(ctx)
  337. {
  338. pdf_drop_obj(ctx, obj);
  339. fz_rethrow(ctx);
  340. }
  341. }
  342. doc->orphans[doc->orphans_count++] = obj;
  343. }
  /* Return 1 if c is NUL, tab, line feed, form feed, carriage return or
   * space; 0 otherwise. */
  static int is_white(int c)
  {
      switch (c)
      {
      case '\x00':
      case '\x09':
      case '\x0a':
      case '\x0c':
      case '\x0d':
      case '\x20':
          return 1;
      default:
          return 0;
      }
  }
  348. static pdf_root_list *
  349. pdf_repair_xref_base(fz_context *ctx, pdf_document *doc)
  350. {
  351. pdf_obj *dict, *obj = NULL;
  352. pdf_obj *length;
  353. pdf_obj *encrypt = NULL;
  354. pdf_obj *id = NULL;
  355. pdf_obj *info = NULL;
  356. pdf_root_list *roots = NULL;
  357. struct entry *list = NULL;
  358. int listlen;
  359. int listcap;
  360. int maxnum = 0;
  361. int num = 0;
  362. int gen = 0;
  363. int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0;
  364. int64_t stm_len;
  365. pdf_token tok;
  366. int next;
  367. int i;
  368. size_t j, n;
  369. int c;
  370. pdf_lexbuf *buf = &doc->lexbuf.base;
  371. fz_var(encrypt);
  372. fz_var(id);
  373. fz_var(info);
  374. fz_var(list);
  375. fz_var(obj);
  376. fz_var(roots);
  377. if (!doc->is_fdf)
  378. fz_warn(ctx, "repairing PDF document");
  379. if (doc->repair_attempted)
  380. fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again");
  381. doc->bias = 0; // reset bias!
  382. doc->repair_attempted = 1;
  383. doc->repair_in_progress = 1;
  384. pdf_drop_page_tree_internal(ctx, doc);
  385. doc->page_tree_broken = 0;
  386. pdf_forget_xref(ctx, doc);
  387. fz_seek(ctx, doc->file, 0, 0);
  388. fz_try(ctx)
  389. {
  390. pdf_xref_entry *entry;
  391. listlen = 0;
  392. listcap = 1024;
  393. list = fz_malloc_array(ctx, listcap, struct entry);
  394. roots = fz_new_root_list(ctx);
  395. /* look for '%PDF' version marker within first kilobyte of file */
  396. n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024));
  397. fz_seek(ctx, doc->file, 0, 0);
  398. if (n >= 5)
  399. {
  400. for (j = 0; j < n - 5; j++)
  401. {
  402. if (memcmp(&buf->scratch[j], "%PDF-", 5) == 0 || memcmp(&buf->scratch[j], "%FDF-", 5) == 0)
  403. {
  404. fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */
  405. break;
  406. }
  407. }
  408. }
  409. /* skip comment line after version marker since some generators
  410. * forget to terminate the comment with a newline */
  411. c = fz_read_byte(ctx, doc->file);
  412. while (c >= 0 && (c == ' ' || c == '%'))
  413. c = fz_read_byte(ctx, doc->file);
  414. if (c != EOF)
  415. fz_unread_byte(ctx, doc->file);
  416. while (1)
  417. {
  418. tmpofs = fz_tell(ctx, doc->file);
  419. if (tmpofs < 0)
  420. fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
  421. fz_try(ctx)
  422. tok = pdf_lex_no_string(ctx, doc->file, buf);
  423. fz_catch(ctx)
  424. {
  425. fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
  426. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  427. fz_report_error(ctx);
  428. fz_warn(ctx, "skipping ahead to next token");
  429. do
  430. c = fz_read_byte(ctx, doc->file);
  431. while (c != EOF && !is_white(c));
  432. if (c == EOF)
  433. tok = PDF_TOK_EOF;
  434. else
  435. continue;
  436. }
  437. /* If we have the next token already, then we'll jump
  438. * back here, rather than going through the top of
  439. * the loop. */
  440. have_next_token:
  441. if (tok == PDF_TOK_INT)
  442. {
  443. if (buf->i < 0)
  444. {
  445. num = 0;
  446. gen = 0;
  447. continue;
  448. }
  449. numofs = genofs;
  450. num = gen;
  451. genofs = tmpofs;
  452. gen = buf->i;
  453. }
  454. else if (tok == PDF_TOK_OBJ)
  455. {
  456. pdf_obj *root = NULL;
  457. fz_try(ctx)
  458. {
  459. stm_len = 0;
  460. stm_ofs = 0;
  461. tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
  462. if (root)
  463. add_root(ctx, roots, root);
  464. }
  465. fz_always(ctx)
  466. {
  467. pdf_drop_obj(ctx, root);
  468. }
  469. fz_catch(ctx)
  470. {
  471. int errcode = fz_caught(ctx);
  472. /* If we haven't seen a root yet, there is nothing
  473. * we can do, but give up. Otherwise, we'll make
  474. * do. */
  475. if (roots->len == 0 ||
  476. errcode == FZ_ERROR_TRYLATER ||
  477. errcode == FZ_ERROR_SYSTEM)
  478. {
  479. pdf_drop_root_list(ctx, roots);
  480. roots = NULL;
  481. fz_rethrow(ctx);
  482. }
  483. fz_report_error(ctx);
  484. fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
  485. break;
  486. }
  487. if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER)
  488. {
  489. fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
  490. goto have_next_token;
  491. }
  492. gen = fz_clampi(gen, 0, 65535);
  493. if (listlen + 1 == listcap)
  494. {
  495. listcap = (listcap * 3) / 2;
  496. list = fz_realloc_array(ctx, list, listcap, struct entry);
  497. }
  498. list[listlen].num = num;
  499. list[listlen].gen = gen;
  500. list[listlen].ofs = numofs;
  501. list[listlen].stm_ofs = stm_ofs;
  502. list[listlen].stm_len = stm_len;
  503. listlen ++;
  504. if (num > maxnum)
  505. maxnum = num;
  506. goto have_next_token;
  507. }
  508. /* If we find a dictionary it is probably the trailer,
  509. * but could be a stream (or bogus) dictionary caused
  510. * by a corrupt file. */
  511. else if (tok == PDF_TOK_OPEN_DICT)
  512. {
  513. pdf_obj *dictobj;
  514. fz_try(ctx)
  515. {
  516. dict = pdf_parse_dict(ctx, doc, doc->file, buf);
  517. }
  518. fz_catch(ctx)
  519. {
  520. fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
  521. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  522. /* If this was the real trailer dict
  523. * it was broken, in which case we are
  524. * in trouble. Keep going though in
  525. * case this was just a bogus dict. */
  526. fz_report_error(ctx);
  527. continue;
  528. }
  529. fz_try(ctx)
  530. {
  531. dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
  532. if (dictobj)
  533. {
  534. pdf_drop_obj(ctx, encrypt);
  535. encrypt = pdf_keep_obj(ctx, dictobj);
  536. }
  537. dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
  538. if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt))))
  539. {
  540. pdf_drop_obj(ctx, id);
  541. id = pdf_keep_obj(ctx, dictobj);
  542. }
  543. dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root));
  544. if (dictobj)
  545. add_root(ctx, roots, dictobj);
  546. dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info));
  547. if (dictobj)
  548. {
  549. pdf_drop_obj(ctx, info);
  550. info = pdf_keep_obj(ctx, dictobj);
  551. }
  552. }
  553. fz_always(ctx)
  554. pdf_drop_obj(ctx, dict);
  555. fz_catch(ctx)
  556. fz_rethrow(ctx);
  557. }
  558. else if (tok == PDF_TOK_EOF)
  559. {
  560. break;
  561. }
  562. else
  563. {
  564. num = 0;
  565. gen = 0;
  566. }
  567. }
  568. if (listlen == 0)
  569. fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found");
  570. /* make xref reasonable */
  571. /*
  572. Dummy access to entry to assure sufficient space in the xref table
  573. and avoid repeated reallocs in the loop
  574. */
  575. /* Ensure that the first xref table is a 'solid' one from
  576. * 0 to maxnum. */
  577. pdf_ensure_solid_xref(ctx, doc, maxnum);
  578. for (i = 1; i < maxnum; i++)
  579. {
  580. entry = pdf_get_populating_xref_entry(ctx, doc, i);
  581. if (entry->obj != NULL)
  582. continue;
  583. entry->type = 'f';
  584. entry->ofs = 0;
  585. entry->gen = 0;
  586. entry->num = 0;
  587. entry->stm_ofs = 0;
  588. }
  589. for (i = 0; i < listlen; i++)
  590. {
  591. entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
  592. entry->type = 'n';
  593. entry->ofs = list[i].ofs;
  594. entry->gen = list[i].gen;
  595. entry->num = list[i].num;
  596. entry->stm_ofs = list[i].stm_ofs;
  597. /* correct stream length for unencrypted documents */
  598. if (!encrypt && list[i].stm_len >= 0)
  599. {
  600. pdf_obj *old_obj = NULL;
  601. dict = pdf_load_object(ctx, doc, list[i].num);
  602. fz_try(ctx)
  603. {
  604. length = pdf_new_int(ctx, list[i].stm_len);
  605. pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj);
  606. if (old_obj)
  607. orphan_object(ctx, doc, old_obj);
  608. }
  609. fz_always(ctx)
  610. pdf_drop_obj(ctx, dict);
  611. fz_catch(ctx)
  612. fz_rethrow(ctx);
  613. }
  614. }
  615. entry = pdf_get_populating_xref_entry(ctx, doc, 0);
  616. entry->type = 'f';
  617. entry->ofs = 0;
  618. entry->gen = 65535;
  619. entry->num = 0;
  620. entry->stm_ofs = 0;
  621. next = 0;
  622. for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
  623. {
  624. entry = pdf_get_populating_xref_entry(ctx, doc, i);
  625. if (entry->type == 'f')
  626. {
  627. entry->ofs = next;
  628. if (entry->gen < 65535)
  629. entry->gen ++;
  630. next = i;
  631. }
  632. }
  633. /* create a repaired trailer, Root will be added later */
  634. obj = pdf_new_dict(ctx, doc, 5);
  635. /* During repair there is only a single xref section */
  636. pdf_set_populating_xref_trailer(ctx, doc, obj);
  637. pdf_drop_obj(ctx, obj);
  638. obj = NULL;
  639. pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1);
  640. if (info)
  641. {
  642. pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info);
  643. pdf_drop_obj(ctx, info);
  644. info = NULL;
  645. }
  646. if (encrypt)
  647. {
  648. if (pdf_is_indirect(ctx, encrypt))
  649. {
  650. /* create new reference with non-NULL xref pointer */
  651. obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
  652. pdf_drop_obj(ctx, encrypt);
  653. encrypt = obj;
  654. obj = NULL;
  655. }
  656. pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt);
  657. pdf_drop_obj(ctx, encrypt);
  658. encrypt = NULL;
  659. }
  660. if (id)
  661. {
  662. if (pdf_is_indirect(ctx, id))
  663. {
  664. /* create new reference with non-NULL xref pointer */
  665. obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
  666. pdf_drop_obj(ctx, id);
  667. id = obj;
  668. obj = NULL;
  669. }
  670. pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id);
  671. pdf_drop_obj(ctx, id);
  672. id = NULL;
  673. }
  674. }
  675. fz_always(ctx)
  676. {
  677. fz_free(ctx, list);
  678. doc->repair_in_progress = 0;
  679. }
  680. fz_catch(ctx)
  681. {
  682. pdf_drop_root_list(ctx, roots);
  683. pdf_drop_obj(ctx, encrypt);
  684. pdf_drop_obj(ctx, id);
  685. pdf_drop_obj(ctx, obj);
  686. pdf_drop_obj(ctx, info);
  687. if (ctx->throw_on_repair)
  688. fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt");
  689. fz_rethrow(ctx);
  690. }
  691. if (ctx->throw_on_repair)
  692. {
  693. pdf_drop_root_list(ctx, roots);
  694. fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired");
  695. }
  696. return roots;
  697. }
  698. static void
  699. pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
  700. {
  701. pdf_obj *dict;
  702. int i;
  703. int xref_len = pdf_xref_len(ctx, doc);
  704. for (i = 0; i < xref_len; i++)
  705. {
  706. pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
  707. if (entry->stm_ofs)
  708. {
  709. dict = pdf_load_object(ctx, doc, i);
  710. fz_try(ctx)
  711. {
  712. if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm)))
  713. pdf_repair_obj_stm(ctx, doc, i);
  714. }
  715. fz_always(ctx)
  716. pdf_drop_obj(ctx, dict);
  717. fz_catch(ctx)
  718. {
  719. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  720. fz_report_error(ctx);
  721. fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
  722. }
  723. }
  724. }
  725. /* Ensure that streamed objects reside inside a known non-streamed object */
  726. for (i = 0; i < xref_len; i++)
  727. {
  728. pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
  729. if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
  730. {
  731. fz_warn(ctx, "invalid reference to non-object-stream: %d, assuming %d 0 R is a freed object", (int)entry->ofs, i);
  732. entry->type = 'f';
  733. }
  734. }
  735. }
  736. static void
  737. pdf_repair_roots(fz_context *ctx, pdf_document *doc, pdf_root_list *roots)
  738. {
  739. int i;
  740. for (i = roots->len-1; i >= 0; i--)
  741. {
  742. if (pdf_is_indirect(ctx, roots->roots[i]) && pdf_is_dict(ctx, roots->roots[i]))
  743. {
  744. pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots->roots[i]);
  745. break;
  746. }
  747. }
  748. }
  749. static void
  750. pdf_repair_trailer(fz_context *ctx, pdf_document *doc)
  751. {
  752. int hasroot, hasinfo;
  753. pdf_obj *obj, *nobj;
  754. pdf_obj *dict = NULL;
  755. int i;
  756. int xref_len = pdf_xref_len(ctx, doc);
  757. hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL);
  758. hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL);
  759. fz_var(dict);
  760. fz_try(ctx)
  761. {
  762. /* Scan from the end so we have a better chance of finding
  763. * newer objects if there are multiple instances of Info and
  764. * Root objects.
  765. */
  766. for (i = xref_len - 1; i > 0 && (!hasinfo || !hasroot); --i)
  767. {
  768. pdf_xref_entry *entry = pdf_get_xref_entry_no_null(ctx, doc, i);
  769. if (entry->type == 0 || entry->type == 'f')
  770. continue;
  771. fz_try(ctx)
  772. {
  773. dict = pdf_load_object(ctx, doc, i);
  774. }
  775. fz_catch(ctx)
  776. {
  777. fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
  778. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  779. fz_report_error(ctx);
  780. fz_warn(ctx, "ignoring broken object (%d 0 R)", i);
  781. continue;
  782. }
  783. if (!hasroot)
  784. {
  785. obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
  786. if (obj == PDF_NAME(Catalog))
  787. {
  788. nobj = pdf_new_indirect(ctx, doc, i, 0);
  789. pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj);
  790. hasroot = 1;
  791. }
  792. }
  793. if (!hasinfo)
  794. {
  795. if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer)))
  796. {
  797. nobj = pdf_new_indirect(ctx, doc, i, 0);
  798. pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj);
  799. hasinfo = 1;
  800. }
  801. }
  802. pdf_drop_obj(ctx, dict);
  803. dict = NULL;
  804. }
  805. }
  806. fz_always(ctx)
  807. {
  808. /* ensure that strings are not used in their repaired, non-decrypted form */
  809. if (doc->crypt)
  810. {
  811. pdf_crypt *tmp;
  812. pdf_clear_xref(ctx, doc);
  813. /* ensure that Encryption dictionary and ID are cached without decryption,
  814. otherwise a decrypted Encryption dictionary and ID may be used when saving
  815. the PDF causing it to be inconsistent (since strings/streams are encrypted
  816. with the actual encryption key, not the decrypted encryption key). */
  817. tmp = doc->crypt;
  818. doc->crypt = NULL;
  819. fz_try(ctx)
  820. {
  821. (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt)));
  822. (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID)));
  823. }
  824. fz_always(ctx)
  825. doc->crypt = tmp;
  826. fz_catch(ctx)
  827. {
  828. fz_rethrow(ctx);
  829. }
  830. }
  831. }
  832. fz_catch(ctx)
  833. {
  834. pdf_drop_obj(ctx, dict);
  835. fz_rethrow(ctx);
  836. }
  837. }
  838. void pdf_repair_xref_aux(fz_context *ctx, pdf_document *doc, void (*mid)(fz_context *ctx, pdf_document *doc))
  839. {
  840. pdf_root_list *roots = NULL;
  841. fz_var(roots);
  842. fz_try(ctx)
  843. {
  844. roots = pdf_repair_xref_base(ctx, doc);
  845. if (mid)
  846. mid(ctx, doc);
  847. pdf_repair_obj_stms(ctx, doc);
  848. pdf_repair_roots(ctx, doc, roots);
  849. pdf_repair_trailer(ctx, doc);
  850. }
  851. fz_always(ctx)
  852. pdf_drop_root_list(ctx, roots);
  853. fz_catch(ctx)
  854. fz_rethrow(ctx);
  855. }