epub-doc.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137
  1. // Copyright (C) 2004-2024 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "html-imp.h"
  24. #include <string.h>
  25. #include <math.h>
  26. #include <zlib.h> /* for crc32 */
  27. enum { T, R, B, L };
  28. typedef struct epub_chapter epub_chapter;
  29. typedef struct epub_page epub_page;
  30. typedef struct
  31. {
  32. int max_chapters;
  33. int num_chapters;
  34. float layout_w;
  35. float layout_h;
  36. float layout_em;
  37. uint32_t css_sum;
  38. int use_doc_css;
  39. int *pages_in_chapter;
  40. } epub_accelerator;
  41. typedef struct
  42. {
  43. fz_document super;
  44. fz_archive *zip;
  45. fz_html_font_set *set;
  46. int count;
  47. epub_chapter *spine;
  48. fz_outline *outline;
  49. char *dc_title, *dc_creator;
  50. float layout_w, layout_h, layout_em;
  51. epub_accelerator *accel;
  52. uint32_t css_sum;
  53. /* A common pattern of use is for us to open a document,
  54. * load a page, draw it, drop it, load the next page,
  55. * draw it, drop it etc. This means that the HTML for
  56. * a chapter might get thrown away between the drop and
  57. * the the next load (if the chapter is large, and the
  58. * store size is low). Accordingly, we store a handle
  59. * to the most recently used html block here, thus
  60. * ensuring that the stored copy won't be evicted. */
  61. fz_html *most_recent_html;
  62. } epub_document;
  63. struct epub_chapter
  64. {
  65. epub_document *doc;
  66. char *path;
  67. int number;
  68. epub_chapter *next;
  69. };
  70. struct epub_page
  71. {
  72. fz_page super;
  73. epub_chapter *ch;
  74. int number;
  75. fz_html *html;
  76. };
  77. static uint32_t
  78. user_css_sum(fz_context *ctx)
  79. {
  80. uint32_t sum = 0;
  81. const char *css = fz_user_css(ctx);
  82. sum = crc32(0, NULL, 0);
  83. if (css)
  84. sum = crc32(sum, (Byte*)css, (int)strlen(css));
  85. return sum;
  86. }
  87. static int dummy = 1;
  88. struct encrypted {
  89. fz_archive super;
  90. fz_archive *chain;
  91. fz_tree *info;
  92. };
  93. static int has_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
  94. {
  95. struct encrypted *arch = (struct encrypted *)arch_;
  96. return fz_has_archive_entry(ctx, arch->chain, name);
  97. }
  98. static fz_stream *open_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
  99. {
  100. struct encrypted *arch = (struct encrypted *)arch_;
  101. if (fz_tree_lookup(ctx, arch->info, name))
  102. return NULL;
  103. return fz_open_archive_entry(ctx, arch->chain, name);
  104. }
  105. static fz_buffer *read_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
  106. {
  107. struct encrypted *arch = (struct encrypted *)arch_;
  108. if (fz_tree_lookup(ctx, arch->info, name))
  109. return NULL;
  110. return fz_read_archive_entry(ctx, arch->chain, name);
  111. }
  112. static void drop_encrypted_archive(fz_context *ctx, fz_archive *arch_)
  113. {
  114. struct encrypted *arch = (struct encrypted *)arch_;
  115. fz_drop_tree(ctx, arch->info, NULL);
  116. fz_drop_archive(ctx, arch->chain);
  117. }
  118. static fz_archive *new_encrypted_archive(fz_context *ctx, fz_archive *chain, fz_tree *info)
  119. {
  120. struct encrypted *arch;
  121. arch = fz_new_derived_archive(ctx, NULL, struct encrypted);
  122. arch->super.format = "encrypted";
  123. arch->super.has_entry = has_encrypted_entry;
  124. arch->super.read_entry = read_encrypted_entry;
  125. arch->super.open_entry = open_encrypted_entry;
  126. arch->super.drop_archive = drop_encrypted_archive;
  127. arch->chain = chain;
  128. arch->info = info;
  129. return &arch->super;
  130. }
  131. static void
  132. epub_parse_encryption(fz_context *ctx, epub_document *doc, fz_xml *root)
  133. {
  134. fz_tree *info = NULL;
  135. fz_xml *edata;
  136. for (edata = fz_xml_find_down(root, "EncryptedData"); edata; edata = fz_xml_find_next(edata, "EncryptedData"))
  137. {
  138. fz_xml *cdata = fz_xml_find_down(edata, "CipherData");
  139. fz_xml *cref = fz_xml_find_down(cdata, "CipherReference");
  140. char *uri = fz_xml_att(cref, "URI");
  141. if (uri)
  142. {
  143. // TODO: Support reading EncryptedKey and EncryptionMethod to decrypt content.
  144. info = fz_tree_insert(ctx, info, uri, &dummy);
  145. }
  146. }
  147. if (info)
  148. {
  149. doc->zip = new_encrypted_archive(ctx, doc->zip, info);
  150. }
  151. }
  152. static fz_html *epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch);
  153. static int count_laid_out_pages(fz_html *html)
  154. {
  155. if (html->tree.root->s.layout.b > 0)
  156. return ceilf(html->tree.root->s.layout.b / html->page_h);
  157. return 1;
  158. }
  159. static void
  160. invalidate_accelerator(fz_context *ctx, epub_accelerator *acc)
  161. {
  162. int i;
  163. for (i = 0; i < acc->max_chapters; i++)
  164. acc->pages_in_chapter[i] = -1;
  165. }
  166. static int count_chapter_pages(fz_context *ctx, epub_document *doc, epub_chapter *ch)
  167. {
  168. epub_accelerator *acc = doc->accel;
  169. int use_doc_css = fz_use_document_css(ctx);
  170. if (use_doc_css != acc->use_doc_css || doc->css_sum != acc->css_sum)
  171. {
  172. acc->use_doc_css = use_doc_css;
  173. acc->css_sum = doc->css_sum;
  174. invalidate_accelerator(ctx, acc);
  175. }
  176. if (ch->number < acc->num_chapters && acc->pages_in_chapter[ch->number] != -1)
  177. return acc->pages_in_chapter[ch->number];
  178. fz_drop_html(ctx, epub_get_laid_out_html(ctx, doc, ch));
  179. return acc->pages_in_chapter[ch->number];
  180. }
  181. static fz_link_dest
  182. epub_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest)
  183. {
  184. epub_document *doc = (epub_document*)doc_;
  185. epub_chapter *ch;
  186. int i;
  187. const char *s = strchr(dest, '#');
  188. size_t n = s ? (size_t)(s - dest) : strlen(dest);
  189. if (s && s[1] == 0)
  190. s = NULL;
  191. for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
  192. {
  193. if (!strncmp(ch->path, dest, n) && ch->path[n] == 0)
  194. {
  195. if (s)
  196. {
  197. float y;
  198. fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
  199. int ph = html->page_h;
  200. /* Search for a matching fragment */
  201. y = fz_find_html_target(ctx, html, s+1);
  202. fz_drop_html(ctx, html);
  203. if (y >= 0)
  204. {
  205. int page = y / ph;
  206. return fz_make_link_dest_xyz(i, page, 0, y - page * ph, 0);
  207. }
  208. return fz_make_link_dest_none();
  209. }
  210. return fz_make_link_dest_xyz(i, 0, 0, 0, 0);
  211. }
  212. }
  213. return fz_make_link_dest_none();
  214. }
  215. static void
  216. epub_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em)
  217. {
  218. epub_document *doc = (epub_document*)doc_;
  219. uint32_t css_sum = user_css_sum(ctx);
  220. int use_doc_css = fz_use_document_css(ctx);
  221. if (doc->layout_w == w && doc->layout_h == h && doc->layout_em == em && doc->css_sum == css_sum)
  222. return;
  223. doc->layout_w = w;
  224. doc->layout_h = h;
  225. doc->layout_em = em;
  226. if (doc->accel == NULL)
  227. return;
  228. /* When we load the saved accelerator, doc->accel
  229. * can be populated with different values than doc.
  230. * This is really useful as doc starts out with the
  231. * values being 0. If we've got the right values
  232. * already, then don't bin the data! */
  233. if (doc->accel->layout_w == w &&
  234. doc->accel->layout_h == h &&
  235. doc->accel->layout_em == em &&
  236. doc->accel->use_doc_css == use_doc_css &&
  237. doc->accel->css_sum == css_sum)
  238. return;
  239. doc->accel->layout_w = w;
  240. doc->accel->layout_h = h;
  241. doc->accel->layout_em = em;
  242. doc->accel->use_doc_css = use_doc_css;
  243. doc->accel->css_sum = css_sum;
  244. invalidate_accelerator(ctx, doc->accel);
  245. }
  246. static int
  247. epub_count_chapters(fz_context *ctx, fz_document *doc_)
  248. {
  249. epub_document *doc = (epub_document*)doc_;
  250. epub_chapter *ch;
  251. int count = 0;
  252. for (ch = doc->spine; ch; ch = ch->next)
  253. ++count;
  254. return count;
  255. }
  256. static int
  257. epub_count_pages(fz_context *ctx, fz_document *doc_, int chapter)
  258. {
  259. epub_document *doc = (epub_document*)doc_;
  260. epub_chapter *ch;
  261. int i;
  262. for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
  263. {
  264. if (i == chapter)
  265. {
  266. return count_chapter_pages(ctx, doc, ch);
  267. }
  268. }
  269. return 0;
  270. }
  271. #define MAGIC_ACCELERATOR 0xacce1e7a
  272. #define MAGIC_ACCEL_EPUB 0x62755065
  273. #define ACCEL_VERSION 0x00010001
  274. static void epub_load_accelerator(fz_context *ctx, epub_document *doc, fz_stream *accel)
  275. {
  276. int v;
  277. float w, h, em;
  278. int num_chapters;
  279. epub_accelerator *acc = NULL;
  280. uint32_t css_sum;
  281. int use_doc_css;
  282. int make_new = (accel == NULL);
  283. fz_var(acc);
  284. if (accel)
  285. {
  286. /* Try to read the accelerator data. If we fail silently give up. */
  287. fz_try(ctx)
  288. {
  289. v = fz_read_int32_le(ctx, accel);
  290. if (v != (int32_t)MAGIC_ACCELERATOR)
  291. {
  292. make_new = 1;
  293. break;
  294. }
  295. v = fz_read_int32_le(ctx, accel);
  296. if (v != MAGIC_ACCEL_EPUB)
  297. {
  298. make_new = 1;
  299. break;
  300. }
  301. v = fz_read_int32_le(ctx, accel);
  302. if (v != ACCEL_VERSION)
  303. {
  304. make_new = 1;
  305. break;
  306. }
  307. w = fz_read_float_le(ctx, accel);
  308. h = fz_read_float_le(ctx, accel);
  309. em = fz_read_float_le(ctx, accel);
  310. css_sum = fz_read_uint32_le(ctx, accel);
  311. use_doc_css = fz_read_int32_le(ctx, accel);
  312. num_chapters = fz_read_int32_le(ctx, accel);
  313. if (num_chapters <= 0)
  314. {
  315. make_new = 1;
  316. break;
  317. }
  318. acc = fz_malloc_struct(ctx, epub_accelerator);
  319. acc->pages_in_chapter = Memento_label(fz_malloc_array(ctx, num_chapters, int), "accel_pages_in_chapter");
  320. acc->max_chapters = acc->num_chapters = num_chapters;
  321. acc->layout_w = w;
  322. acc->layout_h = h;
  323. acc->layout_em = em;
  324. acc->css_sum = css_sum;
  325. acc->use_doc_css = use_doc_css;
  326. for (v = 0; v < num_chapters; v++)
  327. acc->pages_in_chapter[v] = fz_read_int32_le(ctx, accel);
  328. }
  329. fz_catch(ctx)
  330. {
  331. if (acc)
  332. fz_free(ctx, acc->pages_in_chapter);
  333. fz_free(ctx, acc);
  334. /* Swallow the error and run unaccelerated */
  335. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  336. fz_report_error(ctx);
  337. make_new = 1;
  338. }
  339. }
  340. /* If we aren't given an accelerator to load (or the one we're given
  341. * is bad) create a blank stub and we can fill it out as we go. */
  342. if (make_new)
  343. {
  344. acc = fz_malloc_struct(ctx, epub_accelerator);
  345. acc->css_sum = doc->css_sum;
  346. acc->use_doc_css = fz_use_document_css(ctx);
  347. }
  348. doc->accel = acc;
  349. }
  350. static void
  351. accelerate_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch, fz_html *html)
  352. {
  353. epub_accelerator *acc = doc->accel;
  354. int p = count_laid_out_pages(html);
  355. if (ch->number < acc->num_chapters)
  356. {
  357. if (acc->pages_in_chapter[ch->number] != p && acc->pages_in_chapter[ch->number] != -1)
  358. {
  359. fz_warn(ctx, "Invalidating stale accelerator data.");
  360. invalidate_accelerator(ctx, doc->accel);
  361. }
  362. acc->pages_in_chapter[ch->number] = p;
  363. return;
  364. }
  365. if (ch->number >= acc->max_chapters)
  366. {
  367. int n = acc->max_chapters;
  368. int i;
  369. if (n == 0)
  370. n = 4;
  371. while (n <= ch->number)
  372. n *= 2;
  373. acc->pages_in_chapter = fz_realloc_array(ctx, acc->pages_in_chapter, n, int);
  374. for (i = acc->max_chapters; i < n; i++)
  375. acc->pages_in_chapter[i] = -1;
  376. acc->max_chapters = n;
  377. }
  378. acc->pages_in_chapter[ch->number] = p;
  379. if (acc->num_chapters < ch->number+1)
  380. acc->num_chapters = ch->number+1;
  381. }
  382. static void
  383. epub_drop_page(fz_context *ctx, fz_page *page_)
  384. {
  385. epub_page *page = (epub_page *)page_;
  386. fz_drop_html(ctx, page->html);
  387. }
  388. static epub_chapter *
  389. epub_load_chapter(fz_context *ctx, epub_document *doc, const char *path, int i)
  390. {
  391. epub_chapter *ch;
  392. ch = fz_malloc_struct(ctx, epub_chapter);
  393. fz_try(ctx)
  394. {
  395. ch->path = Memento_label(fz_strdup(ctx, path), "chapter_path");
  396. ch->number = i;
  397. }
  398. fz_catch(ctx)
  399. {
  400. fz_free(ctx, ch);
  401. fz_rethrow(ctx);
  402. }
  403. return ch;
  404. }
  405. static fz_html *
  406. epub_parse_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch)
  407. {
  408. fz_archive *zip = doc->zip;
  409. fz_buffer *buf;
  410. char base_uri[2048];
  411. fz_html *html;
  412. /* Look for one we made earlier */
  413. html = fz_find_html(ctx, doc, ch->number);
  414. if (html)
  415. return html;
  416. fz_dirname(base_uri, ch->path, sizeof base_uri);
  417. buf = fz_read_archive_entry(ctx, zip, ch->path);
  418. fz_try(ctx)
  419. html = fz_parse_html(ctx, doc->set, zip, base_uri, buf, fz_user_css(ctx), 1, 1, 0);
  420. fz_always(ctx)
  421. fz_drop_buffer(ctx, buf);
  422. fz_catch(ctx)
  423. fz_rethrow(ctx);
  424. return fz_store_html(ctx, html, doc, ch->number);
  425. }
  426. static fz_html *
  427. epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch)
  428. {
  429. fz_html *html = epub_parse_chapter(ctx, doc, ch);
  430. fz_try(ctx)
  431. {
  432. fz_layout_html(ctx, html, doc->layout_w, doc->layout_h, doc->layout_em);
  433. accelerate_chapter(ctx, doc, ch, html);
  434. }
  435. fz_catch(ctx)
  436. {
  437. fz_drop_html(ctx, html);
  438. fz_rethrow(ctx);
  439. }
  440. fz_drop_html(ctx, doc->most_recent_html);
  441. doc->most_recent_html = fz_keep_html(ctx, html);
  442. return html;
  443. }
  444. static fz_rect
  445. epub_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box)
  446. {
  447. epub_document *doc = (epub_document*)page_->doc;
  448. epub_page *page = (epub_page*)page_;
  449. epub_chapter *ch = page->ch;
  450. fz_rect bbox;
  451. fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
  452. bbox.x0 = 0;
  453. bbox.y0 = 0;
  454. bbox.x1 = html->page_w + html->page_margin[L] + html->page_margin[R];
  455. bbox.y1 = html->page_h + html->page_margin[T] + html->page_margin[B];
  456. fz_drop_html(ctx, html);
  457. return bbox;
  458. }
  459. static void
  460. epub_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie)
  461. {
  462. epub_page *page = (epub_page*)page_;
  463. fz_draw_html(ctx, dev, ctm, page->html, page->number);
  464. }
  465. static fz_link *
  466. epub_load_links(fz_context *ctx, fz_page *page_)
  467. {
  468. epub_page *page = (epub_page*)page_;
  469. epub_chapter *ch = page->ch;
  470. return fz_load_html_links(ctx, page->html, page->number, ch->path);
  471. }
  472. static fz_bookmark
  473. epub_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc)
  474. {
  475. epub_document *doc = (epub_document*)doc_;
  476. epub_chapter *ch;
  477. int i;
  478. for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
  479. {
  480. if (i == loc.chapter)
  481. {
  482. fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
  483. fz_bookmark mark = fz_make_html_bookmark(ctx, html, loc.page);
  484. fz_drop_html(ctx, html);
  485. return mark;
  486. }
  487. }
  488. return 0;
  489. }
  490. static fz_location
  491. epub_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark)
  492. {
  493. epub_document *doc = (epub_document*)doc_;
  494. epub_chapter *ch;
  495. int i;
  496. for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
  497. {
  498. fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
  499. int p = fz_lookup_html_bookmark(ctx, html, mark);
  500. fz_drop_html(ctx, html);
  501. if (p != -1)
  502. return fz_make_location(i, p);
  503. }
  504. return fz_make_location(-1, -1);
  505. }
  506. static fz_page *
  507. epub_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number)
  508. {
  509. epub_document *doc = (epub_document*)doc_;
  510. epub_chapter *ch;
  511. int i;
  512. for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
  513. {
  514. if (i == chapter)
  515. {
  516. epub_page *page = fz_new_derived_page(ctx, epub_page, doc_);
  517. page->super.bound_page = epub_bound_page;
  518. page->super.run_page_contents = epub_run_page;
  519. page->super.load_links = epub_load_links;
  520. page->super.drop_page = epub_drop_page;
  521. page->ch = ch;
  522. page->number = number;
  523. page->html = epub_get_laid_out_html(ctx, doc, ch);
  524. return (fz_page*)page;
  525. }
  526. }
  527. return NULL;
  528. }
  529. static void
  530. epub_page_label(fz_context *ctx, fz_document *doc_, int chapter, int number, char *buf, size_t size)
  531. {
  532. fz_snprintf(buf, size, "ch. %d, p. %d", chapter+1, number+1);
  533. }
  534. static void
  535. epub_drop_accelerator(fz_context *ctx, epub_accelerator *acc)
  536. {
  537. if (acc == NULL)
  538. return;
  539. fz_free(ctx, acc->pages_in_chapter);
  540. fz_free(ctx, acc);
  541. }
  542. static void
  543. epub_drop_document(fz_context *ctx, fz_document *doc_)
  544. {
  545. epub_document *doc = (epub_document*)doc_;
  546. epub_chapter *ch, *next;
  547. ch = doc->spine;
  548. while (ch)
  549. {
  550. next = ch->next;
  551. fz_free(ctx, ch->path);
  552. fz_free(ctx, ch);
  553. ch = next;
  554. }
  555. epub_drop_accelerator(ctx, doc->accel);
  556. fz_drop_archive(ctx, doc->zip);
  557. fz_drop_html_font_set(ctx, doc->set);
  558. fz_drop_outline(ctx, doc->outline);
  559. fz_free(ctx, doc->dc_title);
  560. fz_free(ctx, doc->dc_creator);
  561. fz_drop_html(ctx, doc->most_recent_html);
  562. fz_purge_stored_html(ctx, doc);
  563. }
  564. static const char *
  565. rel_path_from_idref(fz_xml *manifest, const char *idref)
  566. {
  567. fz_xml *item;
  568. if (!idref)
  569. return NULL;
  570. item = fz_xml_find_down(manifest, "item");
  571. while (item)
  572. {
  573. const char *id = fz_xml_att(item, "id");
  574. if (id && !strcmp(id, idref))
  575. return fz_xml_att(item, "href");
  576. item = fz_xml_find_next(item, "item");
  577. }
  578. return NULL;
  579. }
  580. static const char *
  581. path_from_idref(char *path, fz_xml *manifest, const char *base_uri, const char *idref, int n)
  582. {
  583. const char *rel_path = rel_path_from_idref(manifest, idref);
  584. if (!rel_path)
  585. {
  586. path[0] = 0;
  587. return NULL;
  588. }
  589. fz_strlcpy(path, base_uri, n);
  590. fz_strlcat(path, "/", n);
  591. fz_strlcat(path, rel_path, n);
  592. return fz_cleanname(fz_urldecode(path));
  593. }
  594. static fz_outline *
  595. epub_parse_ncx_imp(fz_context *ctx, epub_document *doc, fz_xml *node, char *base_uri)
  596. {
  597. char path[2048];
  598. fz_outline *outline, *head, **tailp;
  599. head = NULL;
  600. tailp = &head;
  601. node = fz_xml_find_down(node, "navPoint");
  602. while (node)
  603. {
  604. char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(fz_xml_find_down(node, "navLabel"), "text")));
  605. char *content = fz_xml_att(fz_xml_find_down(node, "content"), "src");
  606. if (text && content)
  607. {
  608. fz_strlcpy(path, base_uri, sizeof path);
  609. fz_strlcat(path, "/", sizeof path);
  610. fz_strlcat(path, content, sizeof path);
  611. fz_urldecode(path);
  612. fz_cleanname(path);
  613. fz_try(ctx)
  614. {
  615. *tailp = outline = fz_new_outline(ctx);
  616. tailp = &(*tailp)->next;
  617. outline->title = Memento_label(fz_strdup(ctx, text), "outline_title");
  618. outline->uri = Memento_label(fz_strdup(ctx, path), "outline_uri");
  619. outline->page = fz_make_location(-1, -1);
  620. outline->down = epub_parse_ncx_imp(ctx, doc, node, base_uri);
  621. outline->is_open = 1;
  622. }
  623. fz_catch(ctx)
  624. {
  625. fz_drop_outline(ctx, head);
  626. fz_rethrow(ctx);
  627. }
  628. }
  629. node = fz_xml_find_next(node, "navPoint");
  630. }
  631. return head;
  632. }
  633. static void
  634. epub_parse_ncx(fz_context *ctx, epub_document *doc, const char *path)
  635. {
  636. fz_archive *zip = doc->zip;
  637. fz_buffer *buf = NULL;
  638. fz_xml_doc *ncx = NULL;
  639. char base_uri[2048];
  640. fz_var(buf);
  641. fz_var(ncx);
  642. fz_try(ctx)
  643. {
  644. fz_dirname(base_uri, path, sizeof base_uri);
  645. buf = fz_read_archive_entry(ctx, zip, path);
  646. ncx = fz_parse_xml(ctx, buf, 0);
  647. doc->outline = epub_parse_ncx_imp(ctx, doc, fz_xml_find_down(fz_xml_root(ncx), "navMap"), base_uri);
  648. }
  649. fz_always(ctx)
  650. {
  651. fz_drop_buffer(ctx, buf);
  652. fz_drop_xml(ctx, ncx);
  653. }
  654. fz_catch(ctx)
  655. fz_rethrow(ctx);
  656. }
  657. static char *
  658. find_metadata(fz_context *ctx, fz_xml *metadata, char *key)
  659. {
  660. char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(metadata, key)));
  661. if (text)
  662. return fz_strdup(ctx, text);
  663. return NULL;
  664. }
  665. static fz_buffer *
  666. read_container_and_prefix(fz_context *ctx, fz_archive *zip, char *prefix, size_t prefix_len)
  667. {
  668. int n = fz_count_archive_entries(ctx, zip);
  669. int i;
  670. prefix[0] = 0;
  671. /* First off, look for the container.xml at the top level. */
  672. for (i = 0; i < n; i++)
  673. {
  674. const char *p = fz_list_archive_entry(ctx, zip, i);
  675. if (!strcmp(p, "META-INF/container.xml"))
  676. return fz_read_archive_entry(ctx, zip, "META-INF/container.xml");
  677. }
  678. /* If that failed, look for the first such file in a subdirectory. */
  679. for (i = 0; i < n; i++)
  680. {
  681. const char *p = fz_list_archive_entry(ctx, zip, i);
  682. size_t z = strlen(p);
  683. size_t z0 = sizeof("META-INF/container.xml")-1;
  684. if (z < z0)
  685. continue;
  686. if (!strcmp(p + z - z0, "META-INF/container.xml"))
  687. {
  688. if (z - z0 >= prefix_len)
  689. {
  690. fz_warn(ctx, "Ignoring %s as path too long.", p);
  691. continue;
  692. }
  693. memcpy(prefix, p, z-z0);
  694. prefix[z-z0] = 0;
  695. return fz_read_archive_entry(ctx, zip, p);
  696. }
  697. }
  698. return fz_read_archive_entry(ctx, zip, "META-INF/container.xml");
  699. }
  700. static void
  701. epub_parse_header(fz_context *ctx, epub_document *doc)
  702. {
  703. fz_archive *zip = doc->zip;
  704. fz_buffer *buf = NULL;
  705. fz_xml_doc *encryption_xml = NULL;
  706. fz_xml_doc *container_xml = NULL;
  707. fz_xml_doc *content_opf = NULL;
  708. fz_xml *container, *rootfiles, *rootfile;
  709. fz_xml *package, *manifest, *spine, *itemref, *metadata;
  710. char base_uri[2048];
  711. const char *full_path;
  712. const char *version;
  713. char ncx[2048], s[2048];
  714. char *prefixed_full_path = NULL;
  715. size_t prefix_len;
  716. epub_chapter **tailp;
  717. int i;
  718. fz_var(buf);
  719. fz_var(encryption_xml);
  720. fz_var(container_xml);
  721. fz_var(content_opf);
  722. fz_var(prefixed_full_path);
  723. fz_try(ctx)
  724. {
  725. /* parse META-INF/encryption.xml to figure out which entries are encrypted */
  726. /* parse META-INF/container.xml to find OPF */
  727. /* Reuse base_uri to read the prefix. */
  728. buf = read_container_and_prefix(ctx, zip, base_uri, sizeof(base_uri));
  729. container_xml = fz_parse_xml(ctx, buf, 0);
  730. fz_drop_buffer(ctx, buf);
  731. buf = NULL;
  732. /* Some epub files can be prefixed by a directory name. This (normally
  733. * empty!) will be in base_uri. */
  734. prefix_len = strlen(base_uri);
  735. {
  736. /* Further abuse base_uri to hold a temporary name. */
  737. const size_t z0 = sizeof("META-INF/encryption.xml")-1;
  738. if (sizeof(base_uri) <= prefix_len + z0)
  739. fz_throw(ctx, FZ_ERROR_FORMAT, "Prefix too long in epub");
  740. strcpy(base_uri + prefix_len, "META-INF/encryption.xml");
  741. if (fz_has_archive_entry(ctx, zip, base_uri))
  742. {
  743. fz_warn(ctx, "EPUB may be locked by DRM");
  744. buf = fz_read_archive_entry(ctx, zip, base_uri);
  745. encryption_xml = fz_parse_xml(ctx, buf, 0);
  746. fz_drop_buffer(ctx, buf);
  747. buf = NULL;
  748. epub_parse_encryption(ctx, doc, fz_xml_find(fz_xml_root(encryption_xml), "encryption"));
  749. zip = doc->zip;
  750. }
  751. }
  752. container = fz_xml_find(fz_xml_root(container_xml), "container");
  753. rootfiles = fz_xml_find_down(container, "rootfiles");
  754. rootfile = fz_xml_find_down(rootfiles, "rootfile");
  755. full_path = fz_xml_att(rootfile, "full-path");
  756. if (!full_path)
  757. fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find root file in EPUB");
  758. fz_dirname(base_uri+prefix_len, full_path, sizeof(base_uri) - prefix_len);
  759. prefixed_full_path = fz_malloc(ctx, strlen(full_path) + prefix_len + 1);
  760. memcpy(prefixed_full_path, base_uri, prefix_len);
  761. strcpy(prefixed_full_path + prefix_len, full_path);
  762. /* parse OPF to find NCX and spine */
  763. buf = fz_read_archive_entry(ctx, zip, prefixed_full_path);
  764. content_opf = fz_parse_xml(ctx, buf, 0);
  765. fz_drop_buffer(ctx, buf);
  766. buf = NULL;
  767. package = fz_xml_find(fz_xml_root(content_opf), "package");
  768. version = fz_xml_att(package, "version");
  769. if (!version || strcmp(version, "2.0"))
  770. fz_warn(ctx, "unknown epub version: %s", version ? version : "<none>");
  771. metadata = fz_xml_find_down(package, "metadata");
  772. if (metadata)
  773. {
  774. doc->dc_title = Memento_label(find_metadata(ctx, metadata, "title"), "epub_title");
  775. doc->dc_creator = Memento_label(find_metadata(ctx, metadata, "creator"), "epub_creator");
  776. }
  777. manifest = fz_xml_find_down(package, "manifest");
  778. spine = fz_xml_find_down(package, "spine");
  779. if (path_from_idref(ncx, manifest, base_uri, fz_xml_att(spine, "toc"), sizeof ncx))
  780. {
  781. epub_parse_ncx(ctx, doc, ncx);
  782. }
  783. doc->spine = NULL;
  784. tailp = &doc->spine;
  785. itemref = fz_xml_find_down(spine, "itemref");
  786. i = 0;
  787. while (itemref)
  788. {
  789. if (path_from_idref(s, manifest, base_uri, fz_xml_att(itemref, "idref"), sizeof s))
  790. {
  791. fz_try(ctx)
  792. {
  793. *tailp = epub_load_chapter(ctx, doc, s, i);
  794. tailp = &(*tailp)->next;
  795. i++;
  796. }
  797. fz_catch(ctx)
  798. {
  799. fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
  800. fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
  801. fz_report_error(ctx);
  802. fz_warn(ctx, "ignoring chapter %s", s);
  803. }
  804. }
  805. itemref = fz_xml_find_next(itemref, "itemref");
  806. }
  807. }
  808. fz_always(ctx)
  809. {
  810. fz_drop_xml(ctx, content_opf);
  811. fz_drop_xml(ctx, container_xml);
  812. fz_drop_xml(ctx, encryption_xml);
  813. fz_drop_buffer(ctx, buf);
  814. fz_free(ctx, prefixed_full_path);
  815. }
  816. fz_catch(ctx)
  817. fz_rethrow(ctx);
  818. }
  819. static fz_outline *
  820. epub_load_outline(fz_context *ctx, fz_document *doc_)
  821. {
  822. epub_document *doc = (epub_document*)doc_;
  823. return fz_keep_outline(ctx, doc->outline);
  824. }
  825. static int
  826. epub_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size)
  827. {
  828. epub_document *doc = (epub_document*)doc_;
  829. if (!strcmp(key, FZ_META_FORMAT))
  830. return 1 + (int)fz_strlcpy(buf, "EPUB", size);
  831. if (!strcmp(key, FZ_META_INFO_TITLE) && doc->dc_title)
  832. return 1 + (int)fz_strlcpy(buf, doc->dc_title, size);
  833. if (!strcmp(key, FZ_META_INFO_AUTHOR) && doc->dc_creator)
  834. return 1 + (int)fz_strlcpy(buf, doc->dc_creator, size);
  835. return -1;
  836. }
  837. static void
  838. epub_output_accelerator(fz_context *ctx, fz_document *doc_, fz_output *out)
  839. {
  840. epub_document *doc = (epub_document*)doc_;
  841. int i;
  842. fz_try(ctx)
  843. {
  844. if (doc->accel == NULL)
  845. fz_throw(ctx, FZ_ERROR_ARGUMENT, "No accelerator data to write");
  846. fz_write_int32_le(ctx, out, MAGIC_ACCELERATOR);
  847. fz_write_int32_le(ctx, out, MAGIC_ACCEL_EPUB);
  848. fz_write_int32_le(ctx, out, ACCEL_VERSION);
  849. fz_write_float_le(ctx, out, doc->accel->layout_w);
  850. fz_write_float_le(ctx, out, doc->accel->layout_h);
  851. fz_write_float_le(ctx, out, doc->accel->layout_em);
  852. fz_write_uint32_le(ctx, out, doc->accel->css_sum);
  853. fz_write_int32_le(ctx, out, doc->accel->use_doc_css);
  854. fz_write_int32_le(ctx, out, doc->accel->num_chapters);
  855. for (i = 0; i < doc->accel->num_chapters; i++)
  856. fz_write_int32_le(ctx, out, doc->accel->pages_in_chapter[i]);
  857. fz_close_output(ctx, out);
  858. }
  859. fz_always(ctx)
  860. fz_drop_output(ctx, out);
  861. fz_catch(ctx)
  862. fz_rethrow(ctx);
  863. }
  864. /* Takes ownership of zip. Will always eventually drop it.
  865. * Never takes ownership of accel. */
  866. static fz_document *
  867. epub_init(fz_context *ctx, fz_archive *zip, fz_stream *accel)
  868. {
  869. epub_document *doc = NULL;
  870. fz_var(doc);
  871. fz_var(zip);
  872. fz_try(ctx)
  873. {
  874. doc = fz_new_derived_document(ctx, epub_document);
  875. doc->zip = zip;
  876. zip = NULL;
  877. doc->super.drop_document = epub_drop_document;
  878. doc->super.layout = epub_layout;
  879. doc->super.load_outline = epub_load_outline;
  880. doc->super.resolve_link_dest = epub_resolve_link;
  881. doc->super.make_bookmark = epub_make_bookmark;
  882. doc->super.lookup_bookmark = epub_lookup_bookmark;
  883. doc->super.count_chapters = epub_count_chapters;
  884. doc->super.count_pages = epub_count_pages;
  885. doc->super.load_page = epub_load_page;
  886. doc->super.page_label = epub_page_label;
  887. doc->super.lookup_metadata = epub_lookup_metadata;
  888. doc->super.output_accelerator = epub_output_accelerator;
  889. doc->super.is_reflowable = 1;
  890. doc->set = fz_new_html_font_set(ctx);
  891. doc->css_sum = user_css_sum(ctx);
  892. epub_load_accelerator(ctx, doc, accel);
  893. epub_parse_header(ctx, doc);
  894. }
  895. fz_catch(ctx)
  896. {
  897. fz_drop_archive(ctx, zip);
  898. fz_drop_document(ctx, &doc->super);
  899. fz_rethrow(ctx);
  900. }
  901. return (fz_document*)doc;
  902. }
  903. static fz_document *
  904. epub_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
  905. {
  906. fz_stream *file2 = NULL;
  907. fz_document *doc;
  908. fz_archive *zip = NULL;
  909. if (file == NULL)
  910. {
  911. /* Directory case: file == NULL and dir == the directory. */
  912. if (fz_has_archive_entry(ctx, dir, "META-INF/container.xml"))
  913. file2 = file = fz_open_archive_entry(ctx, dir, "META-INF/container.xml");
  914. else
  915. file2 = file = fz_open_archive_entry(ctx, dir, "META-INF\\container.xml");
  916. if (file == NULL)
  917. fz_throw(ctx, FZ_ERROR_FORMAT, "Not an epub file");
  918. zip = fz_keep_archive(ctx, dir);
  919. }
  920. else
  921. {
  922. /* File case: file != NULL and dir can be ignored. */
  923. zip = fz_open_archive_with_stream(ctx, file);
  924. }
  925. fz_try(ctx)
  926. doc = epub_init(ctx, zip, file);
  927. fz_always(ctx)
  928. fz_drop_stream(ctx, file2);
  929. fz_catch(ctx)
  930. fz_rethrow(ctx);
  931. return doc;
  932. }
  933. static int
  934. epub_recognize(fz_context *doc, const fz_document_handler *handler, const char *magic)
  935. {
  936. if (strstr(magic, "META-INF/container.xml") || strstr(magic, "META-INF\\container.xml"))
  937. return 200;
  938. return 0;
  939. }
  940. static int
  941. epub_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
  942. {
  943. fz_archive *arch = NULL;
  944. int ret = 0;
  945. fz_var(arch);
  946. fz_var(ret);
  947. if (state)
  948. *state = NULL;
  949. if (free_state)
  950. *free_state = NULL;
  951. fz_try(ctx)
  952. {
  953. if (stream == NULL)
  954. arch = fz_keep_archive(ctx, dir);
  955. else
  956. {
  957. arch = fz_try_open_archive_with_stream(ctx, stream);
  958. if (arch == NULL)
  959. break;
  960. }
  961. if (fz_has_archive_entry(ctx, arch, "META-INF/container.xml") ||
  962. fz_has_archive_entry(ctx, arch, "META-INF\\container.xml"))
  963. ret = 74; /* One less than the 75 that HWPX files are detected as. */
  964. }
  965. fz_always(ctx)
  966. fz_drop_archive(ctx, arch);
  967. fz_catch(ctx)
  968. fz_rethrow(ctx);
  969. return ret;
  970. }
  /* NULL-terminated list of file extensions listed in the EPUB handler record. */
  static const char *epub_extensions[] = { "epub", NULL };
  /* NULL-terminated list of MIME types listed in the EPUB handler record. */
  static const char *epub_mimetypes[] = { "application/epub+zip", NULL };
  981. fz_document_handler epub_document_handler =
  982. {
  983. epub_recognize,
  984. epub_open_document,
  985. epub_extensions,
  986. epub_mimetypes,
  987. epub_recognize_content
  988. };