html-outline.c 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. // Copyright (C) 2004-2024 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "html-imp.h"
  24. #include <string.h>
  25. enum { T, R, B, L };
  26. static int is_internal_uri(const char *uri)
  27. {
  28. while (*uri >= 'a' && *uri <= 'z')
  29. ++uri;
  30. if (uri[0] == ':' && uri[1] == '/' && uri[2] == '/')
  31. return 0;
  32. return 1;
  33. }
  34. static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file)
  35. {
  36. fz_link *link;
  37. fz_html_flow *next;
  38. char path[2048];
  39. fz_rect bbox;
  40. const char *dest;
  41. const char *href;
  42. float end;
  43. float page_y0 = page * page_h;
  44. float page_y1 = (page + 1) * page_h;
  45. while (flow)
  46. {
  47. next = flow->next;
  48. if (flow->y >= page_y0 && flow->y <= page_y1)
  49. {
  50. href = flow->box->href;
  51. if (href)
  52. {
  53. /* Coalesce contiguous flow boxes into one link node */
  54. end = flow->x + flow->w;
  55. while (next &&
  56. next->y == flow->y &&
  57. next->h == flow->h &&
  58. next->box->href == href)
  59. {
  60. end = next->x + next->w;
  61. next = next->next;
  62. }
  63. bbox.x0 = flow->x;
  64. bbox.y0 = flow->y - page * page_h;
  65. bbox.x1 = end;
  66. bbox.y1 = bbox.y0 + flow->h;
  67. if (flow->type != FLOW_IMAGE)
  68. {
  69. /* flow->y is the baseline, adjust bbox appropriately */
  70. bbox.y0 -= 0.8f * flow->h;
  71. bbox.y1 -= 0.8f * flow->h;
  72. }
  73. if (is_internal_uri(href))
  74. {
  75. if (href[0] == '#')
  76. {
  77. fz_strlcpy(path, file, sizeof path);
  78. fz_strlcat(path, href, sizeof path);
  79. }
  80. else
  81. {
  82. fz_strlcpy(path, dir, sizeof path);
  83. fz_strlcat(path, "/", sizeof path);
  84. fz_strlcat(path, href, sizeof path);
  85. }
  86. fz_urldecode(path);
  87. fz_cleanname(path);
  88. dest = path;
  89. }
  90. else
  91. {
  92. dest = href;
  93. }
  94. link = fz_new_derived_link(ctx, fz_link, bbox, dest);
  95. link->next = head;
  96. head = link;
  97. }
  98. }
  99. flow = next;
  100. }
  101. return head;
  102. }
  103. static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file)
  104. {
  105. while (box)
  106. {
  107. if (box->type == BOX_FLOW)
  108. head = load_link_flow(ctx, box->u.flow.head, head, page, page_h, dir, file);
  109. if (box->down)
  110. head = load_link_box(ctx, box->down, head, page, page_h, dir, file);
  111. box = box->next;
  112. }
  113. return head;
  114. }
  115. fz_link *
  116. fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file)
  117. {
  118. fz_link *link, *head;
  119. char dir[2048];
  120. fz_dirname(dir, file, sizeof dir);
  121. head = load_link_box(ctx, html->tree.root, NULL, page, html->page_h, dir, file);
  122. for (link = head; link; link = link->next)
  123. {
  124. /* Adjust for page margins */
  125. link->rect.x0 += html->page_margin[L];
  126. link->rect.x1 += html->page_margin[L];
  127. link->rect.y0 += html->page_margin[T];
  128. link->rect.y1 += html->page_margin[T];
  129. }
  130. return head;
  131. }
  132. static fz_html_flow *
  133. find_first_content(fz_html_box *box)
  134. {
  135. while (box)
  136. {
  137. if (box->type == BOX_FLOW)
  138. return box->u.flow.head;
  139. box = box->down;
  140. }
  141. return NULL;
  142. }
  143. static float
  144. find_flow_target(fz_html_flow *flow, const char *id)
  145. {
  146. while (flow)
  147. {
  148. if (flow->box->id && !strcmp(id, flow->box->id))
  149. return flow->y;
  150. flow = flow->next;
  151. }
  152. return -1;
  153. }
  154. static float
  155. find_box_target(fz_html_box *box, const char *id)
  156. {
  157. float y;
  158. while (box)
  159. {
  160. if (box->id && !strcmp(id, box->id))
  161. {
  162. fz_html_flow *flow = find_first_content(box);
  163. if (flow)
  164. return flow->y;
  165. return box->s.layout.y;
  166. }
  167. if (box->type == BOX_FLOW)
  168. {
  169. y = find_flow_target(box->u.flow.head, id);
  170. if (y >= 0)
  171. return y;
  172. }
  173. else
  174. {
  175. y = find_box_target(box->down, id);
  176. if (y >= 0)
  177. return y;
  178. }
  179. box = box->next;
  180. }
  181. return -1;
  182. }
  183. float
  184. fz_find_html_target(fz_context *ctx, fz_html *html, const char *id)
  185. {
  186. return find_box_target(html->tree.root, id);
  187. }
  188. static fz_html_flow *
  189. make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y, fz_html_flow **candidate)
  190. {
  191. while (flow)
  192. {
  193. *candidate = flow;
  194. if (flow->y >= y)
  195. return flow;
  196. flow = flow->next;
  197. }
  198. return NULL;
  199. }
  200. static fz_html_flow *
  201. make_box_bookmark(fz_context *ctx, fz_html_box *box, float y, fz_html_flow **candidate)
  202. {
  203. fz_html_flow *mark;
  204. fz_html_flow *dummy = NULL;
  205. if (candidate == NULL)
  206. candidate = &dummy;
  207. while (box)
  208. {
  209. if (box->type == BOX_FLOW)
  210. {
  211. if (box->s.layout.y >= y)
  212. {
  213. mark = make_flow_bookmark(ctx, box->u.flow.head, y, candidate);
  214. if (mark)
  215. return mark;
  216. }
  217. else
  218. *candidate = make_flow_bookmark(ctx, box->u.flow.head, y, candidate);
  219. }
  220. else
  221. {
  222. mark = make_box_bookmark(ctx, box->down, y, candidate);
  223. if (mark)
  224. return mark;
  225. }
  226. box = box->next;
  227. }
  228. return *candidate;
  229. }
  230. fz_bookmark
  231. fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page)
  232. {
  233. return (fz_bookmark)make_box_bookmark(ctx, html->tree.root, page * html->page_h, NULL);
  234. }
  235. static int
  236. lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark)
  237. {
  238. while (flow)
  239. {
  240. if (flow == mark)
  241. return 1;
  242. flow = flow->next;
  243. }
  244. return 0;
  245. }
  246. static int
  247. lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark)
  248. {
  249. while (box)
  250. {
  251. if (box->type == BOX_FLOW)
  252. {
  253. if (lookup_flow_bookmark(ctx, box->u.flow.head, mark))
  254. return 1;
  255. }
  256. else
  257. {
  258. if (lookup_box_bookmark(ctx, box->down, mark))
  259. return 1;
  260. }
  261. box = box->next;
  262. }
  263. return 0;
  264. }
  265. int
  266. fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark)
  267. {
  268. fz_html_flow *flow = (fz_html_flow*)mark;
  269. if (flow && lookup_box_bookmark(ctx, html->tree.root, flow))
  270. return (int)(flow->y / html->page_h);
  271. return -1;
  272. }
  273. struct outline_parser
  274. {
  275. fz_html *html;
  276. fz_buffer *cat;
  277. fz_outline *head;
  278. fz_outline **tail[6];
  279. fz_outline **down[6];
  280. int level[6];
  281. int current;
  282. int id;
  283. };
  284. static void
  285. cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow)
  286. {
  287. while (flow)
  288. {
  289. switch (flow->type)
  290. {
  291. case FLOW_WORD:
  292. fz_append_string(ctx, cat, flow->content.text);
  293. break;
  294. case FLOW_SPACE:
  295. case FLOW_BREAK:
  296. fz_append_byte(ctx, cat, ' ');
  297. break;
  298. default:
  299. break;
  300. }
  301. flow = flow->next;
  302. }
  303. }
  304. static void
  305. cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box)
  306. {
  307. while (box)
  308. {
  309. if (box->type == BOX_FLOW)
  310. cat_html_flow(ctx, cat, box->u.flow.head);
  311. cat_html_box(ctx, cat, box->down);
  312. box = box->next;
  313. }
  314. }
  315. static const char *
  316. cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
  317. {
  318. if (!x->cat)
  319. x->cat = fz_new_buffer(ctx, 1024);
  320. else
  321. fz_clear_buffer(ctx, x->cat);
  322. cat_html_flow(ctx, x->cat, box->u.flow.head);
  323. cat_html_box(ctx, x->cat, box->down);
  324. return fz_string_from_buffer(ctx, x->cat);
  325. }
  326. static void
  327. add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
  328. {
  329. fz_outline *node;
  330. char buf[100];
  331. int heading;
  332. node = fz_new_outline(ctx);
  333. fz_try(ctx)
  334. {
  335. node->title = Memento_label(fz_strdup(ctx, cat_html_text(ctx, x, box)), "outline_title");
  336. if (!box->id)
  337. {
  338. fz_snprintf(buf, sizeof buf, "'%d", x->id++);
  339. box->id = Memento_label(fz_pool_strdup(ctx, x->html->tree.pool, buf), "box_id");
  340. }
  341. node->uri = Memento_label(fz_asprintf(ctx, "#%s", box->id), "outline_uri");
  342. node->is_open = 1;
  343. }
  344. fz_catch(ctx)
  345. {
  346. fz_free(ctx, node);
  347. fz_rethrow(ctx);
  348. }
  349. heading = box->heading;
  350. if (x->level[x->current] < heading && x->current < 5)
  351. {
  352. x->tail[x->current+1] = x->down[x->current];
  353. x->current += 1;
  354. }
  355. else
  356. {
  357. while (x->current > 0 && x->level[x->current] > heading)
  358. {
  359. x->current -= 1;
  360. }
  361. }
  362. x->level[x->current] = heading;
  363. *(x->tail[x->current]) = node;
  364. x->tail[x->current] = &node->next;
  365. x->down[x->current] = &node->down;
  366. }
  367. static void
  368. load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
  369. {
  370. while (box)
  371. {
  372. int heading = box->heading;
  373. if (heading)
  374. add_html_outline(ctx, x, box);
  375. if (box->down)
  376. load_html_outline(ctx, x, box->down);
  377. box = box->next;
  378. }
  379. }
  380. fz_outline *
  381. fz_load_html_outline(fz_context *ctx, fz_html *html)
  382. {
  383. struct outline_parser state;
  384. state.html = html;
  385. state.cat = NULL;
  386. state.head = NULL;
  387. state.tail[0] = &state.head;
  388. state.down[0] = NULL;
  389. state.level[0] = 99;
  390. state.current = 0;
  391. state.id = 1;
  392. fz_try(ctx)
  393. load_html_outline(ctx, &state, html->tree.root);
  394. fz_always(ctx)
  395. fz_drop_buffer(ctx, state.cat);
  396. fz_catch(ctx)
  397. {
  398. fz_drop_outline(ctx, state.head);
  399. state.head = NULL;
  400. }
  401. return state.head;
  402. }