html-imp.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #ifndef SOURCE_HTML_IMP_H
  23. #define SOURCE_HTML_IMP_H
  24. #include "mupdf/fitz.h"
  25. #include "mupdf/html.h"
  26. #include "../fitz/xml-imp.h"
  27. typedef struct fz_html_font_face_s fz_html_font_face;
  28. typedef struct fz_html_box_s fz_html_box;
  29. typedef struct fz_html_flow_s fz_html_flow;
  30. typedef struct fz_css_style_splay_s fz_css_style_splay;
  31. typedef struct fz_css_s fz_css;
  32. typedef struct fz_css_rule_s fz_css_rule;
  33. typedef struct fz_css_match_s fz_css_match;
  34. typedef struct fz_css_style_s fz_css_style;
  35. typedef struct fz_css_selector_s fz_css_selector;
  36. typedef struct fz_css_condition_s fz_css_condition;
  37. typedef struct fz_css_property_s fz_css_property;
  38. typedef struct fz_css_value_s fz_css_value;
  39. typedef struct fz_css_number_s fz_css_number;
  40. typedef struct fz_css_color_s fz_css_color;
  41. struct fz_html_font_face_s
  42. {
  43. char *family;
  44. int is_bold;
  45. int is_italic;
  46. int is_small_caps;
  47. fz_font *font;
  48. char *src;
  49. fz_html_font_face *next;
  50. };
  51. struct fz_html_font_set_s
  52. {
  53. fz_font *fonts[12]; /* Times, Helvetica, Courier in R,I,B,BI */
  54. fz_html_font_face *custom;
  55. };
  56. #define UCS_MAX 0x10ffff
  57. enum
  58. {
  59. CSS_KEYWORD = UCS_MAX+1,
  60. CSS_HASH,
  61. CSS_STRING,
  62. CSS_NUMBER,
  63. CSS_LENGTH,
  64. CSS_PERCENT,
  65. CSS_URI,
  66. };
  67. struct fz_css_s
  68. {
  69. fz_pool *pool;
  70. fz_css_rule *rule;
  71. };
  72. struct fz_css_rule_s
  73. {
  74. fz_css_selector *selector;
  75. fz_css_property *declaration;
  76. fz_css_rule *next;
  77. int loaded;
  78. };
  79. struct fz_css_selector_s
  80. {
  81. char *name;
  82. int combine;
  83. fz_css_condition *cond;
  84. fz_css_selector *left;
  85. fz_css_selector *right;
  86. fz_css_selector *next;
  87. };
  88. struct fz_css_condition_s
  89. {
  90. int type;
  91. char *key;
  92. char *val;
  93. fz_css_condition *next;
  94. };
  95. struct fz_css_property_s
  96. {
  97. int name;
  98. fz_css_value *value;
  99. short spec;
  100. short important;
  101. fz_css_property *next;
  102. };
  103. struct fz_css_value_s
  104. {
  105. int type;
  106. char *data;
  107. fz_css_value *args; /* function arguments */
  108. fz_css_value *next;
  109. };
  110. enum
  111. {
  112. PRO_BACKGROUND_COLOR,
  113. PRO_BORDER_BOTTOM_COLOR,
  114. PRO_BORDER_BOTTOM_STYLE,
  115. PRO_BORDER_BOTTOM_WIDTH,
  116. PRO_BORDER_LEFT_COLOR,
  117. PRO_BORDER_LEFT_STYLE,
  118. PRO_BORDER_LEFT_WIDTH,
  119. PRO_BORDER_RIGHT_COLOR,
  120. PRO_BORDER_RIGHT_STYLE,
  121. PRO_BORDER_RIGHT_WIDTH,
  122. PRO_BORDER_TOP_COLOR,
  123. PRO_BORDER_TOP_STYLE,
  124. PRO_BORDER_TOP_WIDTH,
  125. PRO_BORDER_SPACING,
  126. PRO_COLOR,
  127. PRO_DIRECTION,
  128. PRO_DISPLAY,
  129. PRO_FONT,
  130. PRO_FONT_FAMILY,
  131. PRO_FONT_SIZE,
  132. PRO_FONT_STYLE,
  133. PRO_FONT_VARIANT,
  134. PRO_FONT_WEIGHT,
  135. PRO_HEIGHT,
  136. PRO_LEADING,
  137. PRO_LETTER_SPACING,
  138. PRO_LINE_HEIGHT,
  139. PRO_LIST_STYLE_IMAGE,
  140. PRO_LIST_STYLE_POSITION,
  141. PRO_LIST_STYLE_TYPE,
  142. PRO_MARGIN_BOTTOM,
  143. PRO_MARGIN_LEFT,
  144. PRO_MARGIN_RIGHT,
  145. PRO_MARGIN_TOP,
  146. PRO_ORPHANS,
  147. PRO_OVERFLOW_WRAP,
  148. PRO_PADDING_BOTTOM,
  149. PRO_PADDING_LEFT,
  150. PRO_PADDING_RIGHT,
  151. PRO_PADDING_TOP,
  152. PRO_PAGE_BREAK_AFTER,
  153. PRO_PAGE_BREAK_BEFORE,
  154. PRO_QUOTES,
  155. PRO_SRC,
  156. PRO_TEXT_ALIGN,
  157. PRO_TEXT_DECORATION,
  158. PRO_TEXT_FILL_COLOR,
  159. PRO_TEXT_INDENT,
  160. PRO_TEXT_TRANSFORM,
  161. PRO_TEXT_STROKE_WIDTH,
  162. PRO_TEXT_STROKE_COLOR,
  163. PRO_VERTICAL_ALIGN,
  164. PRO_VISIBILITY,
  165. PRO_WHITE_SPACE,
  166. PRO_WIDOWS,
  167. PRO_WIDTH,
  168. PRO_WORD_SPACING,
  169. /* Number of real properties. */
  170. NUM_PROPERTIES,
  171. /* Short-hand properties (always expanded when applied, never used as is): */
  172. PRO_BORDER,
  173. PRO_BORDER_BOTTOM,
  174. PRO_BORDER_COLOR,
  175. PRO_BORDER_LEFT,
  176. PRO_BORDER_RIGHT,
  177. PRO_BORDER_STYLE,
  178. PRO_BORDER_TOP,
  179. PRO_BORDER_WIDTH,
  180. PRO_LIST_STYLE,
  181. PRO_MARGIN,
  182. PRO_PADDING,
  183. };
  184. struct fz_css_match_s
  185. {
  186. fz_css_match *up;
  187. short spec[NUM_PROPERTIES];
  188. fz_css_value *value[NUM_PROPERTIES];
  189. };
  190. enum { DIS_NONE, DIS_BLOCK, DIS_INLINE, DIS_LIST_ITEM, DIS_INLINE_BLOCK, DIS_TABLE, DIS_TABLE_GROUP, DIS_TABLE_ROW, DIS_TABLE_CELL };
  191. enum { POS_STATIC, POS_RELATIVE, POS_ABSOLUTE, POS_FIXED };
  192. enum { TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY };
  193. enum { VA_BASELINE, VA_SUB, VA_SUPER, VA_TOP, VA_BOTTOM, VA_TEXT_TOP, VA_TEXT_BOTTOM };
  194. enum { BS_NONE, BS_SOLID };
  195. enum { V_VISIBLE, V_HIDDEN, V_COLLAPSE };
  196. enum { PB_AUTO, PB_ALWAYS, PB_AVOID, PB_LEFT, PB_RIGHT };
  197. enum { TD_NONE, TD_UNDERLINE, TD_LINE_THROUGH };
  198. enum {
  199. WS_COLLAPSE = 1,
  200. WS_ALLOW_BREAK_SPACE = 2,
  201. WS_FORCE_BREAK_NEWLINE = 4,
  202. WS_NORMAL = WS_COLLAPSE | WS_ALLOW_BREAK_SPACE,
  203. WS_PRE = WS_FORCE_BREAK_NEWLINE,
  204. WS_NOWRAP = WS_COLLAPSE,
  205. WS_PRE_WRAP = WS_ALLOW_BREAK_SPACE | WS_FORCE_BREAK_NEWLINE,
  206. WS_PRE_LINE = WS_COLLAPSE | WS_ALLOW_BREAK_SPACE | WS_FORCE_BREAK_NEWLINE
  207. };
  208. enum {
  209. LST_NONE,
  210. LST_DISC, LST_CIRCLE, LST_SQUARE,
  211. LST_DECIMAL, LST_DECIMAL_ZERO,
  212. LST_LC_ROMAN, LST_UC_ROMAN,
  213. LST_LC_GREEK, LST_UC_GREEK,
  214. LST_LC_LATIN, LST_UC_LATIN,
  215. LST_LC_ALPHA, LST_UC_ALPHA,
  216. LST_ARMENIAN, LST_GEORGIAN,
  217. };
  218. enum {
  219. OVERFLOW_WRAP_NORMAL = 0,
  220. OVERFLOW_WRAP_BREAK_WORD = 1
  221. /* We do not support 'anywhere'. */
  222. };
  223. enum { N_NUMBER='u', N_LENGTH='p', N_SCALE='m', N_PERCENT='%', N_AUTO='a', N_UNDEFINED='x' };
  224. struct fz_css_number_s
  225. {
  226. float value;
  227. int unit;
  228. };
  229. struct fz_css_color_s
  230. {
  231. unsigned char r, g, b, a;
  232. };
  233. struct fz_css_style_s
  234. {
  235. fz_css_number font_size;
  236. fz_css_number width, height;
  237. fz_css_number margin[4];
  238. fz_css_number padding[4];
  239. fz_css_number border_width[4];
  240. fz_css_number border_spacing;
  241. fz_css_number text_indent;
  242. fz_css_number text_stroke_width;
  243. unsigned int visibility : 2;
  244. unsigned int white_space : 3;
  245. unsigned int text_align : 2;
  246. unsigned int vertical_align : 3;
  247. unsigned int list_style_type : 4;
  248. unsigned int page_break_before : 3;
  249. unsigned int page_break_after : 3;
  250. unsigned int border_style_0 : 1;
  251. unsigned int border_style_1 : 1;
  252. unsigned int border_style_2 : 1;
  253. unsigned int border_style_3 : 1;
  254. unsigned int small_caps : 1;
  255. unsigned int text_decoration: 2;
  256. unsigned int overflow_wrap : 1;
  257. /* Ensure the extra bits in the bitfield are copied
  258. * on structure copies. */
  259. unsigned int blank : 3;
  260. fz_css_number line_height;
  261. fz_css_number leading;
  262. fz_css_color background_color;
  263. fz_css_color border_color[4];
  264. fz_css_color color;
  265. fz_css_color text_fill_color;
  266. fz_css_color text_stroke_color;
  267. fz_font *font;
  268. };
  269. struct fz_css_style_splay_s {
  270. fz_css_style style;
  271. fz_css_style_splay *lt;
  272. fz_css_style_splay *gt;
  273. fz_css_style_splay *up;
  274. };
  275. enum
  276. {
  277. BOX_BLOCK, /* block-level: contains block, break, flow, and table boxes */
  278. BOX_FLOW, /* block-level: contains only inline boxes */
  279. BOX_INLINE, /* inline-level: contains only inline boxes */
  280. BOX_TABLE, /* table: contains table-row */
  281. BOX_TABLE_ROW, /* table-row: contains table-cell */
  282. BOX_TABLE_CELL, /* table-cell: contains block */
  283. };
  284. typedef struct
  285. {
  286. fz_storable storable;
  287. fz_pool *pool; /* pool allocator for this html tree */
  288. fz_html_box *root;
  289. } fz_html_tree;
  290. struct fz_html_s
  291. {
  292. /* fz_html is derived from fz_html_tree, so must start with that. */
  293. /* Arguably 'tree' should be called 'super'. */
  294. fz_html_tree tree;
  295. float page_w, page_h;
  296. float layout_w, layout_h, layout_em;
  297. float page_margin[4];
  298. char *title;
  299. };
  300. typedef enum
  301. {
  302. FZ_HTML_RESTART_REASON_NONE = 0,
  303. FZ_HTML_RESTART_REASON_LINE_HEIGHT = 1,
  304. FZ_HTML_RESTART_REASON_LINE_WIDTH = 2
  305. } fz_html_restart_reason;
  306. enum
  307. {
  308. FZ_HTML_RESTARTER_FLAGS_NO_OVERFLOW = 1
  309. };
  310. typedef struct {
  311. /* start will be filled in on entry with the first node to start
  312. * operation on. NULL means start 'immediately'. As we traverse
  313. * the tree, once we reach the node to start on, we set this to
  314. * NULL, hence if 'start != NULL' then we are still skipping to
  315. * find the starting node. */
  316. fz_html_box *start;
  317. /* If start is a BOX_FLOW, then start_flow will be the flow entry
  318. * at which we should start. */
  319. fz_html_flow *start_flow;
  320. /* end should be NULL on entry. On exit, if it's NULL, then we
  321. * finished. Otherwise, this is where we should restart the
  322. * process the next time. */
  323. fz_html_box *end;
  324. /* If end is a BOX_FLOW, then end_flow will be the flow entry at which
  325. * we should restart next time. */
  326. fz_html_flow *end_flow;
  327. /* Workspace used on the traversal of the tree to store a good place
  328. * to restart. Typically this will be set to an enclosing box with
  329. * a border, so that if we then fail to put any content into the box
  330. * we'll elide the entire box/border, not output an empty one. */
  331. fz_html_box *potential;
  332. fz_html_restart_reason reason;
  333. int flags;
  334. } fz_html_restarter;
  335. struct fz_story
  336. {
  337. /* fz_story is derived from fz_html_tree, so must start with */
  338. /* that. Arguably 'tree' should be called 'super'. */
  339. fz_html_tree tree;
  340. /* The user_css (or NULL) */
  341. char *user_css;
  342. /* The HTML story as XML nodes with a DOM */
  343. fz_xml *dom;
  344. /* The fontset for the content. */
  345. fz_html_font_set *font_set;
  346. /* restart_place holds the start position for the next place.
  347. * This is updated by draw. */
  348. fz_html_restarter restart_place;
  349. /* restart_draw holds the start position for the next draw.
  350. * This is updated by place. */
  351. fz_html_restarter restart_draw;
  352. /* complete is set true when all the story has been placed and
  353. * drawn. */
  354. int complete;
  355. /* The last bbox we laid out for. Used for making a clipping
  356. * rectangle. */
  357. fz_rect bbox;
  358. /* The default 'em' size. */
  359. float em;
  360. /* Collected parsing warnings. */
  361. fz_buffer *warnings;
  362. /* Rectangle layout count. */
  363. int rect_count;
  364. /* Archive from which to load any resources. */
  365. fz_archive *zip;
  366. };
  367. struct fz_html_box_s
  368. {
  369. unsigned int type : 3;
  370. unsigned int is_first_flow : 1; /* for text-indent */
  371. unsigned int markup_dir : 2;
  372. unsigned int heading : 3;
  373. unsigned int list_item : 21;
  374. fz_html_box *up, *down, *next;
  375. const char *tag, *id, *href;
  376. const fz_css_style *style;
  377. union {
  378. /* Only needed during build stage */
  379. struct {
  380. fz_html_box *last_child;
  381. fz_html_flow **flow_tail;
  382. } build;
  383. /* Only needed during layout */
  384. struct {
  385. float x, y, w, b; /* content */
  386. float em, baseline;
  387. } layout;
  388. } s;
  389. union {
  390. /* Only BOX_FLOW use the following */
  391. struct {
  392. fz_html_flow *head;
  393. } flow;
  394. /* Only BOX_{BLOCK,TABLE,TABLE_ROW,TABLE_CELL} use the following */
  395. struct {
  396. float margin[4]; // TODO: is margin needed post layout?
  397. float border[4];
  398. float padding[4];
  399. } block;
  400. } u;
  401. };
  402. static inline int
  403. fz_html_box_has_boxes(fz_html_box *box)
  404. {
  405. return (box->type == BOX_BLOCK || box->type == BOX_TABLE || box->type == BOX_TABLE_ROW || box->type == BOX_TABLE_CELL);
  406. }
  407. enum
  408. {
  409. FLOW_WORD = 0,
  410. FLOW_SPACE = 1,
  411. FLOW_BREAK = 2,
  412. FLOW_IMAGE = 3,
  413. FLOW_SBREAK = 4,
  414. FLOW_SHYPHEN = 5,
  415. FLOW_ANCHOR = 6
  416. };
  417. struct fz_html_flow_s
  418. {
  419. /* What type of node */
  420. unsigned int type : 3;
  421. /* Whether this should expand during justification */
  422. unsigned int expand : 1;
  423. /* Whether this node is currently taken as a line break */
  424. unsigned int breaks_line : 1;
  425. /* Whether this word node can be split or consists of a single glyph cluster */
  426. unsigned int atomic : 1;
  427. /* Whether lines may be broken before this word for overflow-wrap: word-break */
  428. unsigned int overflow_wrap : 1;
  429. /* Direction setting for text - UAX#9 says 125 is the max */
  430. unsigned int bidi_level : 7;
  431. /* The script detected by the bidi code. */
  432. unsigned int script : 8;
  433. /* Whether the markup specifies a given language. */
  434. unsigned short markup_lang;
  435. float x, y, w, h;
  436. fz_html_box *box; /* for style and em */
  437. fz_html_flow *next;
  438. union {
  439. char text[1];
  440. fz_image *image;
  441. } content;
  442. };
  443. fz_css *fz_new_css(fz_context *ctx);
  444. void fz_parse_css(fz_context *ctx, fz_css *css, const char *source, const char *file);
  445. fz_css_property *fz_parse_css_properties(fz_context *ctx, fz_pool *pool, const char *source);
  446. void fz_drop_css(fz_context *ctx, fz_css *css);
  447. void fz_debug_css(fz_context *ctx, fz_css *css);
  448. const char *fz_css_property_name(int name);
  449. void fz_match_css(fz_context *ctx, fz_css_match *match, fz_css_match *up, fz_css *css, fz_xml *node);
  450. void fz_match_css_at_page(fz_context *ctx, fz_css_match *match, fz_css *css);
  451. int fz_get_css_match_display(fz_css_match *node);
  452. void fz_default_css_style(fz_context *ctx, fz_css_style *style);
  453. void fz_apply_css_style(fz_context *ctx, fz_html_font_set *set, fz_css_style *style, fz_css_match *match);
  454. /*
  455. Lookup style in the splay tree, returning a pointer
  456. to the found instance if there is one, creating and
  457. inserting (and moving to root) one if there is not.
  458. */
  459. const fz_css_style *fz_css_enlist(fz_context *ctx, const fz_css_style *style, fz_css_style_splay **tree, fz_pool *pool);
  460. float fz_from_css_number(fz_css_number number, float em, float percent_value, float auto_value);
  461. float fz_from_css_number_scale(fz_css_number number, float scale);
  462. int fz_css_number_defined(fz_css_number number);
  463. fz_html_font_set *fz_new_html_font_set(fz_context *ctx);
  464. void fz_add_html_font_face(fz_context *ctx, fz_html_font_set *set,
  465. const char *family, int is_bold, int is_italic, int is_small_caps, const char *src, fz_font *font);
  466. fz_font *fz_load_html_font(fz_context *ctx, fz_html_font_set *set, const char *family, int is_bold, int is_italic, int is_small_caps);
  467. void fz_drop_html_font_set(fz_context *ctx, fz_html_font_set *htx);
  468. void fz_add_css_font_faces(fz_context *ctx, fz_html_font_set *set, fz_archive *dir, const char *base_uri, fz_css *css);
  469. void fz_layout_html(fz_context *ctx, fz_html *html, float w, float h, float em);
  470. void fz_draw_html(fz_context *ctx, fz_device *dev, fz_matrix ctm, fz_html *html, int page);
  471. fz_outline *fz_load_html_outline(fz_context *ctx, fz_html *node);
  472. float fz_find_html_target(fz_context *ctx, fz_html *html, const char *id);
  473. fz_link *fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *base_uri);
  474. fz_html *fz_keep_html(fz_context *ctx, fz_html *html);
  475. void fz_drop_html(fz_context *ctx, fz_html *html);
  476. fz_bookmark fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page);
  477. int fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark);
  478. void fz_debug_html(fz_context *ctx, fz_html_box *box);
  479. fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter);
  480. fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter);
  481. void fz_purge_stored_html(fz_context *ctx, void *doc);
  482. void fz_restartable_layout_html(fz_context *ctx, fz_html_tree *tree, float start_x, float start_y, float page_w, float page_h, float em, fz_html_restarter *restart);
  483. fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset);
  484. fz_archive *fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi);
  485. fz_structure fz_html_tag_to_structure(const char *tag);
  486. fz_html *fz_parse_html(fz_context *ctx,
  487. fz_html_font_set *set, fz_archive *dir, const char *base_uri, fz_buffer *buf, const char *user_css,
  488. int try_xml, int try_html5, int patch_mobi);
  489. fz_buffer *fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in);
  490. #endif