stext-output.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #define SUBSCRIPT_OFFSET 0.2f
  24. #define SUPERSCRIPT_OFFSET -0.2f
  25. #include <ft2build.h>
  26. #include FT_FREETYPE_H
  27. // Text black color when converted from DeviceCMYK to RGB
  28. #define CMYK_BLACK 0x221f1f
  29. static void
  30. scale_run(fz_context *ctx, fz_stext_block *block, float scale)
  31. {
  32. fz_matrix m = fz_scale(scale, scale);
  33. fz_stext_line *line;
  34. fz_stext_char *ch;
  35. while (block)
  36. {
  37. block->bbox = fz_transform_rect(block->bbox, m);
  38. switch (block->type)
  39. {
  40. case FZ_STEXT_BLOCK_TEXT:
  41. for (line = block->u.t.first_line; line; line = line->next)
  42. {
  43. line->bbox = fz_transform_rect(block->bbox, m);
  44. for (ch = line->first_char; ch; ch = ch->next)
  45. {
  46. ch->origin = fz_transform_point(ch->origin, m);
  47. ch->quad = fz_transform_quad(ch->quad, m);
  48. ch->size = ch->size * scale;
  49. }
  50. }
  51. break;
  52. case FZ_STEXT_BLOCK_IMAGE:
  53. block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale);
  54. break;
  55. case FZ_STEXT_BLOCK_STRUCT:
  56. if (block->u.s.down)
  57. scale_run(ctx, block->u.s.down->first_block, scale);
  58. break;
  59. }
  60. block = block->next;
  61. }
  62. }
  63. static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale)
  64. {
  65. scale_run(ctx, page->first_block, scale);
  66. }
  67. /* HTML output (visual formatting with preserved layout) */
  68. static int
  69. detect_super_script(fz_stext_line *line, fz_stext_char *ch)
  70. {
  71. if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
  72. return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
  73. return 0;
  74. }
  75. static const char *
  76. font_full_name(fz_context *ctx, fz_font *font)
  77. {
  78. const char *name = fz_font_name(ctx, font);
  79. const char *s = strchr(name, '+');
  80. return s ? s + 1 : name;
  81. }
  /*
   * Map a font name onto a family name suitable for CSS.
   *
   * Names containing "Times" become "Times New Roman"; names containing
   * "Arial" or "Helvetica" become "Arial" (or "Arial Narrow" when they also
   * contain "Narrow" or "Condensed"); names containing "Courier" become
   * "Courier". Anything else is returned as-is (the same pointer).
   */
  static const char *
  html_clean_font_name(const char *fontname)
  {
      int narrow;

      if (strstr(fontname, "Times") != NULL)
          return "Times New Roman";

      if (strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL)
      {
          narrow = strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL;
          return narrow ? "Arial Narrow" : "Arial";
      }

      if (strstr(fontname, "Courier") != NULL)
          return "Courier";

      return fontname;
  }
  97. static void
  98. font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
  99. {
  100. const char *name = html_clean_font_name(font_full_name(ctx, font));
  101. char *s;
  102. fz_strlcpy(buf, name, size);
  103. s = strrchr(buf, '-');
  104. if (s)
  105. *s = 0;
  106. if (is_mono)
  107. fz_strlcat(buf, ",monospace", size);
  108. else
  109. fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
  110. }
  111. static void
  112. fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
  113. {
  114. char family[80];
  115. int is_bold = fz_font_is_bold(ctx, font);
  116. int is_italic = fz_font_is_italic(ctx, font);
  117. int is_serif = fz_font_is_serif(ctx, font);
  118. int is_mono = fz_font_is_monospaced(ctx, font);
  119. font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
  120. if (sup) fz_write_string(ctx, out, "<sup>");
  121. if (is_mono) fz_write_string(ctx, out, "<tt>");
  122. if (is_bold) fz_write_string(ctx, out, "<b>");
  123. if (is_italic) fz_write_string(ctx, out, "<i>");
  124. fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size);
  125. if (color != 0 && color != CMYK_BLACK)
  126. fz_write_printf(ctx, out, ";color:#%06x", color & 0xffffff);
  127. fz_write_printf(ctx, out, "\">");
  128. }
  129. static void
  130. fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
  131. {
  132. int is_mono = fz_font_is_monospaced(ctx, font);
  133. int is_bold = fz_font_is_bold(ctx,font);
  134. int is_italic = fz_font_is_italic(ctx, font);
  135. fz_write_string(ctx, out, "</span>");
  136. if (is_italic) fz_write_string(ctx, out, "</i>");
  137. if (is_bold) fz_write_string(ctx, out, "</b>");
  138. if (is_mono) fz_write_string(ctx, out, "</tt>");
  139. if (sup) fz_write_string(ctx, out, "</sup>");
  140. }
  141. static void
  142. fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
  143. {
  144. fz_matrix ctm = block->u.i.transform;
  145. #define USE_CSS_MATRIX_TRANSFORMS
  146. #ifdef USE_CSS_MATRIX_TRANSFORMS
  147. /* Matrix maths notes.
  148. * When we get here ctm maps the unit square to the position in device
  149. * space occupied by the image.
  150. *
  151. * That is to say that mapping the 4 corners of the unit square through
  152. * the transform, give us the 4 target corners. We extend the corners
  153. * by adding an extra '1' into them to allow transforms to work. Thus
  154. * (x,y) maps through ctm = (a b c d e f) as:
  155. *
  156. * (x y 1) (a b 0) = (X Y 1)
  157. * (c d 0)
  158. * (e f 1)
  159. *
  160. * To simplify reading of matrix maths, we use the trick where we
  161. * 'drop' the first matrix down the page. Thus the corners c0=(0,0),
  162. * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively:
  163. *
  164. * ( a b 0)
  165. * ( c d 0)
  166. * ( e f 1)
  167. * (0 0 1) ( e f 1)
  168. * (0 1 1) ( c+e d+f 1)
  169. * (1 1 1) (a+c+e b+d+f 1)
  170. * (1 0 1) ( a+e b+f 1)
  171. *
  172. * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f)
  173. *
  174. * Unfortunately, the CSS matrix transform, does not map the unit square.
  175. * Rather it does something moderately mad. As far as I can work out, the
  176. * top left corner of a (0,0) -> (w, h) box is transformed using the .e
  177. * and .f entries of the matrix. Then the image from within that square
  178. * is transformed using the centre of that square as the origin.
  179. *
  180. * So, an image placed at (0,0) in destination space with 1:1 transform
  181. * will result in an image a (0,0) as you'd expect. But an image at (0,0)
  182. * with a scale of 2, will result in 25% of the image off the left of the
  183. * screen, and 25% off the top.
  184. *
  185. * Accordingly, we have to adjust the ctm in several steps.
  186. */
  187. /* Move to moving the centre of the image. */
  188. ctm.e += (ctm.a+ctm.c)/2;
  189. ctm.f += (ctm.b+ctm.d)/2;
  190. /* Move from transforming the unit square to w/h */
  191. ctm.a /= block->u.i.image->w;
  192. ctm.b /= block->u.i.image->w;
  193. ctm.c /= block->u.i.image->h;
  194. ctm.d /= block->u.i.image->h;
  195. /* Move from points to pixels */
  196. ctm.a *= 96.0f/72;
  197. ctm.b *= 96.0f/72;
  198. ctm.c *= 96.0f/72;
  199. ctm.d *= 96.0f/72;
  200. ctm.e *= 96.0f/72;
  201. ctm.f *= 96.0f/72;
  202. /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */
  203. ctm.e -= block->u.i.image->w/2;
  204. ctm.f -= block->u.i.image->h/2;
  205. fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"",
  206. ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f);
  207. #else
  208. /* Alternative version of the code that uses scaleX/Y and rotate
  209. * instead, but only copes with axis aligned cases. */
  210. int t;
  211. int x = block->bbox.x0;
  212. int y = block->bbox.y0;
  213. int w = block->bbox.x1 - block->bbox.x0;
  214. int h = block->bbox.y1 - block->bbox.y0;
  215. const char *flip = "";
  216. if (ctm.b == 0 && ctm.c == 0)
  217. {
  218. if (ctm.a < 0 && ctm.d < 0)
  219. flip = "transform: scaleX(-1) scaleY(-1);";
  220. else if (ctm.a < 0)
  221. {
  222. flip = "transform: scaleX(-1);";
  223. }
  224. else if (ctm.d < 0)
  225. {
  226. flip = "transform: scaleY(-1);";
  227. }
  228. } else if (ctm.a == 0 && ctm.d == 0) {
  229. if (ctm.b < 0 && ctm.c < 0)
  230. {
  231. flip = "transform: scaleY(-1) rotate(90deg);";
  232. x += (w-h)/2;
  233. y -= (w-h)/2;
  234. t = w; w = h; h = t;
  235. }
  236. else if (ctm.b < 0)
  237. {
  238. flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);";
  239. x += (w-h)/2;
  240. y -= (w-h)/2;
  241. t = w; w = h; h = t;
  242. }
  243. else if (ctm.c < 0)
  244. {
  245. flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);";
  246. x += (w-h)/2;
  247. y -= (w-h)/2;
  248. t = w; w = h; h = t;
  249. }
  250. else
  251. {
  252. flip = "transform: scaleY(-1) rotate(270deg);";
  253. x += (w-h)/2;
  254. y -= (w-h)/2;
  255. t = w; w = h; h = t;
  256. }
  257. }
  258. fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h);
  259. #endif
  260. fz_write_image_as_data_uri(ctx, out, block->u.i.image);
  261. fz_write_string(ctx, out, "\">\n");
  262. }
  263. void
  264. fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
  265. {
  266. fz_stext_line *line;
  267. fz_stext_char *ch;
  268. float x, y, h;
  269. fz_font *font = NULL;
  270. float size = 0;
  271. int sup = 0;
  272. uint32_t color = 0;
  273. for (line = block->u.t.first_line; line; line = line->next)
  274. {
  275. x = line->bbox.x0;
  276. y = line->bbox.y0;
  277. h = line->bbox.y1 - line->bbox.y0;
  278. if (line->first_char)
  279. {
  280. h = line->first_char->size;
  281. y = line->first_char->origin.y - h * 0.8f;
  282. }
  283. fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h);
  284. font = NULL;
  285. for (ch = line->first_char; ch; ch = ch->next)
  286. {
  287. int ch_sup = detect_super_script(line, ch);
  288. if (ch->font != font || ch->size != size || ch_sup != sup || ch->argb != color)
  289. {
  290. if (font)
  291. fz_print_style_end_html(ctx, out, font, size, sup, color);
  292. font = ch->font;
  293. size = ch->size;
  294. color = ch->argb;
  295. sup = ch_sup;
  296. fz_print_style_begin_html(ctx, out, font, size, sup, color);
  297. }
  298. switch (ch->c)
  299. {
  300. default:
  301. if (ch->c >= 32 && ch->c <= 127)
  302. fz_write_byte(ctx, out, ch->c);
  303. else
  304. fz_write_printf(ctx, out, "&#x%x;", ch->c);
  305. break;
  306. case '<': fz_write_string(ctx, out, "&lt;"); break;
  307. case '>': fz_write_string(ctx, out, "&gt;"); break;
  308. case '&': fz_write_string(ctx, out, "&amp;"); break;
  309. case '"': fz_write_string(ctx, out, "&quot;"); break;
  310. case '\'': fz_write_string(ctx, out, "&apos;"); break;
  311. }
  312. }
  313. if (font)
  314. fz_print_style_end_html(ctx, out, font, size, sup, color);
  315. fz_write_string(ctx, out, "</p>\n");
  316. }
  317. }
  318. static const char *
  319. html_tag_for_struct(fz_stext_struct *s)
  320. {
  321. const char *raw;
  322. if (s == NULL)
  323. return "DIV";
  324. raw = s->raw;
  325. if (raw == NULL)
  326. raw = fz_structure_to_string(s->standard);
  327. if (!fz_strcasecmp(raw, "blockquote"))
  328. return "blockquote";
  329. if (!fz_strcasecmp(raw, "title"))
  330. return "h1";
  331. if (!fz_strcasecmp(raw, "sub"))
  332. return "sub";
  333. if (!fz_strcasecmp(raw, "p"))
  334. return "p";
  335. if (!fz_strcasecmp(raw, "h"))
  336. return "h1"; /* Pick one! */
  337. if (!fz_strcasecmp(raw, "h1"))
  338. return "h1";
  339. if (!fz_strcasecmp(raw, "h2"))
  340. return "h2";
  341. if (!fz_strcasecmp(raw, "h3"))
  342. return "h3";
  343. if (!fz_strcasecmp(raw, "h4"))
  344. return "h4";
  345. if (!fz_strcasecmp(raw, "h5"))
  346. return "h5";
  347. if (!fz_strcasecmp(raw, "h6"))
  348. return "h6";
  349. if (!fz_strcasecmp(raw, "list"))
  350. return "ul";
  351. if (!fz_strcasecmp(raw, "listitem"))
  352. return "li";
  353. if (!fz_strcasecmp(raw, "table"))
  354. return "table";
  355. if (!fz_strcasecmp(raw, "tr"))
  356. return "tr";
  357. if (!fz_strcasecmp(raw, "th"))
  358. return "th";
  359. if (!fz_strcasecmp(raw, "td"))
  360. return "td";
  361. if (!fz_strcasecmp(raw, "thead"))
  362. return "thead";
  363. if (!fz_strcasecmp(raw, "tbody"))
  364. return "tbody";
  365. if (!fz_strcasecmp(raw, "tfoot"))
  366. return "tfoot";
  367. if (!fz_strcasecmp(raw, "span"))
  368. return "span";
  369. if (!fz_strcasecmp(raw, "code"))
  370. return "code";
  371. if (!fz_strcasecmp(raw, "em"))
  372. return "em";
  373. if (!fz_strcasecmp(raw, "strong"))
  374. return "strong";
  375. return "div";
  376. }
  377. static void
  378. print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block);
  379. static void
  380. fz_print_stext_struct_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
  381. {
  382. const char *tag;
  383. if (block->u.s.down == NULL)
  384. return;
  385. tag = html_tag_for_struct(block->u.s.down);
  386. fz_write_printf(ctx, out, "<%s>\n", tag);
  387. print_blocks_as_html(ctx, out, block->u.s.down->first_block);
  388. fz_write_printf(ctx, out, "</%s>\n", tag);
  389. }
  390. static void
  391. print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
  392. {
  393. for (; block; block = block->next)
  394. {
  395. if (block->type == FZ_STEXT_BLOCK_IMAGE)
  396. fz_print_stext_image_as_html(ctx, out, block);
  397. else if (block->type == FZ_STEXT_BLOCK_TEXT)
  398. fz_print_stext_block_as_html(ctx, out, block);
  399. else if (block->type == FZ_STEXT_BLOCK_STRUCT)
  400. fz_print_stext_struct_as_html(ctx, out, block);
  401. }
  402. }
  403. void
  404. fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
  405. {
  406. float w = page->mediabox.x1 - page->mediabox.x0;
  407. float h = page->mediabox.y1 - page->mediabox.y0;
  408. fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h);
  409. print_blocks_as_html(ctx, out, page->first_block);
  410. fz_write_string(ctx, out, "</div>\n");
  411. }
  412. void
  413. fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
  414. {
  415. fz_write_string(ctx, out, "<!DOCTYPE html>\n");
  416. fz_write_string(ctx, out, "<html>\n");
  417. fz_write_string(ctx, out, "<head>\n");
  418. fz_write_string(ctx, out, "<style>\n");
  419. fz_write_string(ctx, out, "body{background-color:slategray}\n");
  420. fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n");
  421. fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n");
  422. fz_write_string(ctx, out, "</style>\n");
  423. fz_write_string(ctx, out, "</head>\n");
  424. fz_write_string(ctx, out, "<body>\n");
  425. }
  426. void
  427. fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
  428. {
  429. fz_write_string(ctx, out, "</body>\n");
  430. fz_write_string(ctx, out, "</html>\n");
  431. }
  432. /* XHTML output (semantic, little layout, suitable for reflow) */
  433. static void
  434. find_table_pos(fz_stext_grid_positions *xs, float x0, float x1, int *ix0, int *ix1)
  435. {
  436. int i;
  437. *ix0 = -1;
  438. *ix1 = -1;
  439. for (i = 1; i < xs->len; i++)
  440. if (x0 < xs->list[i].pos)
  441. {
  442. *ix0 = i-1;
  443. break;
  444. }
  445. for (; i < xs->len; i++)
  446. if (x1 < xs->list[i].pos)
  447. {
  448. *ix1 = i-1;
  449. break;
  450. }
  451. if (i == xs->len)
  452. *ix1 = i-1;
  453. }
  454. static void
  455. run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out);
  456. static void
  457. fz_print_stext_table_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
  458. {
  459. fz_stext_block *grid, *tr, *td;
  460. int w, h;
  461. int x, y;
  462. uint8_t *cells;
  463. int malformed = 0;
  464. for (grid = block; grid != NULL; grid = grid->next)
  465. if (grid->type == FZ_STEXT_BLOCK_GRID)
  466. break;
  467. if (grid == NULL)
  468. {
  469. fz_warn(ctx, "Malformed table data");
  470. return;
  471. }
  472. w = grid->u.b.xs->len;
  473. h = grid->u.b.ys->len;
  474. cells = fz_calloc(ctx, w, h);
  475. fz_try(ctx)
  476. {
  477. fz_write_printf(ctx, out, "<table>\n");
  478. y = 0;
  479. for (tr = grid->next; tr != NULL; tr = tr->next)
  480. {
  481. if (tr->type != FZ_STEXT_BLOCK_STRUCT || tr->u.s.down == NULL || tr->u.s.down->standard != FZ_STRUCTURE_TR)
  482. {
  483. malformed = 1;
  484. continue;
  485. }
  486. fz_write_printf(ctx, out, "<tr>\n");
  487. x = 0;
  488. for (td = tr->u.s.down->first_block; td != NULL; td = td->next)
  489. {
  490. int x0, y0, x1, y1;
  491. if (td->type != FZ_STEXT_BLOCK_STRUCT || td->u.s.down == NULL || td->u.s.down->standard != FZ_STRUCTURE_TD)
  492. {
  493. malformed = 1;
  494. continue;
  495. }
  496. find_table_pos(grid->u.b.xs, td->bbox.x0, td->bbox.x1, &x0, &x1);
  497. find_table_pos(grid->u.b.ys, td->bbox.y0, td->bbox.y1, &y0, &y1);
  498. if (x0 < 0 || x1 < 0 || x1 >= w)
  499. {
  500. malformed = 1;
  501. x0 = x;
  502. x1 = x+1;
  503. }
  504. if (y0 < 0 || y1 < 0 || y1 >= h)
  505. {
  506. malformed = 1;
  507. y0 = y;
  508. y1 = y+1;
  509. }
  510. if (y < y0)
  511. {
  512. malformed = 1;
  513. continue;
  514. }
  515. if (x > x0)
  516. {
  517. malformed = 1;
  518. }
  519. while (x < x0)
  520. {
  521. uint8_t *c = &cells[x + w*y];
  522. if (*c == 0)
  523. {
  524. fz_write_printf(ctx, out, "<td></td>");
  525. *c = 1;
  526. }
  527. x++;
  528. }
  529. fz_write_string(ctx, out, "<td");
  530. if (x1 > x0+1)
  531. fz_write_printf(ctx, out, " rowspan=%d", x1-x0);
  532. if (y1 > y0+1)
  533. fz_write_printf(ctx, out, " colspan=%d", y1-y0);
  534. fz_write_string(ctx, out, ">\n");
  535. run_to_xhtml(ctx, td->u.s.down->first_block, out);
  536. fz_write_printf(ctx, out, "</td>\n");
  537. for ( ; y0 < y1; y0++)
  538. for (x = x0; x < x1; x++)
  539. {
  540. uint8_t *c = &cells[x + w*y0];
  541. if (*c != 0)
  542. malformed = 1;
  543. *c = 1;
  544. }
  545. }
  546. fz_write_printf(ctx, out, "</tr>\n");
  547. y++;
  548. }
  549. fz_write_printf(ctx, out, "</table>\n");
  550. }
  551. fz_always(ctx)
  552. fz_free(ctx, cells);
  553. fz_catch(ctx)
  554. fz_rethrow(ctx);
  555. if (malformed)
  556. fz_warn(ctx, "Malformed table data");
  557. }
  558. static void
  559. fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
  560. {
  561. int w = block->bbox.x1 - block->bbox.x0;
  562. int h = block->bbox.y1 - block->bbox.y0;
  563. fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
  564. fz_write_image_as_data_uri(ctx, out, block->u.i.image);
  565. fz_write_string(ctx, out, "\"/></p>\n");
  566. }
  567. static void
  568. fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
  569. {
  570. int is_mono = fz_font_is_monospaced(ctx, font);
  571. int is_bold = fz_font_is_bold(ctx, font);
  572. int is_italic = fz_font_is_italic(ctx, font);
  573. if (sup)
  574. fz_write_string(ctx, out, "<sup>");
  575. if (is_mono)
  576. fz_write_string(ctx, out, "<tt>");
  577. if (is_bold)
  578. fz_write_string(ctx, out, "<b>");
  579. if (is_italic)
  580. fz_write_string(ctx, out, "<i>");
  581. }
  582. static void
  583. fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
  584. {
  585. int is_mono = fz_font_is_monospaced(ctx, font);
  586. int is_bold = fz_font_is_bold(ctx, font);
  587. int is_italic = fz_font_is_italic(ctx, font);
  588. if (is_italic)
  589. fz_write_string(ctx, out, "</i>");
  590. if (is_bold)
  591. fz_write_string(ctx, out, "</b>");
  592. if (is_mono)
  593. fz_write_string(ctx, out, "</tt>");
  594. if (sup)
  595. fz_write_string(ctx, out, "</sup>");
  596. }
  597. static float avg_font_size_of_line(fz_stext_char *ch)
  598. {
  599. float size = 0;
  600. int n = 0;
  601. if (!ch)
  602. return 0;
  603. while (ch)
  604. {
  605. size += ch->size;
  606. ++n;
  607. ch = ch->next;
  608. }
  609. return size / n;
  610. }
  /*
   * Pick an element name from a font size: "h1" for 20 and above, "h2" for
   * 15 and above, "h3" for 12 and above, otherwise "p".
   *
   * Callers compare the returned pointers for equality, so each name is
   * returned from a single fixed location.
   */
  static const char *tag_from_font_size(float size)
  {
      static const struct
      {
          float min_size;
          const char *tag;
      } headings[] = {
          { 20, "h1" },
          { 15, "h2" },
          { 12, "h3" },
      };
      size_t i;

      for (i = 0; i < sizeof headings / sizeof headings[0]; i++)
          if (size >= headings[i].min_size)
              return headings[i].tag;
      return "p";
  }
  618. static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
  619. {
  620. fz_stext_line *line;
  621. fz_stext_char *ch;
  622. fz_font *font = NULL;
  623. int sup = 0;
  624. int sp = 1;
  625. const char *tag = NULL;
  626. const char *new_tag;
  627. for (line = block->u.t.first_line; line; line = line->next)
  628. {
  629. new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
  630. if (tag != new_tag)
  631. {
  632. if (tag)
  633. {
  634. if (font)
  635. fz_print_style_end_xhtml(ctx, out, font, sup);
  636. fz_write_printf(ctx, out, "</%s>", tag);
  637. }
  638. tag = new_tag;
  639. fz_write_printf(ctx, out, "<%s>", tag);
  640. if (font)
  641. fz_print_style_begin_xhtml(ctx, out, font, sup);
  642. }
  643. if (!sp)
  644. fz_write_byte(ctx, out, ' ');
  645. for (ch = line->first_char; ch; ch = ch->next)
  646. {
  647. int ch_sup = detect_super_script(line, ch);
  648. if (ch->font != font || ch_sup != sup)
  649. {
  650. if (font)
  651. fz_print_style_end_xhtml(ctx, out, font, sup);
  652. font = ch->font;
  653. sup = ch_sup;
  654. fz_print_style_begin_xhtml(ctx, out, font, sup);
  655. }
  656. sp = (ch->c == ' ');
  657. switch (ch->c)
  658. {
  659. default:
  660. if (ch->c >= 32 && ch->c <= 127)
  661. fz_write_byte(ctx, out, ch->c);
  662. else
  663. fz_write_printf(ctx, out, "&#x%x;", ch->c);
  664. break;
  665. case '<': fz_write_string(ctx, out, "&lt;"); break;
  666. case '>': fz_write_string(ctx, out, "&gt;"); break;
  667. case '&': fz_write_string(ctx, out, "&amp;"); break;
  668. case '"': fz_write_string(ctx, out, "&quot;"); break;
  669. case '\'': fz_write_string(ctx, out, "&apos;"); break;
  670. }
  671. }
  672. }
  673. if (font)
  674. fz_print_style_end_xhtml(ctx, out, font, sup);
  675. fz_write_printf(ctx, out, "</%s>\n", tag);
  676. }
  677. static void
  678. fz_print_struct_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
  679. {
  680. const char *tag;
  681. if (block->u.s.down == NULL)
  682. return;
  683. if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
  684. {
  685. fz_print_stext_table_as_xhtml(ctx, out, block->u.s.down->first_block);
  686. return;
  687. }
  688. tag = html_tag_for_struct(block->u.s.down);
  689. fz_write_printf(ctx, out, "<%s>\n", tag);
  690. run_to_xhtml(ctx, block->u.s.down->first_block, out);
  691. fz_write_printf(ctx, out, "</%s>\n", tag);
  692. }
  693. static void
  694. run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out)
  695. {
  696. while (block)
  697. {
  698. switch(block->type)
  699. {
  700. case FZ_STEXT_BLOCK_IMAGE:
  701. fz_print_stext_image_as_xhtml(ctx, out, block);
  702. break;
  703. case FZ_STEXT_BLOCK_TEXT:
  704. fz_print_stext_block_as_xhtml(ctx, out, block);
  705. break;
  706. case FZ_STEXT_BLOCK_STRUCT:
  707. fz_print_struct_as_xhtml(ctx, out, block);
  708. break;
  709. }
  710. block = block->next;
  711. }
  712. }
  713. void
  714. fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
  715. {
  716. fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
  717. run_to_xhtml(ctx, page->first_block, out);
  718. fz_write_string(ctx, out, "</div>\n");
  719. }
  720. void
  721. fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
  722. {
  723. fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
  724. fz_write_string(ctx, out, "<!DOCTYPE html");
  725. fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
  726. fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
  727. fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
  728. fz_write_string(ctx, out, "<head>\n");
  729. fz_write_string(ctx, out, "<style>\n");
  730. fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
  731. fz_write_string(ctx, out, "</style>\n");
  732. fz_write_string(ctx, out, "</head>\n");
  733. fz_write_string(ctx, out, "<body>\n");
  734. }
  735. void
  736. fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
  737. {
  738. fz_write_string(ctx, out, "</body>\n");
  739. fz_write_string(ctx, out, "</html>\n");
  740. }
  741. /* Detailed XML dump of the entire structured text data */
  742. static void
  743. xml_write_char(fz_context *ctx, fz_output *out, int c)
  744. {
  745. switch (c)
  746. {
  747. case '<': fz_write_string(ctx, out, "&lt;"); break;
  748. case '>': fz_write_string(ctx, out, "&gt;"); break;
  749. case '&': fz_write_string(ctx, out, "&amp;"); break;
  750. case '"': fz_write_string(ctx, out, "&quot;"); break;
  751. case '\'': fz_write_string(ctx, out, "&apos;"); break;
  752. default:
  753. if (c >= 32 && c <= 127)
  754. fz_write_printf(ctx, out, "%c", c);
  755. else
  756. fz_write_printf(ctx, out, "&#x%x;", c);
  757. break;
  758. }
  759. }
  760. static void
  761. as_xml(fz_context *ctx, fz_stext_block *block, fz_output *out)
  762. {
  763. fz_stext_line *line;
  764. fz_stext_char *ch;
  765. int i;
  766. while (block)
  767. {
  768. switch (block->type)
  769. {
  770. case FZ_STEXT_BLOCK_TEXT:
  771. fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\"",
  772. block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
  773. if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
  774. fz_write_printf(ctx, out, " justify=\"unknown\"");
  775. if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_LEFT)
  776. fz_write_printf(ctx, out, " justify=\"left\"");
  777. if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_CENTRE)
  778. fz_write_printf(ctx, out, " justify=\"centre\"");
  779. if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_RIGHT)
  780. fz_write_printf(ctx, out, " justify=\"right\"");
  781. if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
  782. fz_write_printf(ctx, out, " justify=\"full\"");
  783. fz_write_printf(ctx, out, ">\n");
  784. for (line = block->u.t.first_line; line; line = line->next)
  785. {
  786. fz_font *font = NULL;
  787. float size = 0;
  788. const char *name = NULL;
  789. fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\"",
  790. line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
  791. line->wmode,
  792. line->dir.x, line->dir.y);
  793. /* This is duplication of information, but it makes it MUCH easier to search for
  794. * text fragments in large output. */
  795. {
  796. int valid = 1;
  797. fz_write_printf(ctx, out, " text=\"");
  798. for (ch = line->first_char; ch; ch = ch->next)
  799. {
  800. if (valid)
  801. valid = fz_is_valid_xml_char(ch->c);
  802. xml_write_char(ctx, out, fz_range_limit_xml_char(ch->c));
  803. }
  804. if (!valid)
  805. {
  806. fz_write_printf(ctx, out, "\" hextext=\"");
  807. for (ch = line->first_char; ch; ch = ch->next)
  808. {
  809. char text[8];
  810. int n = fz_runetochar(text, ch->c);
  811. for (i = 0; i < n; i++)
  812. fz_write_printf(ctx, out, "%02x", text[i]);
  813. }
  814. }
  815. fz_write_printf(ctx, out, "\"");
  816. }
  817. fz_write_printf(ctx, out, ">\n");
  818. for (ch = line->first_char; ch; ch = ch->next)
  819. {
  820. if (ch->font != font || ch->size != size)
  821. {
  822. const char *s;
  823. if (font)
  824. fz_write_string(ctx, out, "</font>\n");
  825. font = ch->font;
  826. size = ch->size;
  827. s = name = font_full_name(ctx, font);
  828. while (*s)
  829. {
  830. int c = *s++;
  831. if (c < 32 || c >= 127)
  832. break;
  833. }
  834. if (*s)
  835. fz_write_printf(ctx, out, "<font hexname=%>", name);
  836. else
  837. fz_write_printf(ctx, out, "<font name=\"%s\"", name);
  838. fz_write_printf(ctx, out, " size=\"%g\">\n", size);
  839. }
  840. fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" alpha=\"#%02x\" flags=\"%d\" c=\"",
  841. ch->quad.ul.x, ch->quad.ul.y,
  842. ch->quad.ur.x, ch->quad.ur.y,
  843. ch->quad.ll.x, ch->quad.ll.y,
  844. ch->quad.lr.x, ch->quad.lr.y,
  845. ch->origin.x, ch->origin.y,
  846. ch->bidi,
  847. ch->argb & 0xFFFFFF,
  848. ch->argb>>24,
  849. ch->flags);
  850. xml_write_char(ctx, out, ch->c);
  851. if (!fz_is_valid_xml_char(ch->c))
  852. {
  853. char text[8];
  854. int n = fz_runetochar(text, ch->c);
  855. fz_write_string(ctx, out, "\" hexc=\"");
  856. for (i = 0; i < n; i++)
  857. fz_write_printf(ctx, out, "%02x", text[i]);
  858. }
  859. fz_write_string(ctx, out, "\"/>\n");
  860. }
  861. if (font)
  862. fz_write_string(ctx, out, "</font>\n");
  863. fz_write_string(ctx, out, "</line>\n");
  864. }
  865. fz_write_string(ctx, out, "</block>\n");
  866. break;
  867. case FZ_STEXT_BLOCK_IMAGE:
  868. fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
  869. block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
  870. break;
  871. case FZ_STEXT_BLOCK_STRUCT:
  872. fz_write_printf(ctx, out, "<struct idx=\"%d\" bbox=\"%g %g %g %g\"", block->u.s.index,
  873. block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
  874. if (block->u.s.down)
  875. fz_write_printf(ctx, out, " raw=\"%s\" std=\"%s\"",
  876. block->u.s.down->raw, fz_structure_to_string(block->u.s.down->standard));
  877. fz_write_printf(ctx, out, ">\n");
  878. if (block->u.s.down)
  879. as_xml(ctx, block->u.s.down->first_block, out);
  880. fz_write_printf(ctx, out, "</struct>\n");
  881. break;
  882. case FZ_STEXT_BLOCK_VECTOR:
  883. fz_write_printf(ctx, out, "<vector bbox=\"%g %g %g %g\" stroke=\"%d\" rectangle=\"%d\" continues=\"%d\" argb=\"%08x\"/>\n",
  884. block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1,
  885. !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED),
  886. !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE),
  887. !!(block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES),
  888. block->u.v.argb);
  889. break;
  890. case FZ_STEXT_BLOCK_GRID:
  891. fz_write_printf(ctx, out, "<grid xpos=\"");
  892. for (i = 0; i < block->u.b.xs->len; i++)
  893. fz_write_printf(ctx, out, "%g ", block->u.b.xs->list[i].pos);
  894. fz_write_printf(ctx, out, "\" xuncertainty=\"");
  895. for (i = 0; i < block->u.b.xs->len; i++)
  896. fz_write_printf(ctx, out, "%d ", block->u.b.xs->list[i].uncertainty);
  897. fz_write_printf(ctx, out, "\" xmaxuncertainty=\"%d\" ypos=\"", block->u.b.xs->max_uncertainty);
  898. for (i = 0; i < block->u.b.ys->len; i++)
  899. fz_write_printf(ctx, out, "%g ", block->u.b.ys->list[i].pos);
  900. fz_write_printf(ctx, out, "\" yuncertainty=\"");
  901. for (i = 0; i < block->u.b.ys->len; i++)
  902. fz_write_printf(ctx, out, "%d ", block->u.b.ys->list[i].uncertainty);
  903. fz_write_printf(ctx, out, "\" ymaxuncertainty=\"%d\" />\n", block->u.b.ys->max_uncertainty);
  904. break;
  905. }
  906. block = block->next;
  907. }
  908. }
  909. void
  910. fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
  911. {
  912. fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
  913. page->mediabox.x1 - page->mediabox.x0,
  914. page->mediabox.y1 - page->mediabox.y0);
  915. as_xml(ctx, page->first_block, out);
  916. fz_write_string(ctx, out, "</page>\n");
  917. }
  918. /* JSON dump */
  919. static void
  920. as_json(fz_context *ctx, fz_stext_block *block, fz_output *out, float scale)
  921. {
  922. fz_stext_line *line;
  923. fz_stext_char *ch;
  924. int comma = 0;
  925. while (block)
  926. {
  927. if (comma)
  928. fz_write_string(ctx, out, ",");
  929. comma = 1;
  930. switch (block->type)
  931. {
  932. case FZ_STEXT_BLOCK_TEXT:
  933. fz_write_printf(ctx, out, "{%q:%q,", "type", "text");
  934. fz_write_printf(ctx, out, "%q:{", "bbox");
  935. fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
  936. fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
  937. fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
  938. fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
  939. fz_write_printf(ctx, out, "%q:[", "lines");
  940. for (line = block->u.t.first_line; line; line = line->next)
  941. {
  942. if (line != block->u.t.first_line)
  943. fz_write_string(ctx, out, ",");
  944. fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode);
  945. fz_write_printf(ctx, out, "%q:{", "bbox");
  946. fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale));
  947. fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale));
  948. fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale));
  949. fz_write_printf(ctx, out, "%q:%d},", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale));
  950. /* Since we force preserve-spans, the first char has the style for the entire line. */
  951. if (line->first_char)
  952. {
  953. fz_font *font = line->first_char->font;
  954. char *font_family = "sans-serif";
  955. char *font_weight = "normal";
  956. char *font_style = "normal";
  957. if (fz_font_is_monospaced(ctx, font)) font_family = "monospace";
  958. else if (fz_font_is_serif(ctx, font)) font_family = "serif";
  959. if (fz_font_is_bold(ctx, font)) font_weight = "bold";
  960. if (fz_font_is_italic(ctx, font)) font_style = "italic";
  961. fz_write_printf(ctx, out, "%q:{", "font");
  962. fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font));
  963. fz_write_printf(ctx, out, "%q:%q,", "family", font_family);
  964. fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight);
  965. fz_write_printf(ctx, out, "%q:%q,", "style", font_style);
  966. fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale));
  967. fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale));
  968. fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale));
  969. }
  970. fz_write_printf(ctx, out, "%q:\"", "text");
  971. for (ch = line->first_char; ch; ch = ch->next)
  972. {
  973. if (ch->c == '"' || ch->c == '\\')
  974. fz_write_printf(ctx, out, "\\%c", ch->c);
  975. else if (ch->c < 32)
  976. fz_write_printf(ctx, out, "\\u%04x", ch->c);
  977. else
  978. fz_write_printf(ctx, out, "%C", ch->c);
  979. }
  980. fz_write_printf(ctx, out, "\"}");
  981. }
  982. fz_write_string(ctx, out, "]}");
  983. break;
  984. case FZ_STEXT_BLOCK_IMAGE:
  985. fz_write_printf(ctx, out, "{%q:%q,", "type", "image");
  986. fz_write_printf(ctx, out, "%q:{", "bbox");
  987. fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
  988. fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
  989. fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
  990. fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
  991. break;
  992. case FZ_STEXT_BLOCK_STRUCT:
  993. fz_write_printf(ctx, out, "{%q:%q,", "type", "structure");
  994. fz_write_printf(ctx, out, "%q:%d", "index", block->u.s.index);
  995. if (block->u.s.down)
  996. {
  997. fz_write_printf(ctx, out, ",%q:%q", "raw", block->u.s.down->raw);
  998. fz_write_printf(ctx, out, ",%q:%q", "std", fz_structure_to_string(block->u.s.down->standard));
  999. fz_write_printf(ctx, out, ",%q:[", "contents");
  1000. as_json(ctx, block->u.s.down->first_block, out, scale);
  1001. fz_write_printf(ctx, out, "]");
  1002. }
  1003. fz_write_printf(ctx, out, "}");
  1004. break;
  1005. }
  1006. block = block->next;
  1007. }
  1008. }
  1009. void
  1010. fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
  1011. {
  1012. fz_write_printf(ctx, out, "{%q:[", "blocks");
  1013. as_json(ctx, page->first_block, out, scale);
  1014. fz_write_string(ctx, out, "]}");
  1015. }
  1016. /* Plain text */
  1017. static void
  1018. do_as_text(fz_context *ctx, fz_output *out, fz_stext_block *first_block)
  1019. {
  1020. fz_stext_block *block;
  1021. fz_stext_line *line;
  1022. fz_stext_char *ch;
  1023. char utf[10];
  1024. int i, n;
  1025. for (block = first_block; block; block = block->next)
  1026. {
  1027. switch (block->type)
  1028. {
  1029. case FZ_STEXT_BLOCK_TEXT:
  1030. for (line = block->u.t.first_line; line; line = line->next)
  1031. {
  1032. for (ch = line->first_char; ch; ch = ch->next)
  1033. {
  1034. n = fz_runetochar(utf, ch->c);
  1035. for (i = 0; i < n; i++)
  1036. fz_write_byte(ctx, out, utf[i]);
  1037. }
  1038. fz_write_string(ctx, out, "\n");
  1039. }
  1040. fz_write_string(ctx, out, "\n");
  1041. break;
  1042. case FZ_STEXT_BLOCK_STRUCT:
  1043. if (block->u.s.down != NULL)
  1044. do_as_text(ctx, out, block->u.s.down->first_block);
  1045. break;
  1046. }
  1047. }
  1048. }
  1049. void
  1050. fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
  1051. {
  1052. do_as_text(ctx, out, page->first_block);
  1053. }
  1054. /* Text output writer */
  /* Output formats supported by the text document writer. */
  enum
  {
      FZ_FORMAT_TEXT = 0,     /* plain text */
      FZ_FORMAT_HTML,         /* HTML */
      FZ_FORMAT_XHTML,        /* XHTML */
      FZ_FORMAT_STEXT_XML,    /* structured text as XML */
      FZ_FORMAT_STEXT_JSON,   /* structured text as JSON */
  };
  1062. typedef struct
  1063. {
  1064. fz_document_writer super;
  1065. int format;
  1066. int number;
  1067. fz_stext_options opts;
  1068. fz_stext_page *page;
  1069. fz_output *out;
  1070. } fz_text_writer;
  1071. static fz_device *
  1072. text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
  1073. {
  1074. fz_text_writer *wri = (fz_text_writer*)wri_;
  1075. float s = wri->opts.scale;
  1076. if (wri->page)
  1077. {
  1078. fz_drop_stext_page(ctx, wri->page);
  1079. wri->page = NULL;
  1080. }
  1081. wri->number++;
  1082. wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s)));
  1083. return fz_new_stext_device(ctx, wri->page, &wri->opts);
  1084. }
  1085. static void
  1086. text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
  1087. {
  1088. fz_text_writer *wri = (fz_text_writer*)wri_;
  1089. float s = wri->opts.scale;
  1090. fz_scale_stext_page(ctx, wri->page, s);
  1091. fz_try(ctx)
  1092. {
  1093. fz_close_device(ctx, dev);
  1094. switch (wri->format)
  1095. {
  1096. default:
  1097. case FZ_FORMAT_TEXT:
  1098. fz_print_stext_page_as_text(ctx, wri->out, wri->page);
  1099. break;
  1100. case FZ_FORMAT_HTML:
  1101. fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
  1102. break;
  1103. case FZ_FORMAT_XHTML:
  1104. fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
  1105. break;
  1106. case FZ_FORMAT_STEXT_XML:
  1107. fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
  1108. break;
  1109. case FZ_FORMAT_STEXT_JSON:
  1110. if (wri->number > 1)
  1111. fz_write_string(ctx, wri->out, ",");
  1112. fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1);
  1113. break;
  1114. }
  1115. }
  1116. fz_always(ctx)
  1117. {
  1118. fz_drop_device(ctx, dev);
  1119. fz_drop_stext_page(ctx, wri->page);
  1120. wri->page = NULL;
  1121. }
  1122. fz_catch(ctx)
  1123. fz_rethrow(ctx);
  1124. }
  1125. static void
  1126. text_close_writer(fz_context *ctx, fz_document_writer *wri_)
  1127. {
  1128. fz_text_writer *wri = (fz_text_writer*)wri_;
  1129. switch (wri->format)
  1130. {
  1131. case FZ_FORMAT_HTML:
  1132. fz_print_stext_trailer_as_html(ctx, wri->out);
  1133. break;
  1134. case FZ_FORMAT_XHTML:
  1135. fz_print_stext_trailer_as_xhtml(ctx, wri->out);
  1136. break;
  1137. case FZ_FORMAT_STEXT_XML:
  1138. fz_write_string(ctx, wri->out, "</document>\n");
  1139. break;
  1140. case FZ_FORMAT_STEXT_JSON:
  1141. fz_write_string(ctx, wri->out, "]\n");
  1142. break;
  1143. }
  1144. fz_close_output(ctx, wri->out);
  1145. }
  1146. static void
  1147. text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
  1148. {
  1149. fz_text_writer *wri = (fz_text_writer*)wri_;
  1150. fz_drop_stext_page(ctx, wri->page);
  1151. fz_drop_output(ctx, wri->out);
  1152. }
  1153. fz_document_writer *
  1154. fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options)
  1155. {
  1156. fz_text_writer *wri = NULL;
  1157. fz_var(wri);
  1158. fz_try(ctx)
  1159. {
  1160. wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
  1161. fz_parse_stext_options(ctx, &wri->opts, options);
  1162. wri->format = FZ_FORMAT_TEXT;
  1163. if (!strcmp(format, "text"))
  1164. wri->format = FZ_FORMAT_TEXT;
  1165. else if (!strcmp(format, "html"))
  1166. wri->format = FZ_FORMAT_HTML;
  1167. else if (!strcmp(format, "xhtml"))
  1168. wri->format = FZ_FORMAT_XHTML;
  1169. else if (!strcmp(format, "stext"))
  1170. wri->format = FZ_FORMAT_STEXT_XML;
  1171. else if (!strcmp(format, "stext.xml"))
  1172. wri->format = FZ_FORMAT_STEXT_XML;
  1173. else if (!strcmp(format, "stext.json"))
  1174. {
  1175. wri->format = FZ_FORMAT_STEXT_JSON;
  1176. wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS;
  1177. }
  1178. wri->out = out;
  1179. switch (wri->format)
  1180. {
  1181. case FZ_FORMAT_HTML:
  1182. fz_print_stext_header_as_html(ctx, wri->out);
  1183. break;
  1184. case FZ_FORMAT_XHTML:
  1185. fz_print_stext_header_as_xhtml(ctx, wri->out);
  1186. break;
  1187. case FZ_FORMAT_STEXT_XML:
  1188. fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
  1189. fz_write_string(ctx, wri->out, "<document>\n");
  1190. break;
  1191. case FZ_FORMAT_STEXT_JSON:
  1192. fz_write_string(ctx, wri->out, "[");
  1193. break;
  1194. }
  1195. }
  1196. fz_catch(ctx)
  1197. {
  1198. fz_drop_output(ctx, out);
  1199. fz_free(ctx, wri);
  1200. fz_rethrow(ctx);
  1201. }
  1202. return (fz_document_writer*)wri;
  1203. }
  1204. fz_document_writer *
  1205. fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options)
  1206. {
  1207. fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
  1208. return fz_new_text_writer_with_output(ctx, format, out, options);
  1209. }