output-csv.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. // Copyright (C) 2024-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include <zlib.h>
  24. #include <limits.h>
  25. typedef struct
  26. {
  27. fz_document_writer super;
  28. int count;
  29. fz_stext_page *page;
  30. fz_output *out;
  31. fz_stext_options options;
  32. int pagenum;
  33. } fz_csv_writer;
  34. static fz_device *
  35. csv_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
  36. {
  37. fz_csv_writer *wri = (fz_csv_writer*)wri_;
  38. wri->page = fz_new_stext_page(ctx, mediabox);
  39. wri->options.flags |= FZ_STEXT_COLLECT_VECTORS;
  40. wri->options.flags |= FZ_STEXT_ACCURATE_BBOXES;
  41. wri->options.flags |= FZ_STEXT_SEGMENT;
  42. wri->options.flags |= FZ_STEXT_TABLE_HUNT;
  43. return fz_new_stext_device(ctx, wri->page, &wri->options);
  44. }
  45. typedef struct
  46. {
  47. int leading;
  48. int spaces;
  49. } space_data;
  50. static void
  51. output_line(fz_context *ctx, fz_output *out, fz_stext_line *line, space_data *sd)
  52. {
  53. for (; line != NULL; line = line->next)
  54. {
  55. fz_stext_char *ch;
  56. for (ch = line->first_char; ch != NULL; ch = ch->next)
  57. {
  58. if (ch->c == ' ')
  59. {
  60. if (!sd->leading)
  61. sd->spaces++;
  62. continue;
  63. }
  64. sd->leading = 0;
  65. /* Compact all runs of spaces to single ones. */
  66. if (sd->spaces > 0)
  67. {
  68. fz_write_printf(ctx, out, " ");
  69. sd->spaces = 0;
  70. }
  71. if (ch->c == '\"')
  72. {
  73. fz_write_printf(ctx, out, "\"\"");
  74. }
  75. else
  76. {
  77. fz_write_printf(ctx, out, "%C", ch->c);
  78. }
  79. }
  80. }
  81. }
  82. static fz_rect
  83. whitespaceless_bbox(fz_context *ctx, fz_stext_block *block)
  84. {
  85. fz_rect r = fz_empty_rect;
  86. fz_stext_line *line;
  87. fz_stext_char *ch;
  88. for (; block != NULL; block = block->next)
  89. {
  90. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  91. {
  92. if (block->u.s.down)
  93. r = fz_union_rect(r, whitespaceless_bbox(ctx, block->u.s.down->first_block));
  94. continue;
  95. }
  96. if (block->type != FZ_STEXT_BLOCK_TEXT)
  97. {
  98. r = fz_union_rect(r, block->bbox);
  99. continue;
  100. }
  101. for (line = block->u.t.first_line; line != NULL; line = line->next)
  102. {
  103. for (ch = line->first_char; ch != NULL; ch = ch->next)
  104. {
  105. if (ch->c != ' ')
  106. r = fz_union_rect(r, fz_rect_from_quad(ch->quad));
  107. }
  108. }
  109. }
  110. return r;
  111. }
  112. static void
  113. output_td_contents(fz_context *ctx, fz_output *out, fz_stext_block *block, space_data *sd)
  114. {
  115. for (; block != NULL; block = block->next)
  116. {
  117. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  118. {
  119. if (block->u.s.down)
  120. output_td_contents(ctx, out, block->u.s.down->first_block, sd);
  121. continue;
  122. }
  123. if (block->type == FZ_STEXT_BLOCK_TEXT)
  124. output_line(ctx, out, block->u.t.first_line, sd);
  125. }
  126. }
  127. /* We have output up to and including position *pos on entry to this function.
  128. * We preserve that on output. */
  129. static void
  130. output_td(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, int *pos, fz_stext_block *block)
  131. {
  132. int x0, x1;
  133. space_data sd = { 0 };
  134. fz_rect r = whitespaceless_bbox(ctx, block);
  135. if (fz_is_empty_rect(r))
  136. return;
  137. if (block && grid)
  138. {
  139. for (x0 = 0; x0 < grid->u.b.xs->len; x0++)
  140. if (r.x0 < grid->u.b.xs->list[x0].pos)
  141. break;
  142. for (x1 = x0; x1 < grid->u.b.xs->len; x1++)
  143. if (r.x1 <= grid->u.b.xs->list[x1].pos)
  144. break;
  145. x0--;
  146. x1--;
  147. }
  148. else
  149. x0 = *pos+1, x1 = *pos+1;
  150. /* Send enough , to get us to the right position. */
  151. while (*pos < x0)
  152. {
  153. if (*pos >= 0)
  154. fz_write_printf(ctx, wri->out, ",");
  155. *pos = (*pos)+1;
  156. }
  157. fz_write_printf(ctx, wri->out, "\"");
  158. output_td_contents(ctx, wri->out, block, &sd);
  159. fz_write_printf(ctx, wri->out, "\"");
  160. /* Send any extra , to allow for colspans */
  161. while (*pos < x1)
  162. {
  163. fz_write_printf(ctx, wri->out, ",");
  164. *pos = (*pos)+1;
  165. }
  166. }
  167. static void
  168. output_tr(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, fz_stext_block *block)
  169. {
  170. int pos = -1;
  171. for (; block != NULL; block = block->next)
  172. {
  173. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  174. {
  175. if (!block->u.s.down)
  176. continue;
  177. if (block->u.s.down->standard == FZ_STRUCTURE_TD)
  178. output_td(ctx, wri, grid, &pos, block->u.s.down->first_block);
  179. }
  180. }
  181. if (pos != -1)
  182. fz_write_printf(ctx, wri->out, "\n");
  183. }
  184. static void
  185. output_table(fz_context *ctx, fz_csv_writer *wri, fz_rect bbox, fz_stext_block *first)
  186. {
  187. fz_stext_block *block;
  188. fz_stext_block *grid = NULL;
  189. int rows = 0;
  190. fz_try(ctx)
  191. {
  192. /* First, walk to find the div positions */
  193. for (block = first; block != NULL; block = block->next)
  194. {
  195. if (block->type == FZ_STEXT_BLOCK_GRID)
  196. {
  197. grid = block;
  198. break;
  199. }
  200. }
  201. /* Then, count the rows */
  202. for (block = first; block != NULL; block = block->next)
  203. {
  204. if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down != NULL && block->u.s.down->standard == FZ_STRUCTURE_TR)
  205. rows++;
  206. }
  207. fz_write_printf(ctx, wri->out, "Table %d,%d,%d,%g,%g,%g,%g\n",
  208. wri->count++,
  209. rows,
  210. wri->pagenum,
  211. bbox.x0, bbox.y0, bbox.x1, bbox.y1);
  212. /* Then do the output */
  213. for (block = first; block != NULL; block = block->next)
  214. {
  215. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  216. {
  217. if (!block->u.s.down)
  218. continue;
  219. if (block->u.s.down->standard == FZ_STRUCTURE_TR)
  220. output_tr(ctx, wri, grid, block->u.s.down->first_block);
  221. }
  222. }
  223. }
  224. fz_catch(ctx)
  225. fz_rethrow(ctx);
  226. }
  227. static void
  228. output_tables(fz_context *ctx, fz_csv_writer *wri, fz_stext_page *page, fz_stext_block *block)
  229. {
  230. for (; block; block = block->next)
  231. {
  232. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  233. {
  234. if (!block->u.s.down)
  235. continue;
  236. if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
  237. output_table(ctx, wri, block->bbox, block->u.s.down->first_block);
  238. else
  239. output_tables(ctx, wri, page, block->u.s.down->first_block);
  240. }
  241. }
  242. }
  243. static void
  244. csv_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
  245. {
  246. fz_csv_writer *wri = (fz_csv_writer*)wri_;
  247. fz_try(ctx)
  248. {
  249. fz_close_device(ctx, dev);
  250. /* Output UTF-8 BOM */
  251. fz_write_printf(ctx, wri->out, "%C", 0xFEFF);
  252. output_tables(ctx, wri, wri->page, wri->page->first_block);
  253. wri->pagenum++;
  254. }
  255. fz_always(ctx)
  256. {
  257. fz_drop_device(ctx, dev);
  258. }
  259. fz_catch(ctx)
  260. fz_rethrow(ctx);
  261. }
  262. static void
  263. csv_close_writer(fz_context *ctx, fz_document_writer *wri_)
  264. {
  265. fz_csv_writer *wri = (fz_csv_writer*)wri_;
  266. fz_close_output(ctx, wri->out);
  267. }
  268. static void
  269. csv_drop_writer(fz_context *ctx, fz_document_writer *wri_)
  270. {
  271. fz_csv_writer *wri = (fz_csv_writer*)wri_;
  272. fz_drop_output(ctx, wri->out);
  273. }
  274. fz_document_writer *
  275. fz_new_csv_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
  276. {
  277. fz_csv_writer *wri = NULL;
  278. fz_var(wri);
  279. fz_var(out);
  280. fz_try(ctx)
  281. {
  282. wri = fz_new_derived_document_writer(ctx, fz_csv_writer, csv_begin_page, csv_end_page, csv_close_writer, csv_drop_writer);
  283. fz_parse_stext_options(ctx, &wri->options, options);
  284. wri->out = out;
  285. }
  286. fz_catch(ctx)
  287. {
  288. fz_drop_output(ctx, out);
  289. fz_free(ctx, wri);
  290. fz_rethrow(ctx);
  291. }
  292. return (fz_document_writer*)wri;
  293. }
  294. fz_document_writer *
  295. fz_new_csv_writer(fz_context *ctx, const char *path, const char *options)
  296. {
  297. fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.csv", 0);
  298. fz_document_writer *wri = NULL;
  299. fz_try(ctx)
  300. wri = fz_new_csv_writer_with_output(ctx, out, options);
  301. fz_catch(ctx)
  302. {
  303. fz_drop_output(ctx, out);
  304. fz_rethrow(ctx);
  305. }
  306. return wri;
  307. }