| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345 |
- // Copyright (C) 2024-2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include <zlib.h>
- #include <limits.h>
- typedef struct
- {
- fz_document_writer super;
- int count;
- fz_stext_page *page;
- fz_output *out;
- fz_stext_options options;
- int pagenum;
- } fz_csv_writer;
- static fz_device *
- csv_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
- {
- fz_csv_writer *wri = (fz_csv_writer*)wri_;
- wri->page = fz_new_stext_page(ctx, mediabox);
- wri->options.flags |= FZ_STEXT_COLLECT_VECTORS;
- wri->options.flags |= FZ_STEXT_ACCURATE_BBOXES;
- wri->options.flags |= FZ_STEXT_SEGMENT;
- wri->options.flags |= FZ_STEXT_TABLE_HUNT;
- return fz_new_stext_device(ctx, wri->page, &wri->options);
- }
/*
 * Space handling state shared by all the lines and blocks that make up
 * the contents of one table cell.
 */
typedef struct
{
	/* While non-zero, spaces are skipped without being counted.
	 * Cleared as soon as a non-space character is seen. */
	int leading;

	/* Number of spaces counted since the last character was written.
	 * A non-zero count is flushed as a single space before the next
	 * non-space character. */
	int spaces;
} space_data;
- static void
- output_line(fz_context *ctx, fz_output *out, fz_stext_line *line, space_data *sd)
- {
- for (; line != NULL; line = line->next)
- {
- fz_stext_char *ch;
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- if (ch->c == ' ')
- {
- if (!sd->leading)
- sd->spaces++;
- continue;
- }
- sd->leading = 0;
- /* Compact all runs of spaces to single ones. */
- if (sd->spaces > 0)
- {
- fz_write_printf(ctx, out, " ");
- sd->spaces = 0;
- }
- if (ch->c == '\"')
- {
- fz_write_printf(ctx, out, "\"\"");
- }
- else
- {
- fz_write_printf(ctx, out, "%C", ch->c);
- }
- }
- }
- }
- static fz_rect
- whitespaceless_bbox(fz_context *ctx, fz_stext_block *block)
- {
- fz_rect r = fz_empty_rect;
- fz_stext_line *line;
- fz_stext_char *ch;
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down)
- r = fz_union_rect(r, whitespaceless_bbox(ctx, block->u.s.down->first_block));
- continue;
- }
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- {
- r = fz_union_rect(r, block->bbox);
- continue;
- }
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- if (ch->c != ' ')
- r = fz_union_rect(r, fz_rect_from_quad(ch->quad));
- }
- }
- }
- return r;
- }
- static void
- output_td_contents(fz_context *ctx, fz_output *out, fz_stext_block *block, space_data *sd)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down)
- output_td_contents(ctx, out, block->u.s.down->first_block, sd);
- continue;
- }
- if (block->type == FZ_STEXT_BLOCK_TEXT)
- output_line(ctx, out, block->u.t.first_line, sd);
- }
- }
- /* We have output up to and including position *pos on entry to this function.
- * We preserve that on output. */
- static void
- output_td(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, int *pos, fz_stext_block *block)
- {
- int x0, x1;
- space_data sd = { 0 };
- fz_rect r = whitespaceless_bbox(ctx, block);
- if (fz_is_empty_rect(r))
- return;
- if (block && grid)
- {
- for (x0 = 0; x0 < grid->u.b.xs->len; x0++)
- if (r.x0 < grid->u.b.xs->list[x0].pos)
- break;
- for (x1 = x0; x1 < grid->u.b.xs->len; x1++)
- if (r.x1 <= grid->u.b.xs->list[x1].pos)
- break;
- x0--;
- x1--;
- }
- else
- x0 = *pos+1, x1 = *pos+1;
- /* Send enough , to get us to the right position. */
- while (*pos < x0)
- {
- if (*pos >= 0)
- fz_write_printf(ctx, wri->out, ",");
- *pos = (*pos)+1;
- }
- fz_write_printf(ctx, wri->out, "\"");
- output_td_contents(ctx, wri->out, block, &sd);
- fz_write_printf(ctx, wri->out, "\"");
- /* Send any extra , to allow for colspans */
- while (*pos < x1)
- {
- fz_write_printf(ctx, wri->out, ",");
- *pos = (*pos)+1;
- }
- }
- static void
- output_tr(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, fz_stext_block *block)
- {
- int pos = -1;
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (!block->u.s.down)
- continue;
- if (block->u.s.down->standard == FZ_STRUCTURE_TD)
- output_td(ctx, wri, grid, &pos, block->u.s.down->first_block);
- }
- }
- if (pos != -1)
- fz_write_printf(ctx, wri->out, "\n");
- }
- static void
- output_table(fz_context *ctx, fz_csv_writer *wri, fz_rect bbox, fz_stext_block *first)
- {
- fz_stext_block *block;
- fz_stext_block *grid = NULL;
- int rows = 0;
- fz_try(ctx)
- {
- /* First, walk to find the div positions */
- for (block = first; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_GRID)
- {
- grid = block;
- break;
- }
- }
- /* Then, count the rows */
- for (block = first; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down != NULL && block->u.s.down->standard == FZ_STRUCTURE_TR)
- rows++;
- }
- fz_write_printf(ctx, wri->out, "Table %d,%d,%d,%g,%g,%g,%g\n",
- wri->count++,
- rows,
- wri->pagenum,
- bbox.x0, bbox.y0, bbox.x1, bbox.y1);
- /* Then do the output */
- for (block = first; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (!block->u.s.down)
- continue;
- if (block->u.s.down->standard == FZ_STRUCTURE_TR)
- output_tr(ctx, wri, grid, block->u.s.down->first_block);
- }
- }
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- output_tables(fz_context *ctx, fz_csv_writer *wri, fz_stext_page *page, fz_stext_block *block)
- {
- for (; block; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (!block->u.s.down)
- continue;
- if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
- output_table(ctx, wri, block->bbox, block->u.s.down->first_block);
- else
- output_tables(ctx, wri, page, block->u.s.down->first_block);
- }
- }
- }
- static void
- csv_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
- {
- fz_csv_writer *wri = (fz_csv_writer*)wri_;
- fz_try(ctx)
- {
- fz_close_device(ctx, dev);
- /* Output UTF-8 BOM */
- fz_write_printf(ctx, wri->out, "%C", 0xFEFF);
- output_tables(ctx, wri, wri->page, wri->page->first_block);
- wri->pagenum++;
- }
- fz_always(ctx)
- {
- fz_drop_device(ctx, dev);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- csv_close_writer(fz_context *ctx, fz_document_writer *wri_)
- {
- fz_csv_writer *wri = (fz_csv_writer*)wri_;
- fz_close_output(ctx, wri->out);
- }
- static void
- csv_drop_writer(fz_context *ctx, fz_document_writer *wri_)
- {
- fz_csv_writer *wri = (fz_csv_writer*)wri_;
- fz_drop_output(ctx, wri->out);
- }
- fz_document_writer *
- fz_new_csv_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
- {
- fz_csv_writer *wri = NULL;
- fz_var(wri);
- fz_var(out);
- fz_try(ctx)
- {
- wri = fz_new_derived_document_writer(ctx, fz_csv_writer, csv_begin_page, csv_end_page, csv_close_writer, csv_drop_writer);
- fz_parse_stext_options(ctx, &wri->options, options);
- wri->out = out;
- }
- fz_catch(ctx)
- {
- fz_drop_output(ctx, out);
- fz_free(ctx, wri);
- fz_rethrow(ctx);
- }
- return (fz_document_writer*)wri;
- }
- fz_document_writer *
- fz_new_csv_writer(fz_context *ctx, const char *path, const char *options)
- {
- fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.csv", 0);
- fz_document_writer *wri = NULL;
- fz_try(ctx)
- wri = fz_new_csv_writer_with_output(ctx, out, options);
- fz_catch(ctx)
- {
- fz_drop_output(ctx, out);
- fz_rethrow(ctx);
- }
- return wri;
- }
|