| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584 |
- // Copyright (C) 2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include <assert.h>
- /* #define DEBUG_SPLITS */
- /* #define DEBUG_PARA_SPLITS */
- static void
- recalc_bbox(fz_stext_block *block)
- {
- fz_rect bbox = fz_empty_rect;
- fz_stext_line *line;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- bbox = fz_union_rect(bbox, line->bbox);
- block->bbox = bbox;
- }
/* Underline status accumulated over the characters of the horizontal
 * line currently being scanned. */
typedef enum
{
	UNDERLINE_UNKNOWN = 0,	/* Nothing examined yet on this line. */
	UNDERLINE_YES = 1,	/* Every character seen so far is underlined. */
	UNDERLINE_NO = 2,	/* No character seen so far is underlined. */
	UNDERLINE_MIXED = 3	/* Both underlined and plain characters seen. */
} underline_state;
- /* Some crap heuristics to spot a bold font. */
- static int
- font_is_bold(fz_font *font)
- {
- const char *c;
- if (font == NULL)
- return 0;
- if (font->flags.is_bold)
- return 1;
- if (fz_strstrcase(font->name, "Bold") != NULL)
- return 1;
- if (fz_strstrcase(font->name, "Black") != NULL)
- return 1;
- if (fz_strstrcase(font->name, "Medium") != NULL)
- return 0;
- if (fz_strstrcase(font->name, "Light") != NULL)
- return 0;
- c = fz_strstr(font->name, " B");
- if (c && (c[2] == ' ' || c[2] == 0))
- return 1;
- return 0;
- }
- /* Check to see if lines move left to right and downwards. */
- /* FIXME: Maybe allow right to left? checking unicode values? */
- static int
- lines_move_plausibly_like_paragraph(fz_stext_block *block)
- {
- fz_stext_line *line;
- int firstline = 1;
- float line_height, line_x, line_y;
- /* Do the lines that make up this block move in an appropriate way? */
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- float x = (line->bbox.x0 + line->bbox.x1)/2;
- float y = (line->bbox.y0 + line->bbox.y1)/2;
- float height = line->bbox.y1 - line->bbox.y0;
- fz_stext_char *ch;
- /* Ignore any completely empty lines */
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- if (ch->c != ' ')
- break;
- if (ch == NULL)
- continue;
- if (firstline)
- {
- line_height = height;
- line_x = x;
- line_y = y;
- firstline = 0;
- }
- else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
- {
- /* We are plausibly the same line. Only accept if we move right. */
- if (x < line_x)
- return 0;
- else
- line_x = x;
- }
- else if (line_y < y)
- {
- /* Moving downwards. Plausible. */
- line_y = y;
- line_height = height;
- line_x = x;
- }
- else
- {
- /* Nothing else is plausible. */
- return 0;
- }
- }
- return 1;
- }
#ifdef DEBUG_SPLITS
/* Debug helper: write an optional heading (str, if non-NULL) followed by
 * the characters of a line to the debug output. A NULL line prints just
 * the heading. */
static void dump_line(fz_context *ctx, const char *str, fz_stext_line *line)
{
	fz_output *dbg = fz_stddbg(ctx);
	fz_stext_char *ch;

	if (str)
		fz_write_printf(ctx, dbg, "%s\n", str);
	if (line == NULL)
		return;

	/* ch->c is a unicode code point; %C writes it UTF-8 encoded rather
	 * than truncating it to a single byte as a (char) cast would. */
	for (ch = line->first_char; ch != NULL; ch = ch->next)
		fz_write_printf(ctx, dbg, "%C", ch->c);
	fz_write_printf(ctx, dbg, "\n");
}

/* Debug helper: write a heading (fmt, printed verbatim as a string)
 * followed by every line of a text block. Blocks that are NULL or not
 * text blocks print just the heading. */
static void dump_block(fz_context *ctx, const char *fmt, fz_stext_block *block)
{
	fz_stext_line *line;

	fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", fmt);
	if (block == NULL || block->type != FZ_STEXT_BLOCK_TEXT)
		return;

	for (line = block->u.t.first_line; line != NULL; line = line->next)
		dump_line(ctx, NULL, line);
}
#endif
- typedef struct
- {
- fz_pool *pool;
- fz_stext_struct *parent;
- int idx;
- fz_stext_block **pfirst;
- fz_stext_block **plast;
- } stext_pos;
- static fz_stext_block *split_block_at_line(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_stext_line *line)
- {
- fz_stext_block *newblock = fz_pool_alloc(ctx, pos->pool, sizeof *newblock);
- #ifdef DEBUG_SPLITS
- dump_block(ctx, "Splitting:", block);
- dump_line(ctx, "At line:", line);
- #endif
- newblock->bbox = fz_empty_rect;
- newblock->prev = block;
- newblock->next = block->next;
- if (block->next)
- block->next->prev = newblock;
- else
- {
- assert(*pos->plast == block);
- *pos->plast = newblock;
- }
- block->next = newblock;
- newblock->type = FZ_STEXT_BLOCK_TEXT;
- newblock->u.t.flags = block->u.t.flags;
- newblock->u.t.first_line = line;
- newblock->u.t.last_line = block->u.t.last_line;
- block->u.t.last_line = line->prev;
- line->prev->next = NULL;
- line->prev = NULL;
- recalc_bbox(block);
- recalc_bbox(newblock);
- #ifdef DEBUG_SPLITS
- dump_block(ctx, "Giving:", block);
- dump_block(ctx, "and:", newblock);
- #endif
- return newblock;
- }
- /* Convert a block to being a struct that contains just that block. */
- static void block_to_struct(fz_context *ctx, stext_pos *pos, fz_stext_block *block, int structtype)
- {
- fz_stext_struct *str = fz_pool_alloc_flexible(ctx, pos->pool, fz_stext_struct, raw, 1);
- fz_stext_block *new_block = fz_pool_alloc(ctx, pos->pool, sizeof(*new_block));
- str->up = block;
- str->parent = pos->parent;
- str->first_block = new_block;
- str->last_block = new_block;
- str->standard = structtype;
- str->raw[0] = 0;
- new_block->type = block->type;
- new_block->bbox = block->bbox;
- new_block->u = block->u;
- block->type = FZ_STEXT_BLOCK_STRUCT;
- block->u.s.down = str;
- block->u.s.index = pos->idx++;
- }
- /*
- We are going to repeatedly walk the lines that make up a block.
- To reduce the boilerplate here, we'll use a line_walker function.
- This will call a bunch of callbacks as it goes.
- newline_fn Called whenever we move to a new horizontal line (i.e.
- as if we've got a newline). This is not the same as being
- called every fz_stext_line, as we frequently get multiple
- fz_stext_line's on a single horizontal line. If this returns
- 0, execution continues. Return 1 to stop the walking.
- line_fn Called for every fz_stext_line (typically used to process
- characters).
- end_fn Called at the end of the block (with line being the final
- line of the block.
- arg An opaque pointer passed to all the callbacks.
- */
- typedef int (line_walker_newline_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height);
- typedef int (line_walker_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);
- typedef void (line_walker_end_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);
- static void
- line_walker(fz_context *ctx, fz_stext_block *block, line_walker_newline_fn *newline_fn, line_walker_fn *line_fn, line_walker_end_fn *end_fn, void *arg)
- {
- int firstline = 1;
- fz_stext_line *line;
- float line_height, line_y;
- if (block->u.t.first_line == NULL)
- return;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- float y = (line->bbox.y0 + line->bbox.y1)/2;
- float height = line->bbox.y1 - line->bbox.y0;
- if (line->first_char == NULL)
- continue; /* Should never happen, but makes life easier to assume this later. */
- if (firstline)
- {
- line_height = height;
- firstline = 0;
- line_y = y;
- }
- else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
- {
- /* We are plausibly the same horizontal line. */
- }
- else if (line_y < y)
- {
- /* Moving downwards. */
- line_height = height;
- line_y = y;
- if (newline_fn && newline_fn(ctx, block, line, arg, line_height))
- return;
- }
- if (line_fn && line_fn(ctx, block, line, arg))
- return;
- }
- if (end_fn)
- end_fn(ctx, block, block->u.t.last_line, arg);
- }
- /* We scan through the block, collecting lines up that look
- * "title-ish" (by which here, we mean "are completely
- * underlined"). As soon as we finish such a region, we split
- * the block (either before or after it as appropriate), and
- * mark it as a title.
- *
- * e.g.
- *
- * _THIS_IS_LIKELY_A
- * _TITLE_ ___ < BREAK HERE
- * Lorem ipsum dolor sit
- * amet, consectetur
- * adipiscing elit. ___ < BREAK HERE
- * _LIKELY_ANOTHER_TITLE_ ____< BREAK HERE
- * Sed do eiusmod tempor
- * incididunt ut labore
- * et dolore magna aliqua.
- */
- typedef struct
- {
- stext_pos *pos;
- fz_stext_line *title_start;
- fz_stext_line *title_end;
- underline_state underlined;
- int changed;
- } underlined_data;
- static int
- underlined_break(fz_context *ctx, fz_stext_block *block, underlined_data *data)
- {
- fz_stext_line *line;
- /* We have a block that looks like a title. */
- if (data->title_start != block->u.t.first_line)
- {
- /* We need to split the block before title_start */
- line = data->title_start;
- }
- else if (data->title_end != block->u.t.last_line)
- {
- /* We need to split the block after title_end */
- line = data->title_end->next;
- }
- else
- {
- /* This block is already entirely title. */
- line = NULL;
- }
- if (line)
- {
- (void)split_block_at_line(ctx, data->pos, block, line);
- data->changed = 1;
- if (line == data->title_start)
- {
- /* Don't label the latter part as a title yet, we'll do it when
- * we step back in, but we don't know how much of the latter
- * block is title yet. */
- }
- else
- {
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
- }
- }
- else
- {
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
- }
- return 1;
- }
- static int
- underlined_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
- {
- underlined_data *data = (underlined_data *)arg;
- if (data->underlined == UNDERLINE_YES)
- {
- /* Add the line we've just finished to the start/stop region */
- if (data->title_start == NULL)
- data->title_start = line->prev;
- data->title_end = line->prev;
- }
- else if (data->title_start != NULL)
- {
- /* We've reached the end of a title region. */
- return underlined_break(ctx, block, data);
- }
- data->underlined = UNDERLINE_UNKNOWN;
- return 0;
- }
- static int
- underlined_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- underlined_data *data = (underlined_data *)arg;
- fz_stext_char *ch;
- /* If we already know that this line is mixed underlined, then no point in
- * wasting time. */
- if (data->underlined == UNDERLINE_MIXED)
- return 0;
- /* If we haven't started looking yet, prime the value. */
- if (data->underlined == UNDERLINE_UNKNOWN)
- data->underlined = (line->first_char->flags & FZ_STEXT_UNDERLINE) ? UNDERLINE_YES : UNDERLINE_NO;
- /* Check that all the rest of the the chars match our expected value. */
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- if ((!!(ch->flags & FZ_STEXT_UNDERLINE)) ^ (data->underlined == UNDERLINE_YES))
- {
- /* Differs! So, Mixed. */
- data->underlined = UNDERLINE_MIXED;
- break;
- }
- return 0;
- }
- static void
- underlined_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- underlined_data *data = (underlined_data *)arg;
- if (data->underlined == UNDERLINE_YES)
- {
- /* Add the line we've just finished to the start/stop region */
- if (data->title_start == NULL)
- data->title_start = block->u.t.last_line;
- data->title_end = block->u.t.last_line;
- }
- /* If we didn't find a region, bale. */
- if (data->title_start)
- underlined_break(ctx, block, data);
- }
- static int
- detect_underlined_titles(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
- {
- /* Let's do the title scanning, where our criteria is
- * "the entire line is underlined". */
- underlined_data data[1];
- data->pos = pos;
- data->title_start = NULL;
- data->title_end = NULL;
- data->underlined = UNDERLINE_UNKNOWN;
- data->changed = 0;
- line_walker(ctx, block, underlined_newline, underlined_line, underlined_end, data);
- return data->changed;
- }
- /* Now we scan again, where the 'title' criteria is based upon
- * the titles being entirely in a different font. */
- typedef struct
- {
- stext_pos *pos;
- fz_stext_line *title_start;
- fz_stext_line *title_end;
- fz_font *font;
- int changed;
- } font_data;
- #define MIXED_FONT ((fz_font *)1)
- static int
- font_break(fz_context *ctx, fz_stext_block *block, font_data *data)
- {
- fz_stext_line *line;
- /* We have a block that looks like a title. */
- if (data->title_start != block->u.t.first_line)
- {
- /* We need to split the block before title_start */
- line = data->title_start;
- }
- else if (data->title_end != block->u.t.last_line)
- {
- /* We need to split the block after title_end */
- line = data->title_end->next;
- }
- else
- {
- /* This block is already entirely title. */
- line = NULL;
- }
- if (line)
- {
- (void)split_block_at_line(ctx, data->pos, block, line);
- data->changed = 1;
- if (line == data->title_start)
- {
- /* Don't label the latter part as a title yet, we'll do it when
- * we step back in, but we don't know how much of the latter
- * block is title yet. */
- }
- else
- {
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
- }
- }
- else
- {
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
- }
- return 1;
- }
- static int
- font_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
- {
- font_data *data = (font_data *)arg;
- if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
- {
- /* Add the line we've just finished to the start/stop region */
- if (data->title_start == NULL)
- data->title_start = line->prev;
- data->title_end = line->prev;
- }
- else if (data->title_start != NULL)
- {
- /* We've reached the end of a title region. */
- return font_break(ctx, block, data);
- }
- data->font = NULL;
- return 0;
- }
- static int
- font_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- font_data *data = (font_data *)arg;
- fz_stext_char *ch;
- /* If we already know that this line is mixed fonts, then no point in
- * wasting time. */
- if (data->font == MIXED_FONT)
- return 0;
- /* If we are just starting, prime it. */
- if (data->font == NULL)
- data->font = line->first_char->font;
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- if (ch->font != data->font)
- {
- data->font = MIXED_FONT;
- break;
- }
- return 0;
- }
- static void
- font_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- font_data *data = (font_data *)arg;
- if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
- {
- /* Add the line we've just finished to the start/stop region */
- if (data->title_start == NULL)
- data->title_start = block->u.t.last_line;
- data->title_end = block->u.t.last_line;
- }
- if (data->title_start)
- font_break(ctx, block, data);
- }
- static int
- detect_titles_by_font_usage(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
- {
- font_data data[1];
- data->pos = pos;
- data->title_start = NULL;
- data->title_end = NULL;
- data->font = NULL;
- data->changed = 0;
- line_walker(ctx, block, font_newline, font_line, font_end, data);
- return data->changed;
- }
- typedef struct
- {
- fz_rect bbox;
- stext_pos *pos;
- int changed;
- } indent_data;
- static int
- indent_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
- {
- indent_data *data = (indent_data *)arg;
- float indent = line->bbox.x0 - data->bbox.x0;
- if (indent > line_height)
- {
- /* Break the block here! */
- (void)split_block_at_line(ctx, data->pos, block, line);
- data->changed = 1;
- return 1;
- }
- return 0;
- }
- static int
- break_paragraphs_by_indent(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
- {
- indent_data data[1];
- data->pos = pos;
- data->bbox = bbox;
- data->changed = 0;
- line_walker(ctx, block, indent_newline, NULL, NULL, data);
- return data->changed;
- }
- typedef struct
- {
- fz_rect bbox;
- stext_pos *pos;
- float line_gap;
- float prev_line_gap;
- int looking_for_space;
- float space_size;
- int maybe_ends_paragraph;
- int changed;
- } trailing_data;
- static int
- trailing_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
- {
- trailing_data *data = (trailing_data *)arg;
- data->prev_line_gap = data->line_gap;
- if (data->looking_for_space)
- {
- /* We've moved downwards onto a line, and failed to find
- * a space on that line. Presumably that means that whole
- * line is a single word. */
- float line_len = line->bbox.x1 - line->bbox.x0;
- if (line_len + data->space_size < data->prev_line_gap)
- {
- /* We could have fitted this word into the previous line. */
- /* So presumably that was a paragraph break. Split here. */
- (void)split_block_at_line(ctx, data->pos, block, line);
- data->changed = 1;
- return 1;
- }
- data->looking_for_space = 0;
- }
- /* If we the last line we looked at ended plausibly for a paragraph,
- * then look for a space in this line... */
- data->looking_for_space = data->maybe_ends_paragraph;
- return 0;
- }
- static int
- trailing_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- trailing_data *data = (trailing_data *)arg;
- fz_stext_char *ch;
- data->line_gap = data->bbox.x1 - line->bbox.x1;
- if (line->last_char && (
- (line->last_char->c >= 'A' && line->last_char->c <= 'Z') ||
- (line->last_char->c >= 'a' && line->last_char->c <= 'z') ||
- (line->last_char->c >= '0' && line->last_char->c <= '9')))
- {
- /* In Latin text, paragraphs should always end up some form
- * of punctuation. I suspect that's less true of some other
- * languages (particularly far-eastern ones). Let's just say
- * that if we end in A-Za-z0-9 we can't possibly be the last
- * line of a paragraph. */
- data->maybe_ends_paragraph = 0;
- }
- else
- {
- /* Plausibly the next line might be the first line of a new paragraph */
- data->maybe_ends_paragraph = 1;
- }
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- fz_rect r;
- float w, line_len;
- if (ch->c != ' ')
- continue;
- r = fz_rect_from_quad(ch->quad);
- w = r.x1 - r.x0;
- if (w < data->space_size)
- data->space_size = w;
- /* If we aren't looking_for_space, then no point in checking for
- * whether the prefix will fit. But keep looping as we want to
- * continue to refine our idea of how big a space is. */
- if (!data->looking_for_space)
- continue;
- line_len = r.x0 - line->bbox.x0;
- if (line_len + data->space_size < data->prev_line_gap)
- {
- /* We could have fitted this word into the previous line. */
- /* So presumably that was a paragraph break. Split here. */
- (void)split_block_at_line(ctx, data->pos, block, line);
- data->changed = 1;
- return 1;
- }
- data->looking_for_space = 0;
- }
- return 0;
- }
- static int
- break_paragraphs_by_analysing_trailing_gaps(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
- {
- trailing_data data[1];
- data->bbox = bbox;
- data->pos = pos;
- data->line_gap = 0;
- data->prev_line_gap = 0;
- data->looking_for_space = 0;
- data->space_size = 99999;
- data->maybe_ends_paragraph = 0;
- data->changed = 0;
- line_walker(ctx, block, trailing_newline, trailing_line, NULL, data);
- return data->changed;
- }
- typedef struct
- {
- fz_rect bbox;
- stext_pos *pos;
- int count_lines;
- int count_justified;
- int non_digits_exist_in_this_line;
- fz_rect fragment_box;
- fz_rect line_box;
- int gap_count_this_line;
- float gap_size_this_line;
- int bad_gap;
- float xmin, xmax;
- float last_min_space;
- int changed;
- } justify_data;
- #define JUSTIFY_THRESHOLD 1
- static int
- justify_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
- {
- justify_data *data = (justify_data *)arg;
- if (line->prev)
- line = line->prev;
- data->line_box = fz_union_rect(data->line_box, data->fragment_box);
- if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
- data->count_justified++;
- data->non_digits_exist_in_this_line = 0;
- data->count_lines++;
- data->gap_size_this_line = 0;
- data->gap_count_this_line = 0;
- data->fragment_box = fz_empty_rect;
- data->line_box = fz_empty_rect;
- data->xmin = INFINITY;
- data->xmax = -INFINITY;
- return 0;
- }
- static void
- fragment_end(justify_data *data)
- {
- float gap;
- if (fz_is_empty_rect(data->fragment_box))
- {
- /* No fragment. Nothing to do. */
- return;
- }
- if (fz_is_empty_rect(data->line_box))
- {
- /* First fragment of the line; no gap yet. */
- gap = 0;
- }
- else if (data->fragment_box.x0 > data->line_box.x1)
- {
- /* This whole fragment is to the right of the line so far. */
- gap = data->fragment_box.x0 - data->line_box.x1;
- }
- else if (data->fragment_box.x1 < data->line_box.x0)
- {
- /* This whole fragment is the left of the line so far. */
- gap = data->line_box.x1 - data->fragment_box.x0;
- }
- else
- {
- /* Abutting or overlapping fragment. Ignore it. */
- gap = 0;
- }
- data->line_box = fz_union_rect(data->line_box, data->fragment_box);
- data->fragment_box = fz_empty_rect;
- if (gap < data->last_min_space)
- return;
- /* So we have a gap to consider */
- if (data->gap_count_this_line > 0)
- {
- /* Allow for double spaces, cos some layouts put
- * double spaces before full stops. */
- if (fabs(gap - data->gap_size_this_line) > 1 &&
- fabs(gap/2.0 - data->gap_size_this_line) < 1)
- gap /= 2;
- if (fabs(gap - data->gap_size_this_line) > 1)
- data->bad_gap = 1;
- }
- data->gap_size_this_line = (data->gap_size_this_line * data->gap_count_this_line + gap) / (data->gap_count_this_line + 1);
- data->gap_count_this_line++;
- }
- /* This is trickier than you'd imagine. We want to walk the line, looking
- * for how large the spaces are. In a justified line, all the spaces should
- * be pretty much the same size. (Except maybe before periods). But we want
- * to cope with bidirectional text which can send glyphs in unexpected orders.
- * e.g. abc fed ghi
- * So we have to walk over "fragments" at a time.
- */
- static int
- justify_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- justify_data *data = (justify_data *)arg;
- fz_stext_char *ch;
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- fz_rect r = fz_rect_from_quad(ch->quad);
- float min_space = ch->size * 0.15f; /* Matches SPACE_DIST from stext-device. */
- if (ch->c == ' ')
- {
- /* This ends a fragment, but we don't treat it as such.
- * Just continue, because we'll end the fragment next time
- * around the loop (this copes with trailing spaces, and
- * multiple spaces, and gaps between 'lines' that are on
- * the same line. */
- data->last_min_space = min_space;
- continue;
- }
- if ((ch->c <= '0' || ch->c >= '9') && ch->c != '.')
- data->non_digits_exist_in_this_line = 1;
- if (!fz_is_empty_rect(data->fragment_box))
- {
- if (r.x0 > data->fragment_box.x1 + data->last_min_space)
- {
- /* Fragment ends due to gap on right. */
- fragment_end(data);
- }
- else if (r.x1 < data->fragment_box.x0 - data->last_min_space)
- {
- /* Fragment ends due to gap on left. */
- fragment_end(data);
- }
- }
- /* Extend the fragment */
- data->fragment_box = fz_union_rect(data->fragment_box, r);
- data->last_min_space = min_space;
- }
- return 0;
- }
- static void
- justify_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- justify_data *data = (justify_data *)arg;
- fragment_end(data);
- data->line_box = fz_union_rect(data->line_box, data->fragment_box);
- if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
- data->count_justified++;
- data->count_lines++;
- }
- static int
- justify2_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
- {
- justify_data *data = (justify_data *)arg;
- if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD)
- {
- /* Justified */
- }
- else
- {
- /* Break after line */
- (void)split_block_at_line(ctx, data->pos, block, line);
- data->changed = 1;
- return 1;
- }
- data->line_box = fz_empty_rect;
- return 0;
- }
- static int
- justify2_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- justify_data *data = (justify_data *)arg;
- fz_stext_char *ch;
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- if (ch->c == ' ')
- continue;
- data->line_box = fz_union_rect(data->line_box, fz_rect_from_quad(ch->quad));
- }
- return 0;
- }
- static fz_rect
- text_block_marked_bbox(fz_context *ctx, fz_stext_block *block)
- {
- fz_stext_line *line;
- fz_stext_char *ch;
- fz_rect r = fz_empty_rect;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- if (ch->c == ' ')
- continue;
- r = fz_union_rect(r, fz_rect_from_quad(ch->quad));
- }
- }
- return r;
- }
- static int
- break_paragraphs_within_justified_text(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
- {
- justify_data data[1];
- if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
- return 0;
- data->bbox = bbox;
- data->pos = pos;
- data->count_lines = 0;
- data->count_justified = 0;
- data->non_digits_exist_in_this_line = 0;
- data->bad_gap = 0;
- data->gap_size_this_line = 0;
- data->gap_count_this_line = 0;
- data->fragment_box = fz_empty_rect;
- data->line_box = fz_empty_rect;
- data->xmin = INFINITY;
- data->xmax = -INFINITY;
- data->changed = 0;
- line_walker(ctx, block, justify_newline, justify_line, justify_end, data);
- /* We can't really derive anything about single lines! */
- if (data->count_lines < 2)
- return 0;
- /* If at least half of the lines don't appear to be justified, then
- * don't trust 'em. */
- if (data->count_justified * 2 < data->count_lines)
- return 0;
- /* If the "badness" we've seen to do with big gaps (i.e. how much
- * bigger the gaps are than we'd reasonably expect) is too large
- * then we can't be a justified block. We are prepared to forgive
- * larger sizes in larger paragraphs. */
- if (data->bad_gap)
- return 0;
- block->u.t.flags = FZ_STEXT_TEXT_JUSTIFY_FULL;
- line_walker(ctx, block, justify2_newline, justify2_line, NULL, data);
- return data->changed;
- }
/* States of the per-line scan used for list (bullet) detection. The
 * scan is reset to LOOKING_FOR_BULLET at the start of every new line. */
typedef enum
{
	LOOKING_FOR_BULLET = 0,
	LOOKING_FOR_POST_BULLET = 1,
	LOOKING_FOR_POST_NUMERICAL_BULLET = 2,
	FOUND_BULLET = 3,	/* The line just scanned starts with a bullet. */
	CONTINUATION_LINE = 4,
	NO_BULLET = 5		/* Neither a bullet line nor a continuation. */
} list_state;
- typedef struct
- {
- stext_pos *pos;
- list_state state;
- int buffer[10];
- int buffer_fill;
- float bullet_r;
- float post_bullet_indent;
- float l;
- fz_stext_line *bullet_line_start;
- fz_stext_line *this_line_start;
- int changed;
- } list_data;
- static int
- list_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
- {
- list_data *data = (list_data *)arg;
- if (data->state == FOUND_BULLET)
- {
- if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
- {
- /* We need to split the block before the bullet started. */
- (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
- data->changed = 1;
- return 1;
- }
- if (data->bullet_line_start != data->this_line_start)
- {
- /* We've found a second bullet. Break before the previous line. */
- (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
- data->changed = 1;
- return 1;
- }
- }
- else if (data->state == NO_BULLET && data->bullet_line_start)
- {
- /* We've found a bullet before, and the line we've just completed
- * is neither a new bullet line, or a continuation so, we need to
- * break that into a new block. */
- (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
- data->changed = 1;
- return 1;
- }
- data->this_line_start = line;
- data->state = LOOKING_FOR_BULLET;
- data->buffer_fill = 0;
- data->l = block->bbox.x1;
- data->bullet_r = block->bbox.x0;
- return 0;
- }
/*
	Return 1 if a and b differ by no more than the tolerance c,
	and 0 otherwise.
*/
static int
approx_eq(float a, float b, float c)
{
	float distance = fabsf(a - b);

	return distance <= c;
}
/*
	Return 1 if c is one of the letters M, C, L, X, V or I (in either
	case), and 0 otherwise.
*/
static int
is_roman(int c)
{
	static const int numerals[] = {
		'i', 'I', 'v', 'V', 'x', 'X',
		'l', 'L', 'c', 'C', 'm', 'M'
	};
	int k;

	for (k = 0; k < (int)(sizeof(numerals) / sizeof(numerals[0])); k++)
	{
		if (numerals[k] == c)
			return 1;
	}

	return 0;
}
/*
	Classification of the first word on a line.

	NOT_A_BULLET is zero, so a bullet_t can be tested directly as a
	truth value.
*/
typedef enum
{
	NOT_A_BULLET = 0,
	BULLET = 1,
	NUMERICAL_BULLET = 2
} bullet_t;
- static bullet_t
- is_bullet_aux(int *buffer, int len, int contained)
- {
- int i, decimal_pos, decimals_found;
- if (len == 1 && (
- buffer[0] == '*' ||
- buffer[0] == 0x00B7 || /* Middle Dot */
- buffer[0] == 0x2022 || /* Bullet */
- buffer[0] == 0x2023 || /* Triangular Bullet */
- buffer[0] == 0x2043 || /* Hyphen Bullet */
- buffer[0] == 0x204C || /* Back leftwards bullet */
- buffer[0] == 0x204D || /* Back rightwards bullet */
- buffer[0] == 0x2219 || /* Bullet operator */
- buffer[0] == 0x25C9 || /* Fisheye */
- buffer[0] == 0x25CB || /* White circle */
- buffer[0] == 0x25CF || /* Black circle */
- buffer[0] == 0x25D8 || /* Inverse Bullet */
- buffer[0] == 0x25E6 || /* White Bullet */
- buffer[0] == 0x2619 || /* Reversed Rotated Floral Heart Bullet / Fleuron */
- buffer[0] == 0x261a || /* Black left pointing index */
- buffer[0] == 0x261b || /* Black right pointing index */
- buffer[0] == 0x261c || /* White left pointing index */
- buffer[0] == 0x261d || /* White up pointing index */
- buffer[0] == 0x261e || /* White right pointing index */
- buffer[0] == 0x261f || /* White down pointing index */
- buffer[0] == 0x2765 || /* Rotated Heavy Heart Black Heart Bullet */
- buffer[0] == 0x2767 || /* Rotated Floral Heart Bullet / Fleuron */
- buffer[0] == 0x29BE || /* Circled White Bullet */
- buffer[0] == 0x29BF || /* Circled Bullet */
- buffer[0] == 0x2660 || /* Black Spade suit */
- buffer[0] == 0x2661 || /* White Heart suit */
- buffer[0] == 0x2662 || /* White Diamond suit */
- buffer[0] == 0x2663 || /* Black Club suit */
- buffer[0] == 0x2664 || /* White Spade suit */
- buffer[0] == 0x2665 || /* Black Heart suit */
- buffer[0] == 0x2666 || /* Black Diamond suit */
- buffer[0] == 0x2667 || /* White Clud suit */
- buffer[0] == 0x1F446 || /* WHITE UP POINTING BACKHAND INDEX */
- buffer[0] == 0x1F447 || /* WHITE DOWN POINTING BACKHAND INDEX */
- buffer[0] == 0x1F448 || /* WHITE LEFT POINTING BACKHAND INDEX */
- buffer[0] == 0x1F449 || /* WHITE RIGHT POINTING BACKHAND INDEX */
- buffer[0] == 0x1f597 || /* White down pointing left hand index */
- buffer[0] == 0x1F598 || /* SIDEWAYS WHITE LEFT POINTING INDEX */
- buffer[0] == 0x1F599 || /* SIDEWAYS WHITE RIGHT POINTING INDEX */
- buffer[0] == 0x1F59A || /* SIDEWAYS BLACK LEFT POINTING INDEX */
- buffer[0] == 0x1F59B || /* SIDEWAYS BLACK RIGHT POINTING INDEX */
- buffer[0] == 0x1F59C || /* BLACK LEFT POINTING BACKHAND INDEX */
- buffer[0] == 0x1F59D || /* BLACK RIGHT POINTING BACKHAND INDEX */
- buffer[0] == 0x1F59E || /* SIDEWAYS WHITE UP POINTING INDEX */
- buffer[0] == 0x1F59F || /* SIDEWAYS WHITE DOWN POINTING INDEX */
- buffer[0] == 0x1F5A0 || /* SIDEWAYS BLACK UP POINTING INDEX */
- buffer[0] == 0x1F5A1 || /* SIDEWAYS BLACK DOWN POINTING INDEX */
- buffer[0] == 0x1F5A2 || /* BLACK UP POINTING BACKHAND INDEX */
- buffer[0] == 0x1F5A3 || /* BLACK DOWN POINTING BACKHAND INDEX */
- buffer[0] == 0x1FBC1 || /* LEFT THIRD WHITE RIGHT POINTING INDEX */
- buffer[0] == 0x1FBC2 || /* MIDDLE THIRD WHITE RIGHT POINTING INDEX */
- buffer[0] == 0x1FBC3 || /* RIGHT THIRD WHITE RIGHT POINTING INDEX */
- buffer[0] == 0xFFFD || /* UNICODE_REPLACEMENT_CHARACTER */
- 0))
- return BULLET;
- if (!contained)
- {
- if (len > 2 && buffer[0] == '(' && buffer[len-1] == ')')
- return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
- if (len > 2 && buffer[0] == '<' && buffer[len-1] == '>')
- return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
- if (len > 2 && buffer[0] == '[' && buffer[len-1] == ']')
- return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
- if (len > 2 && buffer[0] == '{' && buffer[len-1] == '}')
- return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
- if (len > 1 && buffer[len-1] == ':')
- return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
- if (len > 1 && buffer[len-1] == ')')
- return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
- }
- /* Look for numbers */
- /* Be careful not to interpret rows of numbers, like:
- * 10.02 12.03
- * as bullets.
- */
- decimal_pos = 0;
- decimals_found = 0;
- for (i = 0; i < len; i++)
- {
- if (buffer[i] >= '0' && buffer[i] <= '9')
- {
- }
- else if (buffer[i] == '.')
- {
- decimal_pos = i;
- decimals_found++;
- }
- else
- break;
- }
- if (i == len && decimals_found <= 1)
- return NUMERICAL_BULLET;
- /* or number.something */
- if (decimals_found && i == decimal_pos+1 && i < len)
- return is_bullet_aux(buffer+i, len-i, 0) ? BULLET : NOT_A_BULLET;;
- /* Look for roman */
- for (i = 0; i < len; i++)
- if (!is_roman(buffer[i]))
- break;
- if (i == len)
- return 1;
- /* or roman.something */
- if (buffer[i] == '.' && i < len-1)
- return is_bullet_aux(buffer+i+1, len-i-1, 0) ? BULLET : NOT_A_BULLET;
- /* FIXME: Others. */
- return NOT_A_BULLET;
- }
- static bullet_t
- is_bullet(int *buffer, int len)
- {
- return is_bullet_aux(buffer, len, 0);
- }
- static int
- eval_buffer_for_bullet(fz_context *ctx, list_data *data, float size)
- {
- bullet_t bullet_type;
- bullet_type = is_bullet(data->buffer, data->buffer_fill);
- if (bullet_type == NUMERICAL_BULLET)
- data->state = LOOKING_FOR_POST_NUMERICAL_BULLET;
- else if (bullet_type)
- data->state = LOOKING_FOR_POST_BULLET;
- else
- {
- if (approx_eq(data->l, data->post_bullet_indent, size/2))
- data->state = CONTINUATION_LINE;
- else
- data->state = NO_BULLET;
- return 1;
- }
- return 0;
- }
- static int
- list_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- list_data *data = (list_data *)arg;
- fz_stext_char *ch;
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- fz_rect r = fz_rect_from_quad(ch->quad);
- if (r.x0 < data->l)
- data->l = line->bbox.x0;
- switch (data->state)
- {
- case LOOKING_FOR_BULLET:
- if (ch->c == ' ')
- {
- /* We have a space */
- if (data->buffer_fill == 0)
- continue; /* Just skip leading spaces */
- if (eval_buffer_for_bullet(ctx, data, ch->size))
- return 0;
- }
- else if (data->buffer_fill > 0 && r.x0 - data->bullet_r > ch->size/2)
- {
- /* We have a gap large enough to be a space while we've
- * got something in the buffer. */
- if (eval_buffer_for_bullet(ctx, data, ch->size))
- return 0;
- }
- else if (data->buffer_fill < (int)nelem(data->buffer))
- {
- /* Stick it in the buffer for evaluation later. */
- data->buffer[data->buffer_fill++] = ch->c;
- }
- else
- {
- /* Buffer overflowed. Can't be a bullet. */
- if (approx_eq(data->l, data->post_bullet_indent, ch->size))
- data->state = CONTINUATION_LINE;
- else
- data->state = NO_BULLET;
- return 0;
- }
- data->bullet_r = r.x1;
- break;
- case LOOKING_FOR_POST_BULLET:
- if (ch->c != ' ')
- {
- data->state = FOUND_BULLET;
- if (data->bullet_line_start == NULL)
- data->bullet_line_start = data->this_line_start;
- data->post_bullet_indent = r.x0;
- }
- break;
- case LOOKING_FOR_POST_NUMERICAL_BULLET:
- if (ch->c >= '0' && ch->c <= '9')
- {
- /* Numerical bullets can't be followed by numbers. */
- if (approx_eq(data->l, data->post_bullet_indent, ch->size))
- data->state = CONTINUATION_LINE;
- else
- data->state = NO_BULLET;
- return 0;
- }
- if (ch->c != ' ')
- {
- data->state = FOUND_BULLET;
- if (data->bullet_line_start == NULL)
- data->bullet_line_start = data->this_line_start;
- data->post_bullet_indent = r.x0;
- }
- break;
- default:
- break;
- }
- }
- return 0;
- }
- static void
- list_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
- {
- list_data *data = (list_data *)arg;
- if (data->state == LOOKING_FOR_BULLET)
- {
- eval_buffer_for_bullet(ctx, data, 0);
- /* If we ended up thinking we'd found a bullet, subject to
- * what follows not being of a specific form, then we're
- * fine, because nothing follows us! */
- if (data->state == LOOKING_FOR_POST_NUMERICAL_BULLET ||
- data->state == LOOKING_FOR_POST_BULLET)
- {
- data->state = FOUND_BULLET;
- if (data->bullet_line_start == NULL)
- data->bullet_line_start = data->this_line_start;
- }
- /* FIXME: This block contains just a bullet - not the content
- * for the bullet. We see this with page-12.pdf.
- * <> Rising commitment to battery...
- * committed to in-house battery...
- * developing and manufacturing...
- *
- * The <> is in a whole different DIV to the following text.
- * Really we want to look for if the "next" content (for some
- * definition of next) is on the same line as the bullet. If
- * it is, we want to merge the 2 divs.
- *
- * But that's a really tricky thing to do given the recursive
- * block walk we are current doing. Think about this.
- * For now, we just mark the <> as being a list item.
- */
- }
- if (data->state == FOUND_BULLET)
- {
- if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
- {
- /* We need to split the block before the start of the bullet. */
- (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
- data->changed = 1;
- return;
- }
- if (data->bullet_line_start != data->this_line_start)
- {
- /* We've found a second bullet. Break before the line. */
- (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
- data->changed = 1;
- return;
- }
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
- }
- else if (data->state == NO_BULLET && data->bullet_line_start)
- {
- /* We've found a bullet before, and the line we've just completed
- * is neither a new bullet line, or a continuation so, we need to
- * break that into a new block. */
- (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
- data->changed = 1;
- return;
- }
- else if (data->bullet_line_start)
- {
- /* We've come to the end of the block still in the list item. */
- block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
- }
- }
- static int
- break_list_items(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
- {
- list_data data[1];
- if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
- return 0;
- data->pos = pos;
- data->state = LOOKING_FOR_BULLET;
- data->buffer_fill = 0;
- data->l = block->bbox.x1;
- data->bullet_line_start = NULL;
- data->this_line_start = block->u.t.first_line;
- data->bullet_r = block->bbox.x0;
- data->changed = 0;
- line_walker(ctx, block, list_newline, list_line, list_end, data);
- return data->changed;
- }
- static int
- is_header(fz_structure s)
- {
- return (s == FZ_STRUCTURE_H ||
- s == FZ_STRUCTURE_H1 ||
- s == FZ_STRUCTURE_H2 ||
- s == FZ_STRUCTURE_H3 ||
- s == FZ_STRUCTURE_H4 ||
- s == FZ_STRUCTURE_H5 ||
- s == FZ_STRUCTURE_H6);
- }
- static void
- do_para_break(fz_context *ctx, fz_stext_page *page, fz_stext_block **pfirst, fz_stext_block **plast, fz_stext_struct *parent, int in_header)
- {
- fz_stext_block *block, *next_block;
- stext_pos pos;
- fz_rect bbox;
- pos.pool = page->pool;
- pos.idx = 0;
- pos.pfirst = pfirst;
- pos.plast = plast;
- pos.parent = parent;
- /* First off, in order for us to consider a block to be suitable for paragraph
- * splitting, we want it to be a series of lines moving down the page, (or left
- * to right within a line). */
- for (block = *pfirst; block != NULL; block = next_block)
- {
- next_block = block->next;
- switch (block->type)
- {
- case FZ_STEXT_BLOCK_STRUCT:
- if (block->u.s.index < pos.idx)
- block->u.s.index = pos.idx++;
- else
- pos.idx = block->u.s.index+1;
- if (block->u.s.down)
- {
- int header = in_header | is_header(block->u.s.down->standard);
- do_para_break(ctx, page, &block->u.s.down->first_block, &block->u.s.down->last_block, block->u.s.down, header);
- }
- break;
- case FZ_STEXT_BLOCK_TEXT:
- if (!lines_move_plausibly_like_paragraph(block))
- break;
- #ifdef DEBUG_SPLITS
- dump_block(ctx, "Around the top level block loop:", block);
- #endif
- /* Firstly, and somewhat annoyingly we need to find the bbox of the
- * block that doesn't include for trailing spaces. If we just use
- * the normal bbox, then lines that end in "foo " will end further
- * to the right of lines that end in "ba-", and consequently we'll
- * fail to detect blocks as being justified.
- * See PMC2656817_00002.pdf as an example. */
- bbox = text_block_marked_bbox(ctx, block);
- #ifdef DEBUG_PARA_SPLITS
- {
- fz_stext_line *line;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- fz_stext_char *ch;
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- fz_write_printf(ctx, fz_stddbg(ctx), "%C", ch->c);
- }
- }
- }
- #endif
- /* Think about breaking lines at Titles. */
- /* First, underlined ones. */
- if (detect_underlined_titles(ctx, &pos, block))
- next_block = block->next; /* We split the block! */
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- {
- next_block = block;
- break;
- }
- #ifdef DEBUG_PARA_SPLITS
- fz_write_printf(ctx, fz_stddbg(ctx), "A");
- #endif
- /* Next, ones that use bold fonts. */
- if (!in_header)
- {
- if (detect_titles_by_font_usage(ctx, &pos, block))
- next_block = block->next; /* We split the block! */
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- {
- next_block = block;
- break;
- }
- }
- #ifdef DEBUG_PARA_SPLITS
- fz_write_printf(ctx, fz_stddbg(ctx), "B");
- #endif
- /* Now look at breaking based upon indents */
- if (break_paragraphs_by_indent(ctx, &pos, block, bbox))
- next_block = block->next; /* We split the block! */
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- {
- next_block = block;
- break;
- }
- #ifdef DEBUG_PARA_SPLITS
- fz_write_printf(ctx, fz_stddbg(ctx), "C");
- #endif
- /* Now we're going to look for unindented paragraphs. We do this by
- * considering if the first word on the next line would have fitted
- * into the space left at the end of the previous line. */
- if (break_paragraphs_by_analysing_trailing_gaps(ctx, &pos, block, bbox))
- next_block = block->next; /* We split the block! */
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- {
- next_block = block;
- break;
- }
- #ifdef DEBUG_PARA_SPLITS
- fz_write_printf(ctx, fz_stddbg(ctx), "D");
- #endif
- /* Now look to see if a block looks like fully justified text. If it
- * does, then any line that doesn't reach the right hand side must be
- * a paragraph break. */
- if (break_paragraphs_within_justified_text(ctx, &pos, block, bbox))
- next_block = block->next; /* We split the block! */
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- {
- next_block = block;
- break;
- }
- #ifdef DEBUG_PARA_SPLITS
- fz_write_printf(ctx, fz_stddbg(ctx), "E");
- #endif
- /* Look for bulleted list items. */
- if (break_list_items(ctx, &pos, block))
- next_block = block->next; /* We split the block! */
- break;
- }
- }
- }
- void
- fz_paragraph_break(fz_context *ctx, fz_stext_page *page)
- {
- do_para_break(ctx, page, &page->first_block, &page->last_block, NULL, 0);
- }
|