sfreundel
/
SharpMuPDF


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584
							// Copyright (C) 2025 Artifex Software, Inc.
//
// This file is part of MuPDF.
//
// MuPDF is free software: you can redistribute it and/or modify it under the
// terms of the GNU Affero General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
// details.
//
// You should have received a copy of the GNU Affero General Public License
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
//
// Alternative licensing terms are available from the licensor.
// For commercial licensing, see <https://www.artifex.com/> or contact
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
// CA 94129, USA, for further information.

#include "mupdf/fitz.h"

#include <assert.h>

/* #define DEBUG_SPLITS */

/* #define DEBUG_PARA_SPLITS */

static void
recalc_bbox(fz_stext_block *block)
{
	fz_rect bbox = fz_empty_rect;
	fz_stext_line *line;

	for (line = block->u.t.first_line; line != NULL; line = line->next)
		bbox = fz_union_rect(bbox, line->bbox);

	block->bbox = bbox;
}

typedef enum
{
	UNDERLINE_UNKNOWN,
	UNDERLINE_YES,
	UNDERLINE_NO,
	UNDERLINE_MIXED
} underline_state;

/* Some crap heuristics to spot a bold font. */
static int
font_is_bold(fz_font *font)
{
	const char *c;

	if (font == NULL)
		return 0;
	if (font->flags.is_bold)
		return 1;

	if (fz_strstrcase(font->name, "Bold") != NULL)
		return 1;
	if (fz_strstrcase(font->name, "Black") != NULL)
		return 1;
	if (fz_strstrcase(font->name, "Medium") != NULL)
		return 0;
	if (fz_strstrcase(font->name, "Light") != NULL)
		return 0;

	c = fz_strstr(font->name, " B");
	if (c && (c[2] == ' ' || c[2] == 0))
		return 1;

	return 0;
}

/* Check to see if lines move left to right and downwards. */
/* FIXME: Maybe allow right to left? checking unicode values? */
static int
lines_move_plausibly_like_paragraph(fz_stext_block *block)
{
	fz_stext_line *line;
	int firstline = 1;
	float line_height, line_x, line_y;

	/* Do the lines that make up this block move in an appropriate way? */
	for (line = block->u.t.first_line; line != NULL; line = line->next)
	{
		float x = (line->bbox.x0 + line->bbox.x1)/2;
		float y = (line->bbox.y0 + line->bbox.y1)/2;
		float height = line->bbox.y1 - line->bbox.y0;
		fz_stext_char *ch;

		/* Ignore any completely empty lines */
		for (ch = line->first_char; ch != NULL; ch = ch->next)
			if (ch->c != ' ')
				break;
		if (ch == NULL)
			continue;

		if (firstline)
		{
			line_height = height;
			line_x = x;
			line_y = y;
			firstline = 0;
		}
		else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
		{
			/* We are plausibly the same line. Only accept if we move right. */
			if (x < line_x)
				return 0;
			else
				line_x = x;
		}
		else if (line_y < y)
		{
			/* Moving downwards. Plausible. */
			line_y = y;
			line_height = height;
			line_x = x;
		}
		else
		{
			/* Nothing else is plausible. */
			return 0;
		}
	}
	return 1;
}

#ifdef DEBUG_SPLITS
static void dump_line(fz_context *ctx, const char *str, fz_stext_line *line)
{
	fz_stext_char *ch;

	if (str)
		fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", str);

	if (line == NULL)
		return;

	for (ch = line->first_char; ch != NULL; ch = ch->next)
		fz_write_printf(ctx, fz_stddbg(ctx), "%c", (char)ch->c);
	fz_write_printf(ctx, fz_stddbg(ctx), "\n");
}

static void dump_block(fz_context *ctx, const char *fmt, fz_stext_block *block)
{
	fz_stext_line *line;

	fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", fmt);
	if (block == NULL || block->type != FZ_STEXT_BLOCK_TEXT)
		return;

	for (line = block->u.t.first_line; line != NULL; line = line->next)
		dump_line(ctx, NULL, line);
}
#endif

typedef struct
{
	fz_pool *pool;
	fz_stext_struct *parent;
	int idx;
	fz_stext_block **pfirst;
	fz_stext_block **plast;
} stext_pos;

static fz_stext_block *split_block_at_line(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_stext_line *line)
{
	fz_stext_block *newblock = fz_pool_alloc(ctx, pos->pool, sizeof *newblock);

#ifdef DEBUG_SPLITS
	dump_block(ctx, "Splitting:", block);
	dump_line(ctx, "At line:", line);
#endif

	newblock->bbox = fz_empty_rect;
	newblock->prev = block;
	newblock->next = block->next;
	if (block->next)
		block->next->prev = newblock;
	else
	{
		assert(*pos->plast == block);
		*pos->plast = newblock;
	}
	block->next = newblock;
	newblock->type = FZ_STEXT_BLOCK_TEXT;
	newblock->u.t.flags = block->u.t.flags;
	newblock->u.t.first_line = line;
	newblock->u.t.last_line = block->u.t.last_line;
	block->u.t.last_line = line->prev;
	line->prev->next = NULL;
	line->prev = NULL;
	recalc_bbox(block);
	recalc_bbox(newblock);

#ifdef DEBUG_SPLITS
	dump_block(ctx, "Giving:", block);
	dump_block(ctx, "and:", newblock);
#endif

	return newblock;
}

/* Convert a block to being a struct that contains just that block. */
static void block_to_struct(fz_context *ctx, stext_pos *pos, fz_stext_block *block, int structtype)
{
	fz_stext_struct *str = fz_pool_alloc_flexible(ctx, pos->pool, fz_stext_struct, raw, 1);
	fz_stext_block *new_block = fz_pool_alloc(ctx, pos->pool, sizeof(*new_block));

	str->up = block;
	str->parent = pos->parent;
	str->first_block = new_block;
	str->last_block = new_block;
	str->standard = structtype;
	str->raw[0] = 0;

	new_block->type = block->type;
	new_block->bbox = block->bbox;
	new_block->u = block->u;

	block->type = FZ_STEXT_BLOCK_STRUCT;
	block->u.s.down = str;
	block->u.s.index = pos->idx++;
}

/*
	We are going to repeatedly walk the lines that make up a block.
	To reduce the boilerplate here, we'll use a line_walker function.
	This will call a bunch of callbacks as it goes.

	newline_fn	Called whenever we move to a new horizontal line (i.e.
			as if we've got a newline). This is not the same as being
			called every fz_stext_line, as we frequently get multiple
			fz_stext_line's on a single horizontal line. If this returns
			0, execution continues. Return 1 to stop the walking.
	line_fn		Called for every fz_stext_line (typically used to process
			characters).
	end_fn		Called at the end of the block (with line being the final
			line of the block.
	arg		An opaque pointer passed to all the callbacks.
*/
typedef int (line_walker_newline_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height);
typedef int (line_walker_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);
typedef void (line_walker_end_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);

static void
line_walker(fz_context *ctx, fz_stext_block *block, line_walker_newline_fn *newline_fn, line_walker_fn *line_fn, line_walker_end_fn *end_fn, void *arg)
{
	int firstline = 1;
	fz_stext_line *line;
	float line_height, line_y;

	if (block->u.t.first_line == NULL)
		return;

	for (line = block->u.t.first_line; line != NULL; line = line->next)
	{
		float y = (line->bbox.y0 + line->bbox.y1)/2;
		float height = line->bbox.y1 - line->bbox.y0;

		if (line->first_char == NULL)
			continue; /* Should never happen, but makes life easier to assume this later. */

		if (firstline)
		{
			line_height = height;
			firstline = 0;
			line_y = y;
		}
		else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
		{
			/* We are plausibly the same horizontal line. */
		}
		else if (line_y < y)
		{
			/* Moving downwards. */
			line_height = height;
			line_y = y;
			if (newline_fn && newline_fn(ctx, block, line, arg, line_height))
				return;
		}
		if (line_fn && line_fn(ctx, block, line, arg))
			return;
	}
	if (end_fn)
		end_fn(ctx, block, block->u.t.last_line, arg);
}

/* We scan through the block, collecting lines up that look
 * "title-ish" (by which here, we mean "are completely
 * underlined"). As soon as we finish such a region, we split
 * the block (either before or after it as appropriate), and
 * mark it as a title.
 *
 * e.g.
 *
 * _THIS_IS_LIKELY_A
 * _TITLE_			___ < BREAK HERE
 * Lorem ipsum dolor sit
 * amet, consectetur
 * adipiscing elit.		___ < BREAK HERE
 * _LIKELY_ANOTHER_TITLE_	____< BREAK HERE
 * Sed do eiusmod tempor
 * incididunt ut labore
 * et dolore magna aliqua.
 */
typedef struct
{
	stext_pos *pos;
	fz_stext_line *title_start;
	fz_stext_line *title_end;
	underline_state underlined;
	int changed;
} underlined_data;

static int
underlined_break(fz_context *ctx, fz_stext_block *block, underlined_data *data)
{
	fz_stext_line *line;

	/* We have a block that looks like a title. */
	if (data->title_start != block->u.t.first_line)
	{
		/* We need to split the block before title_start */
		line = data->title_start;
	}
	else if (data->title_end != block->u.t.last_line)
	{
		/* We need to split the block after title_end */
		line = data->title_end->next;
	}
	else
	{
		/* This block is already entirely title. */
		line = NULL;
	}
	if (line)
	{
		(void)split_block_at_line(ctx, data->pos, block, line);
		data->changed = 1;
		if (line == data->title_start)
		{
			/* Don't label the latter part as a title yet, we'll do it when
			 * we step back in, but we don't know how much of the latter
			 * block is title yet. */
		}
		else
		{
			block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
		}
	}
	else
	{
		block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
	}
	return 1;
}

static int
underlined_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
{
	underlined_data *data = (underlined_data *)arg;

	if (data->underlined == UNDERLINE_YES)
	{
		/* Add the line we've just finished to the start/stop region */
		if (data->title_start == NULL)
			data->title_start = line->prev;
		data->title_end = line->prev;
	}
	else if (data->title_start != NULL)
	{
		/* We've reached the end of a title region. */
		return underlined_break(ctx, block, data);
	}
	data->underlined = UNDERLINE_UNKNOWN;

	return 0;
}

static int
underlined_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	underlined_data *data = (underlined_data *)arg;
	fz_stext_char *ch;

	/* If we already know that this line is mixed underlined, then no point in
	 * wasting time. */
	if (data->underlined == UNDERLINE_MIXED)
		return 0;

	/* If we haven't started looking yet, prime the value. */
	if (data->underlined == UNDERLINE_UNKNOWN)
		data->underlined = (line->first_char->flags & FZ_STEXT_UNDERLINE) ? UNDERLINE_YES : UNDERLINE_NO;

	/* Check that all the rest of the the chars match our expected value. */
	for (ch = line->first_char; ch != NULL; ch = ch->next)
		if ((!!(ch->flags & FZ_STEXT_UNDERLINE)) ^ (data->underlined == UNDERLINE_YES))
		{
			/* Differs! So, Mixed. */
			data->underlined = UNDERLINE_MIXED;
			break;
		}

	return 0;
}

static void
underlined_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	underlined_data *data = (underlined_data *)arg;

	if (data->underlined == UNDERLINE_YES)
	{
		/* Add the line we've just finished to the start/stop region */
		if (data->title_start == NULL)
			data->title_start = block->u.t.last_line;
		data->title_end = block->u.t.last_line;
	}

	/* If we didn't find a region, bale. */
	if (data->title_start)
		underlined_break(ctx, block, data);
}

static int
detect_underlined_titles(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
{
	/* Let's do the title scanning, where our criteria is
	 * "the entire line is underlined". */
	underlined_data data[1];

	data->pos = pos;
	data->title_start = NULL;
	data->title_end = NULL;
	data->underlined = UNDERLINE_UNKNOWN;
	data->changed = 0;

	line_walker(ctx, block, underlined_newline, underlined_line, underlined_end, data);

	return data->changed;
}


/* Now we scan again, where the 'title' criteria is based upon
 * the titles being entirely in a different font. */
typedef struct
{
	stext_pos *pos;
	fz_stext_line *title_start;
	fz_stext_line *title_end;
	fz_font *font;
	int changed;
} font_data;

#define MIXED_FONT ((fz_font *)1)

static int
font_break(fz_context *ctx, fz_stext_block *block, font_data *data)
{
	fz_stext_line *line;

	/* We have a block that looks like a title. */
	if (data->title_start != block->u.t.first_line)
	{
		/* We need to split the block before title_start */
		line = data->title_start;
	}
	else if (data->title_end != block->u.t.last_line)
	{
		/* We need to split the block after title_end */
		line = data->title_end->next;
	}
	else
	{
		/* This block is already entirely title. */
		line = NULL;
	}
	if (line)
	{
		(void)split_block_at_line(ctx, data->pos, block, line);
		data->changed = 1;
		if (line == data->title_start)
		{
			/* Don't label the latter part as a title yet, we'll do it when
			 * we step back in, but we don't know how much of the latter
			 * block is title yet. */
		}
		else
		{
			block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
		}
	}
	else
	{
		block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
	}

	return 1;
}

static int
font_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
{
	font_data *data = (font_data *)arg;

	if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
	{
		/* Add the line we've just finished to the start/stop region */
		if (data->title_start == NULL)
			data->title_start = line->prev;
		data->title_end = line->prev;
	}
	else if (data->title_start != NULL)
	{
		/* We've reached the end of a title region. */
		return font_break(ctx, block, data);
	}
	data->font = NULL;

	return 0;
}

static int
font_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	font_data *data = (font_data *)arg;
	fz_stext_char *ch;

	/* If we already know that this line is mixed fonts, then no point in
	 * wasting time. */
	if (data->font == MIXED_FONT)
		return 0;

	/* If we are just starting, prime it. */
	if (data->font == NULL)
		data->font = line->first_char->font;

	for (ch = line->first_char; ch != NULL; ch = ch->next)
		if (ch->font != data->font)
		{
			data->font = MIXED_FONT;
			break;
		}

	return 0;
}

static void
font_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	font_data *data = (font_data *)arg;

	if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
	{
		/* Add the line we've just finished to the start/stop region */
		if (data->title_start == NULL)
			data->title_start = block->u.t.last_line;
		data->title_end = block->u.t.last_line;
	}

	if (data->title_start)
		font_break(ctx, block, data);
}

static int
detect_titles_by_font_usage(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
{
	font_data data[1];

	data->pos = pos;
	data->title_start = NULL;
	data->title_end = NULL;
	data->font = NULL;
	data->changed = 0;

	line_walker(ctx, block, font_newline, font_line, font_end, data);

	return data->changed;
}

typedef struct
{
	fz_rect bbox;
	stext_pos *pos;
	int changed;
} indent_data;

static int
indent_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
{
	indent_data *data = (indent_data *)arg;
	float indent = line->bbox.x0 - data->bbox.x0;

	if (indent > line_height)
	{
		/* Break the block here! */
		(void)split_block_at_line(ctx, data->pos, block, line);
		data->changed = 1;
		return 1;
	}

	return 0;
}

static int
break_paragraphs_by_indent(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
{
	indent_data data[1];

	data->pos = pos;
	data->bbox = bbox;
	data->changed = 0;

	line_walker(ctx, block, indent_newline, NULL, NULL, data);

	return data->changed;
}

typedef struct
{
	fz_rect bbox;
	stext_pos *pos;
	float line_gap;
	float prev_line_gap;
	int looking_for_space;
	float space_size;
	int maybe_ends_paragraph;
	int changed;
} trailing_data;

static int
trailing_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
{
	trailing_data *data = (trailing_data *)arg;

	data->prev_line_gap = data->line_gap;

	if (data->looking_for_space)
	{
		/* We've moved downwards onto a line, and failed to find
		 * a space on that line. Presumably that means that whole
		 * line is a single word. */
		float line_len = line->bbox.x1 - line->bbox.x0;

		if (line_len + data->space_size < data->prev_line_gap)
		{
			/* We could have fitted this word into the previous line. */
			/* So presumably that was a paragraph break. Split here. */
			(void)split_block_at_line(ctx, data->pos, block, line);
			data->changed = 1;
			return 1;
		}
		data->looking_for_space = 0;
	}

	/* If we the last line we looked at ended plausibly for a paragraph,
	 * then look for a space in this line... */
	data->looking_for_space = data->maybe_ends_paragraph;

	return 0;
}

static int
trailing_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	trailing_data *data = (trailing_data *)arg;
	fz_stext_char *ch;

	data->line_gap = data->bbox.x1 - line->bbox.x1;
	if (line->last_char && (
		(line->last_char->c >= 'A' && line->last_char->c <= 'Z') ||
		(line->last_char->c >= 'a' && line->last_char->c <= 'z') ||
		(line->last_char->c >= '0' && line->last_char->c <= '9')))
	{
		/* In Latin text, paragraphs should always end up some form
		 * of punctuation. I suspect that's less true of some other
		 * languages (particularly far-eastern ones). Let's just say
		 * that if we end in A-Za-z0-9 we can't possibly be the last
		 * line of a paragraph. */
		data->maybe_ends_paragraph = 0;
	}
	else
	{
		/* Plausibly the next line might be the first line of a new paragraph */
		data->maybe_ends_paragraph = 1;
	}
	for (ch = line->first_char; ch != NULL; ch = ch->next)
	{
		fz_rect r;
		float w, line_len;

		if (ch->c != ' ')
			continue;

		r = fz_rect_from_quad(ch->quad);
		w = r.x1 - r.x0;

		if (w < data->space_size)
			data->space_size = w;

		/* If we aren't looking_for_space, then no point in checking for
		 * whether the prefix will fit. But keep looping as we want to
		 * continue to refine our idea of how big a space is. */
		if (!data->looking_for_space)
			continue;

		line_len = r.x0 - line->bbox.x0;
		if (line_len + data->space_size < data->prev_line_gap)
		{
			/* We could have fitted this word into the previous line. */
			/* So presumably that was a paragraph break. Split here. */
			(void)split_block_at_line(ctx, data->pos, block, line);
			data->changed = 1;
			return 1;
		}
		data->looking_for_space = 0;
	}

	return 0;
}

static int
break_paragraphs_by_analysing_trailing_gaps(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
{
	trailing_data data[1];

	data->bbox = bbox;
	data->pos = pos;
	data->line_gap = 0;
	data->prev_line_gap = 0;
	data->looking_for_space = 0;
	data->space_size = 99999;
	data->maybe_ends_paragraph = 0;
	data->changed = 0;

	line_walker(ctx, block, trailing_newline, trailing_line, NULL, data);

	return data->changed;
}

typedef struct
{
	fz_rect bbox;
	stext_pos *pos;
	int count_lines;
	int count_justified;
	int non_digits_exist_in_this_line;
	fz_rect fragment_box;
	fz_rect line_box;
	int gap_count_this_line;
	float gap_size_this_line;
	int bad_gap;
	float xmin, xmax;
	float last_min_space;
	int changed;
} justify_data;

#define JUSTIFY_THRESHOLD 1

static int
justify_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
{
	justify_data *data = (justify_data *)arg;

	if (line->prev)
		line = line->prev;

	data->line_box = fz_union_rect(data->line_box, data->fragment_box);
	if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
		data->count_justified++;
	data->non_digits_exist_in_this_line = 0;
	data->count_lines++;
	data->gap_size_this_line = 0;
	data->gap_count_this_line = 0;
	data->fragment_box = fz_empty_rect;
	data->line_box = fz_empty_rect;

	data->xmin = INFINITY;
	data->xmax = -INFINITY;

	return 0;
}

static void
fragment_end(justify_data *data)
{
	float gap;

	if (fz_is_empty_rect(data->fragment_box))
	{
		/* No fragment. Nothing to do. */
		return;
	}
	if (fz_is_empty_rect(data->line_box))
	{
		/* First fragment of the line; no gap yet. */
		gap = 0;
	}
	else if (data->fragment_box.x0 > data->line_box.x1)
	{
		/* This whole fragment is to the right of the line so far. */
		gap = data->fragment_box.x0 - data->line_box.x1;
	}
	else if (data->fragment_box.x1 < data->line_box.x0)
	{
		/* This whole fragment is the left of the line so far. */
		gap = data->line_box.x1 - data->fragment_box.x0;
	}
	else
	{
		/* Abutting or overlapping fragment. Ignore it. */
		gap = 0;
	}
	data->line_box = fz_union_rect(data->line_box, data->fragment_box);
	data->fragment_box = fz_empty_rect;
	if (gap < data->last_min_space)
		return;
	/* So we have a gap to consider */
	if (data->gap_count_this_line > 0)
	{
		/* Allow for double spaces, cos some layouts put
		 * double spaces before full stops. */
		if (fabs(gap - data->gap_size_this_line) > 1 &&
			fabs(gap/2.0 - data->gap_size_this_line) < 1)
			gap /= 2;
		if (fabs(gap - data->gap_size_this_line) > 1)
			data->bad_gap = 1;
	}
	data->gap_size_this_line = (data->gap_size_this_line * data->gap_count_this_line + gap) / (data->gap_count_this_line + 1);
	data->gap_count_this_line++;
}

/* This is trickier than you'd imagine. We want to walk the line, looking
 * for how large the spaces are. In a justified line, all the spaces should
 * be pretty much the same size. (Except maybe before periods). But we want
 * to cope with bidirectional text which can send glyphs in unexpected orders.
 * e.g.   abc fed ghi
 * So we have to walk over "fragments" at a time.
 */
static int
justify_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	justify_data *data = (justify_data *)arg;
	fz_stext_char *ch;

	for (ch = line->first_char; ch != NULL; ch = ch->next)
	{
		fz_rect r = fz_rect_from_quad(ch->quad);
		float min_space = ch->size * 0.15f; /* Matches SPACE_DIST from stext-device. */

		if (ch->c == ' ')
		{
			/* This ends a fragment, but we don't treat it as such.
			 * Just continue, because we'll end the fragment next time
			 * around the loop (this copes with trailing spaces, and
			 * multiple spaces, and gaps between 'lines' that are on
			 * the same line. */
			data->last_min_space = min_space;
			continue;
		}
		if ((ch->c <= '0' || ch->c >= '9') && ch->c != '.')
			data->non_digits_exist_in_this_line = 1;
		if (!fz_is_empty_rect(data->fragment_box))
		{
			if (r.x0 > data->fragment_box.x1 + data->last_min_space)
			{
				/* Fragment ends due to gap on right. */
				fragment_end(data);
			}
			else if (r.x1 < data->fragment_box.x0 - data->last_min_space)
			{
				/* Fragment ends due to gap on left. */
				fragment_end(data);
			}
		}
		/* Extend the fragment */
		data->fragment_box = fz_union_rect(data->fragment_box, r);
		data->last_min_space = min_space;
	}

	return 0;
}

static void
justify_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	justify_data *data = (justify_data *)arg;

	fragment_end(data);
	data->line_box = fz_union_rect(data->line_box, data->fragment_box);
	if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
		data->count_justified++;
	data->count_lines++;
}

static int
justify2_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
{
	justify_data *data = (justify_data *)arg;

	if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD)
	{
		/* Justified */
	}
	else
	{
		/* Break after line */
		(void)split_block_at_line(ctx, data->pos, block, line);
		data->changed = 1;
		return 1;
	}

	data->line_box = fz_empty_rect;

	return 0;
}

static int
justify2_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	justify_data *data = (justify_data *)arg;
	fz_stext_char *ch;

	for (ch = line->first_char; ch != NULL; ch = ch->next)
	{
		if (ch->c == ' ')
			continue;

		data->line_box = fz_union_rect(data->line_box, fz_rect_from_quad(ch->quad));
	}

	return 0;
}

static fz_rect
text_block_marked_bbox(fz_context *ctx, fz_stext_block *block)
{
	fz_stext_line *line;
	fz_stext_char *ch;
	fz_rect r = fz_empty_rect;

	for (line = block->u.t.first_line; line != NULL; line = line->next)
	{
		for (ch = line->first_char; ch != NULL; ch = ch->next)
		{
			if (ch->c == ' ')
				continue;
			r = fz_union_rect(r, fz_rect_from_quad(ch->quad));
		}
	}

	return r;
}

static int
break_paragraphs_within_justified_text(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
{
	justify_data data[1];

	if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
		return 0;

	data->bbox = bbox;

	data->pos = pos;
	data->count_lines = 0;
	data->count_justified = 0;
	data->non_digits_exist_in_this_line = 0;
	data->bad_gap = 0;
	data->gap_size_this_line = 0;
	data->gap_count_this_line = 0;
	data->fragment_box = fz_empty_rect;
	data->line_box = fz_empty_rect;
	data->xmin = INFINITY;
	data->xmax = -INFINITY;
	data->changed = 0;

	line_walker(ctx, block, justify_newline, justify_line, justify_end, data);

	/* We can't really derive anything about single lines! */
	if (data->count_lines < 2)
		return 0;
	/* If at least half of the lines don't appear to be justified, then
	 * don't trust 'em. */
	if (data->count_justified * 2 < data->count_lines)
		return 0;
	/* If the "badness" we've seen to do with big gaps (i.e. how much
	 * bigger the gaps are than we'd reasonably expect) is too large
	 * then we can't be a justified block. We are prepared to forgive
	 * larger sizes in larger paragraphs. */
	if (data->bad_gap)
		return 0;
	block->u.t.flags = FZ_STEXT_TEXT_JUSTIFY_FULL;

	line_walker(ctx, block, justify2_newline, justify2_line, NULL, data);

	return data->changed;
}

typedef enum
{
	LOOKING_FOR_BULLET = 0,
	LOOKING_FOR_POST_BULLET = 1,
	LOOKING_FOR_POST_NUMERICAL_BULLET = 2,
	FOUND_BULLET = 3,
	CONTINUATION_LINE = 4,
	NO_BULLET = 5
} list_state;

typedef struct
{
	stext_pos *pos;
	list_state state;
	int buffer[10];
	int buffer_fill;
	float bullet_r;
	float post_bullet_indent;
	float l;
	fz_stext_line *bullet_line_start;
	fz_stext_line *this_line_start;
	int changed;
} list_data;

static int
list_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
{
	list_data *data = (list_data *)arg;

	if (data->state == FOUND_BULLET)
	{
		if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
		{
			/* We need to split the block before the bullet started. */
			(void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
			data->changed = 1;
			return 1;
		}
		if (data->bullet_line_start != data->this_line_start)
		{
			/* We've found a second bullet. Break before the previous line. */
			(void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
			block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
			data->changed = 1;
			return 1;
		}
	}
	else if (data->state == NO_BULLET && data->bullet_line_start)
	{
		/* We've found a bullet before, and the line we've just completed
		 * is neither a new bullet line, or a continuation so, we need to
		 * break that into a new block. */
		(void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
		block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
		data->changed = 1;
		return 1;
	}

	data->this_line_start = line;
	data->state = LOOKING_FOR_BULLET;
	data->buffer_fill = 0;
	data->l = block->bbox.x1;
	data->bullet_r = block->bbox.x0;

	return 0;
}

static int
approx_eq(float a, float b, float c)
{
	return fabs(a - b) <= c;
}

static int
is_roman(int c)
{
	switch (c)
	{
	case 'm': case 'M':
	case 'c': case 'C':
	case 'l': case 'L':
	case 'x': case 'X':
	case 'v': case 'V':
	case 'i': case 'I':
		return 1;
	}
	return 0;
}

typedef enum {
	NOT_A_BULLET,
	BULLET,
	NUMERICAL_BULLET
} bullet_t;

static bullet_t
is_bullet_aux(int *buffer, int len, int contained)
{
	int i, decimal_pos, decimals_found;

	if (len == 1 && (
		buffer[0] == '*' ||
		buffer[0] == 0x00B7 || /* Middle Dot */
		buffer[0] == 0x2022 || /* Bullet */
		buffer[0] == 0x2023 || /* Triangular Bullet */
		buffer[0] == 0x2043 || /* Hyphen Bullet */
		buffer[0] == 0x204C || /* Back leftwards bullet */
		buffer[0] == 0x204D || /* Back rightwards bullet */
		buffer[0] == 0x2219 || /* Bullet operator */
		buffer[0] == 0x25C9 || /* Fisheye */
		buffer[0] == 0x25CB || /* White circle */
		buffer[0] == 0x25CF || /* Black circle */
		buffer[0] == 0x25D8 || /* Inverse Bullet */
		buffer[0] == 0x25E6 || /* White Bullet */
		buffer[0] == 0x2619 || /* Reversed Rotated Floral Heart Bullet / Fleuron */
		buffer[0] == 0x261a || /* Black left pointing index */
		buffer[0] == 0x261b || /* Black right pointing index */
		buffer[0] == 0x261c || /* White left pointing index */
		buffer[0] == 0x261d || /* White up pointing index */
		buffer[0] == 0x261e || /* White right pointing index */
		buffer[0] == 0x261f || /* White down pointing index */
		buffer[0] == 0x2765 || /* Rotated Heavy Heart Black Heart Bullet */
		buffer[0] == 0x2767 || /* Rotated Floral Heart Bullet / Fleuron */
		buffer[0] == 0x29BE || /* Circled White Bullet */
		buffer[0] == 0x29BF || /* Circled Bullet */
		buffer[0] == 0x2660 || /* Black Spade suit */
		buffer[0] == 0x2661 || /* White Heart suit */
		buffer[0] == 0x2662 || /* White Diamond suit */
		buffer[0] == 0x2663 || /* Black Club suit */
		buffer[0] == 0x2664 || /* White Spade suit */
		buffer[0] == 0x2665 || /* Black Heart suit */
		buffer[0] == 0x2666 || /* Black Diamond suit */
		buffer[0] == 0x2667 || /* White Clud suit */
		buffer[0] == 0x1F446 || /* WHITE UP POINTING BACKHAND INDEX */
		buffer[0] == 0x1F447 || /* WHITE DOWN POINTING BACKHAND INDEX */
		buffer[0] == 0x1F448 || /* WHITE LEFT POINTING BACKHAND INDEX */
		buffer[0] == 0x1F449 || /* WHITE RIGHT POINTING BACKHAND INDEX */
		buffer[0] == 0x1f597 || /* White down pointing left hand index */
		buffer[0] == 0x1F598 || /* SIDEWAYS WHITE LEFT POINTING INDEX */
		buffer[0] == 0x1F599 || /* SIDEWAYS WHITE RIGHT POINTING INDEX */
		buffer[0] == 0x1F59A || /* SIDEWAYS BLACK LEFT POINTING INDEX */
		buffer[0] == 0x1F59B || /* SIDEWAYS BLACK RIGHT POINTING INDEX */
		buffer[0] == 0x1F59C || /* BLACK LEFT POINTING BACKHAND INDEX */
		buffer[0] == 0x1F59D || /* BLACK RIGHT POINTING BACKHAND INDEX */
		buffer[0] == 0x1F59E || /* SIDEWAYS WHITE UP POINTING INDEX */
		buffer[0] == 0x1F59F || /* SIDEWAYS WHITE DOWN POINTING INDEX */
		buffer[0] == 0x1F5A0 || /* SIDEWAYS BLACK UP POINTING INDEX */
		buffer[0] == 0x1F5A1 || /* SIDEWAYS BLACK DOWN POINTING INDEX */
		buffer[0] == 0x1F5A2 || /* BLACK UP POINTING BACKHAND INDEX */
		buffer[0] == 0x1F5A3 || /* BLACK DOWN POINTING BACKHAND INDEX */
		buffer[0] == 0x1FBC1 || /* LEFT THIRD WHITE RIGHT POINTING INDEX */
		buffer[0] == 0x1FBC2 || /* MIDDLE THIRD WHITE RIGHT POINTING INDEX */
		buffer[0] == 0x1FBC3 || /* RIGHT THIRD WHITE RIGHT POINTING INDEX */
		buffer[0] == 0xFFFD || /* UNICODE_REPLACEMENT_CHARACTER */
		0))
		return BULLET;

	if (!contained)
	{
		if (len > 2 && buffer[0] == '(' && buffer[len-1] == ')')
			return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
		if (len > 2 && buffer[0] == '<' && buffer[len-1] == '>')
			return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
		if (len > 2 && buffer[0] == '[' && buffer[len-1] == ']')
			return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
		if (len > 2 && buffer[0] == '{' && buffer[len-1] == '}')
			return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;

		if (len > 1 && buffer[len-1] == ':')
			return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
		if (len > 1 && buffer[len-1] == ')')
			return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
	}

	/* Look for numbers */
	/* Be careful not to interpret rows of numbers, like:
	 *    10.02 12.03
	 * as bullets.
	 */
	decimal_pos = 0;
	decimals_found = 0;
	for (i = 0; i < len; i++)
	{
		if (buffer[i] >= '0' && buffer[i] <= '9')
		{
		}
		else if (buffer[i] == '.')
		{
			decimal_pos = i;
			decimals_found++;
		}
		else
			break;
	}
	if (i == len && decimals_found <= 1)
		return NUMERICAL_BULLET;
	/* or number.something */
	if (decimals_found && i == decimal_pos+1 && i < len)
		return is_bullet_aux(buffer+i, len-i, 0) ? BULLET : NOT_A_BULLET;;

	/* Look for roman */
	for (i = 0; i < len; i++)
		if (!is_roman(buffer[i]))
			break;
	if (i == len)
		return 1;
	/* or roman.something */
	if (buffer[i] == '.' && i < len-1)
		return is_bullet_aux(buffer+i+1, len-i-1, 0) ? BULLET : NOT_A_BULLET;

	/* FIXME: Others. */
	return NOT_A_BULLET;
}

static bullet_t
is_bullet(int *buffer, int len)
{
	return is_bullet_aux(buffer, len, 0);
}

static int
eval_buffer_for_bullet(fz_context *ctx, list_data *data, float size)
{
	bullet_t bullet_type;

	bullet_type = is_bullet(data->buffer, data->buffer_fill);
	if (bullet_type == NUMERICAL_BULLET)
		data->state = LOOKING_FOR_POST_NUMERICAL_BULLET;
	else if (bullet_type)
		data->state = LOOKING_FOR_POST_BULLET;
	else
	{
		if (approx_eq(data->l, data->post_bullet_indent, size/2))
			data->state = CONTINUATION_LINE;
		else
			data->state = NO_BULLET;
		return 1;
	}
	return 0;
}

static int
list_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	list_data *data = (list_data *)arg;
	fz_stext_char *ch;

	for (ch = line->first_char; ch != NULL; ch = ch->next)
	{
		fz_rect r = fz_rect_from_quad(ch->quad);

		if (r.x0 < data->l)
			data->l = line->bbox.x0;

		switch (data->state)
		{
		case LOOKING_FOR_BULLET:
			if (ch->c == ' ')
			{
				/* We have a space */
				if (data->buffer_fill == 0)
					continue; /* Just skip leading spaces */
				if (eval_buffer_for_bullet(ctx, data, ch->size))
					return 0;
			}
			else if (data->buffer_fill > 0 && r.x0 - data->bullet_r > ch->size/2)
			{
				/* We have a gap large enough to be a space while we've
				 * got something in the buffer. */
				if (eval_buffer_for_bullet(ctx, data, ch->size))
					return 0;
			}
			else if (data->buffer_fill < (int)nelem(data->buffer))
			{
				/* Stick it in the buffer for evaluation later. */
				data->buffer[data->buffer_fill++] = ch->c;
			}
			else
			{
				/* Buffer overflowed. Can't be a bullet. */
				if (approx_eq(data->l, data->post_bullet_indent, ch->size))
					data->state = CONTINUATION_LINE;
				else
					data->state = NO_BULLET;
				return 0;
			}
			data->bullet_r = r.x1;
			break;
		case LOOKING_FOR_POST_BULLET:
			if (ch->c != ' ')
			{
				data->state = FOUND_BULLET;
				if (data->bullet_line_start == NULL)
					data->bullet_line_start = data->this_line_start;
				data->post_bullet_indent = r.x0;
			}
			break;
		case LOOKING_FOR_POST_NUMERICAL_BULLET:
			if (ch->c >= '0' && ch->c <= '9')
			{
				/* Numerical bullets can't be followed by numbers. */
				if (approx_eq(data->l, data->post_bullet_indent, ch->size))
					data->state = CONTINUATION_LINE;
				else
					data->state = NO_BULLET;
				return 0;
			}
			if (ch->c != ' ')
			{
				data->state = FOUND_BULLET;
				if (data->bullet_line_start == NULL)
					data->bullet_line_start = data->this_line_start;
				data->post_bullet_indent = r.x0;
			}
			break;
		default:
			break;
		}
	}

	return 0;
}

static void
list_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
{
	list_data *data = (list_data *)arg;

	if (data->state == LOOKING_FOR_BULLET)
	{
		eval_buffer_for_bullet(ctx, data, 0);
		/* If we ended up thinking we'd found a bullet, subject to
		 * what follows not being of a specific form, then we're
		 * fine, because nothing follows us! */
		if (data->state == LOOKING_FOR_POST_NUMERICAL_BULLET ||
			data->state == LOOKING_FOR_POST_BULLET)
		{
			data->state = FOUND_BULLET;
			if (data->bullet_line_start == NULL)
				data->bullet_line_start = data->this_line_start;
		}
		/* FIXME: This block contains just a bullet - not the content
		 * for the bullet. We see this with page-12.pdf.
		 *    <>    Rising commitment to battery...
		 *          committed to in-house battery...
		 *          developing and manufacturing...
		 *
		 * The <> is in a whole different DIV to the following text.
		 * Really we want to look for if the "next" content (for some
		 * definition of next) is on the same line as the bullet. If
		 * it is, we want to merge the 2 divs.
		 *
		 * But that's a really tricky thing to do given the recursive
		 * block walk we are current doing. Think about this.
		 * For now, we just mark the <> as being a list item.
		 */
	}
	if (data->state == FOUND_BULLET)
	{
		if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
		{
			/* We need to split the block before the start of the bullet. */
			(void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
			data->changed = 1;
			return;
		}
		if (data->bullet_line_start != data->this_line_start)
		{
			/* We've found a second bullet. Break before the line. */
			(void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
			block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
			data->changed = 1;
			return;
		}
		block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
	}
	else if (data->state == NO_BULLET && data->bullet_line_start)
	{
		/* We've found a bullet before, and the line we've just completed
		 * is neither a new bullet line, or a continuation so, we need to
		 * break that into a new block. */
		(void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
		block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
		data->changed = 1;
		return;
	}
	else if (data->bullet_line_start)
	{
		/* We've come to the end of the block still in the list item. */
		block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
	}
}

static int
break_list_items(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
{
	list_data data[1];

	if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
		return 0;

	data->pos = pos;
	data->state = LOOKING_FOR_BULLET;
	data->buffer_fill = 0;
	data->l = block->bbox.x1;
	data->bullet_line_start = NULL;
	data->this_line_start = block->u.t.first_line;
	data->bullet_r = block->bbox.x0;
	data->changed = 0;

	line_walker(ctx, block, list_newline, list_line, list_end, data);

	return data->changed;
}

static int
is_header(fz_structure s)
{
	return (s == FZ_STRUCTURE_H ||
		s == FZ_STRUCTURE_H1 ||
		s == FZ_STRUCTURE_H2 ||
		s == FZ_STRUCTURE_H3 ||
		s == FZ_STRUCTURE_H4 ||
		s == FZ_STRUCTURE_H5 ||
		s == FZ_STRUCTURE_H6);
}

static void
do_para_break(fz_context *ctx, fz_stext_page *page, fz_stext_block **pfirst, fz_stext_block **plast, fz_stext_struct *parent, int in_header)
{
	fz_stext_block *block, *next_block;
	stext_pos pos;
	fz_rect bbox;

	pos.pool = page->pool;
	pos.idx = 0;
	pos.pfirst = pfirst;
	pos.plast = plast;
	pos.parent = parent;

	/* First off, in order for us to consider a block to be suitable for paragraph
	 * splitting, we want it to be a series of lines moving down the page, (or left
	 * to right within a line). */
	for (block = *pfirst; block != NULL; block = next_block)
	{
		next_block = block->next;

		switch (block->type)
		{
		case FZ_STEXT_BLOCK_STRUCT:
			if (block->u.s.index < pos.idx)
				block->u.s.index = pos.idx++;
			else
				pos.idx = block->u.s.index+1;
			if (block->u.s.down)
			{
				int header = in_header | is_header(block->u.s.down->standard);
				do_para_break(ctx, page, &block->u.s.down->first_block, &block->u.s.down->last_block, block->u.s.down, header);
			}
			break;
		case FZ_STEXT_BLOCK_TEXT:
			if (!lines_move_plausibly_like_paragraph(block))
				break;

#ifdef DEBUG_SPLITS
			dump_block(ctx, "Around the top level block loop:", block);
#endif

			/* Firstly, and somewhat annoyingly we need to find the bbox of the
			 * block that doesn't include for trailing spaces. If we just use
			 * the normal bbox, then lines that end in "foo " will end further
			 * to the right of lines that end in "ba-", and consequently we'll
			 * fail to detect blocks as being justified.
			 * See PMC2656817_00002.pdf as an example. */
			bbox = text_block_marked_bbox(ctx, block);

#ifdef DEBUG_PARA_SPLITS
			{
				fz_stext_line *line;

				for (line = block->u.t.first_line; line != NULL; line = line->next)
				{
					fz_stext_char *ch;

					for (ch = line->first_char; ch != NULL; ch = ch->next)
					{
						fz_write_printf(ctx, fz_stddbg(ctx), "%C", ch->c);
					}
				}
			}
#endif

			/* Think about breaking lines at Titles. */
			/* First, underlined ones. */
			if (detect_underlined_titles(ctx, &pos, block))
				next_block = block->next; /* We split the block! */
			if (block->type != FZ_STEXT_BLOCK_TEXT)
			{
				next_block = block;
				break;
			}

#ifdef DEBUG_PARA_SPLITS
			fz_write_printf(ctx, fz_stddbg(ctx), "A");
#endif

			/* Next, ones that use bold fonts. */
			if (!in_header)
			{
				if (detect_titles_by_font_usage(ctx, &pos, block))
					next_block = block->next; /* We split the block! */
				if (block->type != FZ_STEXT_BLOCK_TEXT)
				{
					next_block = block;
					break;
				}
			}

#ifdef DEBUG_PARA_SPLITS
			fz_write_printf(ctx, fz_stddbg(ctx), "B");
#endif

			/* Now look at breaking based upon indents */
			if (break_paragraphs_by_indent(ctx, &pos, block, bbox))
				next_block = block->next; /* We split the block! */
			if (block->type != FZ_STEXT_BLOCK_TEXT)
			{
				next_block = block;
				break;
			}

#ifdef DEBUG_PARA_SPLITS
			fz_write_printf(ctx, fz_stddbg(ctx), "C");
#endif

			/* Now we're going to look for unindented paragraphs. We do this by
			 * considering if the first word on the next line would have fitted
			 * into the space left at the end of the previous line. */
			if (break_paragraphs_by_analysing_trailing_gaps(ctx, &pos, block, bbox))
				next_block = block->next; /* We split the block! */
			if (block->type != FZ_STEXT_BLOCK_TEXT)
			{
				next_block = block;
				break;
			}

#ifdef DEBUG_PARA_SPLITS
			fz_write_printf(ctx, fz_stddbg(ctx), "D");
#endif

			/* Now look to see if a block looks like fully justified text. If it
			 * does, then any line that doesn't reach the right hand side must be
			 * a paragraph break. */
			if (break_paragraphs_within_justified_text(ctx, &pos, block, bbox))
				next_block = block->next; /* We split the block! */
			if (block->type != FZ_STEXT_BLOCK_TEXT)
			{
				next_block = block;
				break;
			}

#ifdef DEBUG_PARA_SPLITS
			fz_write_printf(ctx, fz_stddbg(ctx), "E");
#endif

			/* Look for bulleted list items. */
			if (break_list_items(ctx, &pos, block))
				next_block = block->next; /* We split the block! */

			break;
		}
	}
}

void
fz_paragraph_break(fz_context *ctx, fz_stext_page *page)
{
	do_para_break(ctx, page, &page->first_block, &page->last_block, NULL, 0);
}