stext-para.c 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584
  1. // Copyright (C) 2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include <assert.h>
  24. /* #define DEBUG_SPLITS */
  25. /* #define DEBUG_PARA_SPLITS */
  26. static void
  27. recalc_bbox(fz_stext_block *block)
  28. {
  29. fz_rect bbox = fz_empty_rect;
  30. fz_stext_line *line;
  31. for (line = block->u.t.first_line; line != NULL; line = line->next)
  32. bbox = fz_union_rect(bbox, line->bbox);
  33. block->bbox = bbox;
  34. }
  /* Underline status accumulated over the characters of one horizontal line. */
  typedef enum
  {
      UNDERLINE_UNKNOWN = 0, /* No characters examined yet. */
      UNDERLINE_YES = 1,     /* Every character seen so far is underlined. */
      UNDERLINE_NO = 2,      /* No character seen so far is underlined. */
      UNDERLINE_MIXED = 3    /* Both underlined and plain characters seen. */
  } underline_state;
  42. /* Some crap heuristics to spot a bold font. */
  43. static int
  44. font_is_bold(fz_font *font)
  45. {
  46. const char *c;
  47. if (font == NULL)
  48. return 0;
  49. if (font->flags.is_bold)
  50. return 1;
  51. if (fz_strstrcase(font->name, "Bold") != NULL)
  52. return 1;
  53. if (fz_strstrcase(font->name, "Black") != NULL)
  54. return 1;
  55. if (fz_strstrcase(font->name, "Medium") != NULL)
  56. return 0;
  57. if (fz_strstrcase(font->name, "Light") != NULL)
  58. return 0;
  59. c = fz_strstr(font->name, " B");
  60. if (c && (c[2] == ' ' || c[2] == 0))
  61. return 1;
  62. return 0;
  63. }
  64. /* Check to see if lines move left to right and downwards. */
  65. /* FIXME: Maybe allow right to left? checking unicode values? */
  66. static int
  67. lines_move_plausibly_like_paragraph(fz_stext_block *block)
  68. {
  69. fz_stext_line *line;
  70. int firstline = 1;
  71. float line_height, line_x, line_y;
  72. /* Do the lines that make up this block move in an appropriate way? */
  73. for (line = block->u.t.first_line; line != NULL; line = line->next)
  74. {
  75. float x = (line->bbox.x0 + line->bbox.x1)/2;
  76. float y = (line->bbox.y0 + line->bbox.y1)/2;
  77. float height = line->bbox.y1 - line->bbox.y0;
  78. fz_stext_char *ch;
  79. /* Ignore any completely empty lines */
  80. for (ch = line->first_char; ch != NULL; ch = ch->next)
  81. if (ch->c != ' ')
  82. break;
  83. if (ch == NULL)
  84. continue;
  85. if (firstline)
  86. {
  87. line_height = height;
  88. line_x = x;
  89. line_y = y;
  90. firstline = 0;
  91. }
  92. else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
  93. {
  94. /* We are plausibly the same line. Only accept if we move right. */
  95. if (x < line_x)
  96. return 0;
  97. else
  98. line_x = x;
  99. }
  100. else if (line_y < y)
  101. {
  102. /* Moving downwards. Plausible. */
  103. line_y = y;
  104. line_height = height;
  105. line_x = x;
  106. }
  107. else
  108. {
  109. /* Nothing else is plausible. */
  110. return 0;
  111. }
  112. }
  113. return 1;
  114. }
  115. #ifdef DEBUG_SPLITS
  116. static void dump_line(fz_context *ctx, const char *str, fz_stext_line *line)
  117. {
  118. fz_stext_char *ch;
  119. if (str)
  120. fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", str);
  121. if (line == NULL)
  122. return;
  123. for (ch = line->first_char; ch != NULL; ch = ch->next)
  124. fz_write_printf(ctx, fz_stddbg(ctx), "%c", (char)ch->c);
  125. fz_write_printf(ctx, fz_stddbg(ctx), "\n");
  126. }
  127. static void dump_block(fz_context *ctx, const char *fmt, fz_stext_block *block)
  128. {
  129. fz_stext_line *line;
  130. fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", fmt);
  131. if (block == NULL || block->type != FZ_STEXT_BLOCK_TEXT)
  132. return;
  133. for (line = block->u.t.first_line; line != NULL; line = line->next)
  134. dump_line(ctx, NULL, line);
  135. }
  136. #endif
  137. typedef struct
  138. {
  139. fz_pool *pool;
  140. fz_stext_struct *parent;
  141. int idx;
  142. fz_stext_block **pfirst;
  143. fz_stext_block **plast;
  144. } stext_pos;
  145. static fz_stext_block *split_block_at_line(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_stext_line *line)
  146. {
  147. fz_stext_block *newblock = fz_pool_alloc(ctx, pos->pool, sizeof *newblock);
  148. #ifdef DEBUG_SPLITS
  149. dump_block(ctx, "Splitting:", block);
  150. dump_line(ctx, "At line:", line);
  151. #endif
  152. newblock->bbox = fz_empty_rect;
  153. newblock->prev = block;
  154. newblock->next = block->next;
  155. if (block->next)
  156. block->next->prev = newblock;
  157. else
  158. {
  159. assert(*pos->plast == block);
  160. *pos->plast = newblock;
  161. }
  162. block->next = newblock;
  163. newblock->type = FZ_STEXT_BLOCK_TEXT;
  164. newblock->u.t.flags = block->u.t.flags;
  165. newblock->u.t.first_line = line;
  166. newblock->u.t.last_line = block->u.t.last_line;
  167. block->u.t.last_line = line->prev;
  168. line->prev->next = NULL;
  169. line->prev = NULL;
  170. recalc_bbox(block);
  171. recalc_bbox(newblock);
  172. #ifdef DEBUG_SPLITS
  173. dump_block(ctx, "Giving:", block);
  174. dump_block(ctx, "and:", newblock);
  175. #endif
  176. return newblock;
  177. }
  178. /* Convert a block to being a struct that contains just that block. */
  179. static void block_to_struct(fz_context *ctx, stext_pos *pos, fz_stext_block *block, int structtype)
  180. {
  181. fz_stext_struct *str = fz_pool_alloc_flexible(ctx, pos->pool, fz_stext_struct, raw, 1);
  182. fz_stext_block *new_block = fz_pool_alloc(ctx, pos->pool, sizeof(*new_block));
  183. str->up = block;
  184. str->parent = pos->parent;
  185. str->first_block = new_block;
  186. str->last_block = new_block;
  187. str->standard = structtype;
  188. str->raw[0] = 0;
  189. new_block->type = block->type;
  190. new_block->bbox = block->bbox;
  191. new_block->u = block->u;
  192. block->type = FZ_STEXT_BLOCK_STRUCT;
  193. block->u.s.down = str;
  194. block->u.s.index = pos->idx++;
  195. }
  196. /*
  197. We are going to repeatedly walk the lines that make up a block.
  198. To reduce the boilerplate here, we'll use a line_walker function.
  199. This will call a bunch of callbacks as it goes.
  200. newline_fn Called whenever we move to a new horizontal line (i.e.
  201. as if we've got a newline). This is not the same as being
  202. called every fz_stext_line, as we frequently get multiple
  203. fz_stext_line's on a single horizontal line. If this returns
  204. 0, execution continues. Return 1 to stop the walking.
  205. line_fn Called for every fz_stext_line (typically used to process
  206. characters).
  207. end_fn Called at the end of the block (with line being the final
  208. line of the block.
  209. arg An opaque pointer passed to all the callbacks.
  210. */
  211. typedef int (line_walker_newline_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height);
  212. typedef int (line_walker_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);
  213. typedef void (line_walker_end_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);
  214. static void
  215. line_walker(fz_context *ctx, fz_stext_block *block, line_walker_newline_fn *newline_fn, line_walker_fn *line_fn, line_walker_end_fn *end_fn, void *arg)
  216. {
  217. int firstline = 1;
  218. fz_stext_line *line;
  219. float line_height, line_y;
  220. if (block->u.t.first_line == NULL)
  221. return;
  222. for (line = block->u.t.first_line; line != NULL; line = line->next)
  223. {
  224. float y = (line->bbox.y0 + line->bbox.y1)/2;
  225. float height = line->bbox.y1 - line->bbox.y0;
  226. if (line->first_char == NULL)
  227. continue; /* Should never happen, but makes life easier to assume this later. */
  228. if (firstline)
  229. {
  230. line_height = height;
  231. firstline = 0;
  232. line_y = y;
  233. }
  234. else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
  235. {
  236. /* We are plausibly the same horizontal line. */
  237. }
  238. else if (line_y < y)
  239. {
  240. /* Moving downwards. */
  241. line_height = height;
  242. line_y = y;
  243. if (newline_fn && newline_fn(ctx, block, line, arg, line_height))
  244. return;
  245. }
  246. if (line_fn && line_fn(ctx, block, line, arg))
  247. return;
  248. }
  249. if (end_fn)
  250. end_fn(ctx, block, block->u.t.last_line, arg);
  251. }
  252. /* We scan through the block, collecting lines up that look
  253. * "title-ish" (by which here, we mean "are completely
  254. * underlined"). As soon as we finish such a region, we split
  255. * the block (either before or after it as appropriate), and
  256. * mark it as a title.
  257. *
  258. * e.g.
  259. *
  260. * _THIS_IS_LIKELY_A
  261. * _TITLE_ ___ < BREAK HERE
  262. * Lorem ipsum dolor sit
  263. * amet, consectetur
  264. * adipiscing elit. ___ < BREAK HERE
  265. * _LIKELY_ANOTHER_TITLE_ ____< BREAK HERE
  266. * Sed do eiusmod tempor
  267. * incididunt ut labore
  268. * et dolore magna aliqua.
  269. */
  270. typedef struct
  271. {
  272. stext_pos *pos;
  273. fz_stext_line *title_start;
  274. fz_stext_line *title_end;
  275. underline_state underlined;
  276. int changed;
  277. } underlined_data;
  278. static int
  279. underlined_break(fz_context *ctx, fz_stext_block *block, underlined_data *data)
  280. {
  281. fz_stext_line *line;
  282. /* We have a block that looks like a title. */
  283. if (data->title_start != block->u.t.first_line)
  284. {
  285. /* We need to split the block before title_start */
  286. line = data->title_start;
  287. }
  288. else if (data->title_end != block->u.t.last_line)
  289. {
  290. /* We need to split the block after title_end */
  291. line = data->title_end->next;
  292. }
  293. else
  294. {
  295. /* This block is already entirely title. */
  296. line = NULL;
  297. }
  298. if (line)
  299. {
  300. (void)split_block_at_line(ctx, data->pos, block, line);
  301. data->changed = 1;
  302. if (line == data->title_start)
  303. {
  304. /* Don't label the latter part as a title yet, we'll do it when
  305. * we step back in, but we don't know how much of the latter
  306. * block is title yet. */
  307. }
  308. else
  309. {
  310. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
  311. }
  312. }
  313. else
  314. {
  315. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
  316. }
  317. return 1;
  318. }
  319. static int
  320. underlined_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
  321. {
  322. underlined_data *data = (underlined_data *)arg;
  323. if (data->underlined == UNDERLINE_YES)
  324. {
  325. /* Add the line we've just finished to the start/stop region */
  326. if (data->title_start == NULL)
  327. data->title_start = line->prev;
  328. data->title_end = line->prev;
  329. }
  330. else if (data->title_start != NULL)
  331. {
  332. /* We've reached the end of a title region. */
  333. return underlined_break(ctx, block, data);
  334. }
  335. data->underlined = UNDERLINE_UNKNOWN;
  336. return 0;
  337. }
  338. static int
  339. underlined_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  340. {
  341. underlined_data *data = (underlined_data *)arg;
  342. fz_stext_char *ch;
  343. /* If we already know that this line is mixed underlined, then no point in
  344. * wasting time. */
  345. if (data->underlined == UNDERLINE_MIXED)
  346. return 0;
  347. /* If we haven't started looking yet, prime the value. */
  348. if (data->underlined == UNDERLINE_UNKNOWN)
  349. data->underlined = (line->first_char->flags & FZ_STEXT_UNDERLINE) ? UNDERLINE_YES : UNDERLINE_NO;
  350. /* Check that all the rest of the the chars match our expected value. */
  351. for (ch = line->first_char; ch != NULL; ch = ch->next)
  352. if ((!!(ch->flags & FZ_STEXT_UNDERLINE)) ^ (data->underlined == UNDERLINE_YES))
  353. {
  354. /* Differs! So, Mixed. */
  355. data->underlined = UNDERLINE_MIXED;
  356. break;
  357. }
  358. return 0;
  359. }
  360. static void
  361. underlined_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  362. {
  363. underlined_data *data = (underlined_data *)arg;
  364. if (data->underlined == UNDERLINE_YES)
  365. {
  366. /* Add the line we've just finished to the start/stop region */
  367. if (data->title_start == NULL)
  368. data->title_start = block->u.t.last_line;
  369. data->title_end = block->u.t.last_line;
  370. }
  371. /* If we didn't find a region, bale. */
  372. if (data->title_start)
  373. underlined_break(ctx, block, data);
  374. }
  375. static int
  376. detect_underlined_titles(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
  377. {
  378. /* Let's do the title scanning, where our criteria is
  379. * "the entire line is underlined". */
  380. underlined_data data[1];
  381. data->pos = pos;
  382. data->title_start = NULL;
  383. data->title_end = NULL;
  384. data->underlined = UNDERLINE_UNKNOWN;
  385. data->changed = 0;
  386. line_walker(ctx, block, underlined_newline, underlined_line, underlined_end, data);
  387. return data->changed;
  388. }
  389. /* Now we scan again, where the 'title' criteria is based upon
  390. * the titles being entirely in a different font. */
  391. typedef struct
  392. {
  393. stext_pos *pos;
  394. fz_stext_line *title_start;
  395. fz_stext_line *title_end;
  396. fz_font *font;
  397. int changed;
  398. } font_data;
  399. #define MIXED_FONT ((fz_font *)1)
  400. static int
  401. font_break(fz_context *ctx, fz_stext_block *block, font_data *data)
  402. {
  403. fz_stext_line *line;
  404. /* We have a block that looks like a title. */
  405. if (data->title_start != block->u.t.first_line)
  406. {
  407. /* We need to split the block before title_start */
  408. line = data->title_start;
  409. }
  410. else if (data->title_end != block->u.t.last_line)
  411. {
  412. /* We need to split the block after title_end */
  413. line = data->title_end->next;
  414. }
  415. else
  416. {
  417. /* This block is already entirely title. */
  418. line = NULL;
  419. }
  420. if (line)
  421. {
  422. (void)split_block_at_line(ctx, data->pos, block, line);
  423. data->changed = 1;
  424. if (line == data->title_start)
  425. {
  426. /* Don't label the latter part as a title yet, we'll do it when
  427. * we step back in, but we don't know how much of the latter
  428. * block is title yet. */
  429. }
  430. else
  431. {
  432. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
  433. }
  434. }
  435. else
  436. {
  437. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
  438. }
  439. return 1;
  440. }
  441. static int
  442. font_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
  443. {
  444. font_data *data = (font_data *)arg;
  445. if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
  446. {
  447. /* Add the line we've just finished to the start/stop region */
  448. if (data->title_start == NULL)
  449. data->title_start = line->prev;
  450. data->title_end = line->prev;
  451. }
  452. else if (data->title_start != NULL)
  453. {
  454. /* We've reached the end of a title region. */
  455. return font_break(ctx, block, data);
  456. }
  457. data->font = NULL;
  458. return 0;
  459. }
  460. static int
  461. font_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  462. {
  463. font_data *data = (font_data *)arg;
  464. fz_stext_char *ch;
  465. /* If we already know that this line is mixed fonts, then no point in
  466. * wasting time. */
  467. if (data->font == MIXED_FONT)
  468. return 0;
  469. /* If we are just starting, prime it. */
  470. if (data->font == NULL)
  471. data->font = line->first_char->font;
  472. for (ch = line->first_char; ch != NULL; ch = ch->next)
  473. if (ch->font != data->font)
  474. {
  475. data->font = MIXED_FONT;
  476. break;
  477. }
  478. return 0;
  479. }
  480. static void
  481. font_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  482. {
  483. font_data *data = (font_data *)arg;
  484. if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
  485. {
  486. /* Add the line we've just finished to the start/stop region */
  487. if (data->title_start == NULL)
  488. data->title_start = block->u.t.last_line;
  489. data->title_end = block->u.t.last_line;
  490. }
  491. if (data->title_start)
  492. font_break(ctx, block, data);
  493. }
  494. static int
  495. detect_titles_by_font_usage(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
  496. {
  497. font_data data[1];
  498. data->pos = pos;
  499. data->title_start = NULL;
  500. data->title_end = NULL;
  501. data->font = NULL;
  502. data->changed = 0;
  503. line_walker(ctx, block, font_newline, font_line, font_end, data);
  504. return data->changed;
  505. }
  506. typedef struct
  507. {
  508. fz_rect bbox;
  509. stext_pos *pos;
  510. int changed;
  511. } indent_data;
  512. static int
  513. indent_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
  514. {
  515. indent_data *data = (indent_data *)arg;
  516. float indent = line->bbox.x0 - data->bbox.x0;
  517. if (indent > line_height)
  518. {
  519. /* Break the block here! */
  520. (void)split_block_at_line(ctx, data->pos, block, line);
  521. data->changed = 1;
  522. return 1;
  523. }
  524. return 0;
  525. }
  526. static int
  527. break_paragraphs_by_indent(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
  528. {
  529. indent_data data[1];
  530. data->pos = pos;
  531. data->bbox = bbox;
  532. data->changed = 0;
  533. line_walker(ctx, block, indent_newline, NULL, NULL, data);
  534. return data->changed;
  535. }
  536. typedef struct
  537. {
  538. fz_rect bbox;
  539. stext_pos *pos;
  540. float line_gap;
  541. float prev_line_gap;
  542. int looking_for_space;
  543. float space_size;
  544. int maybe_ends_paragraph;
  545. int changed;
  546. } trailing_data;
  547. static int
  548. trailing_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
  549. {
  550. trailing_data *data = (trailing_data *)arg;
  551. data->prev_line_gap = data->line_gap;
  552. if (data->looking_for_space)
  553. {
  554. /* We've moved downwards onto a line, and failed to find
  555. * a space on that line. Presumably that means that whole
  556. * line is a single word. */
  557. float line_len = line->bbox.x1 - line->bbox.x0;
  558. if (line_len + data->space_size < data->prev_line_gap)
  559. {
  560. /* We could have fitted this word into the previous line. */
  561. /* So presumably that was a paragraph break. Split here. */
  562. (void)split_block_at_line(ctx, data->pos, block, line);
  563. data->changed = 1;
  564. return 1;
  565. }
  566. data->looking_for_space = 0;
  567. }
  568. /* If we the last line we looked at ended plausibly for a paragraph,
  569. * then look for a space in this line... */
  570. data->looking_for_space = data->maybe_ends_paragraph;
  571. return 0;
  572. }
  573. static int
  574. trailing_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  575. {
  576. trailing_data *data = (trailing_data *)arg;
  577. fz_stext_char *ch;
  578. data->line_gap = data->bbox.x1 - line->bbox.x1;
  579. if (line->last_char && (
  580. (line->last_char->c >= 'A' && line->last_char->c <= 'Z') ||
  581. (line->last_char->c >= 'a' && line->last_char->c <= 'z') ||
  582. (line->last_char->c >= '0' && line->last_char->c <= '9')))
  583. {
  584. /* In Latin text, paragraphs should always end up some form
  585. * of punctuation. I suspect that's less true of some other
  586. * languages (particularly far-eastern ones). Let's just say
  587. * that if we end in A-Za-z0-9 we can't possibly be the last
  588. * line of a paragraph. */
  589. data->maybe_ends_paragraph = 0;
  590. }
  591. else
  592. {
  593. /* Plausibly the next line might be the first line of a new paragraph */
  594. data->maybe_ends_paragraph = 1;
  595. }
  596. for (ch = line->first_char; ch != NULL; ch = ch->next)
  597. {
  598. fz_rect r;
  599. float w, line_len;
  600. if (ch->c != ' ')
  601. continue;
  602. r = fz_rect_from_quad(ch->quad);
  603. w = r.x1 - r.x0;
  604. if (w < data->space_size)
  605. data->space_size = w;
  606. /* If we aren't looking_for_space, then no point in checking for
  607. * whether the prefix will fit. But keep looping as we want to
  608. * continue to refine our idea of how big a space is. */
  609. if (!data->looking_for_space)
  610. continue;
  611. line_len = r.x0 - line->bbox.x0;
  612. if (line_len + data->space_size < data->prev_line_gap)
  613. {
  614. /* We could have fitted this word into the previous line. */
  615. /* So presumably that was a paragraph break. Split here. */
  616. (void)split_block_at_line(ctx, data->pos, block, line);
  617. data->changed = 1;
  618. return 1;
  619. }
  620. data->looking_for_space = 0;
  621. }
  622. return 0;
  623. }
  624. static int
  625. break_paragraphs_by_analysing_trailing_gaps(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
  626. {
  627. trailing_data data[1];
  628. data->bbox = bbox;
  629. data->pos = pos;
  630. data->line_gap = 0;
  631. data->prev_line_gap = 0;
  632. data->looking_for_space = 0;
  633. data->space_size = 99999;
  634. data->maybe_ends_paragraph = 0;
  635. data->changed = 0;
  636. line_walker(ctx, block, trailing_newline, trailing_line, NULL, data);
  637. return data->changed;
  638. }
  639. typedef struct
  640. {
  641. fz_rect bbox;
  642. stext_pos *pos;
  643. int count_lines;
  644. int count_justified;
  645. int non_digits_exist_in_this_line;
  646. fz_rect fragment_box;
  647. fz_rect line_box;
  648. int gap_count_this_line;
  649. float gap_size_this_line;
  650. int bad_gap;
  651. float xmin, xmax;
  652. float last_min_space;
  653. int changed;
  654. } justify_data;
  655. #define JUSTIFY_THRESHOLD 1
  656. static int
  657. justify_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
  658. {
  659. justify_data *data = (justify_data *)arg;
  660. if (line->prev)
  661. line = line->prev;
  662. data->line_box = fz_union_rect(data->line_box, data->fragment_box);
  663. if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
  664. data->count_justified++;
  665. data->non_digits_exist_in_this_line = 0;
  666. data->count_lines++;
  667. data->gap_size_this_line = 0;
  668. data->gap_count_this_line = 0;
  669. data->fragment_box = fz_empty_rect;
  670. data->line_box = fz_empty_rect;
  671. data->xmin = INFINITY;
  672. data->xmax = -INFINITY;
  673. return 0;
  674. }
  675. static void
  676. fragment_end(justify_data *data)
  677. {
  678. float gap;
  679. if (fz_is_empty_rect(data->fragment_box))
  680. {
  681. /* No fragment. Nothing to do. */
  682. return;
  683. }
  684. if (fz_is_empty_rect(data->line_box))
  685. {
  686. /* First fragment of the line; no gap yet. */
  687. gap = 0;
  688. }
  689. else if (data->fragment_box.x0 > data->line_box.x1)
  690. {
  691. /* This whole fragment is to the right of the line so far. */
  692. gap = data->fragment_box.x0 - data->line_box.x1;
  693. }
  694. else if (data->fragment_box.x1 < data->line_box.x0)
  695. {
  696. /* This whole fragment is the left of the line so far. */
  697. gap = data->line_box.x1 - data->fragment_box.x0;
  698. }
  699. else
  700. {
  701. /* Abutting or overlapping fragment. Ignore it. */
  702. gap = 0;
  703. }
  704. data->line_box = fz_union_rect(data->line_box, data->fragment_box);
  705. data->fragment_box = fz_empty_rect;
  706. if (gap < data->last_min_space)
  707. return;
  708. /* So we have a gap to consider */
  709. if (data->gap_count_this_line > 0)
  710. {
  711. /* Allow for double spaces, cos some layouts put
  712. * double spaces before full stops. */
  713. if (fabs(gap - data->gap_size_this_line) > 1 &&
  714. fabs(gap/2.0 - data->gap_size_this_line) < 1)
  715. gap /= 2;
  716. if (fabs(gap - data->gap_size_this_line) > 1)
  717. data->bad_gap = 1;
  718. }
  719. data->gap_size_this_line = (data->gap_size_this_line * data->gap_count_this_line + gap) / (data->gap_count_this_line + 1);
  720. data->gap_count_this_line++;
  721. }
  722. /* This is trickier than you'd imagine. We want to walk the line, looking
  723. * for how large the spaces are. In a justified line, all the spaces should
  724. * be pretty much the same size. (Except maybe before periods). But we want
  725. * to cope with bidirectional text which can send glyphs in unexpected orders.
  726. * e.g. abc fed ghi
  727. * So we have to walk over "fragments" at a time.
  728. */
  729. static int
  730. justify_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  731. {
  732. justify_data *data = (justify_data *)arg;
  733. fz_stext_char *ch;
  734. for (ch = line->first_char; ch != NULL; ch = ch->next)
  735. {
  736. fz_rect r = fz_rect_from_quad(ch->quad);
  737. float min_space = ch->size * 0.15f; /* Matches SPACE_DIST from stext-device. */
  738. if (ch->c == ' ')
  739. {
  740. /* This ends a fragment, but we don't treat it as such.
  741. * Just continue, because we'll end the fragment next time
  742. * around the loop (this copes with trailing spaces, and
  743. * multiple spaces, and gaps between 'lines' that are on
  744. * the same line. */
  745. data->last_min_space = min_space;
  746. continue;
  747. }
  748. if ((ch->c <= '0' || ch->c >= '9') && ch->c != '.')
  749. data->non_digits_exist_in_this_line = 1;
  750. if (!fz_is_empty_rect(data->fragment_box))
  751. {
  752. if (r.x0 > data->fragment_box.x1 + data->last_min_space)
  753. {
  754. /* Fragment ends due to gap on right. */
  755. fragment_end(data);
  756. }
  757. else if (r.x1 < data->fragment_box.x0 - data->last_min_space)
  758. {
  759. /* Fragment ends due to gap on left. */
  760. fragment_end(data);
  761. }
  762. }
  763. /* Extend the fragment */
  764. data->fragment_box = fz_union_rect(data->fragment_box, r);
  765. data->last_min_space = min_space;
  766. }
  767. return 0;
  768. }
  769. static void
  770. justify_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  771. {
  772. justify_data *data = (justify_data *)arg;
  773. fragment_end(data);
  774. data->line_box = fz_union_rect(data->line_box, data->fragment_box);
  775. if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
  776. data->count_justified++;
  777. data->count_lines++;
  778. }
  779. static int
  780. justify2_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
  781. {
  782. justify_data *data = (justify_data *)arg;
  783. if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD)
  784. {
  785. /* Justified */
  786. }
  787. else
  788. {
  789. /* Break after line */
  790. (void)split_block_at_line(ctx, data->pos, block, line);
  791. data->changed = 1;
  792. return 1;
  793. }
  794. data->line_box = fz_empty_rect;
  795. return 0;
  796. }
  797. static int
  798. justify2_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  799. {
  800. justify_data *data = (justify_data *)arg;
  801. fz_stext_char *ch;
  802. for (ch = line->first_char; ch != NULL; ch = ch->next)
  803. {
  804. if (ch->c == ' ')
  805. continue;
  806. data->line_box = fz_union_rect(data->line_box, fz_rect_from_quad(ch->quad));
  807. }
  808. return 0;
  809. }
  810. static fz_rect
  811. text_block_marked_bbox(fz_context *ctx, fz_stext_block *block)
  812. {
  813. fz_stext_line *line;
  814. fz_stext_char *ch;
  815. fz_rect r = fz_empty_rect;
  816. for (line = block->u.t.first_line; line != NULL; line = line->next)
  817. {
  818. for (ch = line->first_char; ch != NULL; ch = ch->next)
  819. {
  820. if (ch->c == ' ')
  821. continue;
  822. r = fz_union_rect(r, fz_rect_from_quad(ch->quad));
  823. }
  824. }
  825. return r;
  826. }
  827. static int
  828. break_paragraphs_within_justified_text(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
  829. {
  830. justify_data data[1];
  831. if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
  832. return 0;
  833. data->bbox = bbox;
  834. data->pos = pos;
  835. data->count_lines = 0;
  836. data->count_justified = 0;
  837. data->non_digits_exist_in_this_line = 0;
  838. data->bad_gap = 0;
  839. data->gap_size_this_line = 0;
  840. data->gap_count_this_line = 0;
  841. data->fragment_box = fz_empty_rect;
  842. data->line_box = fz_empty_rect;
  843. data->xmin = INFINITY;
  844. data->xmax = -INFINITY;
  845. data->changed = 0;
  846. line_walker(ctx, block, justify_newline, justify_line, justify_end, data);
  847. /* We can't really derive anything about single lines! */
  848. if (data->count_lines < 2)
  849. return 0;
  850. /* If at least half of the lines don't appear to be justified, then
  851. * don't trust 'em. */
  852. if (data->count_justified * 2 < data->count_lines)
  853. return 0;
  854. /* If the "badness" we've seen to do with big gaps (i.e. how much
  855. * bigger the gaps are than we'd reasonably expect) is too large
  856. * then we can't be a justified block. We are prepared to forgive
  857. * larger sizes in larger paragraphs. */
  858. if (data->bad_gap)
  859. return 0;
  860. block->u.t.flags = FZ_STEXT_TEXT_JUSTIFY_FULL;
  861. line_walker(ctx, block, justify2_newline, justify2_line, NULL, data);
  862. return data->changed;
  863. }
  864. typedef enum
  865. {
  866. LOOKING_FOR_BULLET = 0,
  867. LOOKING_FOR_POST_BULLET = 1,
  868. LOOKING_FOR_POST_NUMERICAL_BULLET = 2,
  869. FOUND_BULLET = 3,
  870. CONTINUATION_LINE = 4,
  871. NO_BULLET = 5
  872. } list_state;
  873. typedef struct
  874. {
  875. stext_pos *pos;
  876. list_state state;
  877. int buffer[10];
  878. int buffer_fill;
  879. float bullet_r;
  880. float post_bullet_indent;
  881. float l;
  882. fz_stext_line *bullet_line_start;
  883. fz_stext_line *this_line_start;
  884. int changed;
  885. } list_data;
  886. static int
  887. list_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
  888. {
  889. list_data *data = (list_data *)arg;
  890. if (data->state == FOUND_BULLET)
  891. {
  892. if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
  893. {
  894. /* We need to split the block before the bullet started. */
  895. (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
  896. data->changed = 1;
  897. return 1;
  898. }
  899. if (data->bullet_line_start != data->this_line_start)
  900. {
  901. /* We've found a second bullet. Break before the previous line. */
  902. (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
  903. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
  904. data->changed = 1;
  905. return 1;
  906. }
  907. }
  908. else if (data->state == NO_BULLET && data->bullet_line_start)
  909. {
  910. /* We've found a bullet before, and the line we've just completed
  911. * is neither a new bullet line, or a continuation so, we need to
  912. * break that into a new block. */
  913. (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
  914. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
  915. data->changed = 1;
  916. return 1;
  917. }
  918. data->this_line_start = line;
  919. data->state = LOOKING_FOR_BULLET;
  920. data->buffer_fill = 0;
  921. data->l = block->bbox.x1;
  922. data->bullet_r = block->bbox.x0;
  923. return 0;
  924. }
  925. static int
  926. approx_eq(float a, float b, float c)
  927. {
  928. return fabs(a - b) <= c;
  929. }
  930. static int
  931. is_roman(int c)
  932. {
  933. switch (c)
  934. {
  935. case 'm': case 'M':
  936. case 'c': case 'C':
  937. case 'l': case 'L':
  938. case 'x': case 'X':
  939. case 'v': case 'V':
  940. case 'i': case 'I':
  941. return 1;
  942. }
  943. return 0;
  944. }
  945. typedef enum {
  946. NOT_A_BULLET,
  947. BULLET,
  948. NUMERICAL_BULLET
  949. } bullet_t;
  950. static bullet_t
  951. is_bullet_aux(int *buffer, int len, int contained)
  952. {
  953. int i, decimal_pos, decimals_found;
  954. if (len == 1 && (
  955. buffer[0] == '*' ||
  956. buffer[0] == 0x00B7 || /* Middle Dot */
  957. buffer[0] == 0x2022 || /* Bullet */
  958. buffer[0] == 0x2023 || /* Triangular Bullet */
  959. buffer[0] == 0x2043 || /* Hyphen Bullet */
  960. buffer[0] == 0x204C || /* Back leftwards bullet */
  961. buffer[0] == 0x204D || /* Back rightwards bullet */
  962. buffer[0] == 0x2219 || /* Bullet operator */
  963. buffer[0] == 0x25C9 || /* Fisheye */
  964. buffer[0] == 0x25CB || /* White circle */
  965. buffer[0] == 0x25CF || /* Black circle */
  966. buffer[0] == 0x25D8 || /* Inverse Bullet */
  967. buffer[0] == 0x25E6 || /* White Bullet */
  968. buffer[0] == 0x2619 || /* Reversed Rotated Floral Heart Bullet / Fleuron */
  969. buffer[0] == 0x261a || /* Black left pointing index */
  970. buffer[0] == 0x261b || /* Black right pointing index */
  971. buffer[0] == 0x261c || /* White left pointing index */
  972. buffer[0] == 0x261d || /* White up pointing index */
  973. buffer[0] == 0x261e || /* White right pointing index */
  974. buffer[0] == 0x261f || /* White down pointing index */
  975. buffer[0] == 0x2765 || /* Rotated Heavy Heart Black Heart Bullet */
  976. buffer[0] == 0x2767 || /* Rotated Floral Heart Bullet / Fleuron */
  977. buffer[0] == 0x29BE || /* Circled White Bullet */
  978. buffer[0] == 0x29BF || /* Circled Bullet */
  979. buffer[0] == 0x2660 || /* Black Spade suit */
  980. buffer[0] == 0x2661 || /* White Heart suit */
  981. buffer[0] == 0x2662 || /* White Diamond suit */
  982. buffer[0] == 0x2663 || /* Black Club suit */
  983. buffer[0] == 0x2664 || /* White Spade suit */
  984. buffer[0] == 0x2665 || /* Black Heart suit */
  985. buffer[0] == 0x2666 || /* Black Diamond suit */
  986. buffer[0] == 0x2667 || /* White Clud suit */
  987. buffer[0] == 0x1F446 || /* WHITE UP POINTING BACKHAND INDEX */
  988. buffer[0] == 0x1F447 || /* WHITE DOWN POINTING BACKHAND INDEX */
  989. buffer[0] == 0x1F448 || /* WHITE LEFT POINTING BACKHAND INDEX */
  990. buffer[0] == 0x1F449 || /* WHITE RIGHT POINTING BACKHAND INDEX */
  991. buffer[0] == 0x1f597 || /* White down pointing left hand index */
  992. buffer[0] == 0x1F598 || /* SIDEWAYS WHITE LEFT POINTING INDEX */
  993. buffer[0] == 0x1F599 || /* SIDEWAYS WHITE RIGHT POINTING INDEX */
  994. buffer[0] == 0x1F59A || /* SIDEWAYS BLACK LEFT POINTING INDEX */
  995. buffer[0] == 0x1F59B || /* SIDEWAYS BLACK RIGHT POINTING INDEX */
  996. buffer[0] == 0x1F59C || /* BLACK LEFT POINTING BACKHAND INDEX */
  997. buffer[0] == 0x1F59D || /* BLACK RIGHT POINTING BACKHAND INDEX */
  998. buffer[0] == 0x1F59E || /* SIDEWAYS WHITE UP POINTING INDEX */
  999. buffer[0] == 0x1F59F || /* SIDEWAYS WHITE DOWN POINTING INDEX */
  1000. buffer[0] == 0x1F5A0 || /* SIDEWAYS BLACK UP POINTING INDEX */
  1001. buffer[0] == 0x1F5A1 || /* SIDEWAYS BLACK DOWN POINTING INDEX */
  1002. buffer[0] == 0x1F5A2 || /* BLACK UP POINTING BACKHAND INDEX */
  1003. buffer[0] == 0x1F5A3 || /* BLACK DOWN POINTING BACKHAND INDEX */
  1004. buffer[0] == 0x1FBC1 || /* LEFT THIRD WHITE RIGHT POINTING INDEX */
  1005. buffer[0] == 0x1FBC2 || /* MIDDLE THIRD WHITE RIGHT POINTING INDEX */
  1006. buffer[0] == 0x1FBC3 || /* RIGHT THIRD WHITE RIGHT POINTING INDEX */
  1007. buffer[0] == 0xFFFD || /* UNICODE_REPLACEMENT_CHARACTER */
  1008. 0))
  1009. return BULLET;
  1010. if (!contained)
  1011. {
  1012. if (len > 2 && buffer[0] == '(' && buffer[len-1] == ')')
  1013. return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
  1014. if (len > 2 && buffer[0] == '<' && buffer[len-1] == '>')
  1015. return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
  1016. if (len > 2 && buffer[0] == '[' && buffer[len-1] == ']')
  1017. return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
  1018. if (len > 2 && buffer[0] == '{' && buffer[len-1] == '}')
  1019. return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
  1020. if (len > 1 && buffer[len-1] == ':')
  1021. return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
  1022. if (len > 1 && buffer[len-1] == ')')
  1023. return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
  1024. }
  1025. /* Look for numbers */
  1026. /* Be careful not to interpret rows of numbers, like:
  1027. * 10.02 12.03
  1028. * as bullets.
  1029. */
  1030. decimal_pos = 0;
  1031. decimals_found = 0;
  1032. for (i = 0; i < len; i++)
  1033. {
  1034. if (buffer[i] >= '0' && buffer[i] <= '9')
  1035. {
  1036. }
  1037. else if (buffer[i] == '.')
  1038. {
  1039. decimal_pos = i;
  1040. decimals_found++;
  1041. }
  1042. else
  1043. break;
  1044. }
  1045. if (i == len && decimals_found <= 1)
  1046. return NUMERICAL_BULLET;
  1047. /* or number.something */
  1048. if (decimals_found && i == decimal_pos+1 && i < len)
  1049. return is_bullet_aux(buffer+i, len-i, 0) ? BULLET : NOT_A_BULLET;;
  1050. /* Look for roman */
  1051. for (i = 0; i < len; i++)
  1052. if (!is_roman(buffer[i]))
  1053. break;
  1054. if (i == len)
  1055. return 1;
  1056. /* or roman.something */
  1057. if (buffer[i] == '.' && i < len-1)
  1058. return is_bullet_aux(buffer+i+1, len-i-1, 0) ? BULLET : NOT_A_BULLET;
  1059. /* FIXME: Others. */
  1060. return NOT_A_BULLET;
  1061. }
  1062. static bullet_t
  1063. is_bullet(int *buffer, int len)
  1064. {
  1065. return is_bullet_aux(buffer, len, 0);
  1066. }
  1067. static int
  1068. eval_buffer_for_bullet(fz_context *ctx, list_data *data, float size)
  1069. {
  1070. bullet_t bullet_type;
  1071. bullet_type = is_bullet(data->buffer, data->buffer_fill);
  1072. if (bullet_type == NUMERICAL_BULLET)
  1073. data->state = LOOKING_FOR_POST_NUMERICAL_BULLET;
  1074. else if (bullet_type)
  1075. data->state = LOOKING_FOR_POST_BULLET;
  1076. else
  1077. {
  1078. if (approx_eq(data->l, data->post_bullet_indent, size/2))
  1079. data->state = CONTINUATION_LINE;
  1080. else
  1081. data->state = NO_BULLET;
  1082. return 1;
  1083. }
  1084. return 0;
  1085. }
  1086. static int
  1087. list_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  1088. {
  1089. list_data *data = (list_data *)arg;
  1090. fz_stext_char *ch;
  1091. for (ch = line->first_char; ch != NULL; ch = ch->next)
  1092. {
  1093. fz_rect r = fz_rect_from_quad(ch->quad);
  1094. if (r.x0 < data->l)
  1095. data->l = line->bbox.x0;
  1096. switch (data->state)
  1097. {
  1098. case LOOKING_FOR_BULLET:
  1099. if (ch->c == ' ')
  1100. {
  1101. /* We have a space */
  1102. if (data->buffer_fill == 0)
  1103. continue; /* Just skip leading spaces */
  1104. if (eval_buffer_for_bullet(ctx, data, ch->size))
  1105. return 0;
  1106. }
  1107. else if (data->buffer_fill > 0 && r.x0 - data->bullet_r > ch->size/2)
  1108. {
  1109. /* We have a gap large enough to be a space while we've
  1110. * got something in the buffer. */
  1111. if (eval_buffer_for_bullet(ctx, data, ch->size))
  1112. return 0;
  1113. }
  1114. else if (data->buffer_fill < (int)nelem(data->buffer))
  1115. {
  1116. /* Stick it in the buffer for evaluation later. */
  1117. data->buffer[data->buffer_fill++] = ch->c;
  1118. }
  1119. else
  1120. {
  1121. /* Buffer overflowed. Can't be a bullet. */
  1122. if (approx_eq(data->l, data->post_bullet_indent, ch->size))
  1123. data->state = CONTINUATION_LINE;
  1124. else
  1125. data->state = NO_BULLET;
  1126. return 0;
  1127. }
  1128. data->bullet_r = r.x1;
  1129. break;
  1130. case LOOKING_FOR_POST_BULLET:
  1131. if (ch->c != ' ')
  1132. {
  1133. data->state = FOUND_BULLET;
  1134. if (data->bullet_line_start == NULL)
  1135. data->bullet_line_start = data->this_line_start;
  1136. data->post_bullet_indent = r.x0;
  1137. }
  1138. break;
  1139. case LOOKING_FOR_POST_NUMERICAL_BULLET:
  1140. if (ch->c >= '0' && ch->c <= '9')
  1141. {
  1142. /* Numerical bullets can't be followed by numbers. */
  1143. if (approx_eq(data->l, data->post_bullet_indent, ch->size))
  1144. data->state = CONTINUATION_LINE;
  1145. else
  1146. data->state = NO_BULLET;
  1147. return 0;
  1148. }
  1149. if (ch->c != ' ')
  1150. {
  1151. data->state = FOUND_BULLET;
  1152. if (data->bullet_line_start == NULL)
  1153. data->bullet_line_start = data->this_line_start;
  1154. data->post_bullet_indent = r.x0;
  1155. }
  1156. break;
  1157. default:
  1158. break;
  1159. }
  1160. }
  1161. return 0;
  1162. }
  1163. static void
  1164. list_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
  1165. {
  1166. list_data *data = (list_data *)arg;
  1167. if (data->state == LOOKING_FOR_BULLET)
  1168. {
  1169. eval_buffer_for_bullet(ctx, data, 0);
  1170. /* If we ended up thinking we'd found a bullet, subject to
  1171. * what follows not being of a specific form, then we're
  1172. * fine, because nothing follows us! */
  1173. if (data->state == LOOKING_FOR_POST_NUMERICAL_BULLET ||
  1174. data->state == LOOKING_FOR_POST_BULLET)
  1175. {
  1176. data->state = FOUND_BULLET;
  1177. if (data->bullet_line_start == NULL)
  1178. data->bullet_line_start = data->this_line_start;
  1179. }
  1180. /* FIXME: This block contains just a bullet - not the content
  1181. * for the bullet. We see this with page-12.pdf.
  1182. * <> Rising commitment to battery...
  1183. * committed to in-house battery...
  1184. * developing and manufacturing...
  1185. *
  1186. * The <> is in a whole different DIV to the following text.
  1187. * Really we want to look for if the "next" content (for some
  1188. * definition of next) is on the same line as the bullet. If
  1189. * it is, we want to merge the 2 divs.
  1190. *
  1191. * But that's a really tricky thing to do given the recursive
  1192. * block walk we are current doing. Think about this.
  1193. * For now, we just mark the <> as being a list item.
  1194. */
  1195. }
  1196. if (data->state == FOUND_BULLET)
  1197. {
  1198. if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
  1199. {
  1200. /* We need to split the block before the start of the bullet. */
  1201. (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
  1202. data->changed = 1;
  1203. return;
  1204. }
  1205. if (data->bullet_line_start != data->this_line_start)
  1206. {
  1207. /* We've found a second bullet. Break before the line. */
  1208. (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
  1209. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
  1210. data->changed = 1;
  1211. return;
  1212. }
  1213. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
  1214. }
  1215. else if (data->state == NO_BULLET && data->bullet_line_start)
  1216. {
  1217. /* We've found a bullet before, and the line we've just completed
  1218. * is neither a new bullet line, or a continuation so, we need to
  1219. * break that into a new block. */
  1220. (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
  1221. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
  1222. data->changed = 1;
  1223. return;
  1224. }
  1225. else if (data->bullet_line_start)
  1226. {
  1227. /* We've come to the end of the block still in the list item. */
  1228. block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
  1229. }
  1230. }
  1231. static int
  1232. break_list_items(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
  1233. {
  1234. list_data data[1];
  1235. if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
  1236. return 0;
  1237. data->pos = pos;
  1238. data->state = LOOKING_FOR_BULLET;
  1239. data->buffer_fill = 0;
  1240. data->l = block->bbox.x1;
  1241. data->bullet_line_start = NULL;
  1242. data->this_line_start = block->u.t.first_line;
  1243. data->bullet_r = block->bbox.x0;
  1244. data->changed = 0;
  1245. line_walker(ctx, block, list_newline, list_line, list_end, data);
  1246. return data->changed;
  1247. }
  1248. static int
  1249. is_header(fz_structure s)
  1250. {
  1251. return (s == FZ_STRUCTURE_H ||
  1252. s == FZ_STRUCTURE_H1 ||
  1253. s == FZ_STRUCTURE_H2 ||
  1254. s == FZ_STRUCTURE_H3 ||
  1255. s == FZ_STRUCTURE_H4 ||
  1256. s == FZ_STRUCTURE_H5 ||
  1257. s == FZ_STRUCTURE_H6);
  1258. }
  1259. static void
  1260. do_para_break(fz_context *ctx, fz_stext_page *page, fz_stext_block **pfirst, fz_stext_block **plast, fz_stext_struct *parent, int in_header)
  1261. {
  1262. fz_stext_block *block, *next_block;
  1263. stext_pos pos;
  1264. fz_rect bbox;
  1265. pos.pool = page->pool;
  1266. pos.idx = 0;
  1267. pos.pfirst = pfirst;
  1268. pos.plast = plast;
  1269. pos.parent = parent;
  1270. /* First off, in order for us to consider a block to be suitable for paragraph
  1271. * splitting, we want it to be a series of lines moving down the page, (or left
  1272. * to right within a line). */
  1273. for (block = *pfirst; block != NULL; block = next_block)
  1274. {
  1275. next_block = block->next;
  1276. switch (block->type)
  1277. {
  1278. case FZ_STEXT_BLOCK_STRUCT:
  1279. if (block->u.s.index < pos.idx)
  1280. block->u.s.index = pos.idx++;
  1281. else
  1282. pos.idx = block->u.s.index+1;
  1283. if (block->u.s.down)
  1284. {
  1285. int header = in_header | is_header(block->u.s.down->standard);
  1286. do_para_break(ctx, page, &block->u.s.down->first_block, &block->u.s.down->last_block, block->u.s.down, header);
  1287. }
  1288. break;
  1289. case FZ_STEXT_BLOCK_TEXT:
  1290. if (!lines_move_plausibly_like_paragraph(block))
  1291. break;
  1292. #ifdef DEBUG_SPLITS
  1293. dump_block(ctx, "Around the top level block loop:", block);
  1294. #endif
  1295. /* Firstly, and somewhat annoyingly we need to find the bbox of the
  1296. * block that doesn't include for trailing spaces. If we just use
  1297. * the normal bbox, then lines that end in "foo " will end further
  1298. * to the right of lines that end in "ba-", and consequently we'll
  1299. * fail to detect blocks as being justified.
  1300. * See PMC2656817_00002.pdf as an example. */
  1301. bbox = text_block_marked_bbox(ctx, block);
  1302. #ifdef DEBUG_PARA_SPLITS
  1303. {
  1304. fz_stext_line *line;
  1305. for (line = block->u.t.first_line; line != NULL; line = line->next)
  1306. {
  1307. fz_stext_char *ch;
  1308. for (ch = line->first_char; ch != NULL; ch = ch->next)
  1309. {
  1310. fz_write_printf(ctx, fz_stddbg(ctx), "%C", ch->c);
  1311. }
  1312. }
  1313. }
  1314. #endif
  1315. /* Think about breaking lines at Titles. */
  1316. /* First, underlined ones. */
  1317. if (detect_underlined_titles(ctx, &pos, block))
  1318. next_block = block->next; /* We split the block! */
  1319. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1320. {
  1321. next_block = block;
  1322. break;
  1323. }
  1324. #ifdef DEBUG_PARA_SPLITS
  1325. fz_write_printf(ctx, fz_stddbg(ctx), "A");
  1326. #endif
  1327. /* Next, ones that use bold fonts. */
  1328. if (!in_header)
  1329. {
  1330. if (detect_titles_by_font_usage(ctx, &pos, block))
  1331. next_block = block->next; /* We split the block! */
  1332. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1333. {
  1334. next_block = block;
  1335. break;
  1336. }
  1337. }
  1338. #ifdef DEBUG_PARA_SPLITS
  1339. fz_write_printf(ctx, fz_stddbg(ctx), "B");
  1340. #endif
  1341. /* Now look at breaking based upon indents */
  1342. if (break_paragraphs_by_indent(ctx, &pos, block, bbox))
  1343. next_block = block->next; /* We split the block! */
  1344. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1345. {
  1346. next_block = block;
  1347. break;
  1348. }
  1349. #ifdef DEBUG_PARA_SPLITS
  1350. fz_write_printf(ctx, fz_stddbg(ctx), "C");
  1351. #endif
  1352. /* Now we're going to look for unindented paragraphs. We do this by
  1353. * considering if the first word on the next line would have fitted
  1354. * into the space left at the end of the previous line. */
  1355. if (break_paragraphs_by_analysing_trailing_gaps(ctx, &pos, block, bbox))
  1356. next_block = block->next; /* We split the block! */
  1357. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1358. {
  1359. next_block = block;
  1360. break;
  1361. }
  1362. #ifdef DEBUG_PARA_SPLITS
  1363. fz_write_printf(ctx, fz_stddbg(ctx), "D");
  1364. #endif
  1365. /* Now look to see if a block looks like fully justified text. If it
  1366. * does, then any line that doesn't reach the right hand side must be
  1367. * a paragraph break. */
  1368. if (break_paragraphs_within_justified_text(ctx, &pos, block, bbox))
  1369. next_block = block->next; /* We split the block! */
  1370. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1371. {
  1372. next_block = block;
  1373. break;
  1374. }
  1375. #ifdef DEBUG_PARA_SPLITS
  1376. fz_write_printf(ctx, fz_stddbg(ctx), "E");
  1377. #endif
  1378. /* Look for bulleted list items. */
  1379. if (break_list_items(ctx, &pos, block))
  1380. next_block = block->next; /* We split the block! */
  1381. break;
  1382. }
  1383. }
  1384. }
  1385. void
  1386. fz_paragraph_break(fz_context *ctx, fz_stext_page *page)
  1387. {
  1388. do_para_break(ctx, page, &page->first_block, &page->last_block, NULL, 0);
  1389. }