stext-boxer.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956
  1. // Copyright (C) 2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #undef DEBUG_WRITE_AS_PS
  24. #undef DEBUG_STRUCT
  25. typedef struct boxer_s boxer_t;
  26. typedef struct {
  27. int len;
  28. int max;
  29. fz_rect list[FZ_FLEXIBLE_ARRAY];
  30. } rectlist_t;
  31. struct boxer_s {
  32. fz_rect mediabox;
  33. rectlist_t *list;
  34. };
  35. static int fz_rect_contains_rect(fz_rect a, fz_rect b)
  36. {
  37. if (a.x0 > b.x0)
  38. return 0;
  39. if (a.y0 > b.y0)
  40. return 0;
  41. if (a.x1 < b.x1)
  42. return 0;
  43. if (a.y1 < b.y1)
  44. return 0;
  45. return 1;
  46. }
  47. static rectlist_t *
  48. rectlist_create(fz_context *ctx, int max)
  49. {
  50. rectlist_t *list = fz_malloc_flexible(ctx, rectlist_t, list, max);
  51. list->len = 0;
  52. list->max = max;
  53. return list;
  54. }
  55. /* Push box onto rectlist, unless it is completely enclosed by
  56. * another box, or completely encloses others (in which case they
  57. * are replaced by it). */
  58. static void
  59. rectlist_append(rectlist_t *list, fz_rect *box)
  60. {
  61. int i;
  62. for (i = 0; i < list->len; i++)
  63. {
  64. fz_rect *r = &list->list[i];
  65. fz_rect smaller, larger;
  66. /* We allow ourselves a fudge factor of 4 points when checking for inclusion. */
  67. double r_fudge = 4;
  68. smaller.x0 = r->x0 + r_fudge;
  69. larger. x0 = r->x0 - r_fudge;
  70. smaller.y0 = r->y0 + r_fudge;
  71. larger. y0 = r->y0 - r_fudge;
  72. smaller.x1 = r->x1 - r_fudge;
  73. larger. x1 = r->x1 + r_fudge;
  74. smaller.y1 = r->y1 - r_fudge;
  75. larger. y1 = r->y1 + r_fudge;
  76. if (fz_rect_contains_rect(larger, *box))
  77. return; /* box is enclosed! Nothing to do. */
  78. if (fz_rect_contains_rect(*box, smaller))
  79. {
  80. /* box encloses r. Ditch r. */
  81. /* Shorten the list */
  82. --list->len;
  83. /* If the one that just got chopped off wasn't r, move it down. */
  84. if (i < list->len)
  85. {
  86. memcpy(r, &list->list[list->len], sizeof(*r));
  87. i--; /* Reconsider this entry next time. */
  88. }
  89. }
  90. }
  91. assert(list->len < list->max);
  92. memcpy(&list->list[list->len], box, sizeof(*box));
  93. list->len++;
  94. }
  95. static boxer_t *
  96. boxer_create_length(fz_context *ctx, fz_rect *mediabox, int len)
  97. {
  98. boxer_t *boxer = fz_malloc_struct(ctx, boxer_t);
  99. if (boxer == NULL)
  100. return NULL;
  101. memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox));
  102. boxer->list = rectlist_create(ctx, len);
  103. return boxer;
  104. }
  105. /* Create a boxer structure for a page of size mediabox. */
  106. static boxer_t *
  107. boxer_create(fz_context *ctx, fz_rect *mediabox)
  108. {
  109. boxer_t *boxer = boxer_create_length(ctx, mediabox, 1);
  110. if (boxer == NULL)
  111. return NULL;
  112. rectlist_append(boxer->list, mediabox);
  113. return boxer;
  114. }
  115. static void
  116. push_if_intersect_suitable(rectlist_t *dst, const fz_rect *a, const fz_rect *b)
  117. {
  118. fz_rect c;
  119. /* Intersect a and b. */
  120. c = fz_intersect_rect(*a, *b);
  121. /* If no intersection, nothing to push. */
  122. if (!fz_is_valid_rect(c))
  123. return;
  124. rectlist_append(dst, &c);
  125. }
  126. static void
  127. boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const fz_rect *box)
  128. {
  129. int i;
  130. for (i = 0; i < src->len; i++)
  131. push_if_intersect_suitable(dst, &src->list[i], box);
  132. }
  133. /* Mark a given box as being occupied (typically by a glyph) */
  134. static void boxer_feed(fz_context *ctx, boxer_t *boxer, fz_rect *bbox)
  135. {
  136. fz_rect box;
  137. /* When we feed a box into a the boxer, we can never make
  138. * the list more than 4 times as long. */
  139. rectlist_t *newlist = rectlist_create(ctx, boxer->list->len * 4);
  140. #ifdef DEBUG_WRITE_AS_PS
  141. printf("0 0 1 setrgbcolor\n");
  142. printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n",
  143. bbox->x0, bbox->y0,
  144. bbox->x0, bbox->y1,
  145. bbox->x1, bbox->y1,
  146. bbox->x1, bbox->y0
  147. );
  148. #endif
  149. /* Left (0,0) (x0,H) */
  150. box.x0 = boxer->mediabox.x0;
  151. box.y0 = boxer->mediabox.y0;
  152. box.x1 = bbox->x0;
  153. box.y1 = boxer->mediabox.y1;
  154. boxlist_feed_intersect(newlist, boxer->list, &box);
  155. /* Right (x1,0) (W,H) */
  156. box.x0 = bbox->x1;
  157. box.y0 = boxer->mediabox.y0;
  158. box.x1 = boxer->mediabox.x1;
  159. box.y1 = boxer->mediabox.y1;
  160. boxlist_feed_intersect(newlist, boxer->list, &box);
  161. /* Bottom (0,0) (W,y0) */
  162. box.x0 = boxer->mediabox.x0;
  163. box.y0 = boxer->mediabox.y0;
  164. box.x1 = boxer->mediabox.x1;
  165. box.y1 = bbox->y0;
  166. boxlist_feed_intersect(newlist, boxer->list, &box);
  167. /* Top (0,y1) (W,H) */
  168. box.x0 = boxer->mediabox.x0;
  169. box.y0 = bbox->y1;
  170. box.x1 = boxer->mediabox.x1;
  171. box.y1 = boxer->mediabox.y1;
  172. boxlist_feed_intersect(newlist, boxer->list, &box);
  173. fz_free(ctx, boxer->list);
  174. boxer->list = newlist;
  175. }
  176. #ifdef DEBUG_WRITE_AS_PS
  177. static int
  178. compare_areas(const void *a_, const void *b_)
  179. {
  180. const fz_rect *a = (const fz_rect *)a_;
  181. const fz_rect *b = (const fz_rect *)b_;
  182. double area_a = (a->x1-a->x0) * (a->y1-a->y0);
  183. double area_b = (b->x1-b->x0) * (b->y1-b->y0);
  184. if (area_a < area_b)
  185. return 1;
  186. else if (area_a > area_b)
  187. return -1;
  188. else
  189. return 0;
  190. }
  191. /* Sort the rectangle list to be largest area first. For ease of humans
  192. * reading debug output. */
  193. static void boxer_sort(boxer_t *boxer)
  194. {
  195. qsort(boxer->list->list, boxer->list->len, sizeof(fz_rect), compare_areas);
  196. }
  197. /* Get the rectangle list for a given boxer. Return value is the length of
  198. * the list. Lifespan is until the boxer is modified or freed. */
  199. static int boxer_results(boxer_t *boxer, fz_rect **list)
  200. {
  201. *list = boxer->list->list;
  202. return boxer->list->len;
  203. }
  204. #endif
  205. /* Currently unused debugging routine */
  206. #if 0
  207. static void
  208. boxer_dump(fz_context *ctx, boxer_t *boxer)
  209. {
  210. int i;
  211. printf("bbox = %g %g %g %g\n", boxer->mediabox.x0, boxer->mediabox.y0, boxer->mediabox.x1, boxer->mediabox.y1);
  212. for (i = 0; i < boxer->list->len; i++)
  213. {
  214. printf("%d %g %g %g %g\n", i, boxer->list->list[i].x0, boxer->list->list[i].y0, boxer->list->list[i].x1, boxer->list->list[i].y1);
  215. }
  216. }
  217. #endif
  218. /* Destroy a boxer. */
  219. static void boxer_destroy(fz_context *ctx, boxer_t *boxer)
  220. {
  221. if (!boxer)
  222. return;
  223. fz_free(ctx, boxer->list);
  224. fz_free(ctx, boxer);
  225. }
  226. /* Find the margins for a given boxer. */
  227. static fz_rect boxer_margins(boxer_t *boxer)
  228. {
  229. rectlist_t *list = boxer->list;
  230. int i;
  231. fz_rect margins = boxer->mediabox;
  232. for (i = 0; i < list->len; i++)
  233. {
  234. fz_rect *r = &list->list[i];
  235. if (r->x0 <= margins.x0 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
  236. margins.x0 = r->x1; /* Left Margin */
  237. else if (r->x1 >= margins.x1 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
  238. margins.x1 = r->x0; /* Right Margin */
  239. else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y0 <= margins.y0)
  240. margins.y0 = r->y1; /* Top Margin */
  241. else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y1 >= margins.y1)
  242. margins.y1 = r->y0; /* Bottom Margin */
  243. }
  244. return margins;
  245. }
  246. /* Create a new boxer from a subset of an old one. */
  247. static boxer_t *boxer_subset(fz_context *ctx, boxer_t *boxer, fz_rect rect)
  248. {
  249. boxer_t *new_boxer = boxer_create_length(ctx, &rect, boxer->list->len);
  250. int i;
  251. if (new_boxer == NULL)
  252. return NULL;
  253. for (i = 0; i < boxer->list->len; i++)
  254. {
  255. fz_rect r = fz_intersect_rect(boxer->list->list[i], rect);
  256. if (fz_is_empty_rect(r))
  257. continue;
  258. rectlist_append(new_boxer->list, &r);
  259. }
  260. return new_boxer;
  261. }
  262. static int analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth);
  263. static void
  264. analyse_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, fz_rect r, int depth)
  265. {
  266. boxer_t *sub_box = boxer_subset(ctx, boxer, r);
  267. fz_try(ctx)
  268. (void)analyse_sub(ctx, page, first_block, last_block, sub_box, depth);
  269. fz_always(ctx)
  270. boxer_destroy(ctx, sub_box);
  271. fz_catch(ctx)
  272. fz_rethrow(ctx);
  273. }
  274. /* Consider a boxer for subdivision.
  275. * Returns 0 if no suitable subdivision point found.
  276. * Returns 1 if a subdivision point is found.*/
  277. static int
  278. boxer_subdivide(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, int depth)
  279. {
  280. rectlist_t *list = boxer->list;
  281. double max_h = 0, max_v = 0;
  282. int i;
  283. for (i = 0; i < list->len; i++)
  284. {
  285. fz_rect r = boxer->list->list[i];
  286. if (r.x0 <= boxer->mediabox.x0 && r.x1 >= boxer->mediabox.x1)
  287. {
  288. /* Horizontal divider */
  289. double size = r.y1 - r.y0;
  290. if (size > max_h)
  291. {
  292. max_h = size;
  293. }
  294. }
  295. if (r.y0 <= boxer->mediabox.y0 && r.y1 >= boxer->mediabox.y1)
  296. {
  297. /* Vertical divider */
  298. double size = r.x1 - r.x0;
  299. if (size > max_v)
  300. {
  301. max_v = size;
  302. }
  303. }
  304. }
  305. if (max_h > max_v)
  306. {
  307. fz_rect r;
  308. float min_gap;
  309. float top;
  310. /* Divider runs horizontally. */
  311. /* We want to list out all the horizontal subregions that are separated
  312. * by an appropriate gap, from top to bottom. */
  313. /* Any gap larger than the the gap we ignored will do. */
  314. min_gap = max_v;
  315. /* We're going to need to run through the data multiple times to find the
  316. * topmost block each time. We'll use 'top' to cull against, and gradually
  317. * lower that after each successful pass. */
  318. top = boxer->mediabox.y0;
  319. while (1)
  320. {
  321. /* Find the topmost divider below 'top' of height at least min_gap */
  322. float found_top;
  323. int found = -1;
  324. for (i = 0; i < list->len; i++)
  325. {
  326. fz_rect *b = &list->list[i];
  327. if (b->x0 <= boxer->mediabox.x0 && boxer->mediabox.x1 <= b->x1 && b->y0 > top && b->y1 - b->y0 >= min_gap)
  328. {
  329. if (found == -1 || b->y0 < found_top)
  330. {
  331. found = i;
  332. found_top = b->y0;
  333. }
  334. }
  335. }
  336. /* If we failed to find one, we're done. */
  337. if (found == -1)
  338. break;
  339. /* So we have a region from top to found_top */
  340. r = boxer->mediabox;
  341. r.y0 = top;
  342. r.y1 = found_top;
  343. analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
  344. /* Now move top down for the next go. */
  345. top = list->list[found].y1;
  346. }
  347. /* One final region, from top to bottom */
  348. r = boxer->mediabox;
  349. r.y0 = top;
  350. analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
  351. return 1;
  352. }
  353. else if (max_v > 0)
  354. {
  355. fz_rect r;
  356. float min_gap;
  357. float left;
  358. /* Divider runs vertically. */
  359. /* We want to list out all the vertical subregions that are separated
  360. * by an appropriate gap, from left to right. */
  361. /* Any gap larger than the the gap we ignored will do. */
  362. min_gap = max_h;
  363. /* We're going to need to run through the data multiple times to find the
  364. * leftmost block each time. We'll use 'left' to cull against, and gradually
  365. * lower that after each successful pass. */
  366. left = boxer->mediabox.x0;
  367. while (1)
  368. {
  369. /* Find the leftmost divider to the right of 'left' of width at least min_gap */
  370. float found_left;
  371. int found = -1;
  372. for (i = 0; i < list->len; i++)
  373. {
  374. fz_rect *b = &list->list[i];
  375. if (b->y0 <= boxer->mediabox.y0 && boxer->mediabox.y1 <= b->y1 && b->x0 > left && b->x1 - b->x0 >= min_gap)
  376. {
  377. if (found == -1 || b->x0 < found_left)
  378. {
  379. found = i;
  380. found_left = b->x0;
  381. }
  382. }
  383. }
  384. /* If we failed to find one, we're done. */
  385. if (found == -1)
  386. break;
  387. /* So we have a region from top to found_top */
  388. r = boxer->mediabox;
  389. r.x0 = left;
  390. r.x1 = found_left;
  391. analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
  392. /* Now move left right for the next go. */
  393. left = list->list[found].x1;
  394. }
  395. /* One final region, from left to right */
  396. r = boxer->mediabox;
  397. r.x0 = left;
  398. analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
  399. return 1;
  400. }
  401. return 0;
  402. }
  403. static void
  404. new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
  405. {
  406. fz_stext_struct *str;
  407. size_t z;
  408. if (raw == NULL)
  409. raw = "";
  410. z = strlen(raw);
  411. str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1);
  412. str->first_block = NULL;
  413. str->last_block = NULL;
  414. str->standard = standard;
  415. str->parent = page->last_struct;
  416. str->up = block;
  417. memcpy(str->raw, raw, z+1);
  418. block->u.s.down = str;
  419. }
  420. #ifdef DEBUG_STRUCT
  421. static void
  422. do_dump_stext(fz_stext_block *block, int depth)
  423. {
  424. int d;
  425. while (block)
  426. {
  427. for (d = 0; d < depth; d++)
  428. printf(" ");
  429. switch(block->type)
  430. {
  431. case FZ_STEXT_BLOCK_TEXT:
  432. printf("TEXT %p\n", block);
  433. break;
  434. case FZ_STEXT_BLOCK_IMAGE:
  435. printf("IMAGE %p\n", block);
  436. break;
  437. case FZ_STEXT_BLOCK_VECTOR:
  438. printf("VECTOR %p\n", block);
  439. break;
  440. case FZ_STEXT_BLOCK_STRUCT:
  441. printf("STRUCT %p\n", block);
  442. do_dump_stext(block->u.s.down->first_block, depth+1);
  443. break;
  444. }
  445. block = block->next;
  446. }
  447. }
  448. static void
  449. dump_stext(char *str, fz_stext_block *block)
  450. {
  451. printf("%s\n", str);
  452. do_dump_stext(block, 0);
  453. }
  454. #endif
  455. static void
  456. recalc_bbox(fz_stext_block *block)
  457. {
  458. fz_rect bbox = fz_empty_rect;
  459. fz_stext_line *line;
  460. for (line = block->u.t.first_line; line != NULL; line = line->next)
  461. bbox = fz_union_rect(bbox, line->bbox);
  462. block->bbox = bbox;
  463. }
  464. static fz_stext_struct *
  465. page_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, fz_rect mediabox)
  466. {
  467. fz_stext_block *block, *next_block;
  468. fz_stext_block *target = NULL;
  469. fz_stext_block *last = NULL;
  470. fz_stext_block *newblock;
  471. int idx = 0;
  472. #ifdef DEBUG_STRUCT
  473. dump_stext("BEFORE", *first_block);
  474. #endif
  475. for (block = *first_block; block != NULL; block = next_block)
  476. {
  477. fz_rect bbox;
  478. next_block = block->next;
  479. if (block->type != FZ_STEXT_BLOCK_TEXT && block->type != FZ_STEXT_BLOCK_VECTOR)
  480. continue;
  481. bbox = block->bbox;
  482. /* Can we take the whole block? */
  483. if (bbox.x0 >= mediabox.x0 && bbox.y0 >= mediabox.y0 && bbox.x1 <= mediabox.x1 && bbox.y1 <= mediabox.y1)
  484. {
  485. /* Unlink block from the current list. */
  486. if (block->prev)
  487. block->prev->next = next_block;
  488. else
  489. *first_block = next_block;
  490. if (next_block)
  491. next_block->prev = block->prev;
  492. else
  493. *last_block = block->prev;
  494. /* Add block onto our target list */
  495. if (target == NULL)
  496. {
  497. target = block;
  498. block->prev = NULL;
  499. }
  500. else
  501. {
  502. last->next = block;
  503. block->prev = last;
  504. }
  505. last = block;
  506. block->next = NULL;
  507. }
  508. else if (block->type == FZ_STEXT_BLOCK_TEXT && !fz_is_empty_rect(fz_intersect_rect(bbox, mediabox)))
  509. {
  510. /* Need to look at the parts. */
  511. fz_stext_line *line, *next_line;
  512. newblock = NULL;
  513. for (line = block->u.t.first_line; line != NULL; line = next_line)
  514. {
  515. next_line = line->next;
  516. if (line->bbox.x0 >= mediabox.x0 && line->bbox.y0 >= mediabox.y0 && line->bbox.x1 <= mediabox.x1 && line->bbox.y1 <= mediabox.y1)
  517. {
  518. /* We need to take this line */
  519. if (newblock == NULL)
  520. {
  521. newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
  522. /* Add the block onto our target list */
  523. if (target == NULL)
  524. {
  525. target = newblock;
  526. }
  527. else
  528. {
  529. last->next = newblock;
  530. newblock->prev = last;
  531. }
  532. last = newblock;
  533. }
  534. /* Unlink line from the current list. */
  535. if (line->prev)
  536. line->prev->next = next_line;
  537. else
  538. block->u.t.first_line = next_line;
  539. if (next_line)
  540. next_line->prev = line->prev;
  541. else
  542. block->u.t.last_line = line->prev;
  543. /* Add line onto our new block */
  544. if (newblock->u.t.last_line == NULL)
  545. {
  546. newblock->u.t.first_line = newblock->u.t.last_line = line;
  547. line->prev = NULL;
  548. }
  549. else
  550. {
  551. line->prev = newblock->u.t.last_line;
  552. newblock->u.t.last_line->next = line;
  553. newblock->u.t.last_line = line;
  554. }
  555. line->next = NULL;
  556. }
  557. }
  558. if (newblock)
  559. {
  560. recalc_bbox(block);
  561. recalc_bbox(newblock);
  562. }
  563. }
  564. }
  565. /* If no content to add, bale! */
  566. if (target == NULL)
  567. return NULL;
  568. /* We want to insert a structure node that contains target as the last structure
  569. * node on this blocklist. Find the first block that's not a structure block. */
  570. for (block = *first_block; block != NULL; block = block->next)
  571. {
  572. if (block->type != FZ_STEXT_BLOCK_STRUCT)
  573. break;
  574. idx++;
  575. }
  576. /* So we want to insert just before block. */
  577. /* We are going to need to create a new block. Create a complete unlinked one here. */
  578. newblock = fz_pool_alloc(ctx, page->pool, sizeof *newblock);
  579. newblock->bbox = fz_empty_rect;
  580. newblock->prev = block ? block->prev : *last_block;
  581. newblock->next = block;
  582. newblock->type = FZ_STEXT_BLOCK_STRUCT;
  583. newblock->u.s.index = idx;
  584. newblock->u.s.down = NULL;
  585. /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
  586. /* And create a new struct and have newblock point to it. */
  587. new_stext_struct(ctx, page, newblock, FZ_STRUCTURE_DIV, "Split");
  588. /* Now insert newblock just before block */
  589. /* If block was first, now we are. */
  590. if (*first_block == block)
  591. *first_block = newblock;
  592. if (block == NULL)
  593. {
  594. /* Inserting at the end! */
  595. if (*last_block)
  596. (*last_block)->next = newblock;
  597. *last_block = newblock;
  598. }
  599. else
  600. {
  601. if (block->prev)
  602. block->prev->next = newblock;
  603. block->prev = newblock;
  604. }
  605. newblock->u.s.down->first_block = target;
  606. target->prev = NULL;
  607. for (block = target; block->next != NULL; block = block->next)
  608. newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
  609. newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
  610. newblock->u.s.down->last_block = block;
  611. #ifdef DEBUG_STRUCT
  612. dump_stext("AFTER", *first_block);
  613. #endif
  614. return newblock->u.s.down;
  615. }
  616. enum {
  617. MAX_ANALYSIS_DEPTH = 6
  618. };
  619. static int
  620. analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth)
  621. {
  622. fz_rect margins;
  623. boxer_t *boxer;
  624. boxer_t *boxer1 = NULL;
  625. boxer_t *boxer2 = NULL;
  626. fz_stext_struct *div;
  627. int ret = 0;
  628. /* Find the margins in the enclosing boxer. This returns
  629. * a subset of the bbox of the original. */
  630. margins = boxer_margins(big_boxer);
  631. #ifdef DEBUG_WRITE_AS_PS
  632. printf("\n\n%% MARGINS %g %g %g %g\n", margins.x0, margins.y0, margins.x1, margins.y1);
  633. #endif
  634. /* Now subset the rectangles just to include those that are in our bbox. */
  635. boxer = boxer_subset(ctx, big_boxer, margins);
  636. fz_var(boxer1);
  637. fz_var(boxer2);
  638. fz_try(ctx)
  639. {
  640. div = page_subset(ctx, page, first_block, last_block, boxer->mediabox);
  641. /* If nothing subsetted (no textual content in that region), give up. */
  642. if (div == NULL)
  643. break;
  644. ret = 1;
  645. if (depth < MAX_ANALYSIS_DEPTH)
  646. {
  647. /* Can we subdivide that region any more? */
  648. if (boxer_subdivide(ctx, page, &div->first_block, &div->last_block, boxer, depth+1))
  649. break;
  650. }
  651. #ifdef DEBUG_WRITE_AS_PS
  652. {
  653. int i, n;
  654. fz_rect *list;
  655. boxer_sort(boxer);
  656. n = boxer_results(boxer, &list);
  657. printf("%% SUBDIVISION\n");
  658. for (i = 0; i < n; i++)
  659. {
  660. printf("%% %g %g %g %g\n",
  661. list[i].x0, list[i].y0, list[i].x1, list[i].y1);
  662. }
  663. printf("0 0 0 setrgbcolor\n");
  664. for (i = 0; i < n; i++) {
  665. printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
  666. list[i].x0, list[i].y0,
  667. list[i].x0, list[i].y1,
  668. list[i].x1, list[i].y1,
  669. list[i].x1, list[i].y0);
  670. }
  671. printf("1 0 0 setrgbcolor\n");
  672. printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
  673. margins.x0, margins.y0,
  674. margins.x0, margins.y1,
  675. margins.x1, margins.y1,
  676. margins.x1, margins.y0);
  677. }
  678. #endif
  679. }
  680. fz_always(ctx)
  681. {
  682. boxer_destroy(ctx, boxer1);
  683. boxer_destroy(ctx, boxer2);
  684. boxer_destroy(ctx, boxer);
  685. }
  686. fz_catch(ctx)
  687. fz_rethrow(ctx);
  688. return ret;
  689. }
  690. static int
  691. line_isnt_all_spaces(fz_context *ctx, fz_stext_line *line)
  692. {
  693. fz_stext_char *ch;
  694. for (ch = line->first_char; ch != NULL; ch = ch->next)
  695. if (ch->c != 32 && ch->c != 160)
  696. return 1;
  697. return 0;
  698. }
  699. static void
  700. feed_line(fz_context *ctx, boxer_t *boxer, fz_stext_line *line)
  701. {
  702. fz_stext_char *ch;
  703. for (ch = line->first_char; ch != NULL; ch = ch->next)
  704. {
  705. fz_rect r = fz_empty_rect;
  706. if (ch->c == ' ')
  707. continue;
  708. do
  709. {
  710. fz_rect bbox = fz_rect_from_quad(ch->quad);
  711. float margin = ch->size/2;
  712. bbox.x0 -= margin;
  713. bbox.y0 -= margin;
  714. bbox.x1 += margin;
  715. bbox.y1 += margin;
  716. r = fz_union_rect(r, bbox);
  717. ch = ch->next;
  718. }
  719. while (ch != NULL && ch->c != ' ');
  720. boxer_feed(ctx, boxer, &r);
  721. if (ch == NULL)
  722. break;
  723. }
  724. }
  725. /* Internal, non-API function, shared with stext-table. */
  726. fz_rect
  727. fz_collate_small_vector_run(fz_stext_block **blockp)
  728. {
  729. fz_stext_block *block = *blockp;
  730. fz_stext_block *next;
  731. fz_rect r = block->bbox;
  732. int MAX_SIZE = 2;
  733. int MAX_GAP = 2;
  734. float w = r.x1 - r.x0;
  735. float h = r.y1 - r.y0;
  736. if (w < MAX_SIZE)
  737. {
  738. while ((next = block->next) != NULL &&
  739. next->type == FZ_STEXT_BLOCK_VECTOR &&
  740. next->bbox.x0 == r.x0 &&
  741. next->bbox.x1 == r.x1 &&
  742. ((next->bbox.y1 > r.y1 && next->bbox.y0 <= r.y1 + MAX_GAP) ||
  743. (next->bbox.y0 < r.y0 && next->bbox.y1 >= r.y0 - MAX_GAP)))
  744. {
  745. r = fz_union_rect(r, next->bbox);
  746. block = next;
  747. }
  748. }
  749. if (h < MAX_SIZE)
  750. {
  751. while ((next = block->next) != NULL &&
  752. next->type == FZ_STEXT_BLOCK_VECTOR &&
  753. next->bbox.y0 == r.y0 &&
  754. next->bbox.y1 == r.y1 &&
  755. ((next->bbox.x1 > r.x1 && next->bbox.x0 <= r.x1 + MAX_GAP) ||
  756. (next->bbox.x0 < r.x0 && next->bbox.x1 >= r.x0 - MAX_GAP)))
  757. {
  758. r = fz_union_rect(r, next->bbox);
  759. block = next;
  760. }
  761. }
  762. *blockp = block;
  763. return r;
  764. }
  765. int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page)
  766. {
  767. boxer_t *boxer;
  768. fz_stext_block *block;
  769. int ret = 0;
  770. /* If we have structure already, give up. We can't hope to beat
  771. * proper structure! */
  772. for (block = page->first_block; block != NULL; block = block->next)
  773. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  774. return 0;
  775. #ifdef DEBUG_WRITE_AS_PS
  776. printf("1 -1 scale 0 -%g translate\n", page->mediabox.y1-page->mediabox.y0);
  777. #endif
  778. boxer = boxer_create(ctx, &page->mediabox);
  779. fz_try(ctx)
  780. {
  781. /* Just walking the blocks is safe as we're assuming no structure here. */
  782. for (block = page->first_block; block != NULL; block = block->next)
  783. {
  784. fz_stext_line *line;
  785. switch (block->type)
  786. {
  787. case FZ_STEXT_BLOCK_TEXT:
  788. for (line = block->u.t.first_line; line != NULL; line = line->next)
  789. if (line_isnt_all_spaces(ctx, line))
  790. feed_line(ctx, boxer, line);
  791. break;
  792. case FZ_STEXT_BLOCK_VECTOR:
  793. {
  794. /* Allow a 1 point margin around vectors to avoid hairline
  795. * cracks between supposedly abutting things. */
  796. int VECTOR_MARGIN = 1;
  797. fz_rect r = fz_collate_small_vector_run(&block);
  798. r.x0 -= VECTOR_MARGIN;
  799. r.y0 -= VECTOR_MARGIN;
  800. r.x1 += VECTOR_MARGIN;
  801. r.y1 += VECTOR_MARGIN;
  802. boxer_feed(ctx, boxer, &r);
  803. }
  804. }
  805. }
  806. ret = analyse_sub(ctx, page, &page->first_block, &page->last_block, boxer, 0);
  807. }
  808. fz_always(ctx)
  809. boxer_destroy(ctx, boxer);
  810. fz_catch(ctx)
  811. fz_rethrow(ctx);
  812. #ifdef DEBUG_WRITE_AS_PS
  813. printf("showpage\n");
  814. #endif
  815. return ret;
  816. }