stext-device.c 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. // You should have received a copy of the GNU Affero General Public License
  15. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  16. //
  17. // Alternative licensing terms are available from the licensor.
  18. // For commercial licensing, see <https://www.artifex.com/> or contact
  19. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  20. // CA 94129, USA, for further information.
  21. #include "mupdf/fitz.h"
  22. #include "glyphbox.h"
  23. #include <float.h>
  24. #include <string.h>
  25. /* Simple layout structure */
  26. fz_layout_block *fz_new_layout(fz_context *ctx)
  27. {
  28. fz_pool *pool = fz_new_pool(ctx);
  29. fz_layout_block *block;
  30. fz_try(ctx)
  31. {
  32. block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block));
  33. block->pool = pool;
  34. block->head = NULL;
  35. block->tailp = &block->head;
  36. }
  37. fz_catch(ctx)
  38. {
  39. fz_drop_pool(ctx, pool);
  40. fz_rethrow(ctx);
  41. }
  42. return block;
  43. }
  44. void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
  45. {
  46. if (block)
  47. fz_drop_pool(ctx, block->pool);
  48. }
  49. void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p)
  50. {
  51. fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line));
  52. line->x = x;
  53. line->y = y;
  54. line->font_size = font_size;
  55. line->p = p;
  56. line->text = NULL;
  57. line->next = NULL;
  58. *block->tailp = line;
  59. block->tailp = &line->next;
  60. block->text_tailp = &line->text;
  61. }
  62. void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p)
  63. {
  64. fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char));
  65. ch->x = x;
  66. ch->advance = advance;
  67. ch->p = p;
  68. ch->next = NULL;
  69. *block->text_tailp = ch;
  70. block->text_tailp = &ch->next;
  71. }
  72. /* Extract text into blocks and lines. */
  /*
   * Distance thresholds used when deciding how a new character relates
   * to the previous one. Where they are used in fz_add_stext_char_imp,
   * the measured distances are divided by the text size
   * (fz_matrix_expansion of the text matrix) before being compared, so
   * these are fractions of the text size, not absolute lengths.
   */
  /* NOTE(review): no use of PARAGRAPH_DIST is visible in this part of the file. */
  #define PARAGRAPH_DIST 1.5f
  /* Gaps along the baseline smaller than this are treated as no gap at all. */
  #define SPACE_DIST 0.15f
  /* Upper limit on an in-line gap (forwards or backwards) before a new line is started. */
  #define SPACE_MAX_DIST 0.8f
  /* Upper limit on the offset from the previous baseline for text to stay on the same line. */
  #define BASE_MAX_DIST 0.8f
  /* A repeat of the previous character ending within this distance of the pen is dropped as fake bold. */
  #define FAKE_BOLD_MAX_DIST 0.1f
  78. /* We keep a stack of the different metatexts that apply at any
  79. * given point (normally none!). Whenever we get some content
  80. * with a metatext in force, we really want to update the bounds
  81. * for that metatext. But running along the whole list each time
  82. * would be painful. So we just update the bounds for dev->metatext
  83. * and rely on metatext_bounds() propagating it upwards 'just in
  84. * time' for us to use metatexts other than the latest one. This
  85. * also means we need to propagate bounds upwards when we pop
  86. * a metatext.
  87. *
  88. * Why do we need bounds at all? Well, suppose we get:
  89. * /Span <</ActualText (c) >> BDC /Im0 Do EMC
  90. * Then where on the page do we put 'c' ? By collecting the
  91. * bounds, we can place 'c' wherever the image was.
  92. */
  /* One entry in the stack of metatexts currently in force. */
  typedef struct metatext_t
  {
  /* Kind of metatext; find_actualtext() looks for FZ_METATEXT_ACTUALTEXT. */
  fz_metatext type;
  /* Text associated with this metatext. */
  char *text;
  /* Bounds collected for this metatext; metatext_bounds() unions the
   * bounds of newer entries into older ones on demand. */
  fz_rect bounds;
  /* Next older entry on the stack, or NULL at the bottom. */
  struct metatext_t *prev;
  } metatext_t;
  /* One entry in the device's list of 'rects' seen during processing
   * (see the rects array in fz_stext_device). */
  typedef struct
  {
  /* NOTE(review): presumably the two end points of the rect's centre
   * line, with 'thickness' its width across that line - confirm
   * against the code that fills this in. */
  fz_point from;
  fz_point to;
  float thickness;
  } rect_details;
  /* State for the structured text device. */
  typedef struct
  {
  /* Base device; first member. */
  fz_device super;
  /* The page that extracted content is added to. */
  fz_stext_page *page;
  int id;
  /* pen: position that the start of the next character is measured
   * against when deciding on spaces and line breaks. */
  fz_point pen, start;
  /* Used in place of pen for the RTL 'logical order' spacing test. */
  fz_point lag_pen;
  fz_matrix trm;
  int new_obj;
  /* Character code and bidi value of the most recently added character. */
  int lastchar;
  int lastbidi;
  /* Option flags; passed to add_char_to_line as dev_flags and tested
   * for FZ_STEXT_ACCURATE_BBOXES. */
  int flags;
  /* Colour stored on each character added (becomes ch->argb). */
  int color;
  /* Set when the previous glyph was recognised as a fake bold overprint,
   * so that following no-glyph characters can be skipped too. */
  int last_was_fake_bold;
  const fz_text *lasttext;
  /* Extraction options; opts.flags is tested for FZ_STEXT_COLLECT_STYLES. */
  fz_stext_options opts;
  /* Top of the metatext stack (newest entry), or NULL if none. */
  metatext_t *metatext;
  /* Store the last values we saw. We need this for flushing the actualtext. */
  struct
  {
  int valid;
  int clipped;
  fz_matrix trm;
  int wmode;
  int bidi_level;
  fz_font *font;
  int flags;
  } last;
  /* The list of 'rects' seen during processing (if we're collecting styles). */
  /* NOTE(review): rect_max/rect_len look like capacity and used count of
   * the rects array - confirm against the code that grows it. */
  int rect_max;
  int rect_len;
  rect_details *rects;
  } fz_stext_device;
  /*
   * Human readable help text listing the structured text output
   * options, one tab-indented option per line, ending with a blank line.
   */
  const char *fz_stext_options_usage =
  "Text output options:\n"
  "\tpreserve-images: keep images in output\n"
  "\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
  "\tpreserve-spans: do not merge spans on the same line\n"
  "\tpreserve-whitespace: do not convert all whitespace into space characters\n"
  "\tinhibit-spaces: don't add spaces between gaps in the text\n"
  "\tparagraph-break: break blocks at paragraph boundaries\n"
  "\tdehyphenate: attempt to join up hyphenated words\n"
  "\tignore-actualtext: do not apply ActualText replacements\n"
  "\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
  "\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
  "\taccurate-bboxes: calculate char bboxes from the outlines\n"
  "\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
  "\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
  "\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
  "\tclip: do not include text that is completely clipped\n"
  "\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
  "\tstructured: collect structure markup\n"
  "\tvectors: include vector bboxes in output\n"
  "\tsegment: attempt to segment the page\n"
  "\ttable-hunt: hunt for tables within a (segmented) page\n"
  "\n";
  162. /* Find the current actualtext, if any. Will abort if dev == NULL. */
  163. static metatext_t *
  164. find_actualtext(fz_stext_device *dev)
  165. {
  166. metatext_t *mt = dev->metatext;
  167. while (mt && mt->type != FZ_METATEXT_ACTUALTEXT)
  168. mt = mt->prev;
  169. return mt;
  170. }
  171. /* Find the bounds of the given metatext. Will abort if mt or
  172. * dev are NULL. */
  173. static fz_rect *
  174. metatext_bounds(metatext_t *mt, fz_stext_device *dev)
  175. {
  176. metatext_t *mt2 = dev->metatext;
  177. while (mt2 != mt)
  178. {
  179. mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds);
  180. mt2 = mt2->prev;
  181. }
  182. return &mt->bounds;
  183. }
  184. /* Find the bounds of the current actualtext, or NULL if there
  185. * isn't one. Will abort if dev is NULL. */
  186. static fz_rect *
  187. actualtext_bounds(fz_stext_device *dev)
  188. {
  189. metatext_t *mt = find_actualtext(dev);
  190. if (mt == NULL)
  191. return NULL;
  192. return metatext_bounds(mt, dev);
  193. }
  194. fz_stext_page *
  195. fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
  196. {
  197. fz_pool *pool = fz_new_pool(ctx);
  198. fz_stext_page *page = NULL;
  199. fz_try(ctx)
  200. {
  201. page = fz_pool_alloc(ctx, pool, sizeof(*page));
  202. page->pool = pool;
  203. page->mediabox = mediabox;
  204. page->first_block = NULL;
  205. page->last_block = NULL;
  206. }
  207. fz_catch(ctx)
  208. {
  209. fz_drop_pool(ctx, pool);
  210. fz_rethrow(ctx);
  211. }
  212. return page;
  213. }
  214. static void
  215. drop_run(fz_context *ctx, fz_stext_block *block)
  216. {
  217. fz_stext_line *line;
  218. fz_stext_char *ch;
  219. while (block)
  220. {
  221. switch (block->type)
  222. {
  223. case FZ_STEXT_BLOCK_IMAGE:
  224. fz_drop_image(ctx, block->u.i.image);
  225. break;
  226. case FZ_STEXT_BLOCK_TEXT:
  227. for (line = block->u.t.first_line; line; line = line->next)
  228. for (ch = line->first_char; ch; ch = ch->next)
  229. fz_drop_font(ctx, ch->font);
  230. break;
  231. case FZ_STEXT_BLOCK_STRUCT:
  232. drop_run(ctx, block->u.s.down->first_block);
  233. break;
  234. default:
  235. break;
  236. }
  237. block = block->next;
  238. }
  239. }
  240. void
  241. fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
  242. {
  243. if (page)
  244. {
  245. drop_run(ctx, page->first_block);
  246. fz_drop_pool(ctx, page->pool);
  247. }
  248. }
  249. /*
  250. * This adds a new block at the end of the page. This should not be used
  251. * to add 'struct' blocks to the page as those have to be added internally,
  252. * with more complicated pointer setup.
  253. */
  254. static fz_stext_block *
  255. add_block_to_page(fz_context *ctx, fz_stext_page *page)
  256. {
  257. fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
  258. block->bbox = fz_empty_rect; /* Fixes bug 703267. */
  259. block->prev = page->last_block;
  260. if (page->last_struct)
  261. {
  262. if (page->last_struct->last_block)
  263. {
  264. block->prev = page->last_struct->last_block;
  265. block->prev->next = block;
  266. page->last_struct->last_block = block;
  267. }
  268. else
  269. page->last_struct->last_block = page->last_struct->first_block = block;
  270. }
  271. else if (!page->last_block)
  272. {
  273. page->last_block = block;
  274. if (!page->first_block)
  275. page->first_block = block;
  276. }
  277. else
  278. {
  279. page->last_block->next = block;
  280. page->last_block = block;
  281. }
  282. return block;
  283. }
  284. static fz_stext_block *
  285. add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
  286. {
  287. fz_stext_block *block = add_block_to_page(ctx, page);
  288. block->type = FZ_STEXT_BLOCK_TEXT;
  289. return block;
  290. }
  291. static fz_stext_block *
  292. add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image)
  293. {
  294. fz_stext_block *block = add_block_to_page(ctx, page);
  295. block->type = FZ_STEXT_BLOCK_IMAGE;
  296. block->u.i.transform = ctm;
  297. block->u.i.image = fz_keep_image(ctx, image);
  298. block->bbox = fz_transform_rect(fz_unit_rect, ctm);
  299. return block;
  300. }
  301. static fz_stext_line *
  302. add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi)
  303. {
  304. fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
  305. line->prev = block->u.t.last_line;
  306. if (!block->u.t.first_line)
  307. block->u.t.first_line = block->u.t.last_line = line;
  308. else
  309. {
  310. block->u.t.last_line->next = line;
  311. block->u.t.last_line = line;
  312. }
  313. line->dir = *dir;
  314. line->wmode = wmode;
  315. return line;
  316. }
  317. #define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
  318. #define NON_ACCURATE_GLYPH (-1)
  319. static fz_stext_char *
  320. add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
  321. {
  322. fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
  323. fz_point a, d;
  324. if (!line->first_char)
  325. line->first_char = line->last_char = ch;
  326. else
  327. {
  328. line->last_char->next = ch;
  329. line->last_char = ch;
  330. }
  331. ch->c = c;
  332. ch->argb = color;
  333. ch->bidi = bidi;
  334. ch->origin = *p;
  335. ch->size = size;
  336. ch->font = fz_keep_font(ctx, font);
  337. ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0);
  338. if (font->flags.is_bold)
  339. ch->flags |= FZ_STEXT_BOLD;
  340. if (line->wmode == 0)
  341. {
  342. fz_rect bounds;
  343. int bounded = 0;
  344. a.x = 0;
  345. d.x = 0;
  346. if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
  347. {
  348. /* Added space, in accurate mode. */
  349. a.y = d.y = 0;
  350. }
  351. else if (glyph == NON_ACCURATE_GLYPH)
  352. {
  353. /* Non accurate mode. */
  354. a.y = fz_font_ascender(ctx, font);
  355. d.y = fz_font_descender(ctx, font);
  356. }
  357. else
  358. {
  359. /* Any glyph in accurate mode */
  360. bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
  361. bounded = 1;
  362. a.y = bounds.y1;
  363. d.y = bounds.y0;
  364. }
  365. if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
  366. {
  367. if (!bounded)
  368. bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
  369. if (a.x > bounds.x0)
  370. a.x = bounds.x0;
  371. if (d.y < bounds.x1)
  372. d.y = bounds.x1;
  373. }
  374. }
  375. else
  376. {
  377. a.x = 1;
  378. d.x = 0;
  379. a.y = 0;
  380. d.y = 0;
  381. }
  382. a = fz_transform_vector(a, trm);
  383. d = fz_transform_vector(d, trm);
  384. ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
  385. ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
  386. ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
  387. ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
  388. return ch;
  389. }
  390. static void
  391. remove_last_char(fz_context *ctx, fz_stext_line *line)
  392. {
  393. if (line && line->first_char)
  394. {
  395. fz_stext_char *prev = NULL;
  396. fz_stext_char *ch = line->first_char;
  397. while (ch->next)
  398. {
  399. prev = ch;
  400. ch = ch->next;
  401. }
  402. if (prev)
  403. {
  404. /* The characters are pool allocated, so we don't actually leak the removed node. */
  405. /* We do need to drop the char's font reference though. */
  406. fz_drop_font(ctx, prev->next->font);
  407. line->last_char = prev;
  408. line->last_char->next = NULL;
  409. }
  410. }
  411. }
  412. static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail)
  413. {
  414. fz_stext_char *prev, *next;
  415. prev = tail;
  416. while (curr != tail)
  417. {
  418. next = curr->next;
  419. curr->next = prev;
  420. prev = curr;
  421. curr = next;
  422. }
  423. return prev;
  424. }
  425. static void reverse_bidi_line(fz_stext_line *line)
  426. {
  427. fz_stext_char *a, *b, **prev;
  428. prev = &line->first_char;
  429. for (a = line->first_char; a; a = a->next)
  430. {
  431. if (a->bidi)
  432. {
  433. b = a;
  434. while (b->next && b->next->bidi)
  435. b = b->next;
  436. if (a != b)
  437. *prev = reverse_bidi_span(a, b->next);
  438. }
  439. prev = &a->next;
  440. line->last_char = a;
  441. }
  442. }
  /* Return 1 if c is one of the recognised hyphen characters, else 0. */
  static int is_hyphen(int c)
  {
      switch (c)
      {
      case '-':    /* hyphen-minus */
      case 0xAD:   /* soft hyphen */
      case 0x2010: /* hyphen */
      case 0x2011: /* non-breaking hyphen */
          return 1;
      default:
          return 0;
      }
  }
  448. static float
  449. vec_dot(const fz_point *a, const fz_point *b)
  450. {
  451. return a->x * b->x + a->y * b->y;
  452. }
  /*
   * Decide whether a space may be inserted after lastchar.
   *
   * Never after a space. Otherwise allowed for code points below 0x700
   * (basic latin, greek, cyrillic, hebrew, arabic) and for
   * 0x2000..0x20CF (general punctuation, superscripts and subscripts,
   * and currency symbols).
   */
  static int may_add_space(int lastchar)
  {
      if (lastchar == ' ')
          return 0;
      if (lastchar < 0x700)
          return 1;
      return lastchar >= 0x2000 && lastchar <= 0x20CF;
  }
  /* Two coordinates count as 'close' when they differ by less than
   * size / FAKEBOLD_THRESHOLD_RECIP. */
  #define FAKEBOLD_THRESHOLD_RECIP 10

  /* Return non-zero if a and b lie within a tenth of size of each other. */
  static int
  close(float a, float b, float size)
  {
      float gap = a - b;

      if (gap < 0)
          gap = -gap;

      return FAKEBOLD_THRESHOLD_RECIP * gap < size;
  }
  471. static int
  472. font_equiv(fz_context *ctx, fz_font *f, fz_font *g)
  473. {
  474. unsigned char fdigest[16];
  475. unsigned char gdigest[16];
  476. if (f == g)
  477. return 1;
  478. if (strcmp(f->name, g->name) != 0)
  479. return 0;
  480. fz_font_digest(ctx, f, fdigest);
  481. fz_font_digest(ctx, g, gdigest);
  482. return (memcmp(fdigest, gdigest, 16) == 0);
  483. }
  484. static int
  485. check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
  486. {
  487. fz_stext_line *line;
  488. fz_stext_char *ch;
  489. for (; block != NULL; block = block->next)
  490. {
  491. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  492. {
  493. if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
  494. return 1;
  495. }
  496. else if (block->type == FZ_STEXT_BLOCK_TEXT)
  497. {
  498. for (line = block->u.t.first_line; line != NULL; line = line->next)
  499. {
  500. fz_stext_char *pr = NULL;
  501. for (ch = line->first_char; ch != NULL; ch = ch->next)
  502. {
  503. /* Not perfect, but it'll do! */
  504. if (ch->c == c && close(ch->origin.x, p.x, size) && close(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font))
  505. {
  506. /* If we were filled before, and we are stroking now... */
  507. if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED &&
  508. (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED)
  509. {
  510. /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */
  511. ch->flags |= flags;
  512. return 1;
  513. }
  514. /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these
  515. * as boldening if either the char before, or the char after were also boldened. */
  516. ch->flags |= flags;
  517. if (c == ' ')
  518. {
  519. if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) ||
  520. (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
  521. {
  522. /* OK, we can be bold. */
  523. ch->flags |= FZ_STEXT_BOLD;
  524. return 1;
  525. }
  526. /* Ignore this and keep going */
  527. }
  528. else
  529. {
  530. ch->flags |= FZ_STEXT_BOLD;
  531. return 1;
  532. }
  533. }
  534. pr = ch;
  535. }
  536. }
  537. }
  538. }
  539. return 0;
  540. }
  541. static void
  542. fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
  543. {
  544. fz_stext_page *page = dev->page;
  545. fz_stext_block *cur_block;
  546. fz_stext_line *cur_line;
  547. int new_para = 0;
  548. int new_line = 1;
  549. int add_space = 0;
  550. fz_point dir, ndir, p, q;
  551. float size;
  552. fz_point delta;
  553. float spacing = 0;
  554. float base_offset = 0;
  555. float dist;
  556. /* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
  557. bidi = bidi & 1;
  558. /* dir = direction vector for motion. ndir = normalised(dir) */
  559. if (wmode == 0)
  560. {
  561. dir.x = 1;
  562. dir.y = 0;
  563. }
  564. else
  565. {
  566. dir.x = 0;
  567. dir.y = -1;
  568. }
  569. dir = fz_transform_vector(dir, trm);
  570. ndir = fz_normalize_vector(dir);
  571. size = fz_matrix_expansion(trm);
  572. /* We need to identify where glyphs 'start' (p) and 'stop' (q).
  573. * Each glyph holds its 'start' position, and the next glyph in the
  574. * span (or span->max if there is no next glyph) holds its 'end'
  575. * position.
  576. *
  577. * For both horizontal and vertical motion, trm->{e,f} gives the
  578. * origin (usually the bottom left) of the glyph.
  579. *
  580. * In horizontal mode:
  581. * + p is bottom left.
  582. * + q is the bottom right
  583. * In vertical mode:
  584. * + p is top left (where it advanced from)
  585. * + q is bottom left
  586. */
  587. if (wmode == 0)
  588. {
  589. p.x = trm.e;
  590. p.y = trm.f;
  591. q.x = trm.e + adv * dir.x;
  592. q.y = trm.f + adv * dir.y;
  593. }
  594. else
  595. {
  596. p.x = trm.e - adv * dir.x;
  597. p.y = trm.f - adv * dir.y;
  598. q.x = trm.e;
  599. q.y = trm.f;
  600. }
  601. if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
  602. {
  603. if (glyph == -1)
  604. {
  605. if (dev->last_was_fake_bold)
  606. goto move_pen_and_exit;
  607. }
  608. else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
  609. {
  610. dev->last_was_fake_bold = 1;
  611. goto move_pen_and_exit;
  612. }
  613. dev->last_was_fake_bold = 0;
  614. }
  615. /* Find current position to enter new text. */
  616. cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
  617. if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
  618. cur_block = NULL;
  619. cur_line = cur_block ? cur_block->u.t.last_line : NULL;
  620. if (cur_line && glyph < 0)
  621. {
  622. /* Don't advance pen or break lines for no-glyph characters in a cluster */
  623. add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags);
  624. dev->lastbidi = bidi;
  625. dev->lastchar = c;
  626. return;
  627. }
  628. if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
  629. {
  630. /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
  631. * then we can't append to the current block/line. */
  632. new_para = 1;
  633. new_line = 1;
  634. }
  635. else
  636. {
  637. /* Detect fake bold where text is printed twice in the same place. */
  638. /* Largely supplanted by the check_for_fake_bold mechanism above,
  639. * but we leave this in for backward compatibility as it's cheap,
  640. * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
  641. dist = hypotf(q.x - dev->pen.x, q.y - dev->pen.y) / size;
  642. if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar)
  643. return;
  644. /* Calculate how far we've moved since the last character. */
  645. delta.x = p.x - dev->pen.x;
  646. delta.y = p.y - dev->pen.y;
  647. /* The transform has not changed, so we know we're in the same
  648. * direction. Calculate 2 distances; how far off the previous
  649. * baseline we are, together with how far along the baseline
  650. * we are from the expected position. */
  651. spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
  652. base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;
  653. /* Only a small amount off the baseline - we'll take this */
  654. if (fabsf(base_offset) < BASE_MAX_DIST)
  655. {
  656. /* If mixed LTR and RTL content */
  657. if ((bidi & 1) != (dev->lastbidi & 1))
  658. {
  659. /* Ignore jumps within line when switching between LTR and RTL text. */
  660. new_line = 0;
  661. }
  662. /* RTL */
  663. else if (bidi & 1)
  664. {
  665. fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
  666. float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;
  667. /* If the pen is where we would have been if we
  668. * had advanced backwards from the previous
  669. * character by this character's advance, we
  670. * are probably seeing characters emitted in
  671. * logical order.
  672. */
  673. if (fabsf(logical_spacing) < SPACE_DIST)
  674. {
  675. new_line = 0;
  676. }
  677. /* However, if the pen has advanced to where we would expect it
  678. * in an LTR context, we're seeing them emitted in visual order
  679. * and should flag them for reordering!
  680. */
  681. else if (fabsf(spacing) < SPACE_DIST)
  682. {
  683. bidi = 3; /* mark line as visual */
  684. new_line = 0;
  685. }
  686. /* And any other small jump could be a missing space. */
  687. else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
  688. {
  689. if (wmode == 0 && may_add_space(dev->lastchar))
  690. add_space = 1;
  691. new_line = 0;
  692. }
  693. else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
  694. {
  695. /* Motion is in line, but negative. We've probably got overlapping
  696. * chars here. Live with it. */
  697. new_line = 0;
  698. }
  699. else if (spacing > 0 && spacing < SPACE_MAX_DIST)
  700. {
  701. bidi = 3; /* mark line as visual */
  702. if (wmode == 0 && may_add_space(dev->lastchar))
  703. add_space = 1;
  704. new_line = 0;
  705. }
  706. else
  707. {
  708. /* Motion is large and unexpected (probably a new table column). */
  709. new_line = 1;
  710. }
  711. }
  712. /* LTR or neutral character */
  713. else
  714. {
  715. if (fabsf(spacing) < SPACE_DIST)
  716. {
  717. /* Motion is in line and small enough to ignore. */
  718. new_line = 0;
  719. }
  720. else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
  721. {
  722. /* Motion is in line, but negative. We've probably got overlapping
  723. * chars here. Live with it. */
  724. new_line = 0;
  725. }
  726. else if (spacing > 0 && spacing < SPACE_MAX_DIST)
  727. {
  728. /* Motion is forward in line and large enough to warrant us adding a space. */
  729. if (wmode == 0 && may_add_space(dev->lastchar))
  730. add_space = 1;
  731. new_line = 0;
  732. }
  733. else
  734. {
  735. /* Motion is large and unexpected (probably a new table column). */
  736. new_line = 1;
  737. }
  738. }
  739. }
  740. /* Enough for a new line, but not enough for a new paragraph */
  741. else if (fabsf(base_offset) <= PARAGRAPH_DIST)
  742. {
  743. /* Check indent to spot text-indent style paragraphs */
  744. if (wmode == 0 && cur_line && dev->new_obj)
  745. if ((p.x - dev->start.x) > 0.5f)
  746. new_para = 1;
  747. new_line = 1;
  748. }
  749. /* Way off the baseline - open a new paragraph */
  750. else
  751. {
  752. new_para = 1;
  753. new_line = 1;
  754. }
  755. }
  756. /* Start a new block (but only at the beginning of a text object) */
  757. if (new_para || !cur_block)
  758. {
  759. cur_block = add_text_block_to_page(ctx, page);
  760. cur_line = cur_block->u.t.last_line;
  761. }
  762. if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && is_hyphen(dev->lastchar))
  763. {
  764. remove_last_char(ctx, cur_line);
  765. new_line = 0;
  766. }
  767. /* Start a new line */
  768. if (new_line || !cur_line || force_new_line)
  769. {
  770. cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
  771. dev->start = p;
  772. }
  773. /* Add synthetic space */
  774. if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
  775. add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, 1, flags, dev->flags);
  776. add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags);
  777. move_pen_and_exit:
  778. dev->lastchar = c;
  779. dev->lastbidi = bidi;
  780. dev->lag_pen = p;
  781. dev->pen = q;
  782. dev->new_obj = 0;
  783. dev->trm = trm;
  784. }
  785. static void
  786. fz_add_stext_char(fz_context *ctx,
  787. fz_stext_device *dev,
  788. fz_font *font,
  789. int c,
  790. int glyph,
  791. fz_matrix trm,
  792. float adv,
  793. int wmode,
  794. int bidi,
  795. int force_new_line,
  796. int flags)
  797. {
  798. /* ignore when one unicode character maps to multiple glyphs */
  799. if (c == -1)
  800. return;
  801. if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS)
  802. fz_calculate_font_ascender_descender(ctx, font);
  803. if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
  804. {
  805. switch (c)
  806. {
  807. case 0xFB00: /* ff */
  808. fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
  809. fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
  810. return;
  811. case 0xFB01: /* fi */
  812. fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
  813. fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
  814. return;
  815. case 0xFB02: /* fl */
  816. fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
  817. fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
  818. return;
  819. case 0xFB03: /* ffi */
  820. fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
  821. fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
  822. fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
  823. return;
  824. case 0xFB04: /* ffl */
  825. fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
  826. fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
  827. fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
  828. return;
  829. case 0xFB05: /* long st */
  830. case 0xFB06: /* st */
  831. fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags);
  832. fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags);
  833. return;
  834. }
  835. }
  836. if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
  837. {
  838. switch (c)
  839. {
  840. case 0x0009: /* tab */
  841. case 0x0020: /* space */
  842. case 0x00A0: /* no-break space */
  843. case 0x1680: /* ogham space mark */
  844. case 0x180E: /* mongolian vowel separator */
  845. case 0x2000: /* en quad */
  846. case 0x2001: /* em quad */
  847. case 0x2002: /* en space */
  848. case 0x2003: /* em space */
  849. case 0x2004: /* three-per-em space */
  850. case 0x2005: /* four-per-em space */
  851. case 0x2006: /* six-per-em space */
  852. case 0x2007: /* figure space */
  853. case 0x2008: /* punctuation space */
  854. case 0x2009: /* thin space */
  855. case 0x200A: /* hair space */
  856. case 0x202F: /* narrow no-break space */
  857. case 0x205F: /* medium mathematical space */
  858. case 0x3000: /* ideographic space */
  859. c = ' ';
  860. }
  861. }
  862. fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags);
  863. }
  864. static fz_rect
  865. current_clip(fz_context *ctx, fz_stext_device *dev)
  866. {
  867. fz_rect r = fz_infinite_rect;
  868. if (dev->flags & FZ_STEXT_CLIP)
  869. {
  870. r = fz_device_current_scissor(ctx, &dev->super);
  871. r = fz_intersect_rect(r, dev->page->mediabox);
  872. }
  873. if (dev->flags & FZ_STEXT_CLIP_RECT)
  874. r = fz_intersect_rect(r, dev->opts.clip);
  875. return r;
  876. }
  877. static void
  878. do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags)
  879. {
  880. fz_font *font = span->font;
  881. fz_matrix tm = span->trm;
  882. float adv;
  883. int unicode;
  884. int i;
  885. for (i = start; i < end; i++)
  886. {
  887. /* Calculate new pen location and delta */
  888. tm.e = span->items[i].x;
  889. tm.f = span->items[i].y;
  890. dev->last.trm = fz_concat(tm, ctm);
  891. dev->last.bidi_level = span->bidi_level;
  892. dev->last.wmode = span->wmode;
  893. if (font != dev->last.font)
  894. {
  895. fz_drop_font(ctx, dev->last.font);
  896. dev->last.font = fz_keep_font(ctx, font);
  897. }
  898. dev->last.valid = 1;
  899. dev->last.flags = flags;
  900. if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
  901. {
  902. fz_rect r = current_clip(ctx, dev);
  903. if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
  904. {
  905. dev->last.clipped = 1;
  906. continue;
  907. }
  908. }
  909. dev->last.clipped = 0;
  910. /* Calculate bounding box and new pen position based on font metrics */
  911. if (span->items[i].gid >= 0)
  912. adv = span->items[i].adv;
  913. else
  914. adv = 0;
  915. unicode = span->items[i].ucs;
  916. if (unicode == FZ_REPLACEMENT_CHARACTER)
  917. {
  918. if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE)
  919. {
  920. unicode = span->items[i].cid;
  921. flags |= FZ_STEXT_UNICODE_IS_CID;
  922. }
  923. else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE)
  924. {
  925. unicode = span->items[i].gid;
  926. flags |= FZ_STEXT_UNICODE_IS_GID;
  927. }
  928. }
  929. /* Send the chars we have through. */
  930. fz_add_stext_char(ctx, dev, font,
  931. unicode,
  932. span->items[i].gid,
  933. dev->last.trm,
  934. adv,
  935. dev->last.wmode,
  936. dev->last.bidi_level,
  937. (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
  938. flags);
  939. }
  940. }
  941. static int
  942. rune_index(const char *utf8, size_t idx)
  943. {
  944. int rune;
  945. do
  946. {
  947. int len = fz_chartorune(&rune, utf8);
  948. if (rune == 0)
  949. return -1;
  950. utf8 += len;
  951. }
  952. while (idx--);
  953. return rune;
  954. }
  955. static void
  956. flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i)
  957. {
  958. if (*actualtext == 0)
  959. return;
  960. while (1)
  961. {
  962. int rune;
  963. actualtext += fz_chartorune(&rune, actualtext);
  964. if (rune == 0)
  965. break;
  966. if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
  967. if (dev->last.clipped)
  968. continue;
  969. fz_add_stext_char(ctx, dev, dev->last.font,
  970. rune,
  971. -1,
  972. dev->last.trm,
  973. 0,
  974. dev->last.wmode,
  975. dev->last.bidi_level,
  976. (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
  977. dev->last.flags);
  978. i++;
  979. }
  980. }
  981. static void
  982. do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
  983. {
  984. /* We are within an actualtext block. This means we can't just add the chars
  985. * as they are. We need to add the chars as they are meant to be. Sadly the
  986. * actualtext mechanism doesn't help us at all with positioning. */
  987. fz_font *font = span->font;
  988. fz_matrix tm = span->trm;
  989. float adv;
  990. int start, i, end;
  991. char *actualtext = mt->text;
  992. size_t z = fz_utflen(actualtext);
  993. /* If actualtext is empty, nothing to do! */
  994. if (z == 0)
  995. return;
  996. /* Now, we HOPE that the creator of a PDF will minimise the actual text
  997. * differences, so that we'll get:
  998. * "Politicians <Actualtext="lie">fib</ActualText>, always."
  999. * rather than:
  1000. * "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText>
  1001. * but experience with PDF files tells us that this won't always be the case.
  1002. *
  1003. * We try to minimise the actualtext section here, just in case.
  1004. */
  1005. /* Spot a matching prefix and send it. */
  1006. for (start = 0; start < span->len; start++)
  1007. {
  1008. int rune;
  1009. int len = fz_chartorune(&rune, actualtext);
  1010. if (span->items[start].gid != rune || rune == 0)
  1011. break;
  1012. actualtext += len; z--;
  1013. }
  1014. if (start != 0)
  1015. do_extract(ctx, dev, span, ctm, 0, start, flags);
  1016. if (start == span->len)
  1017. {
  1018. /* The prefix has consumed all this object. Just shorten the actualtext and we'll
  1019. * catch the rest next time. */
  1020. z = strlen(actualtext)+1;
  1021. memmove(mt->text, actualtext, z);
  1022. return;
  1023. }
  1024. /* We haven't consumed the whole string, so there must be runes left.
  1025. * Shut coverity up. */
  1026. assert(z != 0);
  1027. /* Spot a matching postfix. Can't send it til the end. */
  1028. for (end = span->len; end > start; end--)
  1029. {
  1030. /* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
  1031. int rune = rune_index(actualtext, z-1);
  1032. if (span->items[end-1].gid != rune)
  1033. break;
  1034. z--;
  1035. }
  1036. /* So we can send end -> span->len at the end. */
  1037. /* So we have at least SOME chars that don't match. */
  1038. /* Now, do the difficult bit in the middle.*/
  1039. /* items[start..end] have to be sent with actualtext[start..z] */
  1040. for (i = start; i < end; i++)
  1041. {
  1042. fz_text_item *item = &span->items[i];
  1043. int rune = -1;
  1044. if ((size_t)i < z)
  1045. actualtext += fz_chartorune(&rune, actualtext);
  1046. /* Calculate new pen location and delta */
  1047. tm.e = item->x;
  1048. tm.f = item->y;
  1049. dev->last.trm = fz_concat(tm, ctm);
  1050. dev->last.bidi_level = span->bidi_level;
  1051. dev->last.wmode = span->wmode;
  1052. if (font != dev->last.font)
  1053. {
  1054. fz_drop_font(ctx, dev->last.font);
  1055. dev->last.font = fz_keep_font(ctx, font);
  1056. }
  1057. dev->last.valid = 1;
  1058. if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
  1059. {
  1060. fz_rect r = current_clip(ctx, dev);
  1061. if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
  1062. {
  1063. dev->last.clipped = 1;
  1064. continue;
  1065. }
  1066. }
  1067. dev->last.clipped = 0;
  1068. /* Calculate bounding box and new pen position based on font metrics */
  1069. if (item->gid >= 0)
  1070. adv = item->adv;
  1071. else
  1072. adv = 0;
  1073. fz_add_stext_char(ctx, dev, font,
  1074. rune,
  1075. span->items[i].gid,
  1076. dev->last.trm,
  1077. adv,
  1078. dev->last.wmode,
  1079. dev->last.bidi_level,
  1080. (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
  1081. flags);
  1082. }
  1083. /* If we haven't spotted a postfix by this point, then don't force ourselves to output
  1084. * any more of the actualtext at this point. We might get a new text object that matches
  1085. * more of it. */
  1086. if (end == span->len)
  1087. {
  1088. /* Shorten actualtext and exit. */
  1089. z = strlen(actualtext)+1;
  1090. memmove(mt->text, actualtext, z);
  1091. return;
  1092. }
  1093. /* We found a matching postfix. It seems likely that this is going to be the only
  1094. * text object we get, so send any remaining actualtext now. */
  1095. flush_actualtext(ctx, dev, actualtext, i);
  1096. /* Send the postfix */
  1097. if (end != span->len)
  1098. do_extract(ctx, dev, span, ctm, end, span->len, flags);
  1099. mt->text[0] = 0;
  1100. }
  1101. static void
  1102. fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags)
  1103. {
  1104. fz_stext_device *tdev = (fz_stext_device*)dev;
  1105. metatext_t *mt = NULL;
  1106. if (span->len == 0)
  1107. return;
  1108. /* Are we in an actualtext? */
  1109. if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT))
  1110. mt = find_actualtext(dev);
  1111. if (mt)
  1112. do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags);
  1113. else
  1114. do_extract(ctx, dev, span, ctm, 0, span->len, flags);
  1115. }
  1116. static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha)
  1117. {
  1118. float rgb[3];
  1119. fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);
  1120. return
  1121. (fz_clampi(alpha * 255 + 0.5f, 0, 255) << 24) |
  1122. (fz_clampi(rgb[0] * 255 + 0.5f, 0, 255) << 16) |
  1123. (fz_clampi(rgb[1] * 255 + 0.5f, 0, 255) << 8) |
  1124. (fz_clampi(rgb[2] * 255 + 0.5f, 0, 255));
  1125. }
  1126. static void
  1127. fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
  1128. fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
  1129. {
  1130. fz_stext_device *tdev = (fz_stext_device*)dev;
  1131. fz_text_span *span;
  1132. if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
  1133. return;
  1134. tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
  1135. tdev->new_obj = 1;
  1136. for (span = text->head; span; span = span->next)
  1137. fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED);
  1138. fz_drop_text(ctx, tdev->lasttext);
  1139. tdev->lasttext = fz_keep_text(ctx, text);
  1140. }
  1141. static void
  1142. fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
  1143. fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
  1144. {
  1145. fz_stext_device *tdev = (fz_stext_device*)dev;
  1146. fz_text_span *span;
  1147. if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
  1148. return;
  1149. tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
  1150. tdev->new_obj = 1;
  1151. for (span = text->head; span; span = span->next)
  1152. fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED);
  1153. fz_drop_text(ctx, tdev->lasttext);
  1154. tdev->lasttext = fz_keep_text(ctx, text);
  1155. }
  1156. static void
  1157. fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
  1158. {
  1159. fz_stext_device *tdev = (fz_stext_device*)dev;
  1160. fz_text_span *span;
  1161. if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
  1162. return;
  1163. tdev->color = 0;
  1164. tdev->new_obj = 1;
  1165. for (span = text->head; span; span = span->next)
  1166. fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED);
  1167. fz_drop_text(ctx, tdev->lasttext);
  1168. tdev->lasttext = fz_keep_text(ctx, text);
  1169. }
  1170. static void
  1171. fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
  1172. {
  1173. fz_stext_device *tdev = (fz_stext_device*)dev;
  1174. fz_text_span *span;
  1175. if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
  1176. return;
  1177. tdev->color = 0;
  1178. tdev->new_obj = 1;
  1179. for (span = text->head; span; span = span->next)
  1180. fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED);
  1181. fz_drop_text(ctx, tdev->lasttext);
  1182. tdev->lasttext = fz_keep_text(ctx, text);
  1183. }
  1184. static void
  1185. fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
  1186. {
  1187. fz_stext_device *tdev = (fz_stext_device*)dev;
  1188. fz_text_span *span;
  1189. if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
  1190. return;
  1191. tdev->color = 0;
  1192. tdev->new_obj = 1;
  1193. for (span = text->head; span; span = span->next)
  1194. fz_stext_extract(ctx, tdev, span, ctm, 0);
  1195. fz_drop_text(ctx, tdev->lasttext);
  1196. tdev->lasttext = fz_keep_text(ctx, text);
  1197. }
  1198. static void
  1199. fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text)
  1200. {
  1201. fz_stext_device *tdev = (fz_stext_device*)dev;
  1202. metatext_t *mt = fz_malloc_struct(ctx, metatext_t);
  1203. mt->prev = tdev->metatext;
  1204. tdev->metatext = mt;
  1205. mt->type = meta;
  1206. mt->text = text ? fz_strdup(ctx, text) : NULL;
  1207. mt->bounds = fz_empty_rect;
  1208. }
  1209. static void
  1210. pop_metatext(fz_context *ctx, fz_stext_device *dev)
  1211. {
  1212. metatext_t *prev;
  1213. fz_rect bounds;
  1214. if (!dev->metatext)
  1215. return;
  1216. prev = dev->metatext->prev;
  1217. bounds = dev->metatext->bounds;
  1218. fz_free(ctx, dev->metatext->text);
  1219. fz_free(ctx, dev->metatext);
  1220. dev->metatext = prev;
  1221. if (prev)
  1222. prev->bounds = fz_union_rect(prev->bounds, bounds);
  1223. }
  1224. static void
  1225. fz_stext_end_metatext(fz_context *ctx, fz_device *dev)
  1226. {
  1227. fz_stext_device *tdev = (fz_stext_device*)dev;
  1228. fz_font *myfont = NULL;
  1229. if (!tdev->metatext)
  1230. return; /* Mismatched pop. Live with it. */
  1231. if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT)
  1232. {
  1233. /* We only deal with ActualText here. Just pop anything else off,
  1234. * and we're done. */
  1235. pop_metatext(ctx, tdev);
  1236. return;
  1237. }
  1238. /* If we have a 'last' text position, send the content after that. */
  1239. if (tdev->last.valid)
  1240. {
  1241. flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
  1242. pop_metatext(ctx, tdev);
  1243. return;
  1244. }
  1245. /* If we have collected a rectangle for content that encloses the actual text,
  1246. * send the content there. */
  1247. if (!fz_is_empty_rect(tdev->metatext->bounds))
  1248. {
  1249. tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0;
  1250. tdev->last.trm.b = 0;
  1251. tdev->last.trm.c = 0;
  1252. tdev->last.trm.d = tdev->metatext->bounds.y1 - tdev->metatext->bounds.y0;
  1253. tdev->last.trm.e = tdev->metatext->bounds.x0;
  1254. tdev->last.trm.f = tdev->metatext->bounds.y0;
  1255. }
  1256. else
  1257. fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned.");
  1258. fz_var(myfont);
  1259. fz_try(ctx)
  1260. {
  1261. if (tdev->last.font == NULL)
  1262. {
  1263. myfont = fz_new_base14_font(ctx, "Helvetica");
  1264. tdev->last.font = myfont;
  1265. }
  1266. flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
  1267. pop_metatext(ctx, tdev);
  1268. }
  1269. fz_always(ctx)
  1270. {
  1271. if (myfont)
  1272. {
  1273. tdev->last.font = NULL;
  1274. fz_drop_font(ctx, myfont);
  1275. }
  1276. }
  1277. fz_catch(ctx)
  1278. fz_rethrow(ctx);
  1279. }
  1280. /* Images and shadings */
  1281. static void
  1282. fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
  1283. {
  1284. fz_stext_device *tdev = (fz_stext_device*)dev;
  1285. fz_rect *bounds = actualtext_bounds(tdev);
  1286. /* If there is an actualtext in force, update its bounds. */
  1287. if (bounds)
  1288. {
  1289. static const fz_rect unit = { 0, 0, 1, 1 };
  1290. *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm));
  1291. }
  1292. /* Unless we are being told to preserve images, nothing to do here. */
  1293. if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
  1294. return;
  1295. /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
  1296. if (alpha >= 0.5f)
  1297. add_image_block_to_page(ctx, tdev->page, ctm, img);
  1298. }
  1299. static void
  1300. fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
  1301. fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
  1302. {
  1303. fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
  1304. }
  1305. static fz_image *
  1306. fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
  1307. {
  1308. fz_matrix ctm = *in_out_ctm;
  1309. fz_pixmap *pix;
  1310. fz_image *img = NULL;
  1311. fz_rect bounds;
  1312. fz_irect bbox;
  1313. bounds = fz_bound_shade(ctx, shade, ctm);
  1314. bounds = fz_intersect_rect(bounds, scissor);
  1315. bbox = fz_irect_from_rect(bounds);
  1316. pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
  1317. fz_try(ctx)
  1318. {
  1319. if (shade->use_background)
  1320. fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
  1321. else
  1322. fz_clear_pixmap(ctx, pix);
  1323. fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL);
  1324. img = fz_new_image_from_pixmap(ctx, pix, NULL);
  1325. }
  1326. fz_always(ctx)
  1327. fz_drop_pixmap(ctx, pix);
  1328. fz_catch(ctx)
  1329. fz_rethrow(ctx);
  1330. in_out_ctm->a = pix->w;
  1331. in_out_ctm->b = 0;
  1332. in_out_ctm->c = 0;
  1333. in_out_ctm->d = pix->h;
  1334. in_out_ctm->e = pix->x;
  1335. in_out_ctm->f = pix->y;
  1336. return img;
  1337. }
  1338. static void
  1339. fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
  1340. {
  1341. fz_stext_device *tdev = (fz_stext_device*)dev;
  1342. fz_rect *bounds = actualtext_bounds(tdev);
  1343. fz_matrix local_ctm;
  1344. fz_rect scissor;
  1345. fz_image *image;
  1346. /* If we aren't keeping images, but we are in a bound, update the bounds
  1347. * without generating the entire image. */
  1348. if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && bounds)
  1349. {
  1350. *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm));
  1351. return;
  1352. }
  1353. /* Unless we are preserving image, nothing to do here. */
  1354. if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
  1355. return;
  1356. local_ctm = ctm;
  1357. scissor = fz_device_current_scissor(ctx, dev);
  1358. if (dev->flags & FZ_STEXT_CLIP_RECT)
  1359. scissor = fz_intersect_rect(scissor, tdev->opts.clip);
  1360. scissor = fz_intersect_rect(scissor, tdev->page->mediabox);
  1361. image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
  1362. fz_try(ctx)
  1363. fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
  1364. fz_always(ctx)
  1365. fz_drop_image(ctx, image);
  1366. fz_catch(ctx)
  1367. fz_rethrow(ctx);
  1368. }
  1369. static void
  1370. fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block)
  1371. {
  1372. fz_stext_line *line;
  1373. fz_stext_char *ch;
  1374. for ( ; block != NULL; block = block->next)
  1375. {
  1376. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  1377. if (block->u.s.down)
  1378. fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block);
  1379. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1380. continue;
  1381. for (line = block->u.t.first_line; line; line = line->next)
  1382. {
  1383. int reorder = 0;
  1384. for (ch = line->first_char; ch; ch = ch->next)
  1385. {
  1386. fz_rect ch_box = fz_rect_from_quad(ch->quad);
  1387. if (ch == line->first_char)
  1388. line->bbox = ch_box;
  1389. else
  1390. line->bbox = fz_union_rect(line->bbox, ch_box);
  1391. if (ch->bidi == 3)
  1392. reorder = 1;
  1393. }
  1394. block->bbox = fz_union_rect(block->bbox, line->bbox);
  1395. if (reorder)
  1396. reverse_bidi_line(line);
  1397. }
  1398. }
  1399. }
  1400. static void
  1401. advance_x(fz_point *a, fz_point b, float d)
  1402. {
  1403. a->y += (b.y - a->y) * d / (b.x - a->x);
  1404. a->x += d;
  1405. }
  1406. static void
  1407. advance_y(fz_point *a, fz_point b, float d)
  1408. {
  1409. a->x += (b.x - a->x) * d / (b.y - a->y);
  1410. a->y += d;
  1411. }
  1412. static int
  1413. line_crosses_rect(fz_point a, fz_point b, fz_rect r)
  1414. {
  1415. /* Cope with trivial exclusions */
  1416. if (a.x < r.x0 && b.x < r.x0)
  1417. return 0;
  1418. if (a.x > r.x1 && b.x > r.x1)
  1419. return 0;
  1420. if (a.y < r.y0 && b.y < r.y0)
  1421. return 0;
  1422. if (a.y > r.y1 && b.y > r.y1)
  1423. return 0;
  1424. if (a.x < r.x0)
  1425. advance_x(&a, b, r.x0 - a.x);
  1426. if (a.x > r.x1)
  1427. advance_x(&a, b, r.x1 - a.x);
  1428. if (a.y < r.y0)
  1429. advance_y(&a, b, r.y0 - a.y);
  1430. if (a.y > r.y1)
  1431. advance_y(&a, b, r.y1 - a.y);
  1432. return fz_is_point_inside_rect(a, r);
  1433. }
  1434. static float
  1435. calculate_ascent(fz_point p, fz_point origin, fz_point dir)
  1436. {
  1437. return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x);
  1438. }
  1439. /* Create us a rect from the given quad, but extend it downwards
  1440. * to allow for underlines that pass under the glyphs. */
  1441. static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size)
  1442. {
  1443. /* Consider the two rects from A and g respectively.
  1444. *
  1445. * ul +------+ ur or
  1446. * | /\ | ul +------+ ur
  1447. * | /__\ | | /''\ |
  1448. * |/ \| |( ||
  1449. * ll +------+ lr | ''''||
  1450. * | ''' | <-expected underline level
  1451. * ll +------+ lr
  1452. *
  1453. * So an underline won't cross A's rect, but will cross g's.
  1454. * We want to make a rect that includes a suitable amount of
  1455. * space underneath. The information we have available to us
  1456. * is summed up here:
  1457. *
  1458. * ul +---------+ ur
  1459. * | |
  1460. * | origin |
  1461. * |+----------> dir
  1462. * | |
  1463. * ll +---------+ lr
  1464. *
  1465. * Consider the distance from ul to the line that passes through
  1466. * the origin with direction dir. Similarly, consider the distance
  1467. * from ur to the same line. This can be thought of as the 'ascent'
  1468. * of this character.
  1469. *
  1470. * We'd like the distance from ul to ll to be greater than this, so
  1471. * as to ensure we cover the possible location where an underline
  1472. * might reasonably go.
  1473. *
  1474. * If we have a line (l) through point A with direction vector u,
  1475. * the distance between point P and line(l) is:
  1476. *
  1477. * d(P,l) = || AP x u || / || u ||
  1478. *
  1479. * where x is the cross product.
  1480. *
  1481. * For us, because || dir || = 1:
  1482. *
  1483. * d(ul, origin) = || (origin-ul) x dir ||
  1484. *
  1485. * The cross product is only defined in 3 (or 7!) dimensions, so
  1486. * extend both vectors into 3d by defining a 0 z component.
  1487. *
  1488. * (origin-ul) x dir = [ (origin.y - ul.y) . 0 - 0 . dir.y ]
  1489. * [ 0 . dir.x - (origin.x - ul.y) . 0 ]
  1490. * [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ]
  1491. *
  1492. * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x
  1493. */
  1494. float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2;
  1495. fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y };
  1496. fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y };
  1497. float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2;
  1498. int neg = 0;
  1499. /* We'd like height to be at least ascent + 1/4 size */
  1500. if (height < 0)
  1501. neg = 1, height = -height;
  1502. if (height < ascent + size * 0.25f)
  1503. height = ascent + size * 0.25f;
  1504. height -= ascent;
  1505. if (neg)
  1506. height = -height;
  1507. quad.ll.x += - height * dir.y;
  1508. quad.ll.y += height * dir.x;
  1509. quad.lr.x += - height * dir.y;
  1510. quad.lr.y += height * dir.x;
  1511. return fz_rect_from_quad(quad);
  1512. }
  1513. static int feq(float a,float b)
  1514. {
  1515. #define EPSILON 0.00001
  1516. a -= b;
  1517. if (a < 0)
  1518. a = -a;
  1519. return a < EPSILON;
  1520. }
  1521. static void
  1522. check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir)
  1523. {
  1524. for ( ; block; block = block->next)
  1525. {
  1526. fz_stext_line *line;
  1527. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1528. continue;
  1529. for (line = block->u.t.first_line; line != NULL; line = line->next)
  1530. {
  1531. fz_stext_char *ch;
  1532. if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) &&
  1533. (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y)))
  1534. continue;
  1535. /* Matching directions... */
  1536. /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check
  1537. * chars. - FIXME: Now we do! */
  1538. for (ch = line->first_char; ch; ch = ch->next)
  1539. {
  1540. fz_point up;
  1541. float dx, dy, dot;
  1542. fz_rect ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size);
  1543. if (!line_crosses_rect(from, to, ch_box))
  1544. continue;
  1545. /* Is this a strikeout or an underline? */
  1546. /* The baseline moves from ch->origin in the direction line->dir */
  1547. up.x = line->dir.y;
  1548. up.y = -line->dir.x;
  1549. /* How far is our line displaced from the line through the origin? */
  1550. dx = from.x - ch->origin.x;
  1551. dy = from.y - ch->origin.y;
  1552. /* Dot product with up. up is normalised */
  1553. dot = dx * up.x + dy * up.y;
  1554. if (dot > 0)
  1555. ch->flags |= FZ_STEXT_STRIKEOUT;
  1556. else
  1557. ch->flags |= FZ_STEXT_UNDERLINE;
  1558. }
  1559. }
  1560. }
  1561. }
  1562. static void
  1563. check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page)
  1564. {
  1565. int i, n = tdev->rect_len;
  1566. for (i = 0; i < n; i++)
  1567. {
  1568. fz_point from = tdev->rects[i].from;
  1569. fz_point to = tdev->rects[i].to;
  1570. fz_point dir;
  1571. dir.x = to.x - from.x;
  1572. dir.y = to.y - from.y;
  1573. dir = fz_normalize_vector(dir);
  1574. check_strikeout(ctx, page->first_block, from, to, dir);
  1575. }
  1576. }
  1577. static void
  1578. fz_stext_close_device(fz_context *ctx, fz_device *dev)
  1579. {
  1580. fz_stext_device *tdev = (fz_stext_device*)dev;
  1581. fz_stext_page *page = tdev->page;
  1582. fixup_bboxes_and_bidi(ctx, page->first_block);
  1583. if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES)
  1584. check_rects_for_strikeout(ctx, tdev, page);
  1585. /* TODO: smart sorting of blocks and lines in reading order */
  1586. /* TODO: unicode NFC normalization */
  1587. if (tdev->opts.flags & FZ_STEXT_SEGMENT)
  1588. fz_segment_stext_page(ctx, page);
  1589. if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK)
  1590. fz_paragraph_break(ctx, page);
  1591. if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT)
  1592. fz_table_hunt(ctx, page);
  1593. }
  1594. static void
  1595. fz_stext_drop_device(fz_context *ctx, fz_device *dev)
  1596. {
  1597. fz_stext_device *tdev = (fz_stext_device*)dev;
  1598. fz_drop_text(ctx, tdev->lasttext);
  1599. fz_drop_font(ctx, tdev->last.font);
  1600. while (tdev->metatext)
  1601. pop_metatext(ctx, tdev);
  1602. fz_free(ctx, tdev->rects);
  1603. }
  1604. static int
  1605. val_is_rect(const char *val, fz_rect *rp)
  1606. {
  1607. fz_rect r;
  1608. const char *s;
  1609. s = strchr(val, ':');
  1610. if (s == NULL || s == val)
  1611. return 0;
  1612. r.x0 = fz_atof(val);
  1613. val = s+1;
  1614. s = strchr(val, ':');
  1615. if (s == NULL || s == val)
  1616. return 0;
  1617. r.y0 = fz_atof(val);
  1618. val = s+1;
  1619. s = strchr(val, ':');
  1620. if (s == NULL || s == val)
  1621. return 0;
  1622. r.x1 = fz_atof(val);
  1623. val = s+1;
  1624. r.y1 = fz_atof(val);
  1625. *rp = r;
  1626. return 1;
  1627. }
  1628. fz_stext_options *
  1629. fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
  1630. {
  1631. const char *val;
  1632. memset(opts, 0, sizeof *opts);
  1633. if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes"))
  1634. opts->flags |= FZ_STEXT_PRESERVE_LIGATURES;
  1635. if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes"))
  1636. opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE;
  1637. if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes"))
  1638. opts->flags |= FZ_STEXT_PRESERVE_IMAGES;
  1639. if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes"))
  1640. opts->flags |= FZ_STEXT_INHIBIT_SPACES;
  1641. if (fz_has_option(ctx, string, "dehyphenate", &val) && fz_option_eq(val, "yes"))
  1642. opts->flags |= FZ_STEXT_DEHYPHENATE;
  1643. if (fz_has_option(ctx, string, "preserve-spans", &val) && fz_option_eq(val, "yes"))
  1644. opts->flags |= FZ_STEXT_PRESERVE_SPANS;
  1645. if (fz_has_option(ctx, string, "structured", &val) && fz_option_eq(val, "yes"))
  1646. opts->flags |= FZ_STEXT_COLLECT_STRUCTURE;
  1647. if (fz_has_option(ctx, string, "use-cid-for-unknown-unicode", &val) && fz_option_eq(val, "yes"))
  1648. opts->flags |= FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE;
  1649. if (fz_has_option(ctx, string, "use-gid-for-unknown-unicode", &val) && fz_option_eq(val, "yes"))
  1650. opts->flags |= FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE;
  1651. if (fz_has_option(ctx, string, "accurate-bboxes", &val) && fz_option_eq(val, "yes"))
  1652. opts->flags |= FZ_STEXT_ACCURATE_BBOXES;
  1653. if (fz_has_option(ctx, string, "vectors", &val) && fz_option_eq(val, "yes"))
  1654. opts->flags |= FZ_STEXT_COLLECT_VECTORS;
  1655. if (fz_has_option(ctx, string, "ignore-actualtext", & val) && fz_option_eq(val, "yes"))
  1656. opts->flags |= FZ_STEXT_IGNORE_ACTUALTEXT;
  1657. if (fz_has_option(ctx, string, "segment", &val) && fz_option_eq(val, "yes"))
  1658. opts->flags |= FZ_STEXT_SEGMENT;
  1659. if (fz_has_option(ctx, string, "paragraph-break", &val) && fz_option_eq(val, "yes"))
  1660. opts->flags |= FZ_STEXT_PARAGRAPH_BREAK;
  1661. if (fz_has_option(ctx, string, "table-hunt", &val) && fz_option_eq(val, "yes"))
  1662. opts->flags |= FZ_STEXT_TABLE_HUNT;
  1663. if (fz_has_option(ctx, string, "collect-styles", &val) && fz_option_eq(val, "yes"))
  1664. opts->flags |= FZ_STEXT_COLLECT_STYLES;
  1665. if (fz_has_option(ctx, string, "accurate-ascenders", &val) && fz_option_eq(val, "yes"))
  1666. opts->flags |= FZ_STEXT_ACCURATE_ASCENDERS;
  1667. if (fz_has_option(ctx, string, "accurate-side-bearings", &val) && fz_option_eq(val, "yes"))
  1668. opts->flags |= FZ_STEXT_ACCURATE_SIDE_BEARINGS;
  1669. opts->flags |= FZ_STEXT_CLIP;
  1670. if (fz_has_option(ctx, string, "mediabox-clip", &val))
  1671. {
  1672. fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead.");
  1673. if (fz_option_eq(val, "no"))
  1674. opts->flags ^= FZ_STEXT_CLIP;
  1675. }
  1676. if (fz_has_option(ctx, string, "clip", &val) && fz_option_eq(val, "no"))
  1677. opts->flags ^= FZ_STEXT_CLIP;
  1678. if (fz_has_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip))
  1679. opts->flags |= FZ_STEXT_CLIP_RECT;
  1680. opts->scale = 1;
  1681. if (fz_has_option(ctx, string, "resolution", &val))
  1682. opts->scale = fz_atof(val) / 96.0f; /* HTML base resolution is 96ppi */
  1683. return opts;
  1684. }
/* State gathered by the is_rect_* path walker callbacks. */
typedef struct
{
	/* Set to 1 once the path is known not to qualify. */
	int fail;
	/* Number of entries of corners[] filled in so far. */
	int count;
	/* The points stashed by the walker, in the order they were seen. */
	fz_point corners[4];
} is_rect_data;
  1691. static void
  1692. stash_point(is_rect_data *rd, float x, float y)
  1693. {
  1694. if (rd->count > 3)
  1695. {
  1696. rd->fail = 1;
  1697. return;
  1698. }
  1699. rd->corners[rd->count].x = x;
  1700. rd->corners[rd->count].y = y;
  1701. rd->count++;
  1702. }
  1703. static void
  1704. is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
  1705. {
  1706. is_rect_data *rd = arg;
  1707. if (rd->fail)
  1708. return;
  1709. if (rd->count != 0)
  1710. {
  1711. rd->fail = 1;
  1712. return;
  1713. }
  1714. stash_point(rd, x, y);
  1715. }
  1716. static void
  1717. is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
  1718. {
  1719. is_rect_data *rd = arg;
  1720. if (rd->fail)
  1721. return;
  1722. if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y)
  1723. return;
  1724. stash_point(rd, x, y);
  1725. }
  1726. static void
  1727. is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
  1728. {
  1729. is_rect_data *rd = arg;
  1730. rd->fail = 1;
  1731. }
  1732. static void
  1733. is_rect_closepath(fz_context *ctx, void *arg)
  1734. {
  1735. is_rect_data *rd = arg;
  1736. if (rd->fail)
  1737. return;
  1738. if (rd->count == 3)
  1739. stash_point(rd, rd->corners[0].x, rd->corners[0].y);
  1740. if (rd->count != 4)
  1741. rd->fail = 1;
  1742. }
  1743. static int
  1744. is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm)
  1745. {
  1746. float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
  1747. is_rect_data rd = { 0 };
  1748. static const fz_path_walker walker =
  1749. {
  1750. is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
  1751. };
  1752. int i;
  1753. fz_walk_path(ctx, path, &walker, &rd);
  1754. if (rd.fail)
  1755. return 0;
  1756. if (rd.count == 2)
  1757. {
  1758. stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
  1759. stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
  1760. }
  1761. for (i = 0 ; i < 4; i++)
  1762. {
  1763. fz_point p = fz_transform_point(rd.corners[i], ctm);
  1764. rd.corners[i].x = p.x;
  1765. rd.corners[i].y = p.y;
  1766. }
  1767. /* So we have a 4 cornered path. Hopefully something like:
  1768. * 0---------1
  1769. * | |
  1770. * 3---------2
  1771. * but it might be:
  1772. * 0---------3
  1773. * | |
  1774. * 1---------2
  1775. */
  1776. while (1)
  1777. {
  1778. d01x = rd.corners[1].x - rd.corners[0].x;
  1779. d01y = rd.corners[1].y - rd.corners[0].y;
  1780. d01 = d01x * d01x + d01y * d01y;
  1781. d03x = rd.corners[3].x - rd.corners[0].x;
  1782. d03y = rd.corners[3].y - rd.corners[0].y;
  1783. d03 = d03x * d03x + d03y * d03y;
  1784. if(d01 < d03)
  1785. {
  1786. /* We are the latter case. Transpose it. */
  1787. fz_point p = rd.corners[1];
  1788. rd.corners[1] = rd.corners[3];
  1789. rd.corners[3] = p;
  1790. }
  1791. else
  1792. break;
  1793. }
  1794. d32x = rd.corners[2].x - rd.corners[3].x;
  1795. d32y = rd.corners[2].y - rd.corners[3].y;
  1796. /* So d32x and d01x need to be the same for this to be a strikeout. */
  1797. if (!feq(d32x, d01x) || !feq(d32y, d01y))
  1798. return 0;
  1799. /* We are plausibly a rectangle. */
  1800. *thickness = sqrtf(d03x * d03x + d03y * d03y);
  1801. from->x = (rd.corners[0].x + rd.corners[3].x)/2;
  1802. from->y = (rd.corners[0].y + rd.corners[3].y)/2;
  1803. to->x = (rd.corners[1].x + rd.corners[2].x)/2;
  1804. to->y = (rd.corners[1].y + rd.corners[2].y)/2;
  1805. return 1;
  1806. }
  1807. static void
  1808. check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm)
  1809. {
  1810. float thickness;
  1811. fz_point from, to;
  1812. /* Is this path a thin rectangle (possibly rotated)? If so, then we need to
  1813. * consider it as being a strikeout or underline. */
  1814. if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm))
  1815. return;
  1816. /* Add to the list of rects in the device. */
  1817. if (tdev->rect_len == tdev->rect_max)
  1818. {
  1819. int newmax = tdev->rect_max * 2;
  1820. if (newmax == 0)
  1821. newmax = 32;
  1822. tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax);
  1823. tdev->rect_max = newmax;
  1824. }
  1825. tdev->rects[tdev->rect_len].from = from;
  1826. tdev->rects[tdev->rect_len].to = to;
  1827. tdev->rects[tdev->rect_len].thickness = thickness;
  1828. tdev->rect_len++;
  1829. }
  1830. static void
  1831. add_vector(fz_context *ctx, fz_stext_page *page, fz_rect bbox, uint32_t flags, uint32_t argb)
  1832. {
  1833. fz_stext_block *b = add_block_to_page(ctx, page);
  1834. b->type = FZ_STEXT_BLOCK_VECTOR;
  1835. b->bbox = bbox;
  1836. b->u.v.flags = flags;
  1837. b->u.v.argb = argb;
  1838. }
/* State shared by the split_* path walker callbacks. */
typedef struct
{
	/* Transform applied to every incoming path point. */
	fz_matrix ctm;
	/* Colour passed on to add_vector(). */
	uint32_t argb;
	/* Base vector flags passed on to add_vector(). */
	uint32_t flags;
	/* Page that receives the vector blocks. */
	fz_stext_page *page;
	/* Bounds of everything that was not recognised as a rectangle. */
	fz_rect leftovers;
	/* Most recently recognised rectangle, not yet emitted. */
	fz_rect pending;
	/* Number of points held in p[] for the current subpath; set to -1
	 * once the subpath can no longer be a rectangle. */
	int count;
	/* Points of the current subpath while it may still be a rectangle. */
	fz_point p[5];
} split_path_data;
  1850. static void
  1851. maybe_rect(fz_context *ctx, split_path_data *sp)
  1852. {
  1853. int rect = 0;
  1854. int i;
  1855. if (sp->count >= 0)
  1856. {
  1857. if (sp->count == 3)
  1858. {
  1859. /* Allow for "moveto A, lineto B, lineto A, close" */
  1860. if (feq(sp->p[0].x, sp->p[2].x) || feq(sp->p[0].y, sp->p[2].y))
  1861. sp->count = 2;
  1862. }
  1863. if (sp->count == 2)
  1864. {
  1865. if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y))
  1866. rect = 1; /* Count that as a rect */
  1867. }
  1868. else if (sp->count == 4 || sp->count == 5)
  1869. {
  1870. if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y))
  1871. rect = 1;
  1872. else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y))
  1873. rect = 1;
  1874. }
  1875. if (rect)
  1876. {
  1877. fz_rect bounds;
  1878. bounds.x0 = bounds.x1 = sp->p[0].x;
  1879. bounds.y0 = bounds.y1 = sp->p[0].y;
  1880. for (i = 1; i < sp->count; i++)
  1881. bounds = fz_include_point_in_rect(bounds, sp->p[i]);
  1882. if (fz_is_valid_rect(sp->pending))
  1883. add_vector(ctx, sp->page, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb);
  1884. sp->pending = bounds;
  1885. return;
  1886. }
  1887. for (i = 0; i < sp->count; i++)
  1888. sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
  1889. }
  1890. }
  1891. static void
  1892. split_move(fz_context *ctx, void *arg, float x, float y)
  1893. {
  1894. split_path_data *sp = (split_path_data *)arg;
  1895. fz_point p = fz_transform_point_xy(x, y, sp->ctm);
  1896. maybe_rect(ctx, sp);
  1897. sp->p[0] = p;
  1898. sp->count = 1;
  1899. }
  1900. static void
  1901. split_line(fz_context *ctx, void *arg, float x, float y)
  1902. {
  1903. split_path_data *sp = (split_path_data *)arg;
  1904. fz_point p = fz_transform_point_xy(x, y, sp->ctm);
  1905. int i;
  1906. if (sp->count >= 0)
  1907. {
  1908. /* Check for lines to the same point. */
  1909. if (feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y))
  1910. return;
  1911. /* If we're still maybe a rect, just record the point. */
  1912. if (sp->count < 4)
  1913. {
  1914. sp->p[sp->count++] = p;
  1915. return;
  1916. }
  1917. /* Check for close line? */
  1918. if (sp->count == 4)
  1919. {
  1920. if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y))
  1921. {
  1922. /* We've just drawn a line back to the start point. */
  1923. /* Needless saving of point, but it makes the logic
  1924. * easier elsewhere. */
  1925. sp->p[sp->count++] = p;
  1926. return;
  1927. }
  1928. }
  1929. /* We can no longer be a rect. Output the points we had saved. */
  1930. for (i = 0; i < sp->count; i++)
  1931. sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
  1932. /* Remember we're not a rect. */
  1933. sp->count = -1;
  1934. }
  1935. /* Roll this point into the non-rect bounds. */
  1936. sp->leftovers = fz_include_point_in_rect(sp->leftovers, p);
  1937. }
  1938. static void
  1939. split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
  1940. {
  1941. split_path_data *sp = (split_path_data *)arg;
  1942. fz_point p1 = fz_transform_point_xy(x1, y1, sp->ctm);
  1943. fz_point p2 = fz_transform_point_xy(x2, y2, sp->ctm);
  1944. fz_point p3 = fz_transform_point_xy(x3, y3, sp->ctm);
  1945. int i;
  1946. if (sp->count >= 0)
  1947. {
  1948. /* We can no longer be a rect. Output the points we had saved. */
  1949. for (i = 0; i < sp->count; i++)
  1950. sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
  1951. /* Remember we're not a rect. */
  1952. sp->count = -1;
  1953. }
  1954. /* Roll these points into the non-rect bounds. */
  1955. sp->leftovers = fz_include_point_in_rect(sp->leftovers, p1);
  1956. sp->leftovers = fz_include_point_in_rect(sp->leftovers, p2);
  1957. sp->leftovers = fz_include_point_in_rect(sp->leftovers, p3);
  1958. }
  1959. static void
  1960. split_close(fz_context *ctx, void *arg)
  1961. {
  1962. split_path_data *sp = (split_path_data *)arg;
  1963. maybe_rect(ctx, sp);
  1964. sp->count = 0;
  1965. }
/* Walker that splits a path into rectangles and leftover bounds; the
 * callbacks are given in the order moveto, lineto, curveto, closepath
 * and expect a split_path_data as their argument. */
static const
fz_path_walker split_path_rects =
{
	split_move,
	split_line,
	split_curve,
	split_close
};
  1974. static void
  1975. add_vectors_from_path(fz_context *ctx, fz_stext_page *page, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, int stroke)
  1976. {
  1977. int have_leftovers;
  1978. split_path_data sp;
  1979. sp.ctm = ctm;
  1980. sp.argb = hexrgba_from_color(ctx, cs, color, alpha);
  1981. sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0;
  1982. sp.page = page;
  1983. sp.count = 0;
  1984. sp.leftovers = fz_empty_rect;
  1985. sp.pending = fz_empty_rect;
  1986. fz_walk_path(ctx, path, &split_path_rects, &sp);
  1987. have_leftovers = fz_is_valid_rect(sp.leftovers);
  1988. maybe_rect(ctx, &sp);
  1989. if (fz_is_valid_rect(sp.pending))
  1990. add_vector(ctx, page, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb);
  1991. if (have_leftovers)
  1992. add_vector(ctx, page, sp.leftovers, sp.flags, sp.argb);
  1993. }
  1994. static void
  1995. fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
  1996. {
  1997. fz_stext_device *tdev = (fz_stext_device*)dev;
  1998. fz_stext_page *page = tdev->page;
  1999. fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm);
  2000. fz_rect *bounds = actualtext_bounds(tdev);
  2001. /* If we're in an actualtext, then update the bounds to include this content. */
  2002. if (bounds != NULL)
  2003. *bounds = fz_union_rect(*bounds, path_bounds);
  2004. if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
  2005. check_for_strikeout(ctx, tdev, page, path, ctm);
  2006. if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
  2007. add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 0);
  2008. }
  2009. static void
  2010. fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
  2011. {
  2012. fz_stext_device *tdev = (fz_stext_device*)dev;
  2013. fz_stext_page *page = tdev->page;
  2014. fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm);
  2015. fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);
  2016. /* If we're in an actualtext, then update the bounds to include this content. */
  2017. if (bounds != NULL)
  2018. *bounds = fz_union_rect(*bounds, path_bounds);
  2019. if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
  2020. check_for_strikeout(ctx, tdev, page, path, ctm);
  2021. if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
  2022. add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 1);
  2023. }
  2024. static void
  2025. new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
  2026. {
  2027. fz_stext_struct *str;
  2028. size_t z;
  2029. if (raw == NULL)
  2030. raw = "";
  2031. z = strlen(raw);
  2032. str = fz_pool_alloc(ctx, page->pool, sizeof(*str) + z);
  2033. str->first_block = NULL;
  2034. str->last_block = NULL;
  2035. str->standard = standard;
  2036. str->parent = page->last_struct;
  2037. str->up = block;
  2038. memcpy(str->raw, raw, z+1);
  2039. block->u.s.down = str;
  2040. }
/*
 * begin_structure callback: descend into the structure element with the
 * given index at the current level, creating it if necessary.
 *
 * Starting from the current insertion point, the sibling list is
 * searched for a FZ_STEXT_BLOCK_STRUCT block with index 'idx'. If one
 * is found it is reused (creating its struct node when missing, or
 * warning when its type/raw tag differ). Otherwise a new struct block
 * is allocated and linked in so that struct blocks stay ordered by
 * index. Either way page->last_struct is moved to the struct node so
 * that subsequent content is added inside it.
 */
static void
fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
{
	fz_stext_device *tdev = (fz_stext_device*)dev;
	fz_stext_page *page = tdev->page;
	fz_stext_block *block, *le, *gt, *newblock;

	/* Find a pointer to the last block. */
	if (page->last_block)
	{
		block = page->last_block;
	}
	else if (page->last_struct)
	{
		block = page->last_struct->last_block;
	}
	else
	{
		block = page->first_block;
	}

	/* So block is somewhere in the content chain. Let's try and find:
	 *   le = the struct node <= idx before block in the content chain.
	 *   gt = the struct node > idx after block in the content chain.
	 * Search backwards to start with.
	 */
	gt = NULL;
	le = block;
	while (le)
	{
		if (le->type == FZ_STEXT_BLOCK_STRUCT)
		{
			if (le->u.s.index > idx)
				gt = le;
			if (le->u.s.index <= idx)
				break;
		}
		le = le->prev;
	}
	/* The following loop copes with finding gt (the smallest block with an index higher
	 * than we want) if we haven't found it already. The while loop in here was designed
	 * to cope with 'block' being in the middle of a list. In fact, the way the code is
	 * currently, block will always be at the end of a list, so the while won't do anything.
	 * But I'm loath to remove it in case we ever change this code to start from wherever
	 * we did the last insertion. */
	if (gt == NULL)
	{
		gt = block;
		while (gt)
		{
			if (gt->type == FZ_STEXT_BLOCK_STRUCT)
			{
				if (gt->u.s.index <= idx)
					le = gt;
				if (gt->u.s.index >= idx)
					break;
			}
			/* Remember the last block visited, so that 'block' is the
			 * tail of the list if we run off the end. */
			block = gt;
			gt = gt->next;
		}
	}

	/* le is only ever left pointing at a struct block (or NULL) by the
	 * two searches above, so reading u.s here is safe. */
	if (le && le->u.s.index == idx)
	{
		/* We want to move down into the le block. Does it have a struct
		 * attached yet? */
		if (le->u.s.down == NULL)
		{
			/* No. We need to create a new struct node. */
			new_stext_struct(ctx, page, le, standard, raw);
		}
		else if (le->u.s.down->standard != standard ||
				(raw == NULL && le->u.s.down->raw[0] != 0) ||
				(raw != NULL && strcmp(raw, le->u.s.down->raw) != 0))
		{
			/* Yes, but it doesn't match the one we expect! */
			fz_warn(ctx, "Mismatched structure type!");
		}
		page->last_struct = le->u.s.down;
		page->last_block = le->u.s.down->last_block;

		return;
	}

	/* We are going to need to create a new block. Create a complete unlinked one here. */
	newblock = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
	newblock->bbox = fz_empty_rect;
	newblock->prev = NULL;
	newblock->next = NULL;
	newblock->type = FZ_STEXT_BLOCK_STRUCT;
	newblock->u.s.index = idx;
	newblock->u.s.down = NULL;
	/* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
	new_stext_struct(ctx, page, newblock, standard, raw);

	/* So now we just need to link it in somewhere. */
	if (gt)
	{
		/* Link it in before gt. */
		/* NOTE(review): when gt->prev is NULL the new block becomes the
		 * head of its list, but neither page->first_block nor the
		 * enclosing struct's first_block is updated here - confirm
		 * whether that case can occur. */
		newblock->prev = gt->prev;
		if (gt->prev)
			gt->prev->next = newblock;
		gt->prev = newblock;
		newblock->next = gt;
	}
	else if (block)
	{
		/* Link it in at the end of the list (i.e. after 'block') */
		/* NOTE(review): the enclosing struct's last_block is not moved
		 * on to newblock here - verify against add_block_to_page and
		 * fz_stext_end_structure. */
		newblock->prev = block;
		block->next = newblock;
	}
	else if (page->last_struct)
	{
		/* We have no blocks at all at this level. */
		page->last_struct->first_block = newblock;
		page->last_struct->last_block = newblock;
	}
	else
	{
		/* We have no blocks at ANY level. */
		page->first_block = newblock;
	}
	/* Wherever we linked it in, that's where we want to continue adding content. */
	page->last_struct = newblock->u.s.down;
	page->last_block = NULL;
}
  2161. static void
  2162. fz_stext_end_structure(fz_context *ctx, fz_device *dev)
  2163. {
  2164. fz_stext_device *tdev = (fz_stext_device*)dev;
  2165. fz_stext_page *page = tdev->page;
  2166. fz_stext_struct *str = page->last_struct;
  2167. if (str == NULL)
  2168. {
  2169. fz_warn(ctx, "Structure out of sync");
  2170. return;
  2171. }
  2172. page->last_struct = str->parent;
  2173. if (page->last_struct == NULL)
  2174. {
  2175. page->last_block = page->first_block;
  2176. /* Yuck */
  2177. while (page->last_block->next)
  2178. page->last_block = page->last_block->next;
  2179. }
  2180. else
  2181. {
  2182. page->last_block = page->last_struct->last_block;
  2183. }
  2184. }
  2185. fz_device *
  2186. fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
  2187. {
  2188. fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
  2189. dev->super.close_device = fz_stext_close_device;
  2190. dev->super.drop_device = fz_stext_drop_device;
  2191. dev->super.fill_text = fz_stext_fill_text;
  2192. dev->super.stroke_text = fz_stext_stroke_text;
  2193. dev->super.clip_text = fz_stext_clip_text;
  2194. dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
  2195. dev->super.ignore_text = fz_stext_ignore_text;
  2196. dev->super.begin_metatext = fz_stext_begin_metatext;
  2197. dev->super.end_metatext = fz_stext_end_metatext;
  2198. dev->super.fill_shade = fz_stext_fill_shade;
  2199. dev->super.fill_image = fz_stext_fill_image;
  2200. dev->super.fill_image_mask = fz_stext_fill_image_mask;
  2201. if (opts)
  2202. {
  2203. dev->flags = opts->flags;
  2204. if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE)
  2205. {
  2206. dev->super.begin_structure = fz_stext_begin_structure;
  2207. dev->super.end_structure = fz_stext_end_structure;
  2208. }
  2209. if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES))
  2210. {
  2211. dev->super.fill_path = fz_stext_fill_path;
  2212. dev->super.stroke_path = fz_stext_stroke_path;
  2213. }
  2214. }
  2215. dev->page = page;
  2216. dev->pen.x = 0;
  2217. dev->pen.y = 0;
  2218. dev->trm = fz_identity;
  2219. dev->lastchar = ' ';
  2220. dev->lasttext = NULL;
  2221. dev->lastbidi = 0;
  2222. dev->last_was_fake_bold = 1;
  2223. if (opts)
  2224. dev->opts = *opts;
  2225. if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
  2226. dev->super.hints |= FZ_DONT_DECODE_IMAGES;
  2227. dev->rect_max = 0;
  2228. dev->rect_len = 0;
  2229. dev->rects = NULL;
  2230. return (fz_device*)dev;
  2231. }