| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519 |
- // Copyright (C) 2004-2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include "glyphbox.h"
- #include <float.h>
- #include <string.h>
- /* Simple layout structure */
- fz_layout_block *fz_new_layout(fz_context *ctx)
- {
- fz_pool *pool = fz_new_pool(ctx);
- fz_layout_block *block;
- fz_try(ctx)
- {
- block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block));
- block->pool = pool;
- block->head = NULL;
- block->tailp = &block->head;
- }
- fz_catch(ctx)
- {
- fz_drop_pool(ctx, pool);
- fz_rethrow(ctx);
- }
- return block;
- }
- void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
- {
- if (block)
- fz_drop_pool(ctx, block->pool);
- }
- void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p)
- {
- fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line));
- line->x = x;
- line->y = y;
- line->font_size = font_size;
- line->p = p;
- line->text = NULL;
- line->next = NULL;
- *block->tailp = line;
- block->tailp = &line->next;
- block->text_tailp = &line->text;
- }
- void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p)
- {
- fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char));
- ch->x = x;
- ch->advance = advance;
- ch->p = p;
- ch->next = NULL;
- *block->text_tailp = ch;
- block->text_tailp = &ch->next;
- }
/* Extract text into blocks and lines. */

/*
 * Thresholds used by fz_add_stext_char_imp() when relating a new character
 * to the previous one. Each is compared against a distance that has been
 * divided by the text size (the expansion of the text matrix), so they are
 * expressed in multiples of the text size.
 */

/* Offsets from the previous baseline larger than this start a new paragraph (block). */
#define PARAGRAPH_DIST 1.5f
/* In-line gaps with a magnitude below this are ignored: same line, no space added. */
#define SPACE_DIST 0.15f
/* In-line gaps up to this magnitude stay on the same line; larger jumps start a new line. */
#define SPACE_MAX_DIST 0.8f
/* Offsets from the previous baseline smaller than this are treated as the same line. */
#define BASE_MAX_DIST 0.8f
/* A repeat of the previous character ending within this distance of the pen is dropped as fake bold. */
#define FAKE_BOLD_MAX_DIST 0.1f
- /* We keep a stack of the different metatexts that apply at any
- * given point (normally none!). Whenever we get some content
- * with a metatext in force, we really want to update the bounds
- * for that metatext. But running along the whole list each time
- * would be painful. So we just update the bounds for dev->metatext
- * and rely on metatext_bounds() propagating it upwards 'just in
- * time' for us to use metatexts other than the latest one. This
- * also means we need to propagate bounds upwards when we pop
- * a metatext.
- *
- * Why do we need bounds at all? Well, suppose we get:
- * /Span <</ActualText (c) >> BDC /Im0 Do EMC
- * Then where on the page do we put 'c' ? By collecting the
- * bounds, we can place 'c' wherever the image was.
- */
- typedef struct metatext_t
- {
- fz_metatext type;
- char *text;
- fz_rect bounds;
- struct metatext_t *prev;
- } metatext_t;
- typedef struct
- {
- fz_point from;
- fz_point to;
- float thickness;
- } rect_details;
- typedef struct
- {
- fz_device super;
- fz_stext_page *page;
- int id;
- fz_point pen, start;
- fz_point lag_pen;
- fz_matrix trm;
- int new_obj;
- int lastchar;
- int lastbidi;
- int flags;
- int color;
- int last_was_fake_bold;
- const fz_text *lasttext;
- fz_stext_options opts;
- metatext_t *metatext;
- /* Store the last values we saw. We need this for flushing the actualtext. */
- struct
- {
- int valid;
- int clipped;
- fz_matrix trm;
- int wmode;
- int bidi_level;
- fz_font *font;
- int flags;
- } last;
- /* The list of 'rects' seen during processing (if we're collecting styles). */
- int rect_max;
- int rect_len;
- rect_details *rects;
- } fz_stext_device;
/*
 * Human-readable help text listing the structured text output options,
 * one tab-indented option per line, terminated by a blank line.
 */
const char *fz_stext_options_usage =
	"Text output options:\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-spans: do not merge spans on the same line\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tparagraph-break: break blocks at paragraph boundaries\n"
	"\tdehyphenate: attempt to join up hyphenated words\n"
	"\tignore-actualtext: do not apply ActualText replacements\n"
	"\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
	"\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
	"\taccurate-bboxes: calculate char bboxes from the outlines\n"
	"\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
	"\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
	"\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
	"\tclip: do not include text that is completely clipped\n"
	"\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
	"\tstructured: collect structure markup\n"
	"\tvectors: include vector bboxes in output\n"
	"\tsegment: attempt to segment the page\n"
	"\ttable-hunt: hunt for tables within a (segmented) page\n"
	"\n";
- /* Find the current actualtext, if any. Will abort if dev == NULL. */
- static metatext_t *
- find_actualtext(fz_stext_device *dev)
- {
- metatext_t *mt = dev->metatext;
- while (mt && mt->type != FZ_METATEXT_ACTUALTEXT)
- mt = mt->prev;
- return mt;
- }
- /* Find the bounds of the given metatext. Will abort if mt or
- * dev are NULL. */
- static fz_rect *
- metatext_bounds(metatext_t *mt, fz_stext_device *dev)
- {
- metatext_t *mt2 = dev->metatext;
- while (mt2 != mt)
- {
- mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds);
- mt2 = mt2->prev;
- }
- return &mt->bounds;
- }
- /* Find the bounds of the current actualtext, or NULL if there
- * isn't one. Will abort if dev is NULL. */
- static fz_rect *
- actualtext_bounds(fz_stext_device *dev)
- {
- metatext_t *mt = find_actualtext(dev);
- if (mt == NULL)
- return NULL;
- return metatext_bounds(mt, dev);
- }
- fz_stext_page *
- fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
- {
- fz_pool *pool = fz_new_pool(ctx);
- fz_stext_page *page = NULL;
- fz_try(ctx)
- {
- page = fz_pool_alloc(ctx, pool, sizeof(*page));
- page->pool = pool;
- page->mediabox = mediabox;
- page->first_block = NULL;
- page->last_block = NULL;
- }
- fz_catch(ctx)
- {
- fz_drop_pool(ctx, pool);
- fz_rethrow(ctx);
- }
- return page;
- }
- static void
- drop_run(fz_context *ctx, fz_stext_block *block)
- {
- fz_stext_line *line;
- fz_stext_char *ch;
- while (block)
- {
- switch (block->type)
- {
- case FZ_STEXT_BLOCK_IMAGE:
- fz_drop_image(ctx, block->u.i.image);
- break;
- case FZ_STEXT_BLOCK_TEXT:
- for (line = block->u.t.first_line; line; line = line->next)
- for (ch = line->first_char; ch; ch = ch->next)
- fz_drop_font(ctx, ch->font);
- break;
- case FZ_STEXT_BLOCK_STRUCT:
- drop_run(ctx, block->u.s.down->first_block);
- break;
- default:
- break;
- }
- block = block->next;
- }
- }
- void
- fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
- {
- if (page)
- {
- drop_run(ctx, page->first_block);
- fz_drop_pool(ctx, page->pool);
- }
- }
- /*
- * This adds a new block at the end of the page. This should not be used
- * to add 'struct' blocks to the page as those have to be added internally,
- * with more complicated pointer setup.
- */
- static fz_stext_block *
- add_block_to_page(fz_context *ctx, fz_stext_page *page)
- {
- fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
- block->bbox = fz_empty_rect; /* Fixes bug 703267. */
- block->prev = page->last_block;
- if (page->last_struct)
- {
- if (page->last_struct->last_block)
- {
- block->prev = page->last_struct->last_block;
- block->prev->next = block;
- page->last_struct->last_block = block;
- }
- else
- page->last_struct->last_block = page->last_struct->first_block = block;
- }
- else if (!page->last_block)
- {
- page->last_block = block;
- if (!page->first_block)
- page->first_block = block;
- }
- else
- {
- page->last_block->next = block;
- page->last_block = block;
- }
- return block;
- }
- static fz_stext_block *
- add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
- {
- fz_stext_block *block = add_block_to_page(ctx, page);
- block->type = FZ_STEXT_BLOCK_TEXT;
- return block;
- }
- static fz_stext_block *
- add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image)
- {
- fz_stext_block *block = add_block_to_page(ctx, page);
- block->type = FZ_STEXT_BLOCK_IMAGE;
- block->u.i.transform = ctm;
- block->u.i.image = fz_keep_image(ctx, image);
- block->bbox = fz_transform_rect(fz_unit_rect, ctm);
- return block;
- }
- static fz_stext_line *
- add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi)
- {
- fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
- line->prev = block->u.t.last_line;
- if (!block->u.t.first_line)
- block->u.t.first_line = block->u.t.last_line = line;
- else
- {
- block->u.t.last_line->next = line;
- block->u.t.last_line = line;
- }
- line->dir = *dir;
- line->wmode = wmode;
- return line;
- }
/*
 * Sentinel values passed to add_char_to_line() in place of a real glyph id.
 */
/* A space being added while in accurate mode: ascender and descender are both zero. */
#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
/* Non accurate mode: take the ascender/descender from the font. */
#define NON_ACCURATE_GLYPH (-1)
- static fz_stext_char *
- add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
- {
- fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
- fz_point a, d;
- if (!line->first_char)
- line->first_char = line->last_char = ch;
- else
- {
- line->last_char->next = ch;
- line->last_char = ch;
- }
- ch->c = c;
- ch->argb = color;
- ch->bidi = bidi;
- ch->origin = *p;
- ch->size = size;
- ch->font = fz_keep_font(ctx, font);
- ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0);
- if (font->flags.is_bold)
- ch->flags |= FZ_STEXT_BOLD;
- if (line->wmode == 0)
- {
- fz_rect bounds;
- int bounded = 0;
- a.x = 0;
- d.x = 0;
- if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
- {
- /* Added space, in accurate mode. */
- a.y = d.y = 0;
- }
- else if (glyph == NON_ACCURATE_GLYPH)
- {
- /* Non accurate mode. */
- a.y = fz_font_ascender(ctx, font);
- d.y = fz_font_descender(ctx, font);
- }
- else
- {
- /* Any glyph in accurate mode */
- bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
- bounded = 1;
- a.y = bounds.y1;
- d.y = bounds.y0;
- }
- if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
- {
- if (!bounded)
- bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
- if (a.x > bounds.x0)
- a.x = bounds.x0;
- if (d.y < bounds.x1)
- d.y = bounds.x1;
- }
- }
- else
- {
- a.x = 1;
- d.x = 0;
- a.y = 0;
- d.y = 0;
- }
- a = fz_transform_vector(a, trm);
- d = fz_transform_vector(d, trm);
- ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
- ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
- ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
- ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
- return ch;
- }
- static void
- remove_last_char(fz_context *ctx, fz_stext_line *line)
- {
- if (line && line->first_char)
- {
- fz_stext_char *prev = NULL;
- fz_stext_char *ch = line->first_char;
- while (ch->next)
- {
- prev = ch;
- ch = ch->next;
- }
- if (prev)
- {
- /* The characters are pool allocated, so we don't actually leak the removed node. */
- /* We do need to drop the char's font reference though. */
- fz_drop_font(ctx, prev->next->font);
- line->last_char = prev;
- line->last_char->next = NULL;
- }
- }
- }
- static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail)
- {
- fz_stext_char *prev, *next;
- prev = tail;
- while (curr != tail)
- {
- next = curr->next;
- curr->next = prev;
- prev = curr;
- curr = next;
- }
- return prev;
- }
- static void reverse_bidi_line(fz_stext_line *line)
- {
- fz_stext_char *a, *b, **prev;
- prev = &line->first_char;
- for (a = line->first_char; a; a = a->next)
- {
- if (a->bidi)
- {
- b = a;
- while (b->next && b->next->bidi)
- b = b->next;
- if (a != b)
- *prev = reverse_bidi_span(a, b->next);
- }
- prev = &a->next;
- line->last_char = a;
- }
- }
/*
 * Return 1 if c is one of the recognised hyphen characters, else 0.
 */
static int is_hyphen(int c)
{
	switch (c)
	{
	case '-':	/* hyphen-minus */
	case 0xAD:	/* soft hyphen */
	case 0x2010:	/* hyphen */
	case 0x2011:	/* non-breaking hyphen */
		return 1;
	default:
		return 0;
	}
}
- static float
- vec_dot(const fz_point *a, const fz_point *b)
- {
- return a->x * b->x + a->y * b->y;
- }
/*
 * Decide whether a synthetic space may follow the given character.
 *
 * Never after a space. Otherwise allowed after any code below 0x700
 * (basic latin, greek, cyrillic, hebrew, arabic) or in the range
 * 0x2000..0x20CF (general punctuation, superscripts and subscripts,
 * currency symbols).
 */
static int may_add_space(int lastchar)
{
	if (lastchar == ' ')
		return 0;
	if (lastchar < 0x700)
		return 1;
	return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
/* Two coordinates count as 'close' when they differ by less than size divided by this. */
#define FAKEBOLD_THRESHOLD_RECIP 10

/*
 * Return nonzero if a and b differ by less than
 * size / FAKEBOLD_THRESHOLD_RECIP.
 */
static int
close(float a, float b, float size)
{
	float gap = a - b;
	float magnitude = (gap < 0) ? -gap : gap;

	return FAKEBOLD_THRESHOLD_RECIP * magnitude < size;
}
- static int
- font_equiv(fz_context *ctx, fz_font *f, fz_font *g)
- {
- unsigned char fdigest[16];
- unsigned char gdigest[16];
- if (f == g)
- return 1;
- if (strcmp(f->name, g->name) != 0)
- return 0;
- fz_font_digest(ctx, f, fdigest);
- fz_font_digest(ctx, g, gdigest);
- return (memcmp(fdigest, gdigest, 16) == 0);
- }
- static int
- check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
- {
- fz_stext_line *line;
- fz_stext_char *ch;
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
- return 1;
- }
- else if (block->type == FZ_STEXT_BLOCK_TEXT)
- {
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- fz_stext_char *pr = NULL;
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- /* Not perfect, but it'll do! */
- if (ch->c == c && close(ch->origin.x, p.x, size) && close(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font))
- {
- /* If we were filled before, and we are stroking now... */
- if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED &&
- (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED)
- {
- /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */
- ch->flags |= flags;
- return 1;
- }
- /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these
- * as boldening if either the char before, or the char after were also boldened. */
- ch->flags |= flags;
- if (c == ' ')
- {
- if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) ||
- (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
- {
- /* OK, we can be bold. */
- ch->flags |= FZ_STEXT_BOLD;
- return 1;
- }
- /* Ignore this and keep going */
- }
- else
- {
- ch->flags |= FZ_STEXT_BOLD;
- return 1;
- }
- }
- pr = ch;
- }
- }
- }
- }
- return 0;
- }
/*
 * Add a single character to the page, deciding where it belongs.
 *
 * From the character's matrix, writing mode and advance this works out
 * the start (p) and end (q) points of the glyph, compares them with the
 * pen position left by the previous character, and decides whether the
 * character continues the current line (optionally preceded by a
 * synthetic space), starts a new line, or starts a new block.
 *
 * dev: device state; dev->page receives the character, and the pen,
 *	lag_pen, start, lastchar, lastbidi, new_obj, trm and
 *	last_was_fake_bold fields are read and updated.
 * font: font for the character.
 * c: character code to store.
 * glyph: glyph id; a negative value marks a character with no glyph of
 *	its own (a follow-on character in a cluster), which is appended at
 *	the pen without moving it when there is a current line.
 * trm: text matrix for the glyph; trm.e/trm.f give its origin.
 * adv: advance, applied along the transformed direction vector.
 * wmode: 0 for horizontal writing, otherwise vertical.
 * bidi: only the low (RTL) bit is used; bit 1 is set here to tag RTL
 *	characters that appear to have been emitted in visual order.
 * force_new_line: if nonzero, a new line is always started.
 * flags: character flags, passed on to the characters created and to
 *	the fake bold check.
 */
static void
fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
{
	fz_stext_page *page = dev->page;
	fz_stext_block *cur_block;
	fz_stext_line *cur_line;

	int new_para = 0;
	int new_line = 1;
	int add_space = 0;
	fz_point dir, ndir, p, q;
	float size;
	fz_point delta;
	float spacing = 0;
	float base_offset = 0;
	float dist;

	/* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
	bidi = bidi & 1;

	/* dir = direction vector for motion. ndir = normalised(dir) */
	if (wmode == 0)
	{
		dir.x = 1;
		dir.y = 0;
	}
	else
	{
		dir.x = 0;
		dir.y = -1;
	}
	dir = fz_transform_vector(dir, trm);
	ndir = fz_normalize_vector(dir);

	size = fz_matrix_expansion(trm);

	/* We need to identify where glyphs 'start' (p) and 'stop' (q).
	 * Each glyph holds its 'start' position, and the next glyph in the
	 * span (or span->max if there is no next glyph) holds its 'end'
	 * position.
	 *
	 * For both horizontal and vertical motion, trm->{e,f} gives the
	 * origin (usually the bottom left) of the glyph.
	 *
	 * In horizontal mode:
	 *   + p is bottom left.
	 *   + q is the bottom right
	 * In vertical mode:
	 *   + p is top left (where it advanced from)
	 *   + q is bottom left
	 */
	if (wmode == 0)
	{
		p.x = trm.e;
		p.y = trm.f;
		q.x = trm.e + adv * dir.x;
		q.y = trm.f + adv * dir.y;
	}
	else
	{
		p.x = trm.e - adv * dir.x;
		p.y = trm.f - adv * dir.y;
		q.x = trm.e;
		q.y = trm.f;
	}

	/* When collecting styles, a glyph that overlays an existing identical
	 * character is treated as fake bold: the existing character is updated
	 * and no new character is added (only the pen state moves on). A
	 * glyph-less follow-on character is likewise skipped if the glyph
	 * before it was swallowed in this way. */
	if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
	{
		if (glyph == -1)
		{
			if (dev->last_was_fake_bold)
				goto move_pen_and_exit;
		}
		else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
		{
			dev->last_was_fake_bold = 1;
			goto move_pen_and_exit;
		}
		dev->last_was_fake_bold = 0;
	}

	/* Find current position to enter new text. */
	cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
	if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
		cur_block = NULL;
	cur_line = cur_block ? cur_block->u.t.last_line : NULL;

	if (cur_line && glyph < 0)
	{
		/* Don't advance pen or break lines for no-glyph characters in a cluster */
		add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags);
		dev->lastbidi = bidi;
		dev->lastchar = c;
		return;
	}

	if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
	{
		/* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
		 * then we can't append to the current block/line. */
		new_para = 1;
		new_line = 1;
	}
	else
	{
		/* Detect fake bold where text is printed twice in the same place. */
		/* Largely supplanted by the check_for_fake_bold mechanism above,
		 * but we leave this in for backward compatibility as it's cheap,
		 * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
		dist = hypotf(q.x - dev->pen.x, q.y - dev->pen.y) / size;
		if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar)
			return;

		/* Calculate how far we've moved since the last character. */
		delta.x = p.x - dev->pen.x;
		delta.y = p.y - dev->pen.y;

		/* The transform has not changed, so we know we're in the same
		 * direction. Calculate 2 distances; how far off the previous
		 * baseline we are, together with how far along the baseline
		 * we are from the expected position. */
		spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
		base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;

		/* Only a small amount off the baseline - we'll take this */
		if (fabsf(base_offset) < BASE_MAX_DIST)
		{
			/* If mixed LTR and RTL content */
			if ((bidi & 1) != (dev->lastbidi & 1))
			{
				/* Ignore jumps within line when switching between LTR and RTL text. */
				new_line = 0;
			}

			/* RTL */
			else if (bidi & 1)
			{
				fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
				float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;

				/* If the pen is where we would have been if we
				 * had advanced backwards from the previous
				 * character by this character's advance, we
				 * are probably seeing characters emitted in
				 * logical order.
				 */
				if (fabsf(logical_spacing) < SPACE_DIST)
				{
					new_line = 0;
				}

				/* However, if the pen has advanced to where we would expect it
				 * in an LTR context, we're seeing them emitted in visual order
				 * and should flag them for reordering!
				 */
				else if (fabsf(spacing) < SPACE_DIST)
				{
					bidi = 3; /* mark line as visual */
					new_line = 0;
				}

				/* And any other small jump could be a missing space. */
				else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
				{
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1;
					new_line = 0;
				}
				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
				{
					/* Motion is in line, but negative. We've probably got overlapping
					 * chars here. Live with it. */
					new_line = 0;
				}
				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
				{
					bidi = 3; /* mark line as visual */
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1;
					new_line = 0;
				}

				else
				{
					/* Motion is large and unexpected (probably a new table column). */
					new_line = 1;
				}
			}

			/* LTR or neutral character */
			else
			{
				if (fabsf(spacing) < SPACE_DIST)
				{
					/* Motion is in line and small enough to ignore. */
					new_line = 0;
				}
				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
				{
					/* Motion is in line, but negative. We've probably got overlapping
					 * chars here. Live with it. */
					new_line = 0;
				}
				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
				{
					/* Motion is forward in line and large enough to warrant us adding a space. */
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1;
					new_line = 0;
				}
				else
				{
					/* Motion is large and unexpected (probably a new table column). */
					new_line = 1;
				}
			}
		}

		/* Enough for a new line, but not enough for a new paragraph */
		else if (fabsf(base_offset) <= PARAGRAPH_DIST)
		{
			/* Check indent to spot text-indent style paragraphs */
			if (wmode == 0 && cur_line && dev->new_obj)
				if ((p.x - dev->start.x) > 0.5f)
					new_para = 1;
			new_line = 1;
		}

		/* Way off the baseline - open a new paragraph */
		else
		{
			new_para = 1;
			new_line = 1;
		}
	}

	/* Start a new block (but only at the beginning of a text object) */
	if (new_para || !cur_block)
	{
		cur_block = add_text_block_to_page(ctx, page);
		cur_line = cur_block->u.t.last_line;
	}

	/* When dehyphenating, a line break straight after a hyphen is undone:
	 * the hyphen is removed (where remove_last_char allows it) and the
	 * new character carries on in the current line. */
	if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && is_hyphen(dev->lastchar))
	{
		remove_last_char(ctx, cur_line);
		new_line = 0;
	}

	/* Start a new line */
	if (new_line || !cur_line || force_new_line)
	{
		cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
		dev->start = p;
	}

	/* Add synthetic space */
	if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
		add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, 1, flags, dev->flags);

	add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags);

	/* Record this character as the 'previous' one for the next call. */
move_pen_and_exit:
	dev->lastchar = c;
	dev->lastbidi = bidi;
	dev->lag_pen = p;
	dev->pen = q;

	dev->new_obj = 0;
	dev->trm = trm;
}
- static void
- fz_add_stext_char(fz_context *ctx,
- fz_stext_device *dev,
- fz_font *font,
- int c,
- int glyph,
- fz_matrix trm,
- float adv,
- int wmode,
- int bidi,
- int force_new_line,
- int flags)
- {
- /* ignore when one unicode character maps to multiple glyphs */
- if (c == -1)
- return;
- if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS)
- fz_calculate_font_ascender_descender(ctx, font);
- if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
- {
- switch (c)
- {
- case 0xFB00: /* ff */
- fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
- fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
- return;
- case 0xFB01: /* fi */
- fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
- fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
- return;
- case 0xFB02: /* fl */
- fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
- fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
- return;
- case 0xFB03: /* ffi */
- fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
- fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
- fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
- return;
- case 0xFB04: /* ffl */
- fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
- fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
- fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
- return;
- case 0xFB05: /* long st */
- case 0xFB06: /* st */
- fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags);
- fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags);
- return;
- }
- }
- if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
- {
- switch (c)
- {
- case 0x0009: /* tab */
- case 0x0020: /* space */
- case 0x00A0: /* no-break space */
- case 0x1680: /* ogham space mark */
- case 0x180E: /* mongolian vowel separator */
- case 0x2000: /* en quad */
- case 0x2001: /* em quad */
- case 0x2002: /* en space */
- case 0x2003: /* em space */
- case 0x2004: /* three-per-em space */
- case 0x2005: /* four-per-em space */
- case 0x2006: /* six-per-em space */
- case 0x2007: /* figure space */
- case 0x2008: /* punctuation space */
- case 0x2009: /* thin space */
- case 0x200A: /* hair space */
- case 0x202F: /* narrow no-break space */
- case 0x205F: /* medium mathematical space */
- case 0x3000: /* ideographic space */
- c = ' ';
- }
- }
- fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags);
- }
- static fz_rect
- current_clip(fz_context *ctx, fz_stext_device *dev)
- {
- fz_rect r = fz_infinite_rect;
- if (dev->flags & FZ_STEXT_CLIP)
- {
- r = fz_device_current_scissor(ctx, &dev->super);
- r = fz_intersect_rect(r, dev->page->mediabox);
- }
- if (dev->flags & FZ_STEXT_CLIP_RECT)
- r = fz_intersect_rect(r, dev->opts.clip);
- return r;
- }
- static void
- do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags)
- {
- fz_font *font = span->font;
- fz_matrix tm = span->trm;
- float adv;
- int unicode;
- int i;
- for (i = start; i < end; i++)
- {
- /* Calculate new pen location and delta */
- tm.e = span->items[i].x;
- tm.f = span->items[i].y;
- dev->last.trm = fz_concat(tm, ctm);
- dev->last.bidi_level = span->bidi_level;
- dev->last.wmode = span->wmode;
- if (font != dev->last.font)
- {
- fz_drop_font(ctx, dev->last.font);
- dev->last.font = fz_keep_font(ctx, font);
- }
- dev->last.valid = 1;
- dev->last.flags = flags;
- if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
- {
- fz_rect r = current_clip(ctx, dev);
- if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
- {
- dev->last.clipped = 1;
- continue;
- }
- }
- dev->last.clipped = 0;
- /* Calculate bounding box and new pen position based on font metrics */
- if (span->items[i].gid >= 0)
- adv = span->items[i].adv;
- else
- adv = 0;
- unicode = span->items[i].ucs;
- if (unicode == FZ_REPLACEMENT_CHARACTER)
- {
- if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE)
- {
- unicode = span->items[i].cid;
- flags |= FZ_STEXT_UNICODE_IS_CID;
- }
- else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE)
- {
- unicode = span->items[i].gid;
- flags |= FZ_STEXT_UNICODE_IS_GID;
- }
- }
- /* Send the chars we have through. */
- fz_add_stext_char(ctx, dev, font,
- unicode,
- span->items[i].gid,
- dev->last.trm,
- adv,
- dev->last.wmode,
- dev->last.bidi_level,
- (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
- flags);
- }
- }
/*
	Return the rune at position idx (0 based, counted in runes) of a
	UTF-8 string, or -1 if the string ends at or before that position.
*/
static int
rune_index(const char *utf8, size_t idx)
{
	for (;;)
	{
		int rune;
		int n = fz_chartorune(&rune, utf8);

		/* Hit the terminator before reaching the wanted rune. */
		if (rune == 0)
			return -1;
		if (idx == 0)
			return rune;
		idx--;
		utf8 += n;
	}
}
- static void
- flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i)
- {
- if (*actualtext == 0)
- return;
- while (1)
- {
- int rune;
- actualtext += fz_chartorune(&rune, actualtext);
- if (rune == 0)
- break;
- if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
- if (dev->last.clipped)
- continue;
- fz_add_stext_char(ctx, dev, dev->last.font,
- rune,
- -1,
- dev->last.trm,
- 0,
- dev->last.wmode,
- dev->last.bidi_level,
- (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
- dev->last.flags);
- i++;
- }
- }
- static void
- do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
- {
- /* We are within an actualtext block. This means we can't just add the chars
- * as they are. We need to add the chars as they are meant to be. Sadly the
- * actualtext mechanism doesn't help us at all with positioning. */
- fz_font *font = span->font;
- fz_matrix tm = span->trm;
- float adv;
- int start, i, end;
- char *actualtext = mt->text;
- size_t z = fz_utflen(actualtext);
- /* If actualtext is empty, nothing to do! */
- if (z == 0)
- return;
- /* Now, we HOPE that the creator of a PDF will minimise the actual text
- * differences, so that we'll get:
- * "Politicians <Actualtext="lie">fib</ActualText>, always."
- * rather than:
- * "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText>
- * but experience with PDF files tells us that this won't always be the case.
- *
- * We try to minimise the actualtext section here, just in case.
- */
- /* Spot a matching prefix and send it. */
- for (start = 0; start < span->len; start++)
- {
- int rune;
- int len = fz_chartorune(&rune, actualtext);
- if (span->items[start].gid != rune || rune == 0)
- break;
- actualtext += len; z--;
- }
- if (start != 0)
- do_extract(ctx, dev, span, ctm, 0, start, flags);
- if (start == span->len)
- {
- /* The prefix has consumed all this object. Just shorten the actualtext and we'll
- * catch the rest next time. */
- z = strlen(actualtext)+1;
- memmove(mt->text, actualtext, z);
- return;
- }
- /* We haven't consumed the whole string, so there must be runes left.
- * Shut coverity up. */
- assert(z != 0);
- /* Spot a matching postfix. Can't send it til the end. */
- for (end = span->len; end > start; end--)
- {
- /* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
- int rune = rune_index(actualtext, z-1);
- if (span->items[end-1].gid != rune)
- break;
- z--;
- }
- /* So we can send end -> span->len at the end. */
- /* So we have at least SOME chars that don't match. */
- /* Now, do the difficult bit in the middle.*/
- /* items[start..end] have to be sent with actualtext[start..z] */
- for (i = start; i < end; i++)
- {
- fz_text_item *item = &span->items[i];
- int rune = -1;
- if ((size_t)i < z)
- actualtext += fz_chartorune(&rune, actualtext);
- /* Calculate new pen location and delta */
- tm.e = item->x;
- tm.f = item->y;
- dev->last.trm = fz_concat(tm, ctm);
- dev->last.bidi_level = span->bidi_level;
- dev->last.wmode = span->wmode;
- if (font != dev->last.font)
- {
- fz_drop_font(ctx, dev->last.font);
- dev->last.font = fz_keep_font(ctx, font);
- }
- dev->last.valid = 1;
- if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
- {
- fz_rect r = current_clip(ctx, dev);
- if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
- {
- dev->last.clipped = 1;
- continue;
- }
- }
- dev->last.clipped = 0;
- /* Calculate bounding box and new pen position based on font metrics */
- if (item->gid >= 0)
- adv = item->adv;
- else
- adv = 0;
- fz_add_stext_char(ctx, dev, font,
- rune,
- span->items[i].gid,
- dev->last.trm,
- adv,
- dev->last.wmode,
- dev->last.bidi_level,
- (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
- flags);
- }
- /* If we haven't spotted a postfix by this point, then don't force ourselves to output
- * any more of the actualtext at this point. We might get a new text object that matches
- * more of it. */
- if (end == span->len)
- {
- /* Shorten actualtext and exit. */
- z = strlen(actualtext)+1;
- memmove(mt->text, actualtext, z);
- return;
- }
- /* We found a matching postfix. It seems likely that this is going to be the only
- * text object we get, so send any remaining actualtext now. */
- flush_actualtext(ctx, dev, actualtext, i);
- /* Send the postfix */
- if (end != span->len)
- do_extract(ctx, dev, span, ctm, end, span->len, flags);
- mt->text[0] = 0;
- }
- static void
- fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- metatext_t *mt = NULL;
- if (span->len == 0)
- return;
- /* Are we in an actualtext? */
- if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT))
- mt = find_actualtext(dev);
- if (mt)
- do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags);
- else
- do_extract(ctx, dev, span, ctm, 0, span->len, flags);
- }
- static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha)
- {
- float rgb[3];
- fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);
- return
- (fz_clampi(alpha * 255 + 0.5f, 0, 255) << 24) |
- (fz_clampi(rgb[0] * 255 + 0.5f, 0, 255) << 16) |
- (fz_clampi(rgb[1] * 255 + 0.5f, 0, 255) << 8) |
- (fz_clampi(rgb[2] * 255 + 0.5f, 0, 255));
- }
- static void
- fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
- fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_text_span *span;
- if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
- return;
- tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
- tdev->new_obj = 1;
- for (span = text->head; span; span = span->next)
- fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED);
- fz_drop_text(ctx, tdev->lasttext);
- tdev->lasttext = fz_keep_text(ctx, text);
- }
- static void
- fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
- fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_text_span *span;
- if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
- return;
- tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
- tdev->new_obj = 1;
- for (span = text->head; span; span = span->next)
- fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED);
- fz_drop_text(ctx, tdev->lasttext);
- tdev->lasttext = fz_keep_text(ctx, text);
- }
- static void
- fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_text_span *span;
- if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
- return;
- tdev->color = 0;
- tdev->new_obj = 1;
- for (span = text->head; span; span = span->next)
- fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED);
- fz_drop_text(ctx, tdev->lasttext);
- tdev->lasttext = fz_keep_text(ctx, text);
- }
- static void
- fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_text_span *span;
- if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
- return;
- tdev->color = 0;
- tdev->new_obj = 1;
- for (span = text->head; span; span = span->next)
- fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED);
- fz_drop_text(ctx, tdev->lasttext);
- tdev->lasttext = fz_keep_text(ctx, text);
- }
- static void
- fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_text_span *span;
- if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
- return;
- tdev->color = 0;
- tdev->new_obj = 1;
- for (span = text->head; span; span = span->next)
- fz_stext_extract(ctx, tdev, span, ctm, 0);
- fz_drop_text(ctx, tdev->lasttext);
- tdev->lasttext = fz_keep_text(ctx, text);
- }
- static void
- fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- metatext_t *mt = fz_malloc_struct(ctx, metatext_t);
- mt->prev = tdev->metatext;
- tdev->metatext = mt;
- mt->type = meta;
- mt->text = text ? fz_strdup(ctx, text) : NULL;
- mt->bounds = fz_empty_rect;
- }
- static void
- pop_metatext(fz_context *ctx, fz_stext_device *dev)
- {
- metatext_t *prev;
- fz_rect bounds;
- if (!dev->metatext)
- return;
- prev = dev->metatext->prev;
- bounds = dev->metatext->bounds;
- fz_free(ctx, dev->metatext->text);
- fz_free(ctx, dev->metatext);
- dev->metatext = prev;
- if (prev)
- prev->bounds = fz_union_rect(prev->bounds, bounds);
- }
- static void
- fz_stext_end_metatext(fz_context *ctx, fz_device *dev)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_font *myfont = NULL;
- if (!tdev->metatext)
- return; /* Mismatched pop. Live with it. */
- if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT)
- {
- /* We only deal with ActualText here. Just pop anything else off,
- * and we're done. */
- pop_metatext(ctx, tdev);
- return;
- }
- /* If we have a 'last' text position, send the content after that. */
- if (tdev->last.valid)
- {
- flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
- pop_metatext(ctx, tdev);
- return;
- }
- /* If we have collected a rectangle for content that encloses the actual text,
- * send the content there. */
- if (!fz_is_empty_rect(tdev->metatext->bounds))
- {
- tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0;
- tdev->last.trm.b = 0;
- tdev->last.trm.c = 0;
- tdev->last.trm.d = tdev->metatext->bounds.y1 - tdev->metatext->bounds.y0;
- tdev->last.trm.e = tdev->metatext->bounds.x0;
- tdev->last.trm.f = tdev->metatext->bounds.y0;
- }
- else
- fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned.");
- fz_var(myfont);
- fz_try(ctx)
- {
- if (tdev->last.font == NULL)
- {
- myfont = fz_new_base14_font(ctx, "Helvetica");
- tdev->last.font = myfont;
- }
- flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
- pop_metatext(ctx, tdev);
- }
- fz_always(ctx)
- {
- if (myfont)
- {
- tdev->last.font = NULL;
- fz_drop_font(ctx, myfont);
- }
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- /* Images and shadings */
- static void
- fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_rect *bounds = actualtext_bounds(tdev);
- /* If there is an actualtext in force, update its bounds. */
- if (bounds)
- {
- static const fz_rect unit = { 0, 0, 1, 1 };
- *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm));
- }
- /* Unless we are being told to preserve images, nothing to do here. */
- if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
- return;
- /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
- if (alpha >= 0.5f)
- add_image_block_to_page(ctx, tdev->page, ctm, img);
- }
- static void
- fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
- fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
- }
- static fz_image *
- fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
- {
- fz_matrix ctm = *in_out_ctm;
- fz_pixmap *pix;
- fz_image *img = NULL;
- fz_rect bounds;
- fz_irect bbox;
- bounds = fz_bound_shade(ctx, shade, ctm);
- bounds = fz_intersect_rect(bounds, scissor);
- bbox = fz_irect_from_rect(bounds);
- pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
- fz_try(ctx)
- {
- if (shade->use_background)
- fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
- else
- fz_clear_pixmap(ctx, pix);
- fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL);
- img = fz_new_image_from_pixmap(ctx, pix, NULL);
- }
- fz_always(ctx)
- fz_drop_pixmap(ctx, pix);
- fz_catch(ctx)
- fz_rethrow(ctx);
- in_out_ctm->a = pix->w;
- in_out_ctm->b = 0;
- in_out_ctm->c = 0;
- in_out_ctm->d = pix->h;
- in_out_ctm->e = pix->x;
- in_out_ctm->f = pix->y;
- return img;
- }
- static void
- fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_rect *bounds = actualtext_bounds(tdev);
- fz_matrix local_ctm;
- fz_rect scissor;
- fz_image *image;
- /* If we aren't keeping images, but we are in a bound, update the bounds
- * without generating the entire image. */
- if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && bounds)
- {
- *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm));
- return;
- }
- /* Unless we are preserving image, nothing to do here. */
- if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
- return;
- local_ctm = ctm;
- scissor = fz_device_current_scissor(ctx, dev);
- if (dev->flags & FZ_STEXT_CLIP_RECT)
- scissor = fz_intersect_rect(scissor, tdev->opts.clip);
- scissor = fz_intersect_rect(scissor, tdev->page->mediabox);
- image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
- fz_try(ctx)
- fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
- fz_always(ctx)
- fz_drop_image(ctx, image);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block)
- {
- fz_stext_line *line;
- fz_stext_char *ch;
- for ( ; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- if (block->u.s.down)
- fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block);
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- continue;
- for (line = block->u.t.first_line; line; line = line->next)
- {
- int reorder = 0;
- for (ch = line->first_char; ch; ch = ch->next)
- {
- fz_rect ch_box = fz_rect_from_quad(ch->quad);
- if (ch == line->first_char)
- line->bbox = ch_box;
- else
- line->bbox = fz_union_rect(line->bbox, ch_box);
- if (ch->bidi == 3)
- reorder = 1;
- }
- block->bbox = fz_union_rect(block->bbox, line->bbox);
- if (reorder)
- reverse_bidi_line(line);
- }
- }
- }
- static void
- advance_x(fz_point *a, fz_point b, float d)
- {
- a->y += (b.y - a->y) * d / (b.x - a->x);
- a->x += d;
- }
- static void
- advance_y(fz_point *a, fz_point b, float d)
- {
- a->x += (b.x - a->x) * d / (b.y - a->y);
- a->y += d;
- }
- static int
- line_crosses_rect(fz_point a, fz_point b, fz_rect r)
- {
- /* Cope with trivial exclusions */
- if (a.x < r.x0 && b.x < r.x0)
- return 0;
- if (a.x > r.x1 && b.x > r.x1)
- return 0;
- if (a.y < r.y0 && b.y < r.y0)
- return 0;
- if (a.y > r.y1 && b.y > r.y1)
- return 0;
- if (a.x < r.x0)
- advance_x(&a, b, r.x0 - a.x);
- if (a.x > r.x1)
- advance_x(&a, b, r.x1 - a.x);
- if (a.y < r.y0)
- advance_y(&a, b, r.y0 - a.y);
- if (a.y > r.y1)
- advance_y(&a, b, r.y1 - a.y);
- return fz_is_point_inside_rect(a, r);
- }
- static float
- calculate_ascent(fz_point p, fz_point origin, fz_point dir)
- {
- return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x);
- }
- /* Create us a rect from the given quad, but extend it downwards
- * to allow for underlines that pass under the glyphs. */
- static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size)
- {
- /* Consider the two rects from A and g respectively.
- *
- * ul +------+ ur or
- * | /\ | ul +------+ ur
- * | /__\ | | /''\ |
- * |/ \| |( ||
- * ll +------+ lr | ''''||
- * | ''' | <-expected underline level
- * ll +------+ lr
- *
- * So an underline won't cross A's rect, but will cross g's.
- * We want to make a rect that includes a suitable amount of
- * space underneath. The information we have available to us
- * is summed up here:
- *
- * ul +---------+ ur
- * | |
- * | origin |
- * |+----------> dir
- * | |
- * ll +---------+ lr
- *
- * Consider the distance from ul to the line that passes through
- * the origin with direction dir. Similarly, consider the distance
- * from ur to the same line. This can be thought of as the 'ascent'
- * of this character.
- *
- * We'd like the distance from ul to ll to be greater than this, so
- * as to ensure we cover the possible location where an underline
- * might reasonably go.
- *
- * If we have a line (l) through point A with direction vector u,
- * the distance between point P and line(l) is:
- *
- * d(P,l) = || AP x u || / || u ||
- *
- * where x is the cross product.
- *
- * For us, because || dir || = 1:
- *
- * d(ul, origin) = || (origin-ul) x dir ||
- *
- * The cross product is only defined in 3 (or 7!) dimensions, so
- * extend both vectors into 3d by defining a 0 z component.
- *
- * (origin-ul) x dir = [ (origin.y - ul.y) . 0 - 0 . dir.y ]
- * [ 0 . dir.x - (origin.x - ul.y) . 0 ]
- * [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ]
- *
- * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x
- */
- float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2;
- fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y };
- fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y };
- float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2;
- int neg = 0;
- /* We'd like height to be at least ascent + 1/4 size */
- if (height < 0)
- neg = 1, height = -height;
- if (height < ascent + size * 0.25f)
- height = ascent + size * 0.25f;
- height -= ascent;
- if (neg)
- height = -height;
- quad.ll.x += - height * dir.y;
- quad.ll.y += height * dir.x;
- quad.lr.x += - height * dir.y;
- quad.lr.y += height * dir.x;
- return fz_rect_from_quad(quad);
- }
/* Approximate float equality: non-zero when |a - b| is below EPSILON. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
	float delta = a;

	delta -= b;
	delta = (delta < 0) ? -delta : delta;
	return delta < EPSILON;
}
- static void
- check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir)
- {
- for ( ; block; block = block->next)
- {
- fz_stext_line *line;
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- continue;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- fz_stext_char *ch;
- if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) &&
- (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y)))
- continue;
- /* Matching directions... */
- /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check
- * chars. - FIXME: Now we do! */
- for (ch = line->first_char; ch; ch = ch->next)
- {
- fz_point up;
- float dx, dy, dot;
- fz_rect ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size);
- if (!line_crosses_rect(from, to, ch_box))
- continue;
- /* Is this a strikeout or an underline? */
- /* The baseline moves from ch->origin in the direction line->dir */
- up.x = line->dir.y;
- up.y = -line->dir.x;
- /* How far is our line displaced from the line through the origin? */
- dx = from.x - ch->origin.x;
- dy = from.y - ch->origin.y;
- /* Dot product with up. up is normalised */
- dot = dx * up.x + dy * up.y;
- if (dot > 0)
- ch->flags |= FZ_STEXT_STRIKEOUT;
- else
- ch->flags |= FZ_STEXT_UNDERLINE;
- }
- }
- }
- }
- static void
- check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page)
- {
- int i, n = tdev->rect_len;
- for (i = 0; i < n; i++)
- {
- fz_point from = tdev->rects[i].from;
- fz_point to = tdev->rects[i].to;
- fz_point dir;
- dir.x = to.x - from.x;
- dir.y = to.y - from.y;
- dir = fz_normalize_vector(dir);
- check_strikeout(ctx, page->first_block, from, to, dir);
- }
- }
- static void
- fz_stext_close_device(fz_context *ctx, fz_device *dev)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_page *page = tdev->page;
- fixup_bboxes_and_bidi(ctx, page->first_block);
- if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES)
- check_rects_for_strikeout(ctx, tdev, page);
- /* TODO: smart sorting of blocks and lines in reading order */
- /* TODO: unicode NFC normalization */
- if (tdev->opts.flags & FZ_STEXT_SEGMENT)
- fz_segment_stext_page(ctx, page);
- if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK)
- fz_paragraph_break(ctx, page);
- if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT)
- fz_table_hunt(ctx, page);
- }
- static void
- fz_stext_drop_device(fz_context *ctx, fz_device *dev)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_drop_text(ctx, tdev->lasttext);
- fz_drop_font(ctx, tdev->last.font);
- while (tdev->metatext)
- pop_metatext(ctx, tdev);
- fz_free(ctx, tdev->rects);
- }
- static int
- val_is_rect(const char *val, fz_rect *rp)
- {
- fz_rect r;
- const char *s;
- s = strchr(val, ':');
- if (s == NULL || s == val)
- return 0;
- r.x0 = fz_atof(val);
- val = s+1;
- s = strchr(val, ':');
- if (s == NULL || s == val)
- return 0;
- r.y0 = fz_atof(val);
- val = s+1;
- s = strchr(val, ':');
- if (s == NULL || s == val)
- return 0;
- r.x1 = fz_atof(val);
- val = s+1;
- r.y1 = fz_atof(val);
- *rp = r;
- return 1;
- }
- fz_stext_options *
- fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
- {
- const char *val;
- memset(opts, 0, sizeof *opts);
- if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_PRESERVE_LIGATURES;
- if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE;
- if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_PRESERVE_IMAGES;
- if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_INHIBIT_SPACES;
- if (fz_has_option(ctx, string, "dehyphenate", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_DEHYPHENATE;
- if (fz_has_option(ctx, string, "preserve-spans", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_PRESERVE_SPANS;
- if (fz_has_option(ctx, string, "structured", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_COLLECT_STRUCTURE;
- if (fz_has_option(ctx, string, "use-cid-for-unknown-unicode", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE;
- if (fz_has_option(ctx, string, "use-gid-for-unknown-unicode", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE;
- if (fz_has_option(ctx, string, "accurate-bboxes", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_ACCURATE_BBOXES;
- if (fz_has_option(ctx, string, "vectors", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_COLLECT_VECTORS;
- if (fz_has_option(ctx, string, "ignore-actualtext", & val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_IGNORE_ACTUALTEXT;
- if (fz_has_option(ctx, string, "segment", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_SEGMENT;
- if (fz_has_option(ctx, string, "paragraph-break", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_PARAGRAPH_BREAK;
- if (fz_has_option(ctx, string, "table-hunt", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_TABLE_HUNT;
- if (fz_has_option(ctx, string, "collect-styles", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_COLLECT_STYLES;
- if (fz_has_option(ctx, string, "accurate-ascenders", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_ACCURATE_ASCENDERS;
- if (fz_has_option(ctx, string, "accurate-side-bearings", &val) && fz_option_eq(val, "yes"))
- opts->flags |= FZ_STEXT_ACCURATE_SIDE_BEARINGS;
- opts->flags |= FZ_STEXT_CLIP;
- if (fz_has_option(ctx, string, "mediabox-clip", &val))
- {
- fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead.");
- if (fz_option_eq(val, "no"))
- opts->flags ^= FZ_STEXT_CLIP;
- }
- if (fz_has_option(ctx, string, "clip", &val) && fz_option_eq(val, "no"))
- opts->flags ^= FZ_STEXT_CLIP;
- if (fz_has_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip))
- opts->flags |= FZ_STEXT_CLIP_RECT;
- opts->scale = 1;
- if (fz_has_option(ctx, string, "resolution", &val))
- opts->scale = fz_atof(val) / 96.0f; /* HTML base resolution is 96ppi */
- return opts;
- }
- typedef struct
- {
- int fail;
- int count;
- fz_point corners[4];
- } is_rect_data;
- static void
- stash_point(is_rect_data *rd, float x, float y)
- {
- if (rd->count > 3)
- {
- rd->fail = 1;
- return;
- }
- rd->corners[rd->count].x = x;
- rd->corners[rd->count].y = y;
- rd->count++;
- }
- static void
- is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
- {
- is_rect_data *rd = arg;
- if (rd->fail)
- return;
- if (rd->count != 0)
- {
- rd->fail = 1;
- return;
- }
- stash_point(rd, x, y);
- }
- static void
- is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
- {
- is_rect_data *rd = arg;
- if (rd->fail)
- return;
- if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y)
- return;
- stash_point(rd, x, y);
- }
- static void
- is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
- {
- is_rect_data *rd = arg;
- rd->fail = 1;
- }
- static void
- is_rect_closepath(fz_context *ctx, void *arg)
- {
- is_rect_data *rd = arg;
- if (rd->fail)
- return;
- if (rd->count == 3)
- stash_point(rd, rd->corners[0].x, rd->corners[0].y);
- if (rd->count != 4)
- rd->fail = 1;
- }
- static int
- is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm)
- {
- float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
- is_rect_data rd = { 0 };
- static const fz_path_walker walker =
- {
- is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
- };
- int i;
- fz_walk_path(ctx, path, &walker, &rd);
- if (rd.fail)
- return 0;
- if (rd.count == 2)
- {
- stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
- stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
- }
- for (i = 0 ; i < 4; i++)
- {
- fz_point p = fz_transform_point(rd.corners[i], ctm);
- rd.corners[i].x = p.x;
- rd.corners[i].y = p.y;
- }
- /* So we have a 4 cornered path. Hopefully something like:
- * 0---------1
- * | |
- * 3---------2
- * but it might be:
- * 0---------3
- * | |
- * 1---------2
- */
- while (1)
- {
- d01x = rd.corners[1].x - rd.corners[0].x;
- d01y = rd.corners[1].y - rd.corners[0].y;
- d01 = d01x * d01x + d01y * d01y;
- d03x = rd.corners[3].x - rd.corners[0].x;
- d03y = rd.corners[3].y - rd.corners[0].y;
- d03 = d03x * d03x + d03y * d03y;
- if(d01 < d03)
- {
- /* We are the latter case. Transpose it. */
- fz_point p = rd.corners[1];
- rd.corners[1] = rd.corners[3];
- rd.corners[3] = p;
- }
- else
- break;
- }
- d32x = rd.corners[2].x - rd.corners[3].x;
- d32y = rd.corners[2].y - rd.corners[3].y;
- /* So d32x and d01x need to be the same for this to be a strikeout. */
- if (!feq(d32x, d01x) || !feq(d32y, d01y))
- return 0;
- /* We are plausibly a rectangle. */
- *thickness = sqrtf(d03x * d03x + d03y * d03y);
- from->x = (rd.corners[0].x + rd.corners[3].x)/2;
- from->y = (rd.corners[0].y + rd.corners[3].y)/2;
- to->x = (rd.corners[1].x + rd.corners[2].x)/2;
- to->y = (rd.corners[1].y + rd.corners[2].y)/2;
- return 1;
- }
- static void
- check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm)
- {
- float thickness;
- fz_point from, to;
- /* Is this path a thin rectangle (possibly rotated)? If so, then we need to
- * consider it as being a strikeout or underline. */
- if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm))
- return;
- /* Add to the list of rects in the device. */
- if (tdev->rect_len == tdev->rect_max)
- {
- int newmax = tdev->rect_max * 2;
- if (newmax == 0)
- newmax = 32;
- tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax);
- tdev->rect_max = newmax;
- }
- tdev->rects[tdev->rect_len].from = from;
- tdev->rects[tdev->rect_len].to = to;
- tdev->rects[tdev->rect_len].thickness = thickness;
- tdev->rect_len++;
- }
- static void
- add_vector(fz_context *ctx, fz_stext_page *page, fz_rect bbox, uint32_t flags, uint32_t argb)
- {
- fz_stext_block *b = add_block_to_page(ctx, page);
- b->type = FZ_STEXT_BLOCK_VECTOR;
- b->bbox = bbox;
- b->u.v.flags = flags;
- b->u.v.argb = argb;
- }
- typedef struct
- {
- fz_matrix ctm;
- uint32_t argb;
- uint32_t flags;
- fz_stext_page *page;
- fz_rect leftovers;
- fz_rect pending;
- int count;
- fz_point p[5];
- } split_path_data;
- static void
- maybe_rect(fz_context *ctx, split_path_data *sp)
- {
- int rect = 0;
- int i;
- if (sp->count >= 0)
- {
- if (sp->count == 3)
- {
- /* Allow for "moveto A, lineto B, lineto A, close" */
- if (feq(sp->p[0].x, sp->p[2].x) || feq(sp->p[0].y, sp->p[2].y))
- sp->count = 2;
- }
- if (sp->count == 2)
- {
- if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y))
- rect = 1; /* Count that as a rect */
- }
- else if (sp->count == 4 || sp->count == 5)
- {
- if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y))
- rect = 1;
- else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y))
- rect = 1;
- }
- if (rect)
- {
- fz_rect bounds;
- bounds.x0 = bounds.x1 = sp->p[0].x;
- bounds.y0 = bounds.y1 = sp->p[0].y;
- for (i = 1; i < sp->count; i++)
- bounds = fz_include_point_in_rect(bounds, sp->p[i]);
- if (fz_is_valid_rect(sp->pending))
- add_vector(ctx, sp->page, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb);
- sp->pending = bounds;
- return;
- }
- for (i = 0; i < sp->count; i++)
- sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
- }
- }
- static void
- split_move(fz_context *ctx, void *arg, float x, float y)
- {
- split_path_data *sp = (split_path_data *)arg;
- fz_point p = fz_transform_point_xy(x, y, sp->ctm);
- maybe_rect(ctx, sp);
- sp->p[0] = p;
- sp->count = 1;
- }
- static void
- split_line(fz_context *ctx, void *arg, float x, float y)
- {
- split_path_data *sp = (split_path_data *)arg;
- fz_point p = fz_transform_point_xy(x, y, sp->ctm);
- int i;
- if (sp->count >= 0)
- {
- /* Check for lines to the same point. */
- if (feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y))
- return;
- /* If we're still maybe a rect, just record the point. */
- if (sp->count < 4)
- {
- sp->p[sp->count++] = p;
- return;
- }
- /* Check for close line? */
- if (sp->count == 4)
- {
- if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y))
- {
- /* We've just drawn a line back to the start point. */
- /* Needless saving of point, but it makes the logic
- * easier elsewhere. */
- sp->p[sp->count++] = p;
- return;
- }
- }
- /* We can no longer be a rect. Output the points we had saved. */
- for (i = 0; i < sp->count; i++)
- sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
- /* Remember we're not a rect. */
- sp->count = -1;
- }
- /* Roll this point into the non-rect bounds. */
- sp->leftovers = fz_include_point_in_rect(sp->leftovers, p);
- }
- static void
- split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
- {
- split_path_data *sp = (split_path_data *)arg;
- fz_point p1 = fz_transform_point_xy(x1, y1, sp->ctm);
- fz_point p2 = fz_transform_point_xy(x2, y2, sp->ctm);
- fz_point p3 = fz_transform_point_xy(x3, y3, sp->ctm);
- int i;
- if (sp->count >= 0)
- {
- /* We can no longer be a rect. Output the points we had saved. */
- for (i = 0; i < sp->count; i++)
- sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
- /* Remember we're not a rect. */
- sp->count = -1;
- }
- /* Roll these points into the non-rect bounds. */
- sp->leftovers = fz_include_point_in_rect(sp->leftovers, p1);
- sp->leftovers = fz_include_point_in_rect(sp->leftovers, p2);
- sp->leftovers = fz_include_point_in_rect(sp->leftovers, p3);
- }
- static void
- split_close(fz_context *ctx, void *arg)
- {
- split_path_data *sp = (split_path_data *)arg;
- maybe_rect(ctx, sp);
- sp->count = 0;
- }
- static const
- fz_path_walker split_path_rects =
- {
- split_move,
- split_line,
- split_curve,
- split_close
- };
- static void
- add_vectors_from_path(fz_context *ctx, fz_stext_page *page, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, int stroke)
- {
- int have_leftovers;
- split_path_data sp;
- sp.ctm = ctm;
- sp.argb = hexrgba_from_color(ctx, cs, color, alpha);
- sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0;
- sp.page = page;
- sp.count = 0;
- sp.leftovers = fz_empty_rect;
- sp.pending = fz_empty_rect;
- fz_walk_path(ctx, path, &split_path_rects, &sp);
- have_leftovers = fz_is_valid_rect(sp.leftovers);
- maybe_rect(ctx, &sp);
- if (fz_is_valid_rect(sp.pending))
- add_vector(ctx, page, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb);
- if (have_leftovers)
- add_vector(ctx, page, sp.leftovers, sp.flags, sp.argb);
- }
- static void
- fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_page *page = tdev->page;
- fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm);
- fz_rect *bounds = actualtext_bounds(tdev);
- /* If we're in an actualtext, then update the bounds to include this content. */
- if (bounds != NULL)
- *bounds = fz_union_rect(*bounds, path_bounds);
- if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
- check_for_strikeout(ctx, tdev, page, path, ctm);
- if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
- add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 0);
- }
- static void
- fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_page *page = tdev->page;
- fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm);
- fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);
- /* If we're in an actualtext, then update the bounds to include this content. */
- if (bounds != NULL)
- *bounds = fz_union_rect(*bounds, path_bounds);
- if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
- check_for_strikeout(ctx, tdev, page, path, ctm);
- if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
- add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 1);
- }
- static void
- new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
- {
- fz_stext_struct *str;
- size_t z;
- if (raw == NULL)
- raw = "";
- z = strlen(raw);
- str = fz_pool_alloc(ctx, page->pool, sizeof(*str) + z);
- str->first_block = NULL;
- str->last_block = NULL;
- str->standard = standard;
- str->parent = page->last_struct;
- str->up = block;
- memcpy(str->raw, raw, z+1);
- block->u.s.down = str;
- }
- static void
- fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_page *page = tdev->page;
- fz_stext_block *block, *le, *gt, *newblock;
- /* Find a pointer to the last block. */
- if (page->last_block)
- {
- block = page->last_block;
- }
- else if (page->last_struct)
- {
- block = page->last_struct->last_block;
- }
- else
- {
- block = page->first_block;
- }
- /* So block is somewhere in the content chain. Let's try and find:
- * le = the struct node <= idx before block in the content chain.
- * ge = the struct node >= idx after block in the content chain.
- * Search backwards to start with.
- */
- gt = NULL;
- le = block;
- while (le)
- {
- if (le->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (le->u.s.index > idx)
- gt = le;
- if (le->u.s.index <= idx)
- break;
- }
- le = le->prev;
- }
- /* The following loop copes with finding gt (the smallest block with an index higher
- * than we want) if we haven't found it already. The while loop in here was designed
- * to cope with 'block' being in the middle of a list. In fact, the way the code is
- * currently, block will always be at the end of a list, so the while won't do anything.
- * But I'm loathe to remove it in case we ever change this code to start from wherever
- * we did the last insertion. */
- if (gt == NULL)
- {
- gt = block;
- while (gt)
- {
- if (gt->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (gt->u.s.index <= idx)
- le = gt;
- if (gt->u.s.index >= idx)
- break;
- }
- block = gt;
- gt = gt->next;
- }
- }
- if (le && le->u.s.index == idx)
- {
- /* We want to move down into the le block. Does it have a struct
- * attached yet? */
- if (le->u.s.down == NULL)
- {
- /* No. We need to create a new struct node. */
- new_stext_struct(ctx, page, le, standard, raw);
- }
- else if (le->u.s.down->standard != standard ||
- (raw == NULL && le->u.s.down->raw[0] != 0) ||
- (raw != NULL && strcmp(raw, le->u.s.down->raw) != 0))
- {
- /* Yes, but it doesn't match the one we expect! */
- fz_warn(ctx, "Mismatched structure type!");
- }
- page->last_struct = le->u.s.down;
- page->last_block = le->u.s.down->last_block;
- return;
- }
- /* We are going to need to create a new block. Create a complete unlinked one here. */
- newblock = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
- newblock->bbox = fz_empty_rect;
- newblock->prev = NULL;
- newblock->next = NULL;
- newblock->type = FZ_STEXT_BLOCK_STRUCT;
- newblock->u.s.index = idx;
- newblock->u.s.down = NULL;
- /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
- new_stext_struct(ctx, page, newblock, standard, raw);
- /* So now we just need to link it in somewhere. */
- if (gt)
- {
- /* Link it in before gt. */
- newblock->prev = gt->prev;
- if (gt->prev)
- gt->prev->next = newblock;
- gt->prev = newblock;
- newblock->next = gt;
- }
- else if (block)
- {
- /* Link it in at the end of the list (i.e. after 'block') */
- newblock->prev = block;
- block->next = newblock;
- }
- else if (page->last_struct)
- {
- /* We have no blocks at all at this level. */
- page->last_struct->first_block = newblock;
- page->last_struct->last_block = newblock;
- }
- else
- {
- /* We have no blocks at ANY level. */
- page->first_block = newblock;
- }
- /* Wherever we linked it in, that's where we want to continue adding content. */
- page->last_struct = newblock->u.s.down;
- page->last_block = NULL;
- }
- static void
- fz_stext_end_structure(fz_context *ctx, fz_device *dev)
- {
- fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_page *page = tdev->page;
- fz_stext_struct *str = page->last_struct;
- if (str == NULL)
- {
- fz_warn(ctx, "Structure out of sync");
- return;
- }
- page->last_struct = str->parent;
- if (page->last_struct == NULL)
- {
- page->last_block = page->first_block;
- /* Yuck */
- while (page->last_block->next)
- page->last_block = page->last_block->next;
- }
- else
- {
- page->last_block = page->last_struct->last_block;
- }
- }
- fz_device *
- fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
- {
- fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
- dev->super.close_device = fz_stext_close_device;
- dev->super.drop_device = fz_stext_drop_device;
- dev->super.fill_text = fz_stext_fill_text;
- dev->super.stroke_text = fz_stext_stroke_text;
- dev->super.clip_text = fz_stext_clip_text;
- dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
- dev->super.ignore_text = fz_stext_ignore_text;
- dev->super.begin_metatext = fz_stext_begin_metatext;
- dev->super.end_metatext = fz_stext_end_metatext;
- dev->super.fill_shade = fz_stext_fill_shade;
- dev->super.fill_image = fz_stext_fill_image;
- dev->super.fill_image_mask = fz_stext_fill_image_mask;
- if (opts)
- {
- dev->flags = opts->flags;
- if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE)
- {
- dev->super.begin_structure = fz_stext_begin_structure;
- dev->super.end_structure = fz_stext_end_structure;
- }
- if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES))
- {
- dev->super.fill_path = fz_stext_fill_path;
- dev->super.stroke_path = fz_stext_stroke_path;
- }
- }
- dev->page = page;
- dev->pen.x = 0;
- dev->pen.y = 0;
- dev->trm = fz_identity;
- dev->lastchar = ' ';
- dev->lasttext = NULL;
- dev->lastbidi = 0;
- dev->last_was_fake_bold = 1;
- if (opts)
- dev->opts = *opts;
- if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
- dev->super.hints |= FZ_DONT_DECODE_IMAGES;
- dev->rect_max = 0;
- dev->rect_len = 0;
- dev->rects = NULL;
- return (fz_device*)dev;
- }
|