| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174 |
- // Copyright (C) 2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include <assert.h>
- /* #define DEBUG_WRITE_AS_PS */
- /* #define DEBUG_TABLE_STRUCTURE */
- /* #define DEBUG_TABLE_HUNT */
- /*
- * The algorithm.
- *
- * The goal of the algorithm is to identify tables on a page.
- * First we have to find the tables on a page, then we have to
- * figure out where the columns/rows are, and then how the
- * cells span them.
- *
- * We do this as a series of steps.
- *
- * To illustrate what's going on, let's use an example page
- * that we can follow through all the steps.
- *
- * +---------------------------+
- * | |
- * | #### ## ### ## | <- Title
- * | |
- * | ##### ##### #### ## | \
- * | ## ###### ###### ## | |
- * | #### ####### ###### | |- Abstract
- * | ####### #### ## ### | |
- * | ### ##### ###### | /
- * | |
- * | ######### ######### | 2 Columns of text
- * | ######### ######### |
- * | ######## ######### |
- * | ######### |
- * | +-------+ ####### | <- With an image on the left
- * | | | |
- * | | | ## ## # # | <- And a table on the right
- * | +-------+ ## ## # # |
- * | ## ## # # |
- * | ######### ## ## # # |
- * | ######### ## ## # # |
- * | ######### ## ## # # |
- * | |
- * +---------------------------+
- *
- *
- * Step 1: Segmentation.
- *
- * First, we segment the page, trying to break it down into a
- * series of non-overlapping rectangles. We do this (in stext-boxer.c)
- * by looking for where the content isn't. If we can identify breaks
- * that run through the page (either from top to bottom or from left
- * to right), then we can split the page there, and recursively consider
- * the two halves of the page.
- *
- * It's not a perfect algorithm, but it manages to in many cases.
- *
- * After segmenting the above example, first we'll find the horizontal
- * splits, giving:
- *
- * +---------------------------+
- * | |
- * | #### ## ### ## |
- * +---------------------------+
- * | ##### ##### #### ## |
- * | ## ###### ###### ## |
- * | #### ####### ###### |
- * | ####### #### ## ### |
- * | ### ##### ###### |
- * +---------------------------+
- * | ######### ######### |
- * | ######### ######### |
- * | ######## ######### |
- * | ######### |
- * | +-------+ ####### |
- * | | | |
- * | | | ## ## # # |
- * | +-------+ ## ## # # |
- * | ## ## # # |
- * | ######### ## ## # # |
- * | ######### ## ## # # |
- * | ######### ## ## # # |
- * | |
- * +---------------------------+
- *
- * Then we'll recurse and find the vertical split between
- * the columns:
- *
- * +---------------------------+
- * | |
- * | #### ## ### ## |
- * +---------------------------+
- * | ##### ##### #### ## |
- * | ## ###### ###### ## |
- * | #### ####### ###### |
- * | ####### #### ## ### |
- * | ### ##### ###### |
- * +-------------+-------------+
- * | ######### | ######### |
- * | ######### | ######### |
- * | ######## | ######### |
- * | | ######### |
- * | +-------+ | ####### |
- * | | | | |
- * | | | | ## ## # # |
- * | +-------+ | ## ## # # |
- * | | ## ## # # |
- * | ######### | ## ## # # |
- * | ######### | ## ## # # |
- * | ######### | ## ## # # |
- * | | |
- * +-------------+-------------+
- *
- * Then we recurse again and find the horizontal splits
- * within the columns:
- *
- * +---------------------------+
- * | |
- * | #### ## ### ## |
- * +---------------------------+
- * | ##### ##### #### ## |
- * | ## ###### ###### ## |
- * | #### ####### ###### |
- * | ####### #### ## ### |
- * | ### ##### ###### |
- * +-------------+-------------+
- * | ######### | ######### |
- * | ######### | ######### |
- * | ######## | ######### |
- * +-------------+ ######### |
- * | +-------+ | ####### |
- * | | | +-------------+
- * | | | | ## ## # # |
- * | +-------+ | ## ## # # |
- * +-------------+ ## ## # # |
- * | ######### | ## ## # # |
- * | ######### | ## ## # # |
- * | ######### | ## ## # # |
- * | | |
- * +-------------+-------------+
- *
- * We recurse a fixed maximum number of times (currently
- * 6, IIRC) or until we fail to find any suitable splits.
- *
- * This completes the page segmentation step.
- *
- * Step 2: Grid finding
- *
- * Next, we look at each of those segments and try to identify
- * where grids might be.
- *
- * Imagine the bottom right section of that page as
- * a board with lego blocks on where there is text.
- * Now imagine viewing that from the bottom of the page.
- * The gaps between the columns of the table are where you
- * can see through to the top between the blocks.
- *
- * Similarly, if you view it from the side, the gaps between the
- * rows of the page are where you can see through to the other
- * side.
- *
- * So, how do we code that? Well, we run through the page content
- * (obviously, restricted to the content that falls into this
- * segment of the page - that'll go without saying from here on
- * in). For each bit of content, we look at the "x extent" of that
- * content - for instance a given string might start at position
- * 10 and continue to position 100. We build a list of all these
- * start, and stop positions, and keep them in a sorted list.
- *
- * Then we walk this list from left to right, keeping a sum. I
- * call this sum "wind", because it's very similar to the winding
- * number that you get when doing scan conversion of bezier shapes.
- *
- * wind starts out as 0. We increment it whenever we pass a 'start'
- * position, and decrement it whenever we pass a 'stop' position.
- * So at any given x position along the line wind tells us the
- * number of pieces of content that overlap that x position.
- * So wind(left) = 0 = wind(right), and wind(x) >= x for all x.
- *
- * So, if we walk from left to right, the trace of wind might
- * look something like:
- *
- * __
- * ___ / \ _ __
- * / \ / \/ \ _/ \_
- * / \___/ \___/ \
- *
- * The left and right edges of the table are pretty clear.
- * The regions where wind drops to 0 represent the column dividers.
- * The left and right hand side of those regions gives us the min
- * and max values for that divider.
- *
- * We can then repeat this process for Y ranges instead of X ranges
- * to get the row dividers.
- *
- * BUT, this only works for pure grid tables. It falls down for
- * cases where we have merged cells (which is very common due to
- * titles etc).
- *
- * We can modify the algorithm slightly to allow for this.
- *
- * Consider the following table:
- *
- * +-----------------------------------+
- * | Long Table title across the top |
- * +---------------+---------+---------+
- * | Name | Result1 | Result2 |
- * +---------------+----+----+----+----+
- * | Homer Simpson | 1 | 23 | 4 | 56 |
- * | Barney Gumble | 1 | 23 | 4 | 56 |
- * | Moe | 1 | 23 | 4 | 56 |
- * | Apu | 1 | 23 | 4 | 56 |
- * | Ned Flanders | 1 | 23 | 4 | 56 |
- * +---------------+----+----+----+----+
- *
- * The wind trace for that looks something like
- * (with a certain degree of artistic license for the
- * limitations of ascii art):
- *
- * ________
- * / \ _ __ _ _
- * / \____/ \_/ \___/ \_/ \
- * / \
- *
- * So, the trace never quite drops back to zero in the
- * middle due to the spanning of the top title.
- *
- * So, instead of just looking for points where the trace
- * drops to zero, we instead look for local minima. Each local
- * minima represents a place where there might be a grid divider.
- * The value of wind at such points can be considered the
- * "uncertainty" with which there might be a divider there.
- * Clear dividers (with a wind value of 0) have no uncertainty.
- * Places where cells are spanned have a higher value of uncertainty.
- *
- * The output from this step is the list of possible grid positions
- * (X and Y), with uncertainty values.
- *
- * We have skipped over the handling of spaces in the above
- * description. On the one hand, we'd quite like to treat
- * sentences as long unbroken regions of content. But sometimes
- * we can get awkward content like:
- *
- * P subtelomeric 24.38 8.15 11.31 0.94 1.46
- * 11.08 6.93 15.34 0.00 0.73
- * Pericentromeric region; C band 20.42 9.26 13.64 0.81 0.81
- * 16.50 7.03 14.45 0.17 5.15
- * where the content is frequently sent with spaces instead of
- * small gaps (particular in tables of digits, because numerals
- * are often the same width even in proportional fonts).
- *
- * To cope with this, as we send potential edges into the start/stop
- * positions, we send 'weak' edges for the start and stops of runs
- * of spaces. We then post process the edges to remove any local
- * minima regions in the 'wind' values that are bounded purely by
- * 'weak' edges.
- *
- * Step 3: Cell analysis
- *
- * So, armed with the output from step 2, we can examine each grid
- * found. If we have W x-dividers and H y-dividers, we know we have
- * a potential table with (W-1) x (H-1) cells in it.
- *
- * We represent this as a W x H grid of cells, each like:
- *
- * . .
- * . .
- * . . .+-------+. . . Each cell holds information about the
- * | . edges above, and to the left of it.
- * | .
- * | .
- * . . .+. . . .+. . .
- * . .
- * . .
- *
- * h_line: Is there a horizontal divider drawn on the page that
- * corresponds to the top of this cell (i.e. is there a cell border
- * here?)
- * v_line: Is there a vertical divider drawn on the page that
- * corresponds to the left of this cell (i.e. is there a cell border
- * here?)
- * h_crossed: Does content cross this line (i.e. are we merged
- * with the cell above?)
- * v_crossed: Does content cross this line (i.e. are we merged
- * with the cell to the left?)
- * full: Is there any content in this cell at all?
- *
- * We need a W x H grid of cells to represent the entire table due
- * to the potential right and bottom edge lines. The right and
- * bottom rows of cells should never be full, or be crossed, but
- * it's easiest just to use a simple representation that copes with
- * the h_line and v_line values naturally.
- *
- * So, we start with the cells structure empty, and we run through
- * the page content, filling in the details as we go.
- *
- * At the end of the process, we have enough information to draw
- * an asciiart representation of our table. It might look something
- * like this (this comes from dotted-gridlines-tables.pdf):
- *
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | | | | | |#|#|#|#| | | |
- * + + + + + + +v+ +v+v+ + + +
- * | | | | | |#|#|#|#| | | |
- * + + + + + + + +v+ + + + + +
- * | |#| | |#|#|#|#|#|#|#|#|
- * + +v+ + + +v+v+ +v+v+v+v+v+
- * | |#| |#|#|#|#|#|#|#|#|#|
- * + + + + +v+ + +v+ + + + + +
- * |#|#| #|#|#|#|#|#|#|#|#|#|
- * +v+v+ +v+ +v+v+ +v+v+v+v+v+
- * |#|#| #|#|#|#|#|#|#|#|#|#|
- * + + + + +v+ + +v+ + + + + +
- * | |#| |#|#|#|#|#|#|#|#|#|
- * + +v+ + + +v+v+ +v+v+v+v+v+
- * | |#| | |#|#|#|#|#|#|#|#|
- * + + + + + + + +v+ + + + + +
- * | | | | | |#|#|#|#| | | |
- * + + + + + + +v+ +v+v+ + + +
- * | | | | | |#|#|#|#| | | |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | |# |#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |# |#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |#>#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |#>#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |#>#|#| |#| | | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |# |#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |# |#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |# |#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |# |#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- * |# |#|#|#|#|#|#|#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+-+
- *
- * This shows where lines are detected ( - and | ),
- * where they are crossed ( > and v) and where cells
- * are full ( # ).
- *
- * Step 4: Row and column merging.
- *
- * Based on the information above, we then try to merge
- * cells and columns to simplify the table.
- *
- * The best rules I've come up with this so far are:
- * We can merge two adjacent columns if all the pairs of
- * cells in the two columns are mergeable.
- *
- * Cells are held to be mergeable or not based upon the following
- * rules:
- * If there is a line between 2 cells - not mergeable.
- * else if the uncertainty between 2 cells is 0 - not mergeable.
- * else if the line between the 2 cells is crossed - mergeable.
- * else if strictly one of the cells is full - mergeable.
- * else not mergeable.
- *
- * So in the above example, column 2 (numbered from 0) can be merged
- * with column 3.
- *
- * This gives:
- *
- * +-+-+-+-+-+-+-+-+-+-+-+-+
- * | | | | | |#|#|#|#| | | |
- * + + + + + +v+ +v+v+ + + +
- * | | | | | |#|#|#|#| | | |
- * + + + + + + +v+ + + + + +
- * | |#| | |#|#|#|#|#|#|#|#|
- * + +v+ + +v+v+ +v+v+v+v+v+
- * | |#| |#|#|#|#|#|#|#|#|#|
- * + + + +v+ + +v+ + + + + +
- * |#|#|#|#|#|#|#|#|#|#|#|#|
- * +v+v+v+ +v+v+ +v+v+v+v+v+
- * |#|#|#|#|#|#|#|#|#|#|#|#|
- * + + + +v+ + +v+ + + + + +
- * | |#| |#|#|#|#|#|#|#|#|#|
- * + +v+ + +v+v+ +v+v+v+v+v+
- * | |#| | |#|#|#|#|#|#|#|#|
- * + + + + + + +v+ + + + + +
- * | | | | | |#|#|#|#| | | |
- * + + + + + +v+ +v+v+ + + +
- * | | | | | |#|#|#|#| | | |
- * +-+-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| |#| | | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * | |#|#| | |#| | |#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- * |# |#|#|#|#|#|#|#|#|#|
- * + +-+-+-+-+-+-+-+-+-+-+-+
- *
- * We then perform the same merging process for rows as for
- * columns - though there are no rows in the above example
- * that can be merged.
- *
- * You'll note that, for example, we don't merge row 0 and
- * row 1 in the above, because we have a pair of cells that
- * are both full without crossing.
- *
- * Step 5: Cell spanning
- *
- * Now we actually start to output the table. We keep a 'sent_table'
- * (a grid of W x H bools) to keep track of whether we've output
- * the content for a given cell or not yet.
- *
- * For each cell we reach, assuming sent_table[x,y] is false,
- * we merge it with as many cells on the right as required,
- * according to 'v_crossed' values (subject to not passing
- * v_lines or uncertainty == 0's).
- *
- * We then try to merge cells below according to 'h_crossed'
- * values (subject to not passing h_lines or uncertainty == 0's).
- *
- * In theory this can leave us with some cases where we'd like
- * to merge some cells (because of crossed) and can't (because
- * of lines or sent_table[]) values. In the absence of better
- * cell spanning algorithms we have no choice here.
- *
- * Then we output the contents and set sent_table[] values as
- * appropriate.
- *
- * If a row has no cells in it, we currently omit the TR. If/when
- * we figure out how to indicate rowspan/colspan in stext, we can
- * revisit that.
- */
- static fz_stext_block *
- add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block)
- {
- fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block));
- memset(block, 0, sizeof(*block));
- block->type = FZ_STEXT_BLOCK_GRID;
- block->bbox = fz_empty_rect; /* Fixes bug 703267. */
- block->next = *first_block;
- if (*first_block)
- {
- (*first_block)->prev = block;
- assert(*last_block);
- }
- else
- {
- assert(*last_block == NULL);
- *last_block = block;
- }
- *first_block = block;
- return block;
- }
- static void
- insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest)
- {
- if (before)
- {
- /* We have a block to insert it before, so we know it's not the last. */
- assert(dest ? (dest->first_block != NULL && dest->last_block != NULL) : (page->first_block != NULL && page->last_block != NULL));
- block->next = before;
- block->prev = before->prev;
- if (before->prev)
- {
- assert(before->prev->next == before);
- before->prev->next = block;
- }
- else if (dest)
- {
- assert(dest->first_block == before);
- dest->first_block = block;
- }
- else
- {
- assert(page->first_block == before);
- page->first_block = block;
- }
- before->prev = block;
- }
- else if (dest)
- {
- /* Will be the last block. */
- block->next = NULL;
- block->prev = dest->last_block;
- if (dest->last_block)
- {
- assert(dest->last_block->next == NULL);
- dest->last_block->next = block;
- }
- if (dest->first_block == NULL)
- dest->first_block = block;
- dest->last_block = block;
- }
- else
- {
- /* Will be the last block. */
- block->next = NULL;
- block->prev = page->last_block;
- if (page->last_block)
- {
- assert(page->last_block->next == NULL);
- page->last_block->next = block;
- }
- if (page->first_block == NULL)
- page->first_block = block;
- page->last_block = block;
- }
- }
- static fz_stext_struct *
- add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw)
- {
- fz_stext_block *block;
- int idx = 0;
- size_t z;
- fz_stext_struct *newstruct;
- if (raw == NULL)
- raw = "";
- z = strlen(raw);
- /* We're going to insert a struct block. We need an idx, so walk the list */
- for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- assert(block->u.s.index >= idx);
- idx = block->u.s.index + 1;
- }
- }
- /* So we'll add our block as idx. But all the other struct blocks that follow us need to have
- * larger values. */
- /* Update all the subsequent structs to have a higher idx */
- if (before)
- {
- int idx2 = idx+1;
- for (block = before->next; block != NULL; block = block->next)
- {
- if (block->type != FZ_STEXT_BLOCK_STRUCT)
- continue;
- if (block->u.s.index > idx2)
- break;
- block->u.s.index = idx2++;
- }
- }
- /* Now make our new struct block and insert it. */
- block = fz_pool_alloc(ctx, page->pool, sizeof(*block));
- block->type = FZ_STEXT_BLOCK_STRUCT;
- block->bbox = fz_empty_rect; /* Fixes bug 703267. */
- insert_block_before(block, before, page, parent);
- block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z);
- block->u.s.index = idx;
- newstruct->parent = parent;
- newstruct->standard = std;
- memcpy(newstruct->raw, raw, z);
- newstruct->raw[z] = 0;
- newstruct->up = block;
- return newstruct;
- }
- typedef struct
- {
- int len;
- int max;
- struct {
- uint8_t left;
- uint8_t weak;
- float pos;
- int freq;
- } *list;
- } div_list;
- static void
- div_list_push(fz_context *ctx, div_list *div, int left, int weak, float pos)
- {
- int i;
- /* FIXME: Could be bsearch. */
- for (i = 0; i < div->len; i++)
- {
- if (div->list[i].pos > pos)
- break;
- else if (div->list[i].pos == pos && div->list[i].left == left)
- {
- div->list[i].freq++;
- div->list[i].weak &= weak;
- return;
- }
- }
- if (div->len == div->max)
- {
- int newmax = div->max * 2;
- if (newmax == 0)
- newmax = 32;
- div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax);
- div->max = newmax;
- }
- if (i < div->len)
- memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i));
- div->len++;
- div->list[i].left = left;
- div->list[i].weak = weak;
- div->list[i].pos = pos;
- div->list[i].freq = 1;
- }
- static fz_stext_grid_positions *
- make_table_positions(fz_context *ctx, div_list *xs, float min, float max)
- {
- int wind;
- fz_stext_grid_positions *pos;
- int len = xs->len;
- int i;
- int hi = 0;
- /* Count the number of edges */
- int local_min = 0;
- int edges = 2;
- if (len == 0)
- return NULL;
- assert(xs->list[0].left);
- for (i = 0; i < len; i++)
- {
- if (xs->list[i].pos >= min)
- break;
- }
- for (; i < len; i++)
- {
- if (xs->list[i].pos >= max)
- break;
- if (xs->list[i].left)
- {
- if (local_min)
- edges++;
- }
- else
- local_min = 1;
- }
- assert(!xs->list[len-1].left);
- pos = fz_malloc_flexible(ctx, fz_stext_grid_positions, list, edges);
- pos->len = edges;
- /* Copy the edges in */
- wind = 0;
- local_min = 0;
- edges = 1;
- pos->list[0].pos = min;
- pos->list[0].min = min;
- pos->list[0].max = fz_max(xs->list[0].pos, min);
- pos->list[0].uncertainty = 0;
- pos->list[0].reinforcement = 0;
- #ifdef DEBUG_TABLE_HUNT
- printf("|%g ", pos->list[0].pos);
- #endif
- /* Skip over entries to the left of min. */
- for (i = 0; i < len; i++)
- {
- if (xs->list[i].pos >= min)
- break;
- if (xs->list[i].left)
- wind += xs->list[i].freq;
- else
- wind -= xs->list[i].freq;
- }
- for (; i < len; i++)
- {
- if (xs->list[i].pos >= max)
- break;
- if (xs->list[i].left)
- {
- if (local_min)
- {
- pos->list[edges].min = xs->list[i-1].pos;
- pos->list[edges].max = xs->list[i].pos;
- pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2;
- pos->list[edges].uncertainty = wind;
- #ifdef DEBUG_TABLE_HUNT
- if (wind)
- printf("?%g(%d) ", pos->list[edges].pos, wind);
- else
- printf("|%g ", pos->list[edges].pos);
- #endif
- edges++;
- }
- wind += xs->list[i].freq;
- if (wind > hi)
- hi = wind;
- }
- else
- {
- wind -= xs->list[i].freq;
- local_min = 1;
- }
- }
- assert(i < len || wind == 0);
- pos->list[edges].pos = max;
- pos->list[edges].min = fz_min(xs->list[i-1].pos, max);
- pos->list[edges].max = max;
- assert(max >= xs->list[i-1].pos);
- pos->list[edges].uncertainty = 0;
- pos->list[edges].reinforcement = 0;
- pos->max_uncertainty = hi;
- #ifdef DEBUG_TABLE_HUNT
- printf("|%g\n", pos->list[edges].pos);
- #endif
- return pos;
- }
- static fz_stext_grid_positions *
- copy_grid_positions_to_pool(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs)
- {
- size_t z = offsetof(fz_stext_grid_positions, list) + sizeof(xs->list[0]) * (xs->len);
- fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z);
- memcpy(xs2, xs, z);
- return xs2;
- }
- static void
- sanitize_positions(fz_context *ctx, div_list *xs)
- {
- int i, j, wind, changed;
- #ifdef DEBUG_TABLE_HUNT
- printf("OK:\n");
- for (i = 0; i < xs->len; i++)
- {
- if (xs->list[i].left)
- printf("[");
- printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
- if (!xs->list[i].left)
- printf("]");
- printf(" ");
- }
- printf("\n");
- #endif
- if (xs->len == 0)
- return;
- do
- {
- /* Now, combine runs of left and right */
- for (i = 0; i < xs->len; i++)
- {
- if (xs->list[i].left)
- {
- j = i;
- while (i < xs->len-1 && xs->list[i+1].left)
- {
- i++;
- xs->list[j].freq += xs->list[i].freq;
- xs->list[j].weak &= xs->list[i].weak;
- xs->list[i].freq = 0;
- }
- }
- else
- {
- while (i < xs->len-1 && !xs->list[i+1].left)
- {
- i++;
- xs->list[i].freq += xs->list[i-1].freq;
- xs->list[i].weak &= xs->list[i-1].weak;
- xs->list[i-1].freq = 0;
- }
- }
- }
- #ifdef DEBUG_TABLE_HUNT
- printf("Shrunk:\n");
- for (i = 0; i < xs->len; i++)
- {
- if (xs->list[i].left)
- printf("[");
- printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
- if (!xs->list[i].left)
- printf("]");
- printf(" ");
- }
- printf("\n");
- #endif
- /* Now remove the 0 frequency ones. */
- j = 0;
- for (i = 0; i < xs->len; i++)
- {
- if (xs->list[i].freq == 0)
- continue;
- if (i != j)
- xs->list[j] = xs->list[i];
- j++;
- }
- xs->len = j;
- /* Now run across looking for local minima where at least one
- * edge is 'weak'. If the wind at that point is non-zero, then
- * remove the weak edges from consideration and retry. */
- wind = 0;
- changed = 0;
- i = 0;
- while (1)
- {
- assert(xs->list[i].left);
- for (; xs->list[i].left; i++)
- {
- wind += xs->list[i].freq;
- }
- assert(i < xs->len);
- for (; xs->list[i].left == 0 && i < xs->len; i++)
- {
- wind -= xs->list[i].freq;
- }
- if (i == xs->len)
- break;
- if (wind != 0 && (xs->list[i-1].weak || xs->list[i].weak))
- {
- int m = fz_mini(xs->list[i-1].freq, xs->list[i].freq);
- assert(m > 0);
- xs->list[i-1].freq -= m;
- xs->list[i].freq -= m;
- changed = 1;
- }
- }
- }
- while (changed);
- #ifdef DEBUG_TABLE_HUNT
- printf("Compacted:\n");
- for (i = 0; i < xs->len; i++)
- {
- if (xs->list[i].left)
- printf("[");
- printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
- if (!xs->list[i].left)
- printf("]");
- printf(" ");
- }
- printf("\n");
- #endif
- }
- /* We want to check for whether a DIV that we are about to descend into
- * contains a column of justified text. We will accept some headers in
- * this text, but not JUST headers. */
- static int
- all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block)
- {
- int just_headers = 1;
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down == NULL)
- continue;
- if (block->u.s.down->standard == FZ_STRUCTURE_H ||
- block->u.s.down->standard == FZ_STRUCTURE_H1 ||
- block->u.s.down->standard == FZ_STRUCTURE_H2 ||
- block->u.s.down->standard == FZ_STRUCTURE_H3 ||
- block->u.s.down->standard == FZ_STRUCTURE_H4 ||
- block->u.s.down->standard == FZ_STRUCTURE_H5 ||
- block->u.s.down->standard == FZ_STRUCTURE_H6)
- continue;
- if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block))
- return 0;
- }
- just_headers = 0;
- if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL)
- return 0;
- }
- if (just_headers)
- return 0;
- return 1;
- }
- #define TWO_INCHES (72*2)
- /* Walk through the blocks, finding the bbox. */
- static fz_rect
- walk_to_find_bounds(fz_context *ctx, fz_stext_block *first_block)
- {
- fz_rect bounds = fz_empty_rect;
- fz_stext_block *block;
- fz_stext_line *line;
- fz_stext_char *ch;
- for (block = first_block; block != NULL; block = block->next)
- {
- switch (block->type)
- {
- case FZ_STEXT_BLOCK_STRUCT:
- if (!block->u.s.down)
- continue;
- if (block->u.s.down->standard == FZ_STRUCTURE_H)
- {
- if (block->next != NULL &&
- block->next->type == FZ_STEXT_BLOCK_TEXT &&
- block->next->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
- continue;
- }
- bounds = fz_union_rect(bounds, walk_to_find_bounds(ctx, block->u.s.down->first_block));
- break;
- case FZ_STEXT_BLOCK_VECTOR:
- bounds = fz_union_rect(bounds, block->bbox);
- break;
- case FZ_STEXT_BLOCK_TEXT:
- if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL && block->bbox.x1 - block->bbox.x0 >= TWO_INCHES)
- continue;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- {
- if (ch->c != ' ')
- bounds = fz_union_rect(bounds, fz_rect_from_quad(ch->quad));
- }
- }
- break;
- }
- }
- return bounds;
- }
- static void
- walk_to_find_content(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, fz_rect bounds)
- {
- fz_stext_block *block;
- fz_stext_line *line;
- fz_stext_char *ch;
- for (block = first_block; block != NULL; block = block->next)
- {
- switch (block->type)
- {
- case FZ_STEXT_BLOCK_STRUCT:
- if (block->u.s.down && !fz_is_empty_rect(fz_intersect_rect(bounds, block->bbox)))
- walk_to_find_content(ctx, xs, ys, block->u.s.down->first_block, bounds);
- break;
- case FZ_STEXT_BLOCK_VECTOR:
- break;
- case FZ_STEXT_BLOCK_TEXT:
- {
- fz_rect justified_region = fz_empty_rect;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- fz_rect region = fz_empty_rect;
- region.y0 = line->bbox.y0;
- region.y1 = line->bbox.y1;
- if (region.y0 < bounds.y0)
- region.y0 = bounds.y0;
- if (region.y1 > bounds.y1)
- region.y1 = bounds.y1;
- if (region.y0 >= region.y1)
- break;
- /* Skip leading spaces. */
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- if (ch->c != ' ')
- break;
- for (; ch != NULL; ch = ch->next)
- {
- if (ch->c == ' ')
- {
- /* Find the last space char in this run. */
- fz_stext_char *last_space;
- for (last_space = ch; last_space->next != NULL && last_space->next->c == ' '; last_space = last_space->next);
- /* If we're not the last char in the line (i.e. we're not a trailing space,
- * then send a 'weak' gap for the spaces, assuming it's sane to do so). */
- if (last_space->next != NULL)
- {
- float rpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
- float lpos = fz_min(last_space->next->quad.ll.x, last_space->next->quad.ll.x);
- /* Clamp these to the bounds */
- rpos = fz_clamp(rpos, bounds.x0, bounds.x1);
- lpos = fz_clamp(lpos, bounds.x0, bounds.x1);
- /* So we have a region (rpos...lpos) to add. */
- /* This can be positioned in various different ways relative to the
- * current region:
- * [region]
- * (rpos..lpos) OK
- * (rpos..lpos) OK, but adjust lpos
- * (rpos..lpos) OK, but adjust rpos
- * (rpos..lpos) OK
- * (rpos .. lpos) OK
- */
- if (lpos >= region.x1)
- {
- if (rpos >= region.x0 && rpos < region.x1)
- rpos = region.x1;
- }
- else if (rpos <= region.x0)
- {
- if (lpos > region.x0)
- lpos = region.x0;
- }
- else
- rpos = lpos; /* Make it an invalid region */
- if (rpos < lpos)
- {
- /* Send a weak right at the start of the spaces... */
- div_list_push(ctx, xs, 0, 1, rpos);
- /* and a weak left at the end. */
- div_list_push(ctx, xs, 1, 1, lpos);
- }
- /* Expand the region as required. */
- if (rpos < region.x0)
- region.x0 = rpos;
- if (lpos > region.x1)
- region.x1 = lpos;
- }
- /* Jump over the spaces */
- ch = last_space;
- }
- else
- {
- float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
- float rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x);
- if (lpos < region.x0)
- region.x0 = lpos;
- if (rpos > region.x1)
- region.x1 = rpos;
- }
- }
- if (!fz_is_empty_rect(region))
- {
- div_list_push(ctx, xs, 1, 0, region.x0);
- div_list_push(ctx, xs, 0, 0, region.x1);
- /* For justified regions, we don't break after each line, but
- * rather before/after the region as a whole. */
- if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL)
- {
- div_list_push(ctx, ys, 1, 0, region.y0);
- div_list_push(ctx, ys, 0, 0, region.y1);
- }
- else
- justified_region = fz_union_rect(justified_region, region);
- }
- }
- if (!fz_is_empty_rect(justified_region) && block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
- {
- div_list_push(ctx, ys, 1, 0, justified_region.y0);
- div_list_push(ctx, ys, 0, 0, justified_region.y1);
- }
- break;
- }
- }
- }
- }
- /* One of our datastructures (cells_t) is an array of details about the
- * cells that make up our table. It's a w * h array of cell_t's. Each
- * cell contains data on one of the cells in the table, as you'd expect.
- *
- * . .
- * . .
- * - - +-------+ - -
- * | .
- * | .
- * | .
- * - - + - - - + - -
- * . .
- * . .
- *
- * For any given cell, we store details about the top (lowest y coord)
- * and left (lowest x coord) edges. Specifically we store whether
- * there is a line at this position (h_line and v_line) (i.e. a drawn
- * border), and we also store whether content crosses this edge (h_crossed
- * and y_crossed). Finally, we store whether the cell has any content
- * in it at all (full).
- *
- * A table which has w positions across and h positions vertically, will
- * only really have (w-1) * (h-1) cells. We store w*h though to allow for
- * the right and bottom edges to have their lines represented.
- */
- typedef struct
- {
- int h_line;
- int v_line;
- int h_crossed;
- int v_crossed;
- int full;
- } cell_t;
- typedef struct
- {
- int w;
- int h;
- cell_t cell[FZ_FLEXIBLE_ARRAY];
- } cells_t;
- typedef struct
- {
- cells_t *cells;
- fz_stext_grid_positions *xpos;
- fz_stext_grid_positions *ypos;
- fz_rect bounds;
- } grid_walker_data;
- static cell_t *
- get_cell(cells_t *cells, int x, int y)
- {
- return &cells->cell[x + y * cells->w];
- }
- #ifdef DEBUG_TABLE_STRUCTURE
- static void
- asciiart_table(grid_walker_data *gd);
- #endif
- static fz_stext_grid_positions *
- split_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, int i, int early)
- {
- fz_stext_grid_positions **posp = row ? &gd->ypos : &gd->xpos;
- fz_stext_grid_positions *pos = *posp;
- int n = pos->len;
- int x, y, w, h;
- cells_t *cells;
- /* Realloc the required structs */
- *posp = pos = fz_realloc_flexible(ctx, pos, fz_stext_grid_positions, list, n+1);
- cells = gd->cells = fz_realloc_flexible(ctx, gd->cells, cells_t, cell, (gd->cells->w + (1-row)) * (gd->cells->h + row));
- /* If both pass, then we're safe to shuffle the data. */
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Before split %s %d\n", row ? "row" : "col", i);
- asciiart_table(gd);
- #endif
- assert(i >= 0 && i < n);
- memmove(&pos->list[i+1], &pos->list[i], sizeof(pos->list[0]) * (n-i));
- pos->len++;
- /* Next expand the cells. Only h_line and v_line are filled in so far. */
- w = cells->w;
- h = cells->h;
- if (early && i > 0)
- i--, early = 0;
- if (row)
- {
- /* Add a row */
- cells->h = h+1;
- /* Expand the table, duplicating row i */
- memmove(&cells->cell[(i+1)*w], &cells->cell[i*w], sizeof(cells->cell[0])*(h-i)*w);
- if (early)
- {
- /* We are splitting row 0 into 0 and 1, with 0 being the new one. */
- for (x = 0; x < w; x++)
- {
- cells->cell[x].h_line = 0;
- cells->cell[x].v_line = 0;
- }
- }
- else
- {
- /* We are splitting row i into i and i+1, with i+1 being the new one. */
- /* v_lines are carried over. h_lines need to be unset. */
- for (x = 0; x < w; x++)
- cells->cell[x + (i+1)*w].h_line = 0;
- }
- }
- else
- {
- /* Add a column */
- cells->w = w+1;
- /* Expand the table, duplicating column i */
- for (y = h-1; y >= 0; y--)
- {
- for (x = w; x > i; x--)
- cells->cell[x + y*(w+1)] = cells->cell[x-1 + y*w];
- for (; x >= 0; x--)
- cells->cell[x + y*(w+1)] = cells->cell[x + y*w];
- }
- if (early)
- {
- /* We are splitting col 0 into 0 and 1, with 0 being the new one. */
- for (y = 0; y < h; y++)
- {
- cells->cell[y*(w+1)].h_line = 0;
- cells->cell[y*(w+1)].v_line = 0;
- }
- }
- else
- {
- /* h_lines are carried over. v_lines need to be reset */
- for (y = 0; y < h; y++)
- cells->cell[i+1 + y*(w+1)].v_line = 0;
- }
- }
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("After split\n");
- asciiart_table(gd);
- #endif
- return pos;
- }
- /* This routine finds (and reinforces) grid positions for lines.
- *
- * If we have a thin line from (x0, y0) to (x1, y0), then we are
- * pretty sure that y0 will be on the edge of a cell. We are less
- * sure that x0 and x1 will match up to the edge of a cell.
- * Stylistically some tables overrun or underrun such lines.
- *
- * Similarly from (x0, y0) to (x0, y1), we expect x0 to be accurate
- * but y0 and y1 less so.
- *
- * If we have a wider rectangle, from (x0, y0) to (x1, y1) then
- * we fully expect all sides to be accurate.
- */
- static int
- find_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, float x, int inaccurate)
- {
- const int WIGGLE_ROOM = 2;
- int i;
- fz_stext_grid_positions *pos = row ? gd->ypos : gd->xpos;
- assert(x >= pos->list[0].min && x <= pos->list[pos->len-1].max);
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Looking for %g in %s splits:\n", x, row ? "row" : "col");
- for (i = 0; i < pos->len; i++)
- {
- printf("%d\t%g\t%g\t%g\t%d\n", i, pos->list[i].min, pos->list[i].pos, pos->list[i].max, pos->list[i].reinforcement);
- }
- #endif
- while (inaccurate) /* So we can break out */
- {
- float prev = 0;
- /* If we start/finish outside the range of the table, then we
- * want to extend the table. So ignore 'inaccurate' in this
- * case. Match the logic below. */
- if (x < pos->list[0].min)
- break;
- if (x < pos->list[0].pos - WIGGLE_ROOM && pos->list[0].reinforcement > 0)
- break;
- if (x > pos->list[pos->len-1].max)
- break;
- if (x > pos->list[pos->len-1].pos + WIGGLE_ROOM && pos->list[pos->len-1].reinforcement > 0)
- break;
- /* Just find the closest one. No reinforcement. */
- for (i = 0; i < pos->len; i++)
- {
- if (x < pos->list[i].min)
- {
- float mid = (prev + pos->list[i].min)/2;
- if (x < mid)
- return i-1;
- return i;
- }
- prev = pos->list[i].max;
- if (x <= prev)
- return i;
- }
- assert("Never happens" == NULL);
- return -1;
- }
- for (i = 0; i < pos->len; i++)
- {
- if (x < pos->list[i].min)
- {
- /* Split i into i and i+1, and make i the new one. */
- assert(i > 0);
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Splitting before %d\n", i);
- #endif
- pos = split_grid_pos(ctx, gd, row, i, 1);
- pos->list[i-1].max = pos->list[i].min = (pos->list[i-1].max + x)/2;
- pos->list[i].pos = x;
- pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2;
- pos->list[i].reinforcement = 1;
- return i;
- }
- else if (x <= pos->list[i].max)
- {
- /* We are in the range for the ith divider. */
- if (pos->list[i].reinforcement == 0)
- {
- /* If we've not been reinforced before, reinforce now. */
- pos->list[i].pos = x;
- pos->list[i].reinforcement = 1;
- return i;
- }
- /* We've been reinforced before. This ought to be a pretty good
- * indication. */
- if (pos->list[i].pos - WIGGLE_ROOM < x && x < pos->list[i].pos + WIGGLE_ROOM)
- {
- /* We are a close match to the previously predicted pos
- * value. */
- pos->list[i].pos = pos->list[i].pos * pos->list[i].reinforcement + x;
- pos->list[i].pos /= ++pos->list[i].reinforcement;
- return i;
- }
- /* We need to split i into i and i+1. */
- pos = split_grid_pos(ctx, gd, row, i, pos->list[i].pos > x);
- if (pos->list[i].pos > x)
- {
- /* Make i the new one */
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Splitting %d (early)\n", i);
- #endif
- pos->list[i].pos = x;
- pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2;
- pos->list[i].reinforcement = 1;
- return i;
- }
- else
- {
- /* Make i+1 the new one */
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Splitting %d (late)\n", i);
- #endif
- pos->list[i+1].pos = x;
- pos->list[i].max = pos->list[i+1].min = (pos->list[i].pos + x)/2;
- pos->list[i].reinforcement = 1;
- return i+1;
- }
- }
- }
- assert("Never happens" == NULL);
- return -1;
- }
- static int
- find_cell_l(fz_stext_grid_positions *pos, float x)
- {
- int i;
- for (i = 0; i < pos->len; i++)
- if (x < pos->list[i].pos)
- return i-1;
- return -1;
- }
- static int
- find_cell_r(fz_stext_grid_positions *pos, float x)
- {
- int i;
- for (i = 0; i < pos->len; i++)
- if (x <= pos->list[i].pos)
- return i-1;
- return -1;
- }
- /* Add a horizontal line. Return 1 if the line doesn't seem to be a border line.
- * Record which cells that was a border for. */
- static void
- add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1)
- {
- int start = find_grid_pos(ctx, gd, 0, x0, 1);
- int end = find_grid_pos(ctx, gd, 0, x1, 1);
- float y = (y0 + y1) / 2;
- int yidx = find_grid_pos(ctx, gd, 1, y, 0);
- int i;
- assert(start > 0 && end > 0);
- for (i = start; i < end; i++)
- get_cell(gd->cells, i, yidx)->h_line++;
- }
- /* Add a vertical line. Return 1 if the line doesn't seem to be a border line.
- * Record which cells that was a border for. */
- static void
- add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1)
- {
- int start = find_grid_pos(ctx, gd, 1, y0, 1);
- int end = find_grid_pos(ctx, gd, 1, y1, 1);
- float x = (x0 + x1) / 2;
- int xidx = find_grid_pos(ctx, gd, 0, x, 0);
- int i;
- for (i = start; i < end; i++)
- get_cell(gd->cells, xidx, i)->v_line++;
- }
- static void
- add_hv_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1, int stroked)
- {
- int ix0 = find_grid_pos(ctx, gd, 0, x0, 0);
- int ix1 = find_grid_pos(ctx, gd, 0, x1, 0);
- int iy0 = find_grid_pos(ctx, gd, 1, y0, 0);
- int iy1 = find_grid_pos(ctx, gd, 1, y1, 0);
- int i;
- if (stroked)
- {
- for (i = ix0; i < ix1; i++)
- {
- get_cell(gd->cells, i, iy0)->h_line++;
- get_cell(gd->cells, i, iy1)->h_line++;
- }
- for (i = iy0; i < iy1; i++)
- {
- get_cell(gd->cells, ix0, i)->v_line++;
- get_cell(gd->cells, ix1, i)->v_line++;
- }
- }
- }
- /* Shared internal routine with stext-boxer.c */
- fz_rect fz_collate_small_vector_run(fz_stext_block **blockp);
- static void
- walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down)
- walk_grid_lines(ctx, gd, block->u.s.down->first_block);
- continue;
- }
- else if (block->type == FZ_STEXT_BLOCK_VECTOR)
- {
- fz_rect r;
- float w, h;
- /* Only process rectangle blocks. */
- if ((block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0)
- continue;
- r = fz_collate_small_vector_run(&block);
- r = fz_intersect_rect(r, gd->bounds);
- if (!fz_is_valid_rect(r))
- continue;
- w = r.x1 - r.x0;
- h = r.y1 - r.y0;
- if (w > h && h < 2)
- {
- /* Thin, wide line */
- (void) add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1);
- }
- else if (w < h && w < 2)
- {
- /* Thin, wide line */
- (void) add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1);
- }
- else
- {
- /* Rectangle */
- (void) add_hv_line(ctx, gd, r.x0, r.x1, r.y0, r.y1, block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED);
- }
- }
- }
- }
- static int
- is_numeric(int c)
- {
- return (c >= '0' && c <= '9');
- }
- static int
- mark_cells_for_content(fz_context *ctx, grid_walker_data *gd, fz_rect s)
- {
- fz_rect r = fz_intersect_rect(gd->bounds, s);
- int x0, x1, y0, y1, x, y;
- if (fz_is_empty_rect(r))
- return 0;
- x0 = find_cell_l(gd->xpos, r.x0);
- x1 = find_cell_r(gd->xpos, r.x1);
- y0 = find_cell_l(gd->ypos, r.y0);
- y1 = find_cell_r(gd->ypos, r.y1);
- if (x0 < 0 || x1 < 0 || y0 < 0 || y1 < 0)
- return 1;
- if (x0 < x1)
- {
- for (y = y0; y <= y1; y++)
- for (x = x0; x < x1; x++)
- get_cell(gd->cells, x+1, y)->v_crossed++;
- }
- if (y0 < y1)
- {
- for (y = y0; y < y1; y++)
- for (x = x0; x <= x1; x++)
- get_cell(gd->cells, x, y+1)->h_crossed++;
- }
- for (y = y0; y <= y1; y++)
- for (x = x0; x <= x1; x++)
- get_cell(gd->cells, x, y)->full++;
- return 0;
- }
- #define IN_CELL 0
- #define IN_BORDER 1
- #define IN_UNKNOWN 2
- #define SPLIT_MARGIN 4
- static int
- where_is(fz_stext_grid_positions *pos, float x, int *in)
- {
- int i;
- *in = IN_UNKNOWN;
- /* If the table is empty, nothing to do. */
- if (pos->len == 0)
- return -1;
- /* If we are completely outside the table, give up. */
- if (x <= pos->list[0].pos - SPLIT_MARGIN || x >= pos->list[pos->len-1].max + SPLIT_MARGIN)
- return -1;
- for (i = 0; i < pos->len; i++)
- {
- /* Calculate the region (below..above) that counts as being
- * on the border of position i. */
- float prev = i > 0 ? pos->list[i-1].max : pos->list[0].min;
- float next = i < pos->len-1 ? pos->list[i+1].min : pos->list[i].max;
- float below = pos->list[i].pos - SPLIT_MARGIN;
- float above = pos->list[i].pos + SPLIT_MARGIN;
- /* Find the distance half way back to the previous pos as
- * a limit to our margin. */
- prev = (prev + pos->list[i].pos)/2;
- next = (next + pos->list[i].pos)/2;
- if (below < prev)
- below = prev;
- if (above > next)
- above = next;
- if (x < below)
- {
- *in = IN_CELL;
- return i-1;
- }
- else if (x <= above)
- {
- *in = IN_BORDER;
- return i;
- }
- }
- *in = IN_BORDER;
- return i-1;
- }
- enum
- {
- VECTOR_IS_CONTENT = 0,
- VECTOR_IS_BORDER = 1,
- VECTOR_IS_UNKNOWN = 2,
- VECTOR_IS_IGNORABLE = 3
- };
- /* So a vector can either be a border, or contained
- * in some cells, or something completely else. */
- static int
- classify_vector(fz_context *ctx, grid_walker_data *gd, fz_rect r, int is_rect)
- {
- int at_x0, at_x1, at_y0, at_y1;
- int ix0, ix1, iy0, iy1;
- r = fz_intersect_rect(r, gd->bounds);
- if (fz_is_empty_rect(r))
- return VECTOR_IS_IGNORABLE;
- ix0 = where_is(gd->xpos, r.x0, &at_x0);
- ix1 = where_is(gd->xpos, r.x1, &at_x1);
- iy0 = where_is(gd->ypos, r.y0, &at_y0);
- iy1 = where_is(gd->ypos, r.y1, &at_y1);
- /* No idea, just treat it as a border. */
- if (at_x0 == IN_UNKNOWN || at_x1 == IN_UNKNOWN || at_y0 == IN_UNKNOWN || at_y1 == IN_UNKNOWN)
- return VECTOR_IS_IGNORABLE;
- if (at_x0 == IN_BORDER && at_x1 == IN_BORDER)
- {
- /* Vector is aligned along sides of cells. */
- return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
- }
- if (at_y0 == IN_BORDER && at_y1 == IN_BORDER)
- {
- /* Vector is aligned along sides of cells. */
- return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
- }
- if (at_x0 == IN_CELL && at_x1 == IN_CELL)
- {
- /* Content within a cell (or 1d range of cells). */
- return VECTOR_IS_CONTENT;
- }
- if (at_y0 == IN_CELL && at_y1 == IN_CELL)
- {
- /* Content within a cell (or 1d range of cells). */
- return VECTOR_IS_CONTENT;
- }
- if (at_x0 == IN_BORDER && at_x1 == IN_CELL && ix0 == ix1)
- {
- return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
- }
- if (at_x0 == IN_CELL && at_x1 == IN_BORDER && ix0+1 == ix1)
- {
- return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
- }
- if (at_y0 == IN_BORDER && at_y1 == IN_CELL && iy0 == iy1)
- {
- return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
- }
- if (at_y0 == IN_CELL && at_y1 == IN_BORDER && iy0+1 == iy1)
- {
- return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
- }
- if (is_rect)
- {
- return VECTOR_IS_IGNORABLE;
- }
- /* unknown - take this as indication that this maybe isn't
- * table. */
- return VECTOR_IS_UNKNOWN;
- }
- #undef IN_CELL
- #undef IN_BORDER
- #undef IN_UNKNOWN
- /* Walk through the content, looking at how it spans our grid.
- * Record gridlines, which cells have content that cross into
- * neighbours, and which cells have any content at all.
- * Return a count of vector graphics that are found that don't
- * look plausibly like cell contents. */
- static int
- calculate_spanned_content(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
- {
- int duff = 0;
- fz_rect bounds = {
- gd->xpos->list[0].pos,
- gd->ypos->list[0].pos,
- gd->xpos->list[gd->xpos->len-1].pos,
- gd->ypos->list[gd->ypos->len-1].pos };
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down)
- duff += calculate_spanned_content(ctx, gd, block->u.s.down->first_block);
- continue;
- }
- else if (block->type == FZ_STEXT_BLOCK_VECTOR)
- {
- switch (classify_vector(ctx, gd, block->bbox, !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE)))
- {
- case VECTOR_IS_CONTENT:
- mark_cells_for_content(ctx, gd, block->bbox);
- break;
- case VECTOR_IS_BORDER:
- case VECTOR_IS_IGNORABLE:
- break;
- default:
- duff++;
- break;
- }
- }
- else if (block->type == FZ_STEXT_BLOCK_TEXT)
- {
- fz_stext_line *line;
- if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 ||
- block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0)
- continue;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- fz_stext_char *ch = line->first_char;
- int was_numeric = 0;
- /* Skip leading spaces */
- while (ch != NULL && ch->c == ' ')
- ch = ch->next;
- for (; ch != NULL; ch = ch->next)
- {
- if (ch->c == 32)
- {
- /* Trailing space, skip it. */
- if (ch->next == NULL)
- break;
- if (ch->next->c == 32)
- {
- /* Run of spaces. Skip 'em. */
- while (ch->next && ch->next->c == 32)
- ch = ch->next;
- was_numeric = 0;
- continue;
- }
- if (was_numeric || is_numeric(ch->next->c))
- {
- /* Single spaces around numbers are ignored. */
- was_numeric = 0;
- continue;
- }
- /* A single space. Accept it. */
- was_numeric = 0;
- }
- else
- was_numeric = is_numeric(ch->c);
- duff += mark_cells_for_content(ctx, gd, fz_rect_from_quad(ch->quad));
- }
- }
- }
- }
- return duff;
- }
- static cells_t *new_cells(fz_context *ctx, int w, int h)
- {
- cells_t *cells = fz_malloc_flexible(ctx, cells_t, cell, w * h);
- cells->w = w;
- cells->h = h;
- return cells;
- }
- #ifdef DEBUG_TABLE_STRUCTURE
- static void
- asciiart_table(grid_walker_data *gd)
- {
- int w = gd->xpos->len;
- int h = gd->ypos->len;
- int x, y;
- for (y = 0; y < h; y++)
- {
- for (x = 0; x < w-1; x++)
- {
- cell_t *cell = get_cell(gd->cells, x, y);
- int line = cell->h_line;
- int erase = cell->h_crossed;
- printf("+");
- if (line && !erase)
- {
- printf("-");
- }
- else if (!line && erase)
- {
- printf("v");
- }
- else if (line && erase)
- {
- printf("*");
- }
- else
- {
- printf(" ");
- }
- }
- printf("+\n");
- if (y == h-1)
- break;
- for (x = 0; x < w; x++)
- {
- cell_t *cell = get_cell(gd->cells, x, y);
- int line = cell->v_line;
- int erase = cell->v_crossed;
- if (line && !erase)
- {
- printf("|");
- }
- else if (!line && erase)
- {
- printf(">");
- }
- else if (line && erase)
- {
- printf("*");
- }
- else
- {
- printf(" ");
- }
- if (x < w-1)
- {
- if (cell->full)
- printf("#");
- else
- printf(" ");
- }
- else
- printf("\n");
- }
- }
- }
- #endif
- static void
- recalc_bbox(fz_stext_block *block)
- {
- fz_rect bbox = fz_empty_rect;
- fz_stext_line *line;
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- bbox = fz_union_rect(bbox, line->bbox);
- block->bbox = bbox;
- }
- static void
- unlink_line_from_block(fz_stext_line *line, fz_stext_block *block)
- {
- fz_stext_line *next_line = line->next;
- if (line->prev)
- {
- assert(line->prev->next == line);
- line->prev->next = next_line;
- }
- else
- {
- assert(block->u.t.first_line == line);
- block->u.t.first_line = next_line;
- }
- if (next_line)
- {
- assert(next_line->prev == line);
- next_line->prev = line->prev;
- }
- else
- {
- assert(block->u.t.last_line == line);
- block->u.t.last_line = line->prev;
- }
- }
- static void
- append_line_to_block(fz_stext_line *line, fz_stext_block *block)
- {
- if (block->u.t.last_line == NULL)
- {
- assert(block->u.t.first_line == NULL);
- block->u.t.first_line = block->u.t.last_line = line;
- line->prev = NULL;
- }
- else
- {
- assert(block->u.t.last_line->next == NULL);
- line->prev = block->u.t.last_line;
- block->u.t.last_line->next = line;
- block->u.t.last_line = line;
- }
- line->next = NULL;
- }
- static void
- unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last)
- {
- if (block->prev)
- {
- assert(block->prev->next == block);
- block->prev->next = block->next;
- }
- else
- {
- assert(*first == block);
- *first = block->next;
- }
- if (block->next)
- {
- assert(block->next->prev == block);
- block->next->prev = block->prev;
- }
- else
- {
- assert(*last == block);
- *last = block->prev;
- }
- }
- #ifndef NDEBUG
- static int
- verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src)
- {
- fz_stext_block *block;
- fz_stext_block **first = src ? &src->first_block : &page->first_block;
- fz_stext_block **last = src ? &src->last_block : &page->last_block;
- int max = 0;
- assert((*first == NULL) == (*last == NULL));
- for (block = *first; block != NULL; block = block->next)
- {
- fz_stext_line *line;
- if (block->prev == NULL)
- assert(*first == block);
- else
- assert(block->prev->next == block);
- if (block->next == NULL)
- assert(*last == block);
- else
- assert(block->next->prev == block);
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down)
- {
- int m = verify_stext(ctx, page, block->u.s.down);
- if (m > max)
- max = m;
- }
- continue;
- }
- if (block->type != FZ_STEXT_BLOCK_TEXT)
- continue;
- assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL));
- for (line = block->u.t.first_line; line != NULL; line = line->next)
- {
- fz_stext_char *ch;
- if (line->next == NULL)
- assert(block->u.t.last_line == line);
- else
- assert(line->next->prev == line);
- assert((line->first_char == NULL) == (line->last_char == NULL));
- for (ch = line->first_char; ch != NULL; ch = ch->next)
- assert(ch->next != NULL || line->last_char == ch);
- }
- }
- return max+1;
- }
- #endif
- static fz_rect
- move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r)
- {
- fz_stext_block *before = dest ? dest->first_block : page->first_block;
- fz_stext_block **sfirst = src ? &src->first_block : &page->first_block;
- fz_stext_block **slast = src ? &src->last_block : &page->last_block;
- fz_stext_block *block, *next_block;
- for (block = *sfirst; block != NULL; block = next_block)
- {
- fz_rect bbox = fz_intersect_rect(block->bbox, r);
- next_block = block->next;
- /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
- if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1)
- continue; /* Trivially excluded */
- if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1)
- {
- /* Trivially included */
- if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
- {
- if (block->u.s.down->standard == FZ_STRUCTURE_TD)
- {
- /* Don't copy TDs! Just copy the contents */
- move_contained_content(ctx, page, dest, block->u.s.down, r);
- continue;
- }
- }
- unlink_block(block, sfirst, slast);
- insert_block_before(block, before, page, dest);
- assert(before == block->next);
- continue;
- }
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down)
- move_contained_content(ctx, page, dest, block->u.s.down, r);
- }
- if (block->type == FZ_STEXT_BLOCK_TEXT)
- {
- /* Partially included text block */
- fz_stext_line *line, *next_line;
- fz_stext_block *newblock = NULL;
- for (line = block->u.t.first_line; line != NULL; line = next_line)
- {
- fz_rect lrect = fz_intersect_rect(line->bbox, r);
- next_line = line->next;
- /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
- if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1)
- continue; /* Trivial exclusion */
- if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1)
- {
- /* Trivial inclusion */
- if (newblock == NULL)
- {
- newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
- insert_block_before(newblock, before, page, dest);
- newblock->u.t.flags = block->u.t.flags;
- assert(before == newblock->next);
- }
- unlink_line_from_block(line, block);
- append_line_to_block(line, newblock);
- }
- else
- {
- /* Need to walk the line and just take parts */
- fz_stext_line *newline = NULL;
- fz_stext_char *ch, *next_ch, *prev_ch = NULL;
- for (ch = line->first_char; ch != NULL; ch = next_ch)
- {
- fz_rect crect = fz_rect_from_quad(ch->quad);
- float x = (crect.x0 + crect.x1)/2;
- float y = (crect.y0 + crect.y1)/2;
- next_ch = ch->next;
- if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y)
- {
- prev_ch = ch;
- continue;
- }
- /* Take this char */
- if (newline == NULL)
- {
- newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline));
- newline->dir = line->dir;
- newline->wmode = line->wmode;
- newline->bbox = fz_empty_rect;
- }
- /* Unlink char */
- if (prev_ch == NULL)
- line->first_char = next_ch;
- else
- prev_ch->next = next_ch;
- if (next_ch == NULL)
- line->last_char = prev_ch;
- /* Relink char */
- ch->next = NULL;
- if (newline->last_char == NULL)
- newline->first_char = ch;
- else
- newline->last_char->next = ch;
- newline->last_char = ch;
- newline->bbox = fz_union_rect(newline->bbox, crect);
- }
- if (line->first_char == NULL)
- {
- /* We've removed all the chars from this line.
- * Better remove the line too. */
- if (line->prev)
- line->prev->next = next_line;
- else
- block->u.t.first_line = next_line;
- if (next_line)
- next_line->prev = line->prev;
- else
- block->u.t.last_line = line->prev;
- }
- if (newline)
- {
- if (newblock == NULL)
- {
- newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
- /* Add the block onto our target list */
- insert_block_before(newblock, before, page, dest);
- newblock->u.t.flags = block->u.t.flags;
- before = newblock->next;
- }
- append_line_to_block(newline, newblock);
- }
- }
- }
- if (newblock)
- {
- recalc_bbox(block);
- recalc_bbox(newblock);
- }
- if (block->u.t.first_line == NULL)
- {
- /* We've removed all the lines from the block. Should remove that too! */
- if (block->prev)
- {
- assert(block->prev->next == block);
- block->prev->next = next_block;
- }
- else
- {
- assert(*sfirst == block);
- *sfirst = block->next;
- }
- if (next_block)
- {
- assert(next_block->prev == block);
- next_block->prev = block->prev;
- }
- else
- {
- assert(*slast == block);
- *slast = block->prev;
- }
- }
- }
- }
- return r;
- }
- typedef struct
- {
- fz_stext_block *block;
- fz_stext_struct *parent;
- } tree_pos;
- /* This is still not perfect, but it's better! */
- static tree_pos
- find_table_insertion_point(fz_context *ctx, fz_rect r, tree_pos current, tree_pos best)
- {
- fz_stext_block *block;
- for (block = current.block; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down != NULL && fz_is_rect_inside_rect(r, block->bbox))
- {
- tree_pos down;
- down.parent = block->u.s.down;
- down.block = block->u.s.down->first_block;
- best = find_table_insertion_point(ctx, r, down, best);
- }
- }
- else
- {
- /* Is block a better precursor than best? (Or a valid precursor, if best.block == NULL) */
- if (block->bbox.y1 < r.y0 && (best.block == NULL || best.block->bbox.y1 < block->bbox.y1))
- {
- best.block = block;
- best.parent = current.parent;
- }
- }
- }
- return best;
- }
- static int
- tr_is_empty(fz_context *ctx, fz_stext_block *block)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type != FZ_STEXT_BLOCK_STRUCT)
- return 0;
- if (!block->u.s.down)
- {
- }
- else if (block->u.s.down->standard != FZ_STRUCTURE_TD)
- return 0;
- else if (block->u.s.down->first_block != NULL)
- return 0;
- }
- return 1;
- }
- static int
- table_is_empty(fz_context *ctx, fz_stext_block *block)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_GRID)
- continue;
- if (block->type != FZ_STEXT_BLOCK_STRUCT)
- return 0;
- if (!block->u.s.down)
- {
- }
- else if (block->u.s.down->standard != FZ_STRUCTURE_TR)
- return 0;
- else if (!tr_is_empty(ctx, block->u.s.down->first_block))
- return 0;
- }
- return 1;
- }
- static void
- tidy_td_divs(fz_context *ctx, fz_stext_struct *parent)
- {
- while (1)
- {
- fz_stext_block *block = parent->first_block;
- if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV)
- return;
- parent->first_block = block->u.s.down->first_block;
- parent->last_block = block->u.s.down->last_block;
- }
- }
- static void
- tidy_tr_divs(fz_context *ctx, fz_stext_block *block)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD)
- tidy_td_divs(ctx, block->u.s.down);
- }
- }
- static void
- tidy_table_divs(fz_context *ctx, fz_stext_block *block)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_GRID)
- continue;
- if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR)
- tidy_tr_divs(ctx, block->u.s.down->first_block);
- }
- }
- static int
- struct_is_empty(fz_context *ctx, fz_stext_block *block)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type != FZ_STEXT_BLOCK_STRUCT)
- return 0;
- if (!block->u.s.down)
- {
- }
- else if (!struct_is_empty(ctx, block->u.s.down->first_block))
- return 0;
- }
- return 1;
- }
- static int
- div_is_empty(fz_context *ctx, fz_stext_block *block)
- {
- for (; block != NULL; block = block->next)
- {
- if (block->type != FZ_STEXT_BLOCK_STRUCT)
- return 0;
- if (!block->u.s.down)
- {
- }
- else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
- {
- tidy_table_divs(ctx, block->u.s.down->first_block);
- return table_is_empty(ctx, block->u.s.down->first_block);
- }
- else if (block->u.s.down->standard != FZ_STRUCTURE_DIV)
- {
- if (!struct_is_empty(ctx, block->u.s.down->first_block))
- return 0;
- }
- else if (!div_is_empty(ctx, block->u.s.down->first_block))
- return 0;
- }
- return 1;
- }
- static void
- tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
- {
- fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block;
- fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block;
- fz_stext_block *block, *next_block;
- for (block = *first_blockp; block != NULL; block = next_block)
- {
- next_block = block->next;
- if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
- {
- if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
- {
- tidy_table_divs(ctx, block->u.s.down->first_block);
- if (table_is_empty(ctx, block->u.s.down->first_block))
- {
- /* Remove block */
- if (block->prev)
- {
- assert(block->prev->next == block);
- block->prev->next = next_block;
- }
- else
- {
- assert(*first_blockp == block);
- *first_blockp = next_block;
- }
- if (next_block)
- {
- assert(next_block->prev == block);
- next_block->prev = block->prev;
- }
- else
- {
- assert(*last_blockp == block);
- *last_blockp = block->prev;
- }
- }
- else
- {
- tidy_orphaned_tables(ctx, page, block->u.s.down);
- }
- }
- if (block->u.s.down->standard == FZ_STRUCTURE_DIV)
- {
- tidy_orphaned_tables(ctx, page, block->u.s.down);
- if (div_is_empty(ctx, block->u.s.down->first_block))
- {
- /* Remove block */
- if (block->prev)
- {
- assert(block->prev->next == block);
- block->prev->next = next_block;
- }
- else
- {
- assert(*first_blockp == block);
- *first_blockp = next_block;
- }
- if (next_block)
- {
- assert(next_block->prev == block);
- next_block->prev = block->prev;
- }
- else
- {
- assert(*last_blockp == block);
- *last_blockp = block->prev;
- }
- }
- }
- }
- }
- }
- static fz_stext_struct *
- transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent)
- {
- int w = gd->xpos->len;
- int h = gd->ypos->len;
- int x, y;
- char *sent_tab = fz_calloc(ctx, 1, w*(size_t)h);
- fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
- fz_stext_struct *table, *tr, *td;
- fz_rect r;
- tree_pos insert, top;
- /* Where should we insert the table in the data? */
- r.x0 = gd->xpos->list[0].pos;
- r.x1 = gd->xpos->list[w-1].pos;
- r.y0 = gd->ypos->list[0].pos;
- r.y1 = gd->ypos->list[h-1].pos;
- #ifdef DEBUG_TABLE_STRUCTURE
- fz_print_stext_page_as_xml(ctx, fz_stddbg(ctx), page, 0);
- #endif
- top.block = *first_block;
- top.parent = parent;
- insert.block = NULL;
- insert.parent = NULL;
- insert = find_table_insertion_point(ctx, r, top, insert);
- /* Convert to 'before' */
- if (insert.block)
- insert.block = insert.block->next;
- /* Make table */
- table = add_struct_block_before(ctx, insert.block, page, insert.parent, FZ_STRUCTURE_TABLE, "Table");
- /* Run through the cells, and guess at spanning. */
- for (y = 0; y < h-1; y++)
- {
- /* Have we sent this entire row before? */
- for (x = 0; x < w-1; x++)
- {
- if (!sent_tab[x+y*w])
- break;
- }
- if (x == w-1)
- continue; /* No point in sending a row with nothing in it! */
- /* Make TR */
- tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR");
- for (x = 0; x < w-1; x++)
- {
- int x2, y2;
- int cellw = 1;
- int cellh = 1;
- /* Have we sent this cell already? */
- if (sent_tab[x+y*w])
- continue;
- /* Find the width of the cell */
- for (x2 = x+1; x2 < w-1; x2++)
- {
- cell_t *cell = get_cell(gd->cells, x2, y);
- if (cell->v_line)
- break; /* Can't go past a line */
- if (gd->xpos->list[x2].uncertainty == 0)
- break; /* An uncertainty of 0 is as good as a line. */
- if (!cell->v_crossed)
- break;
- cellw++;
- }
- /* Find the height of the cell */
- for (y2 = y+1; y2 < h-1; y2++)
- {
- cell_t *cell;
- int h_crossed = 0;
- if (gd->ypos->list[y2].uncertainty == 0)
- break; /* An uncertainty of 0 is as good as a line. */
- cell = get_cell(gd->cells, x, y2);
- if (cell->h_line)
- break; /* Can't extend down through a line. */
- if (cell->h_crossed)
- h_crossed = 1;
- for (x2 = x+1; x2 < x+cellw; x2++)
- {
- cell = get_cell(gd->cells, x2, y2);
- if (cell->h_line)
- break;
- if (cell->v_line)
- break; /* Can't go past a line */
- if (gd->xpos->list[x2].uncertainty == 0)
- break; /* An uncertainty of 0 is as good as a line. */
- if (!cell->v_crossed)
- break;
- if (cell->h_crossed)
- h_crossed = 1;
- }
- if (x2 == x+cellw && h_crossed)
- cellh++;
- else
- break;
- }
- /* Make TD */
- td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD");
- r.x0 = gd->xpos->list[x].pos;
- r.x1 = gd->xpos->list[x+cellw].pos;
- r.y0 = gd->ypos->list[y].pos;
- r.y1 = gd->ypos->list[y+cellh].pos;
- /* Use r, not REAL contents bbox, as otherwise spanned rows
- * can end up empty. */
- td->up->bbox = r;
- move_contained_content(ctx, page, td, parent, r);
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh);
- #endif
- for (y2 = y; y2 < y+cellh; y2++)
- for (x2 = x; x2 < x+cellw; x2++)
- sent_tab[x2+y2*w] = 1;
- }
- r.x0 = gd->xpos->list[0].pos;
- r.x1 = gd->xpos->list[gd->xpos->len-1].pos;
- r.y0 = gd->ypos->list[y].pos;
- r.y1 = gd->ypos->list[y+1].pos;
- tr->up->bbox = r;
- table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox);
- }
- fz_free(ctx, sent_tab);
- {
- fz_stext_block *block;
- fz_stext_grid_positions *xps2 = copy_grid_positions_to_pool(ctx, page, gd->xpos);
- fz_stext_grid_positions *yps2 = copy_grid_positions_to_pool(ctx, page, gd->ypos);
- block = add_grid_block(ctx, page, &table->first_block, &table->last_block);
- block->u.b.xs = xps2;
- block->u.b.ys = yps2;
- block->bbox.x0 = block->u.b.xs->list[0].pos;
- block->bbox.y0 = block->u.b.ys->list[0].pos;
- block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos;
- block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos;
- }
- tidy_orphaned_tables(ctx, page, parent);
- return table;
- }
- static void
- merge_column(grid_walker_data *gd, int x)
- {
- int y;
- for (y = 0; y < gd->cells->h; y++)
- {
- cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)];
- cell_t *s = &gd->cells->cell[x + y * gd->cells->w];
- if (x > 0)
- memcpy(d-x, s-x, sizeof(*d) * x);
- d->full = s[0].full || s[1].full;
- d->h_crossed = s[0].h_crossed || s[1].h_crossed;
- d->h_line = s[0].h_line; /* == s[1].h_line */
- d->v_crossed = s[0].v_crossed;
- d->v_line = s[0].v_line;
- if (x < gd->cells->w - 2)
- memcpy(d+1, s+2, sizeof(*d) * (gd->cells->w - 2 - x));
- }
- gd->cells->w--;
- if (x < gd->xpos->len - 2)
- memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], sizeof(gd->xpos->list[0]) * (gd->xpos->len - 2 - x));
- gd->xpos->len--;
- }
- static void
- merge_columns(grid_walker_data *gd)
- {
- int x, y;
- for (x = gd->cells->w-3; x >= 0; x--)
- {
- /* Can column x be merged with column x+1? */
- /* An empty column can certainly be merged if the h_lines are the same,
- * and there is no v_line. */
- for (y = 0; y < gd->cells->h-1; y++)
- {
- cell_t *a = get_cell(gd->cells, x, y);
- cell_t *b = get_cell(gd->cells, x+1, y);
- if (a->full || !!a->h_line != !!b->h_line || b->v_line)
- break;
- }
- if (y == gd->cells->h-1)
- goto merge_column;
- /* An empty column can certainly be merged if the h_lines are the same. */
- for (y = 0; y < gd->cells->h-1; y++)
- {
- cell_t *a = get_cell(gd->cells, x, y);
- cell_t *b = get_cell(gd->cells, x+1, y);
- if (b->full || !!a->h_line != !!b->h_line || b->v_line)
- break;
- }
- if (y == gd->cells->h-1)
- goto merge_column;
- /* We only ever want to merge columns if content crossed between them somewhere.
- * Don't use uncertainty for this, because uncertainty doesn't allow for
- * whitespace. */
- for (y = 0; y < gd->cells->h-1; y++)
- if (get_cell(gd->cells, x+1, y)->v_crossed == 1)
- break;
- if (y == gd->cells->h-1)
- continue;
- /* This requires all the pairs of cells in those 2 columns to be mergeable. */
- for (y = 0; y < gd->cells->h-1; y++)
- {
- cell_t *a = get_cell(gd->cells, x, y);
- cell_t *b = get_cell(gd->cells, x+1, y);
- /* If there is a divider, we can't merge. */
- if (b->v_line)
- break;
- /* If either is empty, we can merge. */
- if (!a->full || !b->full)
- continue;
- /* If we differ in h linedness, we can't merge */
- if (!!a->h_line != !!b->h_line)
- break;
- /* If both are full, we can only merge if we cross. */
- if (a->full && b->full && b->v_crossed)
- continue;
- /* Otherwise we can't merge */
- break;
- }
- if (y == gd->cells->h-1)
- {
- /* Merge the column! */
- merge_column:
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Merging column %d\n", x);
- #endif
- merge_column(gd, x);
- #ifdef DEBUG_TABLE_STRUCTURE
- asciiart_table(gd);
- #endif
- }
- }
- }
- static void
- merge_row(grid_walker_data *gd, int y)
- {
- int x;
- int w = gd->cells->w;
- cell_t *d = &gd->cells->cell[y * w];
- for (x = 0; x < gd->cells->w-1; x++)
- {
- if (d->full == 0)
- d->full = d[w].full;
- if (d->v_crossed == 0)
- d->v_crossed = d[w].v_crossed;
- d++;
- }
- if (y < gd->cells->h - 2)
- memcpy(d, d+w, sizeof(*d) * (gd->cells->h - 2 - y) * w);
- gd->cells->h--;
- if (y < gd->ypos->len - 2)
- memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], sizeof(gd->ypos->list[0]) * (gd->ypos->len - 2 - y));
- gd->ypos->len--;
- }
- static void
- merge_rows(grid_walker_data *gd)
- {
- int x, y;
- for (y = gd->cells->h-3; y >= 0; y--)
- {
- /* Can row y be merged with row y+1? */
- /* An empty row can certainly be merged if the v_lines are the same,
- * and there is no h_line. */
- for (x = 0; x < gd->cells->w-1; x++)
- {
- cell_t *a = get_cell(gd->cells, x, y);
- cell_t *b = get_cell(gd->cells, x, y+1);
- if (a->full || !!a->v_line != !!b->v_line || b->h_line)
- break;
- }
- if (x == gd->cells->w-1)
- goto merge_row;
- /* An empty row can certainly be merged if the v_lines are the same. */
- for (x = 0; x < gd->cells->w-1; x++)
- {
- cell_t *a = get_cell(gd->cells, x, y);
- cell_t *b = get_cell(gd->cells, x, y+1);
- if (b->full || !!a->v_line != !!b->v_line || b->h_line)
- break;
- }
- if (x == gd->cells->w-1)
- goto merge_row;
- /* We only ever want to merge rows if content crossed between them somewhere.
- * Don't use uncertainty for this, because uncertainty doesn't allow for
- * whitespace. */
- for (x = 0; x < gd->cells->w-1; x++)
- if (get_cell(gd->cells, x, y+1)->h_crossed == 1)
- break;
- if (x == gd->cells->w-1)
- continue;
- /* This requires all the pairs of cells in those 2 rows to be mergeable. */
- for (x = 0; x < gd->cells->w-1; x++)
- {
- cell_t *a = get_cell(gd->cells, x, y);
- cell_t *b = get_cell(gd->cells, x, y+1);
- /* If there is a divider, we can't merge. */
- if (b->h_line)
- goto cant_merge;
- /* If either is empty, we can merge. */
- if (!a->full || !b->full)
- continue;
- /* If we differ in v linedness, we can't merge */
- if (!!a->v_line != !!b->v_line)
- goto cant_merge;
- /* If both are full, we can only merge if we cross. */
- if (a->full && b->full && b->h_crossed)
- continue;
- /* Otherwise we can't merge */
- break;
- }
- if (x == gd->cells->w-1)
- {
- /* Merge the row! */
- merge_row:
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Merging row %d\n", y);
- #endif
- merge_row(gd, y);
- #ifdef DEBUG_TABLE_STRUCTURE
- asciiart_table(gd);
- #endif
- continue;
- }
- /* OK, so we failed to be able to merge y and y+1. But if we get here, we
- * know this wasn't because of any lines being in the way. So we can try
- * a different set of criteria. If every non-empty cell in line y+1 is either
- * into from line y, or crosses into line y+2, then it's probably a 'straddling'
- * line. Just remove it. */
- if (y+2 >= gd->cells->h)
- continue;
- for (x = 0; x < gd->cells->w-1; x++)
- {
- cell_t *b = get_cell(gd->cells, x, y+1);
- cell_t *c = get_cell(gd->cells, x, y+2);
- if (!b->full)
- continue;
- if (!b->h_crossed && !c->h_crossed)
- break;
- }
- if (x == gd->cells->w-1)
- {
- /* Merge the row! */
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Merging row %d (case 2)\n", y);
- #endif
- merge_row(gd, y);
- #ifdef DEBUG_TABLE_STRUCTURE
- asciiart_table(gd);
- #endif
- }
- cant_merge:
- {}
- }
- }
- static int
- remove_bordered_empty_cells(grid_walker_data *gd)
- {
- int x, y, x1, y1, x2, y2;
- int changed = 0;
- /* We are looking for a region of cells that's bordered,
- * and empty, and where the borders don't extend...
- *
- * ' '
- * ' '
- * . . .'____'. . .
- * | |
- * | |
- * . . .|____|. . .
- * ' '
- * ' '
- * ' '
- */
- for (y = 0; y < gd->cells->h-1; y++)
- {
- for (x = 0; x < gd->cells->w-1; x++)
- {
- cell_t *u = y > 0 ? get_cell(gd->cells, x, y-1) : NULL;
- cell_t *l = x > 0 ? get_cell(gd->cells, x-1, y) : NULL;
- cell_t *c = get_cell(gd->cells, x, y);
- if (c->full)
- continue;
- if (!c->h_line || !c->v_line)
- continue;
- if (l != NULL && l->h_line)
- continue;
- if (u != NULL && u->v_line)
- continue;
- /* So c (x, y) is a reasonable top left of the bounded region. */
- for (x1 = x+1; x1 < gd->cells->w; x1++)
- {
- u = y > 0 ? get_cell(gd->cells, x1, y-1) : NULL;
- c = get_cell(gd->cells, x1, y);
- if (u != NULL && u->v_line)
- goto bad_region;
- if (c->h_line && !c->v_line)
- continue;
- if (c->h_line || !c->v_line)
- goto bad_region;
- break;
- }
- if (x1 == gd->cells->w)
- goto bad_region;
- /* If we get here, then we have the top row of a bounded region
- * that runs from (x,y) to (x1-1, y). So, can we extend that region
- * downwards? */
- for (y1 = y+1; y1 < gd->cells->h; y1++)
- {
- c = get_cell(gd->cells, x, y1);
- if (!c->h_line && c->v_line)
- {
- /* This could be the start of a line. */
- for (x2 = x+1; x2 < x1; x2++)
- {
- c = get_cell(gd->cells, x2, y1);
- if (c->full || c->h_line || c->v_line)
- goto bad_region;
- }
- c = get_cell(gd->cells, x1, y1);
- if (c->h_line || !c->v_line)
- goto bad_region;
- /* That line was fine! Region is extended. */
- }
- else if (c->h_line && !c->v_line)
- {
- /* This might be the bottom line of the bounded region. */
- for (x2 = x+1; x2 < x1; x2++)
- {
- c = get_cell(gd->cells, x2, y1);
- if (!c->h_line || c->v_line)
- goto bad_region;
- }
- c = get_cell(gd->cells, x1, y1);
- if (c->h_line || c->v_line)
- goto bad_region;
- break;
- }
- else
- goto bad_region;
- }
- if (y1 == gd->cells->h)
- goto bad_region;
- /* So we have a bounded region from x,y to x1-1,y1-1 */
- for (y2 = y; y2 < y1; y2++)
- {
- for (x2 = x; x2 < x1; x2++)
- {
- c = get_cell(gd->cells, x2, y2);
- c->h_line = 0;
- c->v_line = 0;
- c->full = 1;
- if (x2 > x)
- c->v_crossed = 1;
- if (y2 > y)
- c->h_crossed = 1;
- }
- c = get_cell(gd->cells, x2, y2);
- c->v_line = 0;
- }
- for (x2 = x; x2 < x1; x2++)
- get_cell(gd->cells, x2, y2)->h_line = 0;
- changed = 1;
- bad_region:
- {}
- }
- }
- #ifdef DEBUG_TABLE_STRUCTURE
- if (changed)
- {
- printf("Simplified empty bounded cells to give:\n");
- asciiart_table(gd);
- }
- #endif
- return changed;
- }
- static int
- find_table_within_bounds(fz_context *ctx, grid_walker_data *gd, fz_stext_block *content, fz_rect bounds)
- {
- div_list xs = { 0 };
- div_list ys = { 0 };
- int failed = 1;
- fz_try(ctx)
- {
- walk_to_find_content(ctx, &xs, &ys, content, bounds);
- sanitize_positions(ctx, &xs);
- sanitize_positions(ctx, &ys);
- /* Run across the line, counting 'winding' */
- /* If we don't have at least 2 rows and 2 columns, give up. */
- if (xs.len <= 2 || ys.len <= 2)
- break;
- gd->xpos = make_table_positions(ctx, &xs, bounds.x0, bounds.x1);
- gd->ypos = make_table_positions(ctx, &ys, bounds.y0, bounds.y1);
- gd->cells = new_cells(ctx, gd->xpos->len, gd->ypos->len);
- /* Walk the content looking for grid lines. These
- * lines refine our positions. */
- walk_grid_lines(ctx, gd, content);
- /* Now, we walk the content looking for content that crosses
- * these grid lines. This allows us to spot spanned cells. */
- if (calculate_spanned_content(ctx, gd, content))
- break; /* Unlikely to be a table. */
- #ifdef DEBUG_TABLE_STRUCTURE
- asciiart_table(gd);
- #endif
- /* Now, can we remove some columns or rows? i.e. have we oversegmented? */
- do
- {
- merge_columns(gd);
- merge_rows(gd);
- }
- while (remove_bordered_empty_cells(gd));
- /* Did we shrink the table so much it's not a table any more? */
- if (gd->xpos->len <= 2 || gd->ypos->len <= 2)
- break;
- failed = 0;
- }
- fz_always(ctx)
- {
- fz_free(ctx, xs.list);
- fz_free(ctx, ys.list);
- }
- fz_catch(ctx)
- {
- fz_rethrow(ctx);
- }
- return failed;
- }
- static int
- do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
- {
- fz_stext_block *block;
- int count;
- fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
- int num_subtables = 0;
- grid_walker_data gd = { 0 };
- /* No content? Just bale. */
- if (*first_block == NULL)
- return 0;
- /* If all the content here looks like a column of text, don't
- * hunt for a table within it. */
- if (all_blocks_are_justified_or_headers(ctx, *first_block))
- return num_subtables;
- gd.bounds = fz_infinite_rect;
- /* First off, descend into any children to see if those look like tables. */
- count = 0;
- for (block = *first_block; block != NULL; block = block->next)
- {
- if (block->type == FZ_STEXT_BLOCK_STRUCT)
- {
- if (block->u.s.down)
- {
- num_subtables += do_table_hunt(ctx, page, block->u.s.down);
- count++;
- }
- }
- else if (block->type == FZ_STEXT_BLOCK_TEXT)
- count++;
- }
- /* If we don't have at least a single child, no more to hunt. */
- /* We only need a single block, because a single text block can
- * contain an entire unbordered table. */
- if (count < 1)
- return num_subtables;
- /* We only look for a table at this level if we either at the top
- * or on a div. This saves us looking for tables within an 'H'
- * for example. */
- if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV)
- return num_subtables;
- fz_var(gd);
- fz_try(ctx)
- {
- /* Now see whether the content looks like tables. */
- fz_rect bounds = walk_to_find_bounds(ctx, *first_block);
- if (find_table_within_bounds(ctx, &gd, *first_block, bounds))
- break;
- if (num_subtables > 0)
- {
- /* We are risking throwing away a table we've already found for this
- * one. Only do it if this one is really convincing. */
- int x, y;
- for (x = 0; x < gd.xpos->len; x++)
- if (gd.xpos->list[x].uncertainty != 0)
- break;
- if (x != gd.xpos->len)
- break;
- for (y = 0; y < gd.ypos->len; y++)
- if (gd.ypos->list[y].uncertainty != 0)
- break;
- if (y != gd.ypos->len)
- break;
- }
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Transcribing table: (%g,%g)->(%g,%g)\n",
- gd.xpos->list[0].pos,
- gd.ypos->list[0].pos,
- gd.xpos->list[gd.xpos->len-1].pos,
- gd.ypos->list[gd.ypos->len-1].pos);
- #endif
- /* Now we should have the entire table calculated. */
- (void)transcribe_table(ctx, &gd, page, parent);
- num_subtables = 1;
- #ifdef DEBUG_WRITE_AS_PS
- {
- int i;
- printf("%% TABLE\n");
- for (i = 0; i < block->u.b.xs->len; i++)
- {
- if (block->u.b.xs->list[i].uncertainty)
- printf("0 1 0 setrgbcolor\n");
- else
- printf("0 0.5 0 setrgbcolor\n");
- printf("%g %g moveto %g %g lineto stroke\n",
- block->u.b.xs->list[i].pos, block->bbox.y0,
- block->u.b.xs->list[i].pos, block->bbox.y1);
- }
- for (i = 0; i < block->u.b.ys->len; i++)
- {
- if (block->u.b.ys->list[i].uncertainty)
- printf("0 1 0 setrgbcolor\n");
- else
- printf("0 0.5 0 setrgbcolor\n");
- printf("%g %g moveto %g %g lineto stroke\n",
- block->bbox.x0, block->u.b.ys->list[i].pos,
- block->bbox.x1, block->u.b.ys->list[i].pos);
- }
- }
- #endif
- }
- fz_always(ctx)
- {
- fz_free(ctx, gd.xpos);
- fz_free(ctx, gd.ypos);
- fz_free(ctx, gd.cells);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- return num_subtables;
- }
- void
- fz_table_hunt(fz_context *ctx, fz_stext_page *page)
- {
- if (page == NULL)
- return;
- assert(verify_stext(ctx, page, NULL));
- do_table_hunt(ctx, page, NULL);
- assert(verify_stext(ctx, page, NULL));
- }
- fz_stext_block *
- fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds)
- {
- fz_stext_struct *table = NULL;
- grid_walker_data gd = { 0 };
- /* No content? Just bale. */
- if (page == NULL || page->first_block == NULL)
- return NULL;
- fz_var(gd);
- fz_try(ctx)
- {
- gd.bounds = bounds;
- if (find_table_within_bounds(ctx, &gd, page->first_block, bounds))
- break;
- #ifdef DEBUG_TABLE_STRUCTURE
- printf("Transcribing table: (%g,%g)->(%g,%g)\n",
- gd.xpos->list[0].pos,
- gd.ypos->list[0].pos,
- gd.xpos->list[gd.xpos->len-1].pos,
- gd.ypos->list[gd.ypos->len-1].pos);
- #endif
- /* Now we should have the entire table calculated. */
- table = transcribe_table(ctx, &gd, page, NULL);
- #ifdef DEBUG_WRITE_AS_PS
- {
- int i;
- printf("%% TABLE\n");
- for (i = 0; i < block->u.b.xs->len; i++)
- {
- if (block->u.b.xs->list[i].uncertainty)
- printf("0 1 0 setrgbcolor\n");
- else
- printf("0 0.5 0 setrgbcolor\n");
- printf("%g %g moveto %g %g lineto stroke\n",
- block->u.b.xs->list[i].pos, block->bbox.y0,
- block->u.b.xs->list[i].pos, block->bbox.y1);
- }
- for (i = 0; i < block->u.b.ys->len; i++)
- {
- if (block->u.b.ys->list[i].uncertainty)
- printf("0 1 0 setrgbcolor\n");
- else
- printf("0 0.5 0 setrgbcolor\n");
- printf("%g %g moveto %g %g lineto stroke\n",
- block->bbox.x0, block->u.b.ys->list[i].pos,
- block->bbox.x1, block->u.b.ys->list[i].pos);
- }
- }
- #endif
- }
- fz_always(ctx)
- {
- fz_free(ctx, gd.xpos);
- fz_free(ctx, gd.ypos);
- fz_free(ctx, gd.cells);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- return table ? table->first_block : NULL;
- }
|