stext-table.c 82 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174
  1. // Copyright (C) 2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include <assert.h>
  24. /* #define DEBUG_WRITE_AS_PS */
  25. /* #define DEBUG_TABLE_STRUCTURE */
  26. /* #define DEBUG_TABLE_HUNT */
  27. /*
  28. * The algorithm.
  29. *
  30. * The goal of the algorithm is to identify tables on a page.
  31. * First we have to find the tables on a page, then we have to
  32. * figure out where the columns/rows are, and then how the
  33. * cells span them.
  34. *
  35. * We do this as a series of steps.
  36. *
  37. * To illustrate what's going on, let's use an example page
  38. * that we can follow through all the steps.
  39. *
  40. * +---------------------------+
  41. * | |
  42. * | #### ## ### ## | <- Title
  43. * | |
  44. * | ##### ##### #### ## | \
  45. * | ## ###### ###### ## | |
  46. * | #### ####### ###### | |- Abstract
  47. * | ####### #### ## ### | |
  48. * | ### ##### ###### | /
  49. * | |
  50. * | ######### ######### | 2 Columns of text
  51. * | ######### ######### |
  52. * | ######## ######### |
  53. * | ######### |
  54. * | +-------+ ####### | <- With an image on the left
  55. * | | | |
  56. * | | | ## ## # # | <- And a table on the right
  57. * | +-------+ ## ## # # |
  58. * | ## ## # # |
  59. * | ######### ## ## # # |
  60. * | ######### ## ## # # |
  61. * | ######### ## ## # # |
  62. * | |
  63. * +---------------------------+
  64. *
  65. *
  66. * Step 1: Segmentation.
  67. *
  68. * First, we segment the page, trying to break it down into a
  69. * series of non-overlapping rectangles. We do this (in stext-boxer.c)
  70. * by looking for where the content isn't. If we can identify breaks
  71. * that run through the page (either from top to bottom or from left
  72. * to right), then we can split the page there, and recursively consider
  73. * the two halves of the page.
  74. *
  75. * It's not a perfect algorithm, but it manages to in many cases.
  76. *
  77. * After segmenting the above example, first we'll find the horizontal
  78. * splits, giving:
  79. *
  80. * +---------------------------+
  81. * | |
  82. * | #### ## ### ## |
  83. * +---------------------------+
  84. * | ##### ##### #### ## |
  85. * | ## ###### ###### ## |
  86. * | #### ####### ###### |
  87. * | ####### #### ## ### |
  88. * | ### ##### ###### |
  89. * +---------------------------+
  90. * | ######### ######### |
  91. * | ######### ######### |
  92. * | ######## ######### |
  93. * | ######### |
  94. * | +-------+ ####### |
  95. * | | | |
  96. * | | | ## ## # # |
  97. * | +-------+ ## ## # # |
  98. * | ## ## # # |
  99. * | ######### ## ## # # |
  100. * | ######### ## ## # # |
  101. * | ######### ## ## # # |
  102. * | |
  103. * +---------------------------+
  104. *
  105. * Then we'll recurse and find the vertical split between
  106. * the columns:
  107. *
  108. * +---------------------------+
  109. * | |
  110. * | #### ## ### ## |
  111. * +---------------------------+
  112. * | ##### ##### #### ## |
  113. * | ## ###### ###### ## |
  114. * | #### ####### ###### |
  115. * | ####### #### ## ### |
  116. * | ### ##### ###### |
  117. * +-------------+-------------+
  118. * | ######### | ######### |
  119. * | ######### | ######### |
  120. * | ######## | ######### |
  121. * | | ######### |
  122. * | +-------+ | ####### |
  123. * | | | | |
  124. * | | | | ## ## # # |
  125. * | +-------+ | ## ## # # |
  126. * | | ## ## # # |
  127. * | ######### | ## ## # # |
  128. * | ######### | ## ## # # |
  129. * | ######### | ## ## # # |
  130. * | | |
  131. * +-------------+-------------+
  132. *
  133. * Then we recurse again and find the horizontal splits
  134. * within the columns:
  135. *
  136. * +---------------------------+
  137. * | |
  138. * | #### ## ### ## |
  139. * +---------------------------+
  140. * | ##### ##### #### ## |
  141. * | ## ###### ###### ## |
  142. * | #### ####### ###### |
  143. * | ####### #### ## ### |
  144. * | ### ##### ###### |
  145. * +-------------+-------------+
  146. * | ######### | ######### |
  147. * | ######### | ######### |
  148. * | ######## | ######### |
  149. * +-------------+ ######### |
  150. * | +-------+ | ####### |
  151. * | | | +-------------+
  152. * | | | | ## ## # # |
  153. * | +-------+ | ## ## # # |
  154. * +-------------+ ## ## # # |
  155. * | ######### | ## ## # # |
  156. * | ######### | ## ## # # |
  157. * | ######### | ## ## # # |
  158. * | | |
  159. * +-------------+-------------+
  160. *
  161. * We recurse a fixed maximum number of times (currently
  162. * 6, IIRC) or until we fail to find any suitable splits.
  163. *
  164. * This completes the page segmentation step.
  165. *
  166. * Step 2: Grid finding
  167. *
  168. * Next, we look at each of those segments and try to identify
  169. * where grids might be.
  170. *
  171. * Imagine the bottom right section of that page as
  172. * a board with lego blocks on where there is text.
  173. * Now imagine viewing that from the bottom of the page.
  174. * The gaps between the columns of the table are where you
  175. * can see through to the top between the blocks.
  176. *
  177. * Similarly, if you view it from the side, the gaps between the
  178. * rows of the page are where you can see through to the other
  179. * side.
  180. *
  181. * So, how do we code that? Well, we run through the page content
  182. * (obviously, restricted to the content that falls into this
  183. * segment of the page - that'll go without saying from here on
  184. * in). For each bit of content, we look at the "x extent" of that
  185. * content - for instance a given string might start at position
  186. * 10 and continue to position 100. We build a list of all these
  187. * start, and stop positions, and keep them in a sorted list.
  188. *
  189. * Then we walk this list from left to right, keeping a sum. I
  190. * call this sum "wind", because it's very similar to the winding
  191. * number that you get when doing scan conversion of bezier shapes.
  192. *
  193. * wind starts out as 0. We increment it whenever we pass a 'start'
  194. * position, and decrement it whenever we pass a 'stop' position.
  195. * So at any given x position along the line wind tells us the
  196. * number of pieces of content that overlap that x position.
  197. * So wind(left) = 0 = wind(right), and wind(x) >= x for all x.
  198. *
  199. * So, if we walk from left to right, the trace of wind might
  200. * look something like:
  201. *
  202. * __
  203. * ___ / \ _ __
  204. * / \ / \/ \ _/ \_
  205. * / \___/ \___/ \
  206. *
  207. * The left and right edges of the table are pretty clear.
  208. * The regions where wind drops to 0 represent the column dividers.
  209. * The left and right hand side of those regions gives us the min
  210. * and max values for that divider.
  211. *
  212. * We can then repeat this process for Y ranges instead of X ranges
  213. * to get the row dividers.
  214. *
  215. * BUT, this only works for pure grid tables. It falls down for
  216. * cases where we have merged cells (which is very common due to
  217. * titles etc).
  218. *
  219. * We can modify the algorithm slightly to allow for this.
  220. *
  221. * Consider the following table:
  222. *
  223. * +-----------------------------------+
  224. * | Long Table title across the top |
  225. * +---------------+---------+---------+
  226. * | Name | Result1 | Result2 |
  227. * +---------------+----+----+----+----+
  228. * | Homer Simpson | 1 | 23 | 4 | 56 |
  229. * | Barney Gumble | 1 | 23 | 4 | 56 |
  230. * | Moe | 1 | 23 | 4 | 56 |
  231. * | Apu | 1 | 23 | 4 | 56 |
  232. * | Ned Flanders | 1 | 23 | 4 | 56 |
  233. * +---------------+----+----+----+----+
  234. *
  235. * The wind trace for that looks something like
  236. * (with a certain degree of artistic license for the
  237. * limitations of ascii art):
  238. *
  239. * ________
  240. * / \ _ __ _ _
  241. * / \____/ \_/ \___/ \_/ \
  242. * / \
  243. *
  244. * So, the trace never quite drops back to zero in the
  245. * middle due to the spanning of the top title.
  246. *
  247. * So, instead of just looking for points where the trace
  248. * drops to zero, we instead look for local minima. Each local
  249. * minima represents a place where there might be a grid divider.
  250. * The value of wind at such points can be considered the
  251. * "uncertainty" with which there might be a divider there.
  252. * Clear dividers (with a wind value of 0) have no uncertainty.
  253. * Places where cells are spanned have a higher value of uncertainty.
  254. *
  255. * The output from this step is the list of possible grid positions
  256. * (X and Y), with uncertainty values.
  257. *
  258. * We have skipped over the handling of spaces in the above
  259. * description. On the one hand, we'd quite like to treat
  260. * sentences as long unbroken regions of content. But sometimes
  261. * we can get awkward content like:
  262. *
  263. * P subtelomeric 24.38 8.15 11.31 0.94 1.46
  264. * 11.08 6.93 15.34 0.00 0.73
  265. * Pericentromeric region; C band 20.42 9.26 13.64 0.81 0.81
  266. * 16.50 7.03 14.45 0.17 5.15
  267. * where the content is frequently sent with spaces instead of
  268. * small gaps (particular in tables of digits, because numerals
  269. * are often the same width even in proportional fonts).
  270. *
  271. * To cope with this, as we send potential edges into the start/stop
  272. * positions, we send 'weak' edges for the start and stops of runs
  273. * of spaces. We then post process the edges to remove any local
  274. * minima regions in the 'wind' values that are bounded purely by
  275. * 'weak' edges.
  276. *
  277. * Step 3: Cell analysis
  278. *
  279. * So, armed with the output from step 2, we can examine each grid
  280. * found. If we have W x-dividers and H y-dividers, we know we have
  281. * a potential table with (W-1) x (H-1) cells in it.
  282. *
  283. * We represent this as a W x H grid of cells, each like:
  284. *
  285. * . .
  286. * . .
  287. * . . .+-------+. . . Each cell holds information about the
  288. * | . edges above, and to the left of it.
  289. * | .
  290. * | .
  291. * . . .+. . . .+. . .
  292. * . .
  293. * . .
  294. *
  295. * h_line: Is there a horizontal divider drawn on the page that
  296. * corresponds to the top of this cell (i.e. is there a cell border
  297. * here?)
  298. * v_line: Is there a vertical divider drawn on the page that
  299. * corresponds to the left of this cell (i.e. is there a cell border
  300. * here?)
  301. * h_crossed: Does content cross this line (i.e. are we merged
  302. * with the cell above?)
  303. * v_crossed: Does content cross this line (i.e. are we merged
  304. * with the cell to the left?)
  305. * full: Is there any content in this cell at all?
  306. *
  307. * We need a W x H grid of cells to represent the entire table due
  308. * to the potential right and bottom edge lines. The right and
  309. * bottom rows of cells should never be full, or be crossed, but
  310. * it's easiest just to use a simple representation that copes with
  311. * the h_line and v_line values naturally.
  312. *
  313. * So, we start with the cells structure empty, and we run through
  314. * the page content, filling in the details as we go.
  315. *
  316. * At the end of the process, we have enough information to draw
  317. * an asciiart representation of our table. It might look something
  318. * like this (this comes from dotted-gridlines-tables.pdf):
  319. *
  320. * +-+-+-+-+-+-+-+-+-+-+-+-+-+
  321. * | | | | | |#|#|#|#| | | |
  322. * + + + + + + +v+ +v+v+ + + +
  323. * | | | | | |#|#|#|#| | | |
  324. * + + + + + + + +v+ + + + + +
  325. * | |#| | |#|#|#|#|#|#|#|#|
  326. * + +v+ + + +v+v+ +v+v+v+v+v+
  327. * | |#| |#|#|#|#|#|#|#|#|#|
  328. * + + + + +v+ + +v+ + + + + +
  329. * |#|#| #|#|#|#|#|#|#|#|#|#|
  330. * +v+v+ +v+ +v+v+ +v+v+v+v+v+
  331. * |#|#| #|#|#|#|#|#|#|#|#|#|
  332. * + + + + +v+ + +v+ + + + + +
  333. * | |#| |#|#|#|#|#|#|#|#|#|
  334. * + +v+ + + +v+v+ +v+v+v+v+v+
  335. * | |#| | |#|#|#|#|#|#|#|#|
  336. * + + + + + + + +v+ + + + + +
  337. * | | | | | |#|#|#|#| | | |
  338. * + + + + + + +v+ +v+v+ + + +
  339. * | | | | | |#|#|#|#| | | |
  340. * +-+-+-+-+-+-+-+-+-+-+-+-+-+
  341. * | |# |#| | |#| | |#|#|#|
  342. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  343. * | |# |#| | |#| | |#|#|#|
  344. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  345. * | |#>#|#| | |#| | |#|#|#|
  346. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  347. * | |#>#|#| | |#| | |#|#|#|
  348. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  349. * | |#>#|#| |#| | | |#|#|#|
  350. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  351. * | |# |#| | |#| | |#|#|#|
  352. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  353. * | |# |#| | |#| | |#|#|#|
  354. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  355. * | |# |#| | |#| | |#|#|#|
  356. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  357. * | |# |#| | |#| | |#|#|#|
  358. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  359. * |# |#|#|#|#|#|#|#|#|#|
  360. * + +-+-+-+-+-+-+-+-+-+-+-+-+
  361. *
  362. * This shows where lines are detected ( - and | ),
  363. * where they are crossed ( > and v) and where cells
  364. * are full ( # ).
  365. *
  366. * Step 4: Row and column merging.
  367. *
  368. * Based on the information above, we then try to merge
  369. * cells and columns to simplify the table.
  370. *
  371. * The best rules I've come up with this so far are:
  372. * We can merge two adjacent columns if all the pairs of
  373. * cells in the two columns are mergeable.
  374. *
  375. * Cells are held to be mergeable or not based upon the following
  376. * rules:
  377. * If there is a line between 2 cells - not mergeable.
  378. * else if the uncertainty between 2 cells is 0 - not mergeable.
  379. * else if the line between the 2 cells is crossed - mergeable.
  380. * else if strictly one of the cells is full - mergeable.
  381. * else not mergeable.
  382. *
  383. * So in the above example, column 2 (numbered from 0) can be merged
  384. * with column 3.
  385. *
  386. * This gives:
  387. *
  388. * +-+-+-+-+-+-+-+-+-+-+-+-+
  389. * | | | | | |#|#|#|#| | | |
  390. * + + + + + +v+ +v+v+ + + +
  391. * | | | | | |#|#|#|#| | | |
  392. * + + + + + + +v+ + + + + +
  393. * | |#| | |#|#|#|#|#|#|#|#|
  394. * + +v+ + +v+v+ +v+v+v+v+v+
  395. * | |#| |#|#|#|#|#|#|#|#|#|
  396. * + + + +v+ + +v+ + + + + +
  397. * |#|#|#|#|#|#|#|#|#|#|#|#|
  398. * +v+v+v+ +v+v+ +v+v+v+v+v+
  399. * |#|#|#|#|#|#|#|#|#|#|#|#|
  400. * + + + +v+ + +v+ + + + + +
  401. * | |#| |#|#|#|#|#|#|#|#|#|
  402. * + +v+ + +v+v+ +v+v+v+v+v+
  403. * | |#| | |#|#|#|#|#|#|#|#|
  404. * + + + + + + +v+ + + + + +
  405. * | | | | | |#|#|#|#| | | |
  406. * + + + + + +v+ +v+v+ + + +
  407. * | | | | | |#|#|#|#| | | |
  408. * +-+-+-+-+-+-+-+-+-+-+-+-+
  409. * | |#|#| | |#| | |#|#|#|
  410. * + +-+-+-+-+-+-+-+-+-+-+-+
  411. * | |#|#| | |#| | |#|#|#|
  412. * + +-+-+-+-+-+-+-+-+-+-+-+
  413. * | |#|#| | |#| | |#|#|#|
  414. * + +-+-+-+-+-+-+-+-+-+-+-+
  415. * | |#|#| | |#| | |#|#|#|
  416. * + +-+-+-+-+-+-+-+-+-+-+-+
  417. * | |#|#| |#| | | |#|#|#|
  418. * + +-+-+-+-+-+-+-+-+-+-+-+
  419. * | |#|#| | |#| | |#|#|#|
  420. * + +-+-+-+-+-+-+-+-+-+-+-+
  421. * | |#|#| | |#| | |#|#|#|
  422. * + +-+-+-+-+-+-+-+-+-+-+-+
  423. * | |#|#| | |#| | |#|#|#|
  424. * + +-+-+-+-+-+-+-+-+-+-+-+
  425. * | |#|#| | |#| | |#|#|#|
  426. * + +-+-+-+-+-+-+-+-+-+-+-+
  427. * |# |#|#|#|#|#|#|#|#|#|
  428. * + +-+-+-+-+-+-+-+-+-+-+-+
  429. *
  430. * We then perform the same merging process for rows as for
  431. * columns - though there are no rows in the above example
  432. * that can be merged.
  433. *
  434. * You'll note that, for example, we don't merge row 0 and
  435. * row 1 in the above, because we have a pair of cells that
  436. * are both full without crossing.
  437. *
  438. * Step 5: Cell spanning
  439. *
  440. * Now we actually start to output the table. We keep a 'sent_table'
  441. * (a grid of W x H bools) to keep track of whether we've output
  442. * the content for a given cell or not yet.
  443. *
  444. * For each cell we reach, assuming sent_table[x,y] is false,
  445. * we merge it with as many cells on the right as required,
  446. * according to 'v_crossed' values (subject to not passing
  447. * v_lines or uncertainty == 0's).
  448. *
  449. * We then try to merge cells below according to 'h_crossed'
  450. * values (subject to not passing h_lines or uncertainty == 0's).
  451. *
  452. * In theory this can leave us with some cases where we'd like
  453. * to merge some cells (because of crossed) and can't (because
  454. * of lines or sent_table[]) values. In the absence of better
  455. * cell spanning algorithms we have no choice here.
  456. *
  457. * Then we output the contents and set sent_table[] values as
  458. * appropriate.
  459. *
  460. * If a row has no cells in it, we currently omit the TR. If/when
  461. * we figure out how to indicate rowspan/colspan in stext, we can
  462. * revisit that.
  463. */
  464. static fz_stext_block *
  465. add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block)
  466. {
  467. fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block));
  468. memset(block, 0, sizeof(*block));
  469. block->type = FZ_STEXT_BLOCK_GRID;
  470. block->bbox = fz_empty_rect; /* Fixes bug 703267. */
  471. block->next = *first_block;
  472. if (*first_block)
  473. {
  474. (*first_block)->prev = block;
  475. assert(*last_block);
  476. }
  477. else
  478. {
  479. assert(*last_block == NULL);
  480. *last_block = block;
  481. }
  482. *first_block = block;
  483. return block;
  484. }
  485. static void
  486. insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest)
  487. {
  488. if (before)
  489. {
  490. /* We have a block to insert it before, so we know it's not the last. */
  491. assert(dest ? (dest->first_block != NULL && dest->last_block != NULL) : (page->first_block != NULL && page->last_block != NULL));
  492. block->next = before;
  493. block->prev = before->prev;
  494. if (before->prev)
  495. {
  496. assert(before->prev->next == before);
  497. before->prev->next = block;
  498. }
  499. else if (dest)
  500. {
  501. assert(dest->first_block == before);
  502. dest->first_block = block;
  503. }
  504. else
  505. {
  506. assert(page->first_block == before);
  507. page->first_block = block;
  508. }
  509. before->prev = block;
  510. }
  511. else if (dest)
  512. {
  513. /* Will be the last block. */
  514. block->next = NULL;
  515. block->prev = dest->last_block;
  516. if (dest->last_block)
  517. {
  518. assert(dest->last_block->next == NULL);
  519. dest->last_block->next = block;
  520. }
  521. if (dest->first_block == NULL)
  522. dest->first_block = block;
  523. dest->last_block = block;
  524. }
  525. else
  526. {
  527. /* Will be the last block. */
  528. block->next = NULL;
  529. block->prev = page->last_block;
  530. if (page->last_block)
  531. {
  532. assert(page->last_block->next == NULL);
  533. page->last_block->next = block;
  534. }
  535. if (page->first_block == NULL)
  536. page->first_block = block;
  537. page->last_block = block;
  538. }
  539. }
  540. static fz_stext_struct *
  541. add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw)
  542. {
  543. fz_stext_block *block;
  544. int idx = 0;
  545. size_t z;
  546. fz_stext_struct *newstruct;
  547. if (raw == NULL)
  548. raw = "";
  549. z = strlen(raw);
  550. /* We're going to insert a struct block. We need an idx, so walk the list */
  551. for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next)
  552. {
  553. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  554. {
  555. assert(block->u.s.index >= idx);
  556. idx = block->u.s.index + 1;
  557. }
  558. }
  559. /* So we'll add our block as idx. But all the other struct blocks that follow us need to have
  560. * larger values. */
  561. /* Update all the subsequent structs to have a higher idx */
  562. if (before)
  563. {
  564. int idx2 = idx+1;
  565. for (block = before->next; block != NULL; block = block->next)
  566. {
  567. if (block->type != FZ_STEXT_BLOCK_STRUCT)
  568. continue;
  569. if (block->u.s.index > idx2)
  570. break;
  571. block->u.s.index = idx2++;
  572. }
  573. }
  574. /* Now make our new struct block and insert it. */
  575. block = fz_pool_alloc(ctx, page->pool, sizeof(*block));
  576. block->type = FZ_STEXT_BLOCK_STRUCT;
  577. block->bbox = fz_empty_rect; /* Fixes bug 703267. */
  578. insert_block_before(block, before, page, parent);
  579. block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z);
  580. block->u.s.index = idx;
  581. newstruct->parent = parent;
  582. newstruct->standard = std;
  583. memcpy(newstruct->raw, raw, z);
  584. newstruct->raw[z] = 0;
  585. newstruct->up = block;
  586. return newstruct;
  587. }
  588. typedef struct
  589. {
  590. int len;
  591. int max;
  592. struct {
  593. uint8_t left;
  594. uint8_t weak;
  595. float pos;
  596. int freq;
  597. } *list;
  598. } div_list;
  599. static void
  600. div_list_push(fz_context *ctx, div_list *div, int left, int weak, float pos)
  601. {
  602. int i;
  603. /* FIXME: Could be bsearch. */
  604. for (i = 0; i < div->len; i++)
  605. {
  606. if (div->list[i].pos > pos)
  607. break;
  608. else if (div->list[i].pos == pos && div->list[i].left == left)
  609. {
  610. div->list[i].freq++;
  611. div->list[i].weak &= weak;
  612. return;
  613. }
  614. }
  615. if (div->len == div->max)
  616. {
  617. int newmax = div->max * 2;
  618. if (newmax == 0)
  619. newmax = 32;
  620. div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax);
  621. div->max = newmax;
  622. }
  623. if (i < div->len)
  624. memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i));
  625. div->len++;
  626. div->list[i].left = left;
  627. div->list[i].weak = weak;
  628. div->list[i].pos = pos;
  629. div->list[i].freq = 1;
  630. }
  631. static fz_stext_grid_positions *
  632. make_table_positions(fz_context *ctx, div_list *xs, float min, float max)
  633. {
  634. int wind;
  635. fz_stext_grid_positions *pos;
  636. int len = xs->len;
  637. int i;
  638. int hi = 0;
  639. /* Count the number of edges */
  640. int local_min = 0;
  641. int edges = 2;
  642. if (len == 0)
  643. return NULL;
  644. assert(xs->list[0].left);
  645. for (i = 0; i < len; i++)
  646. {
  647. if (xs->list[i].pos >= min)
  648. break;
  649. }
  650. for (; i < len; i++)
  651. {
  652. if (xs->list[i].pos >= max)
  653. break;
  654. if (xs->list[i].left)
  655. {
  656. if (local_min)
  657. edges++;
  658. }
  659. else
  660. local_min = 1;
  661. }
  662. assert(!xs->list[len-1].left);
  663. pos = fz_malloc_flexible(ctx, fz_stext_grid_positions, list, edges);
  664. pos->len = edges;
  665. /* Copy the edges in */
  666. wind = 0;
  667. local_min = 0;
  668. edges = 1;
  669. pos->list[0].pos = min;
  670. pos->list[0].min = min;
  671. pos->list[0].max = fz_max(xs->list[0].pos, min);
  672. pos->list[0].uncertainty = 0;
  673. pos->list[0].reinforcement = 0;
  674. #ifdef DEBUG_TABLE_HUNT
  675. printf("|%g ", pos->list[0].pos);
  676. #endif
  677. /* Skip over entries to the left of min. */
  678. for (i = 0; i < len; i++)
  679. {
  680. if (xs->list[i].pos >= min)
  681. break;
  682. if (xs->list[i].left)
  683. wind += xs->list[i].freq;
  684. else
  685. wind -= xs->list[i].freq;
  686. }
  687. for (; i < len; i++)
  688. {
  689. if (xs->list[i].pos >= max)
  690. break;
  691. if (xs->list[i].left)
  692. {
  693. if (local_min)
  694. {
  695. pos->list[edges].min = xs->list[i-1].pos;
  696. pos->list[edges].max = xs->list[i].pos;
  697. pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2;
  698. pos->list[edges].uncertainty = wind;
  699. #ifdef DEBUG_TABLE_HUNT
  700. if (wind)
  701. printf("?%g(%d) ", pos->list[edges].pos, wind);
  702. else
  703. printf("|%g ", pos->list[edges].pos);
  704. #endif
  705. edges++;
  706. }
  707. wind += xs->list[i].freq;
  708. if (wind > hi)
  709. hi = wind;
  710. }
  711. else
  712. {
  713. wind -= xs->list[i].freq;
  714. local_min = 1;
  715. }
  716. }
  717. assert(i < len || wind == 0);
  718. pos->list[edges].pos = max;
  719. pos->list[edges].min = fz_min(xs->list[i-1].pos, max);
  720. pos->list[edges].max = max;
  721. assert(max >= xs->list[i-1].pos);
  722. pos->list[edges].uncertainty = 0;
  723. pos->list[edges].reinforcement = 0;
  724. pos->max_uncertainty = hi;
  725. #ifdef DEBUG_TABLE_HUNT
  726. printf("|%g\n", pos->list[edges].pos);
  727. #endif
  728. return pos;
  729. }
  730. static fz_stext_grid_positions *
  731. copy_grid_positions_to_pool(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs)
  732. {
  733. size_t z = offsetof(fz_stext_grid_positions, list) + sizeof(xs->list[0]) * (xs->len);
  734. fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z);
  735. memcpy(xs2, xs, z);
  736. return xs2;
  737. }
  738. static void
  739. sanitize_positions(fz_context *ctx, div_list *xs)
  740. {
  741. int i, j, wind, changed;
  742. #ifdef DEBUG_TABLE_HUNT
  743. printf("OK:\n");
  744. for (i = 0; i < xs->len; i++)
  745. {
  746. if (xs->list[i].left)
  747. printf("[");
  748. printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
  749. if (!xs->list[i].left)
  750. printf("]");
  751. printf(" ");
  752. }
  753. printf("\n");
  754. #endif
  755. if (xs->len == 0)
  756. return;
  757. do
  758. {
  759. /* Now, combine runs of left and right */
  760. for (i = 0; i < xs->len; i++)
  761. {
  762. if (xs->list[i].left)
  763. {
  764. j = i;
  765. while (i < xs->len-1 && xs->list[i+1].left)
  766. {
  767. i++;
  768. xs->list[j].freq += xs->list[i].freq;
  769. xs->list[j].weak &= xs->list[i].weak;
  770. xs->list[i].freq = 0;
  771. }
  772. }
  773. else
  774. {
  775. while (i < xs->len-1 && !xs->list[i+1].left)
  776. {
  777. i++;
  778. xs->list[i].freq += xs->list[i-1].freq;
  779. xs->list[i].weak &= xs->list[i-1].weak;
  780. xs->list[i-1].freq = 0;
  781. }
  782. }
  783. }
  784. #ifdef DEBUG_TABLE_HUNT
  785. printf("Shrunk:\n");
  786. for (i = 0; i < xs->len; i++)
  787. {
  788. if (xs->list[i].left)
  789. printf("[");
  790. printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
  791. if (!xs->list[i].left)
  792. printf("]");
  793. printf(" ");
  794. }
  795. printf("\n");
  796. #endif
  797. /* Now remove the 0 frequency ones. */
  798. j = 0;
  799. for (i = 0; i < xs->len; i++)
  800. {
  801. if (xs->list[i].freq == 0)
  802. continue;
  803. if (i != j)
  804. xs->list[j] = xs->list[i];
  805. j++;
  806. }
  807. xs->len = j;
  808. /* Now run across looking for local minima where at least one
  809. * edge is 'weak'. If the wind at that point is non-zero, then
  810. * remove the weak edges from consideration and retry. */
  811. wind = 0;
  812. changed = 0;
  813. i = 0;
  814. while (1)
  815. {
  816. assert(xs->list[i].left);
  817. for (; xs->list[i].left; i++)
  818. {
  819. wind += xs->list[i].freq;
  820. }
  821. assert(i < xs->len);
  822. for (; xs->list[i].left == 0 && i < xs->len; i++)
  823. {
  824. wind -= xs->list[i].freq;
  825. }
  826. if (i == xs->len)
  827. break;
  828. if (wind != 0 && (xs->list[i-1].weak || xs->list[i].weak))
  829. {
  830. int m = fz_mini(xs->list[i-1].freq, xs->list[i].freq);
  831. assert(m > 0);
  832. xs->list[i-1].freq -= m;
  833. xs->list[i].freq -= m;
  834. changed = 1;
  835. }
  836. }
  837. }
  838. while (changed);
  839. #ifdef DEBUG_TABLE_HUNT
  840. printf("Compacted:\n");
  841. for (i = 0; i < xs->len; i++)
  842. {
  843. if (xs->list[i].left)
  844. printf("[");
  845. printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
  846. if (!xs->list[i].left)
  847. printf("]");
  848. printf(" ");
  849. }
  850. printf("\n");
  851. #endif
  852. }
  853. /* We want to check for whether a DIV that we are about to descend into
  854. * contains a column of justified text. We will accept some headers in
  855. * this text, but not JUST headers. */
  856. static int
  857. all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block)
  858. {
  859. int just_headers = 1;
  860. for (; block != NULL; block = block->next)
  861. {
  862. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  863. {
  864. if (block->u.s.down == NULL)
  865. continue;
  866. if (block->u.s.down->standard == FZ_STRUCTURE_H ||
  867. block->u.s.down->standard == FZ_STRUCTURE_H1 ||
  868. block->u.s.down->standard == FZ_STRUCTURE_H2 ||
  869. block->u.s.down->standard == FZ_STRUCTURE_H3 ||
  870. block->u.s.down->standard == FZ_STRUCTURE_H4 ||
  871. block->u.s.down->standard == FZ_STRUCTURE_H5 ||
  872. block->u.s.down->standard == FZ_STRUCTURE_H6)
  873. continue;
  874. if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block))
  875. return 0;
  876. }
  877. just_headers = 0;
  878. if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL)
  879. return 0;
  880. }
  881. if (just_headers)
  882. return 0;
  883. return 1;
  884. }
  885. #define TWO_INCHES (72*2)
  886. /* Walk through the blocks, finding the bbox. */
  887. static fz_rect
  888. walk_to_find_bounds(fz_context *ctx, fz_stext_block *first_block)
  889. {
  890. fz_rect bounds = fz_empty_rect;
  891. fz_stext_block *block;
  892. fz_stext_line *line;
  893. fz_stext_char *ch;
  894. for (block = first_block; block != NULL; block = block->next)
  895. {
  896. switch (block->type)
  897. {
  898. case FZ_STEXT_BLOCK_STRUCT:
  899. if (!block->u.s.down)
  900. continue;
  901. if (block->u.s.down->standard == FZ_STRUCTURE_H)
  902. {
  903. if (block->next != NULL &&
  904. block->next->type == FZ_STEXT_BLOCK_TEXT &&
  905. block->next->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
  906. continue;
  907. }
  908. bounds = fz_union_rect(bounds, walk_to_find_bounds(ctx, block->u.s.down->first_block));
  909. break;
  910. case FZ_STEXT_BLOCK_VECTOR:
  911. bounds = fz_union_rect(bounds, block->bbox);
  912. break;
  913. case FZ_STEXT_BLOCK_TEXT:
  914. if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL && block->bbox.x1 - block->bbox.x0 >= TWO_INCHES)
  915. continue;
  916. for (line = block->u.t.first_line; line != NULL; line = line->next)
  917. {
  918. for (ch = line->first_char; ch != NULL; ch = ch->next)
  919. {
  920. if (ch->c != ' ')
  921. bounds = fz_union_rect(bounds, fz_rect_from_quad(ch->quad));
  922. }
  923. }
  924. break;
  925. }
  926. }
  927. return bounds;
  928. }
  929. static void
  930. walk_to_find_content(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, fz_rect bounds)
  931. {
  932. fz_stext_block *block;
  933. fz_stext_line *line;
  934. fz_stext_char *ch;
  935. for (block = first_block; block != NULL; block = block->next)
  936. {
  937. switch (block->type)
  938. {
  939. case FZ_STEXT_BLOCK_STRUCT:
  940. if (block->u.s.down && !fz_is_empty_rect(fz_intersect_rect(bounds, block->bbox)))
  941. walk_to_find_content(ctx, xs, ys, block->u.s.down->first_block, bounds);
  942. break;
  943. case FZ_STEXT_BLOCK_VECTOR:
  944. break;
  945. case FZ_STEXT_BLOCK_TEXT:
  946. {
  947. fz_rect justified_region = fz_empty_rect;
  948. for (line = block->u.t.first_line; line != NULL; line = line->next)
  949. {
  950. fz_rect region = fz_empty_rect;
  951. region.y0 = line->bbox.y0;
  952. region.y1 = line->bbox.y1;
  953. if (region.y0 < bounds.y0)
  954. region.y0 = bounds.y0;
  955. if (region.y1 > bounds.y1)
  956. region.y1 = bounds.y1;
  957. if (region.y0 >= region.y1)
  958. break;
  959. /* Skip leading spaces. */
  960. for (ch = line->first_char; ch != NULL; ch = ch->next)
  961. if (ch->c != ' ')
  962. break;
  963. for (; ch != NULL; ch = ch->next)
  964. {
  965. if (ch->c == ' ')
  966. {
  967. /* Find the last space char in this run. */
  968. fz_stext_char *last_space;
  969. for (last_space = ch; last_space->next != NULL && last_space->next->c == ' '; last_space = last_space->next);
  970. /* If we're not the last char in the line (i.e. we're not a trailing space,
  971. * then send a 'weak' gap for the spaces, assuming it's sane to do so). */
  972. if (last_space->next != NULL)
  973. {
  974. float rpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
  975. float lpos = fz_min(last_space->next->quad.ll.x, last_space->next->quad.ll.x);
  976. /* Clamp these to the bounds */
  977. rpos = fz_clamp(rpos, bounds.x0, bounds.x1);
  978. lpos = fz_clamp(lpos, bounds.x0, bounds.x1);
  979. /* So we have a region (rpos...lpos) to add. */
  980. /* This can be positioned in various different ways relative to the
  981. * current region:
  982. * [region]
  983. * (rpos..lpos) OK
  984. * (rpos..lpos) OK, but adjust lpos
  985. * (rpos..lpos) OK, but adjust rpos
  986. * (rpos..lpos) OK
  987. * (rpos .. lpos) OK
  988. */
  989. if (lpos >= region.x1)
  990. {
  991. if (rpos >= region.x0 && rpos < region.x1)
  992. rpos = region.x1;
  993. }
  994. else if (rpos <= region.x0)
  995. {
  996. if (lpos > region.x0)
  997. lpos = region.x0;
  998. }
  999. else
  1000. rpos = lpos; /* Make it an invalid region */
  1001. if (rpos < lpos)
  1002. {
  1003. /* Send a weak right at the start of the spaces... */
  1004. div_list_push(ctx, xs, 0, 1, rpos);
  1005. /* and a weak left at the end. */
  1006. div_list_push(ctx, xs, 1, 1, lpos);
  1007. }
  1008. /* Expand the region as required. */
  1009. if (rpos < region.x0)
  1010. region.x0 = rpos;
  1011. if (lpos > region.x1)
  1012. region.x1 = lpos;
  1013. }
  1014. /* Jump over the spaces */
  1015. ch = last_space;
  1016. }
  1017. else
  1018. {
  1019. float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
  1020. float rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x);
  1021. if (lpos < region.x0)
  1022. region.x0 = lpos;
  1023. if (rpos > region.x1)
  1024. region.x1 = rpos;
  1025. }
  1026. }
  1027. if (!fz_is_empty_rect(region))
  1028. {
  1029. div_list_push(ctx, xs, 1, 0, region.x0);
  1030. div_list_push(ctx, xs, 0, 0, region.x1);
  1031. /* For justified regions, we don't break after each line, but
  1032. * rather before/after the region as a whole. */
  1033. if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL)
  1034. {
  1035. div_list_push(ctx, ys, 1, 0, region.y0);
  1036. div_list_push(ctx, ys, 0, 0, region.y1);
  1037. }
  1038. else
  1039. justified_region = fz_union_rect(justified_region, region);
  1040. }
  1041. }
  1042. if (!fz_is_empty_rect(justified_region) && block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
  1043. {
  1044. div_list_push(ctx, ys, 1, 0, justified_region.y0);
  1045. div_list_push(ctx, ys, 0, 0, justified_region.y1);
  1046. }
  1047. break;
  1048. }
  1049. }
  1050. }
  1051. }
  1052. /* One of our datastructures (cells_t) is an array of details about the
  1053. * cells that make up our table. It's a w * h array of cell_t's. Each
  1054. * cell contains data on one of the cells in the table, as you'd expect.
  1055. *
  1056. * . .
  1057. * . .
  1058. * - - +-------+ - -
  1059. * | .
  1060. * | .
  1061. * | .
  1062. * - - + - - - + - -
  1063. * . .
  1064. * . .
  1065. *
  1066. * For any given cell, we store details about the top (lowest y coord)
  1067. * and left (lowest x coord) edges. Specifically we store whether
  1068. * there is a line at this position (h_line and v_line) (i.e. a drawn
  1069. * border), and we also store whether content crosses this edge (h_crossed
  1070. * and y_crossed). Finally, we store whether the cell has any content
  1071. * in it at all (full).
  1072. *
  1073. * A table which has w positions across and h positions vertically, will
  1074. * only really have (w-1) * (h-1) cells. We store w*h though to allow for
  1075. * the right and bottom edges to have their lines represented.
  1076. */
  1077. typedef struct
  1078. {
  1079. int h_line;
  1080. int v_line;
  1081. int h_crossed;
  1082. int v_crossed;
  1083. int full;
  1084. } cell_t;
  1085. typedef struct
  1086. {
  1087. int w;
  1088. int h;
  1089. cell_t cell[FZ_FLEXIBLE_ARRAY];
  1090. } cells_t;
  1091. typedef struct
  1092. {
  1093. cells_t *cells;
  1094. fz_stext_grid_positions *xpos;
  1095. fz_stext_grid_positions *ypos;
  1096. fz_rect bounds;
  1097. } grid_walker_data;
  1098. static cell_t *
  1099. get_cell(cells_t *cells, int x, int y)
  1100. {
  1101. return &cells->cell[x + y * cells->w];
  1102. }
  1103. #ifdef DEBUG_TABLE_STRUCTURE
  1104. static void
  1105. asciiart_table(grid_walker_data *gd);
  1106. #endif
  1107. static fz_stext_grid_positions *
  1108. split_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, int i, int early)
  1109. {
  1110. fz_stext_grid_positions **posp = row ? &gd->ypos : &gd->xpos;
  1111. fz_stext_grid_positions *pos = *posp;
  1112. int n = pos->len;
  1113. int x, y, w, h;
  1114. cells_t *cells;
  1115. /* Realloc the required structs */
  1116. *posp = pos = fz_realloc_flexible(ctx, pos, fz_stext_grid_positions, list, n+1);
  1117. cells = gd->cells = fz_realloc_flexible(ctx, gd->cells, cells_t, cell, (gd->cells->w + (1-row)) * (gd->cells->h + row));
  1118. /* If both pass, then we're safe to shuffle the data. */
  1119. #ifdef DEBUG_TABLE_STRUCTURE
  1120. printf("Before split %s %d\n", row ? "row" : "col", i);
  1121. asciiart_table(gd);
  1122. #endif
  1123. assert(i >= 0 && i < n);
  1124. memmove(&pos->list[i+1], &pos->list[i], sizeof(pos->list[0]) * (n-i));
  1125. pos->len++;
  1126. /* Next expand the cells. Only h_line and v_line are filled in so far. */
  1127. w = cells->w;
  1128. h = cells->h;
  1129. if (early && i > 0)
  1130. i--, early = 0;
  1131. if (row)
  1132. {
  1133. /* Add a row */
  1134. cells->h = h+1;
  1135. /* Expand the table, duplicating row i */
  1136. memmove(&cells->cell[(i+1)*w], &cells->cell[i*w], sizeof(cells->cell[0])*(h-i)*w);
  1137. if (early)
  1138. {
  1139. /* We are splitting row 0 into 0 and 1, with 0 being the new one. */
  1140. for (x = 0; x < w; x++)
  1141. {
  1142. cells->cell[x].h_line = 0;
  1143. cells->cell[x].v_line = 0;
  1144. }
  1145. }
  1146. else
  1147. {
  1148. /* We are splitting row i into i and i+1, with i+1 being the new one. */
  1149. /* v_lines are carried over. h_lines need to be unset. */
  1150. for (x = 0; x < w; x++)
  1151. cells->cell[x + (i+1)*w].h_line = 0;
  1152. }
  1153. }
  1154. else
  1155. {
  1156. /* Add a column */
  1157. cells->w = w+1;
  1158. /* Expand the table, duplicating column i */
  1159. for (y = h-1; y >= 0; y--)
  1160. {
  1161. for (x = w; x > i; x--)
  1162. cells->cell[x + y*(w+1)] = cells->cell[x-1 + y*w];
  1163. for (; x >= 0; x--)
  1164. cells->cell[x + y*(w+1)] = cells->cell[x + y*w];
  1165. }
  1166. if (early)
  1167. {
  1168. /* We are splitting col 0 into 0 and 1, with 0 being the new one. */
  1169. for (y = 0; y < h; y++)
  1170. {
  1171. cells->cell[y*(w+1)].h_line = 0;
  1172. cells->cell[y*(w+1)].v_line = 0;
  1173. }
  1174. }
  1175. else
  1176. {
  1177. /* h_lines are carried over. v_lines need to be reset */
  1178. for (y = 0; y < h; y++)
  1179. cells->cell[i+1 + y*(w+1)].v_line = 0;
  1180. }
  1181. }
  1182. #ifdef DEBUG_TABLE_STRUCTURE
  1183. printf("After split\n");
  1184. asciiart_table(gd);
  1185. #endif
  1186. return pos;
  1187. }
  1188. /* This routine finds (and reinforces) grid positions for lines.
  1189. *
  1190. * If we have a thin line from (x0, y0) to (x1, y0), then we are
  1191. * pretty sure that y0 will be on the edge of a cell. We are less
  1192. * sure that x0 and x1 will match up to the edge of a cell.
  1193. * Stylistically some tables overrun or underrun such lines.
  1194. *
  1195. * Similarly from (x0, y0) to (x0, y1), we expect x0 to be accurate
  1196. * but y0 and y1 less so.
  1197. *
  1198. * If we have a wider rectangle, from (x0, y0) to (x1, y1) then
  1199. * we fully expect all sides to be accurate.
  1200. */
  1201. static int
  1202. find_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, float x, int inaccurate)
  1203. {
  1204. const int WIGGLE_ROOM = 2;
  1205. int i;
  1206. fz_stext_grid_positions *pos = row ? gd->ypos : gd->xpos;
  1207. assert(x >= pos->list[0].min && x <= pos->list[pos->len-1].max);
  1208. #ifdef DEBUG_TABLE_STRUCTURE
  1209. printf("Looking for %g in %s splits:\n", x, row ? "row" : "col");
  1210. for (i = 0; i < pos->len; i++)
  1211. {
  1212. printf("%d\t%g\t%g\t%g\t%d\n", i, pos->list[i].min, pos->list[i].pos, pos->list[i].max, pos->list[i].reinforcement);
  1213. }
  1214. #endif
  1215. while (inaccurate) /* So we can break out */
  1216. {
  1217. float prev = 0;
  1218. /* If we start/finish outside the range of the table, then we
  1219. * want to extend the table. So ignore 'inaccurate' in this
  1220. * case. Match the logic below. */
  1221. if (x < pos->list[0].min)
  1222. break;
  1223. if (x < pos->list[0].pos - WIGGLE_ROOM && pos->list[0].reinforcement > 0)
  1224. break;
  1225. if (x > pos->list[pos->len-1].max)
  1226. break;
  1227. if (x > pos->list[pos->len-1].pos + WIGGLE_ROOM && pos->list[pos->len-1].reinforcement > 0)
  1228. break;
  1229. /* Just find the closest one. No reinforcement. */
  1230. for (i = 0; i < pos->len; i++)
  1231. {
  1232. if (x < pos->list[i].min)
  1233. {
  1234. float mid = (prev + pos->list[i].min)/2;
  1235. if (x < mid)
  1236. return i-1;
  1237. return i;
  1238. }
  1239. prev = pos->list[i].max;
  1240. if (x <= prev)
  1241. return i;
  1242. }
  1243. assert("Never happens" == NULL);
  1244. return -1;
  1245. }
  1246. for (i = 0; i < pos->len; i++)
  1247. {
  1248. if (x < pos->list[i].min)
  1249. {
  1250. /* Split i into i and i+1, and make i the new one. */
  1251. assert(i > 0);
  1252. #ifdef DEBUG_TABLE_STRUCTURE
  1253. printf("Splitting before %d\n", i);
  1254. #endif
  1255. pos = split_grid_pos(ctx, gd, row, i, 1);
  1256. pos->list[i-1].max = pos->list[i].min = (pos->list[i-1].max + x)/2;
  1257. pos->list[i].pos = x;
  1258. pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2;
  1259. pos->list[i].reinforcement = 1;
  1260. return i;
  1261. }
  1262. else if (x <= pos->list[i].max)
  1263. {
  1264. /* We are in the range for the ith divider. */
  1265. if (pos->list[i].reinforcement == 0)
  1266. {
  1267. /* If we've not been reinforced before, reinforce now. */
  1268. pos->list[i].pos = x;
  1269. pos->list[i].reinforcement = 1;
  1270. return i;
  1271. }
  1272. /* We've been reinforced before. This ought to be a pretty good
  1273. * indication. */
  1274. if (pos->list[i].pos - WIGGLE_ROOM < x && x < pos->list[i].pos + WIGGLE_ROOM)
  1275. {
  1276. /* We are a close match to the previously predicted pos
  1277. * value. */
  1278. pos->list[i].pos = pos->list[i].pos * pos->list[i].reinforcement + x;
  1279. pos->list[i].pos /= ++pos->list[i].reinforcement;
  1280. return i;
  1281. }
  1282. /* We need to split i into i and i+1. */
  1283. pos = split_grid_pos(ctx, gd, row, i, pos->list[i].pos > x);
  1284. if (pos->list[i].pos > x)
  1285. {
  1286. /* Make i the new one */
  1287. #ifdef DEBUG_TABLE_STRUCTURE
  1288. printf("Splitting %d (early)\n", i);
  1289. #endif
  1290. pos->list[i].pos = x;
  1291. pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2;
  1292. pos->list[i].reinforcement = 1;
  1293. return i;
  1294. }
  1295. else
  1296. {
  1297. /* Make i+1 the new one */
  1298. #ifdef DEBUG_TABLE_STRUCTURE
  1299. printf("Splitting %d (late)\n", i);
  1300. #endif
  1301. pos->list[i+1].pos = x;
  1302. pos->list[i].max = pos->list[i+1].min = (pos->list[i].pos + x)/2;
  1303. pos->list[i].reinforcement = 1;
  1304. return i+1;
  1305. }
  1306. }
  1307. }
  1308. assert("Never happens" == NULL);
  1309. return -1;
  1310. }
  1311. static int
  1312. find_cell_l(fz_stext_grid_positions *pos, float x)
  1313. {
  1314. int i;
  1315. for (i = 0; i < pos->len; i++)
  1316. if (x < pos->list[i].pos)
  1317. return i-1;
  1318. return -1;
  1319. }
  1320. static int
  1321. find_cell_r(fz_stext_grid_positions *pos, float x)
  1322. {
  1323. int i;
  1324. for (i = 0; i < pos->len; i++)
  1325. if (x <= pos->list[i].pos)
  1326. return i-1;
  1327. return -1;
  1328. }
  1329. /* Add a horizontal line. Return 1 if the line doesn't seem to be a border line.
  1330. * Record which cells that was a border for. */
  1331. static void
  1332. add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1)
  1333. {
  1334. int start = find_grid_pos(ctx, gd, 0, x0, 1);
  1335. int end = find_grid_pos(ctx, gd, 0, x1, 1);
  1336. float y = (y0 + y1) / 2;
  1337. int yidx = find_grid_pos(ctx, gd, 1, y, 0);
  1338. int i;
  1339. assert(start > 0 && end > 0);
  1340. for (i = start; i < end; i++)
  1341. get_cell(gd->cells, i, yidx)->h_line++;
  1342. }
  1343. /* Add a vertical line. Return 1 if the line doesn't seem to be a border line.
  1344. * Record which cells that was a border for. */
  1345. static void
  1346. add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1)
  1347. {
  1348. int start = find_grid_pos(ctx, gd, 1, y0, 1);
  1349. int end = find_grid_pos(ctx, gd, 1, y1, 1);
  1350. float x = (x0 + x1) / 2;
  1351. int xidx = find_grid_pos(ctx, gd, 0, x, 0);
  1352. int i;
  1353. for (i = start; i < end; i++)
  1354. get_cell(gd->cells, xidx, i)->v_line++;
  1355. }
  1356. static void
  1357. add_hv_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1, int stroked)
  1358. {
  1359. int ix0 = find_grid_pos(ctx, gd, 0, x0, 0);
  1360. int ix1 = find_grid_pos(ctx, gd, 0, x1, 0);
  1361. int iy0 = find_grid_pos(ctx, gd, 1, y0, 0);
  1362. int iy1 = find_grid_pos(ctx, gd, 1, y1, 0);
  1363. int i;
  1364. if (stroked)
  1365. {
  1366. for (i = ix0; i < ix1; i++)
  1367. {
  1368. get_cell(gd->cells, i, iy0)->h_line++;
  1369. get_cell(gd->cells, i, iy1)->h_line++;
  1370. }
  1371. for (i = iy0; i < iy1; i++)
  1372. {
  1373. get_cell(gd->cells, ix0, i)->v_line++;
  1374. get_cell(gd->cells, ix1, i)->v_line++;
  1375. }
  1376. }
  1377. }
  1378. /* Shared internal routine with stext-boxer.c */
  1379. fz_rect fz_collate_small_vector_run(fz_stext_block **blockp);
  1380. static void
  1381. walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
  1382. {
  1383. for (; block != NULL; block = block->next)
  1384. {
  1385. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  1386. {
  1387. if (block->u.s.down)
  1388. walk_grid_lines(ctx, gd, block->u.s.down->first_block);
  1389. continue;
  1390. }
  1391. else if (block->type == FZ_STEXT_BLOCK_VECTOR)
  1392. {
  1393. fz_rect r;
  1394. float w, h;
  1395. /* Only process rectangle blocks. */
  1396. if ((block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0)
  1397. continue;
  1398. r = fz_collate_small_vector_run(&block);
  1399. r = fz_intersect_rect(r, gd->bounds);
  1400. if (!fz_is_valid_rect(r))
  1401. continue;
  1402. w = r.x1 - r.x0;
  1403. h = r.y1 - r.y0;
  1404. if (w > h && h < 2)
  1405. {
  1406. /* Thin, wide line */
  1407. (void) add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1);
  1408. }
  1409. else if (w < h && w < 2)
  1410. {
  1411. /* Thin, wide line */
  1412. (void) add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1);
  1413. }
  1414. else
  1415. {
  1416. /* Rectangle */
  1417. (void) add_hv_line(ctx, gd, r.x0, r.x1, r.y0, r.y1, block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED);
  1418. }
  1419. }
  1420. }
  1421. }
  1422. static int
  1423. is_numeric(int c)
  1424. {
  1425. return (c >= '0' && c <= '9');
  1426. }
  1427. static int
  1428. mark_cells_for_content(fz_context *ctx, grid_walker_data *gd, fz_rect s)
  1429. {
  1430. fz_rect r = fz_intersect_rect(gd->bounds, s);
  1431. int x0, x1, y0, y1, x, y;
  1432. if (fz_is_empty_rect(r))
  1433. return 0;
  1434. x0 = find_cell_l(gd->xpos, r.x0);
  1435. x1 = find_cell_r(gd->xpos, r.x1);
  1436. y0 = find_cell_l(gd->ypos, r.y0);
  1437. y1 = find_cell_r(gd->ypos, r.y1);
  1438. if (x0 < 0 || x1 < 0 || y0 < 0 || y1 < 0)
  1439. return 1;
  1440. if (x0 < x1)
  1441. {
  1442. for (y = y0; y <= y1; y++)
  1443. for (x = x0; x < x1; x++)
  1444. get_cell(gd->cells, x+1, y)->v_crossed++;
  1445. }
  1446. if (y0 < y1)
  1447. {
  1448. for (y = y0; y < y1; y++)
  1449. for (x = x0; x <= x1; x++)
  1450. get_cell(gd->cells, x, y+1)->h_crossed++;
  1451. }
  1452. for (y = y0; y <= y1; y++)
  1453. for (x = x0; x <= x1; x++)
  1454. get_cell(gd->cells, x, y)->full++;
  1455. return 0;
  1456. }
  1457. #define IN_CELL 0
  1458. #define IN_BORDER 1
  1459. #define IN_UNKNOWN 2
  1460. #define SPLIT_MARGIN 4
  1461. static int
  1462. where_is(fz_stext_grid_positions *pos, float x, int *in)
  1463. {
  1464. int i;
  1465. *in = IN_UNKNOWN;
  1466. /* If the table is empty, nothing to do. */
  1467. if (pos->len == 0)
  1468. return -1;
  1469. /* If we are completely outside the table, give up. */
  1470. if (x <= pos->list[0].pos - SPLIT_MARGIN || x >= pos->list[pos->len-1].max + SPLIT_MARGIN)
  1471. return -1;
  1472. for (i = 0; i < pos->len; i++)
  1473. {
  1474. /* Calculate the region (below..above) that counts as being
  1475. * on the border of position i. */
  1476. float prev = i > 0 ? pos->list[i-1].max : pos->list[0].min;
  1477. float next = i < pos->len-1 ? pos->list[i+1].min : pos->list[i].max;
  1478. float below = pos->list[i].pos - SPLIT_MARGIN;
  1479. float above = pos->list[i].pos + SPLIT_MARGIN;
  1480. /* Find the distance half way back to the previous pos as
  1481. * a limit to our margin. */
  1482. prev = (prev + pos->list[i].pos)/2;
  1483. next = (next + pos->list[i].pos)/2;
  1484. if (below < prev)
  1485. below = prev;
  1486. if (above > next)
  1487. above = next;
  1488. if (x < below)
  1489. {
  1490. *in = IN_CELL;
  1491. return i-1;
  1492. }
  1493. else if (x <= above)
  1494. {
  1495. *in = IN_BORDER;
  1496. return i;
  1497. }
  1498. }
  1499. *in = IN_BORDER;
  1500. return i-1;
  1501. }
  1502. enum
  1503. {
  1504. VECTOR_IS_CONTENT = 0,
  1505. VECTOR_IS_BORDER = 1,
  1506. VECTOR_IS_UNKNOWN = 2,
  1507. VECTOR_IS_IGNORABLE = 3
  1508. };
  1509. /* So a vector can either be a border, or contained
  1510. * in some cells, or something completely else. */
  1511. static int
  1512. classify_vector(fz_context *ctx, grid_walker_data *gd, fz_rect r, int is_rect)
  1513. {
  1514. int at_x0, at_x1, at_y0, at_y1;
  1515. int ix0, ix1, iy0, iy1;
  1516. r = fz_intersect_rect(r, gd->bounds);
  1517. if (fz_is_empty_rect(r))
  1518. return VECTOR_IS_IGNORABLE;
  1519. ix0 = where_is(gd->xpos, r.x0, &at_x0);
  1520. ix1 = where_is(gd->xpos, r.x1, &at_x1);
  1521. iy0 = where_is(gd->ypos, r.y0, &at_y0);
  1522. iy1 = where_is(gd->ypos, r.y1, &at_y1);
  1523. /* No idea, just treat it as a border. */
  1524. if (at_x0 == IN_UNKNOWN || at_x1 == IN_UNKNOWN || at_y0 == IN_UNKNOWN || at_y1 == IN_UNKNOWN)
  1525. return VECTOR_IS_IGNORABLE;
  1526. if (at_x0 == IN_BORDER && at_x1 == IN_BORDER)
  1527. {
  1528. /* Vector is aligned along sides of cells. */
  1529. return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
  1530. }
  1531. if (at_y0 == IN_BORDER && at_y1 == IN_BORDER)
  1532. {
  1533. /* Vector is aligned along sides of cells. */
  1534. return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
  1535. }
  1536. if (at_x0 == IN_CELL && at_x1 == IN_CELL)
  1537. {
  1538. /* Content within a cell (or 1d range of cells). */
  1539. return VECTOR_IS_CONTENT;
  1540. }
  1541. if (at_y0 == IN_CELL && at_y1 == IN_CELL)
  1542. {
  1543. /* Content within a cell (or 1d range of cells). */
  1544. return VECTOR_IS_CONTENT;
  1545. }
  1546. if (at_x0 == IN_BORDER && at_x1 == IN_CELL && ix0 == ix1)
  1547. {
  1548. return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
  1549. }
  1550. if (at_x0 == IN_CELL && at_x1 == IN_BORDER && ix0+1 == ix1)
  1551. {
  1552. return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
  1553. }
  1554. if (at_y0 == IN_BORDER && at_y1 == IN_CELL && iy0 == iy1)
  1555. {
  1556. return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
  1557. }
  1558. if (at_y0 == IN_CELL && at_y1 == IN_BORDER && iy0+1 == iy1)
  1559. {
  1560. return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
  1561. }
  1562. if (is_rect)
  1563. {
  1564. return VECTOR_IS_IGNORABLE;
  1565. }
  1566. /* unknown - take this as indication that this maybe isn't
  1567. * table. */
  1568. return VECTOR_IS_UNKNOWN;
  1569. }
  1570. #undef IN_CELL
  1571. #undef IN_BORDER
  1572. #undef IN_UNKNOWN
  1573. /* Walk through the content, looking at how it spans our grid.
  1574. * Record gridlines, which cells have content that cross into
  1575. * neighbours, and which cells have any content at all.
  1576. * Return a count of vector graphics that are found that don't
  1577. * look plausibly like cell contents. */
  1578. static int
  1579. calculate_spanned_content(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
  1580. {
  1581. int duff = 0;
  1582. fz_rect bounds = {
  1583. gd->xpos->list[0].pos,
  1584. gd->ypos->list[0].pos,
  1585. gd->xpos->list[gd->xpos->len-1].pos,
  1586. gd->ypos->list[gd->ypos->len-1].pos };
  1587. for (; block != NULL; block = block->next)
  1588. {
  1589. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  1590. {
  1591. if (block->u.s.down)
  1592. duff += calculate_spanned_content(ctx, gd, block->u.s.down->first_block);
  1593. continue;
  1594. }
  1595. else if (block->type == FZ_STEXT_BLOCK_VECTOR)
  1596. {
  1597. switch (classify_vector(ctx, gd, block->bbox, !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE)))
  1598. {
  1599. case VECTOR_IS_CONTENT:
  1600. mark_cells_for_content(ctx, gd, block->bbox);
  1601. break;
  1602. case VECTOR_IS_BORDER:
  1603. case VECTOR_IS_IGNORABLE:
  1604. break;
  1605. default:
  1606. duff++;
  1607. break;
  1608. }
  1609. }
  1610. else if (block->type == FZ_STEXT_BLOCK_TEXT)
  1611. {
  1612. fz_stext_line *line;
  1613. if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 ||
  1614. block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0)
  1615. continue;
  1616. for (line = block->u.t.first_line; line != NULL; line = line->next)
  1617. {
  1618. fz_stext_char *ch = line->first_char;
  1619. int was_numeric = 0;
  1620. /* Skip leading spaces */
  1621. while (ch != NULL && ch->c == ' ')
  1622. ch = ch->next;
  1623. for (; ch != NULL; ch = ch->next)
  1624. {
  1625. if (ch->c == 32)
  1626. {
  1627. /* Trailing space, skip it. */
  1628. if (ch->next == NULL)
  1629. break;
  1630. if (ch->next->c == 32)
  1631. {
  1632. /* Run of spaces. Skip 'em. */
  1633. while (ch->next && ch->next->c == 32)
  1634. ch = ch->next;
  1635. was_numeric = 0;
  1636. continue;
  1637. }
  1638. if (was_numeric || is_numeric(ch->next->c))
  1639. {
  1640. /* Single spaces around numbers are ignored. */
  1641. was_numeric = 0;
  1642. continue;
  1643. }
  1644. /* A single space. Accept it. */
  1645. was_numeric = 0;
  1646. }
  1647. else
  1648. was_numeric = is_numeric(ch->c);
  1649. duff += mark_cells_for_content(ctx, gd, fz_rect_from_quad(ch->quad));
  1650. }
  1651. }
  1652. }
  1653. }
  1654. return duff;
  1655. }
  1656. static cells_t *new_cells(fz_context *ctx, int w, int h)
  1657. {
  1658. cells_t *cells = fz_malloc_flexible(ctx, cells_t, cell, w * h);
  1659. cells->w = w;
  1660. cells->h = h;
  1661. return cells;
  1662. }
  1663. #ifdef DEBUG_TABLE_STRUCTURE
  1664. static void
  1665. asciiart_table(grid_walker_data *gd)
  1666. {
  1667. int w = gd->xpos->len;
  1668. int h = gd->ypos->len;
  1669. int x, y;
  1670. for (y = 0; y < h; y++)
  1671. {
  1672. for (x = 0; x < w-1; x++)
  1673. {
  1674. cell_t *cell = get_cell(gd->cells, x, y);
  1675. int line = cell->h_line;
  1676. int erase = cell->h_crossed;
  1677. printf("+");
  1678. if (line && !erase)
  1679. {
  1680. printf("-");
  1681. }
  1682. else if (!line && erase)
  1683. {
  1684. printf("v");
  1685. }
  1686. else if (line && erase)
  1687. {
  1688. printf("*");
  1689. }
  1690. else
  1691. {
  1692. printf(" ");
  1693. }
  1694. }
  1695. printf("+\n");
  1696. if (y == h-1)
  1697. break;
  1698. for (x = 0; x < w; x++)
  1699. {
  1700. cell_t *cell = get_cell(gd->cells, x, y);
  1701. int line = cell->v_line;
  1702. int erase = cell->v_crossed;
  1703. if (line && !erase)
  1704. {
  1705. printf("|");
  1706. }
  1707. else if (!line && erase)
  1708. {
  1709. printf(">");
  1710. }
  1711. else if (line && erase)
  1712. {
  1713. printf("*");
  1714. }
  1715. else
  1716. {
  1717. printf(" ");
  1718. }
  1719. if (x < w-1)
  1720. {
  1721. if (cell->full)
  1722. printf("#");
  1723. else
  1724. printf(" ");
  1725. }
  1726. else
  1727. printf("\n");
  1728. }
  1729. }
  1730. }
  1731. #endif
  1732. static void
  1733. recalc_bbox(fz_stext_block *block)
  1734. {
  1735. fz_rect bbox = fz_empty_rect;
  1736. fz_stext_line *line;
  1737. for (line = block->u.t.first_line; line != NULL; line = line->next)
  1738. bbox = fz_union_rect(bbox, line->bbox);
  1739. block->bbox = bbox;
  1740. }
  1741. static void
  1742. unlink_line_from_block(fz_stext_line *line, fz_stext_block *block)
  1743. {
  1744. fz_stext_line *next_line = line->next;
  1745. if (line->prev)
  1746. {
  1747. assert(line->prev->next == line);
  1748. line->prev->next = next_line;
  1749. }
  1750. else
  1751. {
  1752. assert(block->u.t.first_line == line);
  1753. block->u.t.first_line = next_line;
  1754. }
  1755. if (next_line)
  1756. {
  1757. assert(next_line->prev == line);
  1758. next_line->prev = line->prev;
  1759. }
  1760. else
  1761. {
  1762. assert(block->u.t.last_line == line);
  1763. block->u.t.last_line = line->prev;
  1764. }
  1765. }
  1766. static void
  1767. append_line_to_block(fz_stext_line *line, fz_stext_block *block)
  1768. {
  1769. if (block->u.t.last_line == NULL)
  1770. {
  1771. assert(block->u.t.first_line == NULL);
  1772. block->u.t.first_line = block->u.t.last_line = line;
  1773. line->prev = NULL;
  1774. }
  1775. else
  1776. {
  1777. assert(block->u.t.last_line->next == NULL);
  1778. line->prev = block->u.t.last_line;
  1779. block->u.t.last_line->next = line;
  1780. block->u.t.last_line = line;
  1781. }
  1782. line->next = NULL;
  1783. }
  1784. static void
  1785. unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last)
  1786. {
  1787. if (block->prev)
  1788. {
  1789. assert(block->prev->next == block);
  1790. block->prev->next = block->next;
  1791. }
  1792. else
  1793. {
  1794. assert(*first == block);
  1795. *first = block->next;
  1796. }
  1797. if (block->next)
  1798. {
  1799. assert(block->next->prev == block);
  1800. block->next->prev = block->prev;
  1801. }
  1802. else
  1803. {
  1804. assert(*last == block);
  1805. *last = block->prev;
  1806. }
  1807. }
  1808. #ifndef NDEBUG
  1809. static int
  1810. verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src)
  1811. {
  1812. fz_stext_block *block;
  1813. fz_stext_block **first = src ? &src->first_block : &page->first_block;
  1814. fz_stext_block **last = src ? &src->last_block : &page->last_block;
  1815. int max = 0;
  1816. assert((*first == NULL) == (*last == NULL));
  1817. for (block = *first; block != NULL; block = block->next)
  1818. {
  1819. fz_stext_line *line;
  1820. if (block->prev == NULL)
  1821. assert(*first == block);
  1822. else
  1823. assert(block->prev->next == block);
  1824. if (block->next == NULL)
  1825. assert(*last == block);
  1826. else
  1827. assert(block->next->prev == block);
  1828. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  1829. {
  1830. if (block->u.s.down)
  1831. {
  1832. int m = verify_stext(ctx, page, block->u.s.down);
  1833. if (m > max)
  1834. max = m;
  1835. }
  1836. continue;
  1837. }
  1838. if (block->type != FZ_STEXT_BLOCK_TEXT)
  1839. continue;
  1840. assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL));
  1841. for (line = block->u.t.first_line; line != NULL; line = line->next)
  1842. {
  1843. fz_stext_char *ch;
  1844. if (line->next == NULL)
  1845. assert(block->u.t.last_line == line);
  1846. else
  1847. assert(line->next->prev == line);
  1848. assert((line->first_char == NULL) == (line->last_char == NULL));
  1849. for (ch = line->first_char; ch != NULL; ch = ch->next)
  1850. assert(ch->next != NULL || line->last_char == ch);
  1851. }
  1852. }
  1853. return max+1;
  1854. }
  1855. #endif
  1856. static fz_rect
  1857. move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r)
  1858. {
  1859. fz_stext_block *before = dest ? dest->first_block : page->first_block;
  1860. fz_stext_block **sfirst = src ? &src->first_block : &page->first_block;
  1861. fz_stext_block **slast = src ? &src->last_block : &page->last_block;
  1862. fz_stext_block *block, *next_block;
  1863. for (block = *sfirst; block != NULL; block = next_block)
  1864. {
  1865. fz_rect bbox = fz_intersect_rect(block->bbox, r);
  1866. next_block = block->next;
  1867. /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
  1868. if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1)
  1869. continue; /* Trivially excluded */
  1870. if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1)
  1871. {
  1872. /* Trivially included */
  1873. if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
  1874. {
  1875. if (block->u.s.down->standard == FZ_STRUCTURE_TD)
  1876. {
  1877. /* Don't copy TDs! Just copy the contents */
  1878. move_contained_content(ctx, page, dest, block->u.s.down, r);
  1879. continue;
  1880. }
  1881. }
  1882. unlink_block(block, sfirst, slast);
  1883. insert_block_before(block, before, page, dest);
  1884. assert(before == block->next);
  1885. continue;
  1886. }
  1887. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  1888. {
  1889. if (block->u.s.down)
  1890. move_contained_content(ctx, page, dest, block->u.s.down, r);
  1891. }
  1892. if (block->type == FZ_STEXT_BLOCK_TEXT)
  1893. {
  1894. /* Partially included text block */
  1895. fz_stext_line *line, *next_line;
  1896. fz_stext_block *newblock = NULL;
  1897. for (line = block->u.t.first_line; line != NULL; line = next_line)
  1898. {
  1899. fz_rect lrect = fz_intersect_rect(line->bbox, r);
  1900. next_line = line->next;
  1901. /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
  1902. if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1)
  1903. continue; /* Trivial exclusion */
  1904. if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1)
  1905. {
  1906. /* Trivial inclusion */
  1907. if (newblock == NULL)
  1908. {
  1909. newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
  1910. insert_block_before(newblock, before, page, dest);
  1911. newblock->u.t.flags = block->u.t.flags;
  1912. assert(before == newblock->next);
  1913. }
  1914. unlink_line_from_block(line, block);
  1915. append_line_to_block(line, newblock);
  1916. }
  1917. else
  1918. {
  1919. /* Need to walk the line and just take parts */
  1920. fz_stext_line *newline = NULL;
  1921. fz_stext_char *ch, *next_ch, *prev_ch = NULL;
  1922. for (ch = line->first_char; ch != NULL; ch = next_ch)
  1923. {
  1924. fz_rect crect = fz_rect_from_quad(ch->quad);
  1925. float x = (crect.x0 + crect.x1)/2;
  1926. float y = (crect.y0 + crect.y1)/2;
  1927. next_ch = ch->next;
  1928. if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y)
  1929. {
  1930. prev_ch = ch;
  1931. continue;
  1932. }
  1933. /* Take this char */
  1934. if (newline == NULL)
  1935. {
  1936. newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline));
  1937. newline->dir = line->dir;
  1938. newline->wmode = line->wmode;
  1939. newline->bbox = fz_empty_rect;
  1940. }
  1941. /* Unlink char */
  1942. if (prev_ch == NULL)
  1943. line->first_char = next_ch;
  1944. else
  1945. prev_ch->next = next_ch;
  1946. if (next_ch == NULL)
  1947. line->last_char = prev_ch;
  1948. /* Relink char */
  1949. ch->next = NULL;
  1950. if (newline->last_char == NULL)
  1951. newline->first_char = ch;
  1952. else
  1953. newline->last_char->next = ch;
  1954. newline->last_char = ch;
  1955. newline->bbox = fz_union_rect(newline->bbox, crect);
  1956. }
  1957. if (line->first_char == NULL)
  1958. {
  1959. /* We've removed all the chars from this line.
  1960. * Better remove the line too. */
  1961. if (line->prev)
  1962. line->prev->next = next_line;
  1963. else
  1964. block->u.t.first_line = next_line;
  1965. if (next_line)
  1966. next_line->prev = line->prev;
  1967. else
  1968. block->u.t.last_line = line->prev;
  1969. }
  1970. if (newline)
  1971. {
  1972. if (newblock == NULL)
  1973. {
  1974. newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
  1975. /* Add the block onto our target list */
  1976. insert_block_before(newblock, before, page, dest);
  1977. newblock->u.t.flags = block->u.t.flags;
  1978. before = newblock->next;
  1979. }
  1980. append_line_to_block(newline, newblock);
  1981. }
  1982. }
  1983. }
  1984. if (newblock)
  1985. {
  1986. recalc_bbox(block);
  1987. recalc_bbox(newblock);
  1988. }
  1989. if (block->u.t.first_line == NULL)
  1990. {
  1991. /* We've removed all the lines from the block. Should remove that too! */
  1992. if (block->prev)
  1993. {
  1994. assert(block->prev->next == block);
  1995. block->prev->next = next_block;
  1996. }
  1997. else
  1998. {
  1999. assert(*sfirst == block);
  2000. *sfirst = block->next;
  2001. }
  2002. if (next_block)
  2003. {
  2004. assert(next_block->prev == block);
  2005. next_block->prev = block->prev;
  2006. }
  2007. else
  2008. {
  2009. assert(*slast == block);
  2010. *slast = block->prev;
  2011. }
  2012. }
  2013. }
  2014. }
  2015. return r;
  2016. }
  2017. typedef struct
  2018. {
  2019. fz_stext_block *block;
  2020. fz_stext_struct *parent;
  2021. } tree_pos;
  2022. /* This is still not perfect, but it's better! */
  2023. static tree_pos
  2024. find_table_insertion_point(fz_context *ctx, fz_rect r, tree_pos current, tree_pos best)
  2025. {
  2026. fz_stext_block *block;
  2027. for (block = current.block; block != NULL; block = block->next)
  2028. {
  2029. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  2030. {
  2031. if (block->u.s.down != NULL && fz_is_rect_inside_rect(r, block->bbox))
  2032. {
  2033. tree_pos down;
  2034. down.parent = block->u.s.down;
  2035. down.block = block->u.s.down->first_block;
  2036. best = find_table_insertion_point(ctx, r, down, best);
  2037. }
  2038. }
  2039. else
  2040. {
  2041. /* Is block a better precursor than best? (Or a valid precursor, if best.block == NULL) */
  2042. if (block->bbox.y1 < r.y0 && (best.block == NULL || best.block->bbox.y1 < block->bbox.y1))
  2043. {
  2044. best.block = block;
  2045. best.parent = current.parent;
  2046. }
  2047. }
  2048. }
  2049. return best;
  2050. }
  2051. static int
  2052. tr_is_empty(fz_context *ctx, fz_stext_block *block)
  2053. {
  2054. for (; block != NULL; block = block->next)
  2055. {
  2056. if (block->type != FZ_STEXT_BLOCK_STRUCT)
  2057. return 0;
  2058. if (!block->u.s.down)
  2059. {
  2060. }
  2061. else if (block->u.s.down->standard != FZ_STRUCTURE_TD)
  2062. return 0;
  2063. else if (block->u.s.down->first_block != NULL)
  2064. return 0;
  2065. }
  2066. return 1;
  2067. }
  2068. static int
  2069. table_is_empty(fz_context *ctx, fz_stext_block *block)
  2070. {
  2071. for (; block != NULL; block = block->next)
  2072. {
  2073. if (block->type == FZ_STEXT_BLOCK_GRID)
  2074. continue;
  2075. if (block->type != FZ_STEXT_BLOCK_STRUCT)
  2076. return 0;
  2077. if (!block->u.s.down)
  2078. {
  2079. }
  2080. else if (block->u.s.down->standard != FZ_STRUCTURE_TR)
  2081. return 0;
  2082. else if (!tr_is_empty(ctx, block->u.s.down->first_block))
  2083. return 0;
  2084. }
  2085. return 1;
  2086. }
  2087. static void
  2088. tidy_td_divs(fz_context *ctx, fz_stext_struct *parent)
  2089. {
  2090. while (1)
  2091. {
  2092. fz_stext_block *block = parent->first_block;
  2093. if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV)
  2094. return;
  2095. parent->first_block = block->u.s.down->first_block;
  2096. parent->last_block = block->u.s.down->last_block;
  2097. }
  2098. }
  2099. static void
  2100. tidy_tr_divs(fz_context *ctx, fz_stext_block *block)
  2101. {
  2102. for (; block != NULL; block = block->next)
  2103. {
  2104. if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD)
  2105. tidy_td_divs(ctx, block->u.s.down);
  2106. }
  2107. }
  2108. static void
  2109. tidy_table_divs(fz_context *ctx, fz_stext_block *block)
  2110. {
  2111. for (; block != NULL; block = block->next)
  2112. {
  2113. if (block->type == FZ_STEXT_BLOCK_GRID)
  2114. continue;
  2115. if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR)
  2116. tidy_tr_divs(ctx, block->u.s.down->first_block);
  2117. }
  2118. }
  2119. static int
  2120. struct_is_empty(fz_context *ctx, fz_stext_block *block)
  2121. {
  2122. for (; block != NULL; block = block->next)
  2123. {
  2124. if (block->type != FZ_STEXT_BLOCK_STRUCT)
  2125. return 0;
  2126. if (!block->u.s.down)
  2127. {
  2128. }
  2129. else if (!struct_is_empty(ctx, block->u.s.down->first_block))
  2130. return 0;
  2131. }
  2132. return 1;
  2133. }
  2134. static int
  2135. div_is_empty(fz_context *ctx, fz_stext_block *block)
  2136. {
  2137. for (; block != NULL; block = block->next)
  2138. {
  2139. if (block->type != FZ_STEXT_BLOCK_STRUCT)
  2140. return 0;
  2141. if (!block->u.s.down)
  2142. {
  2143. }
  2144. else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
  2145. {
  2146. tidy_table_divs(ctx, block->u.s.down->first_block);
  2147. return table_is_empty(ctx, block->u.s.down->first_block);
  2148. }
  2149. else if (block->u.s.down->standard != FZ_STRUCTURE_DIV)
  2150. {
  2151. if (!struct_is_empty(ctx, block->u.s.down->first_block))
  2152. return 0;
  2153. }
  2154. else if (!div_is_empty(ctx, block->u.s.down->first_block))
  2155. return 0;
  2156. }
  2157. return 1;
  2158. }
  2159. static void
  2160. tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
  2161. {
  2162. fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block;
  2163. fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block;
  2164. fz_stext_block *block, *next_block;
  2165. for (block = *first_blockp; block != NULL; block = next_block)
  2166. {
  2167. next_block = block->next;
  2168. if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
  2169. {
  2170. if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
  2171. {
  2172. tidy_table_divs(ctx, block->u.s.down->first_block);
  2173. if (table_is_empty(ctx, block->u.s.down->first_block))
  2174. {
  2175. /* Remove block */
  2176. if (block->prev)
  2177. {
  2178. assert(block->prev->next == block);
  2179. block->prev->next = next_block;
  2180. }
  2181. else
  2182. {
  2183. assert(*first_blockp == block);
  2184. *first_blockp = next_block;
  2185. }
  2186. if (next_block)
  2187. {
  2188. assert(next_block->prev == block);
  2189. next_block->prev = block->prev;
  2190. }
  2191. else
  2192. {
  2193. assert(*last_blockp == block);
  2194. *last_blockp = block->prev;
  2195. }
  2196. }
  2197. else
  2198. {
  2199. tidy_orphaned_tables(ctx, page, block->u.s.down);
  2200. }
  2201. }
  2202. if (block->u.s.down->standard == FZ_STRUCTURE_DIV)
  2203. {
  2204. tidy_orphaned_tables(ctx, page, block->u.s.down);
  2205. if (div_is_empty(ctx, block->u.s.down->first_block))
  2206. {
  2207. /* Remove block */
  2208. if (block->prev)
  2209. {
  2210. assert(block->prev->next == block);
  2211. block->prev->next = next_block;
  2212. }
  2213. else
  2214. {
  2215. assert(*first_blockp == block);
  2216. *first_blockp = next_block;
  2217. }
  2218. if (next_block)
  2219. {
  2220. assert(next_block->prev == block);
  2221. next_block->prev = block->prev;
  2222. }
  2223. else
  2224. {
  2225. assert(*last_blockp == block);
  2226. *last_blockp = block->prev;
  2227. }
  2228. }
  2229. }
  2230. }
  2231. }
  2232. }
  2233. static fz_stext_struct *
  2234. transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent)
  2235. {
  2236. int w = gd->xpos->len;
  2237. int h = gd->ypos->len;
  2238. int x, y;
  2239. char *sent_tab = fz_calloc(ctx, 1, w*(size_t)h);
  2240. fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
  2241. fz_stext_struct *table, *tr, *td;
  2242. fz_rect r;
  2243. tree_pos insert, top;
  2244. /* Where should we insert the table in the data? */
  2245. r.x0 = gd->xpos->list[0].pos;
  2246. r.x1 = gd->xpos->list[w-1].pos;
  2247. r.y0 = gd->ypos->list[0].pos;
  2248. r.y1 = gd->ypos->list[h-1].pos;
  2249. #ifdef DEBUG_TABLE_STRUCTURE
  2250. fz_print_stext_page_as_xml(ctx, fz_stddbg(ctx), page, 0);
  2251. #endif
  2252. top.block = *first_block;
  2253. top.parent = parent;
  2254. insert.block = NULL;
  2255. insert.parent = NULL;
  2256. insert = find_table_insertion_point(ctx, r, top, insert);
  2257. /* Convert to 'before' */
  2258. if (insert.block)
  2259. insert.block = insert.block->next;
  2260. /* Make table */
  2261. table = add_struct_block_before(ctx, insert.block, page, insert.parent, FZ_STRUCTURE_TABLE, "Table");
  2262. /* Run through the cells, and guess at spanning. */
  2263. for (y = 0; y < h-1; y++)
  2264. {
  2265. /* Have we sent this entire row before? */
  2266. for (x = 0; x < w-1; x++)
  2267. {
  2268. if (!sent_tab[x+y*w])
  2269. break;
  2270. }
  2271. if (x == w-1)
  2272. continue; /* No point in sending a row with nothing in it! */
  2273. /* Make TR */
  2274. tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR");
  2275. for (x = 0; x < w-1; x++)
  2276. {
  2277. int x2, y2;
  2278. int cellw = 1;
  2279. int cellh = 1;
  2280. /* Have we sent this cell already? */
  2281. if (sent_tab[x+y*w])
  2282. continue;
  2283. /* Find the width of the cell */
  2284. for (x2 = x+1; x2 < w-1; x2++)
  2285. {
  2286. cell_t *cell = get_cell(gd->cells, x2, y);
  2287. if (cell->v_line)
  2288. break; /* Can't go past a line */
  2289. if (gd->xpos->list[x2].uncertainty == 0)
  2290. break; /* An uncertainty of 0 is as good as a line. */
  2291. if (!cell->v_crossed)
  2292. break;
  2293. cellw++;
  2294. }
  2295. /* Find the height of the cell */
  2296. for (y2 = y+1; y2 < h-1; y2++)
  2297. {
  2298. cell_t *cell;
  2299. int h_crossed = 0;
  2300. if (gd->ypos->list[y2].uncertainty == 0)
  2301. break; /* An uncertainty of 0 is as good as a line. */
  2302. cell = get_cell(gd->cells, x, y2);
  2303. if (cell->h_line)
  2304. break; /* Can't extend down through a line. */
  2305. if (cell->h_crossed)
  2306. h_crossed = 1;
  2307. for (x2 = x+1; x2 < x+cellw; x2++)
  2308. {
  2309. cell = get_cell(gd->cells, x2, y2);
  2310. if (cell->h_line)
  2311. break;
  2312. if (cell->v_line)
  2313. break; /* Can't go past a line */
  2314. if (gd->xpos->list[x2].uncertainty == 0)
  2315. break; /* An uncertainty of 0 is as good as a line. */
  2316. if (!cell->v_crossed)
  2317. break;
  2318. if (cell->h_crossed)
  2319. h_crossed = 1;
  2320. }
  2321. if (x2 == x+cellw && h_crossed)
  2322. cellh++;
  2323. else
  2324. break;
  2325. }
  2326. /* Make TD */
  2327. td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD");
  2328. r.x0 = gd->xpos->list[x].pos;
  2329. r.x1 = gd->xpos->list[x+cellw].pos;
  2330. r.y0 = gd->ypos->list[y].pos;
  2331. r.y1 = gd->ypos->list[y+cellh].pos;
  2332. /* Use r, not REAL contents bbox, as otherwise spanned rows
  2333. * can end up empty. */
  2334. td->up->bbox = r;
  2335. move_contained_content(ctx, page, td, parent, r);
  2336. #ifdef DEBUG_TABLE_STRUCTURE
  2337. printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh);
  2338. #endif
  2339. for (y2 = y; y2 < y+cellh; y2++)
  2340. for (x2 = x; x2 < x+cellw; x2++)
  2341. sent_tab[x2+y2*w] = 1;
  2342. }
  2343. r.x0 = gd->xpos->list[0].pos;
  2344. r.x1 = gd->xpos->list[gd->xpos->len-1].pos;
  2345. r.y0 = gd->ypos->list[y].pos;
  2346. r.y1 = gd->ypos->list[y+1].pos;
  2347. tr->up->bbox = r;
  2348. table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox);
  2349. }
  2350. fz_free(ctx, sent_tab);
  2351. {
  2352. fz_stext_block *block;
  2353. fz_stext_grid_positions *xps2 = copy_grid_positions_to_pool(ctx, page, gd->xpos);
  2354. fz_stext_grid_positions *yps2 = copy_grid_positions_to_pool(ctx, page, gd->ypos);
  2355. block = add_grid_block(ctx, page, &table->first_block, &table->last_block);
  2356. block->u.b.xs = xps2;
  2357. block->u.b.ys = yps2;
  2358. block->bbox.x0 = block->u.b.xs->list[0].pos;
  2359. block->bbox.y0 = block->u.b.ys->list[0].pos;
  2360. block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos;
  2361. block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos;
  2362. }
  2363. tidy_orphaned_tables(ctx, page, parent);
  2364. return table;
  2365. }
  2366. static void
  2367. merge_column(grid_walker_data *gd, int x)
  2368. {
  2369. int y;
  2370. for (y = 0; y < gd->cells->h; y++)
  2371. {
  2372. cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)];
  2373. cell_t *s = &gd->cells->cell[x + y * gd->cells->w];
  2374. if (x > 0)
  2375. memcpy(d-x, s-x, sizeof(*d) * x);
  2376. d->full = s[0].full || s[1].full;
  2377. d->h_crossed = s[0].h_crossed || s[1].h_crossed;
  2378. d->h_line = s[0].h_line; /* == s[1].h_line */
  2379. d->v_crossed = s[0].v_crossed;
  2380. d->v_line = s[0].v_line;
  2381. if (x < gd->cells->w - 2)
  2382. memcpy(d+1, s+2, sizeof(*d) * (gd->cells->w - 2 - x));
  2383. }
  2384. gd->cells->w--;
  2385. if (x < gd->xpos->len - 2)
  2386. memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], sizeof(gd->xpos->list[0]) * (gd->xpos->len - 2 - x));
  2387. gd->xpos->len--;
  2388. }
  2389. static void
  2390. merge_columns(grid_walker_data *gd)
  2391. {
  2392. int x, y;
  2393. for (x = gd->cells->w-3; x >= 0; x--)
  2394. {
  2395. /* Can column x be merged with column x+1? */
  2396. /* An empty column can certainly be merged if the h_lines are the same,
  2397. * and there is no v_line. */
  2398. for (y = 0; y < gd->cells->h-1; y++)
  2399. {
  2400. cell_t *a = get_cell(gd->cells, x, y);
  2401. cell_t *b = get_cell(gd->cells, x+1, y);
  2402. if (a->full || !!a->h_line != !!b->h_line || b->v_line)
  2403. break;
  2404. }
  2405. if (y == gd->cells->h-1)
  2406. goto merge_column;
  2407. /* An empty column can certainly be merged if the h_lines are the same. */
  2408. for (y = 0; y < gd->cells->h-1; y++)
  2409. {
  2410. cell_t *a = get_cell(gd->cells, x, y);
  2411. cell_t *b = get_cell(gd->cells, x+1, y);
  2412. if (b->full || !!a->h_line != !!b->h_line || b->v_line)
  2413. break;
  2414. }
  2415. if (y == gd->cells->h-1)
  2416. goto merge_column;
  2417. /* We only ever want to merge columns if content crossed between them somewhere.
  2418. * Don't use uncertainty for this, because uncertainty doesn't allow for
  2419. * whitespace. */
  2420. for (y = 0; y < gd->cells->h-1; y++)
  2421. if (get_cell(gd->cells, x+1, y)->v_crossed == 1)
  2422. break;
  2423. if (y == gd->cells->h-1)
  2424. continue;
  2425. /* This requires all the pairs of cells in those 2 columns to be mergeable. */
  2426. for (y = 0; y < gd->cells->h-1; y++)
  2427. {
  2428. cell_t *a = get_cell(gd->cells, x, y);
  2429. cell_t *b = get_cell(gd->cells, x+1, y);
  2430. /* If there is a divider, we can't merge. */
  2431. if (b->v_line)
  2432. break;
  2433. /* If either is empty, we can merge. */
  2434. if (!a->full || !b->full)
  2435. continue;
  2436. /* If we differ in h linedness, we can't merge */
  2437. if (!!a->h_line != !!b->h_line)
  2438. break;
  2439. /* If both are full, we can only merge if we cross. */
  2440. if (a->full && b->full && b->v_crossed)
  2441. continue;
  2442. /* Otherwise we can't merge */
  2443. break;
  2444. }
  2445. if (y == gd->cells->h-1)
  2446. {
  2447. /* Merge the column! */
  2448. merge_column:
  2449. #ifdef DEBUG_TABLE_STRUCTURE
  2450. printf("Merging column %d\n", x);
  2451. #endif
  2452. merge_column(gd, x);
  2453. #ifdef DEBUG_TABLE_STRUCTURE
  2454. asciiart_table(gd);
  2455. #endif
  2456. }
  2457. }
  2458. }
  2459. static void
  2460. merge_row(grid_walker_data *gd, int y)
  2461. {
  2462. int x;
  2463. int w = gd->cells->w;
  2464. cell_t *d = &gd->cells->cell[y * w];
  2465. for (x = 0; x < gd->cells->w-1; x++)
  2466. {
  2467. if (d->full == 0)
  2468. d->full = d[w].full;
  2469. if (d->v_crossed == 0)
  2470. d->v_crossed = d[w].v_crossed;
  2471. d++;
  2472. }
  2473. if (y < gd->cells->h - 2)
  2474. memcpy(d, d+w, sizeof(*d) * (gd->cells->h - 2 - y) * w);
  2475. gd->cells->h--;
  2476. if (y < gd->ypos->len - 2)
  2477. memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], sizeof(gd->ypos->list[0]) * (gd->ypos->len - 2 - y));
  2478. gd->ypos->len--;
  2479. }
  2480. static void
  2481. merge_rows(grid_walker_data *gd)
  2482. {
  2483. int x, y;
  2484. for (y = gd->cells->h-3; y >= 0; y--)
  2485. {
  2486. /* Can row y be merged with row y+1? */
  2487. /* An empty row can certainly be merged if the v_lines are the same,
  2488. * and there is no h_line. */
  2489. for (x = 0; x < gd->cells->w-1; x++)
  2490. {
  2491. cell_t *a = get_cell(gd->cells, x, y);
  2492. cell_t *b = get_cell(gd->cells, x, y+1);
  2493. if (a->full || !!a->v_line != !!b->v_line || b->h_line)
  2494. break;
  2495. }
  2496. if (x == gd->cells->w-1)
  2497. goto merge_row;
  2498. /* An empty row can certainly be merged if the v_lines are the same. */
  2499. for (x = 0; x < gd->cells->w-1; x++)
  2500. {
  2501. cell_t *a = get_cell(gd->cells, x, y);
  2502. cell_t *b = get_cell(gd->cells, x, y+1);
  2503. if (b->full || !!a->v_line != !!b->v_line || b->h_line)
  2504. break;
  2505. }
  2506. if (x == gd->cells->w-1)
  2507. goto merge_row;
  2508. /* We only ever want to merge rows if content crossed between them somewhere.
  2509. * Don't use uncertainty for this, because uncertainty doesn't allow for
  2510. * whitespace. */
  2511. for (x = 0; x < gd->cells->w-1; x++)
  2512. if (get_cell(gd->cells, x, y+1)->h_crossed == 1)
  2513. break;
  2514. if (x == gd->cells->w-1)
  2515. continue;
  2516. /* This requires all the pairs of cells in those 2 rows to be mergeable. */
  2517. for (x = 0; x < gd->cells->w-1; x++)
  2518. {
  2519. cell_t *a = get_cell(gd->cells, x, y);
  2520. cell_t *b = get_cell(gd->cells, x, y+1);
  2521. /* If there is a divider, we can't merge. */
  2522. if (b->h_line)
  2523. goto cant_merge;
  2524. /* If either is empty, we can merge. */
  2525. if (!a->full || !b->full)
  2526. continue;
  2527. /* If we differ in v linedness, we can't merge */
  2528. if (!!a->v_line != !!b->v_line)
  2529. goto cant_merge;
  2530. /* If both are full, we can only merge if we cross. */
  2531. if (a->full && b->full && b->h_crossed)
  2532. continue;
  2533. /* Otherwise we can't merge */
  2534. break;
  2535. }
  2536. if (x == gd->cells->w-1)
  2537. {
  2538. /* Merge the row! */
  2539. merge_row:
  2540. #ifdef DEBUG_TABLE_STRUCTURE
  2541. printf("Merging row %d\n", y);
  2542. #endif
  2543. merge_row(gd, y);
  2544. #ifdef DEBUG_TABLE_STRUCTURE
  2545. asciiart_table(gd);
  2546. #endif
  2547. continue;
  2548. }
  2549. /* OK, so we failed to be able to merge y and y+1. But if we get here, we
  2550. * know this wasn't because of any lines being in the way. So we can try
  2551. * a different set of criteria. If every non-empty cell in line y+1 is either
  2552. * into from line y, or crosses into line y+2, then it's probably a 'straddling'
  2553. * line. Just remove it. */
  2554. if (y+2 >= gd->cells->h)
  2555. continue;
  2556. for (x = 0; x < gd->cells->w-1; x++)
  2557. {
  2558. cell_t *b = get_cell(gd->cells, x, y+1);
  2559. cell_t *c = get_cell(gd->cells, x, y+2);
  2560. if (!b->full)
  2561. continue;
  2562. if (!b->h_crossed && !c->h_crossed)
  2563. break;
  2564. }
  2565. if (x == gd->cells->w-1)
  2566. {
  2567. /* Merge the row! */
  2568. #ifdef DEBUG_TABLE_STRUCTURE
  2569. printf("Merging row %d (case 2)\n", y);
  2570. #endif
  2571. merge_row(gd, y);
  2572. #ifdef DEBUG_TABLE_STRUCTURE
  2573. asciiart_table(gd);
  2574. #endif
  2575. }
  2576. cant_merge:
  2577. {}
  2578. }
  2579. }
  2580. static int
  2581. remove_bordered_empty_cells(grid_walker_data *gd)
  2582. {
  2583. int x, y, x1, y1, x2, y2;
  2584. int changed = 0;
  2585. /* We are looking for a region of cells that's bordered,
  2586. * and empty, and where the borders don't extend...
  2587. *
  2588. * ' '
  2589. * ' '
  2590. * . . .'____'. . .
  2591. * | |
  2592. * | |
  2593. * . . .|____|. . .
  2594. * ' '
  2595. * ' '
  2596. * ' '
  2597. */
  2598. for (y = 0; y < gd->cells->h-1; y++)
  2599. {
  2600. for (x = 0; x < gd->cells->w-1; x++)
  2601. {
  2602. cell_t *u = y > 0 ? get_cell(gd->cells, x, y-1) : NULL;
  2603. cell_t *l = x > 0 ? get_cell(gd->cells, x-1, y) : NULL;
  2604. cell_t *c = get_cell(gd->cells, x, y);
  2605. if (c->full)
  2606. continue;
  2607. if (!c->h_line || !c->v_line)
  2608. continue;
  2609. if (l != NULL && l->h_line)
  2610. continue;
  2611. if (u != NULL && u->v_line)
  2612. continue;
  2613. /* So c (x, y) is a reasonable top left of the bounded region. */
  2614. for (x1 = x+1; x1 < gd->cells->w; x1++)
  2615. {
  2616. u = y > 0 ? get_cell(gd->cells, x1, y-1) : NULL;
  2617. c = get_cell(gd->cells, x1, y);
  2618. if (u != NULL && u->v_line)
  2619. goto bad_region;
  2620. if (c->h_line && !c->v_line)
  2621. continue;
  2622. if (c->h_line || !c->v_line)
  2623. goto bad_region;
  2624. break;
  2625. }
  2626. if (x1 == gd->cells->w)
  2627. goto bad_region;
  2628. /* If we get here, then we have the top row of a bounded region
  2629. * that runs from (x,y) to (x1-1, y). So, can we extend that region
  2630. * downwards? */
  2631. for (y1 = y+1; y1 < gd->cells->h; y1++)
  2632. {
  2633. c = get_cell(gd->cells, x, y1);
  2634. if (!c->h_line && c->v_line)
  2635. {
  2636. /* This could be the start of a line. */
  2637. for (x2 = x+1; x2 < x1; x2++)
  2638. {
  2639. c = get_cell(gd->cells, x2, y1);
  2640. if (c->full || c->h_line || c->v_line)
  2641. goto bad_region;
  2642. }
  2643. c = get_cell(gd->cells, x1, y1);
  2644. if (c->h_line || !c->v_line)
  2645. goto bad_region;
  2646. /* That line was fine! Region is extended. */
  2647. }
  2648. else if (c->h_line && !c->v_line)
  2649. {
  2650. /* This might be the bottom line of the bounded region. */
  2651. for (x2 = x+1; x2 < x1; x2++)
  2652. {
  2653. c = get_cell(gd->cells, x2, y1);
  2654. if (!c->h_line || c->v_line)
  2655. goto bad_region;
  2656. }
  2657. c = get_cell(gd->cells, x1, y1);
  2658. if (c->h_line || c->v_line)
  2659. goto bad_region;
  2660. break;
  2661. }
  2662. else
  2663. goto bad_region;
  2664. }
  2665. if (y1 == gd->cells->h)
  2666. goto bad_region;
  2667. /* So we have a bounded region from x,y to x1-1,y1-1 */
  2668. for (y2 = y; y2 < y1; y2++)
  2669. {
  2670. for (x2 = x; x2 < x1; x2++)
  2671. {
  2672. c = get_cell(gd->cells, x2, y2);
  2673. c->h_line = 0;
  2674. c->v_line = 0;
  2675. c->full = 1;
  2676. if (x2 > x)
  2677. c->v_crossed = 1;
  2678. if (y2 > y)
  2679. c->h_crossed = 1;
  2680. }
  2681. c = get_cell(gd->cells, x2, y2);
  2682. c->v_line = 0;
  2683. }
  2684. for (x2 = x; x2 < x1; x2++)
  2685. get_cell(gd->cells, x2, y2)->h_line = 0;
  2686. changed = 1;
  2687. bad_region:
  2688. {}
  2689. }
  2690. }
  2691. #ifdef DEBUG_TABLE_STRUCTURE
  2692. if (changed)
  2693. {
  2694. printf("Simplified empty bounded cells to give:\n");
  2695. asciiart_table(gd);
  2696. }
  2697. #endif
  2698. return changed;
  2699. }
  2700. static int
  2701. find_table_within_bounds(fz_context *ctx, grid_walker_data *gd, fz_stext_block *content, fz_rect bounds)
  2702. {
  2703. div_list xs = { 0 };
  2704. div_list ys = { 0 };
  2705. int failed = 1;
  2706. fz_try(ctx)
  2707. {
  2708. walk_to_find_content(ctx, &xs, &ys, content, bounds);
  2709. sanitize_positions(ctx, &xs);
  2710. sanitize_positions(ctx, &ys);
  2711. /* Run across the line, counting 'winding' */
  2712. /* If we don't have at least 2 rows and 2 columns, give up. */
  2713. if (xs.len <= 2 || ys.len <= 2)
  2714. break;
  2715. gd->xpos = make_table_positions(ctx, &xs, bounds.x0, bounds.x1);
  2716. gd->ypos = make_table_positions(ctx, &ys, bounds.y0, bounds.y1);
  2717. gd->cells = new_cells(ctx, gd->xpos->len, gd->ypos->len);
  2718. /* Walk the content looking for grid lines. These
  2719. * lines refine our positions. */
  2720. walk_grid_lines(ctx, gd, content);
  2721. /* Now, we walk the content looking for content that crosses
  2722. * these grid lines. This allows us to spot spanned cells. */
  2723. if (calculate_spanned_content(ctx, gd, content))
  2724. break; /* Unlikely to be a table. */
  2725. #ifdef DEBUG_TABLE_STRUCTURE
  2726. asciiart_table(gd);
  2727. #endif
  2728. /* Now, can we remove some columns or rows? i.e. have we oversegmented? */
  2729. do
  2730. {
  2731. merge_columns(gd);
  2732. merge_rows(gd);
  2733. }
  2734. while (remove_bordered_empty_cells(gd));
  2735. /* Did we shrink the table so much it's not a table any more? */
  2736. if (gd->xpos->len <= 2 || gd->ypos->len <= 2)
  2737. break;
  2738. failed = 0;
  2739. }
  2740. fz_always(ctx)
  2741. {
  2742. fz_free(ctx, xs.list);
  2743. fz_free(ctx, ys.list);
  2744. }
  2745. fz_catch(ctx)
  2746. {
  2747. fz_rethrow(ctx);
  2748. }
  2749. return failed;
  2750. }
  2751. static int
  2752. do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
  2753. {
  2754. fz_stext_block *block;
  2755. int count;
  2756. fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
  2757. int num_subtables = 0;
  2758. grid_walker_data gd = { 0 };
  2759. /* No content? Just bale. */
  2760. if (*first_block == NULL)
  2761. return 0;
  2762. /* If all the content here looks like a column of text, don't
  2763. * hunt for a table within it. */
  2764. if (all_blocks_are_justified_or_headers(ctx, *first_block))
  2765. return num_subtables;
  2766. gd.bounds = fz_infinite_rect;
  2767. /* First off, descend into any children to see if those look like tables. */
  2768. count = 0;
  2769. for (block = *first_block; block != NULL; block = block->next)
  2770. {
  2771. if (block->type == FZ_STEXT_BLOCK_STRUCT)
  2772. {
  2773. if (block->u.s.down)
  2774. {
  2775. num_subtables += do_table_hunt(ctx, page, block->u.s.down);
  2776. count++;
  2777. }
  2778. }
  2779. else if (block->type == FZ_STEXT_BLOCK_TEXT)
  2780. count++;
  2781. }
  2782. /* If we don't have at least a single child, no more to hunt. */
  2783. /* We only need a single block, because a single text block can
  2784. * contain an entire unbordered table. */
  2785. if (count < 1)
  2786. return num_subtables;
  2787. /* We only look for a table at this level if we either at the top
  2788. * or on a div. This saves us looking for tables within an 'H'
  2789. * for example. */
  2790. if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV)
  2791. return num_subtables;
  2792. fz_var(gd);
  2793. fz_try(ctx)
  2794. {
  2795. /* Now see whether the content looks like tables. */
  2796. fz_rect bounds = walk_to_find_bounds(ctx, *first_block);
  2797. if (find_table_within_bounds(ctx, &gd, *first_block, bounds))
  2798. break;
  2799. if (num_subtables > 0)
  2800. {
  2801. /* We are risking throwing away a table we've already found for this
  2802. * one. Only do it if this one is really convincing. */
  2803. int x, y;
  2804. for (x = 0; x < gd.xpos->len; x++)
  2805. if (gd.xpos->list[x].uncertainty != 0)
  2806. break;
  2807. if (x != gd.xpos->len)
  2808. break;
  2809. for (y = 0; y < gd.ypos->len; y++)
  2810. if (gd.ypos->list[y].uncertainty != 0)
  2811. break;
  2812. if (y != gd.ypos->len)
  2813. break;
  2814. }
  2815. #ifdef DEBUG_TABLE_STRUCTURE
  2816. printf("Transcribing table: (%g,%g)->(%g,%g)\n",
  2817. gd.xpos->list[0].pos,
  2818. gd.ypos->list[0].pos,
  2819. gd.xpos->list[gd.xpos->len-1].pos,
  2820. gd.ypos->list[gd.ypos->len-1].pos);
  2821. #endif
  2822. /* Now we should have the entire table calculated. */
  2823. (void)transcribe_table(ctx, &gd, page, parent);
  2824. num_subtables = 1;
  2825. #ifdef DEBUG_WRITE_AS_PS
  2826. {
  2827. int i;
  2828. printf("%% TABLE\n");
  2829. for (i = 0; i < block->u.b.xs->len; i++)
  2830. {
  2831. if (block->u.b.xs->list[i].uncertainty)
  2832. printf("0 1 0 setrgbcolor\n");
  2833. else
  2834. printf("0 0.5 0 setrgbcolor\n");
  2835. printf("%g %g moveto %g %g lineto stroke\n",
  2836. block->u.b.xs->list[i].pos, block->bbox.y0,
  2837. block->u.b.xs->list[i].pos, block->bbox.y1);
  2838. }
  2839. for (i = 0; i < block->u.b.ys->len; i++)
  2840. {
  2841. if (block->u.b.ys->list[i].uncertainty)
  2842. printf("0 1 0 setrgbcolor\n");
  2843. else
  2844. printf("0 0.5 0 setrgbcolor\n");
  2845. printf("%g %g moveto %g %g lineto stroke\n",
  2846. block->bbox.x0, block->u.b.ys->list[i].pos,
  2847. block->bbox.x1, block->u.b.ys->list[i].pos);
  2848. }
  2849. }
  2850. #endif
  2851. }
  2852. fz_always(ctx)
  2853. {
  2854. fz_free(ctx, gd.xpos);
  2855. fz_free(ctx, gd.ypos);
  2856. fz_free(ctx, gd.cells);
  2857. }
  2858. fz_catch(ctx)
  2859. fz_rethrow(ctx);
  2860. return num_subtables;
  2861. }
  2862. void
  2863. fz_table_hunt(fz_context *ctx, fz_stext_page *page)
  2864. {
  2865. if (page == NULL)
  2866. return;
  2867. assert(verify_stext(ctx, page, NULL));
  2868. do_table_hunt(ctx, page, NULL);
  2869. assert(verify_stext(ctx, page, NULL));
  2870. }
  2871. fz_stext_block *
  2872. fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds)
  2873. {
  2874. fz_stext_struct *table = NULL;
  2875. grid_walker_data gd = { 0 };
  2876. /* No content? Just bale. */
  2877. if (page == NULL || page->first_block == NULL)
  2878. return NULL;
  2879. fz_var(gd);
  2880. fz_try(ctx)
  2881. {
  2882. gd.bounds = bounds;
  2883. if (find_table_within_bounds(ctx, &gd, page->first_block, bounds))
  2884. break;
  2885. #ifdef DEBUG_TABLE_STRUCTURE
  2886. printf("Transcribing table: (%g,%g)->(%g,%g)\n",
  2887. gd.xpos->list[0].pos,
  2888. gd.ypos->list[0].pos,
  2889. gd.xpos->list[gd.xpos->len-1].pos,
  2890. gd.ypos->list[gd.ypos->len-1].pos);
  2891. #endif
  2892. /* Now we should have the entire table calculated. */
  2893. table = transcribe_table(ctx, &gd, page, NULL);
  2894. #ifdef DEBUG_WRITE_AS_PS
  2895. {
  2896. int i;
  2897. printf("%% TABLE\n");
  2898. for (i = 0; i < block->u.b.xs->len; i++)
  2899. {
  2900. if (block->u.b.xs->list[i].uncertainty)
  2901. printf("0 1 0 setrgbcolor\n");
  2902. else
  2903. printf("0 0.5 0 setrgbcolor\n");
  2904. printf("%g %g moveto %g %g lineto stroke\n",
  2905. block->u.b.xs->list[i].pos, block->bbox.y0,
  2906. block->u.b.xs->list[i].pos, block->bbox.y1);
  2907. }
  2908. for (i = 0; i < block->u.b.ys->len; i++)
  2909. {
  2910. if (block->u.b.ys->list[i].uncertainty)
  2911. printf("0 1 0 setrgbcolor\n");
  2912. else
  2913. printf("0 0.5 0 setrgbcolor\n");
  2914. printf("%g %g moveto %g %g lineto stroke\n",
  2915. block->bbox.x0, block->u.b.ys->list[i].pos,
  2916. block->bbox.x1, block->u.b.ys->list[i].pos);
  2917. }
  2918. }
  2919. #endif
  2920. }
  2921. fz_always(ctx)
  2922. {
  2923. fz_free(ctx, gd.xpos);
  2924. fz_free(ctx, gd.ypos);
  2925. fz_free(ctx, gd.cells);
  2926. }
  2927. fz_catch(ctx)
  2928. fz_rethrow(ctx);
  2929. return table ? table->first_block : NULL;
  2930. }