| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
2772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392 |
- // Copyright (C) 2004-2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include "mupdf/ucdn.h"
- #include "html-imp.h"
- #include <string.h>
- #include <stdio.h>
- #include <assert.h>
/* Side indices; presumably top/right/bottom/left for four-sided CSS values -- confirm against users. */
enum
{
	T = 0,
	R = 1,
	B = 2,
	L = 3
};

/* Default bidi direction: left-to-right. */
#define DEFAULT_DIR FZ_BIDI_LTR
/*
	Built-in stylesheet for HTML content, stored as one string made of
	adjacent literals (one rule per literal, no separators between rules).
*/
static const char *html_default_css =
	"@page{margin:3em 2em}"
	"a{color:#06C;text-decoration:underline}"
	"address{display:block;font-style:italic}"
	"b{font-weight:bold}"
	"bdo{direction:rtl;unicode-bidi:bidi-override}"
	"blockquote{display:block;margin:1em 40px}"
	"body{display:block;margin:1em}"
	"cite{font-style:italic}"
	"code{font-family:monospace}"
	"dd{display:block;margin:0 0 0 40px}"
	"del{text-decoration:line-through}"
	"div{display:block}"
	"dl{display:block;margin:1em 0}"
	"dt{display:block}"
	"em{font-style:italic}"
	"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
	"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
	"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
	"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
	"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
	"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
	"head{display:none}"
	"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
	"html{display:block}"
	"i{font-style:italic}"
	"ins{text-decoration:underline}"
	"kbd{font-family:monospace}"
	"li{display:list-item}"
	"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
	"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
	"p{display:block;margin:1em 0}"
	"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
	"samp{font-family:monospace}"
	"script{display:none}"
	"small{font-size:0.83em}"
	"strong{font-weight:bold}"
	"style{display:none}"
	"sub{font-size:0.83em;vertical-align:sub}"
	"sup{font-size:0.83em;vertical-align:super}"
	"table{display:table;border-spacing:2px}"
	"tbody{display:table-row-group}"
	"td{display:table-cell;padding:1px;background-color:inherit}"
	"tfoot{display:table-footer-group}"
	"th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}"
	"thead{display:table-header-group}"
	"tr{display:table-row}"
	"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
	"ul ul{list-style-type:circle}"
	"ul ul ul{list-style-type:square}"
	"var{font-style:italic}"
	"colgroup{display:table-column-group}"
	"col{display:table-column}"
	"caption{display:block;text-align:center}"
	;
/*
	Built-in stylesheet rules for MOBI content, stored as one string made
	of adjacent literals (one rule per literal).
*/
static const char *mobi_default_css =
	"pagebreak{display:block;page-break-before:always}"
	"dl,ol,ul{margin:0}"
	"p{margin:0}"
	"blockquote{margin:0 40px}"
	"center{display:block;text-align:center}"
	"big{font-size:1.17em}"
	"strike{text-decoration:line-through}"
	;
/*
	Built-in stylesheet for FictionBook (FB2) content, stored as one
	string made of adjacent literals (one rule per literal).
*/
static const char *fb2_default_css =
	"@page{margin:3em 2em}"
	"FictionBook{display:block;margin:1em}"
	"stylesheet,binary{display:none}"
	"description>*{display:none}"
	"description>title-info{display:block}"
	"description>title-info>*{display:none}"
	"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
	"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
	"image{display:block}"
	"p>image{display:inline}"
	"table{display:table}"
	"tr{display:table-row}"
	"th,td{display:table-cell}"
	"a{color:#06C;text-decoration:underline}"
	"a[type=note]{font-size:small;vertical-align:super}"
	"code{white-space:pre;font-family:monospace}"
	"emphasis{font-style:italic}"
	"strikethrough{text-decoration:line-through}"
	"strong{font-weight:bold}"
	"sub{font-size:small;vertical-align:sub}"
	"sup{font-size:small;vertical-align:super}"
	"image{margin:1em 0;text-align:center}"
	"cite,poem{margin:1em 2em}"
	"subtitle,epigraph,stanza{margin:1em 0}"
	"title>p{text-align:center;font-size:x-large}"
	"subtitle{text-align:center;font-size:large}"
	"p{margin-top:1em;text-align:justify}"
	"empty-line{padding-top:1em}"
	"p+p{margin-top:0;text-indent:1.5em}"
	"empty-line+p{margin-top:0}"
	"section>title{page-break-before:always}"
	;
/*
	Tables of well-known tag names. Both lists MUST stay sorted in
	strcmp() order (note that upper case sorts before lower case),
	because they are searched with a binary search.
*/
static const char *known_html_tags[] = {
	// TODO: add known FB2 tags?
	// Sorted list of all HTML tags.
	"a", "abbr", "acronym", "address", "annotation-xml", "applet", "area",
	"article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo",
	"bgsound", "big", "blink", "blockquote", "body", "br", "button",
	"canvas", "caption", "center", "cite", "code", "col", "colgroup",
	"data", "datalist", "dd", "del", "desc", "details", "dfn", "dir",
	"div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure",
	"font", "footer", "foreignobject", "form", "frame", "frameset", "h1",
	"h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
	"i", "iframe", "image", "img", "input", "ins", "isindex", "kbd",
	"keygen", "label", "legend", "li", "link", "listing", "main",
	"malignmark", "map", "mark", "marquee", "math", "menu", "menuitem",
	"meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol",
	"nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object",
	"ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre",
	"progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp",
	"script", "section", "select", "small", "source", "spacer", "span",
	"strike", "strong", "style", "sub", "summary", "sup", "svg", "table",
	"tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time",
	"title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp",
};

static const char *known_fb2_tags[] = {
	"FictionBook", "a", "binary", "body", "cite", "code", "coverpage",
	"date", "description", "emphasis", "empty-line", "epigraph", "image",
	"p", "poem", "section", "stanza", "strikethrough", "strong",
	"stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author",
	"th", "title", "title-info", "tr", "v",
};

/*
	Binary search for 'tag' in a strcmp()-sorted table of 'count' names.

	Returns the table's own pointer for the matching entry, or NULL if
	the tag is not present. The search covers the half-open range
	[lo, hi) so an empty table is handled without special casing.
*/
static const char *find_known_tag(const char *const *table, size_t count, const char *tag)
{
	size_t lo = 0;
	size_t hi = count;
	while (lo < hi)
	{
		size_t mid = lo + (hi - lo) / 2;
		int c = strcmp(tag, table[mid]);
		if (c < 0)
			hi = mid;
		else if (c > 0)
			lo = mid + 1;
		else
			return table[mid];
	}
	return NULL;
}

/*
	Look up an HTML tag name (case sensitive).

	Returns the statically allocated table entry equal to 'tag', or NULL
	if it is not a known HTML tag.

	The previous implementation started with r = nelem(table) / 2 - 1 and
	therefore only ever searched the first half of the table; every tag
	in the upper half was reported as unknown.
*/
static const char *find_known_html_tag(const char *tag)
{
	return find_known_tag(known_html_tags, sizeof known_html_tags / sizeof known_html_tags[0], tag);
}

/*
	Look up a FictionBook tag name (case sensitive).

	Returns the statically allocated table entry equal to 'tag', or NULL
	if it is not a known FB2 tag. Searches the whole table (see
	find_known_html_tag for the bug this replaces).
*/
static const char *find_known_fb2_tag(const char *tag)
{
	return find_known_tag(known_fb2_tags, sizeof known_fb2_tags / sizeof known_fb2_tags[0], tag);
}
- struct genstate
- {
- fz_pool *pool;
- fz_html_font_set *set;
- fz_archive *zip;
- fz_tree *images;
- fz_xml_doc *xml;
- int is_fb2;
- const char *base_uri;
- fz_css *css;
- int at_bol;
- fz_html_box *emit_white;
- int last_brk_cls;
- int list_counter;
- int section_depth;
- fz_bidi_direction markup_dir;
- fz_text_language markup_lang;
- char *href;
- fz_css_style_splay *styles;
- };
/* Return 1 if c is a space, tab, carriage return or newline; 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}

/* Return 1 if the string consists only of iswhite() characters (or is empty), else 0. */
static int is_all_white(const char *s)
{
	for (; *s != 0; ++s)
	{
		if (!iswhite(*s))
			return 0;
	}
	return 1;
}
- /* TODO: pool allocator for flow nodes */
- /* TODO: store text by pointing to a giant buffer */
- static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
- {
- while (flow)
- {
- fz_html_flow *next = flow->next;
- if (flow->type == FLOW_IMAGE)
- fz_drop_image(ctx, flow->content.image);
- flow = next;
- }
- }
- static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras)
- {
- size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras);
- fz_html_flow *flow;
- /* Shouldn't happen, but bug 705324. */
- if (top == NULL || top->type != BOX_FLOW)
- return NULL;
- flow = fz_pool_alloc(ctx, pool, size);
- flow->type = type;
- flow->expand = 0;
- flow->bidi_level = 0;
- flow->markup_lang = 0;
- flow->breaks_line = 0;
- flow->box = inline_box;
- (*top->s.build.flow_tail) = flow;
- top->s.build.flow_tail = &flow->next;
- return flow;
- }
- static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
- {
- fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0);
- if (flow)
- flow->expand = 1;
- }
- static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
- {
- (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0);
- }
- static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
- {
- (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0);
- }
- static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
- {
- (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0);
- }
- static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang)
- {
- fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1);
- if (flow == NULL)
- return;
- memcpy(flow->content.text, a, b - a);
- flow->content.text[b - a] = 0;
- flow->markup_lang = lang;
- }
- static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img)
- {
- fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0);
- if (flow)
- flow->content.image = fz_keep_image(ctx, img);
- }
- static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
- {
- (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0);
- }
- fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
- {
- fz_html_flow *new_flow;
- char *text;
- size_t len;
- assert(flow->type == FLOW_WORD);
- if (offset == 0)
- return flow;
- text = flow->content.text;
- while (*text && offset)
- {
- int rune;
- text += fz_chartorune(&rune, text);
- offset--;
- }
- len = strlen(text);
- new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1);
- memcpy(new_flow, flow, offsetof(fz_html_flow, content));
- new_flow->next = flow->next;
- flow->next = new_flow;
- strcpy(new_flow->content.text, text);
- *text = 0;
- return new_flow;
- }
- static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g)
- {
- static const char *space = " ";
- fz_pool *pool = g->pool;
- if (g->emit_white)
- {
- int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE;
- if (!g->at_bol)
- {
- if (bsp)
- add_flow_space(ctx, pool, flow, g->emit_white);
- else
- add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang);
- }
- g->emit_white = 0;
- }
- }
/*
	Pair-wise lookup table for UAX#14 line breaks, indexed as
	pairbrk[class of previous character][class of this character].

	Each row is a 32 character string, one column per class in the same
	order as the rows. The entries mean:

	^ prohibited break
		never break before A and after B, even with one or more spaces in between
	% indirect break
		do not break before A, unless one or more spaces follow B
	_ direct break
		break allowed before A
*/
static const char *pairbrk[32] =
{
	/* Columns follow the row order: OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI EB EM ZWJ */
	"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
	"_^^%%^^^^%%____%%%__^^^________%", /* CL close punctuation */
	"_^^%%^^^^%%%%%_%%%__^^^________%", /* CP close parenthesis */
	"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* QU quotation */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* GL non-breaking glue */
	"_^^%%%^^^______%%%__^^^________%", /* NS nonstarters */
	"_^^%%%^^^______%%%__^^^________%", /* EX exclamation/interrogation */
	"_^^%%%^^^__%_%_%%%__^^^________%", /* SY symbols allowing break after */
	"_^^%%%^^^__%%%_%%%__^^^________%", /* IS infix numeric separator */
	"%^^%%%^^^__%%%%%%%__^^^%%%%%_%%%", /* PR prefix numeric */
	"%^^%%%^^^__%%%_%%%__^^^________%", /* PO postfix numeric */
	"%^^%%%^^^%%%%%_%%%__^^^________%", /* NU numeric */
	"%^^%%%^^^%%%%%_%%%__^^^________%", /* AL ordinary alphabetic and symbol characters */
	"%^^%%%^^^%%%%%_%%%__^^^________%", /* HL hebrew letter */
	"_^^%%%^^^_%____%%%__^^^________%", /* ID ideographic */
	"_^^%%%^^^______%%%__^^^________%", /* IN inseparable characters */
	"_^^%_%^^^__%___%%%__^^^________%", /* HY hyphens */
	"_^^%_%^^^______%%%__^^^________%", /* BA break after */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* BB break before */
	"_^^%%%^^^______%%%_^^^^________%", /* B2 break opportunity before and after */
	"____________________^___________", /* ZW zero width space */
	"%^^%%%^^^%_%%%_%%%__^^^________%", /* CM combining mark */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* WJ word joiner */
	"_^^%%%^^^_%____%%%__^^^___%%___%", /* H2 hangul leading/vowel syllable */
	"_^^%%%^^^_%____%%%__^^^____%___%", /* H3 hangul leading/vowel/trailing syllable */
	"_^^%%%^^^_%____%%%__^^^%%%%____%", /* JL hangul leading jamo */
	"_^^%%%^^^_%____%%%__^^^___%%___%", /* JV hangul vowel jamo */
	"_^^%%%^^^_%____%%%__^^^____%___%", /* JT hangul trailing jamo */
	"_^^%%%^^^______%%%__^^^_____%__%", /* RI regional indicator */
	"_^^%%%^^^_%____%%%__^^^_______%%", /* EB emoji base */
	"_^^%%%^^^_%____%%%__^^^________%", /* EM emoji modifier */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* ZWJ zero width joiner */
};
- static fz_html_box *
- find_flow_encloser(fz_context *ctx, fz_html_box *flow)
- {
- /* This code was written to assume that there will always be a
- * flow box enclosing callers of this. Bug 705324 shows that
- * this isn't always the case. In the absence of a reproducer
- * file, all I can do is try to patch around the issue so that
- * we won't crash. */
- while (flow->type != BOX_FLOW)
- {
- if (flow->up == NULL)
- {
- fz_warn(ctx, "Flow encloser not found. Please report this file!");
- break;
- }
- flow = flow->up;
- }
- return flow;
- }
- static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
- {
- fz_html_box *flow;
- fz_pool *pool = g->pool;
- int collapse = box->style->white_space & WS_COLLAPSE;
- int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
- int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE;
- static const char *space = " ";
- flow = find_flow_encloser(ctx, box);
- if (flow == NULL)
- return;
- while (*text)
- {
- if (bnl && (*text == '\n' || *text == '\r'))
- {
- if (text[0] == '\r' && text[1] == '\n')
- text += 2;
- else
- text += 1;
- add_flow_break(ctx, pool, flow, box);
- g->at_bol = 1;
- }
- else if (iswhite(*text))
- {
- if (collapse)
- {
- if (bnl)
- while (*text == ' ' || *text == '\t')
- ++text;
- else
- while (iswhite(*text))
- ++text;
- g->emit_white = box;
- }
- else
- {
- // TODO: tabs
- if (bsp)
- add_flow_space(ctx, pool, flow, box);
- else
- add_flow_word(ctx, pool, flow, box, space, space+1, lang);
- ++text;
- }
- g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
- }
- else
- {
- const char *prev, *mark = text;
- int c;
- flush_space(ctx, flow, lang, g);
- if (g->at_bol)
- g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;
- while (*text && !iswhite(*text))
- {
- prev = text;
- text += fz_chartorune(&c, text);
- if (c == 0xAD) /* soft hyphen */
- {
- if (mark != prev)
- add_flow_word(ctx, pool, flow, box, mark, prev, lang);
- add_flow_shyphen(ctx, pool, flow, box);
- mark = text;
- g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
- }
- else if (bsp) /* allow soft breaks */
- {
- int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
- if (this_brk_cls <= UCDN_LINEBREAK_CLASS_ZWJ)
- {
- int brk = pairbrk[g->last_brk_cls][this_brk_cls];
- /* we handle spaces elsewhere, so ignore these classes */
- if (brk == '@') brk = '^';
- if (brk == '#') brk = '^';
- if (brk == '%') brk = '^';
- if (brk == '_')
- {
- if (mark != prev)
- add_flow_word(ctx, pool, flow, box, mark, prev, lang);
- add_flow_sbreak(ctx, pool, flow, box);
- mark = prev;
- }
- g->last_brk_cls = this_brk_cls;
- }
- }
- }
- if (mark != text)
- add_flow_word(ctx, pool, flow, box, mark, text, lang);
- g->at_bol = 0;
- }
- }
- }
- static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src)
- {
- char path[2048];
- fz_image *img = NULL;
- fz_buffer *buf = NULL;
- fz_var(img);
- fz_var(buf);
- fz_try(ctx)
- {
- if (!strncmp(src, "data:image/jpeg;base64,", 23))
- buf = fz_new_buffer_from_base64(ctx, src+23, 0);
- else if (!strncmp(src, "data:image/png;base64,", 22))
- buf = fz_new_buffer_from_base64(ctx, src+22, 0);
- else if (!strncmp(src, "data:image/gif;base64,", 22))
- buf = fz_new_buffer_from_base64(ctx, src+22, 0);
- else
- {
- fz_strlcpy(path, base_uri, sizeof path);
- fz_strlcat(path, "/", sizeof path);
- fz_strlcat(path, src, sizeof path);
- fz_urldecode(path);
- fz_cleanname(path);
- buf = fz_read_archive_entry(ctx, zip, path);
- }
- #if FZ_ENABLE_SVG
- if (strstr(src, ".svg"))
- img = fz_new_image_from_svg(ctx, buf, base_uri, zip);
- else
- #endif
- img = fz_new_image_from_buffer(ctx, buf);
- }
- fz_always(ctx)
- fz_drop_buffer(ctx, buf);
- fz_catch(ctx)
- {
- fz_ignore_error(ctx);
- fz_warn(ctx, "html: cannot load image src='%s'", src);
- }
- return img;
- }
- static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri,
- fz_xml_doc *xmldoc, fz_xml *node)
- {
- fz_image *img = NULL;
- #if FZ_ENABLE_SVG
- fz_try(ctx)
- img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip);
- fz_catch(ctx)
- {
- fz_ignore_error(ctx);
- fz_warn(ctx, "html: cannot load embedded svg document");
- }
- #endif
- return img;
- }
- static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g)
- {
- fz_html_box *flow;
- fz_pool *pool = g->pool;
- flow = find_flow_encloser(ctx, box);
- flush_space(ctx, flow, 0, g);
- if (!img)
- {
- const char *alt = "[image]";
- add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0);
- }
- else
- {
- fz_try(ctx)
- {
- add_flow_sbreak(ctx, pool, flow, box);
- add_flow_image(ctx, pool, flow, box, img);
- add_flow_sbreak(ctx, pool, flow, box);
- }
- fz_always(ctx)
- {
- fz_drop_image(ctx, img);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- g->at_bol = 0;
- }
- static void fz_drop_html_box(fz_context *ctx, fz_html_box *box)
- {
- while (box)
- {
- fz_html_box *next = box->next;
- if (box->type == BOX_FLOW)
- fz_drop_html_flow(ctx, box->u.flow.head);
- fz_drop_html_box(ctx, box->down);
- box = next;
- }
- }
- static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor)
- {
- fz_html *html = (fz_html *)stor;
- fz_drop_html_box(ctx, html->tree.root);
- fz_drop_pool(ctx, html->tree.pool);
- }
- static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor)
- {
- fz_story *story = (fz_story *)stor;
- fz_free(ctx, story->user_css);
- fz_drop_html_font_set(ctx, story->font_set);
- fz_drop_xml(ctx, story->dom);
- fz_drop_html_box(ctx, story->tree.root);
- fz_drop_buffer(ctx, story->warnings);
- fz_drop_archive(ctx, story->zip);
- /* The pool must be the last thing dropped. */
- fz_drop_pool(ctx, story->tree.pool);
- }
- /* Drop a structure derived from an html_tree. The exact things
- * freed here will depend upon the drop function with which it
- * was created. */
- static void
- fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree)
- {
- fz_defer_reap_start(ctx);
- fz_drop_storable(ctx, &tree->storable);
- fz_defer_reap_end(ctx);
- }
- void fz_drop_html(fz_context *ctx, fz_html *html)
- {
- fz_drop_html_tree(ctx, &html->tree);
- }
- void fz_drop_story(fz_context *ctx, fz_story *story)
- {
- if (!story)
- return;
- fz_drop_html_tree(ctx, &story->tree);
- }
- fz_html *fz_keep_html(fz_context *ctx, fz_html *html)
- {
- return fz_keep_storable(ctx, &html->tree.storable);
- }
- static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style)
- {
- fz_html_box *box;
- const char *tag = fz_xml_tag(node);
- const char *id = fz_xml_att(node, "id");
- const char *href;
- if (type == BOX_INLINE)
- box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u));
- else if (type == BOX_FLOW)
- box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow));
- else
- box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block));
- box->type = type;
- box->is_first_flow = 0;
- box->markup_dir = g->markup_dir;
- box->heading = 0;
- box->list_item = 0;
- box->style = fz_css_enlist(ctx, style, &g->styles, g->pool);
- if (tag)
- {
- box->tag = find_known_html_tag(tag);
- if (!box->tag && g->is_fb2)
- box->tag = find_known_fb2_tag(tag);
- if (!box->tag)
- box->tag = fz_pool_strdup(ctx, g->pool, tag);
- }
- else
- {
- box->tag = "#anon";
- }
- if (id)
- box->id = fz_pool_strdup(ctx, g->pool, id);
- if (tag && tag[0]=='a' && tag[1]==0)
- {
- // Support deprecated anchor syntax with id in "name" instead of "id" attribute.
- if (!id)
- {
- const char *name = fz_xml_att(node, "name");
- if (name)
- box->id = fz_pool_strdup(ctx, g->pool, name);
- }
- if (g->is_fb2)
- {
- href = fz_xml_att(node, "l:href");
- if (!href)
- href = fz_xml_att(node, "xlink:href");
- }
- else
- {
- href = fz_xml_att(node, "href");
- }
- if (href)
- g->href = fz_pool_strdup(ctx, g->pool, href);
- }
- if (g->href)
- box->href = g->href;
- if (type == BOX_FLOW)
- {
- box->u.flow.head = NULL;
- box->s.build.flow_tail = &box->u.flow.head;
- }
- return box;
- }
- static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child)
- {
- child->up = parent;
- if (!parent->down)
- parent->down = child;
- if (parent->s.build.last_child)
- parent->s.build.last_child->next = child;
- parent->s.build.last_child = child;
- }
- static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box)
- {
- while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL)
- box = box->up;
- return box;
- }
- static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box)
- {
- fz_html_box *look = box;
- while (look && look->type != BOX_TABLE)
- look = look->up;
- if (look)
- return look;
- fz_warn(ctx, "table-row not inside table element");
- return NULL;
- }
- static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box)
- {
- fz_html_box *look = box;
- while (look && look->type != BOX_TABLE_ROW)
- look = look->up;
- if (look)
- return look;
- fz_warn(ctx, "table-cell not inside table-row element");
- return NULL;
- }
- static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box)
- {
- fz_css_style style;
- fz_html_box *flow_box;
- if (box->type == BOX_FLOW || box->type == BOX_INLINE)
- return box;
- // We have an inline element that is not in an existing flow/inline context.
- // Find the closest block level box to insert content into.
- while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL)
- box = box->up;
- // Concatenate onto the last open flow box if we have one.
- if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW)
- return box->s.build.last_child;
- // No flow box found, create and insert one!
- // TODO: null style instead of default for flow box?
- fz_default_css_style(ctx, &style);
- flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style);
- flow_box->is_first_flow = !box->down;
- g->at_bol = 1;
- append_box(ctx, box, flow_box);
- return flow_box;
- }
- static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match);
- static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
- {
- fz_html_box *anon_box;
- fz_css_style style;
- const char *text;
- int collapse;
- text = fz_xml_text(node);
- collapse = root_box->style->white_space & WS_COLLAPSE;
- if (collapse && is_all_white(text))
- {
- g->emit_white = root_box;
- }
- else
- {
- if (root_box->type != BOX_INLINE)
- {
- /* Create anonymous inline box, with the same style as the top block box. */
- style = *root_box->style;
- // Make sure not to recursively multiply font sizes
- style.font_size.value = 1;
- style.font_size.unit = N_SCALE;
- root_box = find_inline_context(ctx, g, root_box);
- anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style);
- append_box(ctx, root_box, anon_box);
- root_box = anon_box;
- }
- generate_text(ctx, root_box, text, g->markup_lang, g);
- }
- }
- static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
- {
- fz_html_box *this_box;
- fz_html_box *flow_box;
- root_box = find_inline_context(ctx, g, root_box);
- this_box = new_box(ctx, g, node, BOX_INLINE, style);
- append_box(ctx, root_box, this_box);
- if (this_box->id)
- {
- flow_box = find_flow_encloser(ctx, this_box);
- add_flow_anchor(ctx, g->pool, flow_box, this_box);
- }
- return this_box;
- }
- static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
- {
- fz_html_box *this_box;
- fz_html_box *flow_box;
- if (root_box->type != BOX_INLINE)
- {
- /* Create inline box to hold the <br> tag, with the same style as containing block. */
- /* Make sure not to recursively multiply font sizes. */
- fz_css_style style = *root_box->style;
- style.font_size.value = 1;
- style.font_size.unit = N_SCALE;
- this_box = new_box(ctx, g, node, BOX_INLINE, &style);
- append_box(ctx, find_inline_context(ctx, g, root_box), this_box);
- }
- else
- {
- this_box = root_box;
- }
- flow_box = find_flow_encloser(ctx, this_box);
- add_flow_break(ctx, g->pool, flow_box, this_box);
- g->at_bol = 1;
- }
- static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
- {
- fz_html_box *this_box;
- root_box = find_block_context(ctx, root_box);
- this_box = new_box(ctx, g, node, BOX_BLOCK, style);
- append_box(ctx, root_box, this_box);
- return this_box;
- }
- static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
- {
- fz_html_box *this_box;
- root_box = find_block_context(ctx, root_box);
- this_box = new_box(ctx, g, node, BOX_TABLE, style);
- append_box(ctx, root_box, this_box);
- return this_box;
- }
- static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
- {
- fz_html_box *this_box, *table_box;
- table_box = find_table_row_context(ctx, root_box);
- if (!table_box)
- return gen2_block(ctx, g, root_box, node, style);
- this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style);
- append_box(ctx, table_box, this_box);
- return this_box;
- }
- static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
- {
- fz_html_box *this_box, *row_box;
- row_box = find_table_cell_context(ctx, root_box);
- if (!row_box)
- return gen2_block(ctx, g, root_box, node, style);
- this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style);
- append_box(ctx, row_box, this_box);
- return this_box;
- }
- static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style)
- {
- fz_html_box *img_block_box;
- fz_html_box *img_inline_box;
- if (display == DIS_INLINE || display == DIS_INLINE_BLOCK)
- {
- root_box = find_inline_context(ctx, g, root_box);
- img_inline_box = new_box(ctx, g, node, BOX_INLINE, style);
- append_box(ctx, root_box, img_inline_box);
- generate_image(ctx, img_inline_box, img, g);
- }
- else
- {
- root_box = find_block_context(ctx, root_box);
- img_block_box = new_box(ctx, g, node, BOX_BLOCK, style);
- append_box(ctx, root_box, img_block_box);
- root_box = find_inline_context(ctx, g, img_block_box);
- img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style);
- append_box(ctx, root_box, img_inline_box);
- generate_image(ctx, img_inline_box, img, g);
- }
- }
- static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
- {
- const char *src = fz_xml_att(node, "src");
- if (src)
- {
- fz_css_style local_style = *style;
- fz_image *img;
- int w, h;
- const char *w_att = fz_xml_att(node, "width");
- const char *h_att = fz_xml_att(node, "height");
- if (w_att && (w = fz_atoi(w_att)) > 0)
- {
- local_style.width.value = w;
- local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH;
- }
- if (h_att && (h = fz_atoi(h_att)) > 0)
- {
- local_style.height.value = h;
- local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH;
- }
- img = load_html_image(ctx, g->zip, g->base_uri, src);
- gen2_image_common(ctx, g, root_box, node, img, display, &local_style);
- }
- }
- static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
- {
- const char *src = fz_xml_att(node, "l:href");
- if (!src)
- src = fz_xml_att(node, "xlink:href");
- if (src && src[0] == '#')
- {
- fz_image *img = fz_tree_lookup(ctx, g->images, src+1);
- gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style);
- }
- }
- static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
- {
- fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node);
- gen2_image_common(ctx, g, root_box, node, img, display, style);
- }
- static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag)
- {
- if (tag[0] == 'h' && tag[1] != 0 && tag[2] == 0)
- {
- switch (tag[1])
- {
- case '1': return 1;
- case '2': return 2;
- case '3': return 3;
- case '4': return 4;
- case '5': return 5;
- case '6': return 6;
- }
- }
- if (g->is_fb2)
- {
- if (!strcmp(tag, "title") || !strcmp(tag, "subtitle"))
- return fz_mini(g->section_depth, 6);
- }
- return 0;
- }
- static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node,
- fz_css_match *match, int display, fz_css_style *style)
- {
- fz_html_box *this_box;
- const char *tag;
- const char *lang_att;
- const char *dir_att;
- int save_markup_dir = g->markup_dir;
- int save_markup_lang = g->markup_lang;
- char *save_href = g->href;
- if (display == DIS_NONE)
- return;
- tag = fz_xml_tag(node);
- dir_att = fz_xml_att(node, "dir");
- if (dir_att)
- {
- if (!strcmp(dir_att, "auto"))
- g->markup_dir = FZ_BIDI_NEUTRAL;
- else if (!strcmp(dir_att, "rtl"))
- g->markup_dir = FZ_BIDI_RTL;
- else if (!strcmp(dir_att, "ltr"))
- g->markup_dir = FZ_BIDI_LTR;
- else
- g->markup_dir = DEFAULT_DIR;
- }
- lang_att = fz_xml_att(node, "lang");
- if (lang_att)
- g->markup_lang = fz_text_language_from_string(lang_att);
- switch (display)
- {
- case DIS_INLINE_BLOCK:
- // TODO handle inline block as a flow node
- this_box = gen2_block(ctx, g, root_box, node, style);
- break;
- case DIS_BLOCK:
- this_box = gen2_block(ctx, g, root_box, node, style);
- this_box->heading = get_heading_from_tag(ctx, g, tag);
- break;
- case DIS_LIST_ITEM:
- this_box = gen2_block(ctx, g, root_box, node, style);
- this_box->list_item = ++g->list_counter;
- break;
- // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes
- //
- // The table generation code should insert and create anonymous boxes
- // for any missing child/parent elements.
- //
- // MISSING CHILDREN:
- // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW.
- // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL.
- //
- // MISSING PARENTS:
- // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW
- // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE
- //
- // For now we ignore this and treat any such elements that are out of
- // context as plain block elements.
- case DIS_TABLE:
- this_box = gen2_table(ctx, g, root_box, node, style);
- break;
- case DIS_TABLE_GROUP:
- // no box for table-row-group elements
- this_box = root_box;
- break;
- case DIS_TABLE_ROW:
- this_box = gen2_table_row(ctx, g, root_box, node, style);
- break;
- case DIS_TABLE_CELL:
- this_box = gen2_table_cell(ctx, g, root_box, node, style);
- break;
- case DIS_INLINE:
- default:
- this_box = gen2_inline(ctx, g, root_box, node, style);
- break;
- }
- if (tag && (!strcmp(tag, "ol") || !strcmp(tag, "ul") || !strcmp(tag, "dl")))
- {
- int save_list_counter = g->list_counter;
- g->list_counter = 0;
- gen2_children(ctx, g, this_box, node, match);
- g->list_counter = save_list_counter;
- }
- else if (tag && !strcmp(tag, "section"))
- {
- int save_section_depth = g->section_depth;
- g->section_depth++;
- gen2_children(ctx, g, this_box, node, match);
- g->section_depth = save_section_depth;
- }
- else
- {
- gen2_children(ctx, g, this_box, node, match);
- }
- g->markup_dir = save_markup_dir;
- g->markup_lang = save_markup_lang;
- g->href = save_href;
- }
- static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match)
- {
- fz_xml *node;
- const char *tag;
- fz_css_match match;
- fz_css_style style;
- int display;
- for (node = fz_xml_down(root_node); node; node = fz_xml_next(node))
- {
- tag = fz_xml_tag(node);
- if (tag)
- {
- fz_match_css(ctx, &match, root_match, g->css, node);
- fz_apply_css_style(ctx, g->set, &style, &match);
- display = fz_get_css_match_display(&match);
- if (tag[0]=='b' && tag[1]=='r' && tag[2]==0)
- {
- gen2_break(ctx, g, root_box, node);
- }
- else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0)
- {
- gen2_image_html(ctx, g, root_box, node, display, &style);
- }
- else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0)
- {
- gen2_image_fb2(ctx, g, root_box, node, display, &style);
- }
- else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0)
- {
- gen2_image_svg(ctx, g, root_box, node, display, &style);
- }
- else
- {
- gen2_tag(ctx, g, root_box, node, &match, display, &style);
- }
- }
- else
- {
- gen2_text(ctx, g, root_box, node);
- }
- }
- }
- static char *concat_text(fz_context *ctx, fz_xml *root)
- {
- fz_xml *node;
- size_t i = 0, n = 1;
- char *s;
- for (node = fz_xml_down(root); node; node = fz_xml_next(node))
- {
- const char *text = fz_xml_text(node);
- n += text ? strlen(text) : 0;
- }
- s = Memento_label(fz_malloc(ctx, n), "concat_html");
- for (node = fz_xml_down(root); node; node = fz_xml_next(node))
- {
- const char *text = fz_xml_text(node);
- if (text)
- {
- n = strlen(text);
- memcpy(s+i, text, n);
- i += n;
- }
- }
- s[i] = 0;
- return s;
- }
- static void
- html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href)
- {
- char path[2048];
- char css_base_uri[2048];
- fz_buffer *buf;
- fz_var(buf);
- fz_strlcpy(path, base_uri, sizeof path);
- fz_strlcat(path, "/", sizeof path);
- fz_strlcat(path, href, sizeof path);
- fz_urldecode(path);
- fz_cleanname(path);
- fz_dirname(css_base_uri, path, sizeof css_base_uri);
- buf = NULL;
- fz_try(ctx)
- {
- buf = fz_read_archive_entry(ctx, zip, path);
- fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path);
- fz_add_css_font_faces(ctx, set, zip, css_base_uri, css);
- }
- fz_always(ctx)
- fz_drop_buffer(ctx, buf);
- fz_catch(ctx)
- {
- fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
- fz_report_error(ctx);
- fz_warn(ctx, "ignoring stylesheet %s", path);
- }
- }
- static void
- html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
- {
- fz_xml *html, *head, *node;
- html = fz_xml_find(root, "html");
- head = fz_xml_find_down(html, "head");
- for (node = fz_xml_down(head); node; node = fz_xml_next(node))
- {
- if (fz_xml_is_tag(node, "link"))
- {
- char *rel = fz_xml_att(node, "rel");
- if (rel && !fz_strcasecmp(rel, "stylesheet"))
- {
- char *type = fz_xml_att(node, "type");
- if ((type && !strcmp(type, "text/css")) || !type)
- {
- char *href = fz_xml_att(node, "href");
- if (href)
- {
- html_load_css_link(ctx, set, zip, base_uri, css, root, href);
- }
- }
- }
- }
- else if (fz_xml_is_tag(node, "style"))
- {
- char *s = concat_text(ctx, node);
- fz_try(ctx)
- {
- fz_parse_css(ctx, css, s, "<style>");
- fz_add_css_font_faces(ctx, set, zip, base_uri, css);
- }
- fz_always(ctx)
- fz_free(ctx, s);
- fz_catch(ctx)
- {
- fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
- fz_report_error(ctx);
- fz_warn(ctx, "ignoring inline stylesheet");
- }
- }
- }
- }
- static void
- fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
- {
- fz_xml *fictionbook, *stylesheet;
- fictionbook = fz_xml_find(root, "FictionBook");
- stylesheet = fz_xml_find_down(fictionbook, "stylesheet");
- if (stylesheet)
- {
- char *s = concat_text(ctx, stylesheet);
- fz_try(ctx)
- {
- fz_parse_css(ctx, css, s, "<stylesheet>");
- fz_add_css_font_faces(ctx, set, zip, base_uri, css);
- }
- fz_catch(ctx)
- {
- fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
- fz_report_error(ctx);
- fz_warn(ctx, "ignoring inline stylesheet");
- }
- fz_free(ctx, s);
- }
- }
- static fz_tree *
- load_fb2_images(fz_context *ctx, fz_xml *root)
- {
- fz_xml *fictionbook, *binary;
- fz_tree *images = NULL;
- fictionbook = fz_xml_find(root, "FictionBook");
- for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary"))
- {
- const char *id = fz_xml_att(binary, "id");
- char *b64 = NULL;
- fz_buffer *buf = NULL;
- fz_image *img = NULL;
- fz_var(b64);
- fz_var(buf);
- if (id == NULL)
- {
- fz_warn(ctx, "Skipping image with no id");
- continue;
- }
- fz_try(ctx)
- {
- b64 = concat_text(ctx, binary);
- buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64));
- img = fz_new_image_from_buffer(ctx, buf);
- }
- fz_always(ctx)
- {
- fz_drop_buffer(ctx, buf);
- fz_free(ctx, b64);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- images = fz_tree_insert(ctx, images, id, img);
- }
- return images;
- }
/* Growable buffer of Unicode code points used for bidi analysis. */
typedef struct
{
	uint32_t *data;	/* storage, grown on demand */
	size_t cap;	/* number of entries data has room for */
	size_t len;	/* number of entries currently in use */
} uni_buf;
- typedef struct
- {
- fz_context *ctx;
- fz_pool *pool;
- fz_html_flow *flow;
- uni_buf *buffer;
- } bidi_data;
- static void fragment_cb(const uint32_t *fragment,
- size_t fragment_len,
- int bidi_level,
- int script,
- void *arg)
- {
- bidi_data *data = (bidi_data *)arg;
- /* We are guaranteed that fragmentOffset will be at the beginning
- * of flow. */
- while (fragment_len > 0)
- {
- size_t len;
- if (data->flow->type == FLOW_SPACE)
- {
- len = 1;
- }
- else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK ||
- data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR)
- {
- len = 0;
- }
- else
- {
- /* Must be text */
- len = fz_utflen(data->flow->content.text);
- if (len > fragment_len)
- {
- /* We need to split this flow box */
- (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len);
- len = fz_utflen(data->flow->content.text);
- }
- }
- /* This flow box is entirely contained within this fragment. */
- data->flow->bidi_level = bidi_level;
- data->flow->script = script;
- data->flow = data->flow->next;
- fragment_len -= len;
- }
- }
- static fz_bidi_direction
- detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow)
- {
- fz_html_flow *end = flow;
- bidi_data data;
- while (end)
- {
- unsigned int level = end->bidi_level;
- /* Gather the text from the flow up into a single buffer (at
- * least, as much of it as has the same direction markup). */
- buffer->len = 0;
- while (end && (level & 1) == (end->bidi_level & 1))
- {
- size_t len = 0;
- const char *text = "";
- int broken = 0;
- switch (end->type)
- {
- case FLOW_WORD:
- len = fz_utflen(end->content.text);
- text = end->content.text;
- break;
- case FLOW_SPACE:
- len = 1;
- text = " ";
- break;
- case FLOW_SHYPHEN:
- case FLOW_SBREAK:
- break;
- case FLOW_BREAK:
- case FLOW_IMAGE:
- broken = 1;
- break;
- }
- end = end->next;
- if (broken)
- break;
- /* Make sure the buffer is large enough */
- if (buffer->len + len > buffer->cap)
- {
- size_t newcap = buffer->cap;
- if (newcap < 128)
- newcap = 128; /* Sensible small default */
- while (newcap < buffer->len + len)
- newcap = (newcap * 3) / 2;
- buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t);
- buffer->cap = newcap;
- }
- /* Expand the utf8 text into Unicode and store it in the buffer */
- while (*text)
- {
- int rune;
- text += fz_chartorune(&rune, text);
- buffer->data[buffer->len++] = rune;
- }
- }
- /* Detect directionality for the buffer */
- data.ctx = ctx;
- data.pool = pool;
- data.flow = flow;
- data.buffer = buffer;
- fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */);
- flow = end;
- }
- return bidi_dir;
- }
- static void
- detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box)
- {
- while (box)
- {
- if (box->type == BOX_FLOW)
- box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head);
- detect_box_directionality(ctx, pool, buffer, box->down);
- box = box->next;
- }
- }
- static void
- detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box)
- {
- uni_buf buffer = { NULL };
- fz_try(ctx)
- detect_box_directionality(ctx, pool, &buffer, box);
- fz_always(ctx)
- fz_free(ctx, buffer.data);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static fz_xml_doc *
- parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int try_html5)
- {
- fz_xml_doc *xml;
- if (try_xml && try_html5)
- {
- fz_try(ctx)
- xml = fz_parse_xml(ctx, buf, 1);
- fz_catch(ctx)
- {
- if (fz_caught(ctx) == FZ_ERROR_SYNTAX)
- {
- fz_report_error(ctx);
- fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser");
- xml = fz_parse_xml_from_html5(ctx, buf);
- }
- else
- fz_rethrow(ctx);
- }
- }
- else if (try_xml)
- xml = fz_parse_xml(ctx, buf, 1);
- else
- {
- assert(try_html5);
- xml = fz_parse_xml_from_html5(ctx, buf);
- }
- return xml;
- }
- static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from)
- {
- fz_css_color transparent = { 0, 0, 0, 0 };
- fz_css_style s1, s2;
- memcpy(&s1, root->style, sizeof s1);
- memcpy(&s2, from->style, sizeof s2);
- s1.background_color = s2.background_color;
- s2.background_color = transparent;
- root->style = fz_css_enlist(ctx, &s1, &g->styles, g->pool);
- from->style = fz_css_enlist(ctx, &s2, &g->styles, g->pool);
- }
- static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root)
- {
- fz_html_box *html, *body;
- if (root->style->background_color.a != 0)
- {
- return;
- }
- html = root->down;
- if (html && !strcmp(html->tag, "html"))
- {
- if (html->style->background_color.a != 0)
- {
- move_background_color_style_up(ctx, g, root, html);
- return;
- }
- body = html->down;
- if (body && !strcmp(body->tag, "body"))
- {
- if (body->style->background_color.a != 0)
- {
- move_background_color_style_up(ctx, g, root, body);
- return;
- }
- }
- }
- }
- static void
- xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css,
- fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi)
- {
- fz_xml *root, *node;
- char *title;
- fz_css_match root_match, match;
- struct genstate g = {0};
- g.pool = NULL;
- g.set = set;
- g.zip = zip;
- g.images = NULL;
- g.xml = xml;
- g.is_fb2 = 0;
- g.base_uri = base_uri;
- g.css = NULL;
- g.at_bol = 0;
- g.emit_white = 0;
- g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP;
- g.list_counter = 0;
- g.section_depth = 0;
- g.markup_dir = FZ_BIDI_LTR;
- g.markup_lang = FZ_LANG_UNSET;
- g.href = NULL;
- g.styles = NULL;
- if (rtitle)
- *rtitle = NULL;
- root = fz_xml_root(g.xml);
- g.css = fz_new_css(ctx);
- #ifndef NDEBUG
- if (fz_atoi(getenv("FZ_DEBUG_XML")))
- fz_debug_xml(root, 0);
- #endif
- fz_try(ctx)
- {
- if (try_fictionbook && fz_xml_find(root, "FictionBook"))
- {
- g.is_fb2 = 1;
- fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>");
- if (fz_use_document_css(ctx))
- fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
- g.images = load_fb2_images(ctx, root);
- }
- else if (is_mobi)
- {
- g.is_fb2 = 0;
- fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
- fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>");
- if (fz_use_document_css(ctx))
- html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
- }
- else
- {
- g.is_fb2 = 0;
- fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
- if (fz_use_document_css(ctx))
- html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
- }
- if (user_css)
- {
- fz_parse_css(ctx, g.css, user_css, "<user>");
- fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css);
- }
- }
- fz_catch(ctx)
- {
- fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
- fz_drop_css(ctx, g.css);
- fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
- fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
- fz_report_error(ctx);
- fz_warn(ctx, "ignoring styles");
- g.css = fz_new_css(ctx);
- g.images = NULL;
- }
- #ifndef NDEBUG
- if (fz_atoi(getenv("FZ_DEBUG_CSS")))
- fz_debug_css(ctx, g.css);
- #endif
- fz_try(ctx)
- {
- fz_css_style style;
- int display;
- fz_match_css_at_page(ctx, &root_match, g.css);
- fz_apply_css_style(ctx, g.set, &style, &root_match);
- g.pool = tree->pool;
- g.markup_dir = DEFAULT_DIR;
- g.markup_lang = FZ_LANG_UNSET;
- // Create root node
- tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style);
- // TODO: transfer page margins out of this hacky box
- tree->root->tag = ":root";
- tree->root->s.layout.em = 0;
- tree->root->s.layout.x = 0;
- tree->root->s.layout.y = 0;
- tree->root->s.layout.w = 0;
- tree->root->s.layout.b = 0;
- // Create document node (html).
- fz_match_css(ctx, &match, &root_match, g.css, root);
- fz_apply_css_style(ctx, g.set, &style, &match);
- display = fz_get_css_match_display(&match);
- gen2_tag(ctx, &g, tree->root, root, &match, display, &style);
- detect_directionality(ctx, g.pool, tree->root);
- if (g.is_fb2)
- {
- node = fz_xml_find(root, "FictionBook");
- node = fz_xml_find_down(node, "description");
- node = fz_xml_find_down(node, "title-info");
- node = fz_xml_find_down(node, "book-title");
- if (rtitle)
- {
- title = fz_xml_text(fz_xml_down(node));
- if (title)
- *rtitle = fz_pool_strdup(ctx, g.pool, title);
- }
- }
- else
- {
- node = fz_xml_find(root, "html");
- node = fz_xml_find_down(node, "head");
- node = fz_xml_find_down(node, "title");
- if (rtitle)
- {
- title = fz_xml_text(fz_xml_down(node));
- if (title)
- *rtitle = fz_pool_strdup(ctx, g.pool, title);
- }
- // Move html or body background-color to :root.
- move_background_color_up(ctx, &g, tree->root);
- }
- }
- fz_always(ctx)
- {
- fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
- fz_drop_css(ctx, g.css);
- }
- fz_catch(ctx)
- {
- if (rtitle)
- {
- fz_free(ctx, *rtitle);
- *rtitle = NULL;
- }
- fz_rethrow(ctx);
- }
- }
- static const char *mobi_font_size[7] = {
- "0.67em",
- "0.83em",
- "1em",
- "1.17em",
- "1.33em",
- "1.5em",
- "1.67em",
- };
- static void
- patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node)
- {
- fz_xml *down;
- char buf[500];
- while (node)
- {
- char *tag = fz_xml_tag(node);
- if (tag)
- {
- // Read MOBI attributes, convert to inline CSS style
- if (!strcmp(tag, "font"))
- {
- const char *size = fz_xml_att(node, "size");
- if (size)
- {
- if (!strcmp(size, "1")) size = mobi_font_size[0];
- else if (!strcmp(size, "2")) size = mobi_font_size[1];
- else if (!strcmp(size, "3")) size = mobi_font_size[2];
- else if (!strcmp(size, "4")) size = mobi_font_size[3];
- else if (!strcmp(size, "5")) size = mobi_font_size[4];
- else if (!strcmp(size, "6")) size = mobi_font_size[5];
- else if (!strcmp(size, "7")) size = mobi_font_size[6];
- else if (!strcmp(size, "+1")) size = mobi_font_size[3];
- else if (!strcmp(size, "+2")) size = mobi_font_size[4];
- else if (!strcmp(size, "+3")) size = mobi_font_size[5];
- else if (!strcmp(size, "+4")) size = mobi_font_size[6];
- else if (!strcmp(size, "+5")) size = mobi_font_size[6];
- else if (!strcmp(size, "+6")) size = mobi_font_size[6];
- else if (!strcmp(size, "-1")) size = mobi_font_size[1];
- else if (!strcmp(size, "-2")) size = mobi_font_size[0];
- else if (!strcmp(size, "-3")) size = mobi_font_size[0];
- else if (!strcmp(size, "-4")) size = mobi_font_size[0];
- else if (!strcmp(size, "-5")) size = mobi_font_size[0];
- else if (!strcmp(size, "-6")) size = mobi_font_size[0];
- fz_snprintf(buf, sizeof buf, "font-size:%s", size);
- fz_xml_add_att(ctx, pool, node, "style", buf);
- }
- }
- else
- {
- char *height = fz_xml_att(node, "height");
- char *width = fz_xml_att(node, "width");
- char *align = fz_xml_att(node, "align");
- if (height || width || align)
- {
- buf[0] = 0;
- if (height)
- {
- fz_strlcat(buf, "margin-top:", sizeof buf);
- fz_strlcat(buf, height, sizeof buf);
- fz_strlcat(buf, ";", sizeof buf);
- }
- if (width)
- {
- fz_strlcat(buf, "text-indent:", sizeof buf);
- fz_strlcat(buf, width, sizeof buf);
- fz_strlcat(buf, ";", sizeof buf);
- }
- if (align)
- {
- fz_strlcat(buf, "text-align:", sizeof buf);
- fz_strlcat(buf, align, sizeof buf);
- fz_strlcat(buf, ";", sizeof buf);
- }
- fz_xml_add_att(ctx, pool, node, "style", buf);
- }
- if (!strcmp(tag, "img"))
- {
- char *recindex = fz_xml_att(node, "recindex");
- if (recindex)
- fz_xml_add_att(ctx, pool, node, "src", recindex);
- }
- }
- }
- down = fz_xml_down(node);
- if (down)
- patch_mobi_html(ctx, pool, down);
- node = fz_xml_next(node);
- }
- }
- static void
- fz_parse_html_tree(fz_context *ctx,
- fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
- int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi)
- {
- fz_xml_doc *xml;
- if (rtitle)
- *rtitle = NULL;
- xml = parse_to_xml(ctx, buf, try_xml, try_html5);
- if (patch_mobi)
- patch_mobi_html(ctx, xml->u.doc.pool, xml);
- fz_try(ctx)
- xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi);
- fz_always(ctx)
- fz_drop_xml(ctx, xml);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- #define fz_new_derived_html_tree(CTX, TYPE, DROP) \
- ((TYPE *)Memento_label(fz_new_html_tree_of_size(CTX, sizeof(TYPE), DROP), #TYPE))
- static fz_html_tree *
- fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop)
- {
- fz_pool *pool = fz_new_pool(ctx);
- fz_html_tree *tree;
- fz_try(ctx)
- {
- tree = fz_pool_alloc(ctx, pool, size);
- FZ_INIT_STORABLE(tree, 1, drop);
- tree->pool = pool;
- }
- fz_catch(ctx)
- {
- fz_drop_pool(ctx, pool);
- fz_rethrow(ctx);
- }
- return tree;
- }
- fz_html *
- fz_parse_html(fz_context *ctx,
- fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
- int try_xml, int try_html5, int patch_mobi)
- {
- fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp);
- html->layout_w = 0;
- html->layout_h = 0;
- html->layout_em = 0;
- fz_try(ctx)
- fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi);
- fz_catch(ctx)
- {
- fz_drop_html(ctx, html);
- fz_rethrow(ctx);
- }
- return html;
- }
- typedef struct
- {
- int saved;
- fz_warning_cb *old;
- void *arg;
- fz_buffer *buffer;
- fz_context *ctx;
- } warning_save;
- static void
- warn_to_buffer(void *user, const char *message)
- {
- warning_save *save = (warning_save *)user;
- fz_context *ctx = save->ctx;
- fz_try(ctx)
- {
- fz_append_string(ctx, save->buffer, message);
- fz_append_byte(ctx, save->buffer, '\n');
- }
- fz_catch(ctx)
- {
- /* Silently swallow the error. */
- fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
- fz_report_error(ctx);
- }
- }
- static void
- redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save)
- {
- save->saved = 1;
- save->old = fz_warning_callback(ctx, &save->arg);
- save->buffer = buf;
- save->ctx = ctx;
- fz_flush_warnings(ctx);
- fz_set_warning_callback(ctx, warn_to_buffer, save);
- }
- static void
- restore_warnings(fz_context *ctx, warning_save *save)
- {
- if (!save->saved)
- return;
- fz_flush_warnings(ctx);
- fz_set_warning_callback(ctx, save->old, save->arg);
- }
- fz_story *
- fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip)
- {
- fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp);
- warning_save saved = { 0 };
- fz_buffer *local_buffer = NULL;
- if (buf == NULL)
- {
- local_buffer = fz_new_buffer(ctx, 0);
- buf = local_buffer;
- }
- fz_var(local_buffer);
- fz_var(saved);
- fz_try(ctx)
- {
- story->zip = fz_keep_archive(ctx, zip);
- story->font_set = fz_new_html_font_set(ctx);
- story->em = em;
- story->user_css = user_css ? fz_strdup(ctx, user_css) : NULL;
- story->warnings = fz_new_buffer(ctx, 128);
- redirect_warnings_to_buffer(ctx, story->warnings, &saved);
- story->dom = parse_to_xml(ctx, buf, 0, 1);
- }
- fz_always(ctx)
- {
- restore_warnings(ctx, &saved);
- fz_drop_buffer(ctx, local_buffer);
- }
- fz_catch(ctx)
- {
- fz_drop_html_tree(ctx, &story->tree);
- fz_rethrow(ctx);
- }
- return story;
- }
- fz_html *
- fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
- {
- /* try as XML first, fall back to HTML5 */
- return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0);
- }
/* Write 'level' tab characters to stdout; nothing for level <= 0. */
static void indent(int level)
{
	for (; level > 0; level--)
		putchar('\t');
}
- static void
- fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level)
- {
- fz_html_box *sbox = NULL;
- while (flow)
- {
- if (flow->box != sbox) {
- sbox = flow->box;
- indent(level);
- #ifndef NDEBUG
- printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
- #else
- printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
- #endif
- if (fz_font_is_serif(ctx, sbox->style->font))
- printf(" serif");
- else
- printf(" sans");
- if (fz_font_is_monospaced(ctx, sbox->style->font))
- printf(" monospaced");
- if (fz_font_is_bold(ctx, sbox->style->font))
- printf(" bold");
- if (fz_font_is_italic(ctx, sbox->style->font))
- printf(" italic");
- if (sbox->style->small_caps)
- printf(" small-caps");
- printf("\n");
- }
- indent(level);
- switch (flow->type) {
- case FLOW_WORD: printf("word "); break;
- case FLOW_SPACE: printf("space"); break;
- case FLOW_SBREAK: printf("sbrk "); break;
- case FLOW_SHYPHEN: printf("shy "); break;
- case FLOW_BREAK: printf("break"); break;
- case FLOW_IMAGE: printf("image"); break;
- case FLOW_ANCHOR: printf("anchor"); break;
- }
- // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w);
- if (flow->type == FLOW_IMAGE)
- printf(" h=%g", flow->h);
- if (flow->type == FLOW_WORD)
- printf(" text='%s'", flow->content.text);
- printf("\n");
- if (flow->breaks_line) {
- indent(level);
- printf("*\n");
- }
- flow = flow->next;
- }
- }
- fz_structure fz_html_tag_to_structure(const char *tag)
- {
- if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT;
- if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV;
- if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN;
- if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE;
- if (!strcmp(tag, "p")) return FZ_STRUCTURE_P;
- if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1;
- if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2;
- if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3;
- if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4;
- if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5;
- if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6;
- if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST;
- if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST;
- if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST;
- if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM;
- if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE;
- if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR;
- if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH;
- if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD;
- if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD;
- if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY;
- if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT;
- return FZ_STRUCTURE_INVALID;
- }
- static void
- fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level)
- {
- while (box)
- {
- indent(level);
- printf("box ");
- switch (box->type) {
- case BOX_BLOCK: printf("block"); break;
- case BOX_FLOW: printf("flow"); break;
- case BOX_INLINE: printf("inline"); break;
- case BOX_TABLE: printf("table"); break;
- case BOX_TABLE_ROW: printf("table-row"); break;
- case BOX_TABLE_CELL: printf("table-cell"); break;
- }
- printf(" <%s>", box->tag);
- // printf(" em=%g", box->em);
- // printf(" x=%g y=%g w=%g b=%g", box->x, box->y, box->w, box->b);
- if (box->is_first_flow)
- printf(" is-first-flow");
- if (box->list_item)
- printf(" list=%d", box->list_item);
- if (box->id)
- printf(" id=(%s)", box->id);
- if (box->href)
- printf(" href=(%s)", box->href);
- printf("\n");
- if (box->type == BOX_BLOCK || box->type == BOX_TABLE) {
- indent(level+1);
- printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]);
- //indent(level+1);
- //printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]);
- //indent(level+1);
- //printf(">border=(%g %g %g %g)\n", box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3]);
- }
- if (box->down)
- fz_debug_html_box(ctx, box->down, level + 1);
- if (box->type == BOX_FLOW) {
- indent(level+1);
- printf("flow\n");
- fz_debug_html_flow(ctx, box->u.flow.head, level + 2);
- }
- box = box->next;
- }
- }
- void
- fz_debug_html(fz_context *ctx, fz_html_box *box)
- {
- fz_debug_html_box(ctx, box, 0);
- }
- static size_t
- fz_html_size(fz_context *ctx, fz_html *html)
- {
- return html ? fz_pool_size(ctx, html->tree.pool) : 0;
- }
/*
	Magic to make html storable.

	Store key for a parsed html tree: a document pointer paired with a
	chapter number.
*/
typedef struct {
	int refs; /* reference count, adjusted by fz_keep_html_key/fz_drop_html_key */
	void *doc; /* document pointer; keys compare this by address only */
	int chapter_num; /* chapter number that, with doc, identifies the entry */
} fz_html_key;
- static int
- fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_)
- {
- fz_html_key *key = (fz_html_key *)key_;
- hash->u.pi.ptr = key->doc;
- hash->u.pi.i = key->chapter_num;
- return 1;
- }
- static void *
- fz_keep_html_key(fz_context *ctx, void *key_)
- {
- fz_html_key *key = (fz_html_key *)key_;
- return fz_keep_imp(ctx, key, &key->refs);
- }
- static void
- fz_drop_html_key(fz_context *ctx, void *key_)
- {
- fz_html_key *key = (fz_html_key *)key_;
- if (fz_drop_imp(ctx, key, &key->refs))
- {
- fz_free(ctx, key);
- }
- }
- static int
- fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_)
- {
- fz_html_key *k0 = (fz_html_key *)k0_;
- fz_html_key *k1 = (fz_html_key *)k1_;
- return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num;
- }
- static void
- fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_)
- {
- fz_html_key *key = (fz_html_key *)key_;
- fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num);
- }
/*
	Store type for parsed html: bundles the type name with the
	fz_html_key callbacks for hashing, keeping, dropping, comparing
	and formatting keys.
*/
static const fz_store_type fz_html_store_type =
{
	"fz_html", /* type name */
	fz_make_hash_html_key, /* build the hash from doc + chapter */
	fz_keep_html_key, /* take a key reference */
	fz_drop_html_key, /* release a key reference */
	fz_cmp_html_key, /* key equality */
	fz_format_html_key, /* printable key description */
	NULL /* last slot left unset; NOTE(review): confirm its role against the fz_store_type declaration */
};
- fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter)
- {
- fz_html_key *key = NULL;
- fz_html *other_html;
- /* Stick the parsed html in the store */
- fz_var(key);
- fz_try(ctx)
- {
- key = fz_malloc_struct(ctx, fz_html_key);
- key->refs = 1;
- key->doc = doc;
- key->chapter_num = chapter;
- other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type);
- if (other_html)
- {
- fz_drop_html(ctx, html);
- html = other_html;
- }
- }
- fz_always(ctx)
- fz_drop_html_key(ctx, key);
- fz_catch(ctx)
- {
- /* Do nothing */
- }
- return html;
- }
- fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter)
- {
- fz_html_key key;
- key.refs = 1;
- key.doc = doc;
- key.chapter_num = chapter;
- return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type);
- }
- static int
- html_filter_store(fz_context *ctx, void *doc, void *key_)
- {
- fz_html_key *key = (fz_html_key *)key_;
- return (doc == key->doc);
- }
- void fz_purge_stored_html(fz_context *ctx, void *doc)
- {
- fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type);
- }
- static void
- convert_to_boxes(fz_context *ctx, fz_story *story)
- {
- warning_save saved = { 0 };
- if (story->dom == NULL)
- return;
- fz_var(saved);
- fz_try(ctx)
- {
- redirect_warnings_to_buffer(ctx, story->warnings, &saved);
- xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0);
- }
- fz_always(ctx)
- {
- fz_drop_xml(ctx, story->dom);
- story->dom = NULL;
- restore_warnings(ctx, &saved);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled)
- {
- return fz_place_story_flags(ctx, story, where, filled, 0);
- }
- int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags)
- {
- float w, h;
- if (filled)
- *filled = fz_empty_rect;
- if (story == NULL || story->complete)
- return 0;
- /* Convert from XML to box model on the first attempt to place.
- * The DOM is unusable from here on in. */
- convert_to_boxes(ctx, story);
- w = where.x1 - where.x0;
- h = where.y1 - where.y0;
- /* Confusingly, we call the layout using restart_draw, not restart_place,
- * because we don't want to destroy the current values in restart_place
- * in case we have to retry later. This means the values are left in
- * the correct struct though! */
- story->restart_draw.start = story->restart_place.start;
- story->restart_draw.start_flow = story->restart_place.start_flow;
- story->restart_draw.end = NULL;
- story->restart_draw.end_flow = NULL;
- story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE;
- story->restart_draw.flags = flags;
- story->bbox = where;
- fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw);
- story->restart_draw.start = story->restart_place.start;
- story->restart_draw.start_flow = story->restart_place.start_flow;
- if (filled)
- {
- fz_html_box *b = story->tree.root;
- filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L];
- filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x;
- filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T];
- filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B];
- }
- #ifndef NDEBUG
- if (fz_atoi(getenv("FZ_DEBUG_HTML")))
- fz_debug_html(ctx, story->tree.root);
- #endif
- if (story->restart_draw.end == NULL)
- return FZ_HTML_RESTART_REASON_NONE;
- if (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH)
- return FZ_HTML_RESTART_REASON_LINE_WIDTH;
- return FZ_HTML_RESTART_REASON_LINE_HEIGHT;
- }
- const char *
- fz_story_warnings(fz_context *ctx, fz_story *story)
- {
- unsigned char *data;
- if (!story)
- return NULL;
- convert_to_boxes(ctx, story);
- fz_terminate_buffer(ctx, story->warnings);
- if (fz_buffer_storage(ctx, story->warnings, &data) == 0)
- return NULL;
- return (const char *)data;
- }
|