| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405 |
- // Copyright (C) 2004-2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "xml-imp.h"
- #include <string.h>
- #include <stdlib.h>
- #include <stdio.h>
- #if FZ_ENABLE_HTML_ENGINE
- #include <gumbo.h>
- #endif
- #define FZ_XML_MAX_DEPTH 4096
- /* #define FZ_XML_SEQ */
/*
 * Named character references recognised by xml_parse_entity, mapping an
 * entity name (without the surrounding '&' and ';') to its code point.
 */
static const struct { const char *name; int c; } html_entities[] = {
	/* Latin-1 supplement, U+00A0..U+00FF */
	{ "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 },
	{ "curren", 164 }, { "yen", 165 }, { "brvbar", 166 }, { "sect", 167 },
	{ "uml", 168 }, { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 },
	{ "not", 172 }, { "shy", 173 }, { "reg", 174 }, { "macr", 175 },
	{ "deg", 176 }, { "plusmn", 177 }, { "sup2", 178 }, { "sup3", 179 },
	{ "acute", 180 }, { "micro", 181 }, { "para", 182 }, { "middot", 183 },
	{ "cedil", 184 }, { "sup1", 185 }, { "ordm", 186 }, { "raquo", 187 },
	{ "frac14", 188 }, { "frac12", 189 }, { "frac34", 190 }, { "iquest", 191 },
	{ "Agrave", 192 }, { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 },
	{ "Auml", 196 }, { "Aring", 197 }, { "AElig", 198 }, { "Ccedil", 199 },
	{ "Egrave", 200 }, { "Eacute", 201 }, { "Ecirc", 202 }, { "Euml", 203 },
	{ "Igrave", 204 }, { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 },
	{ "ETH", 208 }, { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 },
	{ "Ocirc", 212 }, { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 },
	{ "Oslash", 216 }, { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 },
	{ "Uuml", 220 }, { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 },
	{ "agrave", 224 }, { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 },
	{ "auml", 228 }, { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 },
	{ "egrave", 232 }, { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 },
	{ "igrave", 236 }, { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 },
	{ "eth", 240 }, { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 },
	{ "ocirc", 244 }, { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 },
	{ "oslash", 248 }, { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 },
	{ "uuml", 252 }, { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },

	/* Markup-significant characters */
	{ "lt", 60 }, { "gt", 62 }, { "amp", 38 }, { "apos", 39 }, { "quot", 34 },

	/* Extended Latin, modifiers, spacing and punctuation */
	{ "OElig", 338 }, { "oelig", 339 }, { "Scaron", 352 }, { "scaron", 353 },
	{ "Yuml", 376 }, { "circ", 710 }, { "tilde", 732 },
	{ "ensp", 8194 }, { "emsp", 8195 }, { "thinsp", 8201 },
	{ "zwnj", 8204 }, { "zwj", 8205 }, { "lrm", 8206 }, { "rlm", 8207 },
	{ "ndash", 8211 }, { "mdash", 8212 },
	{ "lsquo", 8216 }, { "rsquo", 8217 }, { "sbquo", 8218 },
	{ "ldquo", 8220 }, { "rdquo", 8221 }, { "bdquo", 8222 },
	{ "dagger", 8224 }, { "Dagger", 8225 }, { "permil", 8240 },
	{ "lsaquo", 8249 }, { "rsaquo", 8250 }, { "euro", 8364 },

	/* Latin small f with hook */
	{ "fnof", 402 },

	/* Greek capitals */
	{ "Alpha", 913 }, { "Beta", 914 }, { "Gamma", 915 }, { "Delta", 916 },
	{ "Epsilon", 917 }, { "Zeta", 918 }, { "Eta", 919 }, { "Theta", 920 },
	{ "Iota", 921 }, { "Kappa", 922 }, { "Lambda", 923 }, { "Mu", 924 },
	{ "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 }, { "Pi", 928 },
	{ "Rho", 929 }, { "Sigma", 931 }, { "Tau", 932 }, { "Upsilon", 933 },
	{ "Phi", 934 }, { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 },

	/* Greek smalls and variant forms */
	{ "alpha", 945 }, { "beta", 946 }, { "gamma", 947 }, { "delta", 948 },
	{ "epsilon", 949 }, { "zeta", 950 }, { "eta", 951 }, { "theta", 952 },
	{ "iota", 953 }, { "kappa", 954 }, { "lambda", 955 }, { "mu", 956 },
	{ "nu", 957 }, { "xi", 958 }, { "omicron", 959 }, { "pi", 960 },
	{ "rho", 961 }, { "sigmaf", 962 }, { "sigma", 963 }, { "tau", 964 },
	{ "upsilon", 965 }, { "phi", 966 }, { "chi", 967 }, { "psi", 968 },
	{ "omega", 969 }, { "thetasym", 977 }, { "upsih", 978 }, { "piv", 982 },

	/* General punctuation and letterlike symbols */
	{ "bull", 8226 }, { "hellip", 8230 }, { "prime", 8242 }, { "Prime", 8243 },
	{ "oline", 8254 }, { "frasl", 8260 }, { "weierp", 8472 }, { "image", 8465 },
	{ "real", 8476 }, { "trade", 8482 }, { "alefsym", 8501 },

	/* Arrows */
	{ "larr", 8592 }, { "uarr", 8593 }, { "rarr", 8594 }, { "darr", 8595 },
	{ "harr", 8596 }, { "crarr", 8629 }, { "lArr", 8656 }, { "uArr", 8657 },
	{ "rArr", 8658 }, { "dArr", 8659 }, { "hArr", 8660 },

	/* Mathematical operators */
	{ "forall", 8704 }, { "part", 8706 }, { "exist", 8707 }, { "empty", 8709 },
	{ "nabla", 8711 }, { "isin", 8712 }, { "notin", 8713 }, { "ni", 8715 },
	{ "prod", 8719 }, { "sum", 8721 }, { "minus", 8722 }, { "lowast", 8727 },
	{ "radic", 8730 }, { "prop", 8733 }, { "infin", 8734 }, { "ang", 8736 },
	{ "and", 8743 }, { "or", 8744 }, { "cap", 8745 }, { "cup", 8746 },
	{ "int", 8747 }, { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 },
	{ "asymp", 8776 }, { "ne", 8800 }, { "equiv", 8801 }, { "le", 8804 },
	{ "ge", 8805 }, { "sub", 8834 }, { "sup", 8835 }, { "nsub", 8836 },
	{ "sube", 8838 }, { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 },
	{ "perp", 8869 }, { "sdot", 8901 },

	/* Technical and geometric symbols, card suits */
	{ "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 }, { "rfloor", 8971 },
	{ "lang", 9001 }, { "rang", 9002 }, { "loz", 9674 },
	{ "spades", 9824 }, { "clubs", 9827 }, { "hearts", 9829 }, { "diams", 9830 },
};
- struct parser
- {
- fz_pool *pool;
- fz_xml *head;
- int preserve_white;
- int depth;
- #ifdef FZ_XML_SEQ
- int seq;
- #endif
- };
- static void xml_indent(fz_context *ctx, fz_output *out, int n)
- {
- while (n--) {
- fz_write_byte(ctx, out, ' ');
- fz_write_byte(ctx, out, ' ');
- }
- }
- void fz_debug_xml(fz_xml *item, int level)
- {
- /* This is a bit nasty as it relies on implementation
- * details of both fz_stdout, and fz_write_printf coping
- * with NULL ctx. */
- fz_output_xml(NULL, fz_stdout(NULL), item, level);
- }
- void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level)
- {
- char *s;
- if (item == NULL)
- return;
- /* Skip over the DOC object at the top. */
- if (item->up == NULL)
- {
- fz_xml *child;
- for (child = fz_xml_down(item); child; child = child->u.node.next)
- fz_output_xml(ctx, out, child, level + 1);
- return;
- }
- s = fz_xml_text(item);
- xml_indent(ctx, out, level);
- if (s)
- {
- int c;
- fz_write_byte(ctx, out, '"');
- while (*s) {
- s += fz_chartorune(&c, s);
- switch (c) {
- default:
- if (c > 0xFFFF)
- fz_write_printf(ctx, out, "\\u{%X}", c);
- else if (c < 32 || c > 127)
- fz_write_printf(ctx, out, "\\u%04X", c);
- else
- fz_write_byte(ctx, out, c);
- break;
- case '\\': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, '\\'); break;
- case '\b': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'b'); break;
- case '\f': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'f'); break;
- case '\n': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'n'); break;
- case '\r': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'r'); break;
- case '\t': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 't'); break;
- }
- }
- fz_write_byte(ctx, out, '"');
- #ifdef FZ_XML_SEQ
- fz_write_printf(ctx, out, " <%d>", item->seq);
- #endif
- fz_write_byte(ctx, out, '\n');
- }
- else
- {
- fz_xml *child;
- struct attribute *att;
- #ifdef FZ_XML_SEQ
- fz_write_printf(ctx, out, "(%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
- #else
- fz_write_printf(ctx, out, "(%s\n", item->u.node.u.d.name);
- #endif
- for (att = item->u.node.u.d.atts; att; att = att->next)
- {
- xml_indent(ctx, out, level);
- fz_write_printf(ctx, out, "=%s %s\n", att->name, att->value);
- }
- for (child = fz_xml_down(item); child; child = child->u.node.next)
- fz_output_xml(ctx, out, child, level + 1);
- xml_indent(ctx, out, level);
- #ifdef FZ_XML_SEQ
- fz_write_printf(ctx, out, ")%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
- #else
- fz_write_printf(ctx, out, ")%s\n", item->u.node.u.d.name);
- #endif
- }
- }
- fz_xml *fz_xml_prev(fz_xml *item)
- {
- return item && item->up ? item->u.node.prev : NULL;
- }
- fz_xml *fz_xml_next(fz_xml *item)
- {
- return item && item->up ? item->u.node.next : NULL;
- }
- fz_xml *fz_xml_up(fz_xml *item)
- {
- /* Never step up to the DOC. */
- return item && item->up && item->up->up ? item->up : NULL;
- }
- fz_xml *fz_xml_down(fz_xml *item)
- {
- /* DOC items can never have MAGIC_TEXT as their down value,
- * so this is safe. */
- return item && !FZ_TEXT_ITEM(item) ? item->down : NULL;
- }
- char *fz_xml_text(fz_xml *item)
- {
- /* DOC items can never have MAGIC_TEXT as their down value,
- * so this is safe. */
- return (item && FZ_TEXT_ITEM(item)) ? item->u.node.u.text : NULL;
- }
- char *fz_xml_tag(fz_xml *item)
- {
- /* DOC items can never have MAGIC_TEXT as their down value,
- * so this is safe. */
- return item && !FZ_TEXT_ITEM(item) ? item->u.node.u.d.name : NULL;
- }
- int fz_xml_is_tag(fz_xml *item, const char *name)
- {
- if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
- return 0;
- return !strcmp(item->u.node.u.d.name, name);
- }
- char *fz_xml_att(fz_xml *item, const char *name)
- {
- struct attribute *att;
- if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
- return NULL;
- for (att = item->u.node.u.d.atts; att; att = att->next)
- if (!strcmp(att->name, name))
- return att->value;
- return NULL;
- }
- char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two)
- {
- char *val = fz_xml_att(item, one);
- if (!val)
- val = fz_xml_att(item, two);
- return val;
- }
- fz_xml *fz_xml_find(fz_xml *item, const char *tag)
- {
- /* Skip over any DOC item. */
- if (item && FZ_DOCUMENT_ITEM(item))
- item = item->down;
- while (item)
- {
- if (!FZ_TEXT_ITEM(item) && !strcmp(item->u.node.u.d.name, tag))
- return item;
- item = item->u.node.next;
- }
- return NULL;
- }
- fz_xml *fz_xml_find_next(fz_xml *item, const char *tag)
- {
- /* Skip over any DOC item. */
- if (item && FZ_DOCUMENT_ITEM(item))
- item = item->down;
- if (item)
- item = item->u.node.next;
- return fz_xml_find(item, tag);
- }
- fz_xml *fz_xml_find_down(fz_xml *item, const char *tag)
- {
- if (item)
- item = fz_xml_down(item);
- return fz_xml_find(item, tag);
- }
- int fz_xml_att_eq(fz_xml *item, const char *name, const char *match)
- {
- const char *val = fz_xml_att(item, name);
- return val ? !strcmp(val, match) : 0;
- }
- fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match)
- {
- /* Skip over any document item. */
- if (item && FZ_DOCUMENT_ITEM(item))
- item = item->down;
- while (1)
- {
- item = tag ? fz_xml_find(item, tag) : item;
- if (item == NULL || fz_xml_att_eq(item, att, match))
- break;
- item = item->u.node.next;
- }
- return item;
- }
- fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match)
- {
- /* Skip over any document item. */
- if (item && FZ_DOCUMENT_ITEM(item))
- item = item->down;
- if (item != NULL)
- {
- do
- {
- item = tag ? fz_xml_find_next(item, tag) : item->u.node.next;
- }
- while (item != NULL && !fz_xml_att_eq(item, att, match));
- }
- return item;
- }
- fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match)
- {
- return fz_xml_find_match(fz_xml_down(item), tag, att, match);
- }
- fz_xml *fz_xml_root(fz_xml *xml)
- {
- if (xml == NULL)
- return NULL;
- /* If we've been given a node mid-tree, run up to the root to find
- * the doc node. */
- while (xml->up)
- xml = xml->up;
- /* And the root is the child of the doc.*/
- return xml->down;
- }
- void fz_drop_xml(fz_context *ctx, fz_xml *xml)
- {
- if (!xml)
- return;
- /* Wherever we are in the tree, we want the doc node at the root. */
- while (xml->up)
- xml = xml->up;
- /* Drop a reference to the tree as a whole. */
- if (fz_drop_imp(ctx, xml, &xml->u.doc.refs) == 0)
- return;
- fz_drop_pool(ctx, xml->u.doc.pool);
- }
- void fz_detach_xml(fz_context *ctx, fz_xml *node)
- {
- fz_xml *doc = node;
- /* If we're already a document node, then this is a NOP. */
- if (doc->up == NULL)
- return;
- /* Move doc to be the doc pointer at the top of the tree. */
- while (doc->up)
- {
- doc = doc->up;
- }
- /* Relocate node to be the child of doc. */
- node->up->down = NULL;
- doc->down = node;
- /* NOTE: Suppose that X = doc->down on entry. On exit doc->down == node, but
- * X->up = doc. We need to be careful throughout this code to not assume that
- * Y is always a child of Y->up. */
- }
- size_t xml_parse_entity(int *c, const char *a)
- {
- char *b;
- size_t i;
- if (a[1] == '#') {
- if (a[2] == 'x')
- *c = strtol(a + 3, &b, 16);
- else
- *c = strtol(a + 2, &b, 10);
- if (*b == ';')
- return b - a + 1;
- }
- else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
- *c = '<';
- return 4;
- }
- else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
- *c = '>';
- return 4;
- }
- else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
- *c = '&';
- return 5;
- }
- else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
- *c = '\'';
- return 6;
- }
- else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
- *c = '"';
- return 6;
- }
- /* We should only be doing this for XHTML, but it shouldn't be a problem. */
- for (i = 0; i < nelem(html_entities); ++i) {
- size_t n = strlen(html_entities[i].name);
- if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') {
- *c = html_entities[i].c;
- return n + 2;
- }
- }
- *c = *a;
- return 1;
- }
/*
 * Return 1 if c may appear in a tag or attribute name: an ASCII letter
 * or digit, or one of '.', '-', '_' and ':'. Return 0 otherwise.
 */
static inline int isname(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;

	switch (c)
	{
	case '.':
	case '-':
	case '_':
	case ':':
		return 1;
	default:
		return 0;
	}
}
/* Return 1 if c is a space, carriage return, newline or tab; 0 otherwise. */
static inline int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\r':
	case '\n':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
- static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, const char *a, const char *b, int is_text)
- {
- fz_xml *head, *tail;
- const char *ns;
- size_t size;
- if (is_text)
- size = offsetof(fz_xml, u.node.u.text) + b-a+1;
- else
- {
- /* skip namespace prefix */
- for (ns = a; ns < b - 1; ++ns)
- if (*ns == ':')
- a = ns + 1;
- size = offsetof(fz_xml, u.node.u.d.name) + b-a+1;
- }
- head = fz_pool_alloc(ctx, parser->pool, size);
- if (is_text)
- head->down = MAGIC_TEXT;
- else
- {
- memcpy(head->u.node.u.d.name, a, b - a);
- head->u.node.u.d.name[b - a] = 0;
- head->u.node.u.d.atts = NULL;
- head->down = NULL;
- }
- head->up = parser->head;
- head->u.node.next = NULL;
- #ifdef FZ_XML_SEQ
- head->u.node.seq = parser->seq++;
- #endif
- /* During construction, we use head->next to mean "the
- * tail of the children. When we close the tag, we
- * rewrite it to be NULL. */
- if (!parser->head->down) {
- parser->head->down = head;
- parser->head->u.node.next = head;
- head->u.node.prev = NULL;
- }
- else {
- tail = parser->head->u.node.next;
- tail->u.node.next = head;
- head->u.node.prev = tail;
- parser->head->u.node.next = head;
- }
- parser->head = head;
- parser->depth++;
- if (parser->depth >= FZ_XML_MAX_DEPTH)
- fz_throw(ctx, FZ_ERROR_SYNTAX, "too deep xml element nesting");
- }
- static void xml_emit_att_name(fz_context *ctx, struct parser *parser, const char *a, const char *b)
- {
- fz_xml *head = parser->head;
- struct attribute *att;
- size_t size;
- size = offsetof(struct attribute, name) + b-a+1;
- att = fz_pool_alloc(ctx, parser->pool, size);
- memcpy(att->name, a, b - a);
- att->name[b - a] = 0;
- att->value = NULL;
- att->next = head->u.node.u.d.atts;
- head->u.node.u.d.atts = att;
- }
- void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val)
- {
- size_t size = offsetof(struct attribute, name) + strlen(key) + 1;
- struct attribute *att = fz_pool_alloc(ctx, pool, size);
- memcpy(att->name, key, strlen(key)+1);
- att->value = fz_pool_alloc(ctx, pool, strlen(val) + 1);
- memcpy(att->value, val, strlen(val)+1);
- att->next = node->u.node.u.d.atts;
- node->u.node.u.d.atts = att;
- }
- static void xml_emit_att_value(fz_context *ctx, struct parser *parser, const char *a, const char *b)
- {
- fz_xml *head = parser->head;
- struct attribute *att = head->u.node.u.d.atts;
- char *s;
- int c;
- /* entities are all longer than UTFmax so runetochar is safe */
- s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1);
- while (a < b) {
- if (*a == '&') {
- a += xml_parse_entity(&c, a);
- s += fz_runetochar(s, c);
- }
- else {
- *s++ = *a++;
- }
- }
- *s = 0;
- }
- static void xml_emit_close_tag(fz_context *ctx, struct parser *parser)
- {
- parser->depth--;
- parser->head->u.node.next = NULL;
- if (parser->head->up)
- parser->head = parser->head->up;
- }
- static void xml_emit_text(fz_context *ctx, struct parser *parser, const char *a, const char *b)
- {
- fz_xml *head;
- const char *p;
- char *s;
- int c;
- /* Skip text outside the root tag */
- if (parser->depth == 0)
- return;
- /* Skip all-whitespace text nodes */
- if (!parser->preserve_white)
- {
- for (p = a; p < b; p++)
- if (!iswhite(*p))
- break;
- if (p == b)
- return;
- }
- xml_emit_open_tag(ctx, parser, a, b, 1);
- head = parser->head;
- /* entities are all longer than UTFmax so runetochar is safe */
- s = fz_xml_text(head);
- while (a < b) {
- if (*a == '&') {
- a += xml_parse_entity(&c, a);
- s += fz_runetochar(s, c);
- }
- else {
- *s++ = *a++;
- }
- }
- *s = 0;
- xml_emit_close_tag(ctx, parser);
- }
- static void xml_emit_cdata(fz_context *ctx, struct parser *parser, const char *a, const char *b)
- {
- fz_xml *head;
- char *s;
- xml_emit_open_tag(ctx, parser, a, b, 1);
- head = parser->head;
- s = head->u.node.u.text;
- while (a < b)
- *s++ = *a++;
- *s = 0;
- xml_emit_close_tag(ctx, parser);
- }
- static int close_tag(fz_context *ctx, struct parser *parser, const char *mark, const char *p)
- {
- const char *ns, *tag;
- /* skip namespace prefix */
- for (ns = mark; ns < p - 1; ++ns)
- if (*ns == ':')
- mark = ns + 1;
- tag = fz_xml_tag(parser->head);
- if (tag && strncmp(tag, mark, p-mark) == 0 && tag[p-mark] == 0)
- {
- xml_emit_close_tag(ctx, parser);
- return 0;
- }
- return 1;
- }
- static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, const char *p) /* lgtm [cpp/use-of-goto] */
- {
- const char *mark;
- int quote;
- parse_text:
- mark = p;
- while (*p && *p != '<') ++p;
- if (*p == '<') {
- if (mark < p)
- xml_emit_text(ctx, parser, mark, p);
- ++p;
- goto parse_element;
- } else if (mark < p)
- xml_emit_text(ctx, parser, mark, p);
- return NULL;
- parse_element:
- if (*p == '/') { ++p; goto parse_closing_element; }
- if (*p == '!') { ++p; goto parse_comment; }
- if (*p == '?') { ++p; goto parse_processing_instruction; }
- while (iswhite(*p)) ++p;
- if (isname(*p))
- goto parse_element_name;
- return "syntax error in element";
- parse_comment:
- if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E')
- goto parse_declaration;
- if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y')
- goto parse_declaration;
- if (*p == '[') goto parse_cdata;
- if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
- if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
- while (*p) {
- if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
- p += 3;
- goto parse_text;
- }
- ++p;
- }
- return "end of data in comment";
- parse_declaration:
- while (*p) if (*p++ == '>') goto parse_text;
- return "end of data in declaration";
- parse_cdata:
- if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
- return "syntax error in CDATA section";
- p += 7;
- mark = p;
- while (*p) {
- if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
- xml_emit_cdata(ctx, parser, mark, p);
- p += 3;
- goto parse_text;
- }
- ++p;
- }
- return "end of data in CDATA section";
- parse_processing_instruction:
- while (*p) {
- if (p[0] == '?' && p[1] == '>') {
- p += 2;
- goto parse_text;
- }
- ++p;
- }
- return "end of data in processing instruction";
- parse_closing_element:
- while (iswhite(*p)) ++p;
- mark = p;
- while (isname(*p)) ++p;
- if (!isname(*mark))
- return "syntax error in closing element";
- if (close_tag(ctx, parser, mark, p))
- return "opening and closing tag mismatch";
- while (iswhite(*p)) ++p;
- if (*p != '>')
- return "syntax error in closing element";
- ++p;
- goto parse_text;
- parse_element_name:
- mark = p;
- while (isname(*p)) ++p;
- xml_emit_open_tag(ctx, parser, mark, p, 0);
- if (*p == '>') {
- ++p;
- goto parse_text;
- }
- if (p[0] == '/' && p[1] == '>') {
- xml_emit_close_tag(ctx, parser);
- p += 2;
- goto parse_text;
- }
- if (iswhite(*p))
- goto parse_attributes;
- return "syntax error after element name";
- parse_attributes:
- while (iswhite(*p)) ++p;
- if (isname(*p))
- goto parse_attribute_name;
- if (*p == '>') {
- ++p;
- goto parse_text;
- }
- if (p[0] == '/' && p[1] == '>') {
- xml_emit_close_tag(ctx, parser);
- p += 2;
- goto parse_text;
- }
- return "syntax error in attributes";
- parse_attribute_name:
- mark = p;
- while (isname(*p)) ++p;
- xml_emit_att_name(ctx, parser, mark, p);
- while (iswhite(*p)) ++p;
- if (*p == '=') { ++p; goto parse_attribute_value; }
- return "syntax error after attribute name";
- parse_attribute_value:
- while (iswhite(*p)) ++p;
- quote = *p++;
- mark = p;
- /* special case for handling MOBI filepos=00000 syntax */
- if (quote >= '0' && quote <= '9') {
- while (*p >= '0' && *p <= '9') ++p;
- xml_emit_att_value(ctx, parser, mark, p);
- goto parse_attributes;
- }
- if (quote != '"' && quote != '\'')
- return "missing quote character";
- while (*p && *p != quote) ++p;
- if (*p == quote) {
- xml_emit_att_value(ctx, parser, mark, p++);
- goto parse_attributes;
- }
- return "end of data in attribute value";
- }
/* Map 'A'..'Z' to 'a'..'z'; every other value is returned unchanged. */
static int fast_tolower(int c)
{
	if (c >= 'A' && c <= 'Z')
		return c + ('a' - 'A');
	return c;
}
/*
 * Compare at most n characters of a and b, ignoring the case of ASCII
 * letters. Returns zero if they match, otherwise the difference between
 * the first pair of lowercased characters that differ.
 */
static int fast_strncasecmp(const char *a, const char *b, size_t n)
{
	if (n == 0)
		return 0;

	/* Walk the first n - 1 positions; the last one compared is settled below. */
	for (--n; n > 0; --n)
	{
		if (*a == 0 || *b == 0)
			break;
		if (fast_tolower(*a) != fast_tolower(*b))
			break;
		++a;
		++b;
	}

	return fast_tolower(*a) - fast_tolower(*b);
}
/*
 * Find the first occurrence of n in h, ignoring the case of ASCII
 * letters. Returns a pointer into h, or NULL if there is none.
 * n must not be empty: its first character is consumed before the
 * length of the remainder is measured.
 */
static char *fast_strcasestr(char *h, char *n)
{
	int first = fast_tolower(*n);
	char *rest = n + 1;
	size_t rest_len = strlen(rest);
	char *cur;

	for (cur = h; *cur != 0; ++cur)
	{
		if (fast_tolower(*cur) != first)
			continue;
		if (fast_strncasecmp(cur + 1, rest, rest_len) == 0)
			return cur;
	}

	return NULL;
}
/* Return non-zero if a begins with b, ignoring the case of ASCII letters. */
static int startswith(const char *a, const char *b)
{
	size_t prefix_len = strlen(b);

	return fast_strncasecmp(a, b, prefix_len) == 0;
}
/*
 * Encoding labels and the decoder name each one selects.
 * See https://encoding.spec.whatwg.org/#names-and-labels
 *
 * Rows are tried in order and a label matches as a case-insensitive
 * prefix, so a label listed twice only ever takes effect through its
 * first row.
 */
static struct { char *encoding; char *alias; } encoding_aliases[] = {
	{ "big5", "big5" },
	{ "big5", "big5-hkscs" },
	{ "big5", "cn-big5" },
	{ "big5", "csbig5" },
	{ "big5", "x-x-big5" },
	{ "euc-cn", "euc-cn" },
	{ "euc-jp", "cseucpkdfmtjapanese" },
	{ "euc-jp", "euc-jp" },
	{ "euc-jp", "x-euc-jp" },
	{ "euc-kr", "cseuckr" },
	{ "euc-kr", "csksc56011987" },
	{ "euc-kr", "euc-kr" },
	{ "euc-kr", "iso-ir-149" },
	{ "euc-kr", "korean" },
	{ "euc-kr", "ks_c_5601" },
	{ "euc-kr", "ksc5601" },
	{ "euc-kr", "ksc_5601" },
	{ "euc-kr", "windows-949" },
	{ "euc-tw", "euc-tw" },
	{ "gb18030", "chinese" },
	{ "gb18030", "csgb2312" },
	{ "gb18030", "csiso58gb231280" },
	{ "gb18030", "gb18030" },
	{ "gb18030", "gb2312" },
	{ "gb18030", "gb_2312" },
	{ "gb18030", "gbk" },
	{ "gb18030", "iso-ir-58" },
	{ "gb18030", "x-gbk" },
	{ "iso-8859-1", "ascii" },
	{ "iso-8859-1", "iso-8859-1" },
	{ "iso-8859-1", "iso8859-1" },
	{ "iso-8859-1", "latin1" },
	{ "iso-8859-1", "us-ascii" },
	{ "iso-8859-7", "greek" },
	{ "iso-8859-7", "greek8" },
	/* These two rows used to repeat the iso-8859-1 labels, which made them
	 * unreachable and left the encoding's own name unrecognised. */
	{ "iso-8859-7", "iso-8859-7" },
	{ "iso-8859-7", "iso8859-7" },
	{ "koi8-r", "koi" },
	{ "koi8-r", "koi8" },
	{ "koi8-r", "koi8-r" },
	{ "koi8-r", "koi8-ru" },
	{ "koi8-r", "koi8-u" },
	{ "koi8-r", "koi8_r" },
	{ "shift_jis", "csshiftjis" },
	{ "shift_jis", "ms932" },
	{ "shift_jis", "ms_kanji" },
	{ "shift_jis", "shift-jis" },
	{ "shift_jis", "shift_jis" },
	{ "shift_jis", "sjis" },
	{ "shift_jis", "windows-31j" },
	{ "shift_jis", "x-sjis" },
	{ "windows-1250", "cp1250" },
	{ "windows-1250", "windows-1250" },
	{ "windows-1251", "cp1251" },
	{ "windows-1251", "windows-1251" },
	{ "windows-1252", "cp1252" },
	{ "windows-1252", "cp819" },
	{ "windows-1252", "windows-1252" },
};
- static char *match_encoding_name(char *enc)
- {
- size_t i;
- for (i = 0; i < nelem(encoding_aliases); ++i)
- if (startswith(enc, encoding_aliases[i].alias))
- return encoding_aliases[i].encoding;
- return NULL;
- }
// Look for encoding in <meta http-equiv="content-type" content="text/html; charset=XXX"> tags
//
// Each "<meta" tag is cut off at its closing '>' while it is examined
// (the '>' is put back afterwards), so the buffer must be writable.
// Returns the matched decoder name, or NULL if no tag names a known one.
static const char *find_meta_encoding(char *s)
{
	const char *found = NULL;
	char *tag;

	for (tag = fast_strcasestr(s, "<meta"); tag && !found; tag = fast_strcasestr(tag + 5, "<meta"))
	{
		char *close = strchr(tag, '>');
		char *charset;

		if (close == NULL)
			continue;

		*close = 0;
		if (fast_strcasestr(tag, "http-equiv") && fast_strcasestr(tag, "content-type"))
		{
			charset = fast_strcasestr(tag, "charset=");
			if (charset)
				found = match_encoding_name(charset + 8);
		}
		*close = '>';
	}

	return found;
}
/*
 * Work out which decoder the document in s asks for.
 *
 * An "<?xml" declaration found before the first '>' is checked first
 * for an encoding= pseudo-attribute. If that yields nothing,
 * find_meta_encoding is tried. The buffer is modified temporarily (and
 * restored), so it must be writable. Returns NULL if no known encoding
 * is named.
 */
static const char *find_xml_encoding(char *s)
{
	const char *found = NULL;
	char *close = strchr(s, '>');

	if (close != NULL)
	{
		char *decl, *attr;

		/* Confine the search to the text before the first '>'. */
		*close = 0;
		decl = strstr(s, "<?xml");
		attr = decl ? strstr(decl, "encoding=") : NULL;
		if (attr != NULL)
		{
			/* Skip "encoding=" and the single character after it. */
			found = match_encoding_name(attr + 10);
		}
		*close = '>';
	}

	if (found == NULL)
		found = find_meta_encoding(s);

	return found;
}
- static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree)
- {
- fz_text_decoder dec;
- const char *enc;
- const unsigned char *e = s + n;
- char *dst, *d;
- int m;
- int c;
- if (s[0] == 0xFE && s[1] == 0xFF) {
- s += 2;
- dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_be");
- while (s + 1 < e) {
- c = s[0] << 8 | s[1];
- d += fz_runetochar(d, c);
- s += 2;
- }
- *d = 0;
- *dofree = 1;
- return dst;
- }
- if (s[0] == 0xFF && s[1] == 0xFE) {
- s += 2;
- dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_le");
- while (s + 1 < e) {
- c = s[0] | s[1] << 8;
- d += fz_runetochar(d, c);
- s += 2;
- }
- *d = 0;
- *dofree = 1;
- return dst;
- }
- enc = find_xml_encoding((char*)s);
- if (enc)
- {
- fz_init_text_decoder(ctx, &dec, enc);
- // NOTE: use decode_size if memory is more important than speed
- m = (int)dec.decode_bound(&dec, s, (int)n);
- dst = Memento_label(fz_malloc(ctx, m), "utf8");
- dec.decode(&dec, dst, s, (int)n);
- *dofree = 1;
- return dst;
- }
- *dofree = 0;
- if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
- return (char*)s+3;
- return (char*)s;
- }
- fz_xml *
- fz_parse_xml_stream(fz_context *ctx, fz_stream *stm, int preserve_white)
- {
- fz_buffer *buf = fz_read_all(ctx, stm, 128);
- fz_xml *xml = NULL;
- fz_var(xml);
- fz_try(ctx)
- xml = fz_parse_xml(ctx, buf, preserve_white);
- fz_always(ctx)
- fz_drop_buffer(ctx, buf);
- fz_catch(ctx)
- fz_rethrow(ctx);
- return xml;
- }
- static fz_xml *
- parse_and_drop_buffer(fz_context *ctx, fz_buffer *buf, int preserve_white)
- {
- fz_xml *xml = NULL;
- fz_var(xml);
- fz_try(ctx)
- xml = fz_parse_xml(ctx, buf, preserve_white);
- fz_always(ctx)
- fz_drop_buffer(ctx, buf);
- fz_catch(ctx)
- fz_rethrow(ctx);
- return xml;
- }
- fz_xml *
- fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
- {
- fz_buffer *buf = fz_read_archive_entry(ctx, arch, filename);
- return parse_and_drop_buffer(ctx, buf, preserve_white);
- }
- fz_xml *
- fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
- {
- fz_buffer *buf = fz_try_read_archive_entry(ctx, arch, filename);
- if (buf == NULL)
- return NULL;
- return parse_and_drop_buffer(ctx, buf, preserve_white);
- }
- fz_xml *
- fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white)
- {
- struct parser parser;
- fz_xml *xml = NULL;
- fz_xml *root, *node;
- char *p = NULL;
- char *error;
- int dofree = 0;
- unsigned char *s;
- size_t n;
- static unsigned char empty_string[] = "";
- fz_var(dofree);
- fz_var(p);
- if (buf == NULL)
- {
- n = 0;
- s = empty_string;
- }
- else
- {
- /* ensure we are zero-terminated */
- fz_terminate_buffer(ctx, buf);
- n = fz_buffer_storage(ctx, buf, &s);
- }
- parser.pool = fz_new_pool(ctx);
- parser.head = root = fz_pool_alloc_flexible(ctx, parser.pool, fz_xml, u.node.u.d.name, 1);
- parser.preserve_white = preserve_white;
- parser.depth = 0;
- #ifdef FZ_XML_SEQ
- parser.seq = 0;
- #endif
- fz_try(ctx)
- {
- p = convert_to_utf8(ctx, s, n, &dofree);
- error = xml_parse_document_imp(ctx, &parser, p);
- if (error)
- fz_throw(ctx, FZ_ERROR_SYNTAX, "%s", error);
- for (node = parser.head; node; node = node->up)
- node->u.node.next = NULL;
- xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
- xml->up = NULL;
- xml->down = root->down;
- xml->u.doc.refs = 1;
- xml->u.doc.pool = parser.pool;
- for (node = root->down; node; node = node->u.node.next)
- node->up = xml;
- }
- fz_always(ctx)
- {
- if (dofree)
- fz_free(ctx, p);
- }
- fz_catch(ctx)
- {
- fz_drop_pool(ctx, parser.pool);
- fz_rethrow(ctx);
- }
- return xml;
- }
- #if FZ_ENABLE_HTML_ENGINE
- /*
- Parse the contents of buffer into a tree of XML nodes, using the HTML5 syntax.
- Gumbo doesn't check for malloc errors. Use our pool allocator and let it longjmp
- out of Gumbo on allocation errors. At the end (success or fail) we release the
- pool used for Gumbo's parse tree all at once.
- */
- struct mem_gumbo {
- fz_context *ctx;
- fz_pool *pool;
- };
- static void *alloc_gumbo(void *ctx, size_t size)
- {
- struct mem_gumbo *mem = ctx;
- return fz_pool_alloc(mem->ctx, mem->pool, size);
- }
- static void dealloc_gumbo(void *ctx, void *ptr)
- {
- /* nothing */
- }
- static void xml_from_gumbo(fz_context *ctx, struct parser *parser, GumboNode *node)
- {
- unsigned int i;
- const char *tag, *end, *sentinel;
- switch (node->type)
- {
- case GUMBO_NODE_ELEMENT:
- if (node->v.element.tag != GUMBO_TAG_UNKNOWN)
- {
- tag = gumbo_normalized_tagname(node->v.element.tag);
- end = tag + strlen(tag);
- }
- else
- {
- tag = node->v.element.original_tag.data;
- sentinel = tag + node->v.element.original_tag.length;
- if (tag[0] == '<')
- ++tag;
- for (end = tag; end < sentinel; ++end)
- if (end[0] == '>' || end[0] == '/' || iswhite(end[0]))
- break;
- }
- xml_emit_open_tag(ctx, parser, tag, end, 0);
- for (i = 0; i < node->v.element.attributes.length; ++i)
- {
- GumboAttribute *att = node->v.element.attributes.data[i];
- xml_emit_att_name(ctx, parser, att->name, att->name+strlen(att->name));
- xml_emit_att_value(ctx, parser, att->value, att->value+strlen(att->value));
- }
- for (i = 0; i < node->v.element.children.length; ++i)
- {
- GumboNode *child = node->v.element.children.data[i];
- xml_from_gumbo(ctx, parser, child);
- }
- xml_emit_close_tag(ctx, parser);
- break;
- case GUMBO_NODE_TEXT:
- case GUMBO_NODE_CDATA:
- case GUMBO_NODE_WHITESPACE:
- xml_emit_text(ctx, parser, node->v.text.text, node->v.text.text+strlen(node->v.text.text));
- break;
- case GUMBO_NODE_DOCUMENT:
- case GUMBO_NODE_COMMENT:
- case GUMBO_NODE_TEMPLATE:
- break;
- }
- }
- #endif
- fz_xml *
- fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf)
- {
- #if FZ_ENABLE_HTML_ENGINE
- struct parser parser;
- fz_xml *xml = NULL;
- fz_xml root, *node;
- char *p = NULL;
- int dofree = 0;
- unsigned char *s;
- size_t n;
- GumboOutput *soup = NULL;
- GumboOptions opts;
- struct mem_gumbo mem;
- static unsigned char empty_string[] = "";
- fz_var(mem.pool);
- fz_var(soup);
- fz_var(dofree);
- fz_var(p);
- if (buf == NULL)
- {
- n = 0;
- s = empty_string;
- }
- else
- {
- /* ensure we are zero-terminated */
- fz_terminate_buffer(ctx, buf);
- n = fz_buffer_storage(ctx, buf, &s);
- }
- mem.ctx = ctx;
- mem.pool = NULL;
- memset(&root, 0, sizeof(root));
- parser.pool = fz_new_pool(ctx);
- parser.head = &root;
- parser.preserve_white = 1;
- parser.depth = 0;
- #ifdef FZ_XML_SEQ
- parser.seq = 0;
- #endif
- fz_try(ctx)
- {
- p = convert_to_utf8(ctx, s, n, &dofree);
- mem.pool = fz_new_pool(ctx);
- memset(&opts, 0, sizeof opts);
- opts.allocator = alloc_gumbo;
- opts.deallocator = dealloc_gumbo;
- opts.userdata = &mem;
- opts.tab_stop = 8;
- opts.stop_on_first_error = 0;
- opts.max_errors = -1;
- opts.fragment_context = GUMBO_TAG_LAST;
- opts.fragment_namespace = GUMBO_NAMESPACE_HTML;
- soup = gumbo_parse_with_options(&opts, (const char *)p, strlen(p));
- xml_from_gumbo(ctx, &parser, soup->root);
- for (node = parser.head; node; node = node->up)
- node->u.node.next = NULL;
- xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
- xml->up = NULL;
- xml->down = root.down;
- xml->u.doc.pool = parser.pool;
- xml->u.doc.refs = 1;
- for (node = root.down; node; node = node->u.node.next)
- node->up = xml;
- }
- fz_always(ctx)
- {
- if (soup)
- gumbo_destroy_output(&opts, soup);
- fz_drop_pool(ctx, mem.pool);
- if (dofree)
- fz_free(ctx, p);
- }
- fz_catch(ctx)
- {
- fz_drop_pool(ctx, parser.pool);
- fz_rethrow(ctx);
- }
- return xml;
- #else
- fz_throw(ctx, FZ_ERROR_GENERIC, "HTML Engine not enabled in this build");
- #endif
- }
- fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
- {
- return fz_xml_find_dfs_top(item, tag, att, match, NULL);
- }
- fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
- {
- /* Skip over any DOC object. */
- if (item && FZ_DOCUMENT_ITEM(item))
- item = item->down;
- while (item)
- {
- if (!FZ_TEXT_ITEM(item) && (tag == NULL || !strcmp(item->u.node.u.d.name, tag)))
- {
- if (att == NULL || (match == NULL ? fz_xml_att(item, att) != NULL : fz_xml_att_eq(item, att, match)))
- return item;
- }
- if (!FZ_TEXT_ITEM(item) && item->down)
- item = item->down;
- else if (item->u.node.next)
- item = item->u.node.next;
- else
- while (1) {
- item = item->up;
- /* Stop searching if we hit our declared 'top' item. */
- if (item == top)
- return NULL;
- /* We should never reach item == NULL, but just in case. */
- if (item == NULL)
- return NULL;
- /* If we reach the DOC object at the top, we're done. */
- if (item->up == NULL)
- return NULL;
- if (item->u.node.next)
- {
- item = item->u.node.next;
- break;
- }
- }
- }
- return NULL;
- }
- fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
- {
- return fz_xml_find_next_dfs_top(item, tag, att, match, NULL);
- }
- fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
- {
- /* Skip over any DOC object. */
- if (item && FZ_DOCUMENT_ITEM(item))
- item = item->down;
- if (item == NULL)
- return NULL;
- if (item->down)
- item = item->down;
- else if (item->u.node.next)
- item = item->u.node.next;
- else
- while (1) {
- item = item->up;
- /* Stop searching if we hit our declared 'top' item. */
- if (item == top)
- return NULL;
- /* We should never reach item == NULL, but just in case. */
- if (item == NULL)
- return NULL;
- /* If we reach the DOC object at the top, we're done. */
- if (item->up == NULL)
- return NULL;
- if (item->u.node.next)
- {
- item = item->u.node.next;
- break;
- }
- }
- return fz_xml_find_dfs_top(item, tag, att, match, top);
- }
- fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml)
- {
- fz_xml *dom = xml;
- if (xml == NULL)
- return xml;
- while (dom->up)
- dom = dom->up;
- fz_keep_imp(ctx, dom, &dom->u.doc.refs);
- /* Return the original node pointer, not the dom pointer! */
- return xml;
- }
|