xml.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "xml-imp.h"
  23. #include <string.h>
  24. #include <stdlib.h>
  25. #include <stdio.h>
  26. #if FZ_ENABLE_HTML_ENGINE
  27. #include <gumbo.h>
  28. #endif
  29. #define FZ_XML_MAX_DEPTH 4096
  30. /* #define FZ_XML_SEQ */
  31. static const struct { const char *name; int c; } html_entities[] = {
  32. {"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163},
  33. {"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167},
  34. {"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171},
  35. {"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176},
  36. {"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180},
  37. {"micro",181}, {"para",182}, {"middot",183}, {"cedil",184},
  38. {"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188},
  39. {"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192},
  40. {"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196},
  41. {"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200},
  42. {"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204},
  43. {"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208},
  44. {"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212},
  45. {"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216},
  46. {"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220},
  47. {"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224},
  48. {"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228},
  49. {"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232},
  50. {"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236},
  51. {"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240},
  52. {"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
  53. {"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248},
  54. {"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252},
  55. {"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62},
  56. {"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339},
  57. {"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710},
  58. {"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201},
  59. {"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207},
  60. {"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217},
  61. {"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222},
  62. {"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249},
  63. {"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913},
  64. {"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917},
  65. {"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922},
  66. {"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927},
  67. {"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933},
  68. {"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945},
  69. {"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949},
  70. {"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954},
  71. {"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959},
  72. {"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964},
  73. {"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969},
  74. {"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226},
  75. {"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254},
  76. {"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476},
  77. {"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593},
  78. {"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629},
  79. {"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659},
  80. {"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707},
  81. {"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713},
  82. {"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722},
  83. {"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734},
  84. {"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746},
  85. {"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773},
  86. {"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805},
  87. {"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838},
  88. {"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869},
  89. {"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970},
  90. {"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674},
  91. {"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830},
  92. };
  93. struct parser
  94. {
  95. fz_pool *pool;
  96. fz_xml *head;
  97. int preserve_white;
  98. int depth;
  99. #ifdef FZ_XML_SEQ
  100. int seq;
  101. #endif
  102. };
  103. static void xml_indent(fz_context *ctx, fz_output *out, int n)
  104. {
  105. while (n--) {
  106. fz_write_byte(ctx, out, ' ');
  107. fz_write_byte(ctx, out, ' ');
  108. }
  109. }
  110. void fz_debug_xml(fz_xml *item, int level)
  111. {
  112. /* This is a bit nasty as it relies on implementation
  113. * details of both fz_stdout, and fz_write_printf coping
  114. * with NULL ctx. */
  115. fz_output_xml(NULL, fz_stdout(NULL), item, level);
  116. }
  117. void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level)
  118. {
  119. char *s;
  120. if (item == NULL)
  121. return;
  122. /* Skip over the DOC object at the top. */
  123. if (item->up == NULL)
  124. {
  125. fz_xml *child;
  126. for (child = fz_xml_down(item); child; child = child->u.node.next)
  127. fz_output_xml(ctx, out, child, level + 1);
  128. return;
  129. }
  130. s = fz_xml_text(item);
  131. xml_indent(ctx, out, level);
  132. if (s)
  133. {
  134. int c;
  135. fz_write_byte(ctx, out, '"');
  136. while (*s) {
  137. s += fz_chartorune(&c, s);
  138. switch (c) {
  139. default:
  140. if (c > 0xFFFF)
  141. fz_write_printf(ctx, out, "\\u{%X}", c);
  142. else if (c < 32 || c > 127)
  143. fz_write_printf(ctx, out, "\\u%04X", c);
  144. else
  145. fz_write_byte(ctx, out, c);
  146. break;
  147. case '\\': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, '\\'); break;
  148. case '\b': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'b'); break;
  149. case '\f': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'f'); break;
  150. case '\n': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'n'); break;
  151. case '\r': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'r'); break;
  152. case '\t': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 't'); break;
  153. }
  154. }
  155. fz_write_byte(ctx, out, '"');
  156. #ifdef FZ_XML_SEQ
  157. fz_write_printf(ctx, out, " <%d>", item->seq);
  158. #endif
  159. fz_write_byte(ctx, out, '\n');
  160. }
  161. else
  162. {
  163. fz_xml *child;
  164. struct attribute *att;
  165. #ifdef FZ_XML_SEQ
  166. fz_write_printf(ctx, out, "(%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
  167. #else
  168. fz_write_printf(ctx, out, "(%s\n", item->u.node.u.d.name);
  169. #endif
  170. for (att = item->u.node.u.d.atts; att; att = att->next)
  171. {
  172. xml_indent(ctx, out, level);
  173. fz_write_printf(ctx, out, "=%s %s\n", att->name, att->value);
  174. }
  175. for (child = fz_xml_down(item); child; child = child->u.node.next)
  176. fz_output_xml(ctx, out, child, level + 1);
  177. xml_indent(ctx, out, level);
  178. #ifdef FZ_XML_SEQ
  179. fz_write_printf(ctx, out, ")%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
  180. #else
  181. fz_write_printf(ctx, out, ")%s\n", item->u.node.u.d.name);
  182. #endif
  183. }
  184. }
  185. fz_xml *fz_xml_prev(fz_xml *item)
  186. {
  187. return item && item->up ? item->u.node.prev : NULL;
  188. }
  189. fz_xml *fz_xml_next(fz_xml *item)
  190. {
  191. return item && item->up ? item->u.node.next : NULL;
  192. }
  193. fz_xml *fz_xml_up(fz_xml *item)
  194. {
  195. /* Never step up to the DOC. */
  196. return item && item->up && item->up->up ? item->up : NULL;
  197. }
  198. fz_xml *fz_xml_down(fz_xml *item)
  199. {
  200. /* DOC items can never have MAGIC_TEXT as their down value,
  201. * so this is safe. */
  202. return item && !FZ_TEXT_ITEM(item) ? item->down : NULL;
  203. }
  204. char *fz_xml_text(fz_xml *item)
  205. {
  206. /* DOC items can never have MAGIC_TEXT as their down value,
  207. * so this is safe. */
  208. return (item && FZ_TEXT_ITEM(item)) ? item->u.node.u.text : NULL;
  209. }
  210. char *fz_xml_tag(fz_xml *item)
  211. {
  212. /* DOC items can never have MAGIC_TEXT as their down value,
  213. * so this is safe. */
  214. return item && !FZ_TEXT_ITEM(item) ? item->u.node.u.d.name : NULL;
  215. }
  216. int fz_xml_is_tag(fz_xml *item, const char *name)
  217. {
  218. if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
  219. return 0;
  220. return !strcmp(item->u.node.u.d.name, name);
  221. }
  222. char *fz_xml_att(fz_xml *item, const char *name)
  223. {
  224. struct attribute *att;
  225. if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
  226. return NULL;
  227. for (att = item->u.node.u.d.atts; att; att = att->next)
  228. if (!strcmp(att->name, name))
  229. return att->value;
  230. return NULL;
  231. }
  232. char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two)
  233. {
  234. char *val = fz_xml_att(item, one);
  235. if (!val)
  236. val = fz_xml_att(item, two);
  237. return val;
  238. }
  239. fz_xml *fz_xml_find(fz_xml *item, const char *tag)
  240. {
  241. /* Skip over any DOC item. */
  242. if (item && FZ_DOCUMENT_ITEM(item))
  243. item = item->down;
  244. while (item)
  245. {
  246. if (!FZ_TEXT_ITEM(item) && !strcmp(item->u.node.u.d.name, tag))
  247. return item;
  248. item = item->u.node.next;
  249. }
  250. return NULL;
  251. }
  252. fz_xml *fz_xml_find_next(fz_xml *item, const char *tag)
  253. {
  254. /* Skip over any DOC item. */
  255. if (item && FZ_DOCUMENT_ITEM(item))
  256. item = item->down;
  257. if (item)
  258. item = item->u.node.next;
  259. return fz_xml_find(item, tag);
  260. }
  261. fz_xml *fz_xml_find_down(fz_xml *item, const char *tag)
  262. {
  263. if (item)
  264. item = fz_xml_down(item);
  265. return fz_xml_find(item, tag);
  266. }
  267. int fz_xml_att_eq(fz_xml *item, const char *name, const char *match)
  268. {
  269. const char *val = fz_xml_att(item, name);
  270. return val ? !strcmp(val, match) : 0;
  271. }
  272. fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match)
  273. {
  274. /* Skip over any document item. */
  275. if (item && FZ_DOCUMENT_ITEM(item))
  276. item = item->down;
  277. while (1)
  278. {
  279. item = tag ? fz_xml_find(item, tag) : item;
  280. if (item == NULL || fz_xml_att_eq(item, att, match))
  281. break;
  282. item = item->u.node.next;
  283. }
  284. return item;
  285. }
  286. fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match)
  287. {
  288. /* Skip over any document item. */
  289. if (item && FZ_DOCUMENT_ITEM(item))
  290. item = item->down;
  291. if (item != NULL)
  292. {
  293. do
  294. {
  295. item = tag ? fz_xml_find_next(item, tag) : item->u.node.next;
  296. }
  297. while (item != NULL && !fz_xml_att_eq(item, att, match));
  298. }
  299. return item;
  300. }
  301. fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match)
  302. {
  303. return fz_xml_find_match(fz_xml_down(item), tag, att, match);
  304. }
  305. fz_xml *fz_xml_root(fz_xml *xml)
  306. {
  307. if (xml == NULL)
  308. return NULL;
  309. /* If we've been given a node mid-tree, run up to the root to find
  310. * the doc node. */
  311. while (xml->up)
  312. xml = xml->up;
  313. /* And the root is the child of the doc.*/
  314. return xml->down;
  315. }
  316. void fz_drop_xml(fz_context *ctx, fz_xml *xml)
  317. {
  318. if (!xml)
  319. return;
  320. /* Wherever we are in the tree, we want the doc node at the root. */
  321. while (xml->up)
  322. xml = xml->up;
  323. /* Drop a reference to the tree as a whole. */
  324. if (fz_drop_imp(ctx, xml, &xml->u.doc.refs) == 0)
  325. return;
  326. fz_drop_pool(ctx, xml->u.doc.pool);
  327. }
  328. void fz_detach_xml(fz_context *ctx, fz_xml *node)
  329. {
  330. fz_xml *doc = node;
  331. /* If we're already a document node, then this is a NOP. */
  332. if (doc->up == NULL)
  333. return;
  334. /* Move doc to be the doc pointer at the top of the tree. */
  335. while (doc->up)
  336. {
  337. doc = doc->up;
  338. }
  339. /* Relocate node to be the child of doc. */
  340. node->up->down = NULL;
  341. doc->down = node;
  342. /* NOTE: Suppose that X = doc->down on entry. On exit doc->down == node, but
  343. * X->up = doc. We need to be careful throughout this code to not assume that
  344. * Y is always a child of Y->up. */
  345. }
  346. size_t xml_parse_entity(int *c, const char *a)
  347. {
  348. char *b;
  349. size_t i;
  350. if (a[1] == '#') {
  351. if (a[2] == 'x')
  352. *c = strtol(a + 3, &b, 16);
  353. else
  354. *c = strtol(a + 2, &b, 10);
  355. if (*b == ';')
  356. return b - a + 1;
  357. }
  358. else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
  359. *c = '<';
  360. return 4;
  361. }
  362. else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
  363. *c = '>';
  364. return 4;
  365. }
  366. else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
  367. *c = '&';
  368. return 5;
  369. }
  370. else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
  371. *c = '\'';
  372. return 6;
  373. }
  374. else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
  375. *c = '"';
  376. return 6;
  377. }
  378. /* We should only be doing this for XHTML, but it shouldn't be a problem. */
  379. for (i = 0; i < nelem(html_entities); ++i) {
  380. size_t n = strlen(html_entities[i].name);
  381. if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') {
  382. *c = html_entities[i].c;
  383. return n + 2;
  384. }
  385. }
  386. *c = *a;
  387. return 1;
  388. }
  389. static inline int isname(int c)
  390. {
  391. return c == '.' || c == '-' || c == '_' || c == ':' ||
  392. (c >= '0' && c <= '9') ||
  393. (c >= 'A' && c <= 'Z') ||
  394. (c >= 'a' && c <= 'z');
  395. }
  396. static inline int iswhite(int c)
  397. {
  398. return c == ' ' || c == '\r' || c == '\n' || c == '\t';
  399. }
  400. static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, const char *a, const char *b, int is_text)
  401. {
  402. fz_xml *head, *tail;
  403. const char *ns;
  404. size_t size;
  405. if (is_text)
  406. size = offsetof(fz_xml, u.node.u.text) + b-a+1;
  407. else
  408. {
  409. /* skip namespace prefix */
  410. for (ns = a; ns < b - 1; ++ns)
  411. if (*ns == ':')
  412. a = ns + 1;
  413. size = offsetof(fz_xml, u.node.u.d.name) + b-a+1;
  414. }
  415. head = fz_pool_alloc(ctx, parser->pool, size);
  416. if (is_text)
  417. head->down = MAGIC_TEXT;
  418. else
  419. {
  420. memcpy(head->u.node.u.d.name, a, b - a);
  421. head->u.node.u.d.name[b - a] = 0;
  422. head->u.node.u.d.atts = NULL;
  423. head->down = NULL;
  424. }
  425. head->up = parser->head;
  426. head->u.node.next = NULL;
  427. #ifdef FZ_XML_SEQ
  428. head->u.node.seq = parser->seq++;
  429. #endif
  430. /* During construction, we use head->next to mean "the
  431. * tail of the children. When we close the tag, we
  432. * rewrite it to be NULL. */
  433. if (!parser->head->down) {
  434. parser->head->down = head;
  435. parser->head->u.node.next = head;
  436. head->u.node.prev = NULL;
  437. }
  438. else {
  439. tail = parser->head->u.node.next;
  440. tail->u.node.next = head;
  441. head->u.node.prev = tail;
  442. parser->head->u.node.next = head;
  443. }
  444. parser->head = head;
  445. parser->depth++;
  446. if (parser->depth >= FZ_XML_MAX_DEPTH)
  447. fz_throw(ctx, FZ_ERROR_SYNTAX, "too deep xml element nesting");
  448. }
  449. static void xml_emit_att_name(fz_context *ctx, struct parser *parser, const char *a, const char *b)
  450. {
  451. fz_xml *head = parser->head;
  452. struct attribute *att;
  453. size_t size;
  454. size = offsetof(struct attribute, name) + b-a+1;
  455. att = fz_pool_alloc(ctx, parser->pool, size);
  456. memcpy(att->name, a, b - a);
  457. att->name[b - a] = 0;
  458. att->value = NULL;
  459. att->next = head->u.node.u.d.atts;
  460. head->u.node.u.d.atts = att;
  461. }
  462. void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val)
  463. {
  464. size_t size = offsetof(struct attribute, name) + strlen(key) + 1;
  465. struct attribute *att = fz_pool_alloc(ctx, pool, size);
  466. memcpy(att->name, key, strlen(key)+1);
  467. att->value = fz_pool_alloc(ctx, pool, strlen(val) + 1);
  468. memcpy(att->value, val, strlen(val)+1);
  469. att->next = node->u.node.u.d.atts;
  470. node->u.node.u.d.atts = att;
  471. }
  472. static void xml_emit_att_value(fz_context *ctx, struct parser *parser, const char *a, const char *b)
  473. {
  474. fz_xml *head = parser->head;
  475. struct attribute *att = head->u.node.u.d.atts;
  476. char *s;
  477. int c;
  478. /* entities are all longer than UTFmax so runetochar is safe */
  479. s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1);
  480. while (a < b) {
  481. if (*a == '&') {
  482. a += xml_parse_entity(&c, a);
  483. s += fz_runetochar(s, c);
  484. }
  485. else {
  486. *s++ = *a++;
  487. }
  488. }
  489. *s = 0;
  490. }
  491. static void xml_emit_close_tag(fz_context *ctx, struct parser *parser)
  492. {
  493. parser->depth--;
  494. parser->head->u.node.next = NULL;
  495. if (parser->head->up)
  496. parser->head = parser->head->up;
  497. }
  498. static void xml_emit_text(fz_context *ctx, struct parser *parser, const char *a, const char *b)
  499. {
  500. fz_xml *head;
  501. const char *p;
  502. char *s;
  503. int c;
  504. /* Skip text outside the root tag */
  505. if (parser->depth == 0)
  506. return;
  507. /* Skip all-whitespace text nodes */
  508. if (!parser->preserve_white)
  509. {
  510. for (p = a; p < b; p++)
  511. if (!iswhite(*p))
  512. break;
  513. if (p == b)
  514. return;
  515. }
  516. xml_emit_open_tag(ctx, parser, a, b, 1);
  517. head = parser->head;
  518. /* entities are all longer than UTFmax so runetochar is safe */
  519. s = fz_xml_text(head);
  520. while (a < b) {
  521. if (*a == '&') {
  522. a += xml_parse_entity(&c, a);
  523. s += fz_runetochar(s, c);
  524. }
  525. else {
  526. *s++ = *a++;
  527. }
  528. }
  529. *s = 0;
  530. xml_emit_close_tag(ctx, parser);
  531. }
  532. static void xml_emit_cdata(fz_context *ctx, struct parser *parser, const char *a, const char *b)
  533. {
  534. fz_xml *head;
  535. char *s;
  536. xml_emit_open_tag(ctx, parser, a, b, 1);
  537. head = parser->head;
  538. s = head->u.node.u.text;
  539. while (a < b)
  540. *s++ = *a++;
  541. *s = 0;
  542. xml_emit_close_tag(ctx, parser);
  543. }
  544. static int close_tag(fz_context *ctx, struct parser *parser, const char *mark, const char *p)
  545. {
  546. const char *ns, *tag;
  547. /* skip namespace prefix */
  548. for (ns = mark; ns < p - 1; ++ns)
  549. if (*ns == ':')
  550. mark = ns + 1;
  551. tag = fz_xml_tag(parser->head);
  552. if (tag && strncmp(tag, mark, p-mark) == 0 && tag[p-mark] == 0)
  553. {
  554. xml_emit_close_tag(ctx, parser);
  555. return 0;
  556. }
  557. return 1;
  558. }
  559. static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, const char *p) /* lgtm [cpp/use-of-goto] */
  560. {
  561. const char *mark;
  562. int quote;
  563. parse_text:
  564. mark = p;
  565. while (*p && *p != '<') ++p;
  566. if (*p == '<') {
  567. if (mark < p)
  568. xml_emit_text(ctx, parser, mark, p);
  569. ++p;
  570. goto parse_element;
  571. } else if (mark < p)
  572. xml_emit_text(ctx, parser, mark, p);
  573. return NULL;
  574. parse_element:
  575. if (*p == '/') { ++p; goto parse_closing_element; }
  576. if (*p == '!') { ++p; goto parse_comment; }
  577. if (*p == '?') { ++p; goto parse_processing_instruction; }
  578. while (iswhite(*p)) ++p;
  579. if (isname(*p))
  580. goto parse_element_name;
  581. return "syntax error in element";
  582. parse_comment:
  583. if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E')
  584. goto parse_declaration;
  585. if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y')
  586. goto parse_declaration;
  587. if (*p == '[') goto parse_cdata;
  588. if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
  589. if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
  590. while (*p) {
  591. if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
  592. p += 3;
  593. goto parse_text;
  594. }
  595. ++p;
  596. }
  597. return "end of data in comment";
  598. parse_declaration:
  599. while (*p) if (*p++ == '>') goto parse_text;
  600. return "end of data in declaration";
  601. parse_cdata:
  602. if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
  603. return "syntax error in CDATA section";
  604. p += 7;
  605. mark = p;
  606. while (*p) {
  607. if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
  608. xml_emit_cdata(ctx, parser, mark, p);
  609. p += 3;
  610. goto parse_text;
  611. }
  612. ++p;
  613. }
  614. return "end of data in CDATA section";
  615. parse_processing_instruction:
  616. while (*p) {
  617. if (p[0] == '?' && p[1] == '>') {
  618. p += 2;
  619. goto parse_text;
  620. }
  621. ++p;
  622. }
  623. return "end of data in processing instruction";
  624. parse_closing_element:
  625. while (iswhite(*p)) ++p;
  626. mark = p;
  627. while (isname(*p)) ++p;
  628. if (!isname(*mark))
  629. return "syntax error in closing element";
  630. if (close_tag(ctx, parser, mark, p))
  631. return "opening and closing tag mismatch";
  632. while (iswhite(*p)) ++p;
  633. if (*p != '>')
  634. return "syntax error in closing element";
  635. ++p;
  636. goto parse_text;
  637. parse_element_name:
  638. mark = p;
  639. while (isname(*p)) ++p;
  640. xml_emit_open_tag(ctx, parser, mark, p, 0);
  641. if (*p == '>') {
  642. ++p;
  643. goto parse_text;
  644. }
  645. if (p[0] == '/' && p[1] == '>') {
  646. xml_emit_close_tag(ctx, parser);
  647. p += 2;
  648. goto parse_text;
  649. }
  650. if (iswhite(*p))
  651. goto parse_attributes;
  652. return "syntax error after element name";
  653. parse_attributes:
  654. while (iswhite(*p)) ++p;
  655. if (isname(*p))
  656. goto parse_attribute_name;
  657. if (*p == '>') {
  658. ++p;
  659. goto parse_text;
  660. }
  661. if (p[0] == '/' && p[1] == '>') {
  662. xml_emit_close_tag(ctx, parser);
  663. p += 2;
  664. goto parse_text;
  665. }
  666. return "syntax error in attributes";
  667. parse_attribute_name:
  668. mark = p;
  669. while (isname(*p)) ++p;
  670. xml_emit_att_name(ctx, parser, mark, p);
  671. while (iswhite(*p)) ++p;
  672. if (*p == '=') { ++p; goto parse_attribute_value; }
  673. return "syntax error after attribute name";
  674. parse_attribute_value:
  675. while (iswhite(*p)) ++p;
  676. quote = *p++;
  677. mark = p;
  678. /* special case for handling MOBI filepos=00000 syntax */
  679. if (quote >= '0' && quote <= '9') {
  680. while (*p >= '0' && *p <= '9') ++p;
  681. xml_emit_att_value(ctx, parser, mark, p);
  682. goto parse_attributes;
  683. }
  684. if (quote != '"' && quote != '\'')
  685. return "missing quote character";
  686. while (*p && *p != quote) ++p;
  687. if (*p == quote) {
  688. xml_emit_att_value(ctx, parser, mark, p++);
  689. goto parse_attributes;
  690. }
  691. return "end of data in attribute value";
  692. }
  693. static int fast_tolower(int c)
  694. {
  695. if ((unsigned)c - 'A' < 26)
  696. return c | 32;
  697. return c;
  698. }
  699. static int fast_strncasecmp(const char *a, const char *b, size_t n)
  700. {
  701. if (!n--)
  702. return 0;
  703. for (; *a && *b && n && fast_tolower(*a) == fast_tolower(*b); a++, b++, n--)
  704. ;
  705. return fast_tolower(*a) - fast_tolower(*b);
  706. }
  707. static char *fast_strcasestr(char *h, char *n)
  708. {
  709. int n0 = fast_tolower(*n++);
  710. size_t nn = strlen(n);
  711. while (*h != 0)
  712. {
  713. if (fast_tolower(*h) == n0 && fast_strncasecmp(h+1, n, nn) == 0)
  714. return h;
  715. ++h;
  716. }
  717. return NULL;
  718. }
  719. static int startswith(const char *a, const char *b)
  720. {
  721. return !fast_strncasecmp(a, b, strlen(b));
  722. }
  /*
      Map of encoding labels to the canonical encoding name handed to the
      text decoder. Labels are taken from
      https://encoding.spec.whatwg.org/#names-and-labels

      match_encoding_name() scans this table from top to bottom and accepts the
      first row whose alias is a prefix of the candidate string, so no alias may
      be listed under two different encodings: the later row would be unreachable.
  */
  static struct { char *encoding; char *alias; } encoding_aliases[] = {
      { "big5", "big5" },
      { "big5", "big5-hkscs" },
      { "big5", "cn-big5" },
      { "big5", "csbig5" },
      { "big5", "x-x-big5" },
      { "euc-cn", "euc-cn" },
      { "euc-jp", "cseucpkdfmtjapanese" },
      { "euc-jp", "euc-jp" },
      { "euc-jp", "x-euc-jp" },
      { "euc-kr", "cseuckr" },
      { "euc-kr", "csksc56011987" },
      { "euc-kr", "euc-kr" },
      { "euc-kr", "iso-ir-149" },
      { "euc-kr", "korean" },
      { "euc-kr", "ks_c_5601" },
      { "euc-kr", "ksc5601" },
      { "euc-kr", "ksc_5601" },
      { "euc-kr", "windows-949" },
      { "euc-tw", "euc-tw" },
      { "gb18030", "chinese" },
      { "gb18030", "csgb2312" },
      { "gb18030", "csiso58gb231280" },
      { "gb18030", "gb18030" },
      { "gb18030", "gb2312" },
      { "gb18030", "gb_2312" },
      { "gb18030", "gbk" },
      { "gb18030", "iso-ir-58" },
      { "gb18030", "x-gbk" },
      { "iso-8859-1", "ascii" },
      { "iso-8859-1", "iso-8859-1" },
      { "iso-8859-1", "iso8859-1" },
      { "iso-8859-1", "latin1" },
      { "iso-8859-1", "us-ascii" },
      { "iso-8859-7", "greek" },
      { "iso-8859-7", "greek8" },
      /* These two rows previously repeated the iso-8859-1 labels, which made
         them dead entries and left iso-8859-7 itself unrecognised. */
      { "iso-8859-7", "iso-8859-7" },
      { "iso-8859-7", "iso8859-7" },
      { "koi8-r", "koi" },
      { "koi8-r", "koi8" },
      { "koi8-r", "koi8-r" },
      { "koi8-r", "koi8-ru" },
      { "koi8-r", "koi8-u" },
      { "koi8-r", "koi8_r" },
      { "shift_jis", "csshiftjis" },
      { "shift_jis", "ms932" },
      { "shift_jis", "ms_kanji" },
      { "shift_jis", "shift-jis" },
      { "shift_jis", "shift_jis" },
      { "shift_jis", "sjis" },
      { "shift_jis", "windows-31j" },
      { "shift_jis", "x-sjis" },
      { "windows-1250", "cp1250" },
      { "windows-1250", "windows-1250" },
      { "windows-1251", "cp1251" },
      { "windows-1251", "windows-1251" },
      { "windows-1252", "cp1252" },
      { "windows-1252", "cp819" },
      { "windows-1252", "windows-1252" },
  };
  784. static char *match_encoding_name(char *enc)
  785. {
  786. size_t i;
  787. for (i = 0; i < nelem(encoding_aliases); ++i)
  788. if (startswith(enc, encoding_aliases[i].alias))
  789. return encoding_aliases[i].encoding;
  790. return NULL;
  791. }
  /*
      Look for an encoding in
      <meta http-equiv="content-type" content="text/html; charset=XXX"> tags.

      Each "<meta" tag is temporarily NUL-terminated at its closing '>' so the
      searches stay inside that tag; the '>' is restored before moving on.
      Returns the canonical encoding name of the first tag that yields a known
      charset, or NULL if there is none.
  */
  static const char *find_meta_encoding(char *s)
  {
      const char *found = NULL;
      char *tag, *close, *cs;

      for (tag = fast_strcasestr(s, "<meta"); tag && !found; tag = fast_strcasestr(tag + 5, "<meta"))
      {
          close = strchr(tag, '>');
          if (!close)
              continue;

          *close = 0;
          if (fast_strcasestr(tag, "http-equiv") && fast_strcasestr(tag, "content-type"))
          {
              cs = fast_strcasestr(tag, "charset=");
              if (cs)
                  found = match_encoding_name(cs + 8); /* 8 == strlen("charset=") */
          }
          *close = '>';
      }
      return found;
  }
  /*
      Determine the declared encoding of a document.

      First inspects the text up to the first '>' for an "<?xml ... encoding="
      declaration; the '>' is temporarily replaced by NUL so the searches
      cannot run past it, and is restored afterwards. If that yields nothing,
      falls back to find_meta_encoding().

      Returns a canonical encoding name, or NULL if none was recognised.
  */
  static const char *find_xml_encoding(char *s)
  {
      const char *table = NULL;
      char *end, *xml, *enc;

      end = strchr(s, '>');
      if (end)
      {
          *end = 0;
          xml = strstr(s, "<?xml");
          if (xml)
          {
              enc = strstr(xml, "encoding=");
              if (enc)
              {
                  enc += 9; /* strlen("encoding=") */
                  /* Only step over a quote that is really there. Skipping a
                     fixed extra character would move past the temporary
                     terminator when the value is missing, and would drop the
                     first letter of an unquoted value. */
                  if (*enc == '"' || *enc == '\'')
                      ++enc;
                  table = match_encoding_name(enc);
              }
          }
          *end = '>';
      }

      if (!table)
          table = find_meta_encoding(s);

      return table;
  }
  845. static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree)
  846. {
  847. fz_text_decoder dec;
  848. const char *enc;
  849. const unsigned char *e = s + n;
  850. char *dst, *d;
  851. int m;
  852. int c;
  853. if (s[0] == 0xFE && s[1] == 0xFF) {
  854. s += 2;
  855. dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_be");
  856. while (s + 1 < e) {
  857. c = s[0] << 8 | s[1];
  858. d += fz_runetochar(d, c);
  859. s += 2;
  860. }
  861. *d = 0;
  862. *dofree = 1;
  863. return dst;
  864. }
  865. if (s[0] == 0xFF && s[1] == 0xFE) {
  866. s += 2;
  867. dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_le");
  868. while (s + 1 < e) {
  869. c = s[0] | s[1] << 8;
  870. d += fz_runetochar(d, c);
  871. s += 2;
  872. }
  873. *d = 0;
  874. *dofree = 1;
  875. return dst;
  876. }
  877. enc = find_xml_encoding((char*)s);
  878. if (enc)
  879. {
  880. fz_init_text_decoder(ctx, &dec, enc);
  881. // NOTE: use decode_size if memory is more important than speed
  882. m = (int)dec.decode_bound(&dec, s, (int)n);
  883. dst = Memento_label(fz_malloc(ctx, m), "utf8");
  884. dec.decode(&dec, dst, s, (int)n);
  885. *dofree = 1;
  886. return dst;
  887. }
  888. *dofree = 0;
  889. if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
  890. return (char*)s+3;
  891. return (char*)s;
  892. }
  893. fz_xml *
  894. fz_parse_xml_stream(fz_context *ctx, fz_stream *stm, int preserve_white)
  895. {
  896. fz_buffer *buf = fz_read_all(ctx, stm, 128);
  897. fz_xml *xml = NULL;
  898. fz_var(xml);
  899. fz_try(ctx)
  900. xml = fz_parse_xml(ctx, buf, preserve_white);
  901. fz_always(ctx)
  902. fz_drop_buffer(ctx, buf);
  903. fz_catch(ctx)
  904. fz_rethrow(ctx);
  905. return xml;
  906. }
  907. static fz_xml *
  908. parse_and_drop_buffer(fz_context *ctx, fz_buffer *buf, int preserve_white)
  909. {
  910. fz_xml *xml = NULL;
  911. fz_var(xml);
  912. fz_try(ctx)
  913. xml = fz_parse_xml(ctx, buf, preserve_white);
  914. fz_always(ctx)
  915. fz_drop_buffer(ctx, buf);
  916. fz_catch(ctx)
  917. fz_rethrow(ctx);
  918. return xml;
  919. }
  920. fz_xml *
  921. fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
  922. {
  923. fz_buffer *buf = fz_read_archive_entry(ctx, arch, filename);
  924. return parse_and_drop_buffer(ctx, buf, preserve_white);
  925. }
  926. fz_xml *
  927. fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
  928. {
  929. fz_buffer *buf = fz_try_read_archive_entry(ctx, arch, filename);
  930. if (buf == NULL)
  931. return NULL;
  932. return parse_and_drop_buffer(ctx, buf, preserve_white);
  933. }
  934. fz_xml *
  935. fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white)
  936. {
  937. struct parser parser;
  938. fz_xml *xml = NULL;
  939. fz_xml *root, *node;
  940. char *p = NULL;
  941. char *error;
  942. int dofree = 0;
  943. unsigned char *s;
  944. size_t n;
  945. static unsigned char empty_string[] = "";
  946. fz_var(dofree);
  947. fz_var(p);
  948. if (buf == NULL)
  949. {
  950. n = 0;
  951. s = empty_string;
  952. }
  953. else
  954. {
  955. /* ensure we are zero-terminated */
  956. fz_terminate_buffer(ctx, buf);
  957. n = fz_buffer_storage(ctx, buf, &s);
  958. }
  959. parser.pool = fz_new_pool(ctx);
  960. parser.head = root = fz_pool_alloc_flexible(ctx, parser.pool, fz_xml, u.node.u.d.name, 1);
  961. parser.preserve_white = preserve_white;
  962. parser.depth = 0;
  963. #ifdef FZ_XML_SEQ
  964. parser.seq = 0;
  965. #endif
  966. fz_try(ctx)
  967. {
  968. p = convert_to_utf8(ctx, s, n, &dofree);
  969. error = xml_parse_document_imp(ctx, &parser, p);
  970. if (error)
  971. fz_throw(ctx, FZ_ERROR_SYNTAX, "%s", error);
  972. for (node = parser.head; node; node = node->up)
  973. node->u.node.next = NULL;
  974. xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
  975. xml->up = NULL;
  976. xml->down = root->down;
  977. xml->u.doc.refs = 1;
  978. xml->u.doc.pool = parser.pool;
  979. for (node = root->down; node; node = node->u.node.next)
  980. node->up = xml;
  981. }
  982. fz_always(ctx)
  983. {
  984. if (dofree)
  985. fz_free(ctx, p);
  986. }
  987. fz_catch(ctx)
  988. {
  989. fz_drop_pool(ctx, parser.pool);
  990. fz_rethrow(ctx);
  991. }
  992. return xml;
  993. }
  994. #if FZ_ENABLE_HTML_ENGINE
  995. /*
  996. Parse the contents of buffer into a tree of XML nodes, using the HTML5 syntax.
  997. Gumbo doesn't check for malloc errors. Use our pool allocator and let it longjmp
  998. out of Gumbo on allocation errors. At the end (success or fail) we release the
  999. pool used for Gumbo's parse tree all at once.
  1000. */
  1001. struct mem_gumbo {
  1002. fz_context *ctx;
  1003. fz_pool *pool;
  1004. };
  1005. static void *alloc_gumbo(void *ctx, size_t size)
  1006. {
  1007. struct mem_gumbo *mem = ctx;
  1008. return fz_pool_alloc(mem->ctx, mem->pool, size);
  1009. }
  /*
      Gumbo deallocator callback. Intentionally does nothing: individual
      blocks are never released, the whole pool is dropped at once instead.
  */
  static void dealloc_gumbo(void *ctx, void *ptr)
  {
      (void)ctx;
      (void)ptr;
  }
  1014. static void xml_from_gumbo(fz_context *ctx, struct parser *parser, GumboNode *node)
  1015. {
  1016. unsigned int i;
  1017. const char *tag, *end, *sentinel;
  1018. switch (node->type)
  1019. {
  1020. case GUMBO_NODE_ELEMENT:
  1021. if (node->v.element.tag != GUMBO_TAG_UNKNOWN)
  1022. {
  1023. tag = gumbo_normalized_tagname(node->v.element.tag);
  1024. end = tag + strlen(tag);
  1025. }
  1026. else
  1027. {
  1028. tag = node->v.element.original_tag.data;
  1029. sentinel = tag + node->v.element.original_tag.length;
  1030. if (tag[0] == '<')
  1031. ++tag;
  1032. for (end = tag; end < sentinel; ++end)
  1033. if (end[0] == '>' || end[0] == '/' || iswhite(end[0]))
  1034. break;
  1035. }
  1036. xml_emit_open_tag(ctx, parser, tag, end, 0);
  1037. for (i = 0; i < node->v.element.attributes.length; ++i)
  1038. {
  1039. GumboAttribute *att = node->v.element.attributes.data[i];
  1040. xml_emit_att_name(ctx, parser, att->name, att->name+strlen(att->name));
  1041. xml_emit_att_value(ctx, parser, att->value, att->value+strlen(att->value));
  1042. }
  1043. for (i = 0; i < node->v.element.children.length; ++i)
  1044. {
  1045. GumboNode *child = node->v.element.children.data[i];
  1046. xml_from_gumbo(ctx, parser, child);
  1047. }
  1048. xml_emit_close_tag(ctx, parser);
  1049. break;
  1050. case GUMBO_NODE_TEXT:
  1051. case GUMBO_NODE_CDATA:
  1052. case GUMBO_NODE_WHITESPACE:
  1053. xml_emit_text(ctx, parser, node->v.text.text, node->v.text.text+strlen(node->v.text.text));
  1054. break;
  1055. case GUMBO_NODE_DOCUMENT:
  1056. case GUMBO_NODE_COMMENT:
  1057. case GUMBO_NODE_TEMPLATE:
  1058. break;
  1059. }
  1060. }
  1061. #endif
  1062. fz_xml *
  1063. fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf)
  1064. {
  1065. #if FZ_ENABLE_HTML_ENGINE
  1066. struct parser parser;
  1067. fz_xml *xml = NULL;
  1068. fz_xml root, *node;
  1069. char *p = NULL;
  1070. int dofree = 0;
  1071. unsigned char *s;
  1072. size_t n;
  1073. GumboOutput *soup = NULL;
  1074. GumboOptions opts;
  1075. struct mem_gumbo mem;
  1076. static unsigned char empty_string[] = "";
  1077. fz_var(mem.pool);
  1078. fz_var(soup);
  1079. fz_var(dofree);
  1080. fz_var(p);
  1081. if (buf == NULL)
  1082. {
  1083. n = 0;
  1084. s = empty_string;
  1085. }
  1086. else
  1087. {
  1088. /* ensure we are zero-terminated */
  1089. fz_terminate_buffer(ctx, buf);
  1090. n = fz_buffer_storage(ctx, buf, &s);
  1091. }
  1092. mem.ctx = ctx;
  1093. mem.pool = NULL;
  1094. memset(&root, 0, sizeof(root));
  1095. parser.pool = fz_new_pool(ctx);
  1096. parser.head = &root;
  1097. parser.preserve_white = 1;
  1098. parser.depth = 0;
  1099. #ifdef FZ_XML_SEQ
  1100. parser.seq = 0;
  1101. #endif
  1102. fz_try(ctx)
  1103. {
  1104. p = convert_to_utf8(ctx, s, n, &dofree);
  1105. mem.pool = fz_new_pool(ctx);
  1106. memset(&opts, 0, sizeof opts);
  1107. opts.allocator = alloc_gumbo;
  1108. opts.deallocator = dealloc_gumbo;
  1109. opts.userdata = &mem;
  1110. opts.tab_stop = 8;
  1111. opts.stop_on_first_error = 0;
  1112. opts.max_errors = -1;
  1113. opts.fragment_context = GUMBO_TAG_LAST;
  1114. opts.fragment_namespace = GUMBO_NAMESPACE_HTML;
  1115. soup = gumbo_parse_with_options(&opts, (const char *)p, strlen(p));
  1116. xml_from_gumbo(ctx, &parser, soup->root);
  1117. for (node = parser.head; node; node = node->up)
  1118. node->u.node.next = NULL;
  1119. xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
  1120. xml->up = NULL;
  1121. xml->down = root.down;
  1122. xml->u.doc.pool = parser.pool;
  1123. xml->u.doc.refs = 1;
  1124. for (node = root.down; node; node = node->u.node.next)
  1125. node->up = xml;
  1126. }
  1127. fz_always(ctx)
  1128. {
  1129. if (soup)
  1130. gumbo_destroy_output(&opts, soup);
  1131. fz_drop_pool(ctx, mem.pool);
  1132. if (dofree)
  1133. fz_free(ctx, p);
  1134. }
  1135. fz_catch(ctx)
  1136. {
  1137. fz_drop_pool(ctx, parser.pool);
  1138. fz_rethrow(ctx);
  1139. }
  1140. return xml;
  1141. #else
  1142. fz_throw(ctx, FZ_ERROR_GENERIC, "HTML Engine not enabled in this build");
  1143. #endif
  1144. }
  1145. fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
  1146. {
  1147. return fz_xml_find_dfs_top(item, tag, att, match, NULL);
  1148. }
  1149. fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
  1150. {
  1151. /* Skip over any DOC object. */
  1152. if (item && FZ_DOCUMENT_ITEM(item))
  1153. item = item->down;
  1154. while (item)
  1155. {
  1156. if (!FZ_TEXT_ITEM(item) && (tag == NULL || !strcmp(item->u.node.u.d.name, tag)))
  1157. {
  1158. if (att == NULL || (match == NULL ? fz_xml_att(item, att) != NULL : fz_xml_att_eq(item, att, match)))
  1159. return item;
  1160. }
  1161. if (!FZ_TEXT_ITEM(item) && item->down)
  1162. item = item->down;
  1163. else if (item->u.node.next)
  1164. item = item->u.node.next;
  1165. else
  1166. while (1) {
  1167. item = item->up;
  1168. /* Stop searching if we hit our declared 'top' item. */
  1169. if (item == top)
  1170. return NULL;
  1171. /* We should never reach item == NULL, but just in case. */
  1172. if (item == NULL)
  1173. return NULL;
  1174. /* If we reach the DOC object at the top, we're done. */
  1175. if (item->up == NULL)
  1176. return NULL;
  1177. if (item->u.node.next)
  1178. {
  1179. item = item->u.node.next;
  1180. break;
  1181. }
  1182. }
  1183. }
  1184. return NULL;
  1185. }
  1186. fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
  1187. {
  1188. return fz_xml_find_next_dfs_top(item, tag, att, match, NULL);
  1189. }
  1190. fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
  1191. {
  1192. /* Skip over any DOC object. */
  1193. if (item && FZ_DOCUMENT_ITEM(item))
  1194. item = item->down;
  1195. if (item == NULL)
  1196. return NULL;
  1197. if (item->down)
  1198. item = item->down;
  1199. else if (item->u.node.next)
  1200. item = item->u.node.next;
  1201. else
  1202. while (1) {
  1203. item = item->up;
  1204. /* Stop searching if we hit our declared 'top' item. */
  1205. if (item == top)
  1206. return NULL;
  1207. /* We should never reach item == NULL, but just in case. */
  1208. if (item == NULL)
  1209. return NULL;
  1210. /* If we reach the DOC object at the top, we're done. */
  1211. if (item->up == NULL)
  1212. return NULL;
  1213. if (item->u.node.next)
  1214. {
  1215. item = item->u.node.next;
  1216. break;
  1217. }
  1218. }
  1219. return fz_xml_find_dfs_top(item, tag, att, match, top);
  1220. }
  1221. fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml)
  1222. {
  1223. fz_xml *dom = xml;
  1224. if (xml == NULL)
  1225. return xml;
  1226. while (dom->up)
  1227. dom = dom->up;
  1228. fz_keep_imp(ctx, dom, &dom->u.doc.refs);
  1229. /* Return the original node pointer, not the dom pointer! */
  1230. return xml;
  1231. }