xml-dom.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655
  1. // Copyright (C) 2022-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "html-imp.h"
  23. #include "string.h"
  24. fz_xml *fz_story_document(fz_context *ctx, fz_story *story)
  25. {
  26. if (story == NULL || story->dom == NULL)
  27. return NULL;
  28. return story->dom;
  29. }
  30. fz_xml *fz_dom_body(fz_context *ctx, fz_xml *dom)
  31. {
  32. if (dom == NULL)
  33. return NULL;
  34. return fz_xml_find_dfs(dom, "body", NULL, NULL);
  35. }
  36. fz_xml *fz_dom_document_element(fz_context *ctx, fz_xml *dom)
  37. {
  38. if (dom == NULL)
  39. return NULL;
  40. while (dom->up)
  41. dom = dom->up;
  42. return dom->down;
  43. }
  44. static fz_xml *
  45. doc_pointer(fz_xml *a)
  46. {
  47. while (a->up)
  48. a = a->up;
  49. return a;
  50. }
  51. static void
  52. check_same_doc(fz_context *ctx, fz_xml *a, fz_xml *b)
  53. {
  54. /* Sanity check: The child and parent must come from the same doc. */
  55. if (doc_pointer(a) != doc_pointer(b))
  56. fz_throw(ctx, FZ_ERROR_ARGUMENT, "Parent and child must be from the same document");
  57. }
  58. /* Helper function to skip forward if we are passed a
  59. * doc pointer in circumstances where we should not be. */
  60. static fz_xml *
  61. skip_doc_pointer(fz_xml *x)
  62. {
  63. return (x == NULL || !FZ_DOCUMENT_ITEM(x)) ? x : x->down;
  64. }
  65. fz_xml *
  66. fz_new_dom(fz_context *ctx, const char *tag)
  67. {
  68. fz_pool *pool = fz_new_pool(ctx);
  69. fz_xml *xml;
  70. fz_try(ctx)
  71. {
  72. xml = fz_pool_alloc(ctx, pool, sizeof *xml);
  73. xml->up = NULL;
  74. xml->down = NULL;
  75. xml->u.doc.refs = 1;
  76. xml->u.doc.pool = pool;
  77. xml->down = fz_new_dom_node(ctx, xml, tag);
  78. xml->down->up = xml;
  79. }
  80. fz_catch(ctx)
  81. {
  82. fz_drop_pool(ctx, pool);
  83. fz_rethrow(ctx);
  84. }
  85. return xml->down;
  86. }
  87. fz_xml *
  88. fz_new_dom_node(fz_context *ctx, fz_xml *dom, const char *tag)
  89. {
  90. const char *ns;
  91. fz_xml *xml;
  92. size_t size;
  93. dom = doc_pointer(dom);
  94. /* skip namespace prefix */
  95. for (ns = tag; *ns; ++ns)
  96. if (*ns == ':')
  97. tag = ns + 1;
  98. size = offsetof(fz_xml, u.node.u.d.name) + ns-tag+1;
  99. xml = fz_pool_alloc(ctx, dom->u.doc.pool, size);
  100. memcpy(xml->u.node.u.d.name, tag, ns-tag+1);
  101. xml->u.node.u.d.atts = NULL;
  102. xml->down = NULL;
  103. xml->up = dom;
  104. xml->u.node.next = NULL;
  105. xml->u.node.prev = NULL;
  106. #ifdef FZ_XML_SEQ
  107. /* We don't have sequence numbers here. */
  108. xml->seq = 0;
  109. #endif
  110. return xml;
  111. }
  112. fz_xml *
  113. fz_new_dom_text_node(fz_context *ctx, fz_xml *dom, const char *text)
  114. {
  115. fz_xml *xml;
  116. size_t len = text ? strlen(text) : 0;
  117. size_t size;
  118. dom = doc_pointer(dom);
  119. size = offsetof(fz_xml, u.node.u.text) + len + 1;
  120. xml = fz_pool_alloc(ctx, dom->u.doc.pool, size);
  121. if (text)
  122. memcpy(xml->u.node.u.text, text, len);
  123. xml->u.node.u.text[len] = 0;
  124. xml->down = MAGIC_TEXT;
  125. xml->up = dom;
  126. xml->u.node.next = NULL;
  127. xml->u.node.prev = NULL;
  128. #ifdef FZ_XML_SEQ
  129. /* We don't have sequence numbers here. */
  130. xml->u.node.seq = 0;
  131. #endif
  132. return xml;
  133. }
  134. static fz_xml *
  135. clone_xml(fz_context *ctx, fz_xml *dom, fz_xml *node)
  136. {
  137. fz_xml *clone;
  138. struct attribute **dst;
  139. struct attribute *attr;
  140. fz_xml *child, *prev;
  141. if (dom == NULL || node == NULL)
  142. return NULL;
  143. /* Text nodes are simple. No children. */
  144. if (FZ_TEXT_ITEM(node))
  145. {
  146. return fz_new_dom_text_node(ctx, dom, node->u.node.u.text);
  147. }
  148. /* Clone a non-text node. */
  149. clone = fz_new_dom_node(ctx, dom, node->u.node.u.d.name);
  150. /* Clone the attributes. */
  151. attr = node->u.node.u.d.atts;
  152. dst = &clone->u.node.u.d.atts;
  153. while (attr)
  154. {
  155. size_t len = strlen(attr->name) + 1;
  156. size_t size = offsetof(struct attribute, name) + len;
  157. struct attribute *a = fz_pool_alloc(ctx, dom->u.doc.pool, size);
  158. memcpy(a->name, attr->name, len);
  159. a->next = NULL;
  160. a->value = NULL;
  161. if (attr->value)
  162. {
  163. a->value = fz_pool_alloc(ctx, dom->u.doc.pool, strlen(attr->value)+1);
  164. strcpy(a->value, attr->value);
  165. }
  166. *dst = a;
  167. dst = &a->next;
  168. attr = attr->next;
  169. }
  170. /* If we have no children, we're done. */
  171. if (node->down == NULL)
  172. return clone;
  173. /* Copy the first child. */
  174. clone->down = clone_xml(ctx, dom, node->down);
  175. clone->down->up = clone;
  176. /* And then run along all the successive children. */
  177. prev = clone->down;
  178. child = node->down->u.node.next;
  179. while (child)
  180. {
  181. prev->u.node.next = clone_xml(ctx, dom, child);
  182. prev->u.node.prev = prev;
  183. prev = prev->u.node.next;
  184. prev->up = clone;
  185. child = child->u.node.next;
  186. }
  187. return clone;
  188. }
  189. fz_xml *fz_dom_clone(fz_context *ctx, fz_xml *elt)
  190. {
  191. fz_xml *dom;
  192. if (elt == NULL)
  193. return NULL;
  194. /* We shouldn't be passed a document item really, but
  195. * cope. */
  196. if (FZ_DOCUMENT_ITEM(elt))
  197. elt = elt->down;
  198. /* Find the document pointer. */
  199. dom = elt;
  200. while (dom->up)
  201. dom = dom->up;
  202. return clone_xml(ctx, dom, elt);
  203. }
  204. fz_xml *fz_dom_create_element(fz_context *ctx, fz_xml *dom, const char *tag)
  205. {
  206. if (dom == NULL || tag == NULL)
  207. return NULL;
  208. /* We make a new node, unconnected to anything else.
  209. * up will still point to the dom root though. */
  210. return fz_new_dom_node(ctx, dom, tag);
  211. }
  212. fz_xml *fz_dom_create_text_node(fz_context *ctx, fz_xml *dom, const char *text)
  213. {
  214. if (dom == NULL || text == NULL)
  215. return NULL;
  216. /* We make a new node, unconnected to anything else. */
  217. return fz_new_dom_text_node(ctx, dom, text);
  218. }
  219. fz_xml *fz_dom_find(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match)
  220. {
  221. if (elt == NULL)
  222. return NULL;
  223. return fz_xml_find_dfs(elt, tag, att, match);
  224. }
  225. fz_xml *fz_dom_find_next(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match)
  226. {
  227. if (elt == NULL)
  228. return NULL;
  229. return fz_xml_find_next_dfs(elt, tag, att, match);
  230. }
  231. void fz_dom_append_child(fz_context *ctx, fz_xml *parent, fz_xml *child)
  232. {
  233. fz_xml *x;
  234. child = skip_doc_pointer(child);
  235. if (parent == NULL || child == NULL)
  236. return;
  237. check_same_doc(ctx, parent, child);
  238. /* Sanity checks: We can't add child to parent if parent is
  239. * a child of child. */
  240. x = parent;
  241. while (x)
  242. {
  243. if (x == child)
  244. fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a parent to its child.");
  245. x = x->up;
  246. }
  247. /* First unlink child from anywhere it's currently linked in. */
  248. if (child->u.node.prev)
  249. child->u.node.prev->u.node.next = child->u.node.next;
  250. else if (child->up->down == child && !FZ_DOCUMENT_ITEM(child->up))
  251. child->up->down = child->u.node.next;
  252. if (child->u.node.next)
  253. child->u.node.next->u.node.prev = child->u.node.prev;
  254. child->u.node.next = NULL;
  255. child->u.node.prev = NULL;
  256. /* Now find where to insert the child. */
  257. if (parent->down == NULL)
  258. {
  259. /* Insert as first (and only) child. */
  260. parent->down = child;
  261. }
  262. else
  263. {
  264. /* Find x, the current last child. */
  265. x = parent->down;
  266. while (x->u.node.next)
  267. x = x->u.node.next;
  268. /* And insert xchild after that. */
  269. x->u.node.next = child;
  270. child->u.node.prev = x;
  271. }
  272. child->up = parent;
  273. }
  274. void fz_dom_insert_before(fz_context *ctx, fz_xml *existing, fz_xml *elt)
  275. {
  276. fz_xml *x;
  277. existing = skip_doc_pointer(existing);
  278. elt = skip_doc_pointer(elt);
  279. if (existing == NULL || elt == NULL)
  280. return;
  281. check_same_doc(ctx, existing, elt);
  282. /* Sanity check: We can't add elt before existing if existing is
  283. * a child of elt. */
  284. x = existing;
  285. while (x)
  286. {
  287. if (x == elt)
  288. fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a node before its child.");
  289. x = x->up;
  290. }
  291. /* First unlink elt from anywhere it's currently linked in. */
  292. if (elt->u.node.prev)
  293. elt->u.node.prev->u.node.next = elt->u.node.next;
  294. else if (elt->up && !FZ_DOCUMENT_ITEM(elt->up))
  295. elt->up->down = elt->u.node.next;
  296. if (elt->u.node.next)
  297. elt->u.node.next->u.node.prev = elt->u.node.prev;
  298. elt->u.node.next = NULL;
  299. elt->u.node.prev = NULL;
  300. elt->up = NULL;
  301. /* Now insert the element */
  302. elt->u.node.prev = existing->u.node.prev;
  303. if (elt->u.node.prev)
  304. elt->u.node.prev->u.node.next = elt;
  305. else if (existing->up && !FZ_DOCUMENT_ITEM(existing->up))
  306. existing->up->down = elt;
  307. elt->u.node.next = existing;
  308. existing->u.node.prev = elt;
  309. elt->up = existing->up;
  310. }
  311. void fz_dom_insert_after(fz_context *ctx, fz_xml *existing, fz_xml *elt)
  312. {
  313. fz_xml *x;
  314. existing = skip_doc_pointer(existing);
  315. elt = skip_doc_pointer(elt);
  316. if (existing == NULL || elt == NULL)
  317. return;
  318. check_same_doc(ctx, existing, elt);
  319. /* Sanity check: We can't add elt before existing if existing is
  320. * a child of elt. */
  321. x = existing;
  322. while (x)
  323. {
  324. if (x == elt)
  325. fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a node after its child.");
  326. x = x->up;
  327. }
  328. /* First unlink child from anywhere it's currently linked in. */
  329. if (elt->u.node.prev)
  330. elt->u.node.prev->u.node.next = elt->u.node.next;
  331. else if (elt->up && !FZ_DOCUMENT_ITEM(elt->up))
  332. elt->up->down = elt->u.node.next;
  333. if (elt->u.node.next)
  334. elt->u.node.next->u.node.prev = elt->u.node.prev;
  335. elt->u.node.next = NULL;
  336. elt->u.node.prev = NULL;
  337. /* Now insert the element */
  338. elt->u.node.next = existing->u.node.next;
  339. if (elt->u.node.next)
  340. elt->u.node.next->u.node.prev = elt;
  341. elt->u.node.prev = existing;
  342. existing->u.node.next = elt;
  343. elt->up = existing->up;
  344. }
  345. void fz_dom_remove(fz_context *ctx, fz_xml *elt)
  346. {
  347. elt = skip_doc_pointer(elt);
  348. if (elt == NULL)
  349. return;
  350. /* Unlink child from anywhere it's currently linked in. */
  351. if (elt->u.node.prev)
  352. elt->u.node.prev->u.node.next = elt->u.node.next;
  353. else if (elt->up && !FZ_DOCUMENT_ITEM(elt))
  354. elt->up->down = elt->u.node.next;
  355. if (elt->u.node.next)
  356. elt->u.node.next->u.node.prev = elt->u.node.prev;
  357. elt->u.node.next = NULL;
  358. elt->u.node.prev = NULL;
  359. elt->up = doc_pointer(elt);
  360. }
  361. fz_xml *fz_dom_first_child(fz_context *ctx, fz_xml *elt)
  362. {
  363. elt = skip_doc_pointer(elt);
  364. if (elt == NULL || FZ_TEXT_ITEM(elt))
  365. return NULL;
  366. return elt->down;
  367. }
  368. fz_xml *fz_dom_parent(fz_context *ctx, fz_xml *elt)
  369. {
  370. elt = skip_doc_pointer(elt);
  371. if (elt == NULL)
  372. return NULL;
  373. if (FZ_DOCUMENT_ITEM(elt->up))
  374. return NULL;
  375. return elt->up;
  376. }
  377. fz_xml *fz_dom_next(fz_context *ctx, fz_xml *elt)
  378. {
  379. elt = skip_doc_pointer(elt);
  380. if (elt == NULL)
  381. return NULL;
  382. return elt->u.node.next;
  383. }
  384. fz_xml *fz_dom_previous(fz_context *ctx, fz_xml *elt)
  385. {
  386. elt = skip_doc_pointer(elt);
  387. if (elt == NULL)
  388. return NULL;
  389. return elt->u.node.prev;
  390. }
  391. void fz_dom_add_attribute(fz_context *ctx, fz_xml *elt, const char *att, const char *value)
  392. {
  393. struct attribute *attr;
  394. size_t len, size;
  395. char *mvalue = NULL;
  396. fz_xml *doc;
  397. elt = skip_doc_pointer(elt);
  398. if (elt == NULL || att == NULL)
  399. return;
  400. if (FZ_TEXT_ITEM(elt))
  401. fz_throw(ctx, FZ_ERROR_ARGUMENT, "Cannot add attributes to text node.");
  402. /* Move value to being a malloced thing, with the entity parsing done. */
  403. if (value) {
  404. char *d;
  405. const char *s = value;
  406. d = mvalue = fz_malloc(ctx, strlen(value)+1);
  407. while (*s)
  408. {
  409. if (*s == '&') {
  410. int c;
  411. s += xml_parse_entity(&c, s);
  412. d += fz_runetochar(d, c);
  413. }
  414. else
  415. *d++ = *s++;
  416. }
  417. *d = 0;
  418. }
  419. /* Do we have an attribute we can reuse? */
  420. attr = elt->u.node.u.d.atts;
  421. while (attr)
  422. {
  423. if (strcmp(att, attr->name) == 0)
  424. {
  425. /* Reuse this one. */
  426. break;
  427. }
  428. attr = attr->next;
  429. }
  430. if (attr && attr->value)
  431. {
  432. if (mvalue == NULL)
  433. {
  434. /* Just rewrite the existing value to be NULL. This
  435. * 'leaks' the old value within the pool, so it will
  436. * be cleaned up at the end. */
  437. attr->value = NULL;
  438. return;
  439. }
  440. if (strcmp(mvalue, attr->value) == 0)
  441. {
  442. /* Old and new values match. Nothing to change. */
  443. return;
  444. }
  445. }
  446. doc = doc_pointer(elt);
  447. /* Move mvalue to be an fz_pool thing. */
  448. if (mvalue)
  449. {
  450. char *tmp;
  451. fz_try(ctx)
  452. {
  453. tmp = fz_pool_alloc(ctx, doc->u.doc.pool, strlen(mvalue)+1);
  454. strcpy(tmp, mvalue);
  455. }
  456. fz_always(ctx)
  457. fz_free(ctx, mvalue);
  458. fz_catch(ctx)
  459. fz_rethrow(ctx);
  460. mvalue = tmp;
  461. }
  462. /* Make a new one and prepend it. */
  463. len = strlen(att) + 1;
  464. size = offsetof(struct attribute, name) + len;
  465. attr = fz_pool_alloc(ctx, doc->u.doc.pool, size);
  466. memcpy(attr->name, att, len);
  467. attr->next = elt->u.node.u.d.atts;
  468. elt->u.node.u.d.atts = attr;
  469. attr->value = mvalue;
  470. }
  471. void fz_dom_remove_attribute(fz_context *ctx, fz_xml *elt, const char *att)
  472. {
  473. struct attribute **attr;
  474. elt = skip_doc_pointer(elt);
  475. if (elt == NULL || att == NULL)
  476. return;
  477. if (FZ_TEXT_ITEM(elt))
  478. fz_throw(ctx, FZ_ERROR_ARGUMENT, "Cannot add attributes to text node.");
  479. attr = &elt->u.node.u.d.atts;
  480. while (*attr)
  481. {
  482. if (strcmp(att, (*attr)->name) == 0)
  483. {
  484. /* Delete this one. */
  485. /* The old attr/value are 'leaked' within the pool. */
  486. *attr = (*attr)->next;
  487. break;
  488. }
  489. attr = &(*attr)->next;
  490. }
  491. }
  492. const char *fz_dom_attribute(fz_context *ctx, fz_xml *elt, const char *att)
  493. {
  494. struct attribute *attr;
  495. elt = skip_doc_pointer(elt);
  496. if (elt == NULL || att == NULL)
  497. return NULL;
  498. /* Text nodes don't have attributes. */
  499. if (FZ_TEXT_ITEM(elt))
  500. return NULL;
  501. attr = elt->u.node.u.d.atts;
  502. while (attr)
  503. {
  504. if (strcmp(att, attr->name) == 0)
  505. {
  506. /* Found! */
  507. return attr->value;
  508. }
  509. }
  510. return NULL;
  511. }
  512. const char *fz_dom_get_attribute(fz_context *ctx, fz_xml *elt, int i, const char **att)
  513. {
  514. struct attribute *attr;
  515. if (elt == NULL || att == NULL)
  516. {
  517. if (att)
  518. *att = NULL;
  519. return NULL;
  520. }
  521. /* Text nodes don't have attributes. */
  522. if (FZ_TEXT_ITEM(elt) || i < 0)
  523. {
  524. *att = NULL;
  525. return NULL;
  526. }
  527. attr = elt->u.node.u.d.atts;
  528. while (attr)
  529. {
  530. if (i == 0)
  531. {
  532. /* Found! */
  533. *att = attr->name;
  534. return attr->value;
  535. }
  536. i--;
  537. attr = attr->next;
  538. }
  539. *att = NULL;
  540. return NULL;
  541. }