office.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343
  1. // Copyright (C) 2023-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "html-imp.h"
  24. #undef DEBUG_OFFICE_TO_HTML
  25. /* Defaults are all 0's. FIXME: Very subject to change. Possibly might be removed entirely. */
  26. typedef struct
  27. {
  28. int output_page_numbers;
  29. int output_sheet_names;
  30. int output_cell_markers;
  31. int output_cell_row_markers;
  32. int output_cell_names;
  33. int output_formatting;
  34. int output_filenames;
  35. int output_errors;
  36. }
  37. fz_office_to_html_opts;
  38. typedef struct
  39. {
  40. fz_office_to_html_opts opts;
  41. fz_output *out;
  42. int page;
  43. /* State for if we are parsing a sheet. */
  44. /* The last column label we have to send. */
  45. char *label;
  46. /* Columns are numbered from 1. */
  47. /* The column we are at. */
  48. int col_at;
  49. /* The column we last signalled. If this is 0, then we haven't
  50. * even started a row yet. */
  51. int col_signalled;
  52. /* If we are currently processing a spreadsheet, store the current
  53. * sheets name here. */
  54. const char *sheet_name;
  55. int shared_string_max;
  56. int shared_string_len;
  57. char **shared_strings;
  58. int footnotes_max;
  59. char **footnotes;
  60. char *title;
  61. } doc_info;
  62. static void
  63. doc_escape(fz_context *ctx, fz_output *output, const char *str_)
  64. {
  65. const unsigned char *str = (const unsigned char *)str_;
  66. int c;
  67. if (!str)
  68. return;
  69. while ((c = *str++) != 0)
  70. {
  71. if (c == '&')
  72. {
  73. fz_write_string(ctx, output, "&amp;");
  74. }
  75. else if (c == '<')
  76. {
  77. fz_write_string(ctx, output, "&lt;");
  78. }
  79. else if (c == '>')
  80. {
  81. fz_write_string(ctx, output, "&gt;");
  82. }
  83. else
  84. {
  85. /* We get utf-8 in, just parrot it out again. */
  86. fz_write_byte(ctx, output, c);
  87. }
  88. }
  89. }
  90. static void
  91. show_text(fz_context *ctx, fz_xml *top, doc_info *info)
  92. {
  93. fz_xml *pos = top;
  94. fz_xml *next;
  95. while (pos)
  96. {
  97. doc_escape(ctx, info->out, fz_xml_text(pos));
  98. if (fz_xml_is_tag(pos, "lineBreak"))
  99. {
  100. fz_write_string(ctx, info->out, "\n");
  101. }
  102. else if (fz_xml_is_tag(pos, "tab"))
  103. {
  104. fz_write_string(ctx, info->out, "\t");
  105. }
  106. else if (fz_xml_is_tag(pos, "lastRenderedPageBreak"))
  107. {
  108. info->page++;
  109. }
  110. /* Always try to move down. */
  111. next = fz_xml_down(pos);
  112. if (next)
  113. {
  114. /* We can move down, easy! */
  115. pos = next;
  116. continue;
  117. }
  118. if (pos == top)
  119. break;
  120. /* We can't move down, try moving to next. */
  121. next = fz_xml_next(pos);
  122. if (next)
  123. {
  124. /* We can move to next, easy! */
  125. pos = next;
  126. continue;
  127. }
  128. /* If we can't go down, or next, pop up until we
  129. * find somewhere we can go next from. */
  130. while (1)
  131. {
  132. /* OK. So move up. */
  133. pos = fz_xml_up(pos);
  134. /* Check for hitting the top. */
  135. if (pos == top)
  136. pos = NULL;
  137. if (pos == NULL)
  138. break;
  139. /* We've returned to a node. See if it's a 'p'. */
  140. if (fz_xml_is_tag(pos, "p"))
  141. {
  142. fz_write_string(ctx, info->out, "\n");
  143. }
  144. next = fz_xml_next(pos);
  145. if (next)
  146. {
  147. pos = next;
  148. break;
  149. }
  150. }
  151. }
  152. }
  153. static void
  154. show_footnote(fz_context *ctx, fz_xml *v, doc_info *info)
  155. {
  156. int n = fz_atoi(fz_xml_att(v, "w:id"));
  157. if (n < 0 || n >= info->footnotes_max)
  158. return;
  159. if (info->footnotes[n] == NULL ||
  160. info->footnotes[n][0] == 0)
  161. return;
  162. /* Then send the strings. */
  163. doc_escape(ctx, info->out, info->footnotes[n]);
  164. }
  165. static void
  166. process_doc_stream(fz_context *ctx, fz_xml *xml, doc_info *info, int do_pages)
  167. {
  168. fz_xml *pos;
  169. fz_xml *next;
  170. const char *paragraph_style = NULL;
  171. const char *inline_style = NULL;
  172. #ifdef DEBUG_OFFICE_TO_HTML
  173. fz_write_printf(ctx, fz_stddbg(ctx), "process_doc_stream:\n");
  174. fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
  175. #endif
  176. /* First off, see if we can do page numbers. */
  177. if (do_pages)
  178. {
  179. pos = fz_xml_find_dfs(xml, "lastRenderedPageBreak", NULL, NULL);
  180. if (pos)
  181. {
  182. /* We *can* do page numbers, so start here. */
  183. fz_write_string(ctx, info->out, "<div id=\"page1\">\n");
  184. info->page = 1;
  185. }
  186. }
  187. /* Now walk the tree for real. */
  188. pos = xml;
  189. while (pos)
  190. {
  191. /* When we arrive on a node, check if it's a 't'. */
  192. if (fz_xml_is_tag(pos, "t"))
  193. {
  194. show_text(ctx, pos, info);
  195. /* Do NOT go down, we've already dealt with that. */
  196. }
  197. else if (fz_xml_is_tag(pos, "br"))
  198. {
  199. if (paragraph_style && strcmp(paragraph_style, "pre"))
  200. {
  201. fz_write_printf(ctx, info->out, "<br/>\n");
  202. }
  203. else
  204. {
  205. fz_write_printf(ctx, info->out, "\n");
  206. }
  207. }
  208. else if (fz_xml_is_tag(pos, "footnoteReference"))
  209. {
  210. show_footnote(ctx, pos, info);
  211. /* Do NOT go down, we've already dealt with that. */
  212. }
  213. else if (fz_xml_is_tag(pos, "tabs"))
  214. {
  215. /* Don't walk through tabs, or we will hit lots of 'tab' entries and
  216. * output incorrect information. */
  217. }
  218. else if (fz_xml_is_tag(pos, "pStyle"))
  219. {
  220. /* Should prob fix fz_xml_*() to strip namespace prefix
  221. from attributes, to match what it does for tag names.
  222. */
  223. paragraph_style = fz_xml_att(pos, "w:val");
  224. if (paragraph_style)
  225. {
  226. if (!strcmp(paragraph_style, "BodyText"))
  227. paragraph_style = NULL;
  228. else if (!strcmp(paragraph_style, "Heading1"))
  229. paragraph_style = "h1";
  230. else if (!strcmp(paragraph_style, "Heading2"))
  231. paragraph_style = "h2";
  232. else if (!strcmp(paragraph_style, "Heading3"))
  233. paragraph_style = "h3";
  234. else if (!strcmp(paragraph_style, "Heading4"))
  235. paragraph_style = "h4";
  236. else if (!strcmp(paragraph_style, "Heading5"))
  237. paragraph_style = "h5";
  238. else if (!strcmp(paragraph_style, "Heading6"))
  239. paragraph_style = "h6";
  240. else if (!strcmp(paragraph_style, "SourceCode"))
  241. paragraph_style = "pre";
  242. else
  243. paragraph_style = NULL;
  244. if (paragraph_style)
  245. fz_write_printf(ctx, info->out, "<%s>", paragraph_style);
  246. }
  247. }
  248. else if (fz_xml_is_tag(pos, "rStyle"))
  249. {
  250. inline_style = fz_xml_att(pos, "w:val");
  251. if (inline_style)
  252. {
  253. if (!strcmp(inline_style, "VerbatimChar"))
  254. inline_style = "tt";
  255. else
  256. {
  257. if (0)
  258. fz_write_printf(ctx, info->out, "<!-- %s -->", inline_style);
  259. inline_style = NULL;
  260. }
  261. if (inline_style)
  262. fz_write_printf(ctx, info->out, "<%s>", inline_style);
  263. }
  264. }
  265. else
  266. {
  267. fz_xml *down;
  268. if (fz_xml_is_tag(pos, "lineBreak"))
  269. {
  270. fz_write_string(ctx, info->out, "\n");
  271. }
  272. else if (fz_xml_is_tag(pos, "p"))
  273. {
  274. fz_write_string(ctx, info->out, "<p>");
  275. }
  276. else if (fz_xml_is_tag(pos, "tab"))
  277. {
  278. fz_write_string(ctx, info->out, "\t");
  279. }
  280. else if (do_pages && fz_xml_is_tag(pos, "lastRenderedPageBreak"))
  281. {
  282. if (info->page)
  283. fz_write_string(ctx, info->out, "\n</div>\n");
  284. info->page++;
  285. fz_write_printf(ctx, info->out, "<div id=\"page%d\">\n", info->page);
  286. }
  287. /* Try to move down. */
  288. down = fz_xml_down(pos);
  289. if (down)
  290. {
  291. /* We can move down, easy! */
  292. pos = down;
  293. continue;
  294. }
  295. }
  296. /* Try moving to next. */
  297. next = fz_xml_next(pos);
  298. if (next)
  299. {
  300. /* We can move to next, easy! */
  301. pos = next;
  302. continue;
  303. }
  304. /* If we can't go down, or next, pop up until we
  305. * find somewhere we can go next from. */
  306. while (1)
  307. {
  308. /* OK. So move up. */
  309. pos = fz_xml_up(pos);
  310. /* Check for hitting the top. */
  311. if (pos == NULL)
  312. break;
  313. /* We've returned to a node. See if it's a 'p'. */
  314. if (fz_xml_is_tag(pos, "p"))
  315. {
  316. if (paragraph_style)
  317. {
  318. fz_write_printf(ctx, info->out, "</%s>", paragraph_style);
  319. paragraph_style = NULL;
  320. }
  321. fz_write_string(ctx, info->out, "</p>\n");
  322. }
  323. else if (fz_xml_is_tag(pos, "r"))
  324. {
  325. /* Seems to be pseudo-close for rStyle. */
  326. if (inline_style)
  327. {
  328. fz_write_printf(ctx, info->out, "</%s>", inline_style);
  329. inline_style = NULL;
  330. }
  331. }
  332. next = fz_xml_next(pos);
  333. if (next)
  334. {
  335. pos = next;
  336. break;
  337. }
  338. }
  339. }
  340. if (do_pages && info->page)
  341. fz_write_string(ctx, info->out, "\n</div>\n");
  342. }
  343. static void
  344. process_item(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info, int do_pages)
  345. {
  346. fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
  347. fz_try(ctx)
  348. process_doc_stream(ctx, xml, info, do_pages);
  349. fz_always(ctx)
  350. fz_drop_xml(ctx, xml);
  351. fz_catch(ctx)
  352. fz_rethrow(ctx);
  353. }
  354. static void
  355. process_rootfile(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
  356. {
  357. fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 0);
  358. fz_try(ctx)
  359. {
  360. /* FIXME: Should really search for these just inside 'spine'. */
  361. fz_xml *pos = fz_xml_find_dfs(xml, "itemref", NULL, NULL);
  362. while (pos)
  363. {
  364. char *idref = fz_xml_att(pos, "idref");
  365. fz_xml *item = fz_xml_find_dfs(xml, "item", "id", idref);
  366. while (item)
  367. {
  368. char *type = fz_xml_att(item, "media-type");
  369. char *href = fz_xml_att(item, "href");
  370. if (type && href && !strcmp(type, "application/xml"))
  371. {
  372. process_item(ctx, arch, href, info, 1);
  373. }
  374. item = fz_xml_find_next_dfs(pos, "item", "id", idref);
  375. }
  376. pos = fz_xml_find_next_dfs(pos, "itemref", NULL, NULL);
  377. }
  378. }
  379. fz_always(ctx)
  380. fz_drop_xml(ctx, xml);
  381. fz_catch(ctx)
  382. fz_rethrow(ctx);
  383. }
  384. /* XLSX support */
  385. static char *
  386. make_rel_name(fz_context *ctx, const char *file)
  387. {
  388. size_t z = strlen(file);
  389. char *s = fz_malloc(ctx, z + 12);
  390. char *t;
  391. const char *p;
  392. const char *slash = file;
  393. for (p = file; *p != 0; p++)
  394. if (*p == '/')
  395. slash = p+1;
  396. t = s;
  397. if (slash != file)
  398. {
  399. memcpy(t, file, slash - file);
  400. t += slash - file;
  401. }
  402. memcpy(t, "_rels/", 6);
  403. t += 6;
  404. memcpy(t, file + (slash - file), z - (slash - file));
  405. t += z - (slash - file);
  406. memcpy(t, ".rels", 6);
  407. return s;
  408. }
  409. static char *lookup_rel(fz_context *ctx, fz_xml *rels, const char *id)
  410. {
  411. fz_xml *pos;
  412. if (id == NULL)
  413. return NULL;
  414. pos = fz_xml_find_dfs(rels, "Relationship", NULL, NULL);
  415. while (pos)
  416. {
  417. char *id2 = fz_xml_att(pos, "Id");
  418. if (id2 && !strcmp(id, id2))
  419. return fz_xml_att(pos, "Target");
  420. pos = fz_xml_find_next_dfs(pos, "Relationship", NULL, NULL);
  421. }
  422. return NULL;
  423. }
  424. static void
  425. send_cell_formatting(fz_context *ctx, doc_info *info)
  426. {
  427. if (info->col_signalled == 0)
  428. {
  429. fz_write_string(ctx, info->out, "<tr>\n");
  430. info->col_signalled = 1;
  431. if (info->col_at > 1)
  432. fz_write_string(ctx, info->out, "<td>");
  433. }
  434. /* Send the label */
  435. while (info->col_signalled < info->col_at)
  436. {
  437. fz_write_string(ctx, info->out, "</td>");
  438. info->col_signalled++;
  439. if (info->col_signalled < info->col_at)
  440. fz_write_string(ctx, info->out, "<td>");
  441. }
  442. if (info->sheet_name && info->sheet_name[0])
  443. fz_write_printf(ctx, info->out, "<td id=\"%s!%s\">", info->sheet_name, info->label);
  444. else
  445. fz_write_printf(ctx, info->out, "<td id=\"%s\">", info->label);
  446. }
  447. static void
  448. show_shared_string(fz_context *ctx, fz_xml *v, doc_info *info)
  449. {
  450. const char *t = fz_xml_text(fz_xml_down(v));
  451. int n = fz_atoi(t);
  452. if (n < 0 || n >= info->shared_string_len)
  453. return;
  454. if (info->shared_strings[n] == NULL ||
  455. info->shared_strings[n][0] == 0)
  456. return;
  457. send_cell_formatting(ctx, info);
  458. /* Then send the strings. */
  459. doc_escape(ctx, info->out, info->shared_strings[n]);
  460. }
  461. static int
  462. col_from_label(const char *label)
  463. {
  464. int col = 0;
  465. int len = 26;
  466. int base = 0;
  467. /* If we can't read the column, return 0. */
  468. if (label == NULL || *label < 'A' || *label > 'Z')
  469. return 0;
  470. /* Each section (A-Z, AA-ZZ, AAA-ZZZ etc) is of len 'len', and starts
  471. * at base index 'base'. Each section is 26 times as long, and starts
  472. * at base + len from the previous section.
  473. *
  474. * A: col = 26 * 0 + 0 + 0
  475. * AA: col = (26 * 0 + 0 + 0) * 26 + 0 + 26 = 26
  476. * AAA: col = (((26 * 0 + 0 + 0) * 26 + 0 + 26)*26 + 0 + 26*26 = 26 + 26 * 26
  477. */
  478. do
  479. {
  480. col = 26 * col + (*label++) - 'A' + base;
  481. base += len;
  482. len *= 26;
  483. }
  484. while (*label >= 'A' && *label <= 'Z');
  485. return col+1;
  486. }
  487. static void
  488. show_cell_text(fz_context *ctx, fz_xml *top, doc_info *info)
  489. {
  490. fz_xml *pos = top;
  491. fz_xml *next;
  492. while (pos)
  493. {
  494. char *text = fz_xml_text(pos);
  495. if (text)
  496. {
  497. send_cell_formatting(ctx, info);
  498. doc_escape(ctx, info->out, text);
  499. }
  500. /* Always try to move down. */
  501. next = fz_xml_down(pos);
  502. if (next)
  503. {
  504. /* We can move down, easy! */
  505. pos = next;
  506. continue;
  507. }
  508. if (pos == top)
  509. break;
  510. /* We can't move down, try moving to next. */
  511. next = fz_xml_next(pos);
  512. if (next)
  513. {
  514. /* We can move to next, easy! */
  515. pos = next;
  516. continue;
  517. }
  518. /* If we can't go down, or next, pop up until we
  519. * find somewhere we can go next from. */
  520. while (1)
  521. {
  522. /* OK. So move up. */
  523. pos = fz_xml_up(pos);
  524. /* Check for hitting the top. */
  525. if (pos == top)
  526. pos = NULL;
  527. if (pos == NULL)
  528. break;
  529. next = fz_xml_next(pos);
  530. if (next)
  531. {
  532. pos = next;
  533. break;
  534. }
  535. }
  536. }
  537. }
  538. static void
  539. arrived_at_cell(fz_context *ctx, doc_info *info, const char *label)
  540. {
  541. int col;
  542. /* If we have a label queued, and no label is given here, then we're
  543. * processing a 'cell' callback after having had a 'cellname'
  544. * callback. So don't signal it twice! */
  545. if (label == NULL && info->label)
  546. return;
  547. col = label ? col_from_label(label) : 0;
  548. fz_free(ctx, info->label);
  549. info->label = NULL;
  550. info->label = label ? fz_strdup(ctx, label) : NULL;
  551. info->col_at = col;
  552. }
  553. static void
  554. show_cell(fz_context *ctx, fz_xml *cell, doc_info *info)
  555. {
  556. char *t = fz_xml_att(cell, "t");
  557. fz_xml *v = fz_xml_find_down(cell, "v");
  558. const char *r = fz_xml_att(cell, "r");
  559. arrived_at_cell(ctx, info, r);
  560. if (t && t[0] == 's' && t[1] == 0)
  561. show_shared_string(ctx, v, info);
  562. else
  563. show_cell_text(ctx, v, info);
  564. }
  565. static void
  566. new_row(fz_context *ctx, doc_info *info)
  567. {
  568. if (info->col_signalled)
  569. {
  570. /* We've sent at least one cell. So need to close the
  571. * td and tr */
  572. fz_write_string(ctx, info->out, "</td>\n</tr>\n");
  573. }
  574. else
  575. {
  576. /* We've not sent anything for this row. Keep the counts
  577. * correct. */
  578. fz_write_string(ctx, info->out, "<tr></tr>\n");
  579. }
  580. info->col_at = 1;
  581. info->col_signalled = 0;
  582. fz_free(ctx, info->label);
  583. info->label = NULL;
  584. }
  585. static void
  586. process_sheet(fz_context *ctx, fz_archive *arch, const char *name, const char *file, doc_info *info)
  587. {
  588. fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
  589. #ifdef DEBUG_OFFICE_TO_HTML
  590. fz_write_printf(ctx, fz_stddbg(ctx), "process_sheet:\n");
  591. fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
  592. #endif
  593. fz_write_printf(ctx, info->out, "<table id=\"%s\">\n", name);
  594. info->sheet_name = name;
  595. info->col_at = 0;
  596. info->col_signalled = 0;
  597. fz_try(ctx)
  598. {
  599. fz_xml *pos = xml;
  600. fz_xml *next;
  601. while (pos)
  602. {
  603. /* When we arrive on a node, check if it's a cell. */
  604. if (fz_xml_is_tag(pos, "c"))
  605. {
  606. show_cell(ctx, pos, info);
  607. /* Do NOT go down, we've already dealt with that. */
  608. }
  609. else
  610. {
  611. /* Try to move down. */
  612. next = fz_xml_down(pos);
  613. if (next)
  614. {
  615. /* We can move down, easy! */
  616. pos = next;
  617. continue;
  618. }
  619. }
  620. /* Try moving to next. */
  621. next = fz_xml_next(pos);
  622. if (next)
  623. {
  624. /* We can move to next, easy! */
  625. pos = next;
  626. continue;
  627. }
  628. /* If we can't go down, or next, pop up until we
  629. * find somewhere we can go next from. */
  630. while (1)
  631. {
  632. /* OK. So move up. */
  633. pos = fz_xml_up(pos);
  634. /* Check for hitting the top. */
  635. if (pos == NULL)
  636. break;
  637. /* We've returned to a node. See if it's a 'row'. */
  638. if (fz_xml_is_tag(pos, "row"))
  639. new_row(ctx, info);
  640. next = fz_xml_next(pos);
  641. if (next)
  642. {
  643. pos = next;
  644. break;
  645. }
  646. }
  647. }
  648. if (info->col_signalled)
  649. fz_write_printf(ctx, info->out, "</td>\n</tr>\n");
  650. fz_write_printf(ctx, info->out, "</table>\n");
  651. }
  652. fz_always(ctx)
  653. fz_drop_xml(ctx, xml);
  654. fz_catch(ctx)
  655. fz_rethrow(ctx);
  656. }
  657. static void
  658. process_slide(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
  659. {
  660. fz_write_printf(ctx, info->out, "<div id=\"slide%d\">\n", info->page++);
  661. process_item(ctx, arch, file, info, 0);
  662. fz_write_printf(ctx, info->out, "</div>\n");
  663. }
  664. static char *
  665. make_absolute_path(fz_context *ctx, const char *abs, const char *rel)
  666. {
  667. const char *a = abs;
  668. const char *aslash = a;
  669. int up = 0;
  670. size_t z1, z2;
  671. char *s;
  672. if (rel == NULL)
  673. return NULL;
  674. if (abs == NULL || *rel == '/')
  675. return fz_strdup(ctx, rel);
  676. for (a = abs; *a != 0; a++)
  677. if (*a == '/')
  678. aslash = a+1;
  679. while (rel[0] == '.')
  680. {
  681. if (rel[1] == '/')
  682. rel += 2;
  683. else if (rel[1] == '.' && rel[2] == '/')
  684. rel += 3, up++;
  685. else
  686. fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
  687. }
  688. if (rel[0] == 0)
  689. fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
  690. while (up)
  691. {
  692. while (aslash != abs && aslash[-1] != '/')
  693. aslash--;
  694. up--;
  695. }
  696. z1 = aslash - abs;
  697. z2 = strlen(rel);
  698. s = fz_malloc(ctx, z1 + z2 + 1);
  699. if (z1)
  700. memcpy(s, abs, z1);
  701. memcpy(s+z1, rel, z2+1);
  702. return s;
  703. }
  704. static char *
  705. collate_t_content(fz_context *ctx, fz_xml *top)
  706. {
  707. char *val = NULL;
  708. fz_xml *next;
  709. fz_xml *pos = fz_xml_down(top);
  710. while (pos != top)
  711. {
  712. /* Capture all the 't' content. */
  713. if (fz_xml_is_tag(pos, "t"))
  714. {
  715. /* Remember the content. */
  716. char *s = fz_xml_text(fz_xml_down(pos));
  717. if (s == NULL)
  718. {
  719. /* Do nothing */
  720. }
  721. else if (val == NULL)
  722. val = fz_strdup(ctx, s);
  723. else
  724. {
  725. char *val2;
  726. size_t z1 = strlen(val);
  727. size_t z2 = strlen(s) + 1;
  728. fz_try(ctx)
  729. {
  730. val2 = fz_malloc(ctx, z1 + z2);
  731. }
  732. fz_catch(ctx)
  733. {
  734. fz_free(ctx, val);
  735. fz_rethrow(ctx);
  736. }
  737. memcpy(val2, val, z1);
  738. memcpy(val2 + z1, s, z2);
  739. fz_free(ctx, val);
  740. val = val2;
  741. }
  742. /* Do NOT go down, we've already dealt with that. */
  743. }
  744. else if (fz_xml_is_tag(pos, "rPr") || fz_xml_is_tag(pos, "rPh"))
  745. {
  746. /* We do not want the 't' content from within these. */
  747. }
  748. else
  749. {
  750. /* Try to move down. */
  751. next = fz_xml_down(pos);
  752. if (next)
  753. {
  754. /* We can move down, easy! */
  755. pos = next;
  756. continue;
  757. }
  758. }
  759. /* Try moving to next. */
  760. next = fz_xml_next(pos);
  761. if (next)
  762. {
  763. /* We can move to next, easy! */
  764. pos = next;
  765. continue;
  766. }
  767. /* If we can't go down, or next, pop up until we
  768. * find somewhere we can go next from. */
  769. while (1)
  770. {
  771. /* OK. So move up. */
  772. pos = fz_xml_up(pos);
  773. /* Check for hitting the top. */
  774. if (pos == top)
  775. break;
  776. next = fz_xml_next(pos);
  777. if (next)
  778. {
  779. pos = next;
  780. break;
  781. }
  782. }
  783. }
  784. return val;
  785. }
  786. static fz_xml *
  787. try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
  788. {
  789. if (!fz_has_archive_entry(ctx, arch, filename))
  790. return NULL;
  791. return fz_parse_xml_archive_entry(ctx, arch, filename, preserve_white);
  792. }
  793. static void
  794. load_shared_strings(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
  795. {
  796. fz_xml *pos = fz_xml_find_dfs(rels, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings");
  797. const char *ss_file = fz_xml_att(pos, "Target");
  798. char *resolved = NULL;
  799. fz_xml *xml = NULL;
  800. char *str = NULL;
  801. if (ss_file == NULL)
  802. return;
  803. fz_var(xml);
  804. fz_var(str);
  805. fz_var(resolved);
  806. fz_try(ctx)
  807. {
  808. resolved = make_absolute_path(ctx, file, ss_file);
  809. xml = fz_parse_xml_archive_entry(ctx, arch, resolved, 1);
  810. pos = fz_xml_find_dfs(xml, "si", NULL, NULL);
  811. while (pos)
  812. {
  813. int n = info->shared_string_len;
  814. str = collate_t_content(ctx, pos);
  815. if (n == info->shared_string_max)
  816. {
  817. int max = info->shared_string_max;
  818. int newmax = max ? max * 2 : 1024;
  819. char **arr = fz_realloc(ctx, info->shared_strings, sizeof(*arr) * newmax);
  820. memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
  821. info->shared_strings = arr;
  822. info->shared_string_max = newmax;
  823. }
  824. info->shared_strings[n] = str;
  825. str = NULL;
  826. info->shared_string_len++;
  827. pos = fz_xml_find_next_dfs(pos, "si", NULL, NULL);
  828. }
  829. }
  830. fz_always(ctx)
  831. {
  832. fz_drop_xml(ctx, xml);
  833. fz_free(ctx, resolved);
  834. fz_free(ctx, str);
  835. }
  836. fz_catch(ctx)
  837. fz_rethrow(ctx);
  838. }
  839. static void
  840. load_footnotes(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
  841. {
  842. char *resolved = NULL;
  843. fz_xml *xml = NULL;
  844. char *str = NULL;
  845. fz_var(xml);
  846. fz_var(str);
  847. fz_var(resolved);
  848. fz_try(ctx)
  849. {
  850. fz_xml *pos;
  851. resolved = make_absolute_path(ctx, file, "footnotes.xml");
  852. xml = try_parse_xml_archive_entry(ctx, arch, resolved, 1);
  853. if (xml == NULL)
  854. break;
  855. pos = fz_xml_find_dfs(xml, "footnote", NULL, NULL);
  856. while (pos)
  857. {
  858. int n = fz_atoi(fz_xml_att(pos, "w:id"));
  859. str = collate_t_content(ctx, pos);
  860. if (str && n >= 0)
  861. {
  862. if (n >= info->footnotes_max)
  863. {
  864. int max = info->footnotes_max;
  865. int newmax = max ? max * 2 : 1024;
  866. char **arr;
  867. if (newmax < n)
  868. newmax = n+1;
  869. arr = fz_realloc(ctx, info->footnotes, sizeof(*arr) * newmax);
  870. memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
  871. info->footnotes = arr;
  872. info->footnotes_max = newmax;
  873. }
  874. info->footnotes[n] = str;
  875. str = NULL;
  876. }
  877. pos = fz_xml_find_next_dfs(pos, "footnote", NULL, NULL);
  878. }
  879. }
  880. fz_always(ctx)
  881. {
  882. fz_drop_xml(ctx, xml);
  883. fz_free(ctx, resolved);
  884. fz_free(ctx, str);
  885. }
  886. fz_catch(ctx)
  887. fz_rethrow(ctx);
  888. }
  889. static void
  890. process_office_document(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
  891. {
  892. char *file_rels;
  893. fz_xml *xml = NULL;
  894. fz_xml *rels = NULL;
  895. char *resolved_rel = NULL;
  896. if (file == NULL)
  897. return;
  898. file_rels = make_rel_name(ctx, file);
  899. fz_var(resolved_rel);
  900. fz_var(rels);
  901. fz_var(xml);
  902. fz_try(ctx)
  903. {
  904. fz_xml *pos;
  905. rels = fz_parse_xml_archive_entry(ctx, arch, file_rels, 0);
  906. xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
  907. /* XLSX */
  908. pos = fz_xml_find_dfs(xml, "sheet", NULL, NULL);
  909. if (pos)
  910. {
  911. load_shared_strings(ctx, arch, rels, info, file);
  912. while (pos)
  913. {
  914. char *name = fz_xml_att(pos, "name");
  915. char *id = fz_xml_att(pos, "r:id");
  916. char *sheet = lookup_rel(ctx, rels, id);
  917. if (sheet)
  918. {
  919. resolved_rel = make_absolute_path(ctx, file, sheet);
  920. process_sheet(ctx, arch, name, resolved_rel, info);
  921. fz_free(ctx, resolved_rel);
  922. resolved_rel = NULL;
  923. }
  924. pos = fz_xml_find_next_dfs(pos, "sheet", NULL, NULL);
  925. }
  926. break;
  927. }
  928. /* Let's try it as a powerpoint */
  929. pos = fz_xml_find_dfs(xml, "sldId", NULL, NULL);
  930. if (pos)
  931. {
  932. while (pos)
  933. {
  934. char *id = fz_xml_att(pos, "r:id");
  935. char *sheet = lookup_rel(ctx, rels, id);
  936. if (sheet)
  937. {
  938. resolved_rel = make_absolute_path(ctx, file, sheet);
  939. process_slide(ctx, arch, resolved_rel, info);
  940. fz_free(ctx, resolved_rel);
  941. resolved_rel = NULL;
  942. }
  943. pos = fz_xml_find_next_dfs(pos, "sldId", NULL, NULL);
  944. }
  945. break;
  946. }
  947. /* Let's try it as word. */
  948. {
  949. load_footnotes(ctx, arch, rels, info, file);
  950. process_doc_stream(ctx, xml, info, 1);
  951. }
  952. }
  953. fz_always(ctx)
  954. {
  955. fz_drop_xml(ctx, xml);
  956. fz_drop_xml(ctx, rels);
  957. fz_free(ctx, resolved_rel);
  958. fz_free(ctx, file_rels);
  959. }
  960. fz_catch(ctx)
  961. fz_rethrow(ctx);
  962. }
  963. static void
  964. process_office_document_properties(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
  965. {
  966. fz_xml *xml = NULL;
  967. char *title;
  968. fz_var(xml);
  969. fz_try(ctx)
  970. {
  971. fz_xml *pos;
  972. xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
  973. pos = fz_xml_find_dfs(xml, "title", NULL, NULL);
  974. title = fz_xml_text(fz_xml_down(pos));
  975. if (title)
  976. {
  977. fz_write_string(ctx, info->out, "<title>");
  978. doc_escape(ctx, info->out, title);
  979. fz_write_string(ctx, info->out, "</title>");
  980. }
  981. }
  982. fz_always(ctx)
  983. {
  984. fz_drop_xml(ctx, xml);
  985. }
  986. fz_catch(ctx)
  987. fz_rethrow(ctx);
  988. }
  989. static fz_buffer *
  990. fz_office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buffer_in, fz_archive *dir, const char *user_css, fz_office_to_html_opts *opts)
  991. {
  992. fz_stream *stream = NULL;
  993. fz_archive *archive = NULL;
  994. fz_buffer *buffer_out = NULL;
  995. fz_xml *xml = NULL;
  996. fz_xml *pos = NULL;
  997. fz_xml *rels = NULL;
  998. const char *schema = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
  999. const char *schema_props = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties";
  1000. doc_info info = { 0 };
  1001. int i;
  1002. fz_var(archive);
  1003. fz_var(stream);
  1004. fz_var(buffer_out);
  1005. fz_var(xml);
  1006. fz_var(rels);
  1007. if (opts)
  1008. info.opts = *opts;
  1009. fz_try(ctx)
  1010. {
  1011. if (buffer_in)
  1012. {
  1013. stream = fz_open_buffer(ctx, buffer_in);
  1014. archive = fz_open_archive_with_stream(ctx, stream);
  1015. }
  1016. else
  1017. archive = fz_keep_archive(ctx, dir);
  1018. buffer_out = fz_new_buffer(ctx, 1024);
  1019. info.out = fz_new_output_with_buffer(ctx, buffer_out);
  1020. /* Is it an HWPX ?*/
  1021. xml = try_parse_xml_archive_entry(ctx, archive, "META-INF/container.xml", 0);
  1022. if (xml)
  1023. {
  1024. pos = fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml");
  1025. if (!pos)
  1026. fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not hwpx.");
  1027. while (pos)
  1028. {
  1029. const char *file = fz_xml_att(pos, "full-path");
  1030. process_rootfile(ctx, archive, file, &info);
  1031. pos = fz_xml_find_next_dfs(pos, "rootfile", "media-type", "application/hwpml-package+xml");
  1032. }
  1033. fz_close_output(ctx, info.out);
  1034. break;
  1035. }
  1036. /* Try other types */
  1037. {
  1038. xml = try_parse_xml_archive_entry(ctx, archive, "_rels/.rels", 0);
  1039. fz_write_string(ctx, info.out, "<html>\n");
  1040. pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema_props);
  1041. if (pos)
  1042. {
  1043. const char *file = fz_xml_att(pos, "Target");
  1044. fz_write_string(ctx, info.out, "<head>\n");
  1045. process_office_document_properties(ctx, archive, file, &info);
  1046. fz_write_string(ctx, info.out, "</head>\n");
  1047. }
  1048. fz_write_string(ctx, info.out, "<body>\n");
  1049. pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema);
  1050. if (!pos)
  1051. fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not docx.");
  1052. while (pos)
  1053. {
  1054. const char *file = fz_xml_att(pos, "Target");
  1055. if (file)
  1056. process_office_document(ctx, archive, file, &info);
  1057. pos = fz_xml_find_next_dfs(pos, "Relationship", "Type", schema);
  1058. }
  1059. }
  1060. fz_close_output(ctx, info.out);
  1061. }
  1062. fz_always(ctx)
  1063. {
  1064. fz_drop_xml(ctx, rels);
  1065. fz_drop_xml(ctx, xml);
  1066. for (i = 0; i < info.shared_string_len; ++i)
  1067. fz_free(ctx, info.shared_strings[i]);
  1068. fz_free(ctx, info.shared_strings);
  1069. for (i = 0; i < info.footnotes_max; ++i)
  1070. fz_free(ctx, info.footnotes[i]);
  1071. fz_free(ctx, info.footnotes);
  1072. fz_drop_output(ctx, info.out);
  1073. fz_drop_archive(ctx, archive);
  1074. fz_drop_stream(ctx, stream);
  1075. }
  1076. fz_catch(ctx)
  1077. {
  1078. fz_drop_buffer(ctx, buffer_out);
  1079. fz_rethrow(ctx);
  1080. }
  1081. #ifdef DEBUG_OFFICE_TO_HTML
  1082. {
  1083. unsigned char *storage;
  1084. size_t len = fz_buffer_storage(ctx, buffer_out, &storage);
  1085. fz_write_printf(ctx, fz_stddbg(ctx), "fz_office_to_html: Output buffer, len=%zd:\n", len);
  1086. fz_write_buffer(ctx, fz_stddbg(ctx), buffer_out);
  1087. }
  1088. #endif
  1089. return buffer_out;
  1090. }
  1091. /* Office document handler */
  1092. static fz_buffer *
  1093. office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
  1094. {
  1095. fz_office_to_html_opts opts = { 0 };
  1096. return fz_office_to_html(ctx, set, buf, zip, user_css, &opts);
  1097. }
  1098. static const fz_htdoc_format_t fz_htdoc_office =
  1099. {
  1100. "Office document",
  1101. office_to_html,
  1102. 0, 1, 0
  1103. };
  1104. static fz_document *
  1105. office_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
  1106. {
  1107. return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_office);
  1108. }
  /* NULL-terminated list of file extensions listed in office_document_handler. */
  static const char *office_extensions[] = { "docx", "xlsx", "pptx", "hwpx", NULL };
  /* NULL-terminated list of MIME types listed in office_document_handler. */
  static const char *office_mimetypes[] =
  {
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document",   /* DOCX */
      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",         /* XLSX */
      "application/vnd.openxmlformats-officedocument.presentationml.presentation", /* PPTX */
      "application/haansofthwpx",                                                  /* HWPX */
      "application/vnd.hancom.hwpx",                                               /* HWPX */
      NULL
  };
  1130. /* We are only ever 75% sure here, to allow a 'better' handler, such as sodochandler
  1131. * to override us by returning 100. */
  1132. static int
  1133. office_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *zip, void **state, fz_document_recognize_state_free_fn **free_state)
  1134. {
  1135. fz_archive *arch = NULL;
  1136. int ret = 0;
  1137. fz_xml *xml = NULL;
  1138. if (state)
  1139. *state = NULL;
  1140. if (free_state)
  1141. *free_state = NULL;
  1142. fz_var(arch);
  1143. fz_var(ret);
  1144. fz_var(xml);
  1145. fz_try(ctx)
  1146. {
  1147. if (stream)
  1148. {
  1149. arch = fz_try_open_archive_with_stream(ctx, stream);
  1150. if (arch == NULL)
  1151. break;
  1152. }
  1153. else
  1154. arch = fz_keep_archive(ctx, zip);
  1155. xml = fz_try_parse_xml_archive_entry(ctx, arch, "META-INF/container.xml", 0);
  1156. if (xml)
  1157. {
  1158. if (fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml"))
  1159. ret = 75; /* HWPX */
  1160. break;
  1161. }
  1162. xml = fz_try_parse_xml_archive_entry(ctx, arch, "_rels/.rels", 0);
  1163. if (xml)
  1164. {
  1165. if (fz_xml_find_dfs(xml, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"))
  1166. {
  1167. ret = 75; /* DOCX | PPTX | XLSX */
  1168. }
  1169. break;
  1170. }
  1171. }
  1172. fz_always(ctx)
  1173. {
  1174. fz_drop_xml(ctx, xml);
  1175. fz_drop_archive(ctx, arch);
  1176. }
  1177. fz_catch(ctx)
  1178. fz_rethrow(ctx);
  1179. return ret;
  1180. }
  1181. fz_document_handler office_document_handler =
  1182. {
  1183. NULL,
  1184. office_open_document,
  1185. office_extensions,
  1186. office_mimetypes,
  1187. office_recognize_doc_content
  1188. };