| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343 |
- // Copyright (C) 2023-2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
#include "mupdf/fitz.h"
#include "html-imp.h"

#include <limits.h>
/* Debug switch: left undefined here. The #ifdef DEBUG_OFFICE_TO_HTML
 * sections in this file dump the parsed XML and the generated HTML to
 * fz_stddbg when it is defined instead. */
#undef DEBUG_OFFICE_TO_HTML

/*
	Conversion options.

	Defaults are all 0's.
	FIXME: Very subject to change. Possibly might be removed entirely.

	NOTE(review): in the code visible in this file the options are only
	copied into doc_info; none of the individual flags is read.
*/
typedef struct
{
	int output_page_numbers;
	int output_sheet_names;

	int output_cell_markers;
	int output_cell_row_markers;
	int output_cell_names;

	int output_formatting;
	int output_filenames;
	int output_errors;
}
fz_office_to_html_opts;
- typedef struct
- {
- fz_office_to_html_opts opts;
- fz_output *out;
- int page;
- /* State for if we are parsing a sheet. */
- /* The last column label we have to send. */
- char *label;
- /* Columns are numbered from 1. */
- /* The column we are at. */
- int col_at;
- /* The column we last signalled. If this is 0, then we haven't
- * even started a row yet. */
- int col_signalled;
- /* If we are currently processing a spreadsheet, store the current
- * sheets name here. */
- const char *sheet_name;
- int shared_string_max;
- int shared_string_len;
- char **shared_strings;
- int footnotes_max;
- char **footnotes;
- char *title;
- } doc_info;
- static void
- doc_escape(fz_context *ctx, fz_output *output, const char *str_)
- {
- const unsigned char *str = (const unsigned char *)str_;
- int c;
- if (!str)
- return;
- while ((c = *str++) != 0)
- {
- if (c == '&')
- {
- fz_write_string(ctx, output, "&");
- }
- else if (c == '<')
- {
- fz_write_string(ctx, output, "<");
- }
- else if (c == '>')
- {
- fz_write_string(ctx, output, ">");
- }
- else
- {
- /* We get utf-8 in, just parrot it out again. */
- fz_write_byte(ctx, output, c);
- }
- }
- }
- static void
- show_text(fz_context *ctx, fz_xml *top, doc_info *info)
- {
- fz_xml *pos = top;
- fz_xml *next;
- while (pos)
- {
- doc_escape(ctx, info->out, fz_xml_text(pos));
- if (fz_xml_is_tag(pos, "lineBreak"))
- {
- fz_write_string(ctx, info->out, "\n");
- }
- else if (fz_xml_is_tag(pos, "tab"))
- {
- fz_write_string(ctx, info->out, "\t");
- }
- else if (fz_xml_is_tag(pos, "lastRenderedPageBreak"))
- {
- info->page++;
- }
- /* Always try to move down. */
- next = fz_xml_down(pos);
- if (next)
- {
- /* We can move down, easy! */
- pos = next;
- continue;
- }
- if (pos == top)
- break;
- /* We can't move down, try moving to next. */
- next = fz_xml_next(pos);
- if (next)
- {
- /* We can move to next, easy! */
- pos = next;
- continue;
- }
- /* If we can't go down, or next, pop up until we
- * find somewhere we can go next from. */
- while (1)
- {
- /* OK. So move up. */
- pos = fz_xml_up(pos);
- /* Check for hitting the top. */
- if (pos == top)
- pos = NULL;
- if (pos == NULL)
- break;
- /* We've returned to a node. See if it's a 'p'. */
- if (fz_xml_is_tag(pos, "p"))
- {
- fz_write_string(ctx, info->out, "\n");
- }
- next = fz_xml_next(pos);
- if (next)
- {
- pos = next;
- break;
- }
- }
- }
- }
- static void
- show_footnote(fz_context *ctx, fz_xml *v, doc_info *info)
- {
- int n = fz_atoi(fz_xml_att(v, "w:id"));
- if (n < 0 || n >= info->footnotes_max)
- return;
- if (info->footnotes[n] == NULL ||
- info->footnotes[n][0] == 0)
- return;
- /* Then send the strings. */
- doc_escape(ctx, info->out, info->footnotes[n]);
- }
- static void
- process_doc_stream(fz_context *ctx, fz_xml *xml, doc_info *info, int do_pages)
- {
- fz_xml *pos;
- fz_xml *next;
- const char *paragraph_style = NULL;
- const char *inline_style = NULL;
- #ifdef DEBUG_OFFICE_TO_HTML
- fz_write_printf(ctx, fz_stddbg(ctx), "process_doc_stream:\n");
- fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
- #endif
- /* First off, see if we can do page numbers. */
- if (do_pages)
- {
- pos = fz_xml_find_dfs(xml, "lastRenderedPageBreak", NULL, NULL);
- if (pos)
- {
- /* We *can* do page numbers, so start here. */
- fz_write_string(ctx, info->out, "<div id=\"page1\">\n");
- info->page = 1;
- }
- }
- /* Now walk the tree for real. */
- pos = xml;
- while (pos)
- {
- /* When we arrive on a node, check if it's a 't'. */
- if (fz_xml_is_tag(pos, "t"))
- {
- show_text(ctx, pos, info);
- /* Do NOT go down, we've already dealt with that. */
- }
- else if (fz_xml_is_tag(pos, "br"))
- {
- if (paragraph_style && strcmp(paragraph_style, "pre"))
- {
- fz_write_printf(ctx, info->out, "<br/>\n");
- }
- else
- {
- fz_write_printf(ctx, info->out, "\n");
- }
- }
- else if (fz_xml_is_tag(pos, "footnoteReference"))
- {
- show_footnote(ctx, pos, info);
- /* Do NOT go down, we've already dealt with that. */
- }
- else if (fz_xml_is_tag(pos, "tabs"))
- {
- /* Don't walk through tabs, or we will hit lots of 'tab' entries and
- * output incorrect information. */
- }
- else if (fz_xml_is_tag(pos, "pStyle"))
- {
- /* Should prob fix fz_xml_*() to strip namespace prefix
- from attributes, to match what it does for tag names.
- */
- paragraph_style = fz_xml_att(pos, "w:val");
- if (paragraph_style)
- {
- if (!strcmp(paragraph_style, "BodyText"))
- paragraph_style = NULL;
- else if (!strcmp(paragraph_style, "Heading1"))
- paragraph_style = "h1";
- else if (!strcmp(paragraph_style, "Heading2"))
- paragraph_style = "h2";
- else if (!strcmp(paragraph_style, "Heading3"))
- paragraph_style = "h3";
- else if (!strcmp(paragraph_style, "Heading4"))
- paragraph_style = "h4";
- else if (!strcmp(paragraph_style, "Heading5"))
- paragraph_style = "h5";
- else if (!strcmp(paragraph_style, "Heading6"))
- paragraph_style = "h6";
- else if (!strcmp(paragraph_style, "SourceCode"))
- paragraph_style = "pre";
- else
- paragraph_style = NULL;
- if (paragraph_style)
- fz_write_printf(ctx, info->out, "<%s>", paragraph_style);
- }
- }
- else if (fz_xml_is_tag(pos, "rStyle"))
- {
- inline_style = fz_xml_att(pos, "w:val");
- if (inline_style)
- {
- if (!strcmp(inline_style, "VerbatimChar"))
- inline_style = "tt";
- else
- {
- if (0)
- fz_write_printf(ctx, info->out, "<!-- %s -->", inline_style);
- inline_style = NULL;
- }
- if (inline_style)
- fz_write_printf(ctx, info->out, "<%s>", inline_style);
- }
- }
- else
- {
- fz_xml *down;
- if (fz_xml_is_tag(pos, "lineBreak"))
- {
- fz_write_string(ctx, info->out, "\n");
- }
- else if (fz_xml_is_tag(pos, "p"))
- {
- fz_write_string(ctx, info->out, "<p>");
- }
- else if (fz_xml_is_tag(pos, "tab"))
- {
- fz_write_string(ctx, info->out, "\t");
- }
- else if (do_pages && fz_xml_is_tag(pos, "lastRenderedPageBreak"))
- {
- if (info->page)
- fz_write_string(ctx, info->out, "\n</div>\n");
- info->page++;
- fz_write_printf(ctx, info->out, "<div id=\"page%d\">\n", info->page);
- }
- /* Try to move down. */
- down = fz_xml_down(pos);
- if (down)
- {
- /* We can move down, easy! */
- pos = down;
- continue;
- }
- }
- /* Try moving to next. */
- next = fz_xml_next(pos);
- if (next)
- {
- /* We can move to next, easy! */
- pos = next;
- continue;
- }
- /* If we can't go down, or next, pop up until we
- * find somewhere we can go next from. */
- while (1)
- {
- /* OK. So move up. */
- pos = fz_xml_up(pos);
- /* Check for hitting the top. */
- if (pos == NULL)
- break;
- /* We've returned to a node. See if it's a 'p'. */
- if (fz_xml_is_tag(pos, "p"))
- {
- if (paragraph_style)
- {
- fz_write_printf(ctx, info->out, "</%s>", paragraph_style);
- paragraph_style = NULL;
- }
- fz_write_string(ctx, info->out, "</p>\n");
- }
- else if (fz_xml_is_tag(pos, "r"))
- {
- /* Seems to be pseudo-close for rStyle. */
- if (inline_style)
- {
- fz_write_printf(ctx, info->out, "</%s>", inline_style);
- inline_style = NULL;
- }
- }
- next = fz_xml_next(pos);
- if (next)
- {
- pos = next;
- break;
- }
- }
- }
- if (do_pages && info->page)
- fz_write_string(ctx, info->out, "\n</div>\n");
- }
- static void
- process_item(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info, int do_pages)
- {
- fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
- fz_try(ctx)
- process_doc_stream(ctx, xml, info, do_pages);
- fz_always(ctx)
- fz_drop_xml(ctx, xml);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- process_rootfile(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
- {
- fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 0);
- fz_try(ctx)
- {
- /* FIXME: Should really search for these just inside 'spine'. */
- fz_xml *pos = fz_xml_find_dfs(xml, "itemref", NULL, NULL);
- while (pos)
- {
- char *idref = fz_xml_att(pos, "idref");
- fz_xml *item = fz_xml_find_dfs(xml, "item", "id", idref);
- while (item)
- {
- char *type = fz_xml_att(item, "media-type");
- char *href = fz_xml_att(item, "href");
- if (type && href && !strcmp(type, "application/xml"))
- {
- process_item(ctx, arch, href, info, 1);
- }
- item = fz_xml_find_next_dfs(pos, "item", "id", idref);
- }
- pos = fz_xml_find_next_dfs(pos, "itemref", NULL, NULL);
- }
- }
- fz_always(ctx)
- fz_drop_xml(ctx, xml);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- /* XLSX support */
- static char *
- make_rel_name(fz_context *ctx, const char *file)
- {
- size_t z = strlen(file);
- char *s = fz_malloc(ctx, z + 12);
- char *t;
- const char *p;
- const char *slash = file;
- for (p = file; *p != 0; p++)
- if (*p == '/')
- slash = p+1;
- t = s;
- if (slash != file)
- {
- memcpy(t, file, slash - file);
- t += slash - file;
- }
- memcpy(t, "_rels/", 6);
- t += 6;
- memcpy(t, file + (slash - file), z - (slash - file));
- t += z - (slash - file);
- memcpy(t, ".rels", 6);
- return s;
- }
- static char *lookup_rel(fz_context *ctx, fz_xml *rels, const char *id)
- {
- fz_xml *pos;
- if (id == NULL)
- return NULL;
- pos = fz_xml_find_dfs(rels, "Relationship", NULL, NULL);
- while (pos)
- {
- char *id2 = fz_xml_att(pos, "Id");
- if (id2 && !strcmp(id, id2))
- return fz_xml_att(pos, "Target");
- pos = fz_xml_find_next_dfs(pos, "Relationship", NULL, NULL);
- }
- return NULL;
- }
- static void
- send_cell_formatting(fz_context *ctx, doc_info *info)
- {
- if (info->col_signalled == 0)
- {
- fz_write_string(ctx, info->out, "<tr>\n");
- info->col_signalled = 1;
- if (info->col_at > 1)
- fz_write_string(ctx, info->out, "<td>");
- }
- /* Send the label */
- while (info->col_signalled < info->col_at)
- {
- fz_write_string(ctx, info->out, "</td>");
- info->col_signalled++;
- if (info->col_signalled < info->col_at)
- fz_write_string(ctx, info->out, "<td>");
- }
- if (info->sheet_name && info->sheet_name[0])
- fz_write_printf(ctx, info->out, "<td id=\"%s!%s\">", info->sheet_name, info->label);
- else
- fz_write_printf(ctx, info->out, "<td id=\"%s\">", info->label);
- }
- static void
- show_shared_string(fz_context *ctx, fz_xml *v, doc_info *info)
- {
- const char *t = fz_xml_text(fz_xml_down(v));
- int n = fz_atoi(t);
- if (n < 0 || n >= info->shared_string_len)
- return;
- if (info->shared_strings[n] == NULL ||
- info->shared_strings[n][0] == 0)
- return;
- send_cell_formatting(ctx, info);
- /* Then send the strings. */
- doc_escape(ctx, info->out, info->shared_strings[n]);
- }
/*
	Convert the column letters at the start of a cell label (e.g. the
	"AB" of "AB12") into a 1-based column number.

	Column letters form a bijective base-26 number with digits A=1..Z=26:

		A   -> 1
		Z   -> 26
		AA  -> 26*1 + 1 = 27
		ZZ  -> 26*26 + 26 = 702
		AAA -> 26*27 + 1 = 703

	Returns 0 if the label is NULL, does not start with an upper case
	letter, or names a column too large to be represented in an int.
*/
static int
col_from_label(const char *label)
{
	int col = 0;

	/* If we can't read the column, return 0. */
	if (label == NULL || *label < 'A' || *label > 'Z')
		return 0;

	while (*label >= 'A' && *label <= 'Z')
	{
		int digit = *label++ - 'A' + 1;

		/* Refuse labels that would overflow rather than invoking
		 * undefined behaviour on untrusted input. */
		if (col > (INT_MAX - digit) / 26)
			return 0;

		col = 26 * col + digit;
	}

	return col;
}
- static void
- show_cell_text(fz_context *ctx, fz_xml *top, doc_info *info)
- {
- fz_xml *pos = top;
- fz_xml *next;
- while (pos)
- {
- char *text = fz_xml_text(pos);
- if (text)
- {
- send_cell_formatting(ctx, info);
- doc_escape(ctx, info->out, text);
- }
- /* Always try to move down. */
- next = fz_xml_down(pos);
- if (next)
- {
- /* We can move down, easy! */
- pos = next;
- continue;
- }
- if (pos == top)
- break;
- /* We can't move down, try moving to next. */
- next = fz_xml_next(pos);
- if (next)
- {
- /* We can move to next, easy! */
- pos = next;
- continue;
- }
- /* If we can't go down, or next, pop up until we
- * find somewhere we can go next from. */
- while (1)
- {
- /* OK. So move up. */
- pos = fz_xml_up(pos);
- /* Check for hitting the top. */
- if (pos == top)
- pos = NULL;
- if (pos == NULL)
- break;
- next = fz_xml_next(pos);
- if (next)
- {
- pos = next;
- break;
- }
- }
- }
- }
- static void
- arrived_at_cell(fz_context *ctx, doc_info *info, const char *label)
- {
- int col;
- /* If we have a label queued, and no label is given here, then we're
- * processing a 'cell' callback after having had a 'cellname'
- * callback. So don't signal it twice! */
- if (label == NULL && info->label)
- return;
- col = label ? col_from_label(label) : 0;
- fz_free(ctx, info->label);
- info->label = NULL;
- info->label = label ? fz_strdup(ctx, label) : NULL;
- info->col_at = col;
- }
- static void
- show_cell(fz_context *ctx, fz_xml *cell, doc_info *info)
- {
- char *t = fz_xml_att(cell, "t");
- fz_xml *v = fz_xml_find_down(cell, "v");
- const char *r = fz_xml_att(cell, "r");
- arrived_at_cell(ctx, info, r);
- if (t && t[0] == 's' && t[1] == 0)
- show_shared_string(ctx, v, info);
- else
- show_cell_text(ctx, v, info);
- }
- static void
- new_row(fz_context *ctx, doc_info *info)
- {
- if (info->col_signalled)
- {
- /* We've sent at least one cell. So need to close the
- * td and tr */
- fz_write_string(ctx, info->out, "</td>\n</tr>\n");
- }
- else
- {
- /* We've not sent anything for this row. Keep the counts
- * correct. */
- fz_write_string(ctx, info->out, "<tr></tr>\n");
- }
- info->col_at = 1;
- info->col_signalled = 0;
- fz_free(ctx, info->label);
- info->label = NULL;
- }
- static void
- process_sheet(fz_context *ctx, fz_archive *arch, const char *name, const char *file, doc_info *info)
- {
- fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
- #ifdef DEBUG_OFFICE_TO_HTML
- fz_write_printf(ctx, fz_stddbg(ctx), "process_sheet:\n");
- fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
- #endif
- fz_write_printf(ctx, info->out, "<table id=\"%s\">\n", name);
- info->sheet_name = name;
- info->col_at = 0;
- info->col_signalled = 0;
- fz_try(ctx)
- {
- fz_xml *pos = xml;
- fz_xml *next;
- while (pos)
- {
- /* When we arrive on a node, check if it's a cell. */
- if (fz_xml_is_tag(pos, "c"))
- {
- show_cell(ctx, pos, info);
- /* Do NOT go down, we've already dealt with that. */
- }
- else
- {
- /* Try to move down. */
- next = fz_xml_down(pos);
- if (next)
- {
- /* We can move down, easy! */
- pos = next;
- continue;
- }
- }
- /* Try moving to next. */
- next = fz_xml_next(pos);
- if (next)
- {
- /* We can move to next, easy! */
- pos = next;
- continue;
- }
- /* If we can't go down, or next, pop up until we
- * find somewhere we can go next from. */
- while (1)
- {
- /* OK. So move up. */
- pos = fz_xml_up(pos);
- /* Check for hitting the top. */
- if (pos == NULL)
- break;
- /* We've returned to a node. See if it's a 'row'. */
- if (fz_xml_is_tag(pos, "row"))
- new_row(ctx, info);
- next = fz_xml_next(pos);
- if (next)
- {
- pos = next;
- break;
- }
- }
- }
- if (info->col_signalled)
- fz_write_printf(ctx, info->out, "</td>\n</tr>\n");
- fz_write_printf(ctx, info->out, "</table>\n");
- }
- fz_always(ctx)
- fz_drop_xml(ctx, xml);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- process_slide(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
- {
- fz_write_printf(ctx, info->out, "<div id=\"slide%d\">\n", info->page++);
- process_item(ctx, arch, file, info, 0);
- fz_write_printf(ctx, info->out, "</div>\n");
- }
- static char *
- make_absolute_path(fz_context *ctx, const char *abs, const char *rel)
- {
- const char *a = abs;
- const char *aslash = a;
- int up = 0;
- size_t z1, z2;
- char *s;
- if (rel == NULL)
- return NULL;
- if (abs == NULL || *rel == '/')
- return fz_strdup(ctx, rel);
- for (a = abs; *a != 0; a++)
- if (*a == '/')
- aslash = a+1;
- while (rel[0] == '.')
- {
- if (rel[1] == '/')
- rel += 2;
- else if (rel[1] == '.' && rel[2] == '/')
- rel += 3, up++;
- else
- fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
- }
- if (rel[0] == 0)
- fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
- while (up)
- {
- while (aslash != abs && aslash[-1] != '/')
- aslash--;
- up--;
- }
- z1 = aslash - abs;
- z2 = strlen(rel);
- s = fz_malloc(ctx, z1 + z2 + 1);
- if (z1)
- memcpy(s, abs, z1);
- memcpy(s+z1, rel, z2+1);
- return s;
- }
- static char *
- collate_t_content(fz_context *ctx, fz_xml *top)
- {
- char *val = NULL;
- fz_xml *next;
- fz_xml *pos = fz_xml_down(top);
- while (pos != top)
- {
- /* Capture all the 't' content. */
- if (fz_xml_is_tag(pos, "t"))
- {
- /* Remember the content. */
- char *s = fz_xml_text(fz_xml_down(pos));
- if (s == NULL)
- {
- /* Do nothing */
- }
- else if (val == NULL)
- val = fz_strdup(ctx, s);
- else
- {
- char *val2;
- size_t z1 = strlen(val);
- size_t z2 = strlen(s) + 1;
- fz_try(ctx)
- {
- val2 = fz_malloc(ctx, z1 + z2);
- }
- fz_catch(ctx)
- {
- fz_free(ctx, val);
- fz_rethrow(ctx);
- }
- memcpy(val2, val, z1);
- memcpy(val2 + z1, s, z2);
- fz_free(ctx, val);
- val = val2;
- }
- /* Do NOT go down, we've already dealt with that. */
- }
- else if (fz_xml_is_tag(pos, "rPr") || fz_xml_is_tag(pos, "rPh"))
- {
- /* We do not want the 't' content from within these. */
- }
- else
- {
- /* Try to move down. */
- next = fz_xml_down(pos);
- if (next)
- {
- /* We can move down, easy! */
- pos = next;
- continue;
- }
- }
- /* Try moving to next. */
- next = fz_xml_next(pos);
- if (next)
- {
- /* We can move to next, easy! */
- pos = next;
- continue;
- }
- /* If we can't go down, or next, pop up until we
- * find somewhere we can go next from. */
- while (1)
- {
- /* OK. So move up. */
- pos = fz_xml_up(pos);
- /* Check for hitting the top. */
- if (pos == top)
- break;
- next = fz_xml_next(pos);
- if (next)
- {
- pos = next;
- break;
- }
- }
- }
- return val;
- }
- static fz_xml *
- try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
- {
- if (!fz_has_archive_entry(ctx, arch, filename))
- return NULL;
- return fz_parse_xml_archive_entry(ctx, arch, filename, preserve_white);
- }
- static void
- load_shared_strings(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
- {
- fz_xml *pos = fz_xml_find_dfs(rels, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings");
- const char *ss_file = fz_xml_att(pos, "Target");
- char *resolved = NULL;
- fz_xml *xml = NULL;
- char *str = NULL;
- if (ss_file == NULL)
- return;
- fz_var(xml);
- fz_var(str);
- fz_var(resolved);
- fz_try(ctx)
- {
- resolved = make_absolute_path(ctx, file, ss_file);
- xml = fz_parse_xml_archive_entry(ctx, arch, resolved, 1);
- pos = fz_xml_find_dfs(xml, "si", NULL, NULL);
- while (pos)
- {
- int n = info->shared_string_len;
- str = collate_t_content(ctx, pos);
- if (n == info->shared_string_max)
- {
- int max = info->shared_string_max;
- int newmax = max ? max * 2 : 1024;
- char **arr = fz_realloc(ctx, info->shared_strings, sizeof(*arr) * newmax);
- memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
- info->shared_strings = arr;
- info->shared_string_max = newmax;
- }
- info->shared_strings[n] = str;
- str = NULL;
- info->shared_string_len++;
- pos = fz_xml_find_next_dfs(pos, "si", NULL, NULL);
- }
- }
- fz_always(ctx)
- {
- fz_drop_xml(ctx, xml);
- fz_free(ctx, resolved);
- fz_free(ctx, str);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- load_footnotes(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
- {
- char *resolved = NULL;
- fz_xml *xml = NULL;
- char *str = NULL;
- fz_var(xml);
- fz_var(str);
- fz_var(resolved);
- fz_try(ctx)
- {
- fz_xml *pos;
- resolved = make_absolute_path(ctx, file, "footnotes.xml");
- xml = try_parse_xml_archive_entry(ctx, arch, resolved, 1);
- if (xml == NULL)
- break;
- pos = fz_xml_find_dfs(xml, "footnote", NULL, NULL);
- while (pos)
- {
- int n = fz_atoi(fz_xml_att(pos, "w:id"));
- str = collate_t_content(ctx, pos);
- if (str && n >= 0)
- {
- if (n >= info->footnotes_max)
- {
- int max = info->footnotes_max;
- int newmax = max ? max * 2 : 1024;
- char **arr;
- if (newmax < n)
- newmax = n+1;
- arr = fz_realloc(ctx, info->footnotes, sizeof(*arr) * newmax);
- memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
- info->footnotes = arr;
- info->footnotes_max = newmax;
- }
- info->footnotes[n] = str;
- str = NULL;
- }
- pos = fz_xml_find_next_dfs(pos, "footnote", NULL, NULL);
- }
- }
- fz_always(ctx)
- {
- fz_drop_xml(ctx, xml);
- fz_free(ctx, resolved);
- fz_free(ctx, str);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- process_office_document(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
- {
- char *file_rels;
- fz_xml *xml = NULL;
- fz_xml *rels = NULL;
- char *resolved_rel = NULL;
- if (file == NULL)
- return;
- file_rels = make_rel_name(ctx, file);
- fz_var(resolved_rel);
- fz_var(rels);
- fz_var(xml);
- fz_try(ctx)
- {
- fz_xml *pos;
- rels = fz_parse_xml_archive_entry(ctx, arch, file_rels, 0);
- xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
- /* XLSX */
- pos = fz_xml_find_dfs(xml, "sheet", NULL, NULL);
- if (pos)
- {
- load_shared_strings(ctx, arch, rels, info, file);
- while (pos)
- {
- char *name = fz_xml_att(pos, "name");
- char *id = fz_xml_att(pos, "r:id");
- char *sheet = lookup_rel(ctx, rels, id);
- if (sheet)
- {
- resolved_rel = make_absolute_path(ctx, file, sheet);
- process_sheet(ctx, arch, name, resolved_rel, info);
- fz_free(ctx, resolved_rel);
- resolved_rel = NULL;
- }
- pos = fz_xml_find_next_dfs(pos, "sheet", NULL, NULL);
- }
- break;
- }
- /* Let's try it as a powerpoint */
- pos = fz_xml_find_dfs(xml, "sldId", NULL, NULL);
- if (pos)
- {
- while (pos)
- {
- char *id = fz_xml_att(pos, "r:id");
- char *sheet = lookup_rel(ctx, rels, id);
- if (sheet)
- {
- resolved_rel = make_absolute_path(ctx, file, sheet);
- process_slide(ctx, arch, resolved_rel, info);
- fz_free(ctx, resolved_rel);
- resolved_rel = NULL;
- }
- pos = fz_xml_find_next_dfs(pos, "sldId", NULL, NULL);
- }
- break;
- }
- /* Let's try it as word. */
- {
- load_footnotes(ctx, arch, rels, info, file);
- process_doc_stream(ctx, xml, info, 1);
- }
- }
- fz_always(ctx)
- {
- fz_drop_xml(ctx, xml);
- fz_drop_xml(ctx, rels);
- fz_free(ctx, resolved_rel);
- fz_free(ctx, file_rels);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- process_office_document_properties(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
- {
- fz_xml *xml = NULL;
- char *title;
- fz_var(xml);
- fz_try(ctx)
- {
- fz_xml *pos;
- xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
- pos = fz_xml_find_dfs(xml, "title", NULL, NULL);
- title = fz_xml_text(fz_xml_down(pos));
- if (title)
- {
- fz_write_string(ctx, info->out, "<title>");
- doc_escape(ctx, info->out, title);
- fz_write_string(ctx, info->out, "</title>");
- }
- }
- fz_always(ctx)
- {
- fz_drop_xml(ctx, xml);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static fz_buffer *
- fz_office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buffer_in, fz_archive *dir, const char *user_css, fz_office_to_html_opts *opts)
- {
- fz_stream *stream = NULL;
- fz_archive *archive = NULL;
- fz_buffer *buffer_out = NULL;
- fz_xml *xml = NULL;
- fz_xml *pos = NULL;
- fz_xml *rels = NULL;
- const char *schema = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
- const char *schema_props = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties";
- doc_info info = { 0 };
- int i;
- fz_var(archive);
- fz_var(stream);
- fz_var(buffer_out);
- fz_var(xml);
- fz_var(rels);
- if (opts)
- info.opts = *opts;
- fz_try(ctx)
- {
- if (buffer_in)
- {
- stream = fz_open_buffer(ctx, buffer_in);
- archive = fz_open_archive_with_stream(ctx, stream);
- }
- else
- archive = fz_keep_archive(ctx, dir);
- buffer_out = fz_new_buffer(ctx, 1024);
- info.out = fz_new_output_with_buffer(ctx, buffer_out);
- /* Is it an HWPX ?*/
- xml = try_parse_xml_archive_entry(ctx, archive, "META-INF/container.xml", 0);
- if (xml)
- {
- pos = fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml");
- if (!pos)
- fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not hwpx.");
- while (pos)
- {
- const char *file = fz_xml_att(pos, "full-path");
- process_rootfile(ctx, archive, file, &info);
- pos = fz_xml_find_next_dfs(pos, "rootfile", "media-type", "application/hwpml-package+xml");
- }
- fz_close_output(ctx, info.out);
- break;
- }
- /* Try other types */
- {
- xml = try_parse_xml_archive_entry(ctx, archive, "_rels/.rels", 0);
- fz_write_string(ctx, info.out, "<html>\n");
- pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema_props);
- if (pos)
- {
- const char *file = fz_xml_att(pos, "Target");
- fz_write_string(ctx, info.out, "<head>\n");
- process_office_document_properties(ctx, archive, file, &info);
- fz_write_string(ctx, info.out, "</head>\n");
- }
- fz_write_string(ctx, info.out, "<body>\n");
- pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema);
- if (!pos)
- fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not docx.");
- while (pos)
- {
- const char *file = fz_xml_att(pos, "Target");
- if (file)
- process_office_document(ctx, archive, file, &info);
- pos = fz_xml_find_next_dfs(pos, "Relationship", "Type", schema);
- }
- }
- fz_close_output(ctx, info.out);
- }
- fz_always(ctx)
- {
- fz_drop_xml(ctx, rels);
- fz_drop_xml(ctx, xml);
- for (i = 0; i < info.shared_string_len; ++i)
- fz_free(ctx, info.shared_strings[i]);
- fz_free(ctx, info.shared_strings);
- for (i = 0; i < info.footnotes_max; ++i)
- fz_free(ctx, info.footnotes[i]);
- fz_free(ctx, info.footnotes);
- fz_drop_output(ctx, info.out);
- fz_drop_archive(ctx, archive);
- fz_drop_stream(ctx, stream);
- }
- fz_catch(ctx)
- {
- fz_drop_buffer(ctx, buffer_out);
- fz_rethrow(ctx);
- }
- #ifdef DEBUG_OFFICE_TO_HTML
- {
- unsigned char *storage;
- size_t len = fz_buffer_storage(ctx, buffer_out, &storage);
- fz_write_printf(ctx, fz_stddbg(ctx), "fz_office_to_html: Output buffer, len=%zd:\n", len);
- fz_write_buffer(ctx, fz_stddbg(ctx), buffer_out);
- }
- #endif
- return buffer_out;
- }
- /* Office document handler */
- static fz_buffer *
- office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
- {
- fz_office_to_html_opts opts = { 0 };
- return fz_office_to_html(ctx, set, buf, zip, user_css, &opts);
- }
- static const fz_htdoc_format_t fz_htdoc_office =
- {
- "Office document",
- office_to_html,
- 0, 1, 0
- };
- static fz_document *
- office_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
- {
- return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_office);
- }
/* File extensions claimed by this handler. NULL terminated. */
static const char *office_extensions[] =
{
	"docx",	/* Word processing */
	"xlsx",	/* Spreadsheet */
	"pptx",	/* Presentation */
	"hwpx",	/* HWPX */
	NULL
};
/* MIME types claimed by this handler. NULL terminated. */
static const char *office_mimetypes[] =
{
	/* DOCX */
	"application/vnd.openxmlformats-officedocument.wordprocessingml.document",

	/* XLSX */
	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",

	/* PPTX */
	"application/vnd.openxmlformats-officedocument.presentationml.presentation",

	/* HWPX (two spellings) */
	"application/haansofthwpx",
	"application/vnd.hancom.hwpx",

	NULL
};
- /* We are only ever 75% sure here, to allow a 'better' handler, such as sodochandler
- * to override us by returning 100. */
- static int
- office_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *zip, void **state, fz_document_recognize_state_free_fn **free_state)
- {
- fz_archive *arch = NULL;
- int ret = 0;
- fz_xml *xml = NULL;
- if (state)
- *state = NULL;
- if (free_state)
- *free_state = NULL;
- fz_var(arch);
- fz_var(ret);
- fz_var(xml);
- fz_try(ctx)
- {
- if (stream)
- {
- arch = fz_try_open_archive_with_stream(ctx, stream);
- if (arch == NULL)
- break;
- }
- else
- arch = fz_keep_archive(ctx, zip);
- xml = fz_try_parse_xml_archive_entry(ctx, arch, "META-INF/container.xml", 0);
- if (xml)
- {
- if (fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml"))
- ret = 75; /* HWPX */
- break;
- }
- xml = fz_try_parse_xml_archive_entry(ctx, arch, "_rels/.rels", 0);
- if (xml)
- {
- if (fz_xml_find_dfs(xml, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"))
- {
- ret = 75; /* DOCX | PPTX | XLSX */
- }
- break;
- }
- }
- fz_always(ctx)
- {
- fz_drop_xml(ctx, xml);
- fz_drop_archive(ctx, arch);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- return ret;
- }
- fz_document_handler office_document_handler =
- {
- NULL,
- office_open_document,
- office_extensions,
- office_mimetypes,
- office_recognize_doc_content
- };
|