| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751 |
- // Copyright (C) 2004-2024 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include "html-imp.h"
- #include <string.h>
- #include <math.h>
/* Indices into the four-entry page_margin[] array: top, right, bottom, left. */
enum
{
	T = 0,
	R = 1,
	B = 2,
	L = 3
};
- typedef struct
- {
- fz_document super;
- fz_archive *zip;
- fz_html_font_set *set;
- fz_html *html;
- fz_outline *outline;
- const fz_htdoc_format_t *format;
- } html_document;
- typedef struct
- {
- fz_page super;
- html_document *doc;
- int number;
- } html_page;
- static void
- htdoc_drop_document(fz_context *ctx, fz_document *doc_)
- {
- html_document *doc = (html_document*)doc_;
- fz_drop_archive(ctx, doc->zip);
- fz_drop_html(ctx, doc->html);
- fz_drop_html_font_set(ctx, doc->set);
- fz_drop_outline(ctx, doc->outline);
- }
- static fz_link_dest
- htdoc_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest)
- {
- html_document *doc = (html_document*)doc_;
- const char *s = strchr(dest, '#');
- if (s && s[1] != 0)
- {
- float y = fz_find_html_target(ctx, doc->html, s+1);
- if (y >= 0)
- {
- int page = y / doc->html->page_h;
- return fz_make_link_dest_xyz(0, page, 0, y - page * doc->html->page_h, 0);
- }
- }
- return fz_make_link_dest_none();
- }
- static int
- htdoc_count_pages(fz_context *ctx, fz_document *doc_, int chapter)
- {
- html_document *doc = (html_document*)doc_;
- if (doc->html->tree.root->s.layout.b > 0)
- return ceilf(doc->html->tree.root->s.layout.b / doc->html->page_h);
- return 1;
- }
- static void
- htdoc_update_outline(fz_context *ctx, fz_document *doc, fz_outline *node)
- {
- while (node)
- {
- fz_link_dest dest = htdoc_resolve_link(ctx, doc, node->uri);
- node->page = dest.loc;
- node->x = dest.x;
- node->y = dest.y;
- htdoc_update_outline(ctx, doc, node->down);
- node = node->next;
- }
- }
- static void
- htdoc_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em)
- {
- html_document *doc = (html_document*)doc_;
- fz_layout_html(ctx, doc->html, w, h, em);
- htdoc_update_outline(ctx, doc_, doc->outline);
- }
- static void
- htdoc_drop_page(fz_context *ctx, fz_page *page_)
- {
- }
- static fz_rect
- htdoc_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box)
- {
- html_page *page = (html_page*)page_;
- html_document *doc = page->doc;
- fz_rect bbox;
- bbox.x0 = 0;
- bbox.y0 = 0;
- bbox.x1 = doc->html->page_w + doc->html->page_margin[L] + doc->html->page_margin[R];
- bbox.y1 = doc->html->page_h + doc->html->page_margin[T] + doc->html->page_margin[B];
- return bbox;
- }
- static void
- htdoc_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie)
- {
- html_page *page = (html_page*)page_;
- html_document *doc = page->doc;
- fz_draw_html(ctx, dev, ctm, doc->html, page->number);
- }
- static fz_link *
- htdoc_load_links(fz_context *ctx, fz_page *page_)
- {
- html_page *page = (html_page*)page_;
- html_document *doc = page->doc;
- return fz_load_html_links(ctx, doc->html, page->number, "");
- }
- static fz_bookmark
- htdoc_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc)
- {
- html_document *doc = (html_document*)doc_;
- return fz_make_html_bookmark(ctx, doc->html, loc.page);
- }
- static fz_location
- htdoc_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark)
- {
- html_document *doc = (html_document*)doc_;
- return fz_make_location(0, fz_lookup_html_bookmark(ctx, doc->html, mark));
- }
- static fz_page *
- htdoc_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number)
- {
- html_document *doc = (html_document*)doc_;
- html_page *page = fz_new_derived_page(ctx, html_page, doc_);
- page->super.bound_page = htdoc_bound_page;
- page->super.run_page_contents = htdoc_run_page;
- page->super.load_links = htdoc_load_links;
- page->super.drop_page = htdoc_drop_page;
- page->doc = doc;
- page->number = number;
- return (fz_page*)page;
- }
- static fz_outline *
- htdoc_load_outline(fz_context *ctx, fz_document *doc_)
- {
- html_document *doc = (html_document*)doc_;
- return fz_keep_outline(ctx, doc->outline);
- }
- static int
- htdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size)
- {
- html_document *doc = (html_document *)doc_;
- if (!strcmp(key, FZ_META_FORMAT))
- return 1 + (int)fz_strlcpy(buf, doc->format->format_name, size);
- if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title)
- return 1 + (int)fz_strlcpy(buf, doc->html->title, size);
- return -1;
- }
- static fz_html *
- generic_parse(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buffer_in, const char *user_css, const fz_htdoc_format_t *format)
- {
- fz_buffer *buffer_html = NULL;
- fz_html *html = NULL;
- fz_try(ctx)
- {
- if (format->convert_to_html)
- buffer_html = format->convert_to_html(ctx, set, buffer_in, zip, user_css);
- else
- buffer_html = fz_keep_buffer(ctx, buffer_in);
- html = fz_parse_html(ctx, set, zip, base_uri, buffer_html, user_css, format->try_xml, format->try_html5, format->patch_mobi);
- }
- fz_always(ctx)
- {
- fz_drop_buffer(ctx, buffer_html);
- }
- fz_catch(ctx)
- {
- fz_drop_html(ctx, html);
- fz_rethrow(ctx);
- }
- return html;
- }
- fz_document *
- fz_htdoc_open_document_with_buffer(fz_context *ctx, fz_archive *dir, fz_buffer *buf, const fz_htdoc_format_t *format)
- {
- html_document *doc = NULL;
- fz_var(doc);
- fz_var(dir);
- fz_try(ctx)
- {
- doc = fz_new_derived_document(ctx, html_document);
- doc->super.drop_document = htdoc_drop_document;
- doc->super.layout = htdoc_layout;
- doc->super.load_outline = htdoc_load_outline;
- doc->super.resolve_link_dest = htdoc_resolve_link;
- doc->super.make_bookmark = htdoc_make_bookmark;
- doc->super.lookup_bookmark = htdoc_lookup_bookmark;
- doc->super.count_pages = htdoc_count_pages;
- doc->super.load_page = htdoc_load_page;
- doc->super.lookup_metadata = htdoc_lookup_metadata;
- doc->super.is_reflowable = 1;
- doc->zip = fz_keep_archive(ctx, dir);
- doc->format = format;
- doc->set = fz_new_html_font_set(ctx);
- doc->html = generic_parse(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx), format);
- doc->outline = fz_load_html_outline(ctx, doc->html);
- }
- fz_always(ctx)
- fz_drop_buffer(ctx, buf);
- fz_catch(ctx)
- {
- fz_drop_document(ctx, &doc->super);
- fz_rethrow(ctx);
- }
- return (fz_document*)doc;
- }
- fz_document *
- fz_htdoc_open_document_with_stream_and_dir(fz_context *ctx, fz_stream *stm, fz_archive *dir, const fz_htdoc_format_t *format)
- {
- fz_buffer *buf = NULL;
- if (stm)
- buf = fz_read_all(ctx, stm, 0);
- return fz_htdoc_open_document_with_buffer(ctx, dir, buf, format);
- }
- /* Variant specific functions */
- /* Generic HTML document handler */
/* Return 1 for space, tab, line feed, form feed or carriage return; 0 otherwise. */
static int isws(int c)
{
	switch (c)
	{
	case 32: /* space */
	case 9:  /* horizontal tab */
	case 10: /* line feed */
	case 13: /* carriage return */
	case 12: /* form feed */
		return 1;
	default:
		return 0;
	}
}
- static int recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state, int xhtml)
- {
- uint8_t buffer[4096];
- size_t i, n, m;
- enum {
- state_top,
- state_open,
- state_pling,
- state_query,
- state_maybe_doctype,
- state_maybe_doctype_ws,
- state_maybe_doctype_html,
- state_maybe_doctype_html_xhtml,
- state_maybe_comment,
- state_maybe_html,
- state_maybe_html_xhtml,
- state_comment
- };
- int state = state_top;
- int type = 0;
- if (hstate)
- *hstate = NULL;
- if (free_state)
- *free_state = NULL;
- if (stream == NULL)
- return 0;
- /* Simple state machine. Search for "<!doctype html" or "<html" in the first
- * 4K of the file, allowing for comments and whitespace and case insensitivity. */
- n = fz_read(ctx, stream, buffer, sizeof(buffer));
- fz_seek(ctx, stream, 0, SEEK_SET);
- if (n == 0)
- return 0;
- i = 0;
- if (n >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
- {
- /* UTF-8 encoded BOM. Just skip it. */
- i = 3;
- }
- else if (n >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
- {
- /* UTF-16, big endian. */
- type = 1;
- i = 2;
- n &= ~1;
- }
- else if (n >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
- {
- /* UTF-16, little endian. */
- i = 2;
- type = 2;
- n &= ~1;
- }
- while (i < n)
- {
- int c;
- switch (type)
- {
- case 0: /* UTF-8 */
- c = buffer[i++];
- break;
- case 1: /* UTF-16 - big endian */
- c = buffer[i++] << 8;
- c |= buffer[i++];
- break;
- case 2: /* UTF-16 - little endian */
- c = buffer[i++];
- c |= buffer[i++] << 8;
- break;
- }
- switch (state)
- {
- case state_top:
- if (isws(c))
- continue; /* whitespace */
- if (c == '<')
- state = state_open;
- else
- return 0; /* Non whitespace found at the top level prior to a known tag. Fail. */
- break;
- case state_open:
- if (isws(c))
- continue; /* whitespace */
- if (c == '!')
- state = state_pling;
- else if (c == '?')
- state = state_query;
- else if (c == 'h' || c == 'H')
- state = state_maybe_html;
- else
- return 0; /* Not an acceptable opening tag. */
- m = 0;
- break;
- case state_query:
- if (c == '>')
- state = state_top;
- break;
- case state_pling:
- if (isws(c))
- continue; /* whitespace */
- else if (c == '-')
- state = state_maybe_comment;
- else if (c == 'd' || c == 'D')
- state = state_maybe_doctype;
- else
- return 0; /* Not an acceptable opening tag. */
- break;
- case state_maybe_comment:
- if (c == '-')
- state = state_comment;
- else
- return 0; /* Not an acceptable opening tag. */
- break;
- case state_comment:
- if (c == '-')
- {
- m++;
- }
- else if (c == '>' && m >= 2)
- {
- state = state_top;
- }
- else
- m = 0;
- break;
- case state_maybe_doctype:
- if (c == "octype"[m] || c == "OCTYPE"[m])
- {
- m++;
- if (m == 6)
- {
- state = state_maybe_doctype_ws;
- m = 0;
- }
- }
- else
- return 0; /* Not an acceptable opening tag. */
- break;
- case state_maybe_doctype_ws:
- if (isws(c))
- m++;
- else if (m > 0 && (c == 'h' || c == 'H'))
- {
- state = state_maybe_doctype_html;
- m = 0;
- }
- else
- return 0; /* Not an acceptable opening tag. */
- break;
- case state_maybe_doctype_html:
- if (c == "tml"[m] || c == "TML"[m])
- {
- m++;
- if (m == 3)
- {
- state = state_maybe_doctype_html_xhtml;
- m = 0;
- }
- }
- else
- return 0; /* Not an acceptable opening tag. */
- break;
- case state_maybe_doctype_html_xhtml:
- if (c == '>')
- {
- /* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
- * but we'd rather the html one did (75). */
- return xhtml ? 25 : 75;
- }
- if (c >= 'A' && c <= 'Z')
- c += 'a'-'A';
- if (c == "xhtml"[m])
- {
- m++;
- if (m == 5)
- {
- /* xhtml - the xhtml agent would be better (75) than the html
- * agent (25). */
- return xhtml ? 75 : 25;
- }
- }
- else
- m = 0;
- break;
- case state_maybe_html:
- if (c == "tml"[m] || c == "TML"[m])
- {
- m++;
- if (m == 3)
- {
- state = state_maybe_html_xhtml;
- m = 0;
- }
- }
- else
- return 0; /* Not an acceptable opening tag. */
- break;
- case state_maybe_html_xhtml:
- if (c == '>')
- {
- /* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
- * but we'd rather the html one did (75). */
- return xhtml ? 25 : 75;
- }
- if (c >= 'A' && c <= 'Z')
- c += 'a'-'A';
- if (c == "xhtml"[m])
- {
- m++;
- if (m == 5)
- {
- /* xhtml - the xhtml agent would be better (75) than the html
- * agent (25). */
- return xhtml ? 75 : 25;
- }
- }
- else
- m = 0;
- break;
- }
- }
- return 0;
- }
- int htdoc_recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
- {
- return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 0);
- }
- static const fz_htdoc_format_t fz_htdoc_html5 =
- {
- "HTML5",
- NULL,
- 0, 1, 0
- };
- static fz_document *
- htdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
- {
- return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_html5);
- }
/* NULL-terminated list of file extensions for the HTML handler. */
static const char *htdoc_extensions[] = { "htm", "html", NULL };

/* NULL-terminated list of MIME types for the HTML handler. */
static const char *htdoc_mimetypes[] = { "text/html", NULL };
- fz_document_handler html_document_handler =
- {
- NULL,
- htdoc_open_document,
- htdoc_extensions,
- htdoc_mimetypes,
- htdoc_recognize_html_content,
- 1
- };
- /* XHTML document handler */
- static const fz_htdoc_format_t fz_htdoc_xhtml =
- {
- "XHTML",
- NULL,
- 1, 1, 0
- };
- static fz_document *
- xhtdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
- {
- return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_xhtml);
- }
- int xhtdoc_recognize_xhtml_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
- {
- return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 1);
- }
/* NULL-terminated list of file extensions for the XHTML handler. */
static const char *xhtdoc_extensions[] = { "xhtml", NULL };

/* NULL-terminated list of MIME types for the XHTML handler. */
static const char *xhtdoc_mimetypes[] = { "application/xhtml+xml", NULL };
- fz_document_handler xhtml_document_handler =
- {
- NULL,
- xhtdoc_open_document,
- xhtdoc_extensions,
- xhtdoc_mimetypes,
- xhtdoc_recognize_xhtml_content,
- 1
- };
- /* FB2 document handler */
- static const fz_htdoc_format_t fz_htdoc_fb2 =
- {
- "FictionBook2",
- NULL,
- 1, 0, 0
- };
- static fz_document *
- fb2doc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
- {
- return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_fb2);
- }
- static int
- fb2doc_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
- {
- const char *match = "<FictionBook";
- int pos = 0;
- int n = 4096;
- int c;
- if (state)
- *state = NULL;
- if (free_state)
- *free_state = NULL;
- if (stream == NULL)
- return 0;
- do
- {
- c = fz_read_byte(ctx, stream);
- if (c == EOF)
- return 0;
- if (c == match[pos])
- {
- pos++;
- if (pos == 12)
- return 100;
- }
- else
- {
- /* Restart matching, but recheck c against the start. */
- pos = (c == match[0]);
- }
- }
- while (--n > 0);
- return 0;
- }
/* NULL-terminated list of file extensions for the FB2 handler. */
static const char *fb2doc_extensions[] = { "fb2", "xml", NULL };

/* NULL-terminated list of MIME types for the FB2 handler. */
static const char *fb2doc_mimetypes[] =
{
	"application/x-fictionbook",
	"application/xml",
	"text/xml",
	NULL
};
- fz_document_handler fb2_document_handler =
- {
- NULL,
- fb2doc_open_document,
- fb2doc_extensions,
- fb2doc_mimetypes,
- fb2doc_recognize_content
- };
- /* Mobi document handler */
- static const fz_htdoc_format_t fz_htdoc_mobi =
- {
- "MOBI",
- NULL,
- 1, 1, 1
- };
- static fz_document *
- mobi_open_document_with_buffer(fz_context *ctx, fz_buffer *mobi)
- {
- fz_archive *dir = NULL;
- fz_buffer *html;
- fz_document *doc;
- fz_var(dir);
- fz_try(ctx)
- {
- dir = fz_extract_html_from_mobi(ctx, mobi);
- html = fz_read_archive_entry(ctx, dir, "index.html");
- doc = fz_htdoc_open_document_with_buffer(ctx, dir, html, &fz_htdoc_mobi);
- }
- fz_always(ctx)
- {
- fz_drop_buffer(ctx, mobi);
- fz_drop_archive(ctx, dir);
- }
- fz_catch(ctx)
- {
- fz_rethrow(ctx);
- }
- return doc;
- }
- static int
- mobi_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
- {
- char text[8];
- if (state)
- *state = NULL;
- if (free_state)
- *free_state = NULL;
- if (stream == NULL)
- return 0;
- fz_seek(ctx, stream, 32 + 28, SEEK_SET);
- if (fz_read(ctx, stream, (unsigned char *)text, 8) != 8)
- return 0;
- if (memcmp(text, "BOOKMOBI", 8) == 0)
- return 100;
- if (memcmp(text, "TEXtREAd", 8) == 0)
- return 100;
- return 0;
- }
- static fz_document *
- mobi_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
- {
- return mobi_open_document_with_buffer(ctx, fz_read_all(ctx, file, 0));
- }
/* NULL-terminated list of file extensions for the MOBI handler. */
static const char *mobi_extensions[] = { "mobi", "prc", "pdb", NULL };

/* NULL-terminated list of MIME types for the MOBI handler. */
static const char *mobi_mimetypes[] = { "application/x-mobipocket-ebook", NULL };
- fz_document_handler mobi_document_handler =
- {
- NULL,
- mobi_open_document,
- mobi_extensions,
- mobi_mimetypes,
- mobi_recognize_content
- };
|