| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347 |
- // Copyright (C) 2004-2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include "html-imp.h"
- #include <string.h>
- #define FORMAT_HTML 1
- #define FORMAT_TEXT 2
- #define COMPRESSION_NONE 1
- #define COMPRESSION_PALMDOC 2
- #define COMPRESSION_HUFF_CDIC 17480
- #define TEXT_ENCODING_LATIN_1 0
- #define TEXT_ENCODING_1252 1252
- #define TEXT_ENCODING_UTF8 65001
- static void
- skip_bytes(fz_context *ctx, fz_stream *stm, size_t len)
- {
- size_t skipped = fz_skip(ctx, stm, len);
- if (skipped < len)
- fz_throw(ctx, FZ_ERROR_FORMAT, "premature end in data");
- }
- static void
- mobi_read_text_none(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
- {
- unsigned char buf[4096];
- size_t n;
- if (size > 4096)
- fz_throw(ctx, FZ_ERROR_FORMAT, "text block too large");
- n = fz_read(ctx, stm, buf, size);
- if (n < size)
- fz_warn(ctx, "premature end in mobi uncompressed text data");
- fz_append_data(ctx, out, buf, n);
- }
- static void
- mobi_read_text_palmdoc(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
- {
- // https://wiki.mobileread.com/wiki/PalmDOC
- size_t end = out->len + size;
- while (out->len < end)
- {
- int c = fz_read_byte(ctx, stm);
- if (c == EOF)
- break;
- if (c >= 0x01 && c <= 0x08)
- {
- unsigned char buf[8];
- size_t n = fz_read(ctx, stm, buf, c);
- fz_append_data(ctx, out, buf, n);
- if (n < (size_t) c)
- break;
- }
- else if (c <= 0x7f)
- {
- fz_append_byte(ctx, out, c);
- }
- else if (c >= 0x80 && c <= 0xbf)
- {
- int cc, x, distance, length;
- cc = fz_read_byte(ctx, stm);
- if (cc == EOF)
- break;
- x = (c << 8) | cc;
- distance = (x >> 3) & 0x7ff;
- length = (x & 7) + 3;
- if (distance > 0 && (size_t)distance <= out->len)
- {
- int i;
- int p = (int)(out->len - distance);
- for (i = 0; i < length; ++i)
- fz_append_byte(ctx, out, out->data[p + i]);
- }
- }
- else if (c >= 0xc0 && c <= 0xff)
- {
- fz_append_byte(ctx, out, ' ');
- fz_append_byte(ctx, out, c ^ 0x80);
- }
- }
- if (out->len < end)
- fz_warn(ctx, "premature end in mobi palmdoc data");
- }
- static uint32_t
- mobi_read_data(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t *offset, uint32_t total_count, int format)
- {
- // https://wiki.mobileread.com/wiki/MOBI
- uint32_t compression, text_length, record_count, text_encoding, i;
- unsigned char buf[4];
- fz_range range = { 0 };
- fz_stream *rec = NULL;
- size_t n;
- fz_var(rec);
- fz_try(ctx)
- {
- range.offset = offset[0];
- range.length = offset[1] - offset[0];
- rec = fz_open_range_filter(ctx, stm, &range, 1);
- // PalmDOC header
- compression = fz_read_uint16(ctx, rec);
- skip_bytes(ctx, rec, 2);
- text_length = fz_read_uint32(ctx, rec);
- record_count = fz_read_uint16(ctx, rec);
- skip_bytes(ctx, rec, 2);
- skip_bytes(ctx, rec, 2); // encryption
- skip_bytes(ctx, rec, 2);
- // Optional MOBI header
- text_encoding = TEXT_ENCODING_LATIN_1;
- n = fz_read(ctx, rec, buf, 4);
- if (n == 4 && !memcmp(buf, "MOBI", 4))
- {
- skip_bytes(ctx, rec, 4);
- skip_bytes(ctx, rec, 4);
- text_encoding = fz_read_uint32(ctx, rec);
- }
- }
- fz_always(ctx)
- fz_drop_stream(ctx, rec);
- fz_catch(ctx)
- fz_rethrow(ctx);
- if (compression != COMPRESSION_NONE && compression != COMPRESSION_PALMDOC)
- fz_throw(ctx, FZ_ERROR_FORMAT, "unknown compression method");
- if (text_encoding != TEXT_ENCODING_LATIN_1 &&
- text_encoding != TEXT_ENCODING_1252 &&
- text_encoding != TEXT_ENCODING_UTF8)
- fz_throw(ctx, FZ_ERROR_FORMAT, "unknown text encoding");
- for (i = 1; i <= record_count && i < total_count; ++i)
- {
- uint32_t remain = text_length - (uint32_t)out->len;
- uint32_t size = remain < 4096 ? remain : 4096;
- fz_try(ctx)
- {
- range.offset = offset[i];
- range.length = offset[i + 1] - offset[i];
- rec = fz_open_range_filter(ctx, stm, &range, 1);
- if (compression == COMPRESSION_NONE)
- mobi_read_text_none(ctx, out, rec, size);
- else
- mobi_read_text_palmdoc(ctx, out, rec, size);
- }
- fz_always(ctx)
- fz_drop_stream(ctx, rec);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- if (format == FORMAT_TEXT && out->len > 6)
- {
- if (!memcmp(out->data, "<html>", 6) || !memcmp(out->data, "<HTML>", 6))
- format = FORMAT_HTML;
- }
- if (text_encoding != TEXT_ENCODING_UTF8 || format == FORMAT_TEXT)
- {
- unsigned char *p;
- size_t j, z = fz_buffer_extract(ctx, out, &p);
- fz_resize_buffer(ctx, out, 0);
- if (format == FORMAT_TEXT)
- fz_append_string(ctx, out, "<html><head><style>body{white-space:pre-wrap}</style></head><body>");
- for (j = 0; j < z; ++j)
- {
- int c = p[j];
- if (format == FORMAT_TEXT && (c == '<' || c == '>' || c == '&'))
- {
- if (c == '<')
- fz_append_string(ctx, out, "<");
- else if (c == '>')
- fz_append_string(ctx, out, ">");
- else if (c == '&')
- fz_append_string(ctx, out, "&");
- }
- else
- {
- switch (text_encoding)
- {
- case TEXT_ENCODING_UTF8:
- fz_append_byte(ctx, out, c);
- break;
- case TEXT_ENCODING_LATIN_1:
- fz_append_rune(ctx, out, c);
- break;
- case TEXT_ENCODING_1252:
- fz_append_rune(ctx, out, fz_unicode_from_windows_1252[c]);
- break;
- }
- }
- }
- if (format == FORMAT_TEXT)
- fz_append_string(ctx, out, "</body></html>");
- fz_free(ctx, p);
- }
- return record_count;
- }
- static void drop_tree_entry(fz_context *ctx, void *ent)
- {
- fz_drop_buffer(ctx, ent);
- }
- fz_archive *
- fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi)
- {
- fz_stream *stm = NULL;
- fz_buffer *buffer = NULL;
- fz_tree *tree = NULL;
- uint32_t *offsets = NULL;
- char buf[32];
- uint32_t i, k, extra;
- uint32_t recindex;
- uint32_t minoffset, maxoffset;
- int format = FORMAT_TEXT;
- size_t n;
- // https://wiki.mobileread.com/wiki/PalmDOC
- fz_var(stm);
- fz_var(buffer);
- fz_var(offsets);
- fz_var(tree);
- fz_try(ctx)
- {
- stm = fz_open_buffer(ctx, mobi);
- skip_bytes(ctx, stm, 32); // database name
- skip_bytes(ctx, stm, 28); // database attributes, version, dates, etc
- n = fz_read(ctx, stm, (unsigned char *)buf, 8); // database type and creator
- buf[8] = 0;
- if (n == 8 && !memcmp(buf, "BOOKMOBI", 8))
- format = FORMAT_HTML;
- else if (n == 8 && !memcmp(buf, "TEXtREAd", 8))
- format = FORMAT_TEXT;
- else if (n != 8)
- fz_warn(ctx, "premature end in data");
- else
- fz_warn(ctx, "Unknown MOBI/PRC format: %s.", buf);
- skip_bytes(ctx, stm, 8); // database internal fields
- // record info list count
- n = fz_read_uint16(ctx, stm);
- minoffset = (uint32_t)(fz_tell(ctx, stm) + n * 2 * sizeof (uint32_t) - 1);
- maxoffset = (uint32_t)mobi->len;
- // record info list
- offsets = fz_malloc_array(ctx, n + 1, uint32_t);
- for (i = 0, k = 0; i < n; ++i)
- {
- uint32_t offset = fz_read_uint32(ctx, stm);
- if (offset <= minoffset)
- continue;
- if (offset >= maxoffset)
- continue;
- minoffset = offsets[k++] = offset;
- skip_bytes(ctx, stm, 4);
- }
- offsets[k] = (uint32_t)mobi->len;
- // adjust n in case some out of bound offsets were skipped
- n = k;
- if (n == 0)
- fz_throw(ctx, FZ_ERROR_FORMAT, "no mobi records to read");
- // decompress text data
- buffer = fz_new_buffer(ctx, 128 << 10);
- extra = mobi_read_data(ctx, buffer, stm, offsets, (uint32_t)n, format);
- fz_terminate_buffer(ctx, buffer);
- #ifndef NDEBUG
- if (fz_atoi(getenv("FZ_DEBUG_MOBI")))
- fz_save_buffer(ctx, buffer, "mobi.xhtml");
- #endif
- tree = fz_tree_insert(ctx, tree, "index.html", buffer);
- buffer = NULL;
- // copy image data records into tree
- recindex = 1;
- for (i = extra; i < n; ++i)
- {
- uint32_t size = offsets[i+1] - offsets[i];
- if (size > 8)
- {
- unsigned char *data = mobi->data + offsets[i];
- if (fz_recognize_image_format(ctx, data))
- {
- buffer = fz_new_buffer_from_copied_data(ctx, data, size);
- fz_snprintf(buf, sizeof buf, "%05d", recindex);
- tree = fz_tree_insert(ctx, tree, buf, buffer);
- buffer = NULL;
- recindex++;
- }
- }
- }
- }
- fz_always(ctx)
- {
- fz_drop_stream(ctx, stm);
- fz_free(ctx, offsets);
- }
- fz_catch(ctx)
- {
- fz_drop_buffer(ctx, buffer);
- fz_drop_tree(ctx, tree, drop_tree_entry);
- fz_rethrow(ctx);
- }
- return fz_new_tree_archive(ctx, tree);
- }
|