mobi.c 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "html-imp.h"
  24. #include <string.h>
  25. #define FORMAT_HTML 1
  26. #define FORMAT_TEXT 2
  27. #define COMPRESSION_NONE 1
  28. #define COMPRESSION_PALMDOC 2
  29. #define COMPRESSION_HUFF_CDIC 17480
  30. #define TEXT_ENCODING_LATIN_1 0
  31. #define TEXT_ENCODING_1252 1252
  32. #define TEXT_ENCODING_UTF8 65001
  33. static void
  34. skip_bytes(fz_context *ctx, fz_stream *stm, size_t len)
  35. {
  36. size_t skipped = fz_skip(ctx, stm, len);
  37. if (skipped < len)
  38. fz_throw(ctx, FZ_ERROR_FORMAT, "premature end in data");
  39. }
  40. static void
  41. mobi_read_text_none(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
  42. {
  43. unsigned char buf[4096];
  44. size_t n;
  45. if (size > 4096)
  46. fz_throw(ctx, FZ_ERROR_FORMAT, "text block too large");
  47. n = fz_read(ctx, stm, buf, size);
  48. if (n < size)
  49. fz_warn(ctx, "premature end in mobi uncompressed text data");
  50. fz_append_data(ctx, out, buf, n);
  51. }
  52. static void
  53. mobi_read_text_palmdoc(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
  54. {
  55. // https://wiki.mobileread.com/wiki/PalmDOC
  56. size_t end = out->len + size;
  57. while (out->len < end)
  58. {
  59. int c = fz_read_byte(ctx, stm);
  60. if (c == EOF)
  61. break;
  62. if (c >= 0x01 && c <= 0x08)
  63. {
  64. unsigned char buf[8];
  65. size_t n = fz_read(ctx, stm, buf, c);
  66. fz_append_data(ctx, out, buf, n);
  67. if (n < (size_t) c)
  68. break;
  69. }
  70. else if (c <= 0x7f)
  71. {
  72. fz_append_byte(ctx, out, c);
  73. }
  74. else if (c >= 0x80 && c <= 0xbf)
  75. {
  76. int cc, x, distance, length;
  77. cc = fz_read_byte(ctx, stm);
  78. if (cc == EOF)
  79. break;
  80. x = (c << 8) | cc;
  81. distance = (x >> 3) & 0x7ff;
  82. length = (x & 7) + 3;
  83. if (distance > 0 && (size_t)distance <= out->len)
  84. {
  85. int i;
  86. int p = (int)(out->len - distance);
  87. for (i = 0; i < length; ++i)
  88. fz_append_byte(ctx, out, out->data[p + i]);
  89. }
  90. }
  91. else if (c >= 0xc0 && c <= 0xff)
  92. {
  93. fz_append_byte(ctx, out, ' ');
  94. fz_append_byte(ctx, out, c ^ 0x80);
  95. }
  96. }
  97. if (out->len < end)
  98. fz_warn(ctx, "premature end in mobi palmdoc data");
  99. }
  100. static uint32_t
  101. mobi_read_data(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t *offset, uint32_t total_count, int format)
  102. {
  103. // https://wiki.mobileread.com/wiki/MOBI
  104. uint32_t compression, text_length, record_count, text_encoding, i;
  105. unsigned char buf[4];
  106. fz_range range = { 0 };
  107. fz_stream *rec = NULL;
  108. size_t n;
  109. fz_var(rec);
  110. fz_try(ctx)
  111. {
  112. range.offset = offset[0];
  113. range.length = offset[1] - offset[0];
  114. rec = fz_open_range_filter(ctx, stm, &range, 1);
  115. // PalmDOC header
  116. compression = fz_read_uint16(ctx, rec);
  117. skip_bytes(ctx, rec, 2);
  118. text_length = fz_read_uint32(ctx, rec);
  119. record_count = fz_read_uint16(ctx, rec);
  120. skip_bytes(ctx, rec, 2);
  121. skip_bytes(ctx, rec, 2); // encryption
  122. skip_bytes(ctx, rec, 2);
  123. // Optional MOBI header
  124. text_encoding = TEXT_ENCODING_LATIN_1;
  125. n = fz_read(ctx, rec, buf, 4);
  126. if (n == 4 && !memcmp(buf, "MOBI", 4))
  127. {
  128. skip_bytes(ctx, rec, 4);
  129. skip_bytes(ctx, rec, 4);
  130. text_encoding = fz_read_uint32(ctx, rec);
  131. }
  132. }
  133. fz_always(ctx)
  134. fz_drop_stream(ctx, rec);
  135. fz_catch(ctx)
  136. fz_rethrow(ctx);
  137. if (compression != COMPRESSION_NONE && compression != COMPRESSION_PALMDOC)
  138. fz_throw(ctx, FZ_ERROR_FORMAT, "unknown compression method");
  139. if (text_encoding != TEXT_ENCODING_LATIN_1 &&
  140. text_encoding != TEXT_ENCODING_1252 &&
  141. text_encoding != TEXT_ENCODING_UTF8)
  142. fz_throw(ctx, FZ_ERROR_FORMAT, "unknown text encoding");
  143. for (i = 1; i <= record_count && i < total_count; ++i)
  144. {
  145. uint32_t remain = text_length - (uint32_t)out->len;
  146. uint32_t size = remain < 4096 ? remain : 4096;
  147. fz_try(ctx)
  148. {
  149. range.offset = offset[i];
  150. range.length = offset[i + 1] - offset[i];
  151. rec = fz_open_range_filter(ctx, stm, &range, 1);
  152. if (compression == COMPRESSION_NONE)
  153. mobi_read_text_none(ctx, out, rec, size);
  154. else
  155. mobi_read_text_palmdoc(ctx, out, rec, size);
  156. }
  157. fz_always(ctx)
  158. fz_drop_stream(ctx, rec);
  159. fz_catch(ctx)
  160. fz_rethrow(ctx);
  161. }
  162. if (format == FORMAT_TEXT && out->len > 6)
  163. {
  164. if (!memcmp(out->data, "<html>", 6) || !memcmp(out->data, "<HTML>", 6))
  165. format = FORMAT_HTML;
  166. }
  167. if (text_encoding != TEXT_ENCODING_UTF8 || format == FORMAT_TEXT)
  168. {
  169. unsigned char *p;
  170. size_t j, z = fz_buffer_extract(ctx, out, &p);
  171. fz_resize_buffer(ctx, out, 0);
  172. if (format == FORMAT_TEXT)
  173. fz_append_string(ctx, out, "<html><head><style>body{white-space:pre-wrap}</style></head><body>");
  174. for (j = 0; j < z; ++j)
  175. {
  176. int c = p[j];
  177. if (format == FORMAT_TEXT && (c == '<' || c == '>' || c == '&'))
  178. {
  179. if (c == '<')
  180. fz_append_string(ctx, out, "&lt;");
  181. else if (c == '>')
  182. fz_append_string(ctx, out, "&gt;");
  183. else if (c == '&')
  184. fz_append_string(ctx, out, "&amp;");
  185. }
  186. else
  187. {
  188. switch (text_encoding)
  189. {
  190. case TEXT_ENCODING_UTF8:
  191. fz_append_byte(ctx, out, c);
  192. break;
  193. case TEXT_ENCODING_LATIN_1:
  194. fz_append_rune(ctx, out, c);
  195. break;
  196. case TEXT_ENCODING_1252:
  197. fz_append_rune(ctx, out, fz_unicode_from_windows_1252[c]);
  198. break;
  199. }
  200. }
  201. }
  202. if (format == FORMAT_TEXT)
  203. fz_append_string(ctx, out, "</body></html>");
  204. fz_free(ctx, p);
  205. }
  206. return record_count;
  207. }
  208. static void drop_tree_entry(fz_context *ctx, void *ent)
  209. {
  210. fz_drop_buffer(ctx, ent);
  211. }
  212. fz_archive *
  213. fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi)
  214. {
  215. fz_stream *stm = NULL;
  216. fz_buffer *buffer = NULL;
  217. fz_tree *tree = NULL;
  218. uint32_t *offsets = NULL;
  219. char buf[32];
  220. uint32_t i, k, extra;
  221. uint32_t recindex;
  222. uint32_t minoffset, maxoffset;
  223. int format = FORMAT_TEXT;
  224. size_t n;
  225. // https://wiki.mobileread.com/wiki/PalmDOC
  226. fz_var(stm);
  227. fz_var(buffer);
  228. fz_var(offsets);
  229. fz_var(tree);
  230. fz_try(ctx)
  231. {
  232. stm = fz_open_buffer(ctx, mobi);
  233. skip_bytes(ctx, stm, 32); // database name
  234. skip_bytes(ctx, stm, 28); // database attributes, version, dates, etc
  235. n = fz_read(ctx, stm, (unsigned char *)buf, 8); // database type and creator
  236. buf[8] = 0;
  237. if (n == 8 && !memcmp(buf, "BOOKMOBI", 8))
  238. format = FORMAT_HTML;
  239. else if (n == 8 && !memcmp(buf, "TEXtREAd", 8))
  240. format = FORMAT_TEXT;
  241. else if (n != 8)
  242. fz_warn(ctx, "premature end in data");
  243. else
  244. fz_warn(ctx, "Unknown MOBI/PRC format: %s.", buf);
  245. skip_bytes(ctx, stm, 8); // database internal fields
  246. // record info list count
  247. n = fz_read_uint16(ctx, stm);
  248. minoffset = (uint32_t)(fz_tell(ctx, stm) + n * 2 * sizeof (uint32_t) - 1);
  249. maxoffset = (uint32_t)mobi->len;
  250. // record info list
  251. offsets = fz_malloc_array(ctx, n + 1, uint32_t);
  252. for (i = 0, k = 0; i < n; ++i)
  253. {
  254. uint32_t offset = fz_read_uint32(ctx, stm);
  255. if (offset <= minoffset)
  256. continue;
  257. if (offset >= maxoffset)
  258. continue;
  259. minoffset = offsets[k++] = offset;
  260. skip_bytes(ctx, stm, 4);
  261. }
  262. offsets[k] = (uint32_t)mobi->len;
  263. // adjust n in case some out of bound offsets were skipped
  264. n = k;
  265. if (n == 0)
  266. fz_throw(ctx, FZ_ERROR_FORMAT, "no mobi records to read");
  267. // decompress text data
  268. buffer = fz_new_buffer(ctx, 128 << 10);
  269. extra = mobi_read_data(ctx, buffer, stm, offsets, (uint32_t)n, format);
  270. fz_terminate_buffer(ctx, buffer);
  271. #ifndef NDEBUG
  272. if (fz_atoi(getenv("FZ_DEBUG_MOBI")))
  273. fz_save_buffer(ctx, buffer, "mobi.xhtml");
  274. #endif
  275. tree = fz_tree_insert(ctx, tree, "index.html", buffer);
  276. buffer = NULL;
  277. // copy image data records into tree
  278. recindex = 1;
  279. for (i = extra; i < n; ++i)
  280. {
  281. uint32_t size = offsets[i+1] - offsets[i];
  282. if (size > 8)
  283. {
  284. unsigned char *data = mobi->data + offsets[i];
  285. if (fz_recognize_image_format(ctx, data))
  286. {
  287. buffer = fz_new_buffer_from_copied_data(ctx, data, size);
  288. fz_snprintf(buf, sizeof buf, "%05d", recindex);
  289. tree = fz_tree_insert(ctx, tree, buf, buffer);
  290. buffer = NULL;
  291. recindex++;
  292. }
  293. }
  294. }
  295. }
  296. fz_always(ctx)
  297. {
  298. fz_drop_stream(ctx, stm);
  299. fz_free(ctx, offsets);
  300. }
  301. fz_catch(ctx)
  302. {
  303. fz_drop_buffer(ctx, buffer);
  304. fz_drop_tree(ctx, tree, drop_tree_entry);
  305. fz_rethrow(ctx);
  306. }
  307. return fz_new_tree_archive(ctx, tree);
  308. }