txt.c 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. // Copyright (C) 2023-2024 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "mupdf/html.h"
  24. enum { ENCODING_ASCII, ENCODING_UTF8, ENCODING_UTF8_BOM, ENCODING_UTF16_LE, ENCODING_UTF16_BE };
  25. static int
  26. detect_txt_encoding(fz_context *ctx, fz_buffer *buf)
  27. {
  28. const uint8_t *d = buf->data;
  29. size_t len = buf->len;
  30. const uint8_t *end = buf->data + len;
  31. int count_tabs = 0;
  32. int count_hi = 0;
  33. int count_controls = 0;
  34. int plausibly_utf8 = 1;
  35. /* If we find a BOM, believe it. */
  36. if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xBF)
  37. return ENCODING_UTF8_BOM;
  38. else if (len >= 2 && d[0] == 0xff && d[1] == 0xfe)
  39. return ENCODING_UTF16_LE;
  40. else if (len >= 2 && d[0] == 0xfe && d[1] == 0xff)
  41. return ENCODING_UTF16_BE;
  42. while (d < end)
  43. {
  44. uint8_t c = *d++;
  45. if (c == 9)
  46. count_tabs++;
  47. else if (c == 12)
  48. {
  49. /* Form feed. Ignore that. */
  50. }
  51. else if (c == 10)
  52. {
  53. if (d < end && d[0] == 13)
  54. d++;
  55. }
  56. else if (c == 13)
  57. {
  58. if (d < end && d[0] == 10)
  59. d++;
  60. }
  61. else if (c < 32 || c == 0x7f)
  62. count_controls++;
  63. else if (c < 0x7f)
  64. {
  65. /* Reasonable ASCII value */
  66. }
  67. else
  68. {
  69. count_hi++;
  70. if ((c & 0xf8) == 0xF0)
  71. {
  72. /* Could be UTF8 with 3 following bytes */
  73. if (d+2 >= end ||
  74. (d[0] & 0xC0) != 0x80 ||
  75. (d[1] & 0xC0) != 0x80 ||
  76. (d[2] & 0xC0) != 0x80)
  77. plausibly_utf8 = 0;
  78. else
  79. d += 3;
  80. }
  81. else if ((c & 0xf0) == 0xE0)
  82. {
  83. /* Could be UTF8 with 2 following bytes */
  84. if (d+1 >= end ||
  85. (d[0] & 0xC0) != 0x80 ||
  86. (d[1] & 0xC0) != 0x80)
  87. plausibly_utf8 = 0;
  88. else
  89. d += 2;
  90. }
  91. else if ((c & 0xE0) == 0xC0)
  92. {
  93. /* Could be UTF8 with 1 following bytes */
  94. if (d+1 >= end ||
  95. (d[0] & 0xC0) != 0x80)
  96. plausibly_utf8 = 0;
  97. else
  98. d++;
  99. }
  100. else
  101. plausibly_utf8 = 0;
  102. }
  103. }
  104. (void)count_tabs;
  105. (void)count_hi;
  106. (void)count_controls;
  107. if (plausibly_utf8)
  108. return ENCODING_UTF8;
  109. return ENCODING_ASCII;
  110. }
  111. fz_buffer *
  112. fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in)
  113. {
  114. int encoding = detect_txt_encoding(ctx, in);
  115. fz_stream *stream = fz_open_buffer(ctx, in);
  116. fz_buffer *outbuf = NULL;
  117. fz_output *out = NULL;
  118. int col = 0;
  119. fz_var(outbuf);
  120. fz_var(out);
  121. fz_try(ctx)
  122. {
  123. outbuf = fz_new_buffer(ctx, 1024);
  124. out = fz_new_output_with_buffer(ctx, outbuf);
  125. fz_write_string(ctx, out, "<!doctype html><style>body{margin:0}pre{page-break-before:always;margin:0;white-space:pre-wrap;}</style><pre>");
  126. if (encoding == ENCODING_UTF16_LE || encoding == ENCODING_UTF16_BE)
  127. {
  128. fz_read_byte(ctx, stream);
  129. fz_read_byte(ctx, stream);
  130. }
  131. else if (encoding == ENCODING_UTF8_BOM)
  132. {
  133. fz_read_byte(ctx, stream);
  134. fz_read_byte(ctx, stream);
  135. fz_read_byte(ctx, stream);
  136. }
  137. while (!fz_is_eof(ctx, stream))
  138. {
  139. int c;
  140. switch (encoding)
  141. {
  142. default:
  143. case ENCODING_ASCII:
  144. c = fz_read_byte(ctx, stream);
  145. break;
  146. case ENCODING_UTF8:
  147. case ENCODING_UTF8_BOM:
  148. c = fz_read_rune(ctx, stream);
  149. break;
  150. case ENCODING_UTF16_LE:
  151. c = fz_read_utf16_le(ctx, stream);
  152. break;
  153. case ENCODING_UTF16_BE:
  154. c = fz_read_utf16_be(ctx, stream);
  155. }
  156. if (c == 10 || c == 13)
  157. {
  158. col = -1;
  159. fz_write_byte(ctx, out, c);
  160. }
  161. else if (c == 9)
  162. {
  163. int n = (8 - col) & 7;
  164. if (n == 0)
  165. n = 8;
  166. col += n-1;
  167. while (n--)
  168. fz_write_byte(ctx, out, ' ');
  169. }
  170. else if (c == 12)
  171. {
  172. col = -1;
  173. fz_write_string(ctx, out, "</pre><pre>\n");
  174. }
  175. else if (c == '<')
  176. fz_write_string(ctx, out, "&lt;");
  177. else if (c == '>')
  178. fz_write_string(ctx, out, "&gt;");
  179. else if (c == '"')
  180. fz_write_string(ctx, out, "&quot;");
  181. else
  182. fz_write_rune(ctx, out, c);
  183. ++col;
  184. }
  185. fz_close_output(ctx, out);
  186. }
  187. fz_always(ctx)
  188. {
  189. fz_drop_stream(ctx, stream);
  190. fz_drop_output(ctx, out);
  191. }
  192. fz_catch(ctx)
  193. {
  194. fz_drop_buffer(ctx, outbuf);
  195. fz_rethrow(ctx);
  196. }
  197. return outbuf;
  198. }
  199. static fz_buffer *
  200. txt_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
  201. {
  202. return fz_txt_buffer_to_html(ctx, buf);
  203. }
  204. static const fz_htdoc_format_t fz_htdoc_txt =
  205. {
  206. "Text",
  207. txt_to_html,
  208. 0, 1, 0
  209. };
  210. static fz_document *
  211. txt_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
  212. {
  213. return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_txt);
  214. }
  215. static const char *txt_extensions[] =
  216. {
  217. "txt",
  218. "text",
  219. "log",
  220. NULL
  221. };
  222. static const char *txt_mimetypes[] =
  223. {
  224. "text.plain",
  225. NULL
  226. };
  227. fz_document_handler txt_document_handler =
  228. {
  229. NULL,
  230. txt_open_document,
  231. txt_extensions,
  232. txt_mimetypes
  233. };