| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256 |
- // Copyright (C) 2023-2024 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include "mupdf/html.h"
/*
	Encodings that detect_txt_encoding() can report, and that
	fz_txt_buffer_to_html() knows how to decode.
*/
enum
{
	ENCODING_ASCII,		/* fallback: decoded one byte per character */
	ENCODING_UTF8,		/* UTF-8 without a byte-order mark */
	ENCODING_UTF8_BOM,	/* UTF-8 preceded by EF BB BF */
	ENCODING_UTF16_LE,	/* UTF-16 preceded by FF FE */
	ENCODING_UTF16_BE	/* UTF-16 preceded by FE FF */
};
- static int
- detect_txt_encoding(fz_context *ctx, fz_buffer *buf)
- {
- const uint8_t *d = buf->data;
- size_t len = buf->len;
- const uint8_t *end = buf->data + len;
- int count_tabs = 0;
- int count_hi = 0;
- int count_controls = 0;
- int plausibly_utf8 = 1;
- /* If we find a BOM, believe it. */
- if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xBF)
- return ENCODING_UTF8_BOM;
- else if (len >= 2 && d[0] == 0xff && d[1] == 0xfe)
- return ENCODING_UTF16_LE;
- else if (len >= 2 && d[0] == 0xfe && d[1] == 0xff)
- return ENCODING_UTF16_BE;
- while (d < end)
- {
- uint8_t c = *d++;
- if (c == 9)
- count_tabs++;
- else if (c == 12)
- {
- /* Form feed. Ignore that. */
- }
- else if (c == 10)
- {
- if (d < end && d[0] == 13)
- d++;
- }
- else if (c == 13)
- {
- if (d < end && d[0] == 10)
- d++;
- }
- else if (c < 32 || c == 0x7f)
- count_controls++;
- else if (c < 0x7f)
- {
- /* Reasonable ASCII value */
- }
- else
- {
- count_hi++;
- if ((c & 0xf8) == 0xF0)
- {
- /* Could be UTF8 with 3 following bytes */
- if (d+2 >= end ||
- (d[0] & 0xC0) != 0x80 ||
- (d[1] & 0xC0) != 0x80 ||
- (d[2] & 0xC0) != 0x80)
- plausibly_utf8 = 0;
- else
- d += 3;
- }
- else if ((c & 0xf0) == 0xE0)
- {
- /* Could be UTF8 with 2 following bytes */
- if (d+1 >= end ||
- (d[0] & 0xC0) != 0x80 ||
- (d[1] & 0xC0) != 0x80)
- plausibly_utf8 = 0;
- else
- d += 2;
- }
- else if ((c & 0xE0) == 0xC0)
- {
- /* Could be UTF8 with 1 following bytes */
- if (d+1 >= end ||
- (d[0] & 0xC0) != 0x80)
- plausibly_utf8 = 0;
- else
- d++;
- }
- else
- plausibly_utf8 = 0;
- }
- }
- (void)count_tabs;
- (void)count_hi;
- (void)count_controls;
- if (plausibly_utf8)
- return ENCODING_UTF8;
- return ENCODING_ASCII;
- }
- fz_buffer *
- fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in)
- {
- int encoding = detect_txt_encoding(ctx, in);
- fz_stream *stream = fz_open_buffer(ctx, in);
- fz_buffer *outbuf = NULL;
- fz_output *out = NULL;
- int col = 0;
- fz_var(outbuf);
- fz_var(out);
- fz_try(ctx)
- {
- outbuf = fz_new_buffer(ctx, 1024);
- out = fz_new_output_with_buffer(ctx, outbuf);
- fz_write_string(ctx, out, "<!doctype html><style>body{margin:0}pre{page-break-before:always;margin:0;white-space:pre-wrap;}</style><pre>");
- if (encoding == ENCODING_UTF16_LE || encoding == ENCODING_UTF16_BE)
- {
- fz_read_byte(ctx, stream);
- fz_read_byte(ctx, stream);
- }
- else if (encoding == ENCODING_UTF8_BOM)
- {
- fz_read_byte(ctx, stream);
- fz_read_byte(ctx, stream);
- fz_read_byte(ctx, stream);
- }
- while (!fz_is_eof(ctx, stream))
- {
- int c;
- switch (encoding)
- {
- default:
- case ENCODING_ASCII:
- c = fz_read_byte(ctx, stream);
- break;
- case ENCODING_UTF8:
- case ENCODING_UTF8_BOM:
- c = fz_read_rune(ctx, stream);
- break;
- case ENCODING_UTF16_LE:
- c = fz_read_utf16_le(ctx, stream);
- break;
- case ENCODING_UTF16_BE:
- c = fz_read_utf16_be(ctx, stream);
- }
- if (c == 10 || c == 13)
- {
- col = -1;
- fz_write_byte(ctx, out, c);
- }
- else if (c == 9)
- {
- int n = (8 - col) & 7;
- if (n == 0)
- n = 8;
- col += n-1;
- while (n--)
- fz_write_byte(ctx, out, ' ');
- }
- else if (c == 12)
- {
- col = -1;
- fz_write_string(ctx, out, "</pre><pre>\n");
- }
- else if (c == '<')
- fz_write_string(ctx, out, "<");
- else if (c == '>')
- fz_write_string(ctx, out, ">");
- else if (c == '"')
- fz_write_string(ctx, out, """);
- else
- fz_write_rune(ctx, out, c);
- ++col;
- }
- fz_close_output(ctx, out);
- }
- fz_always(ctx)
- {
- fz_drop_stream(ctx, stream);
- fz_drop_output(ctx, out);
- }
- fz_catch(ctx)
- {
- fz_drop_buffer(ctx, outbuf);
- fz_rethrow(ctx);
- }
- return outbuf;
- }
- static fz_buffer *
- txt_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
- {
- return fz_txt_buffer_to_html(ctx, buf);
- }
- static const fz_htdoc_format_t fz_htdoc_txt =
- {
- "Text",
- txt_to_html,
- 0, 1, 0
- };
- static fz_document *
- txt_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
- {
- return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_txt);
- }
/* NULL-terminated list of file extensions (without the dot) listed in txt_document_handler. */
static const char *txt_extensions[] = { "txt", "text", "log", NULL };
/*
	NULL-terminated list of media types listed in txt_document_handler.

	"text/plain" is the registered media type for plain text. The
	historical misspelling "text.plain" is kept so that any caller that
	relied on it continues to match.
*/
static const char *txt_mimetypes[] =
{
	"text/plain",
	"text.plain",
	NULL
};
- fz_document_handler txt_document_handler =
- {
- NULL,
- txt_open_document,
- txt_extensions,
- txt_mimetypes
- };
|