| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734 |
- // Copyright (C) 2004-2024 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include "mupdf/pdf.h"
- #include <string.h>
/*
 * Character-class macros for use as switch labels.
 *
 * Each macro expands to a label list that is missing its leading `case`
 * and its trailing `:`, so that `case IS_WHITE:` becomes one label per
 * member of the class. The wider classes are built from the RANGE_*
 * building blocks.
 */
#define RANGE_0_7 \
	'0': case '1': case '2': case '3': case '4': case '5': case '6': case '7'
#define RANGE_0_9 \
	RANGE_0_7: case '8': case '9'
#define RANGE_a_f \
	'a': case 'b': case 'c': case 'd': case 'e': case 'f'
#define RANGE_A_F \
	'A': case 'B': case 'C': case 'D': case 'E': case 'F'

/* Any byte that may start a number: a sign, a decimal point or a digit. */
#define IS_NUMBER \
	'+': case '-': case '.': case RANGE_0_9

/* NUL, TAB, LF, FF, CR and SPACE. */
#define IS_WHITE \
	'\000': case '\011': case '\012': case '\014': case '\015': case '\040'

/* Hexadecimal digits in either case. */
#define IS_HEX \
	RANGE_0_9: case RANGE_A_F: case RANGE_a_f

/* Delimiter characters. */
#define IS_DELIM \
	'(': case ')': case '<': case '>': case '[': case ']': \
	case '{': case '}': case '/': case '%'
/* Define DUMP_LEXER_STREAM to trace every byte the lexer reads. */
/* #define DUMP_LEXER_STREAM */

#ifndef DUMP_LEXER_STREAM

/* Normal build: reading a lexer byte is just a stream read. */
#define lex_byte(C,S) fz_read_byte(C,S)

#else

/*
 * Tracing build: read one byte from stm and echo it to the output
 * returned by fz_stdout(). Bytes in the range 32..127 are printed as
 * themselves, EOF as "<EOF>", and everything else as two hex digits in
 * angle brackets. The byte (or EOF) is returned unchanged.
 */
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	int ch = fz_read_byte(ctx, stm);

	if (ch >= 32 && ch < 128)
		fz_write_printf(ctx, fz_stdout(ctx), "%c", ch);
	else if (ch == EOF)
		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
	else
		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", ch);

	return ch;
}

#endif
/*
 * Return 1 if ch is one of the six white-space bytes recognised by the
 * lexer (NUL, TAB, LF, FF, CR, SPACE), otherwise 0.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case 0x00:	/* NUL */
	case 0x09:	/* TAB */
	case 0x0a:	/* LF */
	case 0x0c:	/* FF */
	case 0x0d:	/* CR */
	case 0x20:	/* SPACE */
		return 1;
	default:
		return 0;
	}
}
/* Return 1 if ch lies between ' ' and '~' inclusive, otherwise 0. */
static inline int fz_isprint(int ch)
{
	return !(ch < ' ' || ch > '~');
}
/*
 * Convert one hexadecimal digit (either case) to its value, 0..15.
 * Any other input yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if ('0' <= ch && ch <= '9')
		value = ch - '0';
	else if ('a' <= ch && ch <= 'f')
		value = 0xA + (ch - 'a');
	else if ('A' <= ch && ch <= 'F')
		value = 0xA + (ch - 'A');

	return value;
}
- static void
- lex_white(fz_context *ctx, fz_stream *f)
- {
- int c;
- do {
- c = lex_byte(ctx, f);
- } while ((c <= 32) && (iswhite(c)));
- if (c != EOF)
- fz_unread_byte(ctx, f);
- }
- static void
- lex_comment(fz_context *ctx, fz_stream *f)
- {
- int c;
- do {
- c = lex_byte(ctx, f);
- } while ((c != '\012') && (c != '\015') && (c != EOF));
- }
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Grammar accepted: any number of '-' (one or more makes the result
 * negative), then any number of '+', then decimal digits, then an
 * optional '.' followed by more decimal digits. Parsing stops at the
 * first byte that does not fit; there is no error reporting.
 *
 * Overflow of the integer part is deliberately ignored: the value simply
 * wraps. According to the original authors' tests, Acrobat treats
 * over-long numbers the same way. The wrap is performed in unsigned
 * arithmetic so that it is well defined, rather than relying on signed
 * overflow.
 */
static float acrobat_compatible_atof(const char *s)
{
	int neg = 0;
	unsigned int acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* Unsigned arithmetic: wraps modulo 2^N instead of overflowing. */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	if (*s == '.')
	{
		/* Reinterpret the wrapped value as a signed int (two's complement). */
		float v = (float)(int)acc;
		float n = 0;
		float d = 1;

		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}

	/* Negate while still unsigned, so that the most negative value is safe. */
	if (neg)
		acc = 0u - acc;
	return (float)(int)acc;
}
/*
 * Fast but inaccurate atoi.
 *
 * Accepts any number of '-' (one or more makes the result negative),
 * then any number of '+', then decimal digits. Parsing stops at the
 * first other byte. Overflow is deliberately ignored: the value wraps
 * modulo 2^64. The accumulation and the negation are done in unsigned
 * arithmetic so that the wrap is well defined.
 */
static int64_t fast_atoi(const char *s)
{
	int neg = 0;
	uint64_t acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		acc = acc * 10u + (uint64_t)(*s - '0');
		++s;
	}

	if (neg)
		acc = (uint64_t)0 - acc;
	return (int64_t)acc;
}
- static int
- lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
- {
- char *s = buf->scratch;
- char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
- char *isreal = (c == '.' ? s : NULL);
- int neg = (c == '-');
- int isbad = 0;
- *s++ = c;
- c = lex_byte(ctx, f);
- /* skip extra '-' signs at start of number */
- if (neg)
- {
- while (c == '-')
- c = lex_byte(ctx, f);
- }
- while (s < e)
- {
- switch (c)
- {
- case IS_WHITE:
- case IS_DELIM:
- fz_unread_byte(ctx, f);
- goto end;
- case EOF:
- goto end;
- case '.':
- if (isreal)
- isbad = 1;
- isreal = s;
- *s++ = c;
- break;
- case '-':
- /* Bug 703248: Some PDFs (particularly those
- * generated by google docs) apparently have
- * numbers like 0.000000000000-5684342 in them.
- * We'll stop our interpretation at the -, but
- * keep reading to skip over the trailing
- * digits so they aren't parsed later. */
- *s++ = '\0';
- break;
- case RANGE_0_9:
- *s++ = c;
- break;
- default:
- isbad = 1;
- *s++ = c;
- break;
- }
- c = lex_byte(ctx, f);
- }
- end:
- *s = '\0';
- if (isbad)
- return PDF_TOK_KEYWORD;
- if (isreal)
- {
- /* We'd like to use the fastest possible atof
- * routine, but we'd rather match acrobats
- * handling of broken numbers. As such, we
- * spot common broken cases and call an
- * acrobat compatible routine where required. */
- if (neg > 1 || isreal - buf->scratch >= 10)
- buf->f = acrobat_compatible_atof(buf->scratch);
- else
- buf->f = fz_atof(buf->scratch);
- return PDF_TOK_REAL;
- }
- else
- {
- buf->i = fast_atoi(buf->scratch);
- return PDF_TOK_INT;
- }
- }
- static void
- lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
- {
- char *s = lb->scratch;
- char *e = s + fz_minz(127, lb->size);
- int c;
- while (1)
- {
- if (s == e)
- {
- if (e - lb->scratch < 127)
- {
- s += pdf_lexbuf_grow(ctx, lb);
- e = lb->scratch + fz_minz(127, lb->size);
- }
- else
- {
- /* truncate names that are too long */
- fz_warn(ctx, "name is too long");
- *s = 0;
- lb->len = s - lb->scratch;
- s = NULL;
- }
- }
- c = lex_byte(ctx, f);
- switch (c)
- {
- case IS_WHITE:
- case IS_DELIM:
- fz_unread_byte(ctx, f);
- goto end;
- case EOF:
- goto end;
- case '#':
- {
- int hex[2];
- int i;
- for (i = 0; i < 2; i++)
- {
- c = fz_peek_byte(ctx, f);
- switch (c)
- {
- case RANGE_0_9:
- if (i == 1 && c == '0' && hex[0] == 0)
- goto illegal;
- hex[i] = lex_byte(ctx, f) - '0';
- break;
- case RANGE_a_f:
- hex[i] = lex_byte(ctx, f) - 'a' + 10;
- break;
- case RANGE_A_F:
- hex[i] = lex_byte(ctx, f) - 'A' + 10;
- break;
- default:
- goto illegal;
- case EOF:
- goto illegal_eof;
- }
- }
- if (s) *s++ = (hex[0] << 4) + hex[1];
- break;
- illegal:
- if (i == 1)
- fz_unread_byte(ctx, f);
- illegal_eof:
- if (s) *s++ = '#';
- continue;
- }
- default:
- if (s) *s++ = c;
- break;
- }
- }
- end:
- if (s)
- {
- *s = '\0';
- lb->len = s - lb->scratch;
- }
- }
- static int
- lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
- {
- char *s = lb->scratch;
- char *e = s + lb->size;
- int bal = 1;
- int oct;
- int c;
- while (1)
- {
- if (s == e)
- {
- s += pdf_lexbuf_grow(ctx, lb);
- e = lb->scratch + lb->size;
- }
- c = lex_byte(ctx, f);
- switch (c)
- {
- case EOF:
- return PDF_TOK_ERROR;
- case '(':
- bal++;
- *s++ = c;
- break;
- case ')':
- bal --;
- if (bal == 0)
- goto end;
- *s++ = c;
- break;
- case '\\':
- c = lex_byte(ctx, f);
- switch (c)
- {
- case EOF:
- return PDF_TOK_ERROR;
- case 'n':
- *s++ = '\n';
- break;
- case 'r':
- *s++ = '\r';
- break;
- case 't':
- *s++ = '\t';
- break;
- case 'b':
- *s++ = '\b';
- break;
- case 'f':
- *s++ = '\f';
- break;
- case '(':
- *s++ = '(';
- break;
- case ')':
- *s++ = ')';
- break;
- case '\\':
- *s++ = '\\';
- break;
- case RANGE_0_7:
- oct = c - '0';
- c = lex_byte(ctx, f);
- if (c >= '0' && c <= '7')
- {
- oct = oct * 8 + (c - '0');
- c = lex_byte(ctx, f);
- if (c >= '0' && c <= '7')
- oct = oct * 8 + (c - '0');
- else if (c != EOF)
- fz_unread_byte(ctx, f);
- }
- else if (c != EOF)
- fz_unread_byte(ctx, f);
- *s++ = oct;
- break;
- case '\n':
- break;
- case '\r':
- c = lex_byte(ctx, f);
- if ((c != '\n') && (c != EOF))
- fz_unread_byte(ctx, f);
- break;
- default:
- *s++ = c;
- }
- break;
- /* Bug 708256: PDF 32000-1 says that any occurence of \n, \r, or \r\n in a
- * (unless escaped with a '\') should be interpreted as a single 0x0a byte. */
- case '\n':
- *s++ = 0x0a;
- break;
- case '\r':
- *s++ = 0x0a;
- c = lex_byte(ctx, f);
- if ((c != '\n') && (c != EOF))
- fz_unread_byte(ctx, f);
- break;
- default:
- *s++ = c;
- break;
- }
- }
- end:
- lb->len = s - lb->scratch;
- return PDF_TOK_STRING;
- }
- static int
- lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
- {
- char *s = lb->scratch;
- char *e = s + lb->size;
- int a = 0, x = 0;
- int c;
- while (1)
- {
- if (s == e)
- {
- s += pdf_lexbuf_grow(ctx, lb);
- e = lb->scratch + lb->size;
- }
- c = lex_byte(ctx, f);
- switch (c)
- {
- case IS_WHITE:
- break;
- default:
- fz_warn(ctx, "invalid character in hex string");
- /* fall through */
- case IS_HEX:
- if (x)
- {
- *s++ = a * 16 + unhex(c);
- x = !x;
- }
- else
- {
- a = unhex(c);
- x = !x;
- }
- break;
- case '>':
- if (x)
- {
- *s++ = a * 16; /* pad truncated string with '0' */
- }
- goto end;
- case EOF:
- return PDF_TOK_ERROR;
- }
- }
- end:
- lb->len = s - lb->scratch;
- return PDF_TOK_STRING;
- }
- static pdf_token
- pdf_token_from_keyword(char *key)
- {
- switch (*key)
- {
- case 'R':
- if (!strcmp(key, "R")) return PDF_TOK_R;
- break;
- case 't':
- if (!strcmp(key, "true")) return PDF_TOK_TRUE;
- if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
- break;
- case 'f':
- if (!strcmp(key, "false")) return PDF_TOK_FALSE;
- break;
- case 'n':
- if (!strcmp(key, "null")) return PDF_TOK_NULL;
- if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ;
- break;
- case 'o':
- if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
- break;
- case 'e':
- if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
- if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
- break;
- case 's':
- if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
- if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
- break;
- case 'x':
- if (!strcmp(key, "xref")) return PDF_TOK_XREF;
- break;
- }
- while (*key)
- {
- if (!fz_isprint(*key))
- return PDF_TOK_ERROR;
- ++key;
- }
- return PDF_TOK_KEYWORD;
- }
- void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
- {
- lb->size = lb->base_size = size;
- lb->len = 0;
- lb->scratch = &lb->buffer[0];
- }
- void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
- {
- if (lb && lb->size != lb->base_size)
- fz_free(ctx, lb->scratch);
- }
- ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
- {
- char *old = lb->scratch;
- size_t newsize = lb->size * 2;
- if (lb->size == lb->base_size)
- {
- lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf");
- memcpy(lb->scratch, lb->buffer, lb->size);
- }
- else
- {
- lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
- }
- lb->size = newsize;
- return lb->scratch - old;
- }
- pdf_token
- pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
- {
- while (1)
- {
- int c = lex_byte(ctx, f);
- switch (c)
- {
- case EOF:
- return PDF_TOK_EOF;
- case IS_WHITE:
- lex_white(ctx, f);
- break;
- case '%':
- lex_comment(ctx, f);
- break;
- case '/':
- lex_name(ctx, f, buf);
- return PDF_TOK_NAME;
- case '(':
- return lex_string(ctx, f, buf);
- case ')':
- return PDF_TOK_ERROR;
- case '<':
- c = lex_byte(ctx, f);
- if (c == '<')
- return PDF_TOK_OPEN_DICT;
- if (c != EOF)
- fz_unread_byte(ctx, f);
- return lex_hex_string(ctx, f, buf);
- case '>':
- c = lex_byte(ctx, f);
- if (c == '>')
- return PDF_TOK_CLOSE_DICT;
- if (c != EOF)
- fz_unread_byte(ctx, f);
- return PDF_TOK_ERROR;
- case '[':
- return PDF_TOK_OPEN_ARRAY;
- case ']':
- return PDF_TOK_CLOSE_ARRAY;
- case '{':
- return PDF_TOK_OPEN_BRACE;
- case '}':
- return PDF_TOK_CLOSE_BRACE;
- case IS_NUMBER:
- return lex_number(ctx, f, buf, c);
- default: /* isregular: !isdelim && !iswhite && c != EOF */
- fz_unread_byte(ctx, f);
- lex_name(ctx, f, buf);
- return pdf_token_from_keyword(buf->scratch);
- }
- }
- }
- pdf_token
- pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
- {
- while (1)
- {
- int c = lex_byte(ctx, f);
- switch (c)
- {
- case EOF:
- return PDF_TOK_EOF;
- case IS_WHITE:
- lex_white(ctx, f);
- break;
- case '%':
- lex_comment(ctx, f);
- break;
- case '/':
- lex_name(ctx, f, buf);
- return PDF_TOK_NAME;
- case '(':
- return PDF_TOK_ERROR; /* no strings allowed */
- case ')':
- return PDF_TOK_ERROR; /* no strings allowed */
- case '<':
- c = lex_byte(ctx, f);
- if (c == '<')
- return PDF_TOK_OPEN_DICT;
- if (c != EOF)
- fz_unread_byte(ctx, f);
- return PDF_TOK_ERROR; /* no strings allowed */
- case '>':
- c = lex_byte(ctx, f);
- if (c == '>')
- return PDF_TOK_CLOSE_DICT;
- if (c != EOF)
- fz_unread_byte(ctx, f);
- return PDF_TOK_ERROR;
- case '[':
- return PDF_TOK_OPEN_ARRAY;
- case ']':
- return PDF_TOK_CLOSE_ARRAY;
- case '{':
- return PDF_TOK_OPEN_BRACE;
- case '}':
- return PDF_TOK_CLOSE_BRACE;
- case IS_NUMBER:
- return lex_number(ctx, f, buf, c);
- default: /* isregular: !isdelim && !iswhite && c != EOF */
- fz_unread_byte(ctx, f);
- lex_name(ctx, f, buf);
- return pdf_token_from_keyword(buf->scratch);
- }
- }
- }
- void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
- {
- switch (tok)
- {
- case PDF_TOK_NAME:
- fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
- break;
- case PDF_TOK_STRING:
- if (buf->len >= buf->size)
- pdf_lexbuf_grow(ctx, buf);
- buf->scratch[buf->len] = 0;
- fz_append_pdf_string(ctx, fzbuf, buf->scratch);
- break;
- case PDF_TOK_OPEN_DICT:
- fz_append_string(ctx, fzbuf, "<<");
- break;
- case PDF_TOK_CLOSE_DICT:
- fz_append_string(ctx, fzbuf, ">>");
- break;
- case PDF_TOK_OPEN_ARRAY:
- fz_append_byte(ctx, fzbuf, '[');
- break;
- case PDF_TOK_CLOSE_ARRAY:
- fz_append_byte(ctx, fzbuf, ']');
- break;
- case PDF_TOK_OPEN_BRACE:
- fz_append_byte(ctx, fzbuf, '{');
- break;
- case PDF_TOK_CLOSE_BRACE:
- fz_append_byte(ctx, fzbuf, '}');
- break;
- case PDF_TOK_INT:
- fz_append_printf(ctx, fzbuf, "%ld", buf->i);
- break;
- case PDF_TOK_REAL:
- fz_append_printf(ctx, fzbuf, "%g", buf->f);
- break;
- default:
- fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
- break;
- }
- }
|