| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589 |
- /*
- * Bidirectional text processing.
- *
- * Processes unicode text by arranging the characters into an order suitable
- * for display. E.g. Hebrew text will be arranged from right-to-left and
- * any English within the text will remain in the left-to-right order.
- * Characters such as parentheses will be substituted for their mirrored
- * equivalents if they are part of text which must be reversed.
- *
- * This is an implementation of the unicode Bidirectional Algorithm which
- * can be found here: http://www.unicode.org/reports/tr9/ and is based
- * on the reference implementation of the algorithm found on that page.
- *
- * For a nice overview of how it works, read this...
- * http://www.w3.org/TR/REC-html40/struct/dirlang.html
- *
- * Extracted from the SmartOffice code, where it was modified by Ian
- * Beveridge.
- *
- * Copyright (C) Picsel, 2004. All Rights Reserved.
- */
- /*
- * Original copyright notice from unicode reference implementation.
- * ----------------------------------------------------------------
- * Written by: Asmus Freytag
- * C++ and Windows dependencies removed, and
- * command line interface added by: Rick McGowan
- *
- * Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
- */
- /*
- * Includes...
- */
- #include "mupdf/fitz.h"
- #include "mupdf/ucdn.h"
- #include "bidi-imp.h" /* standard bidi code interface */
- #include <assert.h>
- /*
- * Macros...
- */
/* Evaluates to 1 when the integer x is odd and to 0 when it is even. */
#define ODD(x) (1 & (x))

/* True when the bidi character type t is one of ES, ET, CS, NSM, PDF,
 * BN, S, WS or N. Note that t is evaluated once per comparison, so it
 * must not have side effects.
 */
#define REPLACEABLE_TYPE(t) ( \
	((t) == BDI_ES) || \
	((t) == BDI_ET) || \
	((t) == BDI_CS) || \
	((t) == BDI_NSM) || \
	((t) == BDI_PDF) || \
	((t) == BDI_BN) || \
	((t) == BDI_S) || \
	((t) == BDI_WS) || \
	((t) == BDI_N) )
/* Tracing hooks. Each takes a fully parenthesised fz_warn() argument
 * list, e.g. DBUGH((ctx, "fmt", arg)), and expands to nothing unless
 * the matching DEBUG_BIDI_* symbol is defined at compile time.
 */
#ifndef DEBUG_BIDI_VERBOSE
#define DBUGVF(params) do {} while (0)
#else
#define DBUGVF(params) do { fz_warn params; } while (0)
#endif

#ifndef DEBUG_BIDI_OUTLINE
#define DBUGH(params) do {} while (0)
#else
#define DBUGH(params) do { fz_warn params; } while (0)
#endif
/* Named code points, grouped by block and listed in ascending order. */

/* String terminator. */
#define UNICODE_EOS					0

/* ASCII digits and Latin-1 superscripts. */
#define UNICODE_DIGIT_ZERO				0x0030
#define UNICODE_DIGIT_NINE				0x0039
#define UNICODE_SUPERSCRIPT_TWO				0x00B2
#define UNICODE_SUPERSCRIPT_THREE			0x00B3
#define UNICODE_SUPERSCRIPT_ONE				0x00B9

/* Bounds of the range named "RTL", and the digit runs inside it. */
#define UNICODE_RTL_START				0x0590
#define UNICODE_RTL_END					0x07BF
#define UNICODE_ARABIC_INDIC_DIGIT_ZERO			0x0660
#define UNICODE_ARABIC_INDIC_DIGIT_NINE			0x0669
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO	0x06F0
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE	0x06F9

/* General punctuation, superscripts/subscripts and enclosed numbers. */
#define UNICODE_ZERO_WIDTH_NON_JOINER			0x200C
#define UNICODE_SUPERSCRIPT_ZERO			0x2070
#define UNICODE_SUPERSCRIPT_FOUR			0x2074
#define UNICODE_SUPERSCRIPT_NINE			0x2079
#define UNICODE_SUBSCRIPT_ZERO				0x2080
#define UNICODE_SUBSCRIPT_NINE				0x2089
#define UNICODE_CIRCLED_DIGIT_ONE			0x2460
#define UNICODE_NUMBER_TWENTY_FULL_STOP			0x249B
#define UNICODE_CIRCLED_DIGIT_ZERO			0x24EA

/* Fullwidth digits. */
#define UNICODE_FULLWIDTH_DIGIT_ZERO			0xFF10
#define UNICODE_FULLWIDTH_DIGIT_NINE			0xFF19

/* Boolean constants, supplied only when nothing else has defined them. */
#ifndef TRUE
#define TRUE (1)
#endif

#ifndef FALSE
#define FALSE (0)
#endif
- /*
- * Enumerations...
- */
#ifdef DEBUG_BIDI_VERBOSE
/* Debug display support: one printable character per bidi character
 * type, indexed by the type value. Only compiled into verbose debug
 * builds.
 */
static const char char_from_types[] =
{
	/* ON  */ ' ',
	/* L   */ '>',
	/* R   */ '<',
	/* AN  */ '9',
	/* EN  */ '1',
	/* AL  */ 'a',
	/* NSM */ '@',
	/* CS  */ '.',
	/* ES  */ ',',
	/* ET  */ '$',
	/* BN  */ ':',
	/* S   */ 'X',
	/* WS  */ '_',
	/* B   */ 'B',
	/* RLO */ '+',
	/* RLE */ '+',
	/* LRO */ '+',
	/* LRE */ '+',
	/* PDF */ '-',
	/* LS  */ '='
};
#endif
- /*
- * Functions and static functions...
- */
- /* UCDN uses a different ordering than Bidi does. We cannot
- * change to the UCDN ordering, as the bidi-std.c code relies
- * on the exact ordering (at least that N = ON = 0). We
- * therefore map between the two using this small table. It
- * also takes care of fudging LRI, RLI, FSI and PDI, that this
- * code does not currently support. */
- static const uint8_t ucdn_to_bidi[] =
- {
- BDI_L, /* UCDN_BIDI_CLASS_L = 0 */
- BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */
- BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */
- BDI_R, /* UCDN_BIDI_CLASS_R = 3 */
- BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */
- BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */
- BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */
- BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */
- BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */
- BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */
- BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */
- BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */
- BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */
- BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */
- BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */
- BDI_B, /* UCDN_BIDI_CLASS_B = 15 */
- BDI_S, /* UCDN_BIDI_CLASS_S = 16 */
- BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */
- BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */
- BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */
- BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */
- BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */
- BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */
- };
- #define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
- /* Return a direction for white-space on the second pass of the algorithm. */
- static fz_bidi_chartype class_from_ch_n(uint32_t ch)
- {
- fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);
- if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS)
- return BDI_N;
- return from_ch_ws;
- }
- /* Split fragments into single scripts (or punctuation + single script) */
- static void
- split_at_script(const uint32_t *fragment,
- size_t fragment_len,
- int level,
- void *arg,
- fz_bidi_fragment_fn *callback)
- {
- int script = UCDN_SCRIPT_COMMON;
- size_t script_start, i;
- script_start = 0;
- for (i = 0; i < fragment_len; i++)
- {
- int s = ucdn_get_script(fragment[i]);
- if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED)
- {
- /* Punctuation etc. This is fine. */
- }
- else if (s == script)
- {
- /* Same script. Still fine. */
- }
- else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED)
- {
- /* First non punctuation thing. Set the script. */
- script = s;
- }
- else
- {
- /* Change of script. Break the fragment. */
- (*callback)(&fragment[script_start], i - script_start, level, script, arg);
- script_start = i;
- script = s;
- }
- }
- if (script_start != fragment_len)
- {
- (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
- }
- }
- /* Determines the character classes for all following
- * passes of the algorithm. A character class is basically the type of Bidi
- * behaviour that the character exhibits.
- */
- static void
- classify_characters(const uint32_t *text,
- fz_bidi_chartype *types,
- size_t len,
- fz_bidi_flags flags)
- {
- size_t i;
- if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0)
- {
- for (i = 0; i < len; i++)
- {
- types[i] = class_from_ch_ws(text[i]);
- }
- }
- else
- {
- #ifdef DEBUG_BIDI_VERBOSE
- fprintf(stderr, "Text: ");
- for (i = 0; i < len; i++)
- {
- /* So that we can actually sort of read the debug string, any
- * non-ascii characters are replaced with a 1-digit hash
- * value from 0-9, making non-english characters appear
- * as numbers
- */
- fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ?
- text[i] : text[i] % 9 + '0');
- }
- fprintf(stderr, "\nTypes: ");
- #endif
- for (i = 0; i < len; i++)
- {
- types[i] = class_from_ch_n(text[i]);
- #ifdef DEBUG_BIDI_VERBOSE
- fprintf(stderr, "%c", char_from_types[(int)types[i]]);
- #endif
- }
- #ifdef DEBUG_BIDI_VERBOSE
- fprintf(stderr, "\n");
- #endif
- }
- }
- /* Determines the base level of the text.
- * Implements rule P2 of the Unicode Bidi Algorithm.
- * Note: Ignores explicit embeddings
- */
- static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
- {
- size_t i;
- for (i = 0; i < len; i++)
- {
- switch (types[i])
- {
- /* strong left */
- case BDI_L:
- return FZ_BIDI_LTR;
- /* strong right */
- case BDI_R:
- case BDI_AL:
- return FZ_BIDI_RTL;
- }
- }
- return FZ_BIDI_LTR;
- }
- static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
- {
- switch (type)
- {
- case BDI_L:
- case BDI_EN:
- return FZ_BIDI_LTR;
- case BDI_R:
- case BDI_AL:
- return FZ_BIDI_RTL;
- default:
- return FZ_BIDI_NEUTRAL;
- }
- }
- static void
- classify_quoted_blocks(const uint32_t *text,
- fz_bidi_chartype *types,
- size_t len)
- {
- size_t i;
- int inQuote = FALSE;
- int pdfNeeded = FALSE;
- int ltrFound = FALSE;
- int rtlFound = FALSE;
- /* Only do anything special here if there is mixed content
- * (LTR *and* RTL) in the text.
- */
- for (i = 0; i < len; i++)
- {
- switch (direction_from_type(types[i]))
- {
- case FZ_BIDI_LTR:
- ltrFound = TRUE;
- break;
- case FZ_BIDI_RTL:
- rtlFound = TRUE;
- break;
- default:
- break;
- }
- }
- /* Only make any changes if *both* LTR and RTL characters exist
- * in this text.
- */
- if (!ltrFound || !rtlFound)
- {
- return;
- }
- for (i = 0; i < len; i++)
- {
- if (text[i]=='"')
- {
- /* If we're already in a quote then terminate it,
- * else start a new block.
- */
- if (inQuote)
- {
- inQuote = FALSE;
- if (pdfNeeded)
- {
- pdfNeeded = FALSE;
- types[i] = BDI_PDF;
- }
- }
- else
- {
- size_t j;
- int done = FALSE;
- inQuote = TRUE;
- /* Find the first strong right or left type and
- * use that to determine whether we should classify
- * the quote as LRE or RLE. Or neither, if we
- * hit another quote before any strongly-directional
- * character.
- */
- for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j)
- {
- switch(types[j])
- {
- case BDI_RLE:
- case BDI_LRE:
- done = TRUE;
- break;
- case BDI_L:
- case BDI_EN:
- types[i] = BDI_LRE;
- pdfNeeded = TRUE;
- done = TRUE;
- break;
- case BDI_R:
- case BDI_AL:
- types[i] = BDI_RLE;
- pdfNeeded = TRUE;
- done = TRUE;
- break;
- default:
- break;
- }
- }
- }
- }
- }
- }
- /* Creates a buffer with an embedding level for every character in the
- * given text. Also determines the base level and returns it in
- * *baseDir if *baseDir does not initially contain a valid direction.
- */
- static fz_bidi_level *
- create_levels(fz_context *ctx,
- const uint32_t *text,
- size_t len,
- fz_bidi_direction *baseDir,
- int resolveWhiteSpace,
- int flags)
- {
- fz_bidi_level *levels, *plevels;
- fz_bidi_chartype *types = NULL;
- fz_bidi_chartype *ptypes;
- fz_bidi_level baseLevel;
- const uint32_t *ptext;
- size_t plen, remaining;
- levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels");
- fz_var(types);
- fz_try(ctx)
- {
- types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
- classify_characters(text, types, len, flags);
- if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL)
- {
- /* Derive the base level from the text and
- * update *baseDir in case the caller wants to know.
- */
- baseLevel = base_level_from_text(types, len);
- *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
- }
- else
- {
- baseLevel = (fz_bidi_level)*baseDir;
- }
- {
- /* Replace tab with base direction, i.e. make tab appear as
- * 'strong left' if the base direction is left-to-right and
- * 'strong right' if base direction is right-to-left. This
- * allows Layout to implicitly treat tabs as 'segment separators'.
- */
- size_t i;
- for (i = 0u; i < len; i++)
- {
- if (text[i]=='\t')
- {
- types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
- }
- }
- }
- /* Look for quotation marks. Classify them as RLE or LRE
- * or leave them alone, depending on what follows them.
- */
- classify_quoted_blocks(text, types, len);
- /* Work one paragraph at a time. */
- plevels = levels;
- ptypes = types;
- ptext = text;
- remaining = len;
- while (remaining)
- {
- plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
- /* Work out the levels and character types... */
- (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
- fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
- fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
- fz_bidi_resolve_implicit(ptypes, plevels, plen);
- classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
- if (resolveWhiteSpace)
- {
- /* resolve whitespace */
- fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
- }
- plevels += plen;
- ptypes += plen;
- ptext += plen;
- remaining -= plen;
- }
- /* The levels buffer now has odd and even numbers indicating
- * rtl or ltr characters, respectively.
- */
- #ifdef DEBUG_BIDI_VERBOSE
- fprintf(stderr, "Levels: ");
- {
- size_t i;
- for (i = 0; i < len; i++)
- {
- fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
- }
- fprintf(stderr, "\n");
- }
- #endif
- }
- fz_always(ctx)
- {
- fz_free(ctx, types);
- }
- fz_catch(ctx)
- {
- fz_free(ctx, levels);
- fz_rethrow(ctx);
- }
- return levels;
- }
- /* Partitions the given character sequence into one or more unidirectional
- * fragments and invokes the given callback function for each fragment.
- */
- void fz_bidi_fragment_text(fz_context *ctx,
- const uint32_t *text,
- size_t textlen,
- fz_bidi_direction *baseDir,
- fz_bidi_fragment_fn *callback,
- void *arg,
- int flags)
- {
- size_t startOfFragment;
- size_t i;
- fz_bidi_level *levels;
- if (text == NULL || callback == NULL || textlen == 0)
- return;
- DBUGH((ctx, "fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));
- levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
- /* We now have an array with an embedding level
- * for each character in text.
- */
- assert(levels != NULL);
- fz_try(ctx)
- {
- startOfFragment = 0;
- for (i = 1; i < textlen; i++)
- {
- if (levels[i] != levels[i-1])
- {
- /* We've gone past the end of the fragment.
- * Create a text object for it, then start
- * a new fragment.
- */
- split_at_script(&text[startOfFragment],
- i - startOfFragment,
- levels[startOfFragment],
- arg,
- callback);
- startOfFragment = i;
- }
- }
- /* Now i == textlen. Deal with the final (or maybe only) fragment. */
- /* otherwise create 1 fragment */
- split_at_script(&text[startOfFragment],
- i - startOfFragment,
- levels[startOfFragment],
- arg,
- callback);
- }
- fz_always(ctx)
- {
- fz_free(ctx, levels);
- }
- fz_catch(ctx)
- {
- fz_rethrow(ctx);
- }
- }
|