| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173 |
- // Extracted from Bidi.cpp - version 26
- // Reference implementation for Unicode Bidirectional Algorithm
- // Bidi include file
- #include "mupdf/fitz.h"
- #include "bidi-imp.h"
- #include <assert.h>
- #ifndef TRUE
- #define TRUE (1)
- #endif
- #ifndef FALSE
- #define FALSE (0)
- #endif
- /*------------------------------------------------------------------------
- File: Bidi.Cpp
- Description
- -----------
- Sample Implementation of the Unicode Bidirectional Algorithm as it
- was revised by Revision 5 of the Unicode Technical Report # 9
- (1999-8-17)
- Verified for changes to the algorithm up through Unicode 5.2.0 (2009).
- This implementation is organized into several passes, each implemen-
- ting one or more of the rules of the Unicode Bidi Algorithm. The
- resolution of Weak Types and of Neutrals each use a state table
- approach.
- Both a printf based interface and a Windows DlgProc are provided for
- interactive testing.
- A stress harness comparing this implementation (v24) to a Java based
- implementation was used by Doug Felt to verify that the two
- implementations produce identical results for all strings up to six
- bidi classes and stochastic strings up to length 20.
- Version 26 was verified by the author against the Unicode 5.2.0
- file BidiTest.txt, which provides an exhaustive text of strings of
- length 4 or less, but covers some important cases where the language
- in UAX#9 had been clarified.
- To see this code running in an actual Windows program,
- download the free Unibook utility from http://unicode.org/unibook
- The bidi demo is executed from the tools menu. It is build from
- this source file.
- Build Notes
- -----------
- To compile the sample implementation please set the #define
- directives above so the correct headers get included. Not all the
- files are needed for all purposes. For the commandline version
- only bidi.h and bidi.cpp are needed.
- The Win32 version is provided as a dialog procedure. To use as a
- standalone program compile with the lbmain.cpp file. If all you
- need is the ability to run the code "as is", you can instead download
- the unibook utility from http://uincode.org/unibook/ which contains
- the bidi demo compiled from this source file.
- This code uses an extension to C++ that gives variables declared in
- a for() statement function the same scope as the for() statement.
- If your compiler does not support this extension, you may need to
- move the declaration, e.g. int ich = 0; in front of the for statement.
- Implementation Note
- -------------------
- NOTE: The Unicode Bidirectional Algorithm removes all explicit
- formatting codes in rule X9, but states that this can be
- simulated by conformant implementations. This implementation
- attempts to demonstrate such a simulation
- To demonstrate this, the current implementation does the
- following:
- in resolveExplicit()
- - change LRE, LRO, RLE, RLO, PDF to BN
- - assign nested levels to BN
- in resolveWeak and resolveNeutrals
- - assign L and R to BN's where they exist in place of
- sor and eor by changing the last BN in front of a
- level change to a strong type
- - skip over BN's for the purpose of determining actions
- - include BN in the count of deferred runs
- which will resolve some of them to EN, AN and N
- in resolveWhiteSpace
- - set the level of any surviving BN to the base level,
- or the level of the preceding character
- - include LRE,LRO, RLE, RLO, PDF and BN in the count
- whitespace to be reset
- This will result in the same order for non-BN characters as
- if the BN characters had been removed.
- The clean() function can be used to remove boundary marks for
- verification purposes.
- Notation
- --------
- Pointer variables generally start with the letter p
- Counter variables generally start with the letter c
- Index variables generally start with the letter i
- Boolean variables generally start with the letter f
- The enumerated bidirectional types have the same name as in the
- description for the Unicode Bidirectional Algorithm
- Using this code outside a demo context
- --------------------------------------
- The way the functions are broken down in this demo code is based
- on the needs of the demo to show the evolution in internal state
- as the algorithm proceeds. This obscures how the algorithm would
- be used in practice. These are the steps:
- 1. Allocate a pair of arrays large enough to hold bidi class
- and calculated levels (one for each input character)
- 2. Provide your own function to assign directional types (bidi
- classes) corresponding to each character in the input,
- conflating ON, WS, S to N. Unlike the classify function in this
- demo, the input would be actual Unicode characters.
- 3. Process the first paragraph by calling BidiParagraph. That
- function changes B into BN and returns a length including the
- paragraph separator. (The iteration over multiple paragraphs
- is left as exercise for the reader).
- 4. Assign directional types again, but now assign specific types
- to whitespace characters.
- 5. Instead of reordering the input in place it is often desirable
- to calculate an array of offsets indicating the reordering.
- To that end, allocate such an array here and use it instead
- of the input array in the next step.
- 6. Resolve and reorder the lines by calling BidiLines. That
- function 'breaks' lines on LS characters. Provide an optional
- array of flags indicating the location of other line breaks as
- needed.
- Update History
- --------------
- Version 24 is the initial published and verified version of this
- reference implementation. Version 25 and its updates fix various
- minor issues with the scaffolding used for demonstrating the
- algorithm using pseudo-alphabets from the command line or dialog
- box. No changes to the implementation of the actual bidi algorithm
- are made in any of the minor updates to version 25. Version 26
- also makes no change to the actual algorithm but was verified
- against the official BidiTest.txt file for Unicode 5.2.0.
- - updated pseudo-alphabet
- - Last Revised 12-10-99 (25)
- - enable demo mode for release builds - no other changes
- - Last Revised 12-10-00 (25a)
- - fix regression in pseudo alphabet use for Windows UI
- - Last Revised 02-01-01 (25b)
- - fixed a few comments, renamed a variable
- - Last Revised 03-04-01 (25c)
- - make base level settable, enable mirror by default,
- fix dialog size
- - Last Revised 06-02-01 (25e)
- - fixed some comments
- - Last Revised 09-29-01 (25f)
- - fixed classification for LS,RLM,LRM in pseudo alphabet,
- focus issues in UI, regression fix to commandline from 25(e)
- fix DEMO switch
- - Last Revised 11-07-01 (25g)
- - fixed classification for plus/minus in pseudo alphabet
- to track changes made in Unicode 4.0.1
- - Last Revised 12-03-04 (25h)
- - now compiles as dialog-only program for WINDOWS_UI==1
- using new bidimain.cpp
- - Last Revised 12-02-07 (25i)
- - cleaned up whitespace and indenting in the source,
- fixed two comments (table headers)
- - Last Revised 15-03-07 (25j)
- - named enumerations
- - Last Revised 30-05-07 (25k)
- - added usage notes, minor edits to comments, indentation, etc
- throughout. Added the bidiParagraph function. Checked against
- changes in the Unicode Bidi Algorithm for Unicode 5.2.0. No
- changes needed to this implementation to match the values in
- the BidiTest.txt file in the Unicode Character Database.
- Minor fixes to dialog/windows proc, updated preprocessor directives.
- - Last Revised 03-08-09 (26)
- Credits:
- -------
- Written by: Asmus Freytag
- Command line interface by: Rick McGowan
- Verification (v24): Doug Felt
- Disclaimer and legal rights:
- ---------------------------
- Copyright (C) 1999-2009, ASMUS, Inc. All Rights Reserved.
- Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
- THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
- IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
- BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
- OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
- ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE.
- The file bid.rc is included in the software covered by the above.
- ------------------------------------------------------------------------*/
- /* === HELPER FUNCTIONS AND DECLARATIONS ================================= */
- #define odd(x) ((x) & 1)
- /*----------------------------------------------------------------------
- The following array maps character codes to types for the purpose of
- this sample implementation. The legend string gives a human readable
- explanation of the pseudo alphabet.
- For simplicity, characters entered by buttons are given a 1:1 mapping
- between their type and pseudo character value. Pseudo characters that
- can be typed from the keyboard are explained in the legend string.
- Use the Unicode Character Database for the real values in real use.
- ---------------------------------------------------------------------*/
- enum
- {
- chLS = 0x15
- };
- #if 0
- static const fz_bidi_chartype types_from_char[] =
- {
- // 0 1 2 3 4 5 6 7 8 9 a b c d e f
- BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_S, BDI_B, BDI_S, BDI_WS, BDI_B, BDI_BN, BDI_BN, /*00-0f*/
- BDI_LRO,BDI_LRE,BDI_PDF,BDI_RLO,BDI_RLE,BDI_WS, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_B, BDI_B, BDI_B, BDI_S, /*10-1f*/
- BDI_WS, BDI_ON, BDI_ON, BDI_ET, BDI_ET, BDI_ET, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ES, BDI_CS, BDI_ES, BDI_CS, BDI_ES, /*20-2f*/
- BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_AN, BDI_AN, BDI_AN, BDI_AN, BDI_CS, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, /*30-3f*/
- BDI_ON, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, /*40-4f*/
- BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_LRE,BDI_ON, BDI_RLE,BDI_PDF,BDI_S, /*50-5f*/
- BDI_NSM,BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, /*60-6f*/
- BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_LRO,BDI_B, BDI_RLO,BDI_BN, BDI_ON, /*70-7f*/
- };
- #endif
- /***************************************
- Reverse, human readable reference:
- LRM: 0x4
- RLM: 0x5
- L: 0x16,a-z
- LRE: 0x11,[
- LRO: 0x10,{
- R: 0x17,G-Z
- AL: A-F
- RLE: 0x14,]
- RLO: 0x13,}
- PDF: 0x12,^
- EN: 0-5
- ES: /,+,[hyphen]
- ET: #,$,%
- AN: 6-9
- CS: [comma],.,:
- NSM: `
- BN: 0x0-0x8,0xe,0xf,0x18-0x1b,~
- B: 0xa,0xd,0x1c-0x1e,|
- S: 0x9,0xb,0x1f,_
- WS: 0xc,0x15,[space]
- ON: !,",&,',(,),*,;,<,=,>,?,@,\,0x7f
- ****************************************/
- // === HELPER FUNCTIONS ================================================
- #ifdef BIDI_LINE_AT_A_TIME
- // reverse cch characters
- static
- void reverse(uint32_t *psz, int cch)
- {
- uint32_t ch_temp;
- int ich;
- for (ich = 0; ich < --cch; ich++)
- {
- ch_temp = psz[ich];
- psz[ich] = psz[cch];
- psz[cch] = ch_temp;
- }
- }
- #endif
- // Set a run of cval values at locations all prior to, but not including
- // iStart, to the new value nval.
- static
- void set_deferred_run(fz_bidi_chartype *pval, size_t cval, size_t iStart, fz_bidi_chartype nval)
- {
- size_t i;
- for (i = iStart; i > iStart - cval; )
- {
- pval[--i] = nval;
- }
- }
- static
- void set_deferred_level_run(fz_bidi_level *pval, size_t cval, size_t iStart, fz_bidi_level nval)
- {
- size_t i;
- for (i = iStart; i > iStart - cval; )
- {
- pval[--i] = nval;
- }
- }
- // === ASSIGNING BIDI CLASSES ============================================
- // === THE PARAGRAPH LEVEL ===============================================
- /*------------------------------------------------------------------------
- Function: resolve_paragraphs
- Resolves the input strings into blocks over which the algorithm
- is then applied.
- Implements Rule P1 of the Unicode Bidi Algorithm
- Input: Text string
- Character count
- Output: revised character count
- Note: This is a very simplistic function. In effect it restricts
- the action of the algorithm to the first paragraph in the input
- where a paragraph ends at the end of the first block separator
- or at the end of the input text.
- ------------------------------------------------------------------------*/
- size_t fz_bidi_resolve_paragraphs(fz_bidi_chartype *types, size_t cch)
- {
- size_t ich;
- // skip characters not of type B
- for(ich = 0; ich < cch && types[ich] != BDI_B; ich++)
- ;
- // stop after first B, make it a BN for use in the next steps
- if (ich < cch && types[ich] == BDI_B)
- types[ich++] = BDI_BN;
- return ich;
- }
- #if 0
- /*------------------------------------------------------------------------
- Function: base_level
- Determines the base level
- Implements rule P2 of the Unicode Bidi Algorithm.
- Input: Array of directional classes
- Character count
- Note: Ignores explicit embeddings
- ------------------------------------------------------------------------*/
- static int base_level(const fz_bidi_chartype *pcls, int cch)
- {
- int ich;
- for (ich = 0; ich < cch; ich++)
- {
- switch (pcls[ich])
- {
- // strong left
- case BDI_L:
- return 0;
- // strong right
- case BDI_R:
- case BDI_AL:
- return 1;
- }
- }
- return 0;
- }
- #endif
- //====== RESOLVE EXPLICIT ================================================
- static fz_bidi_level greater_even(fz_bidi_level i)
- {
- return odd(i) ? i + 1 : i + 2;
- }
- static fz_bidi_level greater_odd(fz_bidi_level i)
- {
- return odd(i) ? i + 2 : i + 1;
- }
- static fz_bidi_chartype embedding_direction(fz_bidi_chartype level)
- {
- return odd(level) ? BDI_R : BDI_L;
- }
- /*------------------------------------------------------------------------
- Function: resolveExplicit
- Recursively resolves explicit embedding levels and overrides.
- Implements rules X1-X9, of the Unicode Bidirectional Algorithm.
- Input: Base embedding level and direction
- Character count
- Output: Array of embedding levels
- Caller must allocate (one level per input character)
- In/Out: Array of direction classes
- Note: The function uses two simple counters to keep track of
- matching explicit codes and PDF. Use the default argument for
- the outermost call. The nesting counter counts the recursion
- depth and not the embedding level.
- ------------------------------------------------------------------------*/
- size_t fz_bidi_resolve_explicit(fz_bidi_level level, fz_bidi_chartype dir, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch,
- fz_bidi_level n_nest)
- {
- size_t ich;
- // always called with a valid nesting level
- // nesting levels are != embedding levels
- int nLastValid = n_nest;
- // check input values
- assert(n_nest >= 0 && level >= 0 && level <= BIDI_LEVEL_MAX);
- // process the text
- for (ich = 0; ich < cch; ich++)
- {
- fz_bidi_chartype cls = pcls[ich];
- switch (cls)
- {
- case BDI_LRO:
- case BDI_LRE:
- n_nest++;
- if (greater_even(level) <= BIDI_LEVEL_MAX)
- {
- plevel[ich] = greater_even(level);
- pcls[ich] = BDI_BN;
- ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_LRE ? BDI_N : BDI_L),
- &pcls[ich+1], &plevel[ich+1],
- cch - (ich+1), n_nest);
- n_nest--;
- continue;
- }
- cls = pcls[ich] = BDI_BN;
- break;
- case BDI_RLO:
- case BDI_RLE:
- n_nest++;
- if (greater_odd(level) <= BIDI_LEVEL_MAX)
- {
- plevel[ich] = greater_odd(level);
- pcls[ich] = BDI_BN;
- ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_RLE ? BDI_N : BDI_R),
- &pcls[ich+1], &plevel[ich+1],
- cch - (ich+1), n_nest);
- n_nest--;
- continue;
- }
- cls = pcls[ich] = BDI_BN;
- break;
- case BDI_PDF:
- cls = pcls[ich] = BDI_BN;
- if (n_nest)
- {
- if (nLastValid < n_nest)
- {
- n_nest--;
- }
- else
- {
- cch = ich; // break the loop, but complete body
- }
- }
- break;
- }
- // Apply the override
- if (dir != BDI_N)
- {
- cls = dir;
- }
- plevel[ich] = level;
- if (pcls[ich] != BDI_BN)
- pcls[ich] = cls;
- }
- return ich;
- }
- // === RESOLVE WEAK TYPES ================================================
- enum bidi_state // possible states
- {
- xa, // arabic letter
- xr, // right letter
- xl, // left letter
- ao, // arabic lett. foll by ON
- ro, // right lett. foll by ON
- lo, // left lett. foll by ON
- rt, // ET following R
- lt, // ET following L
- cn, // EN, AN following AL
- ra, // arabic number foll R
- re, // european number foll R
- la, // arabic number foll L
- le, // european number foll L
- ac, // CS following cn
- rc, // CS following ra
- rs, // CS,ES following re
- lc, // CS following la
- ls, // CS,ES following le
- ret, // ET following re
- let // ET following le
- } ;
- static const unsigned char state_weak[][10] =
- {
- // N, L, R, AN, EN, AL,NSM, CS, ES, ET,
- /*xa*/ { ao, xl, xr, cn, cn, xa, xa, ao, ao, ao }, /* arabic letter */
- /*xr*/ { ro, xl, xr, ra, re, xa, xr, ro, ro, rt }, /* right letter */
- /*xl*/ { lo, xl, xr, la, le, xa, xl, lo, lo, lt }, /* left letter */
- /*ao*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* arabic lett. foll by ON*/
- /*ro*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* right lett. foll by ON */
- /*lo*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* left lett. foll by ON */
- /*rt*/ { ro, xl, xr, ra, re, xa, rt, ro, ro, rt }, /* ET following R */
- /*lt*/ { lo, xl, xr, la, le, xa, lt, lo, lo, lt }, /* ET following L */
- /*cn*/ { ao, xl, xr, cn, cn, xa, cn, ac, ao, ao }, /* EN, AN following AL */
- /*ra*/ { ro, xl, xr, ra, re, xa, ra, rc, ro, rt }, /* arabic number foll R */
- /*re*/ { ro, xl, xr, ra, re, xa, re, rs, rs,ret }, /* european number foll R */
- /*la*/ { lo, xl, xr, la, le, xa, la, lc, lo, lt }, /* arabic number foll L */
- /*le*/ { lo, xl, xr, la, le, xa, le, ls, ls,let }, /* european number foll L */
- /*ac*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* CS following cn */
- /*rc*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS following ra */
- /*rs*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS,ES following re */
- /*lc*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS following la */
- /*ls*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS,ES following le */
- /*ret*/ { ro, xl, xr, ra, re, xa,ret, ro, ro,ret }, /* ET following re */
- /*let*/ { lo, xl, xr, la, le, xa,let, lo, lo,let } /* ET following le */
- };
- enum bidi_action // possible actions
- {
- // primitives
- IX = 0x100, // increment
- XX = 0xF, // no-op
- // actions
- xxx = (XX << 4) + XX, // no-op
- xIx = IX + xxx, // increment run
- xxN = (XX << 4) + BDI_ON, // set current to N
- xxE = (XX << 4) + BDI_EN, // set current to EN
- xxA = (XX << 4) + BDI_AN, // set current to AN
- xxR = (XX << 4) + BDI_R, // set current to R
- xxL = (XX << 4) + BDI_L, // set current to L
- Nxx = (BDI_ON << 4) + 0xF, // set run to neutral
- Axx = (BDI_AN << 4) + 0xF, // set run to AN
- ExE = (BDI_EN << 4) + BDI_EN, // set run to EN, set current to EN
- NIx = (BDI_ON << 4) + 0xF + IX, // set run to N, increment
- NxN = (BDI_ON << 4) + BDI_ON, // set run to N, set current to N
- NxR = (BDI_ON << 4) + BDI_R, // set run to N, set current to R
- NxE = (BDI_ON << 4) + BDI_EN, // set run to N, set current to EN
- AxA = (BDI_AN << 4) + BDI_AN, // set run to AN, set current to AN
- NxL = (BDI_ON << 4) + BDI_L, // set run to N, set current to L
- LxL = (BDI_L << 4) + BDI_L // set run to L, set current to L
- };
- typedef uint16_t fz_bidi_action;
- static const fz_bidi_action action_weak[][10] =
- {
- // N,.. L, R, AN, EN, AL, NSM, CS,..ES, ET,
- /*xa*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxR, xxN, xxN, xxN }, /* arabic letter */
- /*xr*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxR, xxN, xxN, xIx }, /* right letter */
- /*xl*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xIx }, /* left letter */
- /*ao*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxN, xxN, xxN, xxN }, /* arabic lett. foll by ON */
- /*ro*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxN, xxN, xxN, xIx }, /* right lett. foll by ON */
- /*lo*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxN, xxN, xxN, xIx }, /* left lett. foll by ON */
- /*rt*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, xIx, NxN, NxN, xIx }, /* ET following R */
- /*lt*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, xIx, NxN, NxN, xIx }, /* ET following L */
- /*cn*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxA, xIx, xxN, xxN }, /* EN, AN following AL */
- /*ra*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll R */
- /*re*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxE, xIx, xIx, xxE }, /* european number foll R */
- /*la*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll L */
- /*le*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xIx, xIx, xxL }, /* european number foll L */
- /*ac*/ { Nxx, Nxx, Nxx, Axx, AxA, NxR, NxN, NxN, NxN, NxN }, /* CS following cn */
- /*rc*/ { Nxx, Nxx, Nxx, Axx, NxE, NxR, NxN, NxN, NxN, NIx }, /* CS following ra */
- /*rs*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following re */
- /*lc*/ { Nxx, Nxx, Nxx, Axx, NxL, NxR, NxN, NxN, NxN, NIx }, /* CS following la */
- /*ls*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following le */
- /*ret*/{ xxx, xxx, xxx, xxx, xxE, xxR, xxE, xxN, xxN, xxE }, /* ET following re */
- /*let*/{ xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xxL } /* ET following le */
- };
- static
- fz_bidi_chartype get_deferred_type(fz_bidi_action action)
- {
- return (action >> 4) & 0xF;
- }
- static
- fz_bidi_chartype get_resolved_type(fz_bidi_action action)
- {
- return action & 0xF;
- }
- /* Note on action table:
- States can be of two kinds:
- - Immediate Resolution State, where each input token
- is resolved as soon as it is seen. These states have
- only single action codes (xxN) or the no-op (xxx)
- for static input tokens.
- - Deferred Resolution State, where input tokens either
- either extend the run (xIx) or resolve its Type (e.g. Nxx).
- Input classes are of three kinds
- - Static Input Token, where the class of the token remains
- unchanged on output (AN, L, N, R)
- - Replaced Input Token, where the class of the token is
- always replaced on output (AL, BDI_BN, NSM, CS, ES, ET)
- - Conditional Input Token, where the class of the token is
- changed on output in some but not all cases (EN)
- Where tokens are subject to change, a double action
- (e.g. NxA, or NxN) is _required_ after deferred states,
- resolving both the deferred state and changing the current token.
- These properties of the table are verified by assertions below.
- This code is needed only during debugging and maintenance
- */
- /*------------------------------------------------------------------------
- Function: resolveWeak
- Resolves the directionality of numeric and other weak character types
- Implements rules X10 and W1-W6 of the Unicode Bidirectional Algorithm.
- Input: Array of embedding levels
- Character count
- In/Out: Array of directional classes
- Note: On input only these directional classes are expected
- AL, HL, R, L, ON, BDI_BN, NSM, AN, EN, ES, ET, CS,
- ------------------------------------------------------------------------*/
- void fz_bidi_resolve_weak(fz_context *ctx, fz_bidi_level baselevel, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
- {
- int state = odd(baselevel) ? xr : xl;
- fz_bidi_chartype cls;
- size_t ich;
- fz_bidi_action action;
- fz_bidi_chartype cls_run;
- fz_bidi_chartype cls_new;
- fz_bidi_level level = baselevel;
- size_t cch_run = 0;
- for (ich = 0; ich < cch; ich++)
- {
- if (pcls[ich] > BDI_BN) {
- fz_warn(ctx, "error: pcls[%zu] > BN (%d)\n", ich, pcls[ich]);
- }
- // ignore boundary neutrals
- if (pcls[ich] == BDI_BN)
- {
- // must flatten levels unless at a level change;
- plevel[ich] = level;
- // lookahead for level changes
- if (ich + 1 == cch && level != baselevel)
- {
- // have to fixup last BN before end of the loop, since
- // its fix-upped value will be needed below the assert
- pcls[ich] = embedding_direction(level);
- }
- else if (ich + 1 < cch && level != plevel[ich+1] && pcls[ich+1] != BDI_BN)
- {
- // fixup LAST BN in front / after a level run to make
- // it act like the SOR/EOR in rule X10
- int newlevel = plevel[ich+1];
- if (level > newlevel) {
- newlevel = level;
- }
- plevel[ich] = newlevel;
- // must match assigned level
- pcls[ich] = embedding_direction(newlevel);
- level = plevel[ich+1];
- }
- else
- {
- // don't interrupt runs
- if (cch_run)
- {
- cch_run++;
- }
- continue;
- }
- }
- assert(pcls[ich] <= BDI_BN);
- cls = pcls[ich];
- action = action_weak[state][cls];
- // resolve the directionality for deferred runs
- cls_run = get_deferred_type(action);
- if (cls_run != XX)
- {
- set_deferred_run(pcls, cch_run, ich, cls_run);
- cch_run = 0;
- }
- // resolve the directionality class at the current location
- cls_new = get_resolved_type(action);
- if (cls_new != XX)
- pcls[ich] = cls_new;
- // increment a deferred run
- if (IX & action)
- cch_run++;
- state = state_weak[state][cls];
- }
- // resolve any deferred runs
- // use the direction of the current level to emulate PDF
- cls = embedding_direction(level);
- // resolve the directionality for deferred runs
- cls_run = get_deferred_type(action_weak[state][cls]);
- if (cls_run != XX)
- set_deferred_run(pcls, cch_run, ich, cls_run);
- }
- // === RESOLVE NEUTRAL TYPES ==============================================
- // action values
- enum neutral_action
- {
- // action to resolve previous input
- nL = BDI_L, // resolve EN to L
- En = 3 << 4, // resolve neutrals run to embedding level direction
- Rn = BDI_R << 4, // resolve neutrals run to strong right
- Ln = BDI_L << 4, // resolved neutrals run to strong left
- In = (1<<8), // increment count of deferred neutrals
- LnL = (1<<4)+BDI_L // set run and EN to L
- };
- static fz_bidi_chartype
- get_deferred_neutrals(fz_bidi_action action, fz_bidi_level level)
- {
- action = (action >> 4) & 0xF;
- if (action == (En >> 4))
- return embedding_direction(level);
- else
- return action;
- }
- static fz_bidi_chartype get_resolved_neutrals(fz_bidi_action action)
- {
- action = action & 0xF;
- /* RJW: The original code does:
- * if (action == In)
- * return 0;
- * else
- * return action;
- * BUT, this can never be true, as In == 256.
- * This upsets coverity. We fix this here, but include this note
- * so that we understand what changed in case we ever update to
- * a newer release of the bidirectional code.
- */
- return action;
- }
- // state values
- enum neutral_state
- {
- // new temporary class
- r, // R and characters resolved to R
- l, // L and characters resolved to L
- rn, // N preceded by right
- ln, // N preceded by left
- a, // AN preceded by left (the abbrev 'la' is used up above)
- na // N preceded by a
- } ;
- /*------------------------------------------------------------------------
- Notes:
- By rule W7, whenever a EN is 'dominated' by an L (including start of
- run with embedding direction = L) it is resolved to, and further treated
- as L.
- This leads to the need for 'a' and 'na' states.
- ------------------------------------------------------------------------*/
- static const int action_neutrals[][5] =
- {
- // N, L, R, AN, EN, = cls
- // state =
- {In, 0, 0, 0, 0}, // r right
- {In, 0, 0, 0, BDI_L}, // l left
- {In, En, Rn, Rn, Rn}, // rn N preceded by right
- {In, Ln, En, En, LnL}, // ln N preceded by left
- {In, 0, 0, 0, BDI_L}, // a AN preceded by left
- {In, En, Rn, Rn, En} // na N preceded by a
- } ;
- static const int state_neutrals[][5] =
- {
- // N, L, R, AN, EN = cls
- // state =
- {rn, l, r, r, r}, // r right
- {ln, l, r, a, l}, // l left
- {rn, l, r, r, r}, // rn N preceded by right
- {ln, l, r, a, l}, // ln N preceded by left
- {na, l, r, a, l}, // a AN preceded by left
- {na, l, r, a, l} // na N preceded by la
- } ;
- /*------------------------------------------------------------------------
- Function: resolveNeutrals
- Resolves the directionality of neutral character types.
- Implements rules W7, N1 and N2 of the Unicode Bidi Algorithm.
- Input: Array of embedding levels
- Character count
- Baselevel
- In/Out: Array of directional classes
- Note: On input only these directional classes are expected
- R, L, N, AN, EN and BN
- W8 resolves a number of ENs to L
- ------------------------------------------------------------------------*/
- void fz_bidi_resolve_neutrals(fz_bidi_level baselevel, fz_bidi_chartype *pcls, const fz_bidi_level *plevel, size_t cch)
- {
- // the state at the start of text depends on the base level
- int state = odd(baselevel) ? r : l;
- fz_bidi_chartype cls;
- size_t ich;
- fz_bidi_chartype cls_run;
- size_t cch_run = 0;
- fz_bidi_level level = baselevel;
- for (ich = 0; ich < cch; ich++)
- {
- int action;
- fz_bidi_chartype cls_new;
- // ignore boundary neutrals
- if (pcls[ich] == BDI_BN)
- {
- // include in the count for a deferred run
- if (cch_run)
- cch_run++;
- // skip any further processing
- continue;
- }
- assert(pcls[ich] < 5); // "Only N, L, R, AN, EN are allowed"
- cls = pcls[ich];
- action = action_neutrals[state][cls];
- // resolve the directionality for deferred runs
- cls_run = get_deferred_neutrals(action, level);
- if (cls_run != BDI_N)
- {
- set_deferred_run(pcls, cch_run, ich, cls_run);
- cch_run = 0;
- }
- // resolve the directionality class at the current location
- cls_new = get_resolved_neutrals(action);
- if (cls_new != BDI_N)
- pcls[ich] = cls_new;
- if (In & action)
- cch_run++;
- state = state_neutrals[state][cls];
- level = plevel[ich];
- }
- // resolve any deferred runs
- cls = embedding_direction(level); // eor has type of current level
- // resolve the directionality for deferred runs
- cls_run = get_deferred_neutrals(action_neutrals[state][cls], level);
- if (cls_run != BDI_N)
- set_deferred_run(pcls, cch_run, ich, cls_run);
- }
- // === RESOLVE IMPLICITLY =================================================
- /*------------------------------------------------------------------------
- Function: resolveImplicit
- Recursively resolves implicit embedding levels.
- Implements rules I1 and I2 of the Unicode Bidirectional Algorithm.
- Input: Array of direction classes
- Character count
- Base level
- In/Out: Array of embedding levels
- Note: levels may exceed 15 on output.
- Accepted subset of direction classes
- R, L, AN, EN
- ------------------------------------------------------------------------*/
- static const fz_bidi_level add_level[][4] =
- {
- // L, R, AN, EN = cls
- // level =
- /* even */ { 0, 1, 2, 2 }, // EVEN
- /* odd */ { 1, 0, 1, 1 } // ODD
- };
- void fz_bidi_resolve_implicit(const fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
- {
- size_t ich;
- for (ich = 0; ich < cch; ich++)
- {
- // cannot resolve bn here, since some bn were resolved to strong
- // types in resolveWeak. To remove these we need the original
- // types, which are available again in resolveWhiteSpace
- if (pcls[ich] == BDI_BN)
- {
- continue;
- }
- assert(pcls[ich] > 0); // "No Neutrals allowed to survive here."
- assert(pcls[ich] < 5); // "Out of range."
- plevel[ich] += add_level[odd(plevel[ich])][pcls[ich] - 1];
- }
- }
- #if 0
- // === REORDER ===========================================================
- /*------------------------------------------------------------------------
- Function: resolve_lines
- Breaks a paragraph into lines
- Input: Character count
- Array of line break flags
- In/Out: Array of characters
- Returns the count of characters on the first line
- Note: This function only breaks lines at hard line breaks. Other
- line breaks can be passed in. If pbrk[n] is true, then a break
- occurs after the character in psz_input[n]. Breaks before the first
- character are not allowed.
- ------------------------------------------------------------------------*/
- static int resolve_lines(uint32_t *psz_input, int *pbrk, int cch)
- {
- int ich;
- // skip characters not of type LS
- for(ich = 0; ich < cch; ich++)
- {
- if (psz_input[ich] == chLS || (pbrk && pbrk[ich]))
- {
- ich++;
- break;
- }
- }
- return ich;
- }
- #endif
- /*------------------------------------------------------------------------
- Function: fz_bidi_resolve_whitespace
- Resolves levels for WS and S
- Implements rule L1 of the Unicode bidi Algorithm.
- Input: Base embedding level
- Character count
- Array of direction classes (for one line of text)
- In/Out: Array of embedding levels (for one line of text)
- Note: this should be applied a line at a time. The default driver
- code supplied in this file assumes a single line of text; for
- a real implementation, cch and the initial pointer values
- would have to be adjusted.
- ------------------------------------------------------------------------*/
- void fz_bidi_resolve_whitespace(fz_bidi_level baselevel, const fz_bidi_chartype *pcls, fz_bidi_level *plevel,
- size_t cch)
- {
- size_t cchrun = 0;
- fz_bidi_level oldlevel = baselevel;
- size_t ich;
- for (ich = 0; ich < cch; ich++)
- {
- switch(pcls[ich])
- {
- default:
- cchrun = 0; // any other character breaks the run
- break;
- case BDI_WS:
- cchrun++;
- break;
- case BDI_RLE:
- case BDI_LRE:
- case BDI_LRO:
- case BDI_RLO:
- case BDI_PDF:
- case BDI_BN:
- plevel[ich] = oldlevel;
- cchrun++;
- break;
- case BDI_S:
- case BDI_B:
- // reset levels for WS before eot
- set_deferred_level_run(plevel, cchrun, ich, baselevel);
- cchrun = 0;
- plevel[ich] = baselevel;
- break;
- }
- oldlevel = plevel[ich];
- }
- // reset level before eot
- set_deferred_level_run(plevel, cchrun, ich, baselevel);
- }
- #ifdef BIDI_LINE_AT_A_TIME
- /*------------------------------------------------------------------------
- Functions: reorder/reorderLevel
- Recursively reorders the display string
- "From the highest level down, reverse all characters at that level and
- higher, down to the lowest odd level"
- Implements rule L2 of the Unicode bidi Algorithm.
- Input: Array of embedding levels
- Character count
- Flag enabling reversal (set to false by initial caller)
- In/Out: Text to reorder
- Note: levels may exceed 15 resp. 61 on input.
- Rule L3 - reorder combining marks is not implemented here
- Rule L4 - glyph mirroring is implemented as a display option below
- Note: this should be applied a line at a time
- -------------------------------------------------------------------------*/
- static int reorderLevel(fz_bidi_level level, uint32_t *psz_text, const fz_bidi_level *plevel, int cch,
- int f_reverse)
- {
- int ich;
- // true as soon as first odd level encountered
- f_reverse = f_reverse || odd(level);
- for (ich = 0; ich < cch; ich++)
- {
- if (plevel[ich] < level)
- {
- break;
- }
- else if (plevel[ich] > level)
- {
- ich += reorderLevel(level + 1, psz_text + ich, plevel + ich,
- cch - ich, f_reverse) - 1;
- }
- }
- if (f_reverse)
- {
- reverse(psz_text, ich);
- }
- return ich;
- }
- int Bidi_reorder(fz_bidi_level baselevel, uint32_t *psz_text, const fz_bidi_level *plevel, int cch)
- {
- int ich = 0;
- while (ich < cch)
- {
- ich += reorderLevel(baselevel, psz_text + ich, plevel + ich,
- cch - ich, FALSE);
- }
- return ich;
- }
- #endif
|