bidi.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  1. /*
  2. * Bidirectional text processing.
  3. *
  4. * Processes unicode text by arranging the characters into an order suitable
  5. * for display. E.g. Hebrew text will be arranged from right-to-left and
  6. * any English within the text will remain in the left-to-right order.
  7. * Characters such as parenthesis will be substituted for their mirrored
  8. * equivalents if they are part of text which must be reversed.
  9. *
  10. * This is an implementation of the unicode Bidirectional Algorithm which
  11. * can be found here: http://www.unicode.org/reports/tr9/ and is based
  12. * on the reference implementation of the algorithm found on that page.
  13. *
  14. * For a nice overview of how it works, read this...
  15. * http://www.w3.org/TR/REC-html40/struct/dirlang.html
  16. *
  17. * Extracted from the SmartOffice code, where it was modified by Ian
  18. * Beveridge.
  19. *
  20. * Copyright (C) Picsel, 2004. All Rights Reserved.
  21. */
  22. /*
  23. * Original copyright notice from unicode reference implementation.
  24. * ----------------------------------------------------------------
  25. * Written by: Asmus Freytag
  26. * C++ and Windows dependencies removed, and
  27. * command line interface added by: Rick McGowan
  28. *
  29. * Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
  30. */
  31. /*
  32. * Includes...
  33. */
  34. #include "mupdf/fitz.h"
  35. #include "mupdf/ucdn.h"
  36. #include "bidi-imp.h" /* standard bidi code interface */
  37. #include <assert.h>
  38. /*
  39. * Macros...
  40. */
  41. #define ODD(x) ((x) & 1)
  42. #define REPLACEABLE_TYPE(t) ( \
  43. ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
  44. ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
  45. ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )
  46. #ifdef DEBUG_BIDI_VERBOSE
  47. #define DBUGVF(params) do { fz_warn params; } while (0)
  48. #else
  49. #define DBUGVF(params) do {} while (0)
  50. #endif
  51. #ifdef DEBUG_BIDI_OUTLINE
  52. #define DBUGH(params) do { fz_warn params; } while (0)
  53. #else
  54. #define DBUGH(params) do {} while (0)
  55. #endif
  56. #define UNICODE_EOS 0
  57. #define UNICODE_DIGIT_ZERO 0x0030
  58. #define UNICODE_DIGIT_NINE 0x0039
  59. #define UNICODE_SUPERSCRIPT_TWO 0x00B2
  60. #define UNICODE_SUPERSCRIPT_THREE 0x00B3
  61. #define UNICODE_SUPERSCRIPT_ONE 0x00B9
  62. #define UNICODE_RTL_START 0x0590
  63. #define UNICODE_RTL_END 0x07BF
  64. #define UNICODE_ARABIC_INDIC_DIGIT_ZERO 0x0660
  65. #define UNICODE_ARABIC_INDIC_DIGIT_NINE 0x0669
  66. #define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0
  67. #define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9
  68. #define UNICODE_ZERO_WIDTH_NON_JOINER 0x200C
  69. #define UNICODE_SUPERSCRIPT_ZERO 0x2070
  70. #define UNICODE_SUPERSCRIPT_FOUR 0x2074
  71. #define UNICODE_SUPERSCRIPT_NINE 0x2079
  72. #define UNICODE_SUBSCRIPT_ZERO 0x2080
  73. #define UNICODE_SUBSCRIPT_NINE 0x2089
  74. #define UNICODE_CIRCLED_DIGIT_ONE 0x2460
  75. #define UNICODE_NUMBER_TWENTY_FULL_STOP 0x249B
  76. #define UNICODE_CIRCLED_DIGIT_ZERO 0x24EA
  77. #define UNICODE_FULLWIDTH_DIGIT_ZERO 0xFF10
  78. #define UNICODE_FULLWIDTH_DIGIT_NINE 0xFF19
  79. #ifndef TRUE
  80. #define TRUE (1)
  81. #endif
  82. #ifndef FALSE
  83. #define FALSE (0)
  84. #endif
  85. /*
  86. * Enumerations...
  87. */
  88. #ifdef DEBUG_BIDI_VERBOSE
  89. /* display support: */
  90. static const char char_from_types[] =
  91. {
  92. ' ', /* ON */
  93. '>', /* L */
  94. '<', /* R */
  95. '9', /* AN */
  96. '1', /* EN */
  97. 'a', /* AL */
  98. '@', /* NSM */
  99. '.', /* CS */
  100. ',', /* ES */
  101. '$', /* ET */
  102. ':', /* BN */
  103. 'X', /* S */
  104. '_', /* WS */
  105. 'B', /* B */
  106. '+', /* RLO */
  107. '+', /* RLE */
  108. '+', /* LRO */
  109. '+', /* LRE */
  110. '-', /* PDF */
  111. '=' /* LS */
  112. };
  113. #endif
  114. /*
  115. * Functions and static functions...
  116. */
  117. /* UCDN uses a different ordering than Bidi does. We cannot
  118. * change to the UCDN ordering, as the bidi-std.c code relies
  119. * on the exact ordering (at least that N = ON = 0). We
  120. * therefore map between the two using this small table. It
  121. * also takes care of fudging LRI, RLI, FSI and PDI, that this
  122. * code does not currently support. */
  123. static const uint8_t ucdn_to_bidi[] =
  124. {
  125. BDI_L, /* UCDN_BIDI_CLASS_L = 0 */
  126. BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */
  127. BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */
  128. BDI_R, /* UCDN_BIDI_CLASS_R = 3 */
  129. BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */
  130. BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */
  131. BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */
  132. BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */
  133. BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */
  134. BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */
  135. BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */
  136. BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */
  137. BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */
  138. BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */
  139. BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */
  140. BDI_B, /* UCDN_BIDI_CLASS_B = 15 */
  141. BDI_S, /* UCDN_BIDI_CLASS_S = 16 */
  142. BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */
  143. BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */
  144. BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */
  145. BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */
  146. BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */
  147. BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */
  148. };
  149. #define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
  150. /* Return a direction for white-space on the second pass of the algorithm. */
  151. static fz_bidi_chartype class_from_ch_n(uint32_t ch)
  152. {
  153. fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);
  154. if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS)
  155. return BDI_N;
  156. return from_ch_ws;
  157. }
  158. /* Split fragments into single scripts (or punctuation + single script) */
  159. static void
  160. split_at_script(const uint32_t *fragment,
  161. size_t fragment_len,
  162. int level,
  163. void *arg,
  164. fz_bidi_fragment_fn *callback)
  165. {
  166. int script = UCDN_SCRIPT_COMMON;
  167. size_t script_start, i;
  168. script_start = 0;
  169. for (i = 0; i < fragment_len; i++)
  170. {
  171. int s = ucdn_get_script(fragment[i]);
  172. if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED)
  173. {
  174. /* Punctuation etc. This is fine. */
  175. }
  176. else if (s == script)
  177. {
  178. /* Same script. Still fine. */
  179. }
  180. else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED)
  181. {
  182. /* First non punctuation thing. Set the script. */
  183. script = s;
  184. }
  185. else
  186. {
  187. /* Change of script. Break the fragment. */
  188. (*callback)(&fragment[script_start], i - script_start, level, script, arg);
  189. script_start = i;
  190. script = s;
  191. }
  192. }
  193. if (script_start != fragment_len)
  194. {
  195. (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
  196. }
  197. }
  198. /* Determines the character classes for all following
  199. * passes of the algorithm. A character class is basically the type of Bidi
  200. * behaviour that the character exhibits.
  201. */
  202. static void
  203. classify_characters(const uint32_t *text,
  204. fz_bidi_chartype *types,
  205. size_t len,
  206. fz_bidi_flags flags)
  207. {
  208. size_t i;
  209. if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0)
  210. {
  211. for (i = 0; i < len; i++)
  212. {
  213. types[i] = class_from_ch_ws(text[i]);
  214. }
  215. }
  216. else
  217. {
  218. #ifdef DEBUG_BIDI_VERBOSE
  219. fprintf(stderr, "Text: ");
  220. for (i = 0; i < len; i++)
  221. {
  222. /* So that we can actually sort of read the debug string, any
  223. * non-ascii characters are replaced with a 1-digit hash
  224. * value from 0-9, making non-english characters appear
  225. * as numbers
  226. */
  227. fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ?
  228. text[i] : text[i] % 9 + '0');
  229. }
  230. fprintf(stderr, "\nTypes: ");
  231. #endif
  232. for (i = 0; i < len; i++)
  233. {
  234. types[i] = class_from_ch_n(text[i]);
  235. #ifdef DEBUG_BIDI_VERBOSE
  236. fprintf(stderr, "%c", char_from_types[(int)types[i]]);
  237. #endif
  238. }
  239. #ifdef DEBUG_BIDI_VERBOSE
  240. fprintf(stderr, "\n");
  241. #endif
  242. }
  243. }
  244. /* Determines the base level of the text.
  245. * Implements rule P2 of the Unicode Bidi Algorithm.
  246. * Note: Ignores explicit embeddings
  247. */
  248. static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
  249. {
  250. size_t i;
  251. for (i = 0; i < len; i++)
  252. {
  253. switch (types[i])
  254. {
  255. /* strong left */
  256. case BDI_L:
  257. return FZ_BIDI_LTR;
  258. /* strong right */
  259. case BDI_R:
  260. case BDI_AL:
  261. return FZ_BIDI_RTL;
  262. }
  263. }
  264. return FZ_BIDI_LTR;
  265. }
  266. static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
  267. {
  268. switch (type)
  269. {
  270. case BDI_L:
  271. case BDI_EN:
  272. return FZ_BIDI_LTR;
  273. case BDI_R:
  274. case BDI_AL:
  275. return FZ_BIDI_RTL;
  276. default:
  277. return FZ_BIDI_NEUTRAL;
  278. }
  279. }
  280. static void
  281. classify_quoted_blocks(const uint32_t *text,
  282. fz_bidi_chartype *types,
  283. size_t len)
  284. {
  285. size_t i;
  286. int inQuote = FALSE;
  287. int pdfNeeded = FALSE;
  288. int ltrFound = FALSE;
  289. int rtlFound = FALSE;
  290. /* Only do anything special here if there is mixed content
  291. * (LTR *and* RTL) in the text.
  292. */
  293. for (i = 0; i < len; i++)
  294. {
  295. switch (direction_from_type(types[i]))
  296. {
  297. case FZ_BIDI_LTR:
  298. ltrFound = TRUE;
  299. break;
  300. case FZ_BIDI_RTL:
  301. rtlFound = TRUE;
  302. break;
  303. default:
  304. break;
  305. }
  306. }
  307. /* Only make any changes if *both* LTR and RTL characters exist
  308. * in this text.
  309. */
  310. if (!ltrFound || !rtlFound)
  311. {
  312. return;
  313. }
  314. for (i = 0; i < len; i++)
  315. {
  316. if (text[i]=='"')
  317. {
  318. /* If we're already in a quote then terminate it,
  319. * else start a new block.
  320. */
  321. if (inQuote)
  322. {
  323. inQuote = FALSE;
  324. if (pdfNeeded)
  325. {
  326. pdfNeeded = FALSE;
  327. types[i] = BDI_PDF;
  328. }
  329. }
  330. else
  331. {
  332. size_t j;
  333. int done = FALSE;
  334. inQuote = TRUE;
  335. /* Find the first strong right or left type and
  336. * use that to determine whether we should classify
  337. * the quote as LRE or RLE. Or neither, if we
  338. * hit another quote before any strongly-directional
  339. * character.
  340. */
  341. for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j)
  342. {
  343. switch(types[j])
  344. {
  345. case BDI_RLE:
  346. case BDI_LRE:
  347. done = TRUE;
  348. break;
  349. case BDI_L:
  350. case BDI_EN:
  351. types[i] = BDI_LRE;
  352. pdfNeeded = TRUE;
  353. done = TRUE;
  354. break;
  355. case BDI_R:
  356. case BDI_AL:
  357. types[i] = BDI_RLE;
  358. pdfNeeded = TRUE;
  359. done = TRUE;
  360. break;
  361. default:
  362. break;
  363. }
  364. }
  365. }
  366. }
  367. }
  368. }
  369. /* Creates a buffer with an embedding level for every character in the
  370. * given text. Also determines the base level and returns it in
  371. * *baseDir if *baseDir does not initially contain a valid direction.
  372. */
  373. static fz_bidi_level *
  374. create_levels(fz_context *ctx,
  375. const uint32_t *text,
  376. size_t len,
  377. fz_bidi_direction *baseDir,
  378. int resolveWhiteSpace,
  379. int flags)
  380. {
  381. fz_bidi_level *levels, *plevels;
  382. fz_bidi_chartype *types = NULL;
  383. fz_bidi_chartype *ptypes;
  384. fz_bidi_level baseLevel;
  385. const uint32_t *ptext;
  386. size_t plen, remaining;
  387. levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels");
  388. fz_var(types);
  389. fz_try(ctx)
  390. {
  391. types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
  392. classify_characters(text, types, len, flags);
  393. if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL)
  394. {
  395. /* Derive the base level from the text and
  396. * update *baseDir in case the caller wants to know.
  397. */
  398. baseLevel = base_level_from_text(types, len);
  399. *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
  400. }
  401. else
  402. {
  403. baseLevel = (fz_bidi_level)*baseDir;
  404. }
  405. {
  406. /* Replace tab with base direction, i.e. make tab appear as
  407. * 'strong left' if the base direction is left-to-right and
  408. * 'strong right' if base direction is right-to-left. This
  409. * allows Layout to implicitly treat tabs as 'segment separators'.
  410. */
  411. size_t i;
  412. for (i = 0u; i < len; i++)
  413. {
  414. if (text[i]=='\t')
  415. {
  416. types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
  417. }
  418. }
  419. }
  420. /* Look for quotation marks. Classify them as RLE or LRE
  421. * or leave them alone, depending on what follows them.
  422. */
  423. classify_quoted_blocks(text, types, len);
  424. /* Work one paragraph at a time. */
  425. plevels = levels;
  426. ptypes = types;
  427. ptext = text;
  428. remaining = len;
  429. while (remaining)
  430. {
  431. plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
  432. /* Work out the levels and character types... */
  433. (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
  434. fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
  435. fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
  436. fz_bidi_resolve_implicit(ptypes, plevels, plen);
  437. classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
  438. if (resolveWhiteSpace)
  439. {
  440. /* resolve whitespace */
  441. fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
  442. }
  443. plevels += plen;
  444. ptypes += plen;
  445. ptext += plen;
  446. remaining -= plen;
  447. }
  448. /* The levels buffer now has odd and even numbers indicating
  449. * rtl or ltr characters, respectively.
  450. */
  451. #ifdef DEBUG_BIDI_VERBOSE
  452. fprintf(stderr, "Levels: ");
  453. {
  454. size_t i;
  455. for (i = 0; i < len; i++)
  456. {
  457. fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
  458. }
  459. fprintf(stderr, "\n");
  460. }
  461. #endif
  462. }
  463. fz_always(ctx)
  464. {
  465. fz_free(ctx, types);
  466. }
  467. fz_catch(ctx)
  468. {
  469. fz_free(ctx, levels);
  470. fz_rethrow(ctx);
  471. }
  472. return levels;
  473. }
  474. /* Partitions the given character sequence into one or more unidirectional
  475. * fragments and invokes the given callback function for each fragment.
  476. */
  477. void fz_bidi_fragment_text(fz_context *ctx,
  478. const uint32_t *text,
  479. size_t textlen,
  480. fz_bidi_direction *baseDir,
  481. fz_bidi_fragment_fn *callback,
  482. void *arg,
  483. int flags)
  484. {
  485. size_t startOfFragment;
  486. size_t i;
  487. fz_bidi_level *levels;
  488. if (text == NULL || callback == NULL || textlen == 0)
  489. return;
  490. DBUGH((ctx, "fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));
  491. levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
  492. /* We now have an array with an embedding level
  493. * for each character in text.
  494. */
  495. assert(levels != NULL);
  496. fz_try(ctx)
  497. {
  498. startOfFragment = 0;
  499. for (i = 1; i < textlen; i++)
  500. {
  501. if (levels[i] != levels[i-1])
  502. {
  503. /* We've gone past the end of the fragment.
  504. * Create a text object for it, then start
  505. * a new fragment.
  506. */
  507. split_at_script(&text[startOfFragment],
  508. i - startOfFragment,
  509. levels[startOfFragment],
  510. arg,
  511. callback);
  512. startOfFragment = i;
  513. }
  514. }
  515. /* Now i == textlen. Deal with the final (or maybe only) fragment. */
  516. /* otherwise create 1 fragment */
  517. split_at_script(&text[startOfFragment],
  518. i - startOfFragment,
  519. levels[startOfFragment],
  520. arg,
  521. callback);
  522. }
  523. fz_always(ctx)
  524. {
  525. fz_free(ctx, levels);
  526. }
  527. fz_catch(ctx)
  528. {
  529. fz_rethrow(ctx);
  530. }
  531. }