ucdn.c 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. /*
  2. * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
  3. *
  4. * Permission to use, copy, modify, and/or distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. #include "mupdf/fitz.h"
  17. #include "mupdf/ucdn.h"
  18. #include <stdio.h>
  19. #include <stdlib.h>
  /*
   * Property record for a code point, as stored in the ucd_records table.
   * Field order is significant: it must match the positional initializers
   * of the generated table.
   */
  typedef struct {
      unsigned char category;          /* read by ucdn_get_general_category() */
      unsigned char combining;         /* read by ucdn_get_combining_class() */
      unsigned char bidi_class;        /* read by ucdn_get_bidi_class() */
      unsigned char east_asian_width;  /* read by ucdn_get_east_asian_width() */
      unsigned char script;            /* read by ucdn_get_script() */
      unsigned char linebreak_class;   /* read by ucdn_get_linebreak_class() */
  } UCDRecord;
  /* One entry of the mirror_pairs table: a 16-bit code point and its mirror. */
  typedef struct {
      unsigned short from;  /* lookup key (table is searched with bsearch) */
      unsigned short to;    /* value returned by ucdn_mirror() on a hit */
  } MirrorPair;
  /* One entry of the bracket_pairs table: a 16-bit code point, its partner and its type. */
  typedef struct {
      unsigned short from;  /* lookup key (table is searched with bsearch) */
      unsigned short to;    /* value returned by ucdn_paired_bracket() on a hit */
      unsigned char type;   /* value returned by ucdn_paired_bracket_type() on a hit */
  } BracketPair;
  /*
   * Maps a run of consecutive code points onto consecutive composition
   * indices: code points start .. start + count map to index .. index + count.
   */
  typedef struct {
      unsigned int start;  /* first code point of the run */
      short count;         /* number of code points following start in the run */
      short index;         /* composition index assigned to start */
  } Reindex;
  39. #include "ucdn_db.h"
  /* Constants required for algorithmic Hangul (de)composition. */
  enum {
      SBASE = 0xAC00,  /* start of the precomposed syllable range */
      LBASE = 0x1100,  /* start of the leading (L) range */
      VBASE = 0x1161,  /* start of the vowel (V) range */
      TBASE = 0x11A7,  /* base for trailing (T) values; T index 0 means "no trailing" */
      SCOUNT = 11172,  /* size of the syllable range */
      LCOUNT = 19,     /* size of the L range */
      VCOUNT = 21,     /* size of the V range */
      TCOUNT = 28,     /* number of T indices, including index 0 */
      NCOUNT = (VCOUNT * TCOUNT)  /* syllables per leading value */
  };
  50. static const UCDRecord *get_ucd_record(uint32_t code)
  51. {
  52. int index, offset;
  53. if (code >= 0x110000)
  54. index = 0;
  55. else {
  56. index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
  57. offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
  58. index = index1[index + offset] << SHIFT2;
  59. offset = code & ((1<<SHIFT2) - 1);
  60. index = index2[index + offset];
  61. }
  62. return &ucd_records[index];
  63. }
  64. static const unsigned short *get_decomp_record(uint32_t code)
  65. {
  66. int index, offset;
  67. if (code >= 0x110000)
  68. index = 0;
  69. else {
  70. index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
  71. << DECOMP_SHIFT1;
  72. offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
  73. index = decomp_index1[index + offset] << DECOMP_SHIFT2;
  74. offset = code & ((1<<DECOMP_SHIFT2) - 1);
  75. index = decomp_index2[index + offset];
  76. }
  77. return &decomp_data[index];
  78. }
  79. static int compare_reindex(const void *a, const void *b)
  80. {
  81. Reindex *ra = (Reindex *)a;
  82. Reindex *rb = (Reindex *)b;
  83. if (ra->start < rb->start)
  84. return -1;
  85. else if (ra->start > (rb->start + rb->count))
  86. return 1;
  87. else
  88. return 0;
  89. }
  90. static int get_comp_index(uint32_t code, const Reindex *idx, size_t len)
  91. {
  92. Reindex *res;
  93. Reindex r = {0, 0, 0};
  94. r.start = code;
  95. res = (Reindex *) bsearch(&r, idx, len, sizeof(Reindex), compare_reindex);
  96. if (res != NULL)
  97. return res->index + (code - res->start);
  98. else
  99. return -1;
  100. }
  101. static int compare_mp(const void *a, const void *b)
  102. {
  103. MirrorPair *mpa = (MirrorPair *)a;
  104. MirrorPair *mpb = (MirrorPair *)b;
  105. return mpa->from - mpb->from;
  106. }
  107. static int compare_bp(const void *a, const void *b)
  108. {
  109. BracketPair *bpa = (BracketPair *)a;
  110. BracketPair *bpb = (BracketPair *)b;
  111. return bpa->from - bpb->from;
  112. }
  113. static BracketPair *search_bp(uint32_t code)
  114. {
  115. BracketPair bp = {0,0,2};
  116. BracketPair *res;
  117. bp.from = code;
  118. res = (BracketPair *) bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN,
  119. sizeof(BracketPair), compare_bp);
  120. return res;
  121. }
  122. static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
  123. {
  124. int si = code - SBASE;
  125. if (si < 0 || si >= SCOUNT)
  126. return 0;
  127. if (si % TCOUNT) {
  128. /* LV,T */
  129. *a = SBASE + (si / TCOUNT) * TCOUNT;
  130. *b = TBASE + (si % TCOUNT);
  131. return 3;
  132. } else {
  133. /* L,V */
  134. *a = LBASE + (si / NCOUNT);
  135. *b = VBASE + (si % NCOUNT) / TCOUNT;
  136. return 2;
  137. }
  138. }
  139. static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
  140. {
  141. if (a >= SBASE && a < (SBASE + SCOUNT) && b >= TBASE && b < (TBASE + TCOUNT)) {
  142. /* LV,T */
  143. *code = a + (b - TBASE);
  144. return 3;
  145. } else if (a >= LBASE && a < (LBASE + LCOUNT) && b >= VBASE && b < (VBASE + VCOUNT)) {
  146. /* L,V */
  147. int li = a - LBASE;
  148. int vi = b - VBASE;
  149. *code = SBASE + li * NCOUNT + vi * TCOUNT;
  150. return 2;
  151. } else {
  152. return 0;
  153. }
  154. }
  /*
   * Decode one code point from a UTF-16 sequence and advance the cursor.
   *
   * *code_ptr is moved past the consumed unit(s).  A unit in the lead
   * surrogate range 0xD800..0xDBFF is combined with the following unit into
   * a supplementary code point (two units consumed); any other unit is
   * returned as is (one unit consumed).  The following unit is not validated.
   *
   * The upper bound of the lead range is 0xDBFF: 0xDC00 is a trail surrogate
   * and must not start a pair.
   */
  static uint32_t decode_utf16(const unsigned short **code_ptr)
  {
      const unsigned short *code = *code_ptr;
      uint32_t lead = code[0];

      if (lead < 0xd800 || lead > 0xdbff) {
          *code_ptr += 1;
          return lead;
      }

      *code_ptr += 2;
      return 0x10000 + ((uint32_t)code[1] - 0xdc00) + ((lead - 0xd800) << 10);
  }
  167. const char *ucdn_get_unicode_version(void)
  168. {
  169. return UNIDATA_VERSION;
  170. }
  171. int ucdn_get_combining_class(uint32_t code)
  172. {
  173. return get_ucd_record(code)->combining;
  174. }
  175. int ucdn_get_east_asian_width(uint32_t code)
  176. {
  177. return get_ucd_record(code)->east_asian_width;
  178. }
  179. int ucdn_get_general_category(uint32_t code)
  180. {
  181. return get_ucd_record(code)->category;
  182. }
  183. int ucdn_get_bidi_class(uint32_t code)
  184. {
  185. return get_ucd_record(code)->bidi_class;
  186. }
  /* Return 1 when ucdn_mirror() maps the code point to a different one, else 0. */
  int ucdn_get_mirrored(uint32_t code)
  {
      uint32_t mirrored = ucdn_mirror(code);

      return mirrored != code;
  }
  191. int ucdn_get_script(uint32_t code)
  192. {
  193. return get_ucd_record(code)->script;
  194. }
  195. int ucdn_get_linebreak_class(uint32_t code)
  196. {
  197. return get_ucd_record(code)->linebreak_class;
  198. }
  199. int ucdn_get_resolved_linebreak_class(uint32_t code)
  200. {
  201. const UCDRecord *record = get_ucd_record(code);
  202. switch (record->linebreak_class)
  203. {
  204. case UCDN_LINEBREAK_CLASS_AI:
  205. case UCDN_LINEBREAK_CLASS_SG:
  206. case UCDN_LINEBREAK_CLASS_XX:
  207. return UCDN_LINEBREAK_CLASS_AL;
  208. case UCDN_LINEBREAK_CLASS_SA:
  209. if (record->category == UCDN_GENERAL_CATEGORY_MC ||
  210. record->category == UCDN_GENERAL_CATEGORY_MN)
  211. return UCDN_LINEBREAK_CLASS_CM;
  212. return UCDN_LINEBREAK_CLASS_AL;
  213. case UCDN_LINEBREAK_CLASS_CJ:
  214. return UCDN_LINEBREAK_CLASS_NS;
  215. case UCDN_LINEBREAK_CLASS_CB:
  216. return UCDN_LINEBREAK_CLASS_B2;
  217. case UCDN_LINEBREAK_CLASS_NL:
  218. return UCDN_LINEBREAK_CLASS_BK;
  219. default:
  220. return record->linebreak_class;
  221. }
  222. }
  223. uint32_t ucdn_mirror(uint32_t code)
  224. {
  225. MirrorPair mp = {0};
  226. MirrorPair *res;
  227. mp.from = code;
  228. res = (MirrorPair *) bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN,
  229. sizeof(MirrorPair), compare_mp);
  230. if (res == NULL)
  231. return code;
  232. else
  233. return res->to;
  234. }
  235. uint32_t ucdn_paired_bracket(uint32_t code)
  236. {
  237. BracketPair *res = search_bp(code);
  238. if (res == NULL)
  239. return code;
  240. else
  241. return res->to;
  242. }
  243. int ucdn_paired_bracket_type(uint32_t code)
  244. {
  245. BracketPair *res = search_bp(code);
  246. if (res == NULL)
  247. return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE;
  248. else
  249. return res->type;
  250. }
  /*
   * Decompose a code point into at most two code points.
   *
   * Hangul syllables are handled algorithmically first.  Otherwise the
   * decomposition entry is consulted: the high byte of its first unit is the
   * number of code points in the mapping, and the mapping is used only when
   * the low byte is zero.
   * NOTE(review): the low byte presumably tags non-canonical mappings;
   * confirm against the table generator.
   *
   * Returns 1 and fills *a and *b (*b is 0 for a single-code-point mapping)
   * on success, 0 when nothing was decomposed.
   */
  int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
  {
      const unsigned short *entry;
      int count;

      if (hangul_pair_decompose(code, a, b))
          return 1;

      entry = get_decomp_record(code);
      count = entry[0] >> 8;
      if (count == 0 || (entry[0] & 0xff) != 0)
          return 0;

      /* skip the header unit; the UTF-16 encoded mapping follows */
      entry++;
      *a = decode_utf16(&entry);
      *b = (count > 1) ? decode_utf16(&entry) : 0;

      return 1;
  }
  269. int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
  270. {
  271. int l, r, index, indexi, offset;
  272. if (hangul_pair_compose(code, a, b))
  273. return 1;
  274. l = get_comp_index(a, nfc_first, sizeof(nfc_first) / sizeof(Reindex));
  275. r = get_comp_index(b, nfc_last, sizeof(nfc_last) / sizeof(Reindex));
  276. if (l < 0 || r < 0)
  277. return 0;
  278. indexi = l * TOTAL_LAST + r;
  279. index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
  280. offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
  281. index = comp_index1[index + offset] << COMP_SHIFT2;
  282. offset = indexi & ((1<<COMP_SHIFT2) - 1);
  283. *code = comp_data[index + offset];
  284. return *code != 0;
  285. }
  /*
   * Write the full stored decomposition of a code point into `decomposed`.
   *
   * The number of code points is taken from the high byte of the entry's
   * first unit; unlike ucdn_decompose(), the low byte is not examined.
   * Returns the number of code points written, or 0 when there is no mapping.
   * The caller must supply a buffer large enough for the returned count; the
   * required capacity is not established by this function.
   */
  int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
  {
      const unsigned short *entry = get_decomp_record(code);
      int count = entry[0] >> 8;
      uint32_t *out = decomposed;
      int remaining;

      /* skip the header unit; the UTF-16 encoded mapping follows */
      entry++;
      for (remaining = count; remaining > 0; remaining--)
          *out++ = decode_utf16(&entry);

      return count;
  }