string.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include <string.h>
  24. #include <errno.h>
  25. #include <math.h>
  26. #include <float.h>
  27. #include <stdlib.h>
  28. #ifdef _WIN32
  29. #include <windows.h> /* for MultiByteToWideChar etc. */
  30. #endif
  31. #include "utfdata.h"
  32. static const int *
  33. fz_ucd_bsearch(int c, const int *t, int n, int ne)
  34. {
  35. const int *p;
  36. int m;
  37. while (n > 1)
  38. {
  39. m = n/2;
  40. p = t + m*ne;
  41. if (c >= p[0])
  42. {
  43. t = p;
  44. n = n - m;
  45. }
  46. else
  47. {
  48. n = m;
  49. }
  50. }
  51. if (n && c >= t[0])
  52. return t;
  53. return 0;
  54. }
  55. int
  56. fz_tolower(int c)
  57. {
  58. const int *p;
  59. /* Make ASCII fast. */
  60. if (c < 128)
  61. {
  62. if (c >= 'A' && c <= 'Z')
  63. c += 'a' - 'A';
  64. return c;
  65. }
  66. p = fz_ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2) / 3, 3);
  67. if (p && c >= p[0] && c <= p[1])
  68. return c + p[2];
  69. p = fz_ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1) / 2, 2);
  70. if (p && c == p[0])
  71. return c + p[1];
  72. return c;
  73. }
  74. int
  75. fz_toupper(int c)
  76. {
  77. const int *p;
  78. p = fz_ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2) / 3, 3);
  79. if (p && c >= p[0] && c <= p[1])
  80. return c + p[2];
  81. p = fz_ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1) / 2, 2);
  82. if (p && c == p[0])
  83. return c + p[1];
  84. return c;
  85. }
  86. size_t
  87. fz_strnlen(const char *s, size_t n)
  88. {
  89. const char *p = memchr(s, 0, n);
  90. return p ? (size_t) (p - s) : n;
  91. }
  92. int
  93. fz_strncasecmp(const char *a, const char *b, size_t n)
  94. {
  95. while (n > 0)
  96. {
  97. int ucs_a, ucs_b, n_a, n_b;
  98. n_a = fz_chartorunen(&ucs_a, a, n);
  99. n_b = fz_chartorunen(&ucs_b, b, n);
  100. /* We believe that for all unicode characters X and Y, s.t.
  101. * fz_tolower(X) == fz_tolower(Y), X and Y must utf8 encode to
  102. * the same number of bytes. */
  103. assert(n_a == n_b);
  104. assert((size_t)n_a <= n);
  105. // one or both of the strings are short
  106. if (ucs_a == 0 || ucs_b == 0)
  107. return ucs_a - ucs_b;
  108. if (ucs_a != ucs_b)
  109. {
  110. ucs_a = fz_tolower(ucs_a);
  111. ucs_b = fz_tolower(ucs_b);
  112. }
  113. if (ucs_a != ucs_b)
  114. return ucs_a - ucs_b;
  115. a += n_a;
  116. b += n_b;
  117. n -= n_a;
  118. }
  119. return 0;
  120. }
  121. int
  122. fz_strcasecmp(const char *a, const char *b)
  123. {
  124. while (1)
  125. {
  126. int ucs_a, ucs_b;
  127. a += fz_chartorune(&ucs_a, a);
  128. b += fz_chartorune(&ucs_b, b);
  129. ucs_a = fz_tolower(ucs_a);
  130. ucs_b = fz_tolower(ucs_b);
  131. if (ucs_a == ucs_b)
  132. {
  133. if (ucs_a == 0)
  134. return 0;
  135. }
  136. else
  137. return ucs_a - ucs_b;
  138. }
  139. }
  140. char *
  141. fz_strsep(char **stringp, const char *delim)
  142. {
  143. char *ret = *stringp;
  144. if (!ret) return NULL;
  145. if ((*stringp = strpbrk(*stringp, delim)) != NULL)
  146. *((*stringp)++) = '\0';
  147. return ret;
  148. }
  149. size_t
  150. fz_strlcpy(char *dst, const char *src, size_t siz)
  151. {
  152. register char *d = dst;
  153. register const char *s = src;
  154. register size_t n = siz;
  155. /* Copy as many bytes as will fit */
  156. if (n != 0 && --n != 0) {
  157. do {
  158. if ((*d++ = *s++) == 0)
  159. break;
  160. } while (--n != 0);
  161. }
  162. /* Not enough room in dst, add NUL and traverse rest of src */
  163. if (n == 0) {
  164. if (siz != 0)
  165. *d = '\0'; /* NUL-terminate dst */
  166. while (*s++)
  167. ;
  168. }
  169. return(s - src - 1); /* count does not include NUL */
  170. }
  171. size_t
  172. fz_strlcat(char *dst, const char *src, size_t siz)
  173. {
  174. register char *d = dst;
  175. register const char *s = src;
  176. register size_t n = siz;
  177. size_t dlen;
  178. /* Find the end of dst and adjust bytes left but don't go past end */
  179. while (*d != '\0' && n-- != 0)
  180. d++;
  181. dlen = d - dst;
  182. n = siz - dlen;
  183. if (n == 0)
  184. return dlen + strlen(s);
  185. while (*s != '\0') {
  186. if (n != 1) {
  187. *d++ = *s;
  188. n--;
  189. }
  190. s++;
  191. }
  192. *d = '\0';
  193. return dlen + (s - src); /* count does not include NUL */
  194. }
  195. void
  196. fz_dirname(char *dir, const char *path, size_t n)
  197. {
  198. size_t i;
  199. if (!path || !path[0])
  200. {
  201. fz_strlcpy(dir, ".", n);
  202. return;
  203. }
  204. fz_strlcpy(dir, path, n);
  205. i = strlen(dir);
  206. for(; dir[i] == '/'; --i) if (!i) { fz_strlcpy(dir, "/", n); return; }
  207. for(; dir[i] != '/'; --i) if (!i) { fz_strlcpy(dir, ".", n); return; }
  208. for(; dir[i] == '/'; --i) if (!i) { fz_strlcpy(dir, "/", n); return; }
  209. dir[i+1] = 0;
  210. }
  211. const char *
  212. fz_basename(const char *path)
  213. {
  214. const char *name = strrchr(path, '/');
  215. if (!name)
  216. name = strrchr(path, '\\');
  217. if (!name)
  218. return path;
  219. return name + 1;
  220. }
  221. #ifdef _WIN32
  222. char *fz_realpath(const char *path, char *buf)
  223. {
  224. wchar_t wpath[PATH_MAX];
  225. wchar_t wbuf[PATH_MAX];
  226. int i;
  227. if (!MultiByteToWideChar(CP_UTF8, 0, path, -1, wpath, PATH_MAX))
  228. return NULL;
  229. if (!GetFullPathNameW(wpath, PATH_MAX, wbuf, NULL))
  230. return NULL;
  231. if (!WideCharToMultiByte(CP_UTF8, 0, wbuf, -1, buf, PATH_MAX, NULL, NULL))
  232. return NULL;
  233. for (i=0; buf[i]; ++i)
  234. if (buf[i] == '\\')
  235. buf[i] = '/';
  236. return buf;
  237. }
  238. #else
  239. char *fz_realpath(const char *path, char *buf)
  240. {
  241. return realpath(path, buf);
  242. }
  243. #endif
  244. static inline int ishex(int a)
  245. {
  246. return (a >= 'A' && a <= 'F') ||
  247. (a >= 'a' && a <= 'f') ||
  248. (a >= '0' && a <= '9');
  249. }
  250. static inline int tohex(int c)
  251. {
  252. if (c >= '0' && c <= '9') return c - '0';
  253. if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
  254. if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
  255. return 0;
  256. }
  257. #define URIRESERVED ";/?:@&=+$,"
  258. #define URIALPHA "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  259. #define URIDIGIT "0123456789"
  260. #define URIMARK "-_.!~*'()"
  261. #define URIUNESCAPED URIALPHA URIDIGIT URIMARK
  262. #define HEX "0123456789ABCDEF"
  263. /* Same as fz_decode_uri_component but in-place */
  264. char *
  265. fz_urldecode(char *url)
  266. {
  267. char *s = url;
  268. char *p = url;
  269. while (*s)
  270. {
  271. int c = (unsigned char) *s++;
  272. if (c == '%' && ishex(s[0]) && ishex(s[1]))
  273. {
  274. int a = tohex(*s++);
  275. int b = tohex(*s++);
  276. *p++ = a << 4 | b;
  277. }
  278. else
  279. {
  280. *p++ = c;
  281. }
  282. }
  283. *p = 0;
  284. return url;
  285. }
  286. char *
  287. fz_decode_uri_component(fz_context *ctx, const char *s)
  288. {
  289. char *uri = fz_malloc(ctx, strlen(s) + 1);
  290. char *p = uri;
  291. while (*s)
  292. {
  293. int c = (unsigned char) *s++;
  294. if (c == '%' && ishex(s[0]) && ishex(s[1]))
  295. {
  296. int a = tohex(*s++);
  297. int b = tohex(*s++);
  298. *p++ = a << 4 | b;
  299. }
  300. else
  301. {
  302. *p++ = c;
  303. }
  304. }
  305. *p = 0;
  306. return uri;
  307. }
  308. char *
  309. fz_decode_uri(fz_context *ctx, const char *s)
  310. {
  311. char *uri = fz_malloc(ctx, strlen(s) + 1);
  312. char *p = uri;
  313. while (*s)
  314. {
  315. int c = (unsigned char) *s++;
  316. if (c == '%' && ishex(s[0]) && ishex(s[1]))
  317. {
  318. int a = tohex(*s++);
  319. int b = tohex(*s++);
  320. c = a << 4 | b;
  321. if (strchr(URIRESERVED "#", c)) {
  322. *p++ = '%';
  323. *p++ = HEX[a];
  324. *p++ = HEX[b];
  325. } else {
  326. *p++ = c;
  327. }
  328. }
  329. else
  330. {
  331. *p++ = c;
  332. }
  333. }
  334. *p = 0;
  335. return uri;
  336. }
  337. static char *
  338. fz_encode_uri_imp(fz_context *ctx, const char *s, const char *unescaped)
  339. {
  340. char *uri = fz_malloc(ctx, strlen(s) * 3 + 1); /* allocate enough for worst case */
  341. char *p = uri;
  342. while (*s)
  343. {
  344. int c = (unsigned char) *s++;
  345. if (strchr(unescaped, c))
  346. {
  347. *p++ = c;
  348. }
  349. else
  350. {
  351. *p++ = '%';
  352. *p++ = HEX[(c >> 4) & 15];
  353. *p++ = HEX[(c) & 15];
  354. }
  355. }
  356. *p = 0;
  357. return uri;
  358. }
  359. char *
  360. fz_encode_uri_component(fz_context *ctx, const char *s)
  361. {
  362. return fz_encode_uri_imp(ctx, s, URIUNESCAPED);
  363. }
  364. char *
  365. fz_encode_uri_pathname(fz_context *ctx, const char *s)
  366. {
  367. return fz_encode_uri_imp(ctx, s, URIUNESCAPED "/");
  368. }
  369. char *
  370. fz_encode_uri(fz_context *ctx, const char *s)
  371. {
  372. return fz_encode_uri_imp(ctx, s, URIUNESCAPED URIRESERVED "#");
  373. }
  374. void
  375. fz_format_output_path(fz_context *ctx, char *path, size_t size, const char *fmt, int page)
  376. {
  377. const char *s, *p;
  378. char num[40];
  379. int i, n;
  380. int z = 0;
  381. for (i = 0; page; page /= 10)
  382. num[i++] = '0' + page % 10;
  383. num[i] = 0;
  384. s = p = strchr(fmt, '%');
  385. if (p)
  386. {
  387. ++p;
  388. while (*p >= '0' && *p <= '9')
  389. z = z * 10 + (*p++ - '0');
  390. }
  391. if (p && *p == 'd')
  392. {
  393. ++p;
  394. }
  395. else
  396. {
  397. s = p = strrchr(fmt, '.');
  398. if (!p)
  399. s = p = fmt + strlen(fmt);
  400. }
  401. if (z < 1)
  402. z = 1;
  403. while (i < z && i < (int)sizeof num)
  404. num[i++] = '0';
  405. n = s - fmt;
  406. if (n + i + strlen(p) >= size)
  407. fz_throw(ctx, FZ_ERROR_ARGUMENT, "path name buffer overflow");
  408. memcpy(path, fmt, n);
  409. while (i > 0)
  410. path[n++] = num[--i];
  411. fz_strlcpy(path + n, p, size - n);
  412. }
  413. #define SEP(x) ((x)=='/' || (x) == 0)
  414. char *
  415. fz_cleanname(char *name)
  416. {
  417. char *p, *q, *dotdot;
  418. int rooted;
  419. rooted = name[0] == '/';
  420. /*
  421. * invariants:
  422. * p points at beginning of path element we're considering.
  423. * q points just past the last path element we wrote (no slash).
  424. * dotdot points just past the point where .. cannot backtrack
  425. * any further (no slash).
  426. */
  427. p = q = dotdot = name + rooted;
  428. while (*p)
  429. {
  430. if(p[0] == '/') /* null element */
  431. p++;
  432. else if (p[0] == '.' && SEP(p[1]))
  433. p += 1; /* don't count the separator in case it is nul */
  434. else if (p[0] == '.' && p[1] == '.' && SEP(p[2]))
  435. {
  436. p += 2;
  437. if (q > dotdot) /* can backtrack */
  438. {
  439. while(--q > dotdot && *q != '/')
  440. ;
  441. }
  442. else if (!rooted) /* /.. is / but ./../ is .. */
  443. {
  444. if (q != name)
  445. *q++ = '/';
  446. *q++ = '.';
  447. *q++ = '.';
  448. dotdot = q;
  449. }
  450. }
  451. else /* real path element */
  452. {
  453. if (q != name+rooted)
  454. *q++ = '/';
  455. while ((*q = *p) != '/' && *q != 0)
  456. p++, q++;
  457. }
  458. }
  459. if (q == name) /* empty string is really "." */
  460. *q++ = '.';
  461. *q = '\0';
  462. return name;
  463. }
  464. char *
  465. fz_cleanname_strdup(fz_context *ctx, const char *name)
  466. {
  467. size_t len = strlen(name);
  468. char *newname = fz_malloc(ctx, fz_maxz(2, len + 1));
  469. memcpy(newname, name, len + 1);
  470. newname[len] = '\0';
  471. return fz_cleanname(newname);
  472. }
  473. enum
  474. {
  475. UTFmax = 4, /* maximum bytes per rune */
  476. Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
  477. Runeself = 0x80, /* rune and UTF sequences are the same (<) */
  478. Runeerror = 0xFFFD, /* decoding error in UTF */
  479. Runemax = 0x10FFFF, /* maximum rune value */
  480. };
  481. enum
  482. {
  483. Bit1 = 7,
  484. Bitx = 6,
  485. Bit2 = 5,
  486. Bit3 = 4,
  487. Bit4 = 3,
  488. Bit5 = 2,
  489. T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
  490. Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
  491. T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
  492. T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
  493. T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
  494. T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
  495. Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
  496. Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
  497. Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
  498. Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
  499. Maskx = (1<<Bitx)-1, /* 0011 1111 */
  500. Testx = Maskx ^ 0xFF, /* 1100 0000 */
  501. Bad = Runeerror,
  502. };
  503. int
  504. fz_chartorune(int *rune, const char *str)
  505. {
  506. int c, c1, c2, c3;
  507. int l;
  508. /* overlong null character */
  509. if((unsigned char)str[0] == 0xc0 && (unsigned char)str[1] == 0x80) {
  510. *rune = 0;
  511. return 2;
  512. }
  513. /*
  514. * one character sequence
  515. * 00000-0007F => T1
  516. */
  517. c = *(const unsigned char*)str;
  518. if(c < Tx) {
  519. *rune = c;
  520. return 1;
  521. }
  522. /*
  523. * two character sequence
  524. * 0080-07FF => T2 Tx
  525. */
  526. c1 = *(const unsigned char*)(str+1) ^ Tx;
  527. if(c1 & Testx)
  528. goto bad;
  529. if(c < T3) {
  530. if(c < T2)
  531. goto bad;
  532. l = ((c << Bitx) | c1) & Rune2;
  533. if(l <= Rune1)
  534. goto bad;
  535. *rune = l;
  536. return 2;
  537. }
  538. /*
  539. * three character sequence
  540. * 0800-FFFF => T3 Tx Tx
  541. */
  542. c2 = *(const unsigned char*)(str+2) ^ Tx;
  543. if(c2 & Testx)
  544. goto bad;
  545. if(c < T4) {
  546. l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
  547. if(l <= Rune2)
  548. goto bad;
  549. *rune = l;
  550. return 3;
  551. }
  552. /*
  553. * four character sequence (21-bit value)
  554. * 10000-1FFFFF => T4 Tx Tx Tx
  555. */
  556. c3 = *(const unsigned char*)(str+3) ^ Tx;
  557. if (c3 & Testx)
  558. goto bad;
  559. if (c < T5) {
  560. l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
  561. if (l <= Rune3)
  562. goto bad;
  563. *rune = l;
  564. return 4;
  565. }
  566. /*
  567. * Support for 5-byte or longer UTF-8 would go here, but
  568. * since we don't have that, we'll just fall through to bad.
  569. */
  570. /*
  571. * bad decoding
  572. */
  573. bad:
  574. *rune = Bad;
  575. return 1;
  576. }
  577. int
  578. fz_chartorunen(int *rune, const char *str, size_t n)
  579. {
  580. int c, c1, c2, c3;
  581. int l;
  582. if (n < 1)
  583. goto bad;
  584. /*
  585. * one character sequence
  586. * 00000-0007F => T1
  587. */
  588. c = *(const unsigned char*)str;
  589. if(c < Tx) {
  590. *rune = c;
  591. return 1;
  592. }
  593. if (n < 2)
  594. goto bad;
  595. /* overlong null character */
  596. if((unsigned char)str[0] == 0xc0 && (unsigned char)str[1] == 0x80) {
  597. *rune = 0;
  598. return 2;
  599. }
  600. /*
  601. * two character sequence
  602. * 0080-07FF => T2 Tx
  603. */
  604. c1 = *(const unsigned char*)(str+1) ^ Tx;
  605. if(c1 & Testx)
  606. goto bad;
  607. if(c < T3) {
  608. if(c < T2)
  609. goto bad;
  610. l = ((c << Bitx) | c1) & Rune2;
  611. if(l <= Rune1)
  612. goto bad;
  613. *rune = l;
  614. return 2;
  615. }
  616. if (n < 3)
  617. goto bad;
  618. /*
  619. * three character sequence
  620. * 0800-FFFF => T3 Tx Tx
  621. */
  622. c2 = *(const unsigned char*)(str+2) ^ Tx;
  623. if(c2 & Testx)
  624. goto bad;
  625. if(c < T4) {
  626. l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
  627. if(l <= Rune2)
  628. goto bad;
  629. *rune = l;
  630. return 3;
  631. }
  632. if (n < 4)
  633. goto bad;
  634. /*
  635. * four character sequence (21-bit value)
  636. * 10000-1FFFFF => T4 Tx Tx Tx
  637. */
  638. c3 = *(const unsigned char*)(str+3) ^ Tx;
  639. if (c3 & Testx)
  640. goto bad;
  641. if (c < T5) {
  642. l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
  643. if (l <= Rune3)
  644. goto bad;
  645. *rune = l;
  646. return 4;
  647. }
  648. /*
  649. * Support for 5-byte or longer UTF-8 would go here, but
  650. * since we don't have that, we'll just fall through to bad.
  651. */
  652. /*
  653. * bad decoding
  654. */
  655. bad:
  656. *rune = Bad;
  657. return 1;
  658. }
  659. int
  660. fz_runetochar(char *str, int rune)
  661. {
  662. /* Runes are signed, so convert to unsigned for range check. */
  663. unsigned int c = (unsigned int)rune;
  664. /* overlong null character */
  665. if (c == 0) {
  666. ((unsigned char *)str)[0] = 0xc0;
  667. ((unsigned char *)str)[1] = 0x80;
  668. return 2;
  669. }
  670. /*
  671. * one character sequence
  672. * 00000-0007F => 00-7F
  673. */
  674. if(c <= Rune1) {
  675. str[0] = c;
  676. return 1;
  677. }
  678. /*
  679. * two character sequence
  680. * 0080-07FF => T2 Tx
  681. */
  682. if(c <= Rune2) {
  683. str[0] = T2 | (c >> 1*Bitx);
  684. str[1] = Tx | (c & Maskx);
  685. return 2;
  686. }
  687. /*
  688. * If the Rune is out of range, convert it to the error rune.
  689. * Do this test here because the error rune encodes to three bytes.
  690. * Doing it earlier would duplicate work, since an out of range
  691. * Rune wouldn't have fit in one or two bytes.
  692. */
  693. if (c > Runemax)
  694. c = Runeerror;
  695. /*
  696. * three character sequence
  697. * 0800-FFFF => T3 Tx Tx
  698. */
  699. if (c <= Rune3) {
  700. str[0] = T3 | (c >> 2*Bitx);
  701. str[1] = Tx | ((c >> 1*Bitx) & Maskx);
  702. str[2] = Tx | (c & Maskx);
  703. return 3;
  704. }
  705. /*
  706. * four character sequence (21-bit value)
  707. * 10000-1FFFFF => T4 Tx Tx Tx
  708. */
  709. str[0] = T4 | (c >> 3*Bitx);
  710. str[1] = Tx | ((c >> 2*Bitx) & Maskx);
  711. str[2] = Tx | ((c >> 1*Bitx) & Maskx);
  712. str[3] = Tx | (c & Maskx);
  713. return 4;
  714. }
  715. int
  716. fz_runelen(int c)
  717. {
  718. char str[10];
  719. return fz_runetochar(str, c);
  720. }
  721. int
  722. fz_runeidx(const char *s, const char *p)
  723. {
  724. int rune;
  725. int i = 0;
  726. while (s < p) {
  727. if (*(unsigned char *)s < Runeself)
  728. ++s;
  729. else
  730. s += fz_chartorune(&rune, s);
  731. ++i;
  732. }
  733. return i;
  734. }
  735. const char *
  736. fz_runeptr(const char *s, int i)
  737. {
  738. int rune;
  739. while (i-- > 0) {
  740. rune = *(unsigned char*)s;
  741. if (rune < Runeself) {
  742. if (rune == 0)
  743. return NULL;
  744. ++s;
  745. } else
  746. s += fz_chartorune(&rune, s);
  747. }
  748. return s;
  749. }
  750. int
  751. fz_utflen(const char *s)
  752. {
  753. int c, n, rune;
  754. n = 0;
  755. for(;;) {
  756. c = *(const unsigned char*)s;
  757. if(c < Runeself) {
  758. if(c == 0)
  759. return n;
  760. s++;
  761. } else
  762. s += fz_chartorune(&rune, s);
  763. n++;
  764. }
  765. }
  766. float fz_atof(const char *s)
  767. {
  768. float result;
  769. if (s == NULL)
  770. return 0;
  771. errno = 0;
  772. result = fz_strtof(s, NULL);
  773. if ((errno == ERANGE && result == 0) || isnan(result))
  774. /* Return 1.0 on underflow, as it's a small known value that won't cause a divide by 0. */
  775. return 1;
  776. result = fz_clamp(result, -FLT_MAX, FLT_MAX);
  777. return result;
  778. }
  779. int fz_atoi(const char *s)
  780. {
  781. if (s == NULL)
  782. return 0;
  783. return atoi(s);
  784. }
  785. int64_t fz_atoi64(const char *s)
  786. {
  787. if (s == NULL)
  788. return 0;
  789. return atoll(s);
  790. }
  791. size_t fz_atoz(const char *s)
  792. {
  793. int64_t i;
  794. if (s == NULL)
  795. return 0;
  796. i = atoll(s);
  797. if (i < 0 || (int64_t)(size_t)i != i)
  798. return 0;
  799. return (size_t)i;
  800. }
  801. int fz_is_page_range(fz_context *ctx, const char *s)
  802. {
  803. /* TODO: check the actual syntax... */
  804. while (*s)
  805. {
  806. if ((*s < '0' || *s > '9') && *s != 'N' && *s != '-' && *s != ',')
  807. return 0;
  808. s++;
  809. }
  810. return 1;
  811. }
  812. const char *fz_parse_page_range(fz_context *ctx, const char *s, int *a, int *b, int n)
  813. {
  814. const char *orig = s;
  815. if (!s || !s[0])
  816. return NULL;
  817. if (s[0] == ',')
  818. s += 1;
  819. if (s[0] == 'N')
  820. {
  821. *a = n;
  822. s += 1;
  823. }
  824. else
  825. *a = strtol(s, (char**)&s, 10);
  826. if (s[0] == '-')
  827. {
  828. if (s[1] == 'N')
  829. {
  830. *b = n;
  831. s += 2;
  832. }
  833. else
  834. *b = strtol(s+1, (char**)&s, 10);
  835. }
  836. else
  837. *b = *a;
  838. if (*a < 0) *a = n + 1 + *a;
  839. if (*b < 0) *b = n + 1 + *b;
  840. *a = fz_clampi(*a, 1, n);
  841. *b = fz_clampi(*b, 1, n);
  842. if (s == orig)
  843. {
  844. fz_warn(ctx, "skipping invalid page range");
  845. return NULL;
  846. }
  847. return s;
  848. }
  849. /* memmem from musl */
  850. #define MAX(a,b) ((a)>(b)?(a):(b))
  851. #define BITOP(a,b,op) \
  852. ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof *(a))))
  853. static char *twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
  854. {
  855. uint16_t nw = n[0]<<8 | n[1], hw = h[0]<<8 | h[1];
  856. for (h++, k--; k; k--, hw = hw<<8 | *++h)
  857. if (hw == nw) return (char *)h-1;
  858. return 0;
  859. }
  860. static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
  861. {
  862. uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8;
  863. uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8;
  864. for (h+=2, k-=2; k; k--, hw = (hw|*++h)<<8)
  865. if (hw == nw) return (char *)h-2;
  866. return 0;
  867. }
  868. static char *fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
  869. {
  870. uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
  871. uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
  872. for (h+=3, k-=3; k; k--, hw = hw<<8 | *++h)
  873. if (hw == nw) return (char *)h-3;
  874. return 0;
  875. }
  876. static char *twoway_memmem(const unsigned char *h, const unsigned char *z, const unsigned char *n, size_t l)
  877. {
  878. size_t i, ip, jp, k, p, ms, p0, mem, mem0;
  879. size_t byteset[32 / sizeof(size_t)] = { 0 };
  880. size_t shift[256];
  881. /* Computing length of needle and fill shift table */
  882. for (i=0; i<l; i++)
  883. BITOP(byteset, n[i], |=), shift[n[i]] = i+1;
  884. /* Compute maximal suffix */
  885. ip = (size_t)-1; jp = 0; k = p = 1;
  886. while (jp+k<l) {
  887. if (n[ip+k] == n[jp+k]) {
  888. if (k == p) {
  889. jp += p;
  890. k = 1;
  891. } else k++;
  892. } else if (n[ip+k] > n[jp+k]) {
  893. jp += k;
  894. k = 1;
  895. p = jp - ip;
  896. } else {
  897. ip = jp++;
  898. k = p = 1;
  899. }
  900. }
  901. ms = ip;
  902. p0 = p;
  903. /* And with the opposite comparison */
  904. ip = (size_t)-1; jp = 0; k = p = 1;
  905. while (jp+k<l) {
  906. if (n[ip+k] == n[jp+k]) {
  907. if (k == p) {
  908. jp += p;
  909. k = 1;
  910. } else k++;
  911. } else if (n[ip+k] < n[jp+k]) {
  912. jp += k;
  913. k = 1;
  914. p = jp - ip;
  915. } else {
  916. ip = jp++;
  917. k = p = 1;
  918. }
  919. }
  920. if (ip+1 > ms+1) ms = ip;
  921. else p = p0;
  922. /* Periodic needle? */
  923. if (memcmp(n, n+p, ms+1)) {
  924. mem0 = 0;
  925. p = MAX(ms, l-ms-1) + 1;
  926. } else mem0 = l-p;
  927. mem = 0;
  928. /* Search loop */
  929. for (;;) {
  930. /* If remainder of haystack is shorter than needle, done */
  931. if ((size_t)(z-h) < l) return 0;
  932. /* Check last byte first; advance by shift on mismatch */
  933. if (BITOP(byteset, h[l-1], &)) {
  934. k = l-shift[h[l-1]];
  935. if (k) {
  936. if (mem0 && mem && k < p) k = l-p;
  937. h += k;
  938. mem = 0;
  939. continue;
  940. }
  941. } else {
  942. h += l;
  943. mem = 0;
  944. continue;
  945. }
  946. /* Compare right half */
  947. for (k=MAX(ms+1,mem); k<l && n[k] == h[k]; k++);
  948. if (k < l) {
  949. h += k-ms;
  950. mem = 0;
  951. continue;
  952. }
  953. /* Compare left half */
  954. for (k=ms+1; k>mem && n[k-1] == h[k-1]; k--);
  955. if (k <= mem) return (char *)h;
  956. h += p;
  957. mem = mem0;
  958. }
  959. }
  960. void *fz_memmem(const void *h0, size_t k, const void *n0, size_t l)
  961. {
  962. const unsigned char *h = h0, *n = n0;
  963. /* Return immediately on empty needle */
  964. if (!l) return (void *)h;
  965. /* Return immediately when needle is longer than haystack */
  966. if (k<l) return 0;
  967. /* Use faster algorithms for short needles */
  968. h = memchr(h0, *n, k);
  969. if (!h || l==1) return (void *)h;
  970. k -= h - (const unsigned char *)h0;
  971. if (k<l) return 0;
  972. if (l==2) return twobyte_memmem(h, k, n);
  973. if (l==3) return threebyte_memmem(h, k, n);
  974. if (l==4) return fourbyte_memmem(h, k, n);
  975. return twoway_memmem(h, h+k, n, l);
  976. }
  977. char *
  978. fz_utf8_from_wchar(fz_context *ctx, const wchar_t *s)
  979. {
  980. const wchar_t *src = s;
  981. char *d;
  982. char *dst;
  983. int len = 1;
  984. while (*src)
  985. {
  986. len += fz_runelen(*src++);
  987. }
  988. d = Memento_label(fz_malloc(ctx, len), "utf8_from_wchar");
  989. dst = d;
  990. src = s;
  991. while (*src)
  992. {
  993. dst += fz_runetochar(dst, *src++);
  994. }
  995. *dst = 0;
  996. return d;
  997. }
  998. wchar_t *
  999. fz_wchar_from_utf8(fz_context *ctx, const char *path)
  1000. {
  1001. size_t z = 0;
  1002. const char *p = path;
  1003. wchar_t *wpath, *w;
  1004. if (!path)
  1005. return NULL;
  1006. while (*p)
  1007. {
  1008. int c;
  1009. p += fz_chartorune(&c, p);
  1010. z++;
  1011. if (c >= 0x10000)
  1012. z++;
  1013. }
  1014. w = wpath = fz_malloc(ctx, 2*(z+1));
  1015. while (*path)
  1016. {
  1017. int c;
  1018. path += fz_chartorune(&c, path);
  1019. if (c >= 0x10000)
  1020. {
  1021. c -= 0x10000;
  1022. *w++ = 0xd800 + (c>>10);
  1023. *w++ = 0xdc00 + (c&1023);
  1024. }
  1025. else
  1026. *w++ = c;
  1027. }
  1028. *w = 0;
  1029. return wpath;
  1030. }
  1031. const char *
  1032. fz_strstr(const char *haystack, const char *needle)
  1033. {
  1034. size_t matchlen = 0;
  1035. char d;
  1036. if (haystack == NULL || needle == NULL)
  1037. return NULL;
  1038. while ((d = needle[matchlen]) != 0)
  1039. {
  1040. char c = *haystack++;
  1041. if (c == 0)
  1042. return NULL;
  1043. if (c == d)
  1044. matchlen++;
  1045. else
  1046. {
  1047. haystack -= matchlen;
  1048. matchlen = 0;
  1049. }
  1050. }
  1051. return haystack - matchlen;
  1052. }
  1053. const char *
  1054. fz_strstrcase(const char *haystack, const char *needle)
  1055. {
  1056. size_t matchlen = 0;
  1057. size_t firstlen;
  1058. if (haystack == NULL || needle == NULL)
  1059. return NULL;
  1060. while (1)
  1061. {
  1062. int c, d;
  1063. int nc, nd;
  1064. nd = fz_chartorune(&d, &needle[matchlen]);
  1065. if (d == 0)
  1066. break;
  1067. nc = fz_chartorune(&c, haystack);
  1068. if (matchlen == 0)
  1069. firstlen = nc;
  1070. haystack += nc;
  1071. matchlen += nd;
  1072. if (c == 0)
  1073. return NULL;
  1074. if (c != d)
  1075. haystack -= matchlen - firstlen, matchlen = 0;
  1076. }
  1077. return haystack - matchlen;
  1078. }
  1079. static inline int my_isdigit(int c) {
  1080. return c >= '0' && c <= '9';
  1081. }
  1082. int
  1083. fz_strverscmp(const char *l0, const char *r0)
  1084. {
  1085. // This strverscmp implementation is borrowed from musl.
  1086. // Copyright © 2005-2020 Rich Felker, et al.
  1087. // Standard MIT license.
  1088. const unsigned char *l = (const void *)l0;
  1089. const unsigned char *r = (const void *)r0;
  1090. size_t i, dp, j;
  1091. int z = 1;
  1092. /* Find maximal matching prefix and track its maximal digit
  1093. * suffix and whether those digits are all zeros. */
  1094. for (dp=i=0; l[i]==r[i]; i++) {
  1095. int c = l[i];
  1096. if (!c) return 0;
  1097. if (!my_isdigit(c)) dp=i+1, z=1;
  1098. else if (c!='0') z=0;
  1099. }
  1100. if (l[dp]!='0' && r[dp]!='0') {
  1101. /* If we're not looking at a digit sequence that began
  1102. * with a zero, longest digit string is greater. */
  1103. for (j=i; my_isdigit(l[j]); j++)
  1104. if (!my_isdigit(r[j])) return 1;
  1105. if (my_isdigit(r[j])) return -1;
  1106. } else if (z && dp<i && (my_isdigit(l[i]) || my_isdigit(r[i]))) {
  1107. /* Otherwise, if common prefix of digit sequence is
  1108. * all zeros, digits order less than non-digits. */
  1109. return (unsigned char)(l[i]-'0') - (unsigned char)(r[i]-'0');
  1110. }
  1111. return l[i] - r[i];
  1112. }