pdf-unicode.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. // Copyright (C) 2004-2021 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #include "mupdf/pdf.h"
  24. #include <string.h>
  25. /* Load or synthesize ToUnicode map for fonts */
  26. static void
  27. pdf_remap_cmap_range(fz_context *ctx, pdf_cmap *ucs_from_gid,
  28. unsigned int cpt, unsigned int gid, unsigned int n, pdf_cmap *ucs_from_cpt)
  29. {
  30. unsigned int k;
  31. int ucsbuf[PDF_MRANGE_CAP];
  32. int ucslen;
  33. for (k = 0; k <= n; ++k)
  34. {
  35. ucslen = pdf_lookup_cmap_full(ucs_from_cpt, cpt + k, ucsbuf);
  36. if (ucslen == 1)
  37. pdf_map_range_to_range(ctx, ucs_from_gid, gid + k, gid + k, ucsbuf[0]);
  38. else if (ucslen > 1)
  39. pdf_map_one_to_many(ctx, ucs_from_gid, gid + k, ucsbuf, ucslen);
  40. }
  41. }
  42. static pdf_cmap *
  43. pdf_remap_cmap(fz_context *ctx, pdf_cmap *gid_from_cpt, pdf_cmap *ucs_from_cpt)
  44. {
  45. pdf_cmap *ucs_from_gid;
  46. unsigned int a, b, x;
  47. int i;
  48. ucs_from_gid = pdf_new_cmap(ctx);
  49. fz_try(ctx)
  50. {
  51. if (gid_from_cpt->usecmap)
  52. ucs_from_gid->usecmap = pdf_remap_cmap(ctx, gid_from_cpt->usecmap, ucs_from_cpt);
  53. pdf_add_codespace(ctx, ucs_from_gid, 0, 0x7fffffff, 4);
  54. for (i = 0; i < gid_from_cpt->rlen; ++i)
  55. {
  56. a = gid_from_cpt->ranges[i].low;
  57. b = gid_from_cpt->ranges[i].high;
  58. x = gid_from_cpt->ranges[i].out;
  59. pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
  60. }
  61. for (i = 0; i < gid_from_cpt->xlen; ++i)
  62. {
  63. a = gid_from_cpt->xranges[i].low;
  64. b = gid_from_cpt->xranges[i].high;
  65. x = gid_from_cpt->xranges[i].out;
  66. pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
  67. }
  68. /* Font encoding CMaps don't have one-to-many mappings, so we can ignore the mranges. */
  69. pdf_sort_cmap(ctx, ucs_from_gid);
  70. }
  71. fz_catch(ctx)
  72. {
  73. pdf_drop_cmap(ctx, ucs_from_gid);
  74. fz_rethrow(ctx);
  75. }
  76. return ucs_from_gid;
  77. }
  78. void
  79. pdf_load_to_unicode(fz_context *ctx, pdf_document *doc, pdf_font_desc *font,
  80. const char **strings, char *collection, pdf_obj *cmapstm)
  81. {
  82. unsigned int cpt;
  83. if (pdf_is_stream(ctx, cmapstm))
  84. {
  85. pdf_cmap *ucs_from_cpt = pdf_load_embedded_cmap(ctx, doc, cmapstm);
  86. fz_try(ctx)
  87. font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt);
  88. fz_always(ctx)
  89. pdf_drop_cmap(ctx, ucs_from_cpt);
  90. fz_catch(ctx)
  91. fz_rethrow(ctx);
  92. font->size += pdf_cmap_size(ctx, font->to_unicode);
  93. }
  94. else if (pdf_is_name(ctx, cmapstm))
  95. {
  96. pdf_cmap *ucs_from_cpt = pdf_load_system_cmap(ctx, pdf_to_name(ctx, cmapstm));
  97. fz_try(ctx)
  98. font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt);
  99. fz_always(ctx)
  100. pdf_drop_cmap(ctx, ucs_from_cpt);
  101. fz_catch(ctx)
  102. fz_rethrow(ctx);
  103. font->size += pdf_cmap_size(ctx, font->to_unicode);
  104. }
  105. else if (collection)
  106. {
  107. if (!strcmp(collection, "Adobe-CNS1"))
  108. font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-CNS1-UCS2");
  109. else if (!strcmp(collection, "Adobe-GB1"))
  110. font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-GB1-UCS2");
  111. else if (!strcmp(collection, "Adobe-Japan1"))
  112. font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Japan1-UCS2");
  113. else if (!strcmp(collection, "Adobe-Korea1"))
  114. font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Korea1-UCS2");
  115. }
  116. if (strings)
  117. {
  118. /* TODO one-to-many mappings */
  119. font->cid_to_ucs = Memento_label(fz_malloc_array(ctx, 256, unsigned short), "cid_to_ucs");
  120. font->cid_to_ucs_len = 256;
  121. font->size += 256 * sizeof *font->cid_to_ucs;
  122. for (cpt = 0; cpt < 256; cpt++)
  123. {
  124. if (strings[cpt])
  125. font->cid_to_ucs[cpt] = fz_unicode_from_glyph_name(strings[cpt]);
  126. else
  127. font->cid_to_ucs[cpt] = FZ_REPLACEMENT_CHARACTER;
  128. }
  129. }
  130. if (!font->to_unicode && !font->cid_to_ucs)
  131. {
  132. /* TODO: synthesize a ToUnicode if it's a freetype font with
  133. * cmap and/or post tables or if it has glyph names. */
  134. }
  135. }