cmapdump.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. #!/usr/bin/env python3
  2. # Parse a CMap file and dump it as a C struct.
  3. import sys
  4. # Decode a subset of CMap syntax (only what is needed for our built-in resources)
  5. # We require that tokens are whitespace separated.
  6. def dumpcmap(filename):
  7. codespacerange = []
  8. usecmap = ""
  9. cmapname = ""
  10. wmode = 0
  11. map = {}
  12. def tocode(s):
  13. if s[0] == '<' and s[-1] == '>':
  14. return int(s[1:-1], 16)
  15. return int(s, 10)
  16. def map_cidchar(lo, v):
  17. map[lo] = v
  18. def map_cidrange(lo, hi, v):
  19. while lo <= hi:
  20. map[lo] = v
  21. lo = lo + 1
  22. v = v + 1
  23. def add_bf(lo, v):
  24. # Decode unicode surrogate pairs
  25. if len(v) == 2 and v[0] >= 0xd800 and v[0] <= 0xdbff and v[1] >= 0xdc00 and v[1] <= 0xdfff:
  26. map[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000
  27. elif len(v) == 1:
  28. map[lo] = v[0]
  29. elif len(v) <= 8:
  30. map[lo] = v[:]
  31. else:
  32. print("/* warning: too long one-to-many mapping: %s */" % (v))
  33. def map_bfchar(lo, bf):
  34. bf = bf[1:-1] # drop < >
  35. v = [int(bf[i:i+4],16) for i in range(0, len(bf), 4)]
  36. add_bf(lo, v)
  37. def map_bfrange(lo, hi, bf):
  38. bf = bf[1:-1] # drop < >
  39. v = [int(bf[i:i+4],16) for i in range(0, len(bf), 4)]
  40. while lo <= hi:
  41. add_bf(lo, v)
  42. lo = lo + 1
  43. v[-1] = v[-1] + 1
  44. current = None
  45. for line in open(filename, "r").readlines():
  46. if line[0] == '%':
  47. continue
  48. line = line.strip().split()
  49. if len(line) == 0:
  50. continue
  51. if line[0] == '/CMapName':
  52. cmapname = line[1][1:]
  53. elif line[0] == '/WMode':
  54. wmode = int(line[1])
  55. elif len(line) > 1 and line[1] == 'usecmap':
  56. usecmap = line[0][1:]
  57. elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange'
  58. elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange'
  59. elif len(line) > 1 and line[1] == 'beginbfrange': current = 'bfrange'
  60. elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar'
  61. elif len(line) > 1 and line[1] == 'beginbfchar': current = 'bfchar'
  62. elif line[0] == 'begincodespacerange': current = 'codespacerange'
  63. elif line[0] == 'begincidrange': current = 'cidrange'
  64. elif line[0] == 'beginbfrange': current = 'bfrange'
  65. elif line[0] == 'begincidchar': current = 'cidchar'
  66. elif line[0] == 'beginbfchar': current = 'bfchar'
  67. elif line[0].startswith("end"):
  68. current = None
  69. elif current == 'codespacerange' and len(line) == 2:
  70. n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1])
  71. codespacerange.append((n, a, b))
  72. elif current == 'cidrange' and len(line) == 3:
  73. a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2])
  74. map_cidrange(a, b, c)
  75. elif current == 'cidchar' and len(line) == 2:
  76. a, b = tocode(line[0]), tocode(line[1])
  77. map_cidchar(a, b)
  78. elif current == 'bfchar' and len(line) == 2:
  79. a, b = tocode(line[0]), line[1]
  80. map_bfchar(a, b)
  81. elif current == 'bfrange' and len(line) == 3:
  82. a, b, c = tocode(line[0]), tocode(line[1]), line[2]
  83. map_bfrange(a, b, c)
  84. # Create ranges
  85. ranges = []
  86. xranges = []
  87. mranges = []
  88. mdata = []
  89. out_lo = -100
  90. out_hi = -100
  91. out_v_lo = 0
  92. out_v_hi = 0
  93. def flush_range():
  94. if out_lo >= 0:
  95. if out_lo > 0xffff or out_hi > 0xffff or out_v_lo > 0xffff:
  96. xranges.append((out_lo, out_hi, out_v_lo))
  97. else:
  98. ranges.append((out_lo, out_hi, out_v_lo))
  99. keys = list(map.keys())
  100. keys.sort()
  101. for code in keys:
  102. v = map[code]
  103. if type(v) is not int:
  104. flush_range()
  105. out_lo = out_hi = -100
  106. mranges.append((code, len(mdata)))
  107. mdata.append(len(v))
  108. mdata.extend(v)
  109. else:
  110. if code != out_hi + 1 or v != out_v_hi + 1:
  111. flush_range()
  112. out_lo = out_hi = code
  113. out_v_lo = out_v_hi = v
  114. else:
  115. out_hi = out_hi + 1
  116. out_v_hi = out_v_hi + 1
  117. flush_range()
  118. # Print C file
  119. cname = cmapname.replace('-', '_')
  120. print()
  121. print("/*", cmapname, "*/")
  122. print()
  123. if len(ranges) > 0:
  124. print("static const pdf_range cmap_%s_ranges[] = {" % cname)
  125. for r in ranges:
  126. print("{0x%x,0x%x,0x%x}," % r)
  127. print("};")
  128. print()
  129. if len(xranges) > 0:
  130. print("static const pdf_xrange cmap_%s_xranges[] = {" % cname)
  131. for r in xranges:
  132. print("{0x%x,0x%x,0x%x}," % r)
  133. print("};")
  134. print()
  135. if len(mranges) > 0:
  136. print("static const pdf_mrange cmap_%s_mranges[] = {" % cname)
  137. for r in mranges:
  138. print("{0x%x,0x%x}," % r)
  139. print("};")
  140. print()
  141. print("static const int cmap_%s_table[] = {" % cname)
  142. n = mdata[0]
  143. i = 0
  144. for r in mdata:
  145. if i <= n:
  146. sys.stdout.write("0x%x," % r)
  147. i = i + 1
  148. else:
  149. sys.stdout.write("\n0x%x," % r)
  150. i = 1
  151. n = r
  152. sys.stdout.write("\n")
  153. print("};")
  154. print()
  155. print("static pdf_cmap cmap_%s = {" % cname)
  156. print("\t{ -1, pdf_drop_cmap_imp },")
  157. print("\t/* cmapname */ \"%s\"," % cmapname)
  158. print("\t/* usecmap */ \"%s\", NULL," % usecmap)
  159. print("\t/* wmode */ %d," % wmode)
  160. print("\t/* codespaces */ %d, {" % len(codespacerange))
  161. if len(codespacerange) > 0:
  162. for codespace in codespacerange:
  163. fmt = "\t\t{ %%d, 0x%%0%dx, 0x%%0%dx }," % (codespace[0]*2, codespace[0]*2)
  164. print(fmt % codespace)
  165. else:
  166. print("\t\t{ 0, 0, 0 },")
  167. print("\t},")
  168. if len(ranges) > 0:
  169. print("\t%d, %d, (pdf_range*)cmap_%s_ranges," % (len(ranges),len(ranges),cname))
  170. else:
  171. print("\t0, 0, NULL, /* ranges */")
  172. if len(xranges) > 0:
  173. print("\t%d, %d, (pdf_xrange*)cmap_%s_xranges," % (len(xranges),len(xranges),cname))
  174. else:
  175. print("\t0, 0, NULL, /* xranges */")
  176. if len(mranges) > 0:
  177. print("\t%d, %d, (pdf_mrange*)cmap_%s_mranges," % (len(mranges),len(mranges),cname))
  178. else:
  179. print("\t0, 0, NULL, /* mranges */")
  180. if len(mdata) > 0:
  181. print("\t%d, %d, (int*)cmap_%s_table," % (len(mdata),len(mdata),cname))
  182. else:
  183. print("\t0, 0, NULL, /* table */")
  184. print("\t0, 0, 0, NULL /* splay tree */")
  185. print("};")
  186. print("/* This is an automatically generated file. Do not edit. */")
  187. for arg in sys.argv[1:]:
  188. dumpcmap(arg)