cmapclean.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. #!/usr/bin/env python3
  2. # Parse a CMap file and dump it back out.
  3. import sys
  4. # Decode a subset of CMap syntax (only what is needed for our built-in resources)
  5. # We require that tokens are whitespace separated.
  6. def cleancmap(filename):
  7. codespacerange = []
  8. usecmap = ""
  9. cmapname = ""
  10. cmapversion = "1.0"
  11. csi_registry = "(Adobe)"
  12. csi_ordering = "(Unknown)"
  13. csi_supplement = 1
  14. wmode = 0
  15. isbf = False
  16. map = {}
  17. def tocode(s):
  18. if s[0] == '<' and s[-1] == '>':
  19. return int(s[1:-1], 16)
  20. return int(s, 10)
  21. def map_cidchar(lo, v):
  22. map[lo] = v
  23. def map_cidrange(lo, hi, v):
  24. while lo <= hi:
  25. map[lo] = v
  26. lo = lo + 1
  27. v = v + 1
  28. def add_bf(lo, v):
  29. # Decode unicode surrogate pairs
  30. if len(v) == 2 and v[0] >= 0xd800 and v[0] <= 0xdbff and v[1] >= 0xdc00 and v[1] <= 0xdfff:
  31. map[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000
  32. elif len(v) == 1:
  33. map[lo] = v[0]
  34. elif len(v) <= 8:
  35. map[lo] = v[:]
  36. else:
  37. print("/* warning: too long one-to-many mapping: %s */" % (v))
  38. def map_bfchar(lo, bf):
  39. bf = bf[1:-1] # drop < >
  40. v = [int(bf[i:i+4],16) for i in range(0, len(bf), 4)]
  41. add_bf(lo, v)
  42. def map_bfrange(lo, hi, bf):
  43. bf = bf[1:-1] # drop < >
  44. v = [int(bf[i:i+4],16) for i in range(0, len(bf), 4)]
  45. while lo <= hi:
  46. add_bf(lo, v)
  47. lo = lo + 1
  48. v[-1] = v[-1] + 1
  49. current = None
  50. for line in open(filename, "r").readlines():
  51. if line[0] == '%':
  52. continue
  53. line = line.strip().split()
  54. if len(line) == 0:
  55. continue
  56. if line[0] == '/CMapVersion': cmapversion = line[1]
  57. elif line[0] == '/CMapName': cmapname = line[1][1:]
  58. elif line[0] == '/WMode': wmode = int(line[1])
  59. elif line[0] == '/Registry': csi_registry = line[1]
  60. elif line[0] == '/Ordering': csi_ordering = line[1]
  61. elif line[0] == '/Supplement': csi_supplement = line[1]
  62. elif len(line) > 1 and line[1] == 'usecmap': usecmap = line[0][1:]
  63. elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange'
  64. elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange'
  65. elif len(line) > 1 and line[1] == 'beginbfrange': current = 'bfrange'; isbf = True
  66. elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar'
  67. elif len(line) > 1 and line[1] == 'beginbfchar': current = 'bfchar'; isbf = True
  68. elif line[0] == 'begincodespacerange': current = 'codespacerange'
  69. elif line[0] == 'begincidrange': current = 'cidrange'
  70. elif line[0] == 'beginbfrange': current = 'bfrange'; isbf = True
  71. elif line[0] == 'begincidchar': current = 'cidchar'
  72. elif line[0] == 'beginbfchar': current = 'bfchar'; isbf = True
  73. elif line[0].startswith("end"):
  74. current = None
  75. elif current == 'codespacerange' and len(line) == 2:
  76. n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1])
  77. codespacerange.append((n, a, b))
  78. elif current == 'cidrange' and len(line) == 3:
  79. a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2])
  80. map_cidrange(a, b, c)
  81. elif current == 'cidchar' and len(line) == 2:
  82. a, b = tocode(line[0]), tocode(line[1])
  83. map_cidchar(a, b)
  84. elif current == 'bfchar' and len(line) == 2:
  85. a, b = tocode(line[0]), line[1]
  86. map_bfchar(a, b)
  87. elif current == 'bfrange' and len(line) == 3:
  88. a, b, c = tocode(line[0]), tocode(line[1]), line[2]
  89. map_bfrange(a, b, c)
  90. # Create ranges
  91. singles = []
  92. ranges = []
  93. mranges = []
  94. out_lo = -100
  95. out_hi = -100
  96. out_v_lo = 0
  97. out_v_hi = 0
  98. def flush_range():
  99. if out_lo >= 0:
  100. if out_lo == out_hi:
  101. singles.append((out_lo, out_v_lo))
  102. else:
  103. ranges.append((out_lo, out_hi, out_v_lo))
  104. keys = list(map.keys())
  105. keys.sort()
  106. for code in keys:
  107. v = map[code]
  108. if type(v) is not int:
  109. flush_range()
  110. out_lo = out_hi = -100
  111. mranges.append((code, v))
  112. else:
  113. if code != out_hi + 1 or v != out_v_hi + 1:
  114. flush_range()
  115. out_lo = out_hi = code
  116. out_v_lo = out_v_hi = v
  117. else:
  118. out_hi = out_hi + 1
  119. out_v_hi = out_v_hi + 1
  120. flush_range()
  121. # Print CMap file
  122. print("%!PS-Adobe-3.0 Resource-CMap")
  123. print("%%DocumentNeededResources: procset (CIDInit)")
  124. print("%%IncludeResource: procset (CIDInit)")
  125. print("%%%%BeginResource: CMap (%s)" % cmapname)
  126. print("%%%%Version: %s" % cmapversion)
  127. print("%%EndComments")
  128. print("/CIDInit /ProcSet findresource begin")
  129. print("12 dict begin")
  130. print("begincmap")
  131. if usecmap: print("/%s usecmap" % usecmap)
  132. print("/CIDSystemInfo 3 dict dup begin")
  133. print(" /Registry %s def" % csi_registry)
  134. print(" /Ordering %s def" % csi_ordering)
  135. print(" /Supplement %s def" % csi_supplement)
  136. print("end def")
  137. print("/CMapName /%s def" % cmapname)
  138. print("/CMapVersion %s def" % cmapversion)
  139. print("/CMapType 1 def")
  140. print("/WMode %d def" % wmode)
  141. if len(codespacerange):
  142. print("%d begincodespacerange" % len(codespacerange))
  143. for r in codespacerange:
  144. fmt = "<%%0%dx> <%%0%dx>" % (r[0]*2, r[0]*2)
  145. print(fmt % (r[1], r[2]))
  146. print("endcodespacerange")
  147. if len(singles) > 0:
  148. if isbf:
  149. print("%d beginbfchar" % len(singles))
  150. for s in singles:
  151. print("<%04x> <%04x>" % s)
  152. print("endbfchar")
  153. else:
  154. print("%d begincidchar" % len(singles))
  155. for s in singles:
  156. print("<%04x> %d" % s)
  157. print("endcidchar")
  158. if len(ranges) > 0:
  159. if isbf:
  160. print("%d beginbfrange" % len(ranges))
  161. for r in ranges:
  162. print("<%04x> <%04x> <%04x>" % r)
  163. print("endbfrange")
  164. else:
  165. print("%d begincidrange" % len(ranges))
  166. for r in ranges:
  167. print("<%04x> <%04x> %d" % r)
  168. print("endcidrange")
  169. if len(mranges) > 0:
  170. print("%d beginbfchar" % len(mranges))
  171. for cid, v in mranges:
  172. print("<%04x> <%s>" % (cid, "".join(["%04x" % ch for ch in v])))
  173. print("endbfchar")
  174. print("endcmap")
  175. print("CMapName currentdict /CMap defineresource pop")
  176. print("end")
  177. print("end")
  178. print("%%EndResource")
  179. print("%%EOF")
  180. for arg in sys.argv[1:]:
  181. cleancmap(arg)