#!/usr/bin/env python3
# makeencoding.py
#
# Convert Unicode mapping tables to C arrays of Unicode values and glyph names.
#
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
#
  11. BANNED = [
  12. "controlSTX", "controlSOT", "controlETX", "controlEOT", "controlENQ",
  13. "controlACK", "controlBEL", "controlBS", "controlHT", "controlLF",
  14. "controlVT", "controlFF", "controlCR", "controlSO", "controlSI",
  15. "controlDLE", "controlDC1", "controlDC2", "controlDC3", "controlDC4",
  16. "controlNAK", "controlSYN", "controlETB", "controlCAN", "controlEM",
  17. "controlSUB", "controlESC", "controlFS", "controlGS", "controlRS",
  18. "controlUS",
  19. "SF100000", "SF110000", "SF010000", "SF030000", "SF020000", "SF040000",
  20. "SF080000", "SF090000", "SF060000", "SF070000", "SF050000", "SF430000",
  21. "SF240000", "SF510000", "SF390000", "SF250000", "SF500000", "SF490000",
  22. "SF380000", "SF280000", "SF260000", "SF360000", "SF370000", "SF420000",
  23. "SF190000", "SF230000", "SF410000", "SF450000", "SF460000", "SF400000",
  24. "SF540000", "SF440000",
  25. ]
  26. glyphs = {}
  27. for line in open("scripts/glyphlist.txt").readlines():
  28. if line[0] != '#':
  29. n, u = line.rstrip().split(';')
  30. if len(u) == 4:
  31. u = int(u, base=16)
  32. if u not in glyphs and n not in BANNED:
  33. glyphs[u] = n
  34. def load_table(fn):
  35. table = [0] * 256
  36. for line in open(fn).readlines():
  37. line = line.strip()
  38. if line[0] != '#' and not line.endswith("#UNDEFINED"):
  39. line = line.split()
  40. c = int(line[0][2:], base=16)
  41. u = int(line[1][2:], base=16)
  42. table[c] = u
  43. return table
  44. def dump_table(name, table):
  45. print("unsigned short fz_unicode_from_%s[256] = {" % name)
  46. for u in table:
  47. print('\t%d,' % u)
  48. print("};")
  49. print()
  50. print("const char *fz_glyph_name_from_%s[%d] = {" % (name, len(table)))
  51. for u in table:
  52. if u in glyphs:
  53. print('\t"%s",' % glyphs[u])
  54. else:
  55. print('\t_notdef,')
  56. print("};")
  57. print()
  58. rev = []
  59. i = 0
  60. for u in table:
  61. if u in glyphs:
  62. if u >= 128:
  63. rev += ['{0x%04x,%d},' % (u, i)]
  64. i = i + 1
  65. rev.sort()
  66. print("static const struct { unsigned short u, c; } %s_from_unicode[] = {" % name)
  67. for s in rev:
  68. print("\t" + s)
  69. print("};")
  70. print()
  71. dump_table("iso8859_1", load_table("scripts/8859-1.TXT"))
  72. dump_table("iso8859_7", load_table("scripts/8859-7.TXT"))
  73. dump_table("koi8u", load_table("scripts/KOI8-U.TXT"))
  74. dump_table("windows_1250", load_table("scripts/CP1250.TXT"))
  75. dump_table("windows_1251", load_table("scripts/CP1251.TXT"))
  76. dump_table("windows_1252", load_table("scripts/CP1252.TXT"))