genucd.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. # Create utfdata.h from UnicodeData.txt
  2. import sys
  3. tolower = []
  4. toupper = []
  5. isalpha = []
  6. for line in open(sys.argv[1]).readlines():
  7. line = line.split(";")
  8. code = int(line[0],16)
  9. # if code > 65535: continue # skip non-BMP codepoints
  10. if line[2][0] == 'L':
  11. isalpha.append(code)
  12. if line[12]:
  13. toupper.append((code,int(line[12],16)))
  14. if line[13]:
  15. tolower.append((code,int(line[13],16)))
  16. def dumpalpha():
  17. table = []
  18. prev = 0
  19. start = 0
  20. for code in isalpha:
  21. if code != prev+1:
  22. if start:
  23. table.append((start,prev))
  24. start = code
  25. prev = code
  26. table.append((start,prev))
  27. print("")
  28. print("static const Rune ucd_alpha2[] = {")
  29. for a, b in table:
  30. if b - a > 0:
  31. print(hex(a)+","+hex(b)+",")
  32. print("};");
  33. print("")
  34. print("static const Rune ucd_alpha1[] = {")
  35. for a, b in table:
  36. if b - a == 0:
  37. print(hex(a)+",")
  38. print("};");
  39. def dumpmap(name, input):
  40. table = []
  41. prev_a = 0
  42. prev_b = 0
  43. start_a = 0
  44. start_b = 0
  45. for a, b in input:
  46. if a != prev_a+1 or b != prev_b+1:
  47. if start_a:
  48. table.append((start_a,prev_a,start_b))
  49. start_a = a
  50. start_b = b
  51. prev_a = a
  52. prev_b = b
  53. table.append((start_a,prev_a,start_b))
  54. print("")
  55. print("static const Rune " + name + "2[] = {")
  56. for a, b, n in table:
  57. if b - a > 0:
  58. print(hex(a)+","+hex(b)+","+str(n-a)+",")
  59. print("};");
  60. print("")
  61. print("static const Rune " + name + "1[] = {")
  62. for a, b, n in table:
  63. if b - a == 0:
  64. print(hex(a)+","+str(n-a)+",")
  65. print("};");
  66. print("/* This file was automatically created from " + sys.argv[1] + " */")
  67. dumpalpha()
  68. dumpmap("ucd_tolower", tolower)
  69. dumpmap("ucd_toupper", toupper)