# genucd.py (1.6 KB)

  1. # Create utfdata.h from UnicodeData.txt
  2. tolower = []
  3. toupper = []
  4. isalpha = []
  5. for line in open("UnicodeData.txt").readlines():
  6. line = line.split(";")
  7. code = int(line[0],16)
  8. # if code > 65535: continue # skip non-BMP codepoints
  9. if line[2][0] == 'L':
  10. isalpha.append(code)
  11. if line[12]:
  12. toupper.append((code,int(line[12],16)))
  13. if line[13]:
  14. tolower.append((code,int(line[13],16)))
  15. def dumpalpha():
  16. table = []
  17. prev = 0
  18. start = 0
  19. for code in isalpha:
  20. if code != prev+1:
  21. if start:
  22. table.append((start,prev))
  23. start = code
  24. prev = code
  25. table.append((start,prev))
  26. print("")
  27. print("static const int ucd_alpha2[] = {")
  28. for a, b in table:
  29. if b - a > 0:
  30. print(hex(a)+","+hex(b)+",")
  31. print("};");
  32. print("")
  33. print("static const int ucd_alpha1[] = {")
  34. for a, b in table:
  35. if b - a == 0:
  36. print(hex(a)+",")
  37. print("};");
  38. def dumpmap(name, input):
  39. table = []
  40. prev_a = 0
  41. prev_b = 0
  42. start_a = 0
  43. start_b = 0
  44. for a, b in input:
  45. if a != prev_a+1 or b != prev_b+1:
  46. if start_a:
  47. table.append((start_a,prev_a,start_b))
  48. start_a = a
  49. start_b = b
  50. prev_a = a
  51. prev_b = b
  52. table.append((start_a,prev_a,start_b))
  53. print("")
  54. print("static const int " + name + "2[] = {")
  55. for a, b, n in table:
  56. if b - a > 0:
  57. print(hex(a)+","+hex(b)+","+str(n-a)+",")
  58. print("};");
  59. print("")
  60. print("static const int " + name + "1[] = {")
  61. for a, b, n in table:
  62. if b - a == 0:
  63. print(hex(a)+","+str(n-a)+",")
  64. print("};");
  65. print("/* This file was automatically created from UnicodeData.txt */")
  66. dumpalpha()
  67. dumpmap("ucd_tolower", tolower)
  68. dumpmap("ucd_toupper", toupper)