step-04-generate-java-literals.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. # Step 04 - generate Java literals.
  2. #
  3. # Java byte-code has severe restrictions. There is no such thing as
  4. # "array literal" - those are implemented as series of data[x] = y;
  5. # as a consequence N-byte array will use 7N bytes in class, plus N bytes
  6. # in instantiated variable. Also no literal could be longer than 64KiB.
  7. #
  8. # To keep dictionary data compact both in source code and in compiled format
  9. # we use the following tricks:
  10. # * use String as a data container
  11. # * store only the lowest 7 bits of each byte, so all characters fit the ASCII
  12. #   table; this allows efficient conversion to a byte array, and ASCII
  13. #   characters use only 1 byte of memory each (UTF-8 encoding)
  14. # * RLE-compress sequence of 8-th bits
  15. #
  16. # This script generates literals used in Java code.
  17. try:
  18. unichr # Python 2
  19. except NameError:
  20. unichr = chr # Python 3
  21. bin_path = "dictionary.bin"
  22. with open(bin_path, "rb") as raw:
  23. data = raw.read()
  24. low = []
  25. hi = []
  26. is_skip = True
  27. skip_flip_offset = 36
  28. cntr = skip_flip_offset
  29. for b in data:
  30. value = ord(b)
  31. low.append(chr(value & 0x7F))
  32. if is_skip:
  33. if value < 0x80:
  34. cntr += 1
  35. else:
  36. is_skip = False
  37. hi.append(unichr(cntr))
  38. cntr = skip_flip_offset + 1
  39. else:
  40. if value >= 0x80:
  41. cntr += 1
  42. else:
  43. is_skip = True
  44. hi.append(unichr(cntr))
  45. cntr = skip_flip_offset + 1
  46. hi.append(unichr(cntr))
  47. low0 = low[0:len(low) // 2]
  48. low1 = low[len(low) // 2:len(low)]
  49. def escape(chars):
  50. result = []
  51. for c in chars:
  52. if "\r" == c:
  53. result.append("\\r")
  54. elif "\n" == c:
  55. result.append("\\n")
  56. elif "\t" == c:
  57. result.append("\\t")
  58. elif "\"" == c:
  59. result.append("\\\"")
  60. elif "\\" == c:
  61. result.append("\\\\")
  62. elif ord(c) < 32 or ord(c) >= 127:
  63. result.append("\\u%04X" % ord(c))
  64. else:
  65. result.append(c)
  66. return result
  67. source_code = [
  68. " private static final String DATA0 = \"", "".join(escape(low0)), "\";\n",
  69. " private static final String DATA1 = \"", "".join(escape(low1)), "\";\n",
  70. " private static final String SKIP_FLIP = \"", "".join(escape(hi)), "\";\n"
  71. ]
  72. src_path = "DictionaryData.inc.java"
  73. with open(src_path, "w") as source:
  74. source.write("".join(source_code))