brotlidump.py 90 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367
  1. #!python3
  2. """Program to dump contents of Brotli compressed files showing the compression format.
  3. Jurjen N.E. Bos, 2016.
  4. I found the following issues with the Brotli format:
  5. - The distance alphabet has size 16+(48<<POSTFIX),
  6. but the last symbols are useless.
  7. It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter.
  8. - The block type code is useless if NBLTYPES==2, you would only need 1 symbol
  9. anyway, so why don't you just switch to "the other" type?
  10. """
  11. import struct
  12. from operator import itemgetter, methodcaller
  13. from itertools import accumulate, repeat
  14. from collections import defaultdict, deque
  15. from functools import partial
  16. DICTIONARY_PATH = 'dictionary.bin'
  17. class InvalidStream(Exception): pass
  18. #lookup table
  19. L, I, D = "literal", "insert&copy", "distance"
  20. pL, pI, pD = 'P'+L, 'P'+I, 'P'+D
  21. def outputCharFormatter(c):
  22. """Show character in readable format
  23. """
  24. #TODO 2: allow hex only output
  25. if 32<c<127: return chr(c)
  26. elif c==10: return '\\n'
  27. elif c==13: return '\\r'
  28. elif c==32: return '" "'
  29. else: return '\\x{:02x}'.format(c)
  30. def outputFormatter(s):
  31. """Show string or char.
  32. """
  33. result = ''
  34. def formatSubString(s):
  35. for c in s:
  36. if c==32: yield ' '
  37. else: yield outputCharFormatter(c)
  38. if len(result)<200: return ''.join(formatSubString(s))
  39. else:
  40. return ''.join(formatSubString(s[:100]))+'...'+ \
  41. ''.join(formatSubString(s[-100:]))
  42. class BitStream:
  43. """Represent a bytes object. Can read bits and prefix codes the way
  44. Brotli does.
  45. """
  46. def __init__(self, byteString):
  47. self.data = byteString
  48. #position in bits: byte pos is pos>>3, bit pos is pos&7
  49. self.pos = 0
  50. def __repr__(self):
  51. """Representation
  52. >>> olleke
  53. BitStream(pos=0:0)
  54. """
  55. return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7)
  56. def read(self, n):
  57. """Read n bits from the stream and return as an integer.
  58. Produces zero bits beyond the stream.
  59. >>> olleke.data[0]==27
  60. True
  61. >>> olleke.read(5)
  62. 27
  63. >>> olleke
  64. BitStream(pos=0:5)
  65. """
  66. value = self.peek(n)
  67. self.pos += n
  68. if self.pos>len(self.data)*8:
  69. raise ValueError('Read past end of stream')
  70. return value
  71. def peek(self, n):
  72. """Peek an n bit integer from the stream without updating the pointer.
  73. It is not an error to read beyond the end of the stream.
  74. >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803
  75. True
  76. >>> olleke.peek(15)
  77. 11803
  78. >>> hex(olleke.peek(32))
  79. '0x2e1b'
  80. """
  81. #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3]
  82. #convert to int: int.from_bytes(..., 'little')
  83. #shift out the bits from the first byte: >>(self.pos&7)
  84. #mask unwanted bits: & (1<<n)-1
  85. return int.from_bytes(
  86. self.data[self.pos>>3:self.pos+n+7>>3],
  87. 'little')>>(self.pos&7) & (1<<n)-1
  88. def readBytes(self, n):
  89. """Read n bytes from the stream on a byte boundary.
  90. """
  91. if self.pos&7: raise ValueError('readBytes: need byte boundary')
  92. result = self.data[self.pos>>3:(self.pos>>3)+n]
  93. self.pos += 8*n
  94. return result
  95. #-----------------------Symbol-------------------------------------------
  96. class Symbol:
  97. """A symbol in a code.
  98. Refers back to the code that contains it.
  99. Index is the place in the alphabet of the symbol.
  100. """
  101. __slots__ = 'code', 'index'
  102. def __init__(self, code, value):
  103. self.code = code
  104. self.index = value
  105. def __repr__(self):
  106. return 'Symbol({}, {})'.format(self.code.name, self.index)
  107. def __len__(self):
  108. """Number of bits in the prefix notation of this symbol
  109. """
  110. return self.code.length(self.index)
  111. def __int__(self):
  112. return self.index
  113. #these routines call equivalent routine in Code class
  114. def bitPattern(self):
  115. """Value of the symbol in the stream
  116. """
  117. return self.code.bitPattern(self.index)
  118. def extraBits(self):
  119. """Number of extra bits to read for this symbol
  120. """
  121. return self.code.extraBits(self.index)
  122. def __str__(self):
  123. """Short descriptor of the symbol without extra bits.
  124. """
  125. return self.code.mnemonic(self.index)
  126. #requiring optional extra bits, if self.code supports them
  127. def value(self, extra=None):
  128. """The value used for processing. Can be a tuple.
  129. with optional extra bits
  130. """
  131. if isinstance(self.code, WithExtra):
  132. if not 0<=extra<1<<self.extraBits():
  133. raise ValueError("value: extra value doesn't fit in extraBits")
  134. return self.code.value(self.index, extra)
  135. if extra is not None:
  136. raise ValueError('value: no extra bits for this code')
  137. return self.code.value(self.index)
  138. def explanation(self, extra=None):
  139. """Long explanation of the value from the numeric value
  140. with optional extra bits
  141. Used by Layout.verboseRead when printing the value
  142. """
  143. if isinstance(self.code, WithExtra):
  144. return self.code.callback(self, extra)
  145. return self.code.callback(self)
  146. #========================Code definitions==================================
  147. class RangeDecoder:
  148. """A decoder for the Code class that assumes the symbols
  149. are encoded consecutively in binary.
  150. It all depends on the "alphabetSize" property.
  151. The range runs from 0 to alphabetSize-1.
  152. This is the default decoder.
  153. """
  154. def __init__(self, *, alphabetSize=None, bitLength=None, **args):
  155. if bitLength is not None: alphabetSize = 1<<bitLength
  156. if alphabetSize is not None:
  157. self.alphabetSize = alphabetSize
  158. self.maxLength = (alphabetSize-1).bit_length()
  159. def __len__(self):
  160. return self.alphabetSize
  161. def __iter__(self):
  162. """Produce all symbols.
  163. """
  164. return map(partial(Symbol, self), range(len(self)))
  165. def __getitem__(self, index):
  166. if index>=self.alphabetSize: raise ValueError('index out of range')
  167. return Symbol(self, index)
  168. def bitPattern(self, index):
  169. return '{:0{}b}'.format(index, self.maxLength)
  170. def length(self, index):
  171. """Encoding length of given symbol.
  172. Does not depend on index in this case.
  173. """
  174. return self.maxLength
  175. def decodePeek(self, data):
  176. """Find which symbol index matches the given data (from peek, as a number)
  177. and return the number of bits decoded.
  178. Can also be used to figure out length of a symbol.
  179. """
  180. return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1)
  181. class PrefixDecoder:
  182. """A decoder for the Code class that uses a prefix code.
  183. The code is determined by encoding:
  184. encode[p] gives the index corresponding to bit pattern p.
  185. Used setDecode(decodeTable) to switch the decoder from the default
  186. to a prefix decoder, or pass decodeTable at init.
  187. You can also use setLength(lengthTable)
  188. to define the encoding from the lengths.
  189. The set of symbol values does not need to be consecutive.
  190. """
  191. def __init__(self, *, decodeTable=None, **args):
  192. if decodeTable is not None: self.setDecode(decodeTable)
  193. def __len__(self):
  194. return len(self.decodeTable)
  195. def __iter__(self):
  196. def revBits(index):
  197. return self.bitPattern(index)[::-1]
  198. return (
  199. Symbol(self, index)
  200. for index in sorted(self.decodeTable.values(), key=revBits)
  201. )
  202. def __getitem__(self, index):
  203. if index not in self.lengthTable:
  204. raise ValueError('No symbol {}[{}]'.format(
  205. self.__class__.__name__, index))
  206. return Symbol(self, index)
  207. def bitPattern(self, index):
  208. bits = next(b for (b,s) in self.decodeTable.items() if s==index)
  209. return '{:0{}b}'.format(bits, self.length(index))
  210. def length(self, index):
  211. """Encoding length of given symbol.
  212. """
  213. return self.lengthTable[index]
  214. def decodePeek(self, data):
  215. """Find which symbol index matches the given data (from peek, as a number)
  216. and return the number of bits decoded.
  217. Can also be used to figure out length of a symbol.
  218. """
  219. #do binary search for word length
  220. #invariant: lo<=length<=hi
  221. lo, hi = self.minLength, self.maxLength
  222. while lo<=hi:
  223. mid = lo+hi>>1
  224. #note lo<=mid<hi at this point
  225. mask = (1<<mid)-1
  226. #lets see what happens if we guess length is mid
  227. try: index = self.decodeTable[data&mask]
  228. except KeyError:
  229. #too many bits specified, reduce estimated length
  230. hi = mid-1
  231. continue
  232. #we found a symbol, but there could be a longer match
  233. symbolLength = self.lengthTable[index]
  234. if symbolLength<=mid:
  235. #all bits match, symbol must be right
  236. return symbolLength, Symbol(self, index)
  237. #there must be more bits to match
  238. lo = mid+1
  239. return lo, Symbol(self, index)
  240. #routine to set up the tables
  241. def setDecode(self, decodeTable):
  242. """Store decodeTable,
  243. and compute lengthTable, minLength, maxLength from encodings.
  244. """
  245. self.decodeTable = decodeTable
  246. #set of symbols with unknown length
  247. todo = set(decodeTable)
  248. #bit size under investigation
  249. maskLength = 0
  250. lengthTable = {}
  251. while todo:
  252. mask = (1<<maskLength)-1
  253. #split the encodings that we didn't find yet using b bits
  254. splitSymbols = defaultdict(list)
  255. for s in todo: splitSymbols[s&mask].append(s)
  256. #unique encodings have a length of maskLength bits
  257. #set length, and remove from todo list
  258. for s,subset in splitSymbols.items():
  259. if len(subset)==1:
  260. lengthTable[self.decodeTable[s]] = maskLength
  261. todo.remove(s)
  262. #now investigate with longer mask
  263. maskLength +=1
  264. #save result
  265. self.lengthTable = lengthTable
  266. self.minLength = min(lengthTable.values())
  267. self.maxLength = max(lengthTable.values())
  268. self.switchToPrefix()
  269. def setLength(self, lengthTable):
  270. """Given the bit pattern lengths for symbols given in lengthTable,
  271. set decodeTable, minLength, maxLength
  272. """
  273. self.lengthTable = lengthTable
  274. self.minLength = min(lengthTable.values())
  275. self.maxLength = max(lengthTable.values())
  276. #compute the backwards codes first; then reverse them
  277. #compute (backwards) first code for every separate lengths
  278. nextCodes = []
  279. #build codes for each length, from right to left
  280. code = 0
  281. for bits in range(self.maxLength+1):
  282. code <<= 1
  283. nextCodes.append(code)
  284. code += sum(x==bits for x in lengthTable.values())
  285. self.decodeTable = {}
  286. #count codes for each length, and store reversed in the table
  287. for symbol in sorted(lengthTable):
  288. bits = lengthTable[symbol]
  289. bitpattern = '{:0{}b}'.format(nextCodes[bits], bits)
  290. self.decodeTable[int(bitpattern[::-1], 2)] = symbol
  291. nextCodes[bits] += 1
  292. self.switchToPrefix()
  293. def switchToPrefix(self):
  294. """This routine makes sure the prefix decoder is activated.
  295. """
  296. self.mode = PrefixDecoder
  297. class Code(RangeDecoder, PrefixDecoder):
  298. """An alphabet of symbols, that can be read from a stream.
  299. If you use setDecode or setLength, you have a prefix code,
  300. otherwise you have a range code.
  301. Features:
  302. code[index] produces symbol with given index
  303. value(index): value of symbol
  304. mnemonic(index): short description of symbol
  305. explanation(index): show meaning of symbol, shown in Layout.verboseRead
  306. iter(code): produce all symbols in some order
  307. name: show as context in Layout.verboseRead
  308. """
  309. name = '?'
  310. #callback is a function that gets the symbol and the extra bits
  311. #default callback calls explanation
  312. def __init__(self, name=None, *, callback=None, description='', **args):
  313. """Don't forget to set either alphabetSize or decodeTable
  314. """
  315. #set name when provided, otherwise take class variable
  316. if name is not None: self.name = name
  317. if callback is not None: self.callback = callback
  318. self.description = description
  319. #mode switch
  320. if 'bitLength' in args or 'alphabetSize' in args:
  321. self.mode = RangeDecoder
  322. RangeDecoder.__init__(self, **args)
  323. elif 'decodeTable' in args:
  324. self.mode = PrefixDecoder
  325. PrefixDecoder.__init__(self, **args)
  326. else:
  327. super().__init__(**args)
  328. def __repr__(self):
  329. return self.__class__.__name__+' '+self.name
  330. #the routines that get switched between RangeDecoder and PrefixDecoder
  331. def __len__(self): return self.mode.__len__(self)
  332. def __iter__(self): return self.mode.__iter__(self)
  333. def __getitem__(self, index): return self.mode.__getitem__(self, index)
  334. def bitPattern(self, index): return self.mode.bitPattern(self, index)
  335. def length(self, index): return self.mode.length(self, index)
  336. def decodePeek(self, data): return self.mode.decodePeek(self, data)
  337. #general routines
  338. def value(self, index, extra=None):
  339. """Get value of symbol for computations.
  340. Override where needed.
  341. """
  342. if extra is not None:
  343. raise ValueError('value: no extra for this symbol')
  344. return index
  345. def mnemonic(self, index):
  346. """Give mnemonic of symbol.
  347. Override where needed.
  348. """
  349. return str(self.value(index))
  350. def callback(self, symbol):
  351. return self.explanation(symbol.index)
  352. def explanation(self, index):
  353. """Long explanation of the value from the numeric value
  354. This is a default routine.
  355. You can customize in three ways:
  356. - set description to add some text
  357. - override to get more control
  358. - set callback to make it dependent on you local variables
  359. """
  360. value = self.value(index)
  361. return '{0}{1}: {2}'.format(
  362. self.description and self.description+': ',
  363. self.bitPattern(index),
  364. value,
  365. )
  366. def extraBits(self, index):
  367. return 0
  368. #Routines that use the decode interface
  369. def showCode(self, width=80):
  370. """Show all words of the code in a nice format.
  371. """
  372. #make table of all symbols with binary strings
  373. symbolStrings = [
  374. (self.bitPattern(s.index), self.mnemonic(s.index))
  375. for s in self
  376. ]
  377. #determine column widths the way Lisp programmers do it
  378. leftColWidth, rightColWidth = map(max, map(
  379. map,
  380. repeat(len),
  381. zip(*symbolStrings)
  382. ))
  383. colwidth = leftColWidth+rightColWidth
  384. columns = 81//(colwidth+2)
  385. rows = -(-len(symbolStrings)//columns)
  386. def justify(bs):
  387. b,s = bs
  388. return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth)
  389. for i in range(rows):
  390. print(' '.join(map(justify, symbolStrings[i::rows])).rstrip())
  391. def readTuple(self, stream):
  392. """Read symbol from stream. Returns symbol, length.
  393. """
  394. length, symbol = self.decodePeek(stream.peek(self.maxLength))
  395. stream.pos += length
  396. return length, symbol
  397. def readTupleAndExtra(self, stream):
  398. return self.readTuple(stream)+(0, None)
  399. class WithExtra(Code):
  400. """Extension for Code so that symbol may have extra bits associated.
  401. If you supply an extraTable, you can use extraBits
  402. You can define an extraTable,
  403. which allows to call extraBits to get the number of extraBits.
  404. Otherwise, you can supply extraBits yourself.
  405. Routine readTupleAndExtra now reads the extra bits too.
  406. Value probably needs to be overridden; see Enumerator.
  407. Note: this does not give you an decodeTable.
  408. """
  409. #redefine these if you don't want to use an extraTable
  410. def extraBits(self, index):
  411. """Get the number of extra bits for this symbol.
  412. """
  413. return self.extraTable[index]
  414. def mnemonic(self, index):
  415. """This value must be independent of extra.
  416. """
  417. return str(index)
  418. def readTupleAndExtra(self, stream):
  419. """Read symbol and extrabits from stream.
  420. Returns symbol length, symbol, extraBits, extra
  421. >>> olleke.pos = 6
  422. >>> MetablockLengthAlphabet().readTupleAndExtra(olleke)
  423. (2, Symbol(MLEN, 4), 16, 46)
  424. """
  425. length, symbol = self.decodePeek(stream.peek(self.maxLength))
  426. stream.pos += length
  427. extraBits = self.extraBits(symbol.index)
  428. return length, symbol, extraBits, stream.read(extraBits)
  429. def explanation(self, index, extra=None):
  430. """Expanded version of Code.explanation supporting extra bits.
  431. If you don't supply extra, it is not mentioned.
  432. """
  433. extraBits = 0 if extra is None else self.extraBits(index)
  434. if not hasattr(self, 'extraTable'):
  435. formatString = '{0}{3}'
  436. lo = hi = value = self.value(index, extra)
  437. elif extraBits==0:
  438. formatString = '{0}{2}: {3}'
  439. lo, hi = self.span(index)
  440. value = lo
  441. else:
  442. formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}'
  443. lo, hi = self.span(index)
  444. value = lo+extra
  445. return formatString.format(
  446. self.description and self.description+': ',
  447. 'x'*extraBits,
  448. self.bitPattern(index),
  449. lo, hi,
  450. extra,
  451. value,
  452. )
  453. def callback(self, symbol, extra):
  454. return self.explanation(symbol.index, extra)
  455. class BoolCode(Code):
  456. """Same as Code(bitLength=1), but shows a boolean.
  457. """
  458. def __init__(self, name=None, **args):
  459. super().__init__(name, bitLength=1, **args)
  460. def value(self, index, extra=None):
  461. return bool(super().value(index, extra))
  462. class Enumerator(WithExtra):
  463. """Code that is defined by the ExtraTable.
  464. extraTable is a class variable that contains
  465. the extraBits of the symbols from 0
  466. value0 contains the value of symbol 0
  467. encodings is not neccessary, but allowed.
  468. Note: place for FixedCode to make sure extraBits works
  469. """
  470. def __init__(self, name=None, **args):
  471. #if there is no decodeTable to determine length, compute it ourselves
  472. if 'decodeTable' not in args:
  473. args['alphabetSize'] = len(self.extraTable)
  474. super().__init__(name, **args)
  475. def __len__(self):
  476. return len(self.extraTable)
  477. def __getitem__(self, index):
  478. """Faster than PrefixDecoder
  479. """
  480. if index>=len(self.extraTable):
  481. raise ValueError("No symbol {}[{}]".format(
  482. self.__class__.__name__, index))
  483. return Symbol(self, index)
  484. def value(self, index, extra):
  485. """Override if you don't define value0 and extraTable
  486. """
  487. lower, upper = self.span(index)
  488. value = lower+(extra or 0)
  489. if value>upper:
  490. raise ValueError('value: extra out of range')
  491. return value
  492. def span(self, index):
  493. """Give the range of possible values in a tuple
  494. Useful for mnemonic and explanation
  495. """
  496. lower = self.value0+sum(1<<x for x in self.extraTable[:index])
  497. upper = lower+(1<<self.extraTable[index])
  498. return lower, upper-1
  499. #======================Code subclasses======================================
  500. #Alphabets used in the metablock header----------------------------------
  501. #For prefix codes
  502. class PrefixCodeHeader(WithExtra):
  503. """Header of prefix codes.
  504. """
  505. def __init__(self, codename):
  506. super().__init__('PFX', bitLength=2)
  507. #this is the name of the code that it describes
  508. self.codename = codename
  509. def extraBits(self, index):
  510. return 2 if index==1 else 0
  511. def value(self, index, extra):
  512. """Returns ('Simple', #codewords) or ('Complex', HSKIP)
  513. """
  514. if index==1:
  515. if extra>3:
  516. raise ValueError('value: extra out of range')
  517. return 'Simple', extra+1
  518. if extra:
  519. raise ValueError('value: extra out of range')
  520. return 'Complex', index
  521. def explanation(self, index, extra):
  522. if index==1:
  523. return '{} is simple with {} code word{}'.format(
  524. self.codename, extra+1, 's' if extra else '')
  525. lengths = [1, 2, 3, 4, 0, 5, 17, 6]
  526. return '{} is complex with lengths {}...'.format(
  527. self.codename,
  528. ','.join(
  529. map(str, lengths[index:index+5]))
  530. )
  531. class TreeShapeAlhabet(BoolCode):
  532. """The bit used to indicate if four word code is "deep" or "wide"
  533. """
  534. name = 'SHAPE'
  535. def value(self, index):
  536. return [(2,2,2,2), (1,2,3,3)][index]
  537. def explanation(self, index):
  538. return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index))
  539. class LengthOfLengthAlphabet(Code):
  540. """For use in decoding complex code descriptors.
  541. >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('')
  542. >>> print(lengthOfLengthAlphabet[2])
  543. coded with 2 bits
  544. >>> len(lengthOfLengthAlphabet[0])
  545. 2
  546. >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)]
  547. [2, 4, 3, 2, 2, 4]
  548. >>> lengthOfLengthAlphabet.showCode()
  549. 00:skipped 01:coded with 4 bits 0111:coded with 1 bits
  550. 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits
  551. """
  552. decodeTable = {
  553. 0b00:0, 0b10:3,
  554. 0b0111:1, 0b01:4,
  555. 0b011:2, 0b1111:5,
  556. }
  557. def __init__(self, name=None, **args):
  558. super().__init__(name, decodeTable=self.decodeTable, **args)
  559. def mnemonic(self, index):
  560. if index==0: return 'skipped'
  561. return 'coded with {} bits'.format(index)
  562. def explanation(self, index, extra=None):
  563. return self.description+': '+self.mnemonic(index)
  564. class LengthAlphabet(WithExtra):
  565. """Length of symbols
  566. Used during construction of a code.
  567. """
  568. def __init__(self, name):
  569. super().__init__(name, alphabetSize=18)
  570. def extraBits(self, index):
  571. return {16:2, 17:3}.get(index, 0)
  572. def mnemonic(self, index):
  573. if index==0: return 'unused'
  574. elif index==16: return 'rep xx'
  575. elif index==17: return 'zero xxx'
  576. else: return 'len {}'.format(index)
  577. def explanation(self, index, extra):
  578. return self.description.format(self[index], extra)
  579. def value(self, index, extra):
  580. #the caller got the length already, so extra is enough
  581. return extra
  582. #Stream header
  583. class WindowSizeAlphabet(Code):
  584. """The alphabet used for window size in the stream header.
  585. >>> WindowSizeAlphabet()[10].explanation()
  586. 'windowsize=(1<<10)-16=1008'
  587. """
  588. decodeTable = {
  589. 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22,
  590. 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23,
  591. 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24,
  592. 0b1010001: 13, 0b0000001: 17, 0b1001: 21,
  593. 0b0010001: None,
  594. }
  595. name = 'WSIZE'
  596. def __init__(self, name=None):
  597. super().__init__(name, decodeTable=self.decodeTable)
  598. def value(self, index):
  599. #missing value gives index None
  600. if index is None: return None
  601. return (1<<index)-16
  602. def explanation(self, index):
  603. return 'windowsize=(1<<{})-16={}'.format(
  604. index, (1<<index)-16)
  605. #Metablock
  606. class MetablockLengthAlphabet(WithExtra):
  607. """Used for the meta block length;
  608. also indicates a block with no data
  609. >>> metablockLengthAlphabet = MetablockLengthAlphabet()
  610. >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0])
  611. Symbol(MLEN, 0)
  612. 'empty'
  613. >>> metablockLengthAlphabet[3]
  614. Traceback (most recent call last):
  615. ...
  616. ValueError: No symbol MetablockLengthAlphabet[3]
  617. >>> print(metablockLengthAlphabet[4])
  618. hhhh00
  619. >>> metablockLengthAlphabet[4].value(0x1000)
  620. 4097
  621. >>> metablockLengthAlphabet[5].value(0x1000)
  622. Traceback (most recent call last):
  623. ...
  624. InvalidStream: Zeros in high nibble of MLEN
  625. >>> metablockLengthAlphabet[5].explanation(0x12345)
  626. 'data length: 12345h+1=74566'
  627. >>> metablockLengthAlphabet.showCode()
  628. 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty
  629. """
  630. decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6}
  631. name = 'MLEN'
  632. def __init__(self, name=None):
  633. super().__init__(name, decodeTable=self.decodeTable)
  634. def extraBits(self, index):
  635. return index*4
  636. def mnemonic(self, index):
  637. if index==0: return 'empty'
  638. return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
  639. def value(self, index, extra):
  640. extraBits = self.extraBits(index)
  641. if not 0<=extra<1<<extraBits:
  642. raise ValueError('value: extra out of range')
  643. if index==0: return 0
  644. if index>4 and extra>>extraBits-4==0: raise InvalidStream(
  645. 'Zeros in high nibble of MLEN')
  646. return extra+1
  647. def explanation(self, index, extra):
  648. if index==0: return '11: empty block'
  649. extraBits = self.extraBits(index)
  650. return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1)
  651. class ReservedAlphabet(BoolCode):
  652. """The reserved bit that must be zero.
  653. """
  654. name = 'RSVD'
  655. def value(self, index):
  656. if index: raise ValueError('Reserved bit is not zero')
  657. def explanation(self, index):
  658. return 'Reserved (must be zero)'
  659. class FillerAlphabet(Code):
  660. def __init__(self, *, streamPos):
  661. super().__init__('SKIP', bitLength=(-streamPos)&7)
  662. def explanation(self, index):
  663. return '{} bit{} ignored'.format(
  664. self.length(index),
  665. '' if self.length(index)==1 else 's',
  666. )
  667. class SkipLengthAlphabet(WithExtra):
  668. """Used for the skip length in an empty metablock
  669. >>> skipLengthAlphabet = SkipLengthAlphabet()
  670. >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0])
  671. Symbol(SKIP, 0)
  672. 'empty'
  673. >>> skipLengthAlphabet[4]
  674. Traceback (most recent call last):
  675. ...
  676. ValueError: index out of range
  677. >>> print(skipLengthAlphabet[3])
  678. hhhhhh11
  679. >>> skipLengthAlphabet[2].value(0x1000)
  680. 4097
  681. >>> skipLengthAlphabet[3].value(0x1000)
  682. Traceback (most recent call last):
  683. ...
  684. InvalidStream: Zeros in high byte of SKIPBYTES
  685. >>> skipLengthAlphabet[3].explanation(0x12345)
  686. 'skip length: 12345h+1=74566'
  687. >>> skipLengthAlphabet.showCode()
  688. 00:empty 01:hh01 10:hhhh10 11:hhhhhh11
  689. """
  690. def __init__(self):
  691. super().__init__('SKIP', bitLength=2)
  692. def extraBits(self, index):
  693. return index*8
  694. def mnemonic(self, index):
  695. if index==0: return 'empty'
  696. return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
  697. def value(self, index, extra):
  698. extraBits = self.extraBits(index)
  699. if not 0<=extra<1<<extraBits:
  700. raise ValueError('value: extra out of range')
  701. if index==0: return 0
  702. if index>1 and extra>>extraBits-8==0:
  703. raise InvalidStream('Zeros in high byte of SKIPBYTES')
  704. return extra+1
  705. def explanation(self, index, extra):
  706. if index==0: return '00: no skip'
  707. extraBits = self.extraBits(index)
  708. return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1)
  709. class TypeCountAlphabet(Enumerator):
  710. """Used for giving block type counts and tree counts.
  711. >>> TypeCountAlphabet(description='').showCode()
  712. 0:0 0101:xx,0101 1011:xxxxx,1011
  713. 0001:0001 1101:xxxxxx,1101 0111:xxx,0111
  714. 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111
  715. """
  716. decodeTable = {
  717. 0b0: 0, 0b1001: 5,
  718. 0b0001: 1, 0b1011: 6,
  719. 0b0011: 2, 0b1101: 7,
  720. 0b0101: 3, 0b1111: 8,
  721. 0b0111: 4,
  722. }
  723. value0 = 1
  724. extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7]
  725. name = 'BT#'
  726. def __init__(self, name=None, *, description):
  727. super().__init__(
  728. name,
  729. decodeTable=self.decodeTable,
  730. description=description)
  731. def mnemonic(self, index):
  732. if index==0: return '0'
  733. if index==1: return '0001'
  734. return 'x'*(self.extraBits(index))+','+self.bitPattern(index)
  735. def explanation(self, index, extra):
  736. value = self.value(index, extra)
  737. description = self.description
  738. if value==1: description = description[:-1]
  739. return '{}: {} {}'.format(
  740. self.mnemonic(index),
  741. value,
  742. description)
  743. class BlockTypeAlphabet(Code):
  744. """The block types; this code works for all three kinds.
  745. >>> b = BlockTypeAlphabet('T', NBLTYPES=5)
  746. >>> print(*(x for x in b))
  747. prev +1 #0 #1 #2 #3 #4
  748. """
  749. def __init__(self, name, NBLTYPES, **args):
  750. super().__init__(name, alphabetSize=NBLTYPES+2, **args)
  751. self.NBLTYPES = NBLTYPES
  752. def mnemonic(self, index):
  753. if index==0: return 'prev'
  754. elif index==1: return '+1'
  755. else: return '#'+str(index-2)
  756. def value(self, index):
  757. return index-2
  758. def explanation(self, index):
  759. if index==0: return '0: previous'
  760. elif index==1: return '1: increment'
  761. else: return 'Set block type to: '+str(index-2)
  762. class BlockCountAlphabet(Enumerator):
  763. """Block counts
  764. >>> b = BlockCountAlphabet('L')
  765. >>> print(b[25])
  766. [24*x]: BC16625-16793840
  767. """
  768. value0 = 1
  769. extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24]
  770. def __init__(self, name, **args):
  771. super().__init__(name, alphabetSize=26, **args)
  772. def mnemonic(self, index):
  773. extraBits = self.extraBits(index)
  774. return '{}: BC{}-{}'.format(
  775. 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits),
  776. *self.span(index))
  777. def explanation(self, index, extra):
  778. return 'Block count: '+super().explanation(index, extra)
  779. class DistanceParamAlphabet(WithExtra):
  780. """The distance parameters NPOSTFIX and NDIRECT.
  781. Although these are treated as two in the description, this is easier.
  782. """
  783. def __init__(self):
  784. super().__init__('DIST', bitLength=2)
  785. def extraBits(self, index):
  786. return 4
  787. def value(self, index, extra):
  788. """Returns NPOSTFIX and NDIRECT<<NPOSTFIX
  789. """
  790. if extra>15:
  791. raise ValueError('value: extra out of range')
  792. return index, extra<<index
  793. def explanation(self, index, extra):
  794. return '{} postfix bits and {:04b}<<{}={} direct codes'.format(
  795. index, extra, index, extra<<index)
  796. def mnemonic(self, index):
  797. return 'PF'+str(index)
  798. class LiteralContextMode(Code):
  799. """For the literal context modes.
  800. >>> LiteralContextMode().showCode()
  801. 00:LSB6 01:MSB6 10:UTF8 11:Signed
  802. >>> LiteralContextMode().explanation(2)
  803. 'Context mode for type 9: 2(UTF8)'
  804. """
  805. def __init__(self, *, number=9):
  806. super().__init__('LC'+str(number), bitLength=2)
  807. self.number = number
  808. def mnemonic(self, index):
  809. return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index]
  810. def explanation(self, index):
  811. return 'Context mode for type {}: {}({})'.format(
  812. self.number,
  813. index,
  814. self.mnemonic(index))
  815. class RLEmaxAlphabet(Enumerator):
  816. """Used for describing the run length encoding used for describing context maps.
  817. >>> RLEmaxAlphabet().showCode()
  818. 0:1 1:more
  819. """
  820. value0 = 0
  821. extraTable = [0, 4]
  822. name = 'RLE#'
  823. def mnemonic(self, index):
  824. return ['1', 'more'][index]
  825. def explanation(self, index, extra):
  826. description = self.description and self.description+': '
  827. if index==0: return description+'No RLE coding'
  828. return '{}xxxx 1: RLEMAX={}'.format(description, extra+1)
  829. class TreeAlphabet(WithExtra):
  830. """The alphabet to enumerate entries (called trees) in the context map.
  831. parameters are RLEMAX and NTREES
  832. >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5)
  833. >>> len(t)
  834. 8
  835. >>> print(t[2])
  836. xx+4 zeroes
  837. >>> t[3].explanation(2)
  838. '8+010=10 zeroes'
  839. >>> t[0].value(0)
  840. (1, 0)
  841. """
  842. name = 'CMI'
  843. def __init__(self, name=None, *, RLEMAX, NTREES, **args):
  844. super().__init__(name, alphabetSize=RLEMAX+NTREES, **args)
  845. self.RLEMAX = RLEMAX
  846. self.NTREES = NTREES
  847. def extraBits(self, index):
  848. if 0<index<=self.RLEMAX: return index
  849. return 0
  850. def mnemonic(self, index):
  851. if index==0: return 'map #0'
  852. if index<=self.RLEMAX:
  853. return '{}+{} zeroes'.format('x'*index, 1<<index)
  854. return 'map #{}'.format(index-self.RLEMAX)
  855. def value(self, index, extra):
  856. """Give count and value."""
  857. index = index
  858. if index==0: return 1, 0
  859. if index<=self.RLEMAX: return (1<<index)+extra, 0
  860. return 1, index-self.RLEMAX
  861. def explanation(self, index, extra):
  862. description = self.description and self.description+': '
  863. if index==0: return description+'map #0'
  864. if index<=self.RLEMAX:
  865. return '{}+{:0{}b}={} zeroes'.format(
  866. (1<<index),
  867. extra, self.extraBits(index),
  868. (1<<index)+extra)
  869. return '{}map #{}-{}={}'.format(
  870. description,
  871. index, self.RLEMAX, index-self.RLEMAX)
  872. #Prefix alphabets for the data stream----------------------------------
  873. class LiteralAlphabet(Code):
  874. """Alphabet of symbols.
  875. """
  876. minLength = maxLength = 8
  877. def __init__(self, number):
  878. super().__init__('L'+str(number), alphabetSize=1<<8)
  879. def mnemonic(self, index):
  880. return outputCharFormatter(index)
  881. def value(self, index, extra=None):
  882. return index
  883. def explanation(self, index, extra=None):
  884. return self.mnemonic(index)
  885. class InsertLengthAlphabet(Enumerator):
  886. """Intern code for insert counts
  887. """
  888. value0 = 0
  889. extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24]
  890. class CopyLengthAlphabet(Enumerator):
  891. value0 = 2
  892. extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24]
  893. class InsertAndCopyAlphabet(WithExtra):
  894. """The insert and copy code
  895. >>> for x in range(0,704,704//13):
  896. ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x])
  897. 0 I0C2&D=0
  898. 110110 I6+xC8&D=0
  899. 1101100 I5C22+xxx&D=0
  900. 10100010 I4C4
  901. 11011000 I3C10+x
  902. 100001110 I14+xxC8
  903. 101000100 I10+xxC22+xxx
  904. 101111010 I98+xxxxxC14+xx
  905. 110110000 I6+xC70+xxxxx
  906. 111100110 I1090+[10*x]C8
  907. 1000011100 I26+xxxC326+[8*x]
  908. 1001010010 I322+[8*x]C14+xx
  909. 1010001000 I194+[7*x]C70+xxxxx
  910. 1010111110 I22594+[24*x]C1094+[10*x]
  911. """
  912. insertLengthAlphabet = InsertLengthAlphabet(None)
  913. copyLengthAlphabet = CopyLengthAlphabet(None)
  914. def __init__(self, number=''):
  915. super().__init__('IC'+str(number), bitLength=10)
  916. def __len__(self):
  917. return 704
  918. def extraBits(self, index):
  919. insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
  920. return InsertLengthAlphabet.extraTable[insertSymbol.index] + \
  921. CopyLengthAlphabet.extraTable[copySymbol.index]
  922. def splitSymbol(self, index):
  923. """Give relevant values for computations:
  924. (insertSymbol, copySymbol, dist0flag)
  925. """
  926. #determine insert and copy upper bits from table
  927. row = [0,0,1,1,2,2,1,3,2,3,3][index>>6]
  928. col = [0,1,0,1,0,1,2,0,2,1,2][index>>6]
  929. #determine inserts and copy sub codes
  930. insertLengthCode = row<<3 | index>>3&7
  931. if row: insertLengthCode -= 8
  932. copyLengthCode = col<<3 | index&7
  933. return (
  934. Symbol(self.insertLengthAlphabet, insertLengthCode),
  935. Symbol(self.copyLengthAlphabet, copyLengthCode),
  936. row==0
  937. )
  938. def mnemonic(self, index):
  939. """Make a nice mnemonic
  940. """
  941. i,c,d0 = self.splitSymbol(index)
  942. iLower, _ = i.code.span(i.index)
  943. iExtra = i.extraBits()
  944. cLower, _ = c.code.span(c.index)
  945. cExtra = c.extraBits()
  946. return 'I{}{}{}C{}{}{}{}'.format(
  947. iLower,
  948. '+' if iExtra else '',
  949. 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra),
  950. cLower,
  951. '+' if cExtra else '',
  952. 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra),
  953. '&D=0' if d0 else '')
  954. def value(self, index, extra):
  955. i,c,d0 = self.splitSymbol(index)
  956. iExtra = i.extraBits()
  957. ce, ie = extra>>iExtra, extra&(1<<iExtra)-1
  958. insert = i.value(ie)
  959. copy = c.value(ce)
  960. return insert, copy, d0
  961. def explanation(self, index, extra):
  962. insert, copy, d0 = self.value(index, extra)
  963. if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy)
  964. else: return 'Literal: {}, copy: {}'.format(insert, copy)
  965. class DistanceAlphabet(WithExtra):
  966. """Represent the distance encoding.
  967. Dynamically generated alphabet.
  968. This is what the documentation should have said:
  969. Ignoring offsets for the moment, the "long" encoding works as follows:
  970. Write the distance in binary as follows:
  971. 1xy..yz..z, then the distance symbol consists of n..nxz..z
  972. Where:
  973. n is one less than number of bits in y
  974. x is a single bit
  975. y..y are n+1 extra bits (encoded in the bit stream)
  976. z..z is NPOSTFIX bits that are part of the symbol
  977. The offsets are so as to start at the lowest useable value:
  978. if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1
  979. then n..nxz..z is symbol -NDIRECT-16
  980. >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
  981. >>> print(d[4], d[17], d[34])
  982. last-1 1 10xx00-5
  983. >>> [str(d[x]) for x in range(26, 32)]
  984. ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5']
  985. """
  986. def __init__(self, number, *, NPOSTFIX, NDIRECT):
  987. self.NPOSTFIX = NPOSTFIX
  988. self.NDIRECT = NDIRECT
  989. #set length
  990. #Actually, not all symbols are used,
  991. #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX)
  992. super().__init__('D'+str(number),
  993. alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX))
  994. def extraBits(self, index):
  995. """Indicate how many extra bits are needed to interpret symbol
  996. >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
  997. >>> [d[i].extraBits() for i in range(26)]
  998. [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  999. >>> [d[i].extraBits() for i in range(26,36)]
  1000. [1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
  1001. """
  1002. if index<16+self.NDIRECT: return 0
  1003. return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
  1004. def value(self, dcode, dextra):
  1005. """Decode value of symbol together with the extra bits.
  1006. >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
  1007. >>> d[34].value(2)
  1008. (0, 35)
  1009. """
  1010. if dcode<16:
  1011. return [(1,0),(2,0),(3,0),(4,0),
  1012. (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3),
  1013. (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3)
  1014. ][dcode]
  1015. if dcode<16+self.NDIRECT:
  1016. return (0,dcode-16)
  1017. #we use the original formulas, instead of my clear explanation
  1018. POSTFIX_MASK = (1 << self.NPOSTFIX) - 1
  1019. ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
  1020. hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX
  1021. lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK
  1022. offset = ((2 + (hcode & 1)) << ndistbits) - 4
  1023. distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1
  1024. return (0,distance)
  1025. def mnemonic(self, index, verbose=False):
  1026. """Give mnemonic representation of meaning.
  1027. verbose compresses strings of x's
  1028. """
  1029. if index<16:
  1030. return ['last', '2last', '3last', '4last',
  1031. 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3',
  1032. '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3'
  1033. ][index]
  1034. if index<16+self.NDIRECT:
  1035. return str(index-16)
  1036. #construct strings like "1xx01-15"
  1037. index -= self.NDIRECT+16
  1038. hcode = index >> self.NPOSTFIX
  1039. lcode = index & (1<<self.NPOSTFIX)-1
  1040. if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}'
  1041. else: formatString = '1{0}{1}{4:+d}'
  1042. return formatString.format(
  1043. hcode&1,
  1044. 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1),
  1045. lcode, self.NPOSTFIX,
  1046. self.NDIRECT+1-(4<<self.NPOSTFIX))
  1047. def explanation(self, index, extra):
  1048. """
  1049. >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
  1050. >>> d[55].explanation(13)
  1051. '11[1101]01-5: [0]+240'
  1052. """
  1053. extraBits = self.extraBits(index)
  1054. extraString = '[{:0{}b}]'.format(extra, extraBits)
  1055. return '{0}: [{1[0]}]{1[1]:+d}'.format(
  1056. self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString),
  1057. self.value(index, extra))
  1058. #Classes for doing actual work------------------------------------------
  1059. class ContextModeKeeper:
  1060. """For computing the literal context mode.
  1061. You feed it characters, and it computes indices in the context map.
  1062. """
  1063. def __init__(self, mode):
  1064. self.chars = deque([0,0], maxlen=2)
  1065. self.mode = mode
  1066. def setContextMode(self, mode):
  1067. """Switch to given context mode (0..3)"""
  1068. self.mode = mode
  1069. def getIndex(self):
  1070. if self.mode==0: #LSB6
  1071. return self.chars[1]&0x3f
  1072. elif self.mode==1: #MSB6
  1073. return self.chars[1]>>2
  1074. elif self.mode==2: #UTF8: character class of previous and a bit of the second
  1075. p2,p1 = self.chars
  1076. return self.lut0[p1]|self.lut1[p2]
  1077. elif self.mode==3: #Signed: initial bits of last two bytes
  1078. p2,p1 = self.chars
  1079. return self.lut2[p1]<<3|self.lut2[p2]
  1080. def add(self, index):
  1081. """Adjust the context for output char (as int)."""
  1082. self.chars.append(index)
  1083. #0: control #16: quote #32: ,:; #48: AEIOU
  1084. #4: tab/lf/cr #20: % #36: . #52: BC..Z
  1085. #8: space #24: (<[{ #40: = #56: aeiou
  1086. #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z
  1087. lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0,
  1088. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1089. 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
  1090. 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
  1091. 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
  1092. 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
  1093. 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
  1094. 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0
  1095. ]+[0,1]*32+[2,3]*32
  1096. #0: space 1:punctuation 2:digit/upper 3:lower
  1097. lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1098. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1099. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1100. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
  1101. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1102. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
  1103. 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1104. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0
  1105. ]+[0]*96+[2]*32
  1106. #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1
  1107. lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7]
  1108. assert len(lut0)==len(lut1)==len(lut2)==256
  1109. class WordList:
  1110. """Word list.
  1111. >>> WordList().word(7, 35555)
  1112. b'Program to '
  1113. """
  1114. NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10,
  1115. 10, 10, 10, 9, 9, 8, 7, 7, 8, 7,
  1116. 7, 6, 6, 5, 5]
  1117. def __init__(self):
  1118. self.file = open(DICTIONARY_PATH, 'rb')
  1119. self.compileActions()
  1120. def word(self, size, dist):
  1121. """Get word
  1122. """
  1123. #split dist in index and action
  1124. ndbits = self.NDBITS[size]
  1125. index = dist&(1<<ndbits)-1
  1126. action = dist>>ndbits
  1127. #compute position in file
  1128. position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index
  1129. self.file.seek(position)
  1130. return self.doAction(self.file.read(size), action)
  1131. def upperCase1(self, word):
  1132. word = word.decode('utf8')
  1133. word = word[0].upper()+word[1:]
  1134. return word.encode('utf8')
  1135. #Super compact form of action table.
  1136. #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst
  1137. actionTable = r"""
  1138. 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_
  1139. 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+.
  1140. 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w
  1141. 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+,
  1142. 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+="
  1143. 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+="
  1144. 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_
  1145. 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_
  1146. 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\'
  1147. 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+,
  1148. 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+="
  1149. 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_
  1150. 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+,
  1151. 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+(
  1152. 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._
  1153. 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+.
  1154. 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\'
  1155. 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._
  1156. 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+="
  1157. 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\'
  1158. 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\'
  1159. 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+.
  1160. 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+">
  1161. 23:w[:-3] 48:w[:-7] 98:_+w+=\'
  1162. 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+,
  1163. """
  1164. def compileActions(self):
  1165. """Build the action table from the text above
  1166. """
  1167. import re
  1168. self.actionList = actions = [None]*121
  1169. #Action 73, which is too long, looks like this when expanded:
  1170. actions[73] = "b' the '+w+b' of the '"
  1171. #find out what the columns are
  1172. actionLines = self.actionTable.splitlines()
  1173. colonPositions = [m.start()
  1174. for m in re.finditer(':',actionLines[1])
  1175. ]+[100]
  1176. columns = [(colonPositions[i]-3,colonPositions[i+1]-3)
  1177. for i in range(len(colonPositions)-1)]
  1178. for line in self.actionTable.splitlines(keepends=False):
  1179. for start,end in columns:
  1180. action = line[start:end]
  1181. #skip empty actions
  1182. if not action or action.isspace(): continue
  1183. #chop it up, and check if the colon is properly placed
  1184. index, colon, action = action[:3], action[3], action[4:]
  1185. assert colon==':'
  1186. #remove filler spaces at right
  1187. action = action.rstrip()
  1188. #replace space symbols
  1189. action = action.replace('_', ' ')
  1190. wPos = action.index('w')
  1191. #add quotes around left string when present
  1192. #translation: any pattern from beginning, up to
  1193. #(but not including) a + following by a w later on
  1194. action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action)
  1195. #add quotes around right string when present
  1196. #translation: anything with a w in it, followed by a +
  1197. #and a pattern up to the end
  1198. #(there is no variable lookbehind assertion,
  1199. #so we have to copy the pattern)
  1200. action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action)
  1201. #expand shortcut for uppercaseAll
  1202. action = action.replace(".U", ".upper()")
  1203. #store action
  1204. actions[int(index)] = action
  1205. def doAction(self, w, action):
  1206. """Perform the proper action
  1207. """
  1208. #set environment for the UpperCaseFirst
  1209. U = self.upperCase1
  1210. return eval(self.actionList[action], locals())
  1211. class Layout:
  1212. """Class to layout the output.
  1213. """
  1214. #display width of hexdata+bitdata
  1215. width = 25
  1216. #general
  1217. def __init__(self, stream):
  1218. self.stream = stream
  1219. self.bitPtr = self.width
  1220. def makeHexData(self, pos):
  1221. """Produce hex dump of all data containing the bits
  1222. from pos to stream.pos
  1223. """
  1224. firstAddress = pos+7>>3
  1225. lastAddress = self.stream.pos+7>>3
  1226. return ''.join(map('{:02x} '.format,
  1227. self.stream.data[firstAddress:lastAddress]))
  1228. def formatBitData(self, pos, width1, width2=0):
  1229. """Show formatted bit data:
  1230. Bytes are separated by commas
  1231. whole bytes are displayed in hex
  1232. >>> Layout(olleke).formatBitData(6, 2, 16)
  1233. '|00h|2Eh,|00'
  1234. >>> Layout(olleke).formatBitData(4, 1, 0)
  1235. '1'
  1236. """
  1237. result = []
  1238. #make empty prefix code explicit
  1239. if width1==0: result = ['()', ',']
  1240. for width in width1, width2:
  1241. #skip empty width2
  1242. if width==0: continue
  1243. #build result backwards in a list
  1244. while width>0:
  1245. availableBits = 8-(pos&7)
  1246. if width<availableBits:
  1247. #read partial byte, beginning nor ending at boundary
  1248. data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1
  1249. result.append('{:0{}b}'.format(data, width))
  1250. elif availableBits<8:
  1251. #read rest of byte, ending at boundary
  1252. data = self.stream.data[pos>>3] >> (pos&7)
  1253. result.append('|{:0{}b}'.format(data, availableBits))
  1254. else:
  1255. #read whole byte (in hex), beginning and ending at boundary
  1256. data = self.stream.data[pos>>3]
  1257. result.append('|{:02X}h'.format(data))
  1258. width -= availableBits
  1259. pos += availableBits
  1260. #if width overshot from the availableBits subtraction, fix it
  1261. pos += width
  1262. #add comma to separate fields
  1263. result.append(',')
  1264. #concatenate pieces, reversed, skipping the last space
  1265. return ''.join(result[-2::-1])
  1266. def readPrefixCode(self, alphabet):
  1267. """give alphabet the prefix code that is read from the stream
  1268. Called for the following alphabets, in this order:
  1269. The alphabet in question must have a "logical" order,
  1270. otherwise the assignment of symbols doesn't work.
  1271. """
  1272. mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name))
  1273. if mode=='Complex':
  1274. #for a complex code, numberOfSymbols means hskip
  1275. self.readComplexCode(numberOfSymbols, alphabet)
  1276. return alphabet
  1277. else:
  1278. table = []
  1279. #Set table of lengths for mnemonic function
  1280. lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1]
  1281. #adjust mnemonic function of alphabet class
  1282. def myMnemonic(index):
  1283. return '{} bit{}: {}'.format(
  1284. lengths[i],
  1285. '' if lengths[i]==1 else 's',
  1286. alphabet.__class__.mnemonic(alphabet, index)
  1287. )
  1288. alphabet.mnemonic = myMnemonic
  1289. for i in range(numberOfSymbols):
  1290. table.append(self.verboseRead(alphabet, skipExtra=True).index)
  1291. #restore mnemonic
  1292. del alphabet.mnemonic
  1293. if numberOfSymbols==4:
  1294. #read tree shape to redefine lengths
  1295. lengths = self.verboseRead(TreeShapeAlhabet())
  1296. #construct the alphabet prefix code
  1297. alphabet.setLength(dict(zip(table, lengths)))
  1298. return alphabet
  1299. def readComplexCode(self, hskip, alphabet):
  1300. """Read complex code"""
  1301. stream = self.stream
  1302. #read the lengths for the length code
  1303. lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:]
  1304. codeLengths = {}
  1305. total = 0
  1306. lol = LengthOfLengthAlphabet('##'+alphabet.name)
  1307. #lengthCode will be used for coding the lengths of the new code
  1308. #we use it for display until now; definition comes below
  1309. lengthCode = LengthAlphabet('#'+alphabet.name)
  1310. lengthIter = iter(lengths)
  1311. lengthsLeft = len(lengths)
  1312. while total<32 and lengthsLeft>0:
  1313. lengthsLeft -= 1
  1314. newSymbol = next(lengthIter)
  1315. lol.description = str(lengthCode[newSymbol])
  1316. length = self.verboseRead(lol)
  1317. if length:
  1318. codeLengths[newSymbol] = length
  1319. total += 32>>length
  1320. if total>32: raise ValueError("Stream format")
  1321. if len(codeLengths)==1: codeLengths[list(codeLengths.keys())[0]] = 0
  1322. #Now set the encoding of the lengthCode
  1323. lengthCode.setLength(codeLengths)
  1324. print("***** Lengths for {} will be coded as:".format(alphabet.name))
  1325. lengthCode.showCode()
  1326. #Now determine the symbol lengths with the lengthCode
  1327. symbolLengths = {}
  1328. total = 0
  1329. lastLength = 8
  1330. alphabetIter = iter(alphabet)
  1331. while total<32768:
  1332. #look ahead to see what is going to happen
  1333. length = lengthCode.decodePeek(
  1334. self.stream.peek(lengthCode.maxLength))[1].index
  1335. #in every branch, set lengthCode.description to explanatory text
  1336. #lengthCode calls format(symbol, extra) with this string
  1337. if length==0:
  1338. symbol = next(alphabetIter)
  1339. lengthCode.description = 'symbol {} unused'.format(symbol)
  1340. self.verboseRead(lengthCode)
  1341. #unused symbol
  1342. continue
  1343. if length==16:
  1344. lengthCode.description = \
  1345. '{1}+3 symbols of length '+str(lastLength)
  1346. extra = self.verboseRead(lengthCode)
  1347. #scan series of 16s (repeat counts)
  1348. #start with repeat count 2
  1349. repeat = 2
  1350. startSymbol = next(alphabetIter)
  1351. endSymbol = next(alphabetIter)
  1352. symbolLengths[startSymbol.index] = \
  1353. symbolLengths[endSymbol.index] = lastLength
  1354. #count the two just defined symbols
  1355. total += 2*32768>>lastLength
  1356. #note: loop may end because we're there
  1357. #even if a 16 _appears_ to follow
  1358. while True:
  1359. #determine last symbol
  1360. oldRepeat = repeat
  1361. repeat = (repeat-2<<2)+extra+3
  1362. #read as many symbols as repeat increased
  1363. for i in range(oldRepeat, repeat):
  1364. endSymbol = next(alphabetIter)
  1365. symbolLengths[endSymbol.index] = lastLength
  1366. #compute new total; it may be end of loop
  1367. total += (repeat-oldRepeat)*32768>>lastLength
  1368. if total>=32768: break
  1369. #see if there is more to do
  1370. length = lengthCode.decodePeek(
  1371. self.stream.peek(lengthCode.maxLength))[1].index
  1372. if length!=16: break
  1373. lengthCode.description = 'total {}+{{1}} symbols'.format(
  1374. (repeat-2<<2)+3)
  1375. extra = self.verboseRead(lengthCode)
  1376. elif length==17:
  1377. #read, and show explanation
  1378. lengthCode.description = '{1}+3 unused'
  1379. extra = self.verboseRead(lengthCode)
  1380. #scan series of 17s (groups of zero counts)
  1381. #start with repeat count 2
  1382. repeat = 2
  1383. startSymbol = next(alphabetIter)
  1384. endSymbol = next(alphabetIter)
  1385. #note: loop will not end with total==32768,
  1386. #since total doesn't change here
  1387. while True:
  1388. #determine last symbol
  1389. oldRepeat = repeat
  1390. repeat = (repeat-2<<3)+extra+3
  1391. #read as many symbols as repeat increases
  1392. for i in range(repeat-oldRepeat):
  1393. endSymbol = next(alphabetIter)
  1394. #see if there is more to do
  1395. length = lengthCode.decodePeek(
  1396. self.stream.peek(lengthCode.maxLength))[1].index
  1397. if length!=17: break
  1398. lengthCode.description = 'total {}+{{1}} unused'.format(
  1399. (repeat-2<<3)+3)
  1400. extra = self.verboseRead(lengthCode)
  1401. else:
  1402. symbol = next(alphabetIter)
  1403. #double braces for format
  1404. char = str(symbol)
  1405. if char in '{}': char *= 2
  1406. lengthCode.description = \
  1407. 'Length for {} is {{0.index}} bits'.format(char)
  1408. #output is not needed (will be 0)
  1409. self.verboseRead(lengthCode)
  1410. symbolLengths[symbol.index] = length
  1411. total += 32768>>length
  1412. lastLength = length
  1413. assert total==32768
  1414. alphabet.setLength(symbolLengths)
  1415. print('End of table. Prefix code '+alphabet.name+':')
  1416. alphabet.showCode()
  1417. #stream
  1418. def processStream(self):
  1419. """Process a brotli stream.
  1420. """
  1421. print('addr hex{:{}s}binary context explanation'.format(
  1422. '', self.width-10))
  1423. print('Stream header'.center(60, '-'))
  1424. self.windowSize = self.verboseRead(WindowSizeAlphabet())
  1425. print('Metablock header'.center(60, '='))
  1426. self.ISLAST = False
  1427. self.output = bytearray()
  1428. while not self.ISLAST:
  1429. self.ISLAST = self.verboseRead(
  1430. BoolCode('LAST', description="Last block"))
  1431. if self.ISLAST:
  1432. if self.verboseRead(
  1433. BoolCode('EMPTY', description="Empty block")): break
  1434. if self.metablockLength(): continue
  1435. if not self.ISLAST and self.uncompressed(): continue
  1436. print('Block type descriptors'.center(60, '-'))
  1437. self.numberOfBlockTypes = {}
  1438. self.currentBlockCounts = {}
  1439. self.blockTypeCodes = {}
  1440. self.blockCountCodes = {}
  1441. for blockType in (L,I,D): self.blockType(blockType)
  1442. print('Distance code parameters'.center(60, '-'))
  1443. self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet())
  1444. self.readLiteralContextModes()
  1445. print('Context maps'.center(60, '-'))
  1446. self.cmaps = {}
  1447. #keep the number of each kind of prefix tree for the last loop
  1448. numberOfTrees = {I: self.numberOfBlockTypes[I]}
  1449. for blockType in (L,D):
  1450. numberOfTrees[blockType] = self.contextMap(blockType)
  1451. print('Prefix code lists'.center(60, '-'))
  1452. self.prefixCodes = {}
  1453. for blockType in (L,I,D):
  1454. self.readPrefixArray(blockType, numberOfTrees[blockType])
  1455. self.metablock()
  1456. #metablock header
  1457. def verboseRead(self, alphabet, context='', skipExtra=False):
  1458. """Read symbol and extra from stream and explain what happens.
  1459. Returns the value of the symbol
  1460. >>> olleke.pos = 0
  1461. >>> l = Layout(olleke)
  1462. >>> l.verboseRead(WindowSizeAlphabet())
  1463. 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
  1464. 4194288
  1465. """
  1466. #TODO 2: verbosity level, e.g. show only codes and maps in header
  1467. stream = self.stream
  1468. pos = stream.pos
  1469. if skipExtra:
  1470. length, symbol = alphabet.readTuple(stream)
  1471. extraBits, extra = 0, None
  1472. else:
  1473. length, symbol, extraBits, extra = alphabet.readTupleAndExtra(
  1474. stream)
  1475. #fields: address, hex data, binary data, name of alphabet, explanation
  1476. hexdata = self.makeHexData(pos)
  1477. addressField = '{:04x}'.format(pos+7>>3) if hexdata else ''
  1478. bitdata = self.formatBitData(pos, length, extraBits)
  1479. #bitPtr moves bitdata so that the bytes are easier to read
  1480. #jump back to right if a new byte starts
  1481. if '|' in bitdata[1:]:
  1482. #start over on the right side
  1483. self.bitPtr = self.width
  1484. fillWidth = self.bitPtr-(len(hexdata)+len(bitdata))
  1485. if fillWidth<0: fillWidth = 0
  1486. print('{:<5s} {:<{}s} {:7s} {}'.format(
  1487. addressField,
  1488. hexdata+' '*fillWidth+bitdata, self.width,
  1489. context+alphabet.name,
  1490. symbol if skipExtra else symbol.explanation(extra),
  1491. ))
  1492. #jump to the right if we started with a '|'
  1493. #because we didn't jump before printing
  1494. if bitdata.startswith('|'): self.bitPtr = self.width
  1495. else: self.bitPtr -= len(bitdata)
  1496. return symbol if skipExtra else symbol.value(extra)
  1497. def metablockLength(self):
  1498. """Read MNIBBLES and meta block length;
  1499. if empty block, skip block and return true.
  1500. """
  1501. self.MLEN = self.verboseRead(MetablockLengthAlphabet())
  1502. if self.MLEN:
  1503. return False
  1504. #empty block; skip and return False
  1505. self.verboseRead(ReservedAlphabet())
  1506. MSKIP = self.verboseRead(SkipLengthAlphabet())
  1507. self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
  1508. self.stream.pos += 8*MSKIP
  1509. print("Skipping to {:x}".format(self.stream.pos>>3))
  1510. return True
  1511. def uncompressed(self):
  1512. """If true, handle uncompressed data
  1513. """
  1514. ISUNCOMPRESSED = self.verboseRead(
  1515. BoolCode('UNCMPR', description='Is uncompressed?'))
  1516. if ISUNCOMPRESSED:
  1517. self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
  1518. print('Uncompressed data:')
  1519. self.output += self.stream.readBytes(self.MLEN)
  1520. print(outputFormatter(self.output[-self.MLEN:]))
  1521. return ISUNCOMPRESSED
  1522. def blockType(self, kind):
  1523. """Read block type switch descriptor for given kind of blockType."""
  1524. NBLTYPES = self.verboseRead(TypeCountAlphabet(
  1525. 'BT#'+kind[0].upper(),
  1526. description='{} block types'.format(kind),
  1527. ))
  1528. self.numberOfBlockTypes[kind] = NBLTYPES
  1529. if NBLTYPES>=2:
  1530. self.blockTypeCodes[kind] = self.readPrefixCode(
  1531. BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES))
  1532. self.blockCountCodes[kind] = self.readPrefixCode(
  1533. BlockCountAlphabet('BC'+kind[0].upper()))
  1534. blockCount = self.verboseRead(self.blockCountCodes[kind])
  1535. else:
  1536. blockCount = 1<<24
  1537. self.currentBlockCounts[kind] = blockCount
  1538. def readLiteralContextModes(self):
  1539. """Read literal context modes.
  1540. LSB6: lower 6 bits of last char
  1541. MSB6: upper 6 bits of last char
  1542. UTF8: roughly dependent on categories:
  1543. upper 4 bits depend on category of last char:
  1544. control/whitespace/space/ punctuation/quote/%/open/close/
  1545. comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant
  1546. lower 2 bits depend on category of 2nd last char:
  1547. space/punctuation/digit or upper/lowercase
  1548. signed: hamming weight of last 2 chars
  1549. """
  1550. print('Context modes'.center(60, '-'))
  1551. self.literalContextModes = []
  1552. for i in range(self.numberOfBlockTypes[L]):
  1553. self.literalContextModes.append(
  1554. self.verboseRead(LiteralContextMode(number=i)))
  1555. def contextMap(self, kind):
  1556. """Read context maps
  1557. Returns the number of differnt values on the context map
  1558. (In other words, the number of prefix trees)
  1559. """
  1560. NTREES = self.verboseRead(TypeCountAlphabet(
  1561. kind[0].upper()+'T#',
  1562. description='{} prefix trees'.format(kind)))
  1563. mapSize = {L:64, D:4}[kind]
  1564. if NTREES<2:
  1565. self.cmaps[kind] = [0]*mapSize
  1566. else:
  1567. #read CMAPkind
  1568. RLEMAX = self.verboseRead(RLEmaxAlphabet(
  1569. 'RLE#'+kind[0].upper(),
  1570. description=kind+' context map'))
  1571. alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX)
  1572. cmapCode = self.readPrefixCode(alphabet)
  1573. tableSize = mapSize*self.numberOfBlockTypes[kind]
  1574. cmap = []
  1575. while len(cmap)<tableSize:
  1576. cmapCode.description = 'map {}, entry {}'.format(
  1577. *divmod(len(cmap), mapSize))
  1578. count, value = self.verboseRead(cmapCode)
  1579. cmap.extend([value]*count)
  1580. assert len(cmap)==tableSize
  1581. IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF'))
  1582. if IMTF:
  1583. self.IMTF(cmap)
  1584. if kind==L:
  1585. print('Context maps for literal data:')
  1586. for i in range(0, len(cmap), 64):
  1587. print(*(
  1588. ''.join(map(str, cmap[j:j+8]))
  1589. for j in range(i, i+64, 8)
  1590. ))
  1591. else:
  1592. print('Context map for distances:')
  1593. print(*(
  1594. ''.join(map('{:x}'.format, cmap[i:i+4]))
  1595. for i in range(0, len(cmap), 4)
  1596. ))
  1597. self.cmaps[kind] = cmap
  1598. return NTREES
  1599. @staticmethod
  1600. def IMTF(v):
  1601. """In place inverse move to front transform.
  1602. """
  1603. #mtf is initialized virtually with range(infinity)
  1604. mtf = []
  1605. for i, vi in enumerate(v):
  1606. #get old value from mtf. If never seen, take virtual value
  1607. try: value = mtf.pop(vi)
  1608. except IndexError: value = vi
  1609. #put value at front
  1610. mtf.insert(0, value)
  1611. #replace transformed value
  1612. v[i] = value
  1613. def readPrefixArray(self, kind, numberOfTrees):
  1614. """Read prefix code array"""
  1615. prefixes = []
  1616. for i in range(numberOfTrees):
  1617. if kind==L: alphabet = LiteralAlphabet(i)
  1618. elif kind==I: alphabet = InsertAndCopyAlphabet(i)
  1619. elif kind==D: alphabet = DistanceAlphabet(
  1620. i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT)
  1621. self.readPrefixCode(alphabet)
  1622. prefixes.append(alphabet)
  1623. self.prefixCodes[kind] = prefixes
  1624. #metablock data
  1625. def metablock(self):
  1626. """Process the data.
  1627. Relevant variables of self:
  1628. numberOfBlockTypes[kind]: number of block types
  1629. currentBlockTypes[kind]: current block types (=0)
  1630. literalContextModes: the context modes for the literal block types
  1631. currentBlockCounts[kind]: counters for block types
  1632. blockTypeCodes[kind]: code for block type
  1633. blockCountCodes[kind]: code for block count
  1634. cmaps[kind]: the context maps (not for I)
  1635. prefixCodes[kind][#]: the prefix codes
  1636. lastDistances: the last four distances
  1637. lastChars: the last two chars
  1638. output: the result
  1639. """
  1640. print('Meta block contents'.center(60, '='))
  1641. self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1}
  1642. self.lastDistances = deque([17,16,11,4], maxlen=4)
  1643. #the current context mode is for block type 0
  1644. self.contextMode = ContextModeKeeper(self.literalContextModes[0])
  1645. wordList = WordList()
  1646. #setup distance callback function
  1647. def distanceCallback(symbol, extra):
  1648. "callback function for displaying decoded distance"
  1649. index, offset = symbol.value(extra)
  1650. if index:
  1651. #recent distance
  1652. distance = self.lastDistances[-index]+offset
  1653. return 'Distance: {}last{:+d}={}'.format(index, offset, distance)
  1654. #absolute value
  1655. if offset<=maxDistance:
  1656. return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset)
  1657. #word list value
  1658. action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen])
  1659. return '{}-{} gives word {},{} action {}'.format(
  1660. offset, maxDistance, copyLen, word, action)
  1661. for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback
  1662. blockLen = 0
  1663. #there we go
  1664. while blockLen<self.MLEN:
  1665. #get insert&copy command
  1666. litLen, copyLen, dist0Flag = self.verboseRead(
  1667. self.prefixCodes[I][
  1668. self.figureBlockType(I)])
  1669. #literal data
  1670. for i in range(litLen):
  1671. bt = self.figureBlockType(L)
  1672. cm = self.contextMode.getIndex()
  1673. ct = self.cmaps[L][bt<<6|cm]
  1674. char = self.verboseRead(
  1675. self.prefixCodes[L][ct],
  1676. context='{},{}='.format(bt,cm))
  1677. self.contextMode.add(char)
  1678. self.output.append(char)
  1679. blockLen += litLen
  1680. #check if we're done
  1681. if blockLen>=self.MLEN: return
  1682. #distance
  1683. #distances are computed relative to output length, at most window size
  1684. maxDistance = min(len(self.output), self.windowSize)
  1685. if dist0Flag:
  1686. distance = self.lastDistances[-1]
  1687. else:
  1688. bt = self.figureBlockType(D)
  1689. cm = {2:0, 3:1, 4:2}.get(copyLen, 3)
  1690. ct = self.cmaps[D][bt<<2|cm]
  1691. index, offset = self.verboseRead(
  1692. self.prefixCodes[D][ct],
  1693. context='{},{}='.format(bt,cm))
  1694. distance = self.lastDistances[-index]+offset if index else offset
  1695. if index==1 and offset==0:
  1696. #to make sure distance is not put in last distance list
  1697. dist0Flag = True
  1698. if distance<=maxDistance:
  1699. #copy from output
  1700. for i in range(
  1701. maxDistance-distance,
  1702. maxDistance-distance+copyLen):
  1703. self.output.append(self.output[i])
  1704. if not dist0Flag: self.lastDistances.append(distance)
  1705. comment = 'Seen before'
  1706. else:
  1707. #fetch from wordlist
  1708. newWord = wordList.word(copyLen, distance-maxDistance-1)
  1709. self.output.extend(newWord)
  1710. #adjust copyLen to reflect actual new data
  1711. copyLen = len(newWord)
  1712. comment = 'From wordlist'
  1713. blockLen += copyLen
  1714. print(' '*40,
  1715. comment,
  1716. ': "',
  1717. outputFormatter(self.output[-copyLen:]),
  1718. '"',
  1719. sep='')
  1720. self.contextMode.add(self.output[-2])
  1721. self.contextMode.add(self.output[-1])
  1722. def figureBlockType(self, kind):
  1723. counts, types = self.currentBlockCounts, self.currentBlockTypes
  1724. if counts[kind]==0:
  1725. newType = self.verboseRead(self.blockTypeCodes[kind])
  1726. if newType==-2: newType = types['P'+kind]
  1727. elif newType==-1:
  1728. newType = (types[kind]+1)%self.numberOfBlockTypes[kind]
  1729. types['P'+kind] = types[kind]
  1730. types[kind] = newType
  1731. counts[kind] = self.verboseRead(self.blockCountCodes[kind])
  1732. counts[kind] -=1
  1733. return types[kind]
  1734. __test__ = {
  1735. 'BitStream': """
  1736. >>> bs = BitStream(b'Jurjen')
  1737. >>> bs.readBytes(2)
  1738. b'Ju'
  1739. >>> bs.read(6) #r=01110010
  1740. 50
  1741. >>> bs
  1742. BitStream(pos=2:6)
  1743. >>> bs.peek(5) #j=01101010
  1744. 9
  1745. >>> bs.readBytes(2)
  1746. Traceback (most recent call last):
  1747. ...
  1748. ValueError: readBytes: need byte boundary
  1749. """,
  1750. 'Symbol': """
  1751. >>> a=Symbol(MetablockLengthAlphabet(),5)
  1752. >>> len(a)
  1753. 2
  1754. >>> int(a)
  1755. 5
  1756. >>> a.bitPattern()
  1757. '01'
  1758. >>> a.value(200000)
  1759. 200001
  1760. >>> a.explanation(300000)
  1761. 'data length: 493e0h+1=300001'
  1762. """,
  1763. 'RangeDecoder': """
  1764. >>> a=RangeDecoder(bitLength=3)
  1765. >>> len(a)
  1766. 8
  1767. >>> a.name='t'
  1768. >>> list(a)
  1769. [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)]
  1770. >>> a[2]
  1771. Symbol(t, 2)
  1772. >>> a.bitPattern(4)
  1773. '100'
  1774. >>> a.length(2)
  1775. 3
  1776. >>> a.decodePeek(15)
  1777. (3, Symbol(t, 7))
  1778. >>>
  1779. """,
  1780. 'PrefixDecoder': """
  1781. >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4})
  1782. >>> len(a)
  1783. 4
  1784. >>> a.name='t'
  1785. >>> list(a)
  1786. [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)]
  1787. >>> a.decodePeek(22)
  1788. (1, Symbol(t, 1))
  1789. >>> a.decodePeek(27)
  1790. (3, Symbol(t, 3))
  1791. >>> a.length(1)
  1792. 1
  1793. >>> a.length(4)
  1794. 3
  1795. """,
  1796. 'Code': """
  1797. >>> a=Code('t',alphabetSize=10)
  1798. >>> len(a)
  1799. 10
  1800. >>> a.showCode()
  1801. 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9
  1802. >>> a.setLength({2:1,3:2,5:3,6:3})
  1803. >>> a.showCode()
  1804. 0:2 01:3 011:5 111:6
  1805. >>> len(a)
  1806. 4
  1807. >>> def callback(i): return 'call{}back'.format(i)
  1808. >>> a=Code('t',callback=callback,bitLength=3)
  1809. >>> a[6].explanation()
  1810. 'call6back'
  1811. """,
  1812. 'WithExtra': """
  1813. >>> class A(WithExtra):
  1814. ... extraTable = [0,1,1,2,2]
  1815. >>> a=A('t',alphabetSize=5)
  1816. >>> a[1]
  1817. Symbol(t, 1)
  1818. >>> a.extraBits(2)
  1819. 1
  1820. >>> a.mnemonic(4)
  1821. '4'
  1822. >>> a.readTupleAndExtra(BitStream(b'\x5b'))
  1823. (3, Symbol(t, 3), 2, 3)
  1824. """,
  1825. 'BoolCode': """
  1826. >>> BoolCode('test')[0].explanation()
  1827. '0: False'
  1828. """,
  1829. 'Enumerator': """
  1830. >>> class A(Enumerator):
  1831. ... extraTable = [0,1,1,2,2]
  1832. ... value0=3
  1833. >>> a=A(alphabetLength=5)
  1834. >>> a.value(3)
  1835. Traceback (most recent call last):
  1836. ...
  1837. TypeError: value() missing 1 required positional argument: 'extra'
  1838. >>> a.explanation(3,4)
  1839. 'xx 011: 8-11; 8+4=12'
  1840. """,
  1841. 'WindowSizeAlphabet': """
  1842. >>> windowSizeAlphabet = WindowSizeAlphabet()
  1843. >>> windowSizeAlphabet[0]
  1844. Traceback (most recent call last):
  1845. ...
  1846. ValueError: No symbol WindowSizeAlphabet[0]
  1847. >>> len(windowSizeAlphabet)
  1848. 16
  1849. >>> windowSizeAlphabet[21]
  1850. Symbol(WSIZE, 21)
  1851. >>> windowSizeAlphabet[21].bitPattern()
  1852. '1001'
  1853. >>> windowSizeAlphabet[21].extraBits()
  1854. 0
  1855. >>> windowSizeAlphabet[21].index
  1856. 21
  1857. >>> windowSizeAlphabet[10].value()
  1858. 1008
  1859. >>> windowSizeAlphabet[10].explanation()
  1860. 'windowsize=(1<<10)-16=1008'
  1861. >>> windowSizeAlphabet.showCode()
  1862. 0:65520 1100001:16368 1110001:32752 0011:262128
  1863. 0000001:131056 0010001:None 1001:2097136 1011:4194288
  1864. 1000001:4080 1010001:8176 0101:524272 0111:1048560
  1865. 0100001:1008 0110001:2032 1101:8388592 1111:16777200
  1866. """,
  1867. 'TypeCountAlphabet': """
  1868. >>> typeCountAlphabet = TypeCountAlphabet(description='bananas')
  1869. >>> len(typeCountAlphabet)
  1870. 9
  1871. >>> typeCountAlphabet[3]
  1872. Symbol(BT#, 3)
  1873. >>> typeCountAlphabet[9]
  1874. Traceback (most recent call last):
  1875. ...
  1876. ValueError: No symbol TypeCountAlphabet[9]
  1877. >>> print(typeCountAlphabet[3])
  1878. xx,0101
  1879. >>> typeCountAlphabet[8].value(127)
  1880. 256
  1881. >>> typeCountAlphabet[4].explanation(2)
  1882. 'xxx,0111: 11 bananas'
  1883. >>> typeCountAlphabet[0].explanation()
  1884. '0: 1 banana'
  1885. """,
  1886. 'DistanceParamAlphabet': """
  1887. >>> dpa = DistanceParamAlphabet()
  1888. >>> dpa.showCode()
  1889. 00:PF0 01:PF1 10:PF2 11:PF3
  1890. >>> dpa.readTupleAndExtra(BitStream(b'\\x29'))
  1891. (2, Symbol(DIST, 1), 4, 10)
  1892. >>> dpa.explanation(2, 5)
  1893. '2 postfix bits and 0101<<2=20 direct codes'
  1894. """,
  1895. 'LiteralAlphabet': """
  1896. >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS
  1897. 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0
  1898. 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1
  1899. 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2
  1900. ...
  1901. 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff
  1902. 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc
  1903. 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd
  1904. 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce
  1905. 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf
  1906. """,
  1907. 'BlockCountAlphabet': """
  1908. >>> bc=BlockCountAlphabet('BCL')
  1909. >>> len(bc)
  1910. 26
  1911. >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02')
  1912. >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
  1913. 'Block count: xx 00000: 1-4; 1+2=3'
  1914. >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
  1915. 'Block count: xxx 00110: 33-40; 33+0=33'
  1916. >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
  1917. 'Block count: xxxxxx 10001: 305-368; 305+28=333'
  1918. >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
  1919. 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333'
  1920. """,
  1921. 'Layout': """
  1922. >>> olleke.pos = 0
  1923. >>> l = Layout(olleke)
  1924. >>> l.verboseRead(WindowSizeAlphabet())
  1925. 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
  1926. 4194288
  1927. >>> l.verboseRead(BoolCode('LAST', description="Last block"))
  1928. 1 LAST Last block: 1: True
  1929. True
  1930. >>> l.verboseRead(BoolCode('EMPTY', description="Empty block"))
  1931. 0 EMPTY Empty block: 0: False
  1932. False
  1933. >>> l.verboseRead(MetablockLengthAlphabet())
  1934. 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
  1935. 47
  1936. >>> olleke.pos = 76
  1937. >>> l = Layout(olleke)
  1938. >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True)
  1939. 000a 82 10|1100 D0 10[15*x]-3
  1940. >>> x.explanation(0x86a3)
  1941. '10[1000011010100011]-3: [0]+100000'
  1942. """,
  1943. 'olleke': """
  1944. >>> olleke.pos = 0
  1945. >>> try: Layout(olleke).processStream()
  1946. ... except NotImplementedError: pass
  1947. ... #doctest: +REPORT_NDIFF
  1948. addr hex binary context explanation
  1949. -----------------------Stream header------------------------
  1950. 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
  1951. ======================Metablock header======================
  1952. 1 LAST Last block: 1: True
  1953. 0 EMPTY Empty block: 0: False
  1954. 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
  1955. -------------------Block type descriptors-------------------
  1956. 0003 00 0 BT#L 0: 1 literal block type
  1957. 0 BT#I 0: 1 insert&copy block type
  1958. 0 BT#D 0: 1 distance block type
  1959. ------------------Distance code parameters------------------
  1960. 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
  1961. -----------------------Context modes------------------------
  1962. 10 LC0 Context mode for type 0: 2(UTF8)
  1963. ------------------------Context maps------------------------
  1964. 0 LT# 0: 1 literal prefix tree
  1965. 0 DT# 0: 1 distance prefix tree
  1966. ---------------------Prefix code lists----------------------
  1967. 10 PFX L0 is complex with lengths 3,4,0,5,17...
  1968. 0005 4f 1|0 ##L0 len 3: coded with 3 bits
  1969. 0111 ##L0 len 4: coded with 1 bits
  1970. 10 ##L0 unused: coded with 3 bits
  1971. 0006 d6 0|0 ##L0 len 5: skipped
  1972. 011 ##L0 zero xxx: coded with 2 bits
  1973. ***** Lengths for L0 will be coded as:
  1974. 0:len 4 01:zero xxx 011:unused 111:len 3
  1975. 0007 95 1|11,01 #L0 7+3 unused
  1976. 0 #L0 Length for \\n is 4 bits
  1977. 001,01 #L0 1+3 unused
  1978. 0008 44 010,0|1 #L0 total 19+2 unused
  1979. 0 #L0 Length for " " is 4 bits
  1980. 0 #L0 Length for ! is 4 bits
  1981. 0009 cb 011,|01 #L0 3+3 unused
  1982. |110,01 #L0 total 35+6 unused
  1983. 000a 82 0 #L0 Length for K is 4 bits
  1984. 000,01 #L0 0+3 unused
  1985. 0 #L0 Length for O is 4 bits
  1986. 000b 4d 01|1 #L0 symbol P unused
  1987. 011 #L0 symbol Q unused
  1988. 0 #L0 Length for R is 4 bits
  1989. 000c 88 000,|01 #L0 0+3 unused
  1990. |100,01 #L0 total 11+4 unused
  1991. 000d b6 0 #L0 Length for b is 4 bits
  1992. 011 #L0 symbol c unused
  1993. 011 #L0 symbol d unused
  1994. 000e 27 11|1 #L0 Length for e is 3 bits
  1995. 010,01 #L0 2+3 unused
  1996. |0 #L0 Length for k is 4 bits
  1997. 000f 1f 111 #L0 Length for l is 3 bits
  1998. 011 #L0 symbol m unused
  1999. 0 #L0 Length for n is 4 bits
  2000. |0 #L0 Length for o is 4 bits
  2001. 0010 c1 000,01 #L0 0+3 unused
  2002. 0 #L0 Length for s is 4 bits
  2003. 0011 b4 0|11 #L0 symbol t unused
  2004. 0 #L0 Length for u is 4 bits
  2005. End of table. Prefix code L0:
  2006. 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s
  2007. 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u
  2008. 11,01 PFX IC0 is simple with 4 code words
  2009. 0012 2a |2Ah|10 IC0 ? bits: I5C4
  2010. 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7
  2011. 0015 22 0010|111011 IC0 ? bits: I8+xC5
  2012. 0016 8c 001100|0010 IC0 ? bits: I0C14+xx
  2013. 0 SHAPE False: lengths 2,2,2,2
  2014. 0017 74 10,0|1 PFX D0 is simple with 3 code words
  2015. 0018 a6 0|01110 D0 1 bit: 2last-3
  2016. 010011 D0 2 bits: 11xx-3
  2017. 0019 aa 01010|1 D0 2 bits: 11xxx-3
  2018. ====================Meta block contents=====================
  2019. |1,01 IC0 Literal: 9, copy: 5
  2020. 001a 41 0001 0,0=L0 O
  2021. 100 0,48=L0 l
  2022. 001b a2 10|0 0,62=L0 l
  2023. 000 0,63=L0 e
  2024. 001c a1 1|101 0,59=L0 k
  2025. 000 0,63=L0 e
  2026. |1010 0,59=L0 " "
  2027. 001d b5 0101 0,11=L0 b
  2028. |1011 0,60=L0 o
  2029. 001e 24 0 0,3=D0 Distance: 2last-3=8
  2030. Seen before: "lleke"
  2031. 0,10 IC0 Literal: 6, copy: 7
  2032. |0010 0,59=L0 \\n
  2033. 001f 89 1001 0,7=L0 R
  2034. 000 0,52=L0 e
  2035. 0020 fa 010|1 0,58=L0 b
  2036. 1111 0,63=L0 u
  2037. 0021 eb 011|1 0,59=L0 s
  2038. 11,01 0,3=D0 Absolute value: 12 (pos 8)
  2039. Seen before: "olleke\\n"
  2040. 0022 db 01,1|1 IC0 Literal: 0, copy: 15
  2041. |110,11 0,3=D0 Absolute value: 27 (pos 0)
  2042. Seen before: "Olleke bolleke\\n"
  2043. 0023 f8 00 IC0 Literal: 5, copy: 4
  2044. 1110 0,7=L0 K
  2045. 0024 2c 00|11 0,52=L0 n
  2046. 1011 0,62=L0 o
  2047. 0025 0d 1|00 0,59=L0 l
  2048. 0110 0,63=L0 !
  2049. """,
  2050. 'file': """
  2051. >>> try: Layout(BitStream(
  2052. ... open("./tests/testdata/10x10y.compressed",'rb')
  2053. ... .read())).processStream()
  2054. ... except NotImplementedError: pass
  2055. addr hex binary context explanation
  2056. -----------------------Stream header------------------------
  2057. 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
  2058. ======================Metablock header======================
  2059. 1 LAST Last block: 1: True
  2060. 0 EMPTY Empty block: 0: False
  2061. 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
  2062. -------------------Block type descriptors-------------------
  2063. 0003 00 0 BT#L 0: 1 literal block type
  2064. 0 BT#I 0: 1 insert&copy block type
  2065. 0 BT#D 0: 1 distance block type
  2066. ------------------Distance code parameters------------------
  2067. 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
  2068. -----------------------Context modes------------------------
  2069. 10 LC0 Context mode for type 0: 2(UTF8)
  2070. ------------------------Context maps------------------------
  2071. 0 LT# 0: 1 literal prefix tree
  2072. 0 DT# 0: 1 distance prefix tree
  2073. ---------------------Prefix code lists----------------------
  2074. 0005 b0 0|1,01 PFX L0 is simple with 2 code words
  2075. 0006 b2 0|1011000 L0 1 bit: X
  2076. 0007 ea 0|1011001 L0 1 bit: Y
  2077. 01,01 PFX IC0 is simple with 2 code words
  2078. 0008 81 0000001|111 IC0 1 bit: I1C9&D=0
  2079. 0009 47 02 0|47h|1 IC0 1 bit: I1C9
  2080. 00,01 PFX D0 is simple with 1 code word
  2081. 000b 8a 010|000 D0 0 bits: 10x-3
  2082. ====================Meta block contents=====================
  2083. 1 IC0 Literal: 1, copy: 9
  2084. 0 0,0=L0 X
  2085. 0,() 0,3=D0 Absolute value: 1 (pos 0)
  2086. Seen before: "XXXXXXXXX"
  2087. 0 IC0 Literal: 1, copy: 9, same distance
  2088. |1 0,54=L0 Y
  2089. Seen before: "YYYYYYYYY"
  2090. """,
  2091. 'XY': """
  2092. >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream()
  2093. ... except NotImplementedError: pass
  2094. addr hex binary context explanation
  2095. -----------------------Stream header------------------------
  2096. 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
  2097. ======================Metablock header======================
  2098. 1 LAST Last block: 1: True
  2099. 0 EMPTY Empty block: 0: False
  2100. 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
  2101. -------------------Block type descriptors-------------------
  2102. 0003 00 0 BT#L 0: 1 literal block type
  2103. 0 BT#I 0: 1 insert&copy block type
  2104. 0 BT#D 0: 1 distance block type
  2105. ------------------Distance code parameters------------------
  2106. 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
  2107. -----------------------Context modes------------------------
  2108. 10 LC0 Context mode for type 0: 2(UTF8)
  2109. ------------------------Context maps------------------------
  2110. 0 LT# 0: 1 literal prefix tree
  2111. 0 DT# 0: 1 distance prefix tree
  2112. ---------------------Prefix code lists----------------------
  2113. 0005 b0 0|1,01 PFX L0 is simple with 2 code words
  2114. 0006 b2 0|1011000 L0 1 bit: X
  2115. 0007 82 0|1011001 L0 1 bit: Y
  2116. 00,01 PFX IC0 is simple with 1 code word
  2117. 0008 84 0000100|100 IC0 0 bits: I4C6&D=0
  2118. 0009 00 00,0|1 PFX D0 is simple with 1 code word
  2119. 000a e0 0|00000 D0 0 bits: last
  2120. ====================Meta block contents=====================
  2121. () IC0 Literal: 4, copy: 6, same distance
  2122. 0 0,0=L0 X
  2123. 0 0,52=L0 X
  2124. 0 0,54=L0 X
  2125. 0 0,54=L0 X
  2126. Seen before: "XXXXXX"
  2127. () IC0 Literal: 4, copy: 6, same distance
  2128. 1 0,54=L0 Y
  2129. 1 0,54=L0 Y
  2130. |1 0,54=L0 Y
  2131. 000b 01 1 0,54=L0 Y
  2132. Seen before: "YYYYYY"
  2133. """,
  2134. 'empty': """
  2135. >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream()
  2136. ... except NotImplementedError: pass
  2137. addr hex binary context explanation
  2138. -----------------------Stream header------------------------
  2139. 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056
  2140. ======================Metablock header======================
  2141. |1 LAST Last block: 1: True
  2142. 0001 16 0 EMPTY Empty block: 0: False
  2143. 11 MLEN 11: empty block
  2144. 0 RSVD Reserved (must be zero)
  2145. 0002 00 000000|00,01 SKIP skip length: 0h+1=1
  2146. |00 SKIP 2 bits ignored
  2147. Skipping to 4
  2148. """,
  2149. }
  2150. if __name__ == '__main__':
  2151. import os
  2152. import sys
  2153. here = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
  2154. DICTIONARY_PATH = os.path.realpath(os.path.join(here, DICTIONARY_PATH))
  2155. if len(sys.argv) > 1:
  2156. l = Layout(BitStream(open(sys.argv[1],'rb').read()))
  2157. l.processStream()
  2158. else:
  2159. sys.path.append("h:/Persoonlijk/bin")
  2160. try:
  2161. import brotli
  2162. open('brotlidump.br', 'wb').write(
  2163. brotli.compress(
  2164. open('brotlidump.py', 'r').read()
  2165. ))
  2166. olleke = BitStream(brotli.compress(
  2167. 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!'))
  2168. except ImportError: pass
  2169. import doctest
  2170. doctest.testmod(optionflags=doctest.REPORT_NDIFF
  2171. #|doctest.FAIL_FAST
  2172. )