mutool.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. #!/usr/bin/env python3
  2. '''
  3. Intended to behave exactly like mutool, but uses the mupdf python => C++ =>
  4. mupdf.so wrappers.
  5. The code is intended to be similar to the mutool C code, to simplify
  6. comparison.
  7. '''
  8. import getopt
  9. import os
  10. import sys
  11. import textwrap
  12. if os.environ.get('MUPDF_PYTHON') in ('swig', None):
  13. # PYTHONPATH should have been set up to point to a build/shared-*/
  14. # directory containing mupdf.so generated by scripts/mupdfwrap.py and SWIG.
  15. import mupdf
  16. elif os.environ.get('MUPDF_PYTHON') == 'cppyy':
  17. sys.path.insert(0, os.path.abspath(f'{__file__}/../../platform/python'))
  18. import mupdf_cppyy
  19. del sys.path[0]
  20. mupdf = mupdf_cppyy.cppyy.gbl.mupdf
  21. else:
  22. raise Exception(f'Unrecognised $MUPDF_PYTHON: {os.environ.get("MUPDF_PYTHON")}')
  23. def usage():
  24. print( textwrap.dedent('''
  25. usage: mutool.py <command> [options]
  26. \tclean\t-- rewrite pdf file
  27. \tconvert\t-- convert document
  28. \ttrace\t-- trace device calls
  29. \tdraw\t-- convert document
  30. '''))
  31. # Things for clean
  32. #
  33. def clean_usage():
  34. print(textwrap.dedent(
  35. f'''
  36. usage: mutool clean [options] input.pdf [output.pdf] [pages]
  37. \t-p -\tpassword
  38. \t-g\tgarbage collect unused objects
  39. \t-gg\tin addition to -g compact xref table
  40. \t-ggg\tin addition to -gg merge duplicate objects
  41. \t-gggg\tin addition to -ggg check streams for duplication
  42. \t-l\tlinearize PDF
  43. \t-D\tsave file without encryption
  44. \t-E -\tsave file with new encryption (rc4-40, rc4-128, aes-128, or aes-256)
  45. \t-O -\towner password (only if encrypting)
  46. \t-U -\tuser password (only if encrypting)
  47. \t-P -\tpermission flags (only if encrypting)
  48. \t-a\tascii hex encode binary streams
  49. \t-d\tdecompress streams
  50. \t-z\tdeflate uncompressed streams
  51. \t-f\tcompress font streams
  52. \t-i\tcompress image streams
  53. \t-c\tclean content streams
  54. \t-s\tsanitize content streams
  55. \t-A\tcreate appearance streams for annotations
  56. \t-AA\trecreate appearance streams for annotations
  57. \tpages\tcomma separated list of page numbers and ranges
  58. '''
  59. ))
  60. sys.exit(1)
  61. def clean(argv):
  62. outfile = 'out.pdf'
  63. password = ''
  64. opts = mupdf.PdfCleanOptions()
  65. opts.write.do_garbage += 1
  66. errors = 0
  67. items, argv = getopt.getopt( argv, 'adfgilp:sczDAE:O:U:P:')
  68. for option, value in items:
  69. if 0: pass # lgtm [py/unreachable-statement]
  70. elif option == '-p': password = value
  71. elif option == '-d': opts.write.do_decompress += 1
  72. elif option == '-z': opts.write.do_compress += 1
  73. elif option == '-f': opts.write.do_compress_fonts += 1
  74. elif option == '-i': opts.write.do_compress_images += 1
  75. elif option == '-a': opts.write.do_ascii += 1
  76. elif option == '-g': opts.write.do_garbage += 1
  77. elif option == '-l': opts.write.do_linear += 1
  78. elif option == '-c': opts.write.do_clean += 1
  79. elif option == '-s': opts.write.do_sanitize += 1
  80. elif option == '-A': opts.write.do_appearance += 1
  81. elif option == '-D': opts.write.do_encrypt = PDF_ENCRYPT_NONE
  82. elif option == '-E': opts.write.do_encrypt = encrypt_method_from_string(value)
  83. elif option == '-P': opts.write.permissions = int(value)
  84. elif option == '-O': opts.write.opwd_utf8 = value[:128]
  85. elif option == '-U': opts.write.upwd_utf8 = value[:128]
  86. else:
  87. clean_usage()
  88. if (opts.write.do_ascii or opts.write.do_decompress) and not opts.write.do_compress:
  89. opts.write.do_pretty = 1
  90. if not argv:
  91. clean_usage()
  92. infile = argv.pop(0)
  93. if argv and '.pdf' in argv[0].lower():
  94. outfile = argv.pop(0)
  95. try:
  96. mupdf.pdf_clean_file(infile, outfile, password, opts, argv)
  97. except Exception as e:
  98. print( f'mupdf.pdf_clean_file() failed: {e}')
  99. errors += 1
  100. if 0:
  101. # Enable for debugging.
  102. import traceback
  103. traceback.print_exc()
  104. return errors != 0;
  105. # Things for draw.
  106. #
  107. import mutool_draw
  108. draw = mutool_draw.draw
  109. # Things for convert.
  110. #
  111. def convert_usage():
  112. print( textwrap.dedent(
  113. f'''
  114. mutool convert version {mupdf.FZ_VERSION}
  115. Usage: mutool convert [options] file [pages]
  116. \t-p -\tpassword
  117. \t-A -\tnumber of bits of antialiasing (0 to 8)
  118. \t-W -\tpage width for EPUB layout
  119. \t-H -\tpage height for EPUB layout
  120. \t-S -\tfont size for EPUB layout
  121. \t-U -\tfile name of user stylesheet for EPUB layout
  122. \t-X\tdisable document styles for EPUB layout
  123. \t-o -\toutput file name (%d for page number)
  124. \t-F -\toutput format (default inferred from output file name)
  125. \t\t\traster: cbz, png, pnm, pgm, ppm, pam, pbm, pkm.
  126. \t\t\tprint-raster: pcl, pclm, ps, pwg.
  127. \t\t\tvector: pdf, svg.
  128. \t\t\ttext: html, xhtml, text, stext.
  129. \t-O -\tcomma separated list of options for output format
  130. \tpages\tcomma separated list of page ranges (N=last page)
  131. '''
  132. ))
  133. print( mupdf.fz_draw_options_usage)
  134. print( mupdf.fz_pcl_write_options_usage)
  135. print( mupdf.fz_pclm_write_options_usage)
  136. print( mupdf.fz_pwg_write_options_usage)
  137. print( mupdf.fz_stext_options_usage)
  138. print( mupdf.fz_pdf_write_options_usage)
  139. print( mupdf.fz_svg_write_options_usage)
  140. sys.exit(1)
  141. def convert_runpage( doc, number, out):
  142. page = mupdf.FzPage( doc, number - 1)
  143. mediabox = page.fz_bound_page()
  144. dev = out.fz_begin_page(mediabox)
  145. page.fz_run_page( dev, mupdf.FzMatrix(mupdf.fz_identity), mupdf.FzCookie())
  146. out.fz_end_page()
  147. def convert_runrange( doc, count, range_, out):
  148. start = None
  149. end = None
  150. while 1:
  151. range_, start, end = mupdf.fz_parse_page_range( range_, count)
  152. if range_ is None:
  153. break
  154. step = +1 if end > start else -1
  155. for i in range( start, end, step):
  156. convert_runpage( doc, i, out)
  157. def convert( argv):
  158. # input options
  159. password = ''
  160. alphabits = 8
  161. layout_w = mupdf.FZ_DEFAULT_LAYOUT_W
  162. layout_h = mupdf.FZ_DEFAULT_LAYOUT_H
  163. layout_em = mupdf.FZ_DEFAULT_LAYOUT_EM
  164. layout_css = None
  165. layout_use_doc_css = 1
  166. # output options
  167. output = None
  168. format_ = None
  169. options = ''
  170. items, argv = getopt.getopt( argv, 'p:A:W:H:S:U:Xo:F:O:')
  171. for option, value in items:
  172. if 0: pass # lgtm [py/unreachable-statement]
  173. elif option == '-p': password = value
  174. elif option == '-A': alphabits = int(value)
  175. elif option == '-W': layout_w = float( value)
  176. elif option == '-H': layout_h = float( value)
  177. elif option == '-S': layout_em = float( value)
  178. elif option == '-U': layout_css = value
  179. elif option == '-X': layout_use_doc_css = 0
  180. elif option == '-o': output = value
  181. elif option == '-F': format_ = value
  182. elif option == '-O': options = value
  183. else: assert 0
  184. if not argv or (not format_ and not output):
  185. convert_usage()
  186. mupdf.fz_set_aa_level( alphabits)
  187. if layout_css:
  188. buf = mupdf.FzBuffer( layout_css)
  189. mupdf.fz_set_user_css( buf.string_from_buffer())
  190. mupdf.fz_set_use_document_css(layout_use_doc_css)
  191. if format_:
  192. out = mupdf.FzDocumentWriter( output, format_, options)
  193. else:
  194. out = mupdf.FzDocumentWriter( output, options, mupdf.FzDocumentWriter.OutputType_PDF)
  195. i = 0
  196. while 1:
  197. if i >= len( argv):
  198. break
  199. arg = argv[i]
  200. doc = mupdf.FzDocument( arg)
  201. if doc.fz_needs_password():
  202. if not doc.fz_authenticate_password( password):
  203. raise Exception( f'cannot authenticate password: {arg}')
  204. doc.fz_layout_document( layout_w, layout_h, layout_em)
  205. count = doc.fz_count_pages()
  206. range_ = '1-N'
  207. if i + 1 < len(argv) and mupdf.fz_is_page_range(ctx, argv[i+1]):
  208. i += 1
  209. range_ = argv[i]
  210. convert_runrange( doc, count, range_, out)
  211. i += 1
  212. out.fz_close_document_writer()
  213. # Things for trace.
  214. #
  215. def trace_usage():
  216. print( textwrap.dedent('''
  217. Usage: mutool trace [options] file [pages]
  218. \t-p -\tpassword
  219. \t-W -\tpage width for EPUB layout
  220. \t-H -\tpage height for EPUB layout
  221. \t-S -\tfont size for EPUB layout
  222. \t-U -\tfile name of user stylesheet for EPUB layout
  223. \t-X\tdisable document styles for EPUB layout
  224. \t-d\tuse display list
  225. \tpages\tcomma separated list of page numbers and ranges
  226. '''))
  227. sys.exit( 1)
  228. def trace_runpage( use_display_list, doc, number):
  229. page = mupdf.FzPage( doc, number-1)
  230. mediabox = page.fz_bound_page()
  231. print( f'<page number="{number}" mediabox="{mediabox.x0} {mediabox.y0} {mediabox.x1} {mediabox.y1}">')
  232. output = mupdf.FzOutput( mupdf.FzOutput.Fixed_STDOUT)
  233. dev = mupdf.FzDevice( output)
  234. if use_display_list:
  235. list_ = mupdf.FzDisplayList( page)
  236. list_.fz_run_display_list( dev, mupdf.FzMatrix(mupdf.fz_identity), mupdf.FzRect(mupdf.fz_infinite_rect), mupdf.FzCookie())
  237. else:
  238. page.fz_run_page( dev, mupdf.FzMatrix(mupdf.fz_identity), mupdf.FzCookie())
  239. output.fz_close_output()
  240. print( '</page>')
  241. def trace_runrange( use_display_list, doc, count, range_):
  242. start = None
  243. end = None
  244. while 1:
  245. range_, start, end = mupdf.fz_parse_page_range( range_, count)
  246. print(f'range_={range_!r} start={start} end={end}')
  247. if range_ is None:
  248. break
  249. step = +1 if end > start else -1
  250. for i in range( start, end, step):
  251. trace_runpage( use_display_list, doc, i)
  252. def trace( argv):
  253. password = ''
  254. layout_w = mupdf.FZ_DEFAULT_LAYOUT_W
  255. layout_h = mupdf.FZ_DEFAULT_LAYOUT_H
  256. layout_em = mupdf.FZ_DEFAULT_LAYOUT_EM
  257. layout_css = None
  258. layout_use_doc_css = 1
  259. use_display_list = 0
  260. argv_i = 0
  261. while 1:
  262. arg = argv[ argv_i]
  263. if arg == '-p':
  264. password = next( opt)
  265. elif arg == '-W':
  266. argv_i += 1
  267. layout_w = float( argv[argv_i])
  268. elif arg == '-H':
  269. argv_i += 1
  270. layout_h = float( argv[argv_i])
  271. elif arg == '-S':
  272. argv_i += 1
  273. layout_em = float( argv[argv_i])
  274. elif arg == '-U':
  275. argv_i += 1
  276. layout_css = argv[argv_i]
  277. elif arg == '-X':
  278. layout_use_doc_css = 0
  279. elif arg == '-d':
  280. use_display_list = 1
  281. else:
  282. break
  283. argv_i += 1
  284. if argv_i == len( argv):
  285. trace_usage()
  286. if layout_css:
  287. buffer_ = mupdf.FzBuffer( layout_css)
  288. mupdf.fz_set_user_css( buffer_.string_from_buffer())
  289. mupdf.fz_set_use_document_css( layout_use_doc_css)
  290. for argv_i in range( argv_i, len( argv)):
  291. arg = argv[ argv_i]
  292. doc = mupdf.FzDocument( arg)
  293. if doc.fz_needs_password():
  294. doc.fz_authenticate_password( password)
  295. doc.fz_layout_document( layout_w, layout_h, layout_em)
  296. print( f'<document filename="{arg}">')
  297. count = doc.fz_count_pages()
  298. if argv_i + 1 < len( argv) and mupdf.fz_is_page_range( argv[ argv_i+1]):
  299. argv_i += 1
  300. trace_runrange( use_display_list, doc, count, argv[ argv_i])
  301. else:
  302. trace_runrange( use_display_list, doc, count, '1-N')
  303. print( '</document>')
  304. def main( argv):
  305. arg1 = argv[1]
  306. fn = getattr( sys.modules[__name__], arg1, None)
  307. if not fn:
  308. print( f'cannot find {arg1}')
  309. usage()
  310. sys.exit(1)
  311. return fn( argv[2:])
  312. if __name__ == '__main__':
  313. try:
  314. e = main( sys.argv)
  315. sys.exit(e)
  316. except Exception as e:
  317. if 0: # Enable when debugging.
  318. sys.stdout.flush()
  319. sys.stderr.flush()
  320. print(f'Exception: {e}')
  321. sys.stdout.flush()
  322. raise