zint_org_uk.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. # This script takes the output from pandoc and converts it into the format needed by
  2. # the website at Zint.org.uk
  3. #
  4. # Warning: This code is ugly... but it saves days of manual effort updating the website.
  5. #
  6. # Copyright (C) 2022 <rstuart114@gmail.com>
  7. # Works out which tags should influence indentation and puts them on their own line
  8. def isolate_tag(tag):
  9. global stage
  10. indentable_tag = True
  11. for keyword in indent_skip:
  12. if keyword in tag:
  13. indentable_tag = False
  14. if '</' in tag:
  15. # Close tag
  16. if (indentable_tag):
  17. stage += "\n"
  18. stage += tag
  19. stage += "\n"
  20. else:
  21. stage += tag
  22. else:
  23. # Open tag
  24. if (indentable_tag):
  25. stage += "\n"
  26. stage += tag
  27. stage += "\n"
  28. else:
  29. stage += tag
  30. # Add the right amount of indentation (indentation X 4 spaces)
  31. def add_indent():
  32. global indentation
  33. retval = ""
  34. for i in range(0,indentation):
  35. retval += " "
  36. return retval
  37. # Apply indentation to text
  38. def with_indent(text):
  39. global indentation
  40. retval = ""
  41. d = ''
  42. for c in text:
  43. if d == '\n':
  44. retval += d
  45. retval += add_indent()
  46. else:
  47. retval += d
  48. d = c
  49. retval += d
  50. return retval
  51. # Read file and pull some tags onto their own lines for later processing
  52. manual = ""
  53. tag = False
  54. tag_buffer = ""
  55. text_buffer = ""
  56. stage = ""
  57. indent_skip = ['img', 'code', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', '<a', '</a', 'sup', '<col', '</col', '<hr', 'div']
  58. print("Reading... manual.html")
  59. with open('manual.html') as f:
  60. manual = f.read()
  61. for c in manual:
  62. if c == '<':
  63. stage += text_buffer
  64. tag = True
  65. tag_buffer = ""
  66. if (tag):
  67. tag_buffer += c
  68. else:
  69. text_buffer += c
  70. if c == '>':
  71. tag_buffer = tag_buffer.replace("\n", " ")
  72. isolate_tag(tag_buffer)
  73. tag = False
  74. text_buffer = ""
  75. f.close()
  76. manual = stage
  77. stage = ""
  78. print("Adjusting HTML")
  79. # Change the guts of the HTML tags
  80. in_dd = False
  81. to_remove = False
  82. remove_next = False
  83. span_literal = False
  84. for c in manual:
  85. if c == '<':
  86. # Remove "{#tbl:" table identifiers
  87. if '{#tbl:' in text_buffer:
  88. text_buffer = text_buffer[text_buffer.index('tag=') + 7:-3]
  89. text_buffer = text_buffer.replace('\n', ' ')
  90. text_buffer = '\n' + text_buffer + '\n'
  91. # Remove "{@tabl:" table references
  92. if 'tbl:' in text_buffer:
  93. text_buffer = ''
  94. stage += text_buffer
  95. tag = True
  96. tag_buffer = ""
  97. to_remove = False
  98. if (tag):
  99. tag_buffer += c
  100. else:
  101. text_buffer += c
  102. if c == '>':
  103. # Remove some tags which aren't needed on website
  104. if 'span' in tag_buffer:
  105. to_remove = True
  106. if 'div' in tag_buffer:
  107. to_remove = True
  108. if '<col' in tag_buffer:
  109. to_remove = True
  110. if '</col' in tag_buffer:
  111. to_remove = True
  112. if (remove_next):
  113. to_remove = True
  114. remove_next = False
  115. if ('a href' in tag_buffer) and ('aria-hidden="true"' in tag_buffer):
  116. to_remove = True
  117. remove_next = True
  118. if '<a href="#' in tag_buffer:
  119. to_remove = True
  120. remove_next = True
  121. # Don't allow <p> and </p> between <dd> and </dd>
  122. if (tag_buffer == "<dd>"):
  123. in_dd = True
  124. if (tag_buffer == "</dd>"):
  125. in_dd = False
  126. if (in_dd and tag_buffer == '<p>'):
  127. to_remove = True
  128. if (in_dd and tag_buffer == '</p>'):
  129. to_remove = True
  130. # Remove attributes for some tags
  131. if '<pre' in tag_buffer:
  132. tag_buffer = '<pre>'
  133. if '<table' in tag_buffer:
  134. tag_buffer = '<table>'
  135. if '<tr' in tag_buffer:
  136. tag_buffer = '<tr>'
  137. if '<td' in tag_buffer:
  138. tag_buffer = '<td>'
  139. if '<th ' in tag_buffer:
  140. tag_buffer = '<th>'
  141. # Bump all headers up one level
  142. tag_buffer = tag_buffer.replace('<h6', '<h7')
  143. tag_buffer = tag_buffer.replace('</h6', '</h7')
  144. tag_buffer = tag_buffer.replace('<h5', '<h6')
  145. tag_buffer = tag_buffer.replace('</h5', '</h6')
  146. tag_buffer = tag_buffer.replace('<h4', '<h5')
  147. tag_buffer = tag_buffer.replace('</h4', '</h5')
  148. tag_buffer = tag_buffer.replace('<h3', '<h4')
  149. tag_buffer = tag_buffer.replace('</h3', '</h4')
  150. tag_buffer = tag_buffer.replace('<h2', '<h3')
  151. tag_buffer = tag_buffer.replace('</h2', '</h3')
  152. tag_buffer = tag_buffer.replace('<h1', '<h2')
  153. tag_buffer = tag_buffer.replace('</h1', '</h2')
  154. # Change class names for code snippets
  155. tag_buffer = tag_buffer.replace('class="sourceCode bash"', 'class="language-bash"')
  156. tag_buffer = tag_buffer.replace('class="sourceCode c"', 'class="language-cpp"')
  157. # Change location of images
  158. tag_buffer = tag_buffer.replace('src="images/', 'src="/images/manual/')
  159. # Change <code> without language to <span>
  160. if tag_buffer == '<code>':
  161. tag_buffer = '<span class="literal">'
  162. span_literal = True
  163. if tag_buffer == '</code>' and span_literal:
  164. tag_buffer = '</span>'
  165. span_literal = False
  166. if not to_remove:
  167. stage += tag_buffer
  168. tag = False
  169. text_buffer = ""
  170. manual = stage
  171. stage = ""
  172. print("Removing empty lines")
  173. # Remove blank lines unless in between <pre> and </pre>
  174. last_char = ''
  175. in_pre = False
  176. for c in manual:
  177. if c == '<':
  178. tag = True
  179. tag_buffer = ""
  180. if (tag):
  181. tag_buffer += c
  182. else:
  183. text_buffer += c
  184. if c == '>':
  185. if ("<pre" in tag_buffer):
  186. in_pre = True
  187. if ("</pre" in tag_buffer):
  188. in_pre = False
  189. tag = False
  190. text_buffer = ""
  191. if c == '\n':
  192. if (last_char != '\n') or (in_pre == True):
  193. stage += c
  194. else:
  195. stage += c
  196. last_char = c
  197. manual = stage
  198. stage = ""
print("Applying indentation")
# Fourth pass: indent the code to make it easier to read, wrap top-level
# content in <section> elements and insert <page> markers where the output
# is later split into multiple HTML files
indentation = 1             # current nesting level, read by add_indent()
in_pre = False              # True between <pre> and </pre>
paragraph_block = False     # True while a plain "container" section is open
document_start = True       # True until the first section has been opened
chapter_six = False         # True between the two chapter marker ids below
last_char = ''              # previous character, used to spot '{}' pairs

for c in manual:
    if c == '<':
        # Fix 'floating' full stops
        text_buffer = text_buffer.replace(' . ', '. ')

        # Apply indentation to text, but leave <pre> content untouched
        if in_pre:
            stage += text_buffer
        else:
            stage += with_indent(text_buffer)
        tag = True
        tag_buffer = ""

    if (tag):
        tag_buffer += c
    else:
        # Strip '{}' from already removed table references: on '}' directly
        # after '{', drop the '{' already buffered and skip the '}'
        if c == '}' and last_char == '{':
            text_buffer = text_buffer[:-1]
        else:
            text_buffer += c
    # NOTE(review): this listing lost its indentation; the statement below is
    # placed at loop level by analogy with the previous pass - confirm
    last_char = c

    if c == '>':
        # A tag containing any indent_skip keyword does not change the
        # nesting level
        indentable_tag = True
        for keyword in indent_skip:
            if keyword in tag_buffer:
                indentable_tag = False

        # Protect the indentation in <pre> segments
        if ('<pre' in tag_buffer):
            in_pre = True
        if ('</pre' in tag_buffer):
            in_pre = False

        # Chapter 6 requires special treatment - detect beginning and end
        if ('id="types-of-symbology"' in tag_buffer):
            chapter_six = True
        if ('id="legal-and-version-information"' in tag_buffer):
            chapter_six = False

        if '</' in tag_buffer:
            # Close tag: an indentable one steps the level back first so it
            # lines up with its opening tag
            if (indentable_tag):
                indentation -= 1
                stage += add_indent()
                stage += tag_buffer
            else:
                # Inline close tag: indent only if it starts a new line
                if text_buffer.endswith('\n'):
                    stage += add_indent()
                stage += tag_buffer
        else:
            # Split into sections: a top-level <p...> opens a container
            # section unless one is already open
            if (indentation == 1) and ('<p' in tag_buffer):
                if not paragraph_block:
                    if document_start:
                        document_start = False
                    else:
                        stage += '</section>\n'
                    stage += '<section class="container">\n'
                    paragraph_block = True

            # Handle headers but also decide where to split into multiple HTML files and mark with <page>
            if (indentation == 1):
                if ('<h2' in tag_buffer):
                    if document_start:
                        # Very first section: nothing to close, no page break
                        document_start = False
                        stage += '<section class="container">\n'
                        paragraph_block = True
                    else:
                        stage += '</section>\n'
                        stage += '<page>\n'
                        stage += '<section class="container">\n'
                        paragraph_block = True
                elif ('<h3' in tag_buffer) and chapter_six:
                    # While chapter_six is set, every <h3 also starts a page
                    stage += '</section>\n'
                    stage += '<page>\n'
                    stage += '<section class="container">\n'
                    paragraph_block = True
                elif ('<h' in tag_buffer):
                    # Any other tag containing '<h' reopens a container
                    # section if none is open
                    if not paragraph_block:
                        stage += '</section>\n'
                        stage += '<section class="container">\n'
                        paragraph_block = True

            # <dl> section has its own class
            if (indentation == 1) and ('<dl' in tag_buffer):
                stage += '</section>\n'
                stage += '<section class="definition-list container">\n'
                paragraph_block = False

            # <table> section has its own class
            if (indentation == 1) and ('<table' in tag_buffer):
                stage += '</section>\n'
                stage += '<section class="table">\n'
                paragraph_block = False

            # Open tag: an indentable one is written at the current level
            # and then deepens the level for its content
            if (indentable_tag):
                stage += add_indent()
                stage += tag_buffer
                indentation += 1
            else:
                # Inline open tag: indent only if it starts a new line
                if text_buffer.endswith('\n'):
                    stage += add_indent()
                stage += tag_buffer

        tag = False
        text_buffer = ""

# Close the section that is still open at the end of the document
stage += '\n</section>\n'
manual = stage
stage = ""
  308. # Remove <h2> data and split into output files
  309. out_filenames = ['chapter1.html', 'chapter2.html', 'chapter3.html', 'chapter4.html', 'chapter5.html',
  310. 'chapter6.0.html', 'chapter6.1.html', 'chapter6.2.html', 'chapter6.3.html', 'chapter6.4.html',
  311. 'chapter6.5.html', 'chapter6.6.html', 'chapter6.7.html', 'chapter7.html', 'appendixa.html', 'appendixb.html']
  312. page = 0
  313. print("Writing... ", out_filenames[page])
  314. f = open(out_filenames[page], "w")
  315. h2_tag = False
  316. for c in manual:
  317. if c == '<':
  318. if h2_tag == False:
  319. stage += text_buffer
  320. tag = True
  321. tag_buffer = ""
  322. if (tag):
  323. tag_buffer += c
  324. else:
  325. text_buffer += c
  326. if c == '>':
  327. if '<h2' in tag_buffer:
  328. h2_tag = True
  329. elif '</h2' in tag_buffer:
  330. h2_tag = False
  331. elif tag_buffer == '<page>':
  332. f.write(stage)
  333. f.close()
  334. stage = ""
  335. page += 1
  336. print("Writing... ", out_filenames[page])
  337. f = open(out_filenames[page], "w")
  338. else:
  339. stage += tag_buffer
  340. tag = False
  341. text_buffer = ""
  342. f.write(stage)
  343. f.close()