# This script takes the output from pandoc and converts it into the format needed by
# the website at Zint.org.uk
#
# Warning: This code is ugly... but it saves days of manual effort updating the website.
#
# Copyright (C) 2022 <rstuart114@gmail.com>
# Works out which tags should influence indentation and puts them on their own line
def isolate_tag(tag):
    """Append *tag* to the global ``stage`` buffer, isolating indentable tags.

    A tag counts as indentable unless it contains any keyword from the global
    ``indent_skip`` list.  Indentable tags are written with a newline before
    and after them; every other tag is appended unchanged.  Opening and
    closing tags are treated exactly the same way.

    Args:
        tag: Complete tag text, including the angle brackets.
    """
    global stage

    # Substring match, not a tag-name match: a keyword anywhere in the tag
    # (for example inside an attribute value) keeps the tag inline.
    if any(keyword in tag for keyword in indent_skip):
        stage += tag
    else:
        stage += "\n" + tag + "\n"
# Add the right amount of indentation (indentation X 4 spaces)
def add_indent():
    """Return the leading whitespace for the current nesting depth.

    Reads the global ``indentation`` level and returns four spaces per level.
    A level of zero or below yields an empty string.
    """
    # str * n is empty for n <= 0, the same as looping over an empty range
    return "    " * indentation
# Apply indentation to text
def with_indent(text):
    """Return *text* with the current indent inserted after its line breaks.

    Every newline is followed by the string returned by ``add_indent()``,
    except a newline that is the very last character of *text*, which is
    left as it is.

    Args:
        text: The text to indent; may be empty.

    Returns:
        The indented text.  An empty *text* gives an empty string.
    """
    # Work out the indent once rather than once per newline
    indent = add_indent()

    # The final character is kept apart so a trailing newline gets no indent
    return text[:-1].replace("\n", "\n" + indent) + text[-1:]
# Read file and pull some tags onto their own lines for later processing

# A tag containing any of these substrings is left inline by isolate_tag()
indent_skip = ['img', 'code', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', '<a', '</a', 'sup', '<col', '</col', '<hr', 'div']

# Scanner state: the document, the output being built, the text and tag
# currently being collected, and whether the scanner is inside a tag
manual = ""
stage = ""
text_buffer = ""
tag_buffer = ""
tag = False

print("Reading... manual.html")
with open('manual.html') as f:
    manual = f.read()

for ch in manual:
    if ch == '<':
        # A tag starts here: flush the text gathered since the previous tag
        stage += text_buffer
        tag_buffer = ""
        tag = True

    if tag:
        tag_buffer += ch
    else:
        text_buffer += ch

    if ch == '>':
        # Tag finished: flatten any line breaks inside it, then let
        # isolate_tag() append it to stage
        tag_buffer = tag_buffer.replace("\n", " ")
        isolate_tag(tag_buffer)
        text_buffer = ""
        tag = False

# Text after the final '>' is still in text_buffer and is not copied to stage here
manual, stage = stage, ""
print("Adjusting HTML")
# Change the guts of the HTML tags
in_dd = False          # True between <dd> and </dd>
to_remove = False      # True when the tag being examined must not be output
remove_next = False    # True when the tag after this one must be dropped too
span_literal = False   # True while a <code> turned into <span> awaits its </code>

for ch in manual:
    if ch == '<':
        # Remove "{#tbl:" table identifiers: keep only the slice that starts
        # 7 characters after 'tag=' and stops 3 characters before the end
        # (presumably the caption), on a line of its own
        if '{#tbl:' in text_buffer:
            caption = text_buffer[text_buffer.index('tag=') + 7:-3]
            text_buffer = '\n' + caption.replace('\n', ' ') + '\n'

        # Remove "{@tbl:" table references
        if 'tbl:' in text_buffer:
            text_buffer = ''

        stage += text_buffer
        tag_buffer = ""
        tag = True
        to_remove = False

    if tag:
        tag_buffer += ch
    else:
        text_buffer += ch

    if ch != '>':
        continue

    # Remove some tags which aren't needed on website
    if any(marker in tag_buffer for marker in ('span', 'div', '<col', '</col')):
        to_remove = True

    # A removed anchor takes the tag after it along (presumably its </a>)
    if remove_next:
        to_remove = True
        remove_next = False

    hidden_anchor = ('a href' in tag_buffer) and ('aria-hidden="true"' in tag_buffer)
    if hidden_anchor or '<a href="#' in tag_buffer:
        to_remove = True
        remove_next = True

    # Don't allow <p> and </p> between <dd> and </dd>
    if tag_buffer == "<dd>":
        in_dd = True
    elif tag_buffer == "</dd>":
        in_dd = False
    elif in_dd and tag_buffer in ('<p>', '</p>'):
        to_remove = True

    # Remove attributes for some tags (checked in this order)
    for prefix, bare in (('<pre', '<pre>'), ('<table', '<table>'), ('<tr', '<tr>'),
                         ('<td', '<td>'), ('<th ', '<th>')):
        if prefix in tag_buffer:
            tag_buffer = bare

    # Bump all headers up one level, highest level first so that a header
    # is never bumped twice
    for level in range(6, 0, -1):
        tag_buffer = tag_buffer.replace('<h' + str(level), '<h' + str(level + 1))
        tag_buffer = tag_buffer.replace('</h' + str(level), '</h' + str(level + 1))

    # Change class names for code snippets, then the location of images
    tag_buffer = (tag_buffer
                  .replace('class="sourceCode bash"', 'class="language-bash"')
                  .replace('class="sourceCode c"', 'class="language-cpp"')
                  .replace('src="images/', 'src="/images/manual/'))

    # Change <code> without language to <span>
    if tag_buffer == '<code>':
        tag_buffer = '<span class="literal">'
        span_literal = True
    elif tag_buffer == '</code>' and span_literal:
        tag_buffer = '</span>'
        span_literal = False

    if not to_remove:
        stage += tag_buffer
    text_buffer = ""
    tag = False

manual, stage = stage, ""
print("Removing empty lines")
# Remove blank lines unless in between <pre> and </pre>
last_char = ''
in_pre = False
for ch in manual:
    if ch == '<':
        tag_buffer = ""
        tag = True

    # text_buffer is kept up to date here but this pass never writes it out
    if tag:
        tag_buffer += ch
    else:
        text_buffer += ch

    if ch == '>':
        # in_pre follows the most recent <pre> or </pre> tag
        if "<pre" in tag_buffer:
            in_pre = True
        if "</pre" in tag_buffer:
            in_pre = False
        text_buffer = ""
        tag = False

    # Only a newline that directly follows another newline outside <pre> is dropped
    if ch != '\n' or last_char != '\n' or in_pre:
        stage += ch
    last_char = ch

manual, stage = stage, ""
print("Applying indentation")
# Indent the code to make it easier to read.
# This pass also wraps the content in <section> elements and inserts <page>
# markers, which the final pass uses to split the output into files.
indentation = 1           # current nesting depth; top-level content sits at depth 1
in_pre = False            # True between <pre> and </pre>, where text is copied verbatim
paragraph_block = False   # True while the open section is a plain "container" section
document_start = True     # True until the first top-level <p> or <h2> has been seen
chapter_six = False       # True between the two chapter marker ids tested below
last_char = ''
for c in manual:
    if c == '<':
        #Fix 'floating' full stops
        text_buffer = text_buffer.replace(' . ', '. ')

        # Apply indentation to text
        if in_pre:
            stage += text_buffer
        else:
            stage += with_indent(text_buffer)
        # text_buffer is not cleared until the tag ends, so the tag handling
        # below can still see whether the preceding text ended with a newline
        tag = True
        tag_buffer = ""

    if (tag):
        tag_buffer += c
    else:
        # Strip '{}' from already removed table references
        if c == '}' and last_char == '{':
            # Drop the '{' that was appended on the previous character
            text_buffer = text_buffer[:-1]
        else:
            text_buffer += c
    # NOTE(review): assumed to run for every character (tag or text) - confirm nesting level
    last_char = c

    if c == '>':
        # Same substring rule as isolate_tag(): any skip keyword keeps the tag inline
        indentable_tag = True
        for keyword in indent_skip:
            if keyword in tag_buffer:
                indentable_tag = False

        # Protect the indentation in <pre> segments
        if ('<pre' in tag_buffer):
            in_pre = True
        if ('</pre' in tag_buffer):
            in_pre = False

        # Chapter 6 requires special treatment - detect beginning and end
        if ('id="types-of-symbology"' in tag_buffer):
            chapter_six = True
        if ('id="legal-and-version-information"' in tag_buffer):
            chapter_six = False

        if '</' in tag_buffer:
            # Close tag
            if (indentable_tag):
                # Step back out before writing so the close tag lines up with its open tag
                indentation -= 1
                stage += add_indent()
                stage += tag_buffer
            else:
                # Inline tag: only indent it when it starts a new line
                if text_buffer.endswith('\n'):
                    stage += add_indent()
                stage += tag_buffer
        else:
            # Split into sections
            # (substring test, so '<p' also matches other tags starting with <p, e.g. <pre)
            if (indentation == 1) and ('<p' in tag_buffer):
                if not paragraph_block:
                    if document_start:
                        # Nothing is open yet, so there is no section to close
                        document_start = False
                    else:
                        stage += '</section>\n'
                    stage += '<section class="container">\n'
                    paragraph_block = True

            # Handle headers but also decide where to split into multiple HTML files and mark with <page>
            if (indentation == 1):
                if ('<h2' in tag_buffer):
                    if document_start:
                        # First heading of the document: open a section, no page break
                        document_start = False
                        stage += '<section class="container">\n'
                        paragraph_block = True
                    else:
                        # Every later <h2> starts a new page
                        stage += '</section>\n'
                        stage += '<page>\n'
                        stage += '<section class="container">\n'
                        paragraph_block = True
                elif ('<h3' in tag_buffer) and chapter_six:
                    # Inside chapter six each <h3> also starts a new page
                    stage += '</section>\n'
                    stage += '<page>\n'
                    stage += '<section class="container">\n'
                    paragraph_block = True
                elif ('<h' in tag_buffer):
                    # Any other tag containing '<h' only needs a container section to be open
                    if not paragraph_block:
                        stage += '</section>\n'
                        stage += '<section class="container">\n'
                        paragraph_block = True

            # <dl> section has its own class
            if (indentation == 1) and ('<dl' in tag_buffer):
                stage += '</section>\n'
                stage += '<section class="definition-list container">\n'
                paragraph_block = False

            # <table> section has its own class
            if (indentation == 1) and ('<table' in tag_buffer):
                stage += '</section>\n'
                stage += '<section class="table">\n'
                paragraph_block = False

            # Open tag
            if (indentable_tag):
                # Write at the current depth, then nest whatever follows one level deeper
                stage += add_indent()
                stage += tag_buffer
                indentation += 1
            else:
                # Inline tag: only indent it when it starts a new line
                if text_buffer.endswith('\n'):
                    stage += add_indent()
                stage += tag_buffer
        tag = False
        text_buffer = ""
# Close the section that is still open at the end of the document
stage += '\n</section>\n'
manual = stage
stage = ""
# Remove <h2> data and split into output files
def _write_page(filename, content):
    """Write *content* to *filename*, replacing any existing file.

    The file is opened with a context manager so the handle is released even
    if the write fails.  No encoding is given, so the platform default is
    used, the same as when manual.html is read.
    """
    with open(filename, "w") as out:
        out.write(content)


# One output file per page, in document order
out_filenames = ['chapter1.html', 'chapter2.html', 'chapter3.html', 'chapter4.html', 'chapter5.html',
                 'chapter6.0.html', 'chapter6.1.html', 'chapter6.2.html', 'chapter6.3.html', 'chapter6.4.html',
                 'chapter6.5.html', 'chapter6.6.html', 'chapter6.7.html', 'chapter7.html', 'appendixa.html', 'appendixb.html']
page = 0
print("Writing... ", out_filenames[page])
h2_tag = False   # True between an <h2 ...> tag and its </h2>
for c in manual:
    if c == '<':
        # Text collected while inside an <h2> heading is discarded
        if not h2_tag:
            stage += text_buffer
        tag = True
        tag_buffer = ""

    if tag:
        tag_buffer += c
    else:
        text_buffer += c

    if c == '>':
        if '<h2' in tag_buffer:
            # The <h2> tag itself is not written out
            h2_tag = True
        elif '</h2' in tag_buffer:
            h2_tag = False
        elif tag_buffer == '<page>':
            # Page marker: write what has been collected and start the next file.
            # An IndexError here means there are more pages than file names.
            _write_page(out_filenames[page], stage)
            stage = ""
            page += 1
            print("Writing... ", out_filenames[page])
        else:
            stage += tag_buffer
        tag = False
        text_buffer = ""

# Whatever follows the last page marker belongs to the final file
_write_page(out_filenames[page], stage)
|