| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- # Copyright 2012 Google Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- """Adapter between Gumbo and html5lib.
- This exports one method, parse, with the same signature as html5lib.parse. It
- takes the text to parse, and optionally an html5lib TreeBuilder to build the
- tree, and gives back a DOM tree in that format. Example:
- doc = parse(text, treebuilder='lxml')
- """
- __author__ = 'jdtang@google.com (Jonathan Tang)'
- import gumboc
# Namespace URIs, listed in the order of the gumboc.Namespace enum values so
# that an enum value can be used directly as an index into this list. The URIs
# themselves should stay in sync with html5lib.constants.namespaces.
_NAMESPACES = [
    'http://www.w3.org/1999/xhtml',        # index 0
    'http://www.w3.org/2000/svg',          # index 1
    'http://www.w3.org/1998/Math/MathML',  # index 2
]
def _convert_doctype(treebuilder, source_node):
    """Insert a doctype into `treebuilder` if the source document has one.

    Args:
      treebuilder: Object providing insertDoctype(); it is called with a dict
        holding the 'name', 'publicId' and 'systemId' keys.
      source_node: Document node exposing has_doctype, name,
        public_identifier and system_identifier. The three identifier
        fields are decoded as UTF-8.

    Returns:
      None. When source_node.has_doctype is false nothing is inserted.
    """
    # Mimic html5lib behavior: if no doctype token, no doctype node.
    if source_node.has_doctype:
        doctype_token = {
            'name': source_node.name.decode('utf-8'),
            'publicId': source_node.public_identifier.decode('utf-8'),
            'systemId': source_node.system_identifier.decode('utf-8'),
        }
        treebuilder.insertDoctype(doctype_token)
def _convert_attributes(source_node):
    """Build the html5lib attribute dict for an element node.

    Args:
      source_node: Node whose `attributes` are iterated. Each attribute must
        expose `name` and `value` (encoded strings, decoded here as UTF-8)
        and `namespace` (a gumboc.AttributeNamespace value).

    Returns:
      A dict mapping attribute keys to decoded attribute values. The key is
      the plain attribute name when the attribute has no namespace, and a
      (prefix, name, namespace_url) tuple otherwise. The prefix is None for
      the attribute named 'xmlns'.
    """
    def maybe_namespace(attr):
        # Decode once, up front. The raw name is an encoded string, so
        # comparing it directly against the text literal 'xmlns' can never
        # match on Python 3 (bytes != str); compare the decoded name instead.
        name = attr.name.decode('utf-8')
        if attr.namespace != gumboc.AttributeNamespace.NONE:
            prefix = None if name == 'xmlns' else repr(attr.namespace).lower()
            return (prefix, name, attr.namespace.to_url())
        else:
            return name

    return dict((maybe_namespace(attr), attr.value.decode('utf-8'))
                for attr in source_node.attributes)
def _convert_element(source_node):
    """Translate an element or template node into an html5lib token dict.

    Args:
      source_node: Node whose type is gumboc.NodeType.ELEMENT or
        gumboc.NodeType.TEMPLATE.

    Returns:
      A dict with the decoded tag name under 'name', the namespace URI looked
      up in _NAMESPACES under 'namespace', and the converted attributes under
      'data'.

    Raises:
      AssertionError: If the node is of any other type.
    """
    element_types = (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE)
    if source_node.type in element_types:
        return {
            'name': source_node.v.element.tag_name.decode('utf-8'),
            'namespace': _NAMESPACES[source_node.v.element.tag_namespace.value],
            'data': _convert_attributes(source_node),
        }
    # Raised explicitly (rather than via `assert`) so the check still runs
    # when Python is started with -O.
    raise AssertionError(
        '_convert_element only works with elements; found %r' %
        source_node.type)
def _insert_root(treebuilder, source_node, pop_element=True):
    """Insert the root element and, recursively, everything below it.

    Args:
      treebuilder: Tree builder receiving insertRoot() and the child
        insertions performed by _insert_node().
      source_node: Root element node; its `children` are inserted in order.
      pop_element: When true (the default), the last entry of
        treebuilder.openElements is popped once all children are inserted.
    """
    root_token = _convert_element(source_node)
    treebuilder.insertRoot(root_token)
    for child in source_node.children:
        _insert_node(treebuilder, child)
    if not pop_element:
        return
    treebuilder.openElements.pop()
def _insert_node(treebuilder, source_node):
    """Recursively insert a non-document node into `treebuilder`.

    Comments go through insertComment(), text-like nodes (text, whitespace,
    CDATA) through insertText(), and anything else is treated as an element:
    it is inserted with insertElementNormal(), its children are inserted
    recursively, and it is then popped from treebuilder.openElements.

    Args:
      treebuilder: Tree builder receiving the insertions.
      source_node: Node to insert; must not be a document node.
    """
    assert source_node.type != gumboc.NodeType.DOCUMENT
    node_type = source_node.type

    if node_type == gumboc.NodeType.COMMENT:
        comment_token = {'data': source_node.v.text.text.decode('utf-8')}
        treebuilder.insertComment(comment_token)
        return

    text_types = (
        gumboc.NodeType.TEXT,
        gumboc.NodeType.WHITESPACE,
        gumboc.NodeType.CDATA,
    )
    if node_type in text_types:
        treebuilder.insertText(source_node.v.text.text.decode('utf-8'))
        return

    # Everything else is handled as an element; _convert_element rejects
    # node types it does not support.
    treebuilder.insertElementNormal(_convert_element(source_node))
    for child in source_node.v.element.children:
        _insert_node(treebuilder, child)
    treebuilder.openElements.pop()
class HTMLParser(object):
    """Parser front end that parses with Gumbo and fills a tree builder.

    The input is parsed by gumboc.parse() and the resulting nodes are
    replayed into the tree builder supplied at construction time.
    """

    def __init__(self, tree):
        """Store the tree builder that parse results are inserted into.

        Args:
          tree: Tree builder instance; parse() and parseFragment() call its
            insert* methods and return getDocument() / getFragment().
        """
        self.tree = tree

    @staticmethod
    def _read_text(text_or_file):
        """Return the text to parse from a string or a file-like object.

        Objects with a `read` attribute are read; anything else is assumed
        to already be the text. The attribute is looked up explicitly so
        that an AttributeError raised inside read() itself is not mistaken
        for "this is a string".
        """
        read = getattr(text_or_file, 'read', None)
        if read is None:
            # Assume a string.
            return text_or_file
        return read()

    @staticmethod
    def _is_element(node):
        """Return whether `node` is an element or template node."""
        return node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE)

    def parse(self, text_or_file, **kwargs):
        """Parse a whole document and return the tree builder's document.

        Args:
          text_or_file: Text to parse, or an object with a read() method.
          **kwargs: Passed through to gumboc.parse().

        Returns:
          The result of self.tree.getDocument().

        Raises:
          AssertionError: If a child of the document node is neither a
            comment nor an element/template node.
        """
        text = self._read_text(text_or_file)
        with gumboc.parse(text, **kwargs) as output:
            document = output.contents.document.contents
            _convert_doctype(self.tree, document)
            for node in document.children:
                if node.type == gumboc.NodeType.COMMENT:
                    self.tree.insertComment(
                        {'data': node.v.text.text.decode('utf-8')},
                        self.tree.document)
                elif self._is_element(node):
                    _insert_root(self.tree, output.contents.root.contents)
                else:
                    # Explicit raise: asserting a non-empty string literal is
                    # always true, and plain asserts vanish under -O.
                    raise AssertionError(
                        'Only comments and <html> nodes allowed at the root')
            return self.tree.getDocument()

    def parseFragment(self, text_or_file, container, **kwargs):
        """Parse a fragment in the context of `container`.

        Args:
          text_or_file: Text to parse, or an object with a read() method.
          container: Context element, either a bare tag name (the "html"
            namespace is then used) or "<namespace> <tag>" separated by a
            single space.
          **kwargs: Passed through to gumboc.parse().

        Returns:
          The result of self.tree.getFragment().

        Raises:
          AssertionError: If a child of the document node is not an
            element/template node.
        """
        text = self._read_text(text_or_file)
        if ' ' in container:
            container_ns, container = container.split(' ')
        else:
            container_ns = "html"

        with gumboc.parse(
                text,
                fragment_context=gumboc.Tag.from_str(container),
                fragment_namespace=getattr(gumboc.Namespace,
                                           container_ns.upper()),
                **kwargs) as output:
            for node in output.contents.document.contents.children:
                if self._is_element(node):
                    # The root is left on the open-elements stack
                    # (pop_element=False) for fragment parses.
                    _insert_root(
                        self.tree, output.contents.root.contents, False)
                else:
                    # Explicit raise: asserting a non-empty string literal is
                    # always true, and plain asserts vanish under -O.
                    raise AssertionError('Malformed fragment parse (??)')
            return self.tree.getFragment()
|