# soup_adapter.py
# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Adapter between Gumbo and BeautifulSoup.

This parses an HTML document and gives back a BeautifulSoup object, which you
can then manipulate like a normal BeautifulSoup parse tree.
"""
  19. __author__ = 'jdtang@google.com (Jonathan Tang)'
import BeautifulSoup
import gumboc
  22. def _utf8(text):
  23. return text.decode('utf-8', 'replace')
  24. def _add_source_info(obj, original_text, start_pos, end_pos):
  25. obj.original = str(original_text)
  26. obj.line = start_pos.line
  27. obj.col = start_pos.column
  28. obj.offset = start_pos.offset
  29. if end_pos:
  30. obj.end_line = end_pos.line
  31. obj.end_col = end_pos.column
  32. obj.end_offset = end_pos.offset
  33. def _convert_attrs(attrs):
  34. # TODO(jdtang): Ideally attributes would pass along their positions as well,
  35. # but I can't extend the built in str objects with new attributes. Maybe work
  36. # around this with a subclass in some way...
  37. return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]
  38. def _add_document(soup, element):
  39. # Currently ignored, since there's no real place for this in the BeautifulSoup
  40. # API.
  41. pass
  42. def _add_element(soup, element):
  43. # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
  44. # BeautifulSoup.
  45. tag = BeautifulSoup.Tag(
  46. soup, _utf8(element.tag_name), _convert_attrs(element.attributes))
  47. for child in element.children:
  48. tag.append(_add_node(soup, child))
  49. _add_source_info(
  50. tag, element.original_tag, element.start_pos, element.end_pos)
  51. tag.original_end_tag = str(element.original_end_tag)
  52. return tag
  53. def _add_text(cls):
  54. def add_text_internal(soup, element):
  55. text = cls(_utf8(element.text))
  56. _add_source_info(text, element.original_text, element.start_pos, None)
  57. return text
  58. return add_text_internal
  59. _HANDLERS = [
  60. _add_document,
  61. _add_element,
  62. _add_text(BeautifulSoup.NavigableString),
  63. _add_text(BeautifulSoup.CData),
  64. _add_text(BeautifulSoup.Comment),
  65. _add_text(BeautifulSoup.NavigableString),
  66. _add_element,
  67. ]
  68. def _add_node(soup, node):
  69. return _HANDLERS[node.type.value](soup, node.contents)
  70. def _add_next_prev_pointers(soup):
  71. def _traverse(node):
  72. # .findAll requires the .next pointer, which is what we're trying to add
  73. # when we call this, and so we manually supply a generator to yield the
  74. # nodes in DOM order.
  75. yield node
  76. try:
  77. for child in node.contents:
  78. for descendant in _traverse(child):
  79. yield descendant
  80. except AttributeError:
  81. # Not an element.
  82. return
  83. nodes = sorted(_traverse(soup), key=lambda node: node.offset)
  84. if nodes:
  85. nodes[0].previous = None
  86. nodes[-1].next = None
  87. for i, node in enumerate(nodes[1:-1], 1):
  88. nodes[i-1].next = node
  89. node.previous = nodes[i-1]
  90. def parse(text, **kwargs):
  91. with gumboc.parse(text, **kwargs) as output:
  92. soup = BeautifulSoup.BeautifulSoup()
  93. soup.append(_add_node(soup, output.contents.root.contents))
  94. _add_next_prev_pointers(soup)
  95. return soup