gumboc.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. # Copyright 2012 Google Inc. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. """CTypes bindings for the Gumbo HTML5 parser.
  16. This exports the raw interface of the library as a set of very thin ctypes
  17. wrappers. It's intended to be wrapped by other libraries to provide a more
  18. Pythonic API.
  19. """
  20. __author__ = 'jdtang@google.com (Jonathan Tang)'
  21. import sys
  22. import contextlib
  23. import ctypes
  24. import os.path
  25. import gumboc_tags
  26. _name_of_lib = 'libgumbo.so'
  27. if sys.platform.startswith('darwin'):
  28. _name_of_lib = 'libgumbo.dylib'
  29. elif sys.platform.startswith('win'):
  30. _name_of_lib = "gumbo.dll"
  31. try:
  32. # First look for a freshly-built .so in the .libs directory, for development.
  33. _dll = ctypes.cdll.LoadLibrary(os.path.join(
  34. os.path.dirname(__file__), '..', '..', '.libs', _name_of_lib))
  35. except OSError:
  36. # PyPI or setuptools install, look in the current directory.
  37. _dll = ctypes.cdll.LoadLibrary(os.path.join(
  38. os.path.dirname(__file__), _name_of_lib))
  39. except OSError:
  40. # System library, on unix or mac osx
  41. _dll = ctypes.cdll.LoadLibrary(_name_of_lib)
  42. # Some aliases for common types.
  43. _bitvector = ctypes.c_uint
  44. _Ptr = ctypes.POINTER
  45. class EnumMetaclass(type(ctypes.c_uint)):
  46. def __new__(metaclass, name, bases, cls_dict):
  47. cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
  48. if name == 'Enum':
  49. return cls
  50. try:
  51. for i, value in enumerate(cls_dict['_values_']):
  52. setattr(cls, value, cls.from_param(i))
  53. except KeyError:
  54. raise ValueError('No _values_ list found inside enum type.')
  55. except TypeError:
  56. raise ValueError('_values_ must be a list of names of enum constants.')
  57. return cls
  58. def with_metaclass(mcls):
  59. def decorator(cls):
  60. body = vars(cls).copy()
  61. # clean out class body
  62. body.pop('__dict__', None)
  63. body.pop('__weakref__', None)
  64. return mcls(cls.__name__, cls.__bases__, body)
  65. return decorator
  66. @with_metaclass(EnumMetaclass)
  67. class Enum(ctypes.c_uint):
  68. @classmethod
  69. def from_param(cls, param):
  70. if isinstance(param, Enum):
  71. if param.__class__ != cls:
  72. raise ValueError("Can't mix enums of different types")
  73. return param
  74. if param < 0 or param > len(cls._values_):
  75. raise ValueError('%d is out of range for enum type %s; max %d.' %
  76. (param, cls.__name__, len(cls._values_)))
  77. return cls(param)
  78. def __eq__(self, other):
  79. return self.value == other.value
  80. def __ne__(self, other):
  81. return self.value != other.value
  82. def __hash__(self):
  83. return hash(self.value)
  84. def __repr__(self):
  85. try:
  86. return self._values_[self.value]
  87. except IndexError:
  88. raise IndexError('Value %d is out of range for %r' %
  89. (self.value, self._values_))
  90. class StringPiece(ctypes.Structure):
  91. _fields_ = [
  92. ('data', _Ptr(ctypes.c_char)),
  93. ('length', ctypes.c_size_t),
  94. ]
  95. def __len__(self):
  96. return self.length
  97. def __str__(self):
  98. return ctypes.string_at(self.data, self.length)
  99. class SourcePosition(ctypes.Structure):
  100. _fields_ = [
  101. ('line', ctypes.c_uint),
  102. ('column', ctypes.c_uint),
  103. ('offset', ctypes.c_uint)
  104. ]
  105. SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition')
  106. class AttributeNamespace(Enum):
  107. URLS = [
  108. 'http://www.w3.org/1999/xhtml',
  109. 'http://www.w3.org/1999/xlink',
  110. 'http://www.w3.org/XML/1998/namespace',
  111. 'http://www.w3.org/2000/xmlns',
  112. ]
  113. _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS']
  114. def to_url(self):
  115. return self.URLS[self.value]
  116. class Attribute(ctypes.Structure):
  117. _fields_ = [
  118. ('namespace', AttributeNamespace),
  119. ('name', ctypes.c_char_p),
  120. ('original_name', StringPiece),
  121. ('value', ctypes.c_char_p),
  122. ('original_value', StringPiece),
  123. ('name_start', SourcePosition),
  124. ('name_end', SourcePosition),
  125. ('value_start', SourcePosition),
  126. ('value_end', SourcePosition)
  127. ]
  128. class Vector(ctypes.Structure):
  129. _type_ = ctypes.c_void_p
  130. _fields_ = [
  131. ('data', _Ptr(ctypes.c_void_p)),
  132. ('length', ctypes.c_uint),
  133. ('capacity', ctypes.c_uint)
  134. ]
  135. class Iter(object):
  136. def __init__(self, vector):
  137. self.current = 0
  138. self.vector = vector
  139. def __iter__(self):
  140. return self
  141. def __next__(self):
  142. # Python 3
  143. if self.current >= self.vector.length:
  144. raise StopIteration
  145. obj = self.vector[self.current]
  146. self.current += 1
  147. return obj
  148. def next(self):
  149. # Python 2
  150. return self.__next__()
  151. def __len__(self):
  152. return self.length
  153. def __getitem__(self, i):
  154. try:
  155. # Python 2
  156. numeric_types = (int, long)
  157. except NameError:
  158. # Python 3
  159. numeric_types = int
  160. if isinstance(i, numeric_types):
  161. if i < 0:
  162. i += self.length
  163. if i > self.length:
  164. raise IndexError
  165. array_type = _Ptr(_Ptr(self._type_))
  166. return ctypes.cast(self.data, array_type)[i].contents
  167. return list(self)[i]
  168. def __iter__(self):
  169. return Vector.Iter(self)
  170. Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector')
  171. class AttributeVector(Vector):
  172. _type_ = Attribute
  173. class NodeVector(Vector):
  174. # _type_ assigned later, to avoid circular references with Node
  175. pass
  176. class QuirksMode(Enum):
  177. _values_ = ['NO_QUIRKS', 'QUIRKS', 'LIMITED_QUIRKS']
  178. class Document(ctypes.Structure):
  179. _fields_ = [
  180. ('children', NodeVector),
  181. ('has_doctype', ctypes.c_bool),
  182. ('name', ctypes.c_char_p),
  183. ('public_identifier', ctypes.c_char_p),
  184. ('system_identifier', ctypes.c_char_p),
  185. ('doc_type_quirks_mode', QuirksMode),
  186. ]
  187. def __repr__(self):
  188. return 'Document'
  189. class Namespace(Enum):
  190. URLS = [
  191. 'http://www.w3.org/1999/xhtml',
  192. 'http://www.w3.org/2000/svg',
  193. 'http://www.w3.org/1998/Math/MathML',
  194. ]
  195. _values_ = ['HTML', 'SVG', 'MATHML']
  196. def to_url(self):
  197. return self.URLS[self.value]
  198. class Tag(Enum):
  199. @staticmethod
  200. def from_str(tagname):
  201. text_ptr = ctypes.c_char_p(tagname.encode('utf-8'))
  202. return _tag_enum(text_ptr)
  203. _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']
  204. class Element(ctypes.Structure):
  205. _fields_ = [
  206. ('children', NodeVector),
  207. ('tag', Tag),
  208. ('tag_namespace', Namespace),
  209. ('original_tag', StringPiece),
  210. ('original_end_tag', StringPiece),
  211. ('start_pos', SourcePosition),
  212. ('end_pos', SourcePosition),
  213. ('attributes', AttributeVector),
  214. ]
  215. @property
  216. def tag_name(self):
  217. original_tag = StringPiece.from_buffer_copy(self.original_tag)
  218. _tag_from_original_text(ctypes.byref(original_tag))
  219. if self.tag_namespace == Namespace.SVG:
  220. svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
  221. if svg_tagname is not None:
  222. return str(svg_tagname)
  223. if self.tag == Tag.UNKNOWN:
  224. if original_tag.data is None:
  225. return ''
  226. return str(original_tag).lower()
  227. return _tagname(self.tag)
  228. def __repr__(self):
  229. return ('<%r>\n' % self.tag +
  230. '\n'.join(repr(child) for child in self.children) +
  231. '</%r>' % self.tag)
  232. class Text(ctypes.Structure):
  233. _fields_ = [
  234. ('text', ctypes.c_char_p),
  235. ('original_text', StringPiece),
  236. ('start_pos', SourcePosition)
  237. ]
  238. def __repr__(self):
  239. return 'Text(%r)' % self.text
  240. class NodeType(Enum):
  241. _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA',
  242. 'COMMENT', 'WHITESPACE', 'TEMPLATE']
  243. class NodeUnion(ctypes.Union):
  244. _fields_ = [
  245. ('document', Document),
  246. ('element', Element),
  247. ('text', Text),
  248. ]
  249. class Node(ctypes.Structure):
  250. # _fields_ set later to avoid a circular reference
  251. def _contents(self):
  252. # Python3 enters an infinite loop if you use an @property within
  253. # __getattr__, so we factor it out to a helper.
  254. if self.type == NodeType.DOCUMENT:
  255. return self.v.document
  256. elif self.type in (NodeType.ELEMENT, NodeType.TEMPLATE):
  257. return self.v.element
  258. else:
  259. return self.v.text
  260. @property
  261. def contents(self):
  262. return self._contents()
  263. def __getattr__(self, name):
  264. return getattr(self._contents(), name)
  265. def __setattr__(self, name, value):
  266. return setattr(self._contents(), name, value)
  267. def __repr__(self):
  268. return repr(self.contents)
  269. Node._fields_ = [
  270. ('type', NodeType),
  271. # Set the type to Node later to avoid a circular dependency.
  272. ('parent', _Ptr(Node)),
  273. ('index_within_parent', ctypes.c_size_t),
  274. # TODO(jdtang): Make a real list of enum constants for this.
  275. ('parse_flags', _bitvector),
  276. ('v', NodeUnion)
  277. ]
  278. NodeVector._type_ = Node
  279. class Options(ctypes.Structure):
  280. _fields_ = [
  281. # TODO(jdtang): Allow the Python API to set the allocator/deallocator
  282. # function. Right now these are treated as opaque void pointers.
  283. ('allocator', ctypes.c_void_p),
  284. ('deallocator', ctypes.c_void_p),
  285. ('userdata', ctypes.c_void_p),
  286. ('tab_stop', ctypes.c_int),
  287. ('stop_on_first_error', ctypes.c_bool),
  288. ('max_errors', ctypes.c_int),
  289. ('fragment_context', Tag),
  290. ('fragment_namespace', Namespace),
  291. ]
  292. class Output(ctypes.Structure):
  293. _fields_ = [
  294. ('document', _Ptr(Node)),
  295. ('root', _Ptr(Node)),
  296. # TODO(jdtang): Error type.
  297. ('errors', Vector),
  298. ]
  299. @contextlib.contextmanager
  300. def parse(text, **kwargs):
  301. options = Options()
  302. for field_name, _ in Options._fields_:
  303. try:
  304. setattr(options, field_name, kwargs[field_name])
  305. except KeyError:
  306. setattr(options, field_name, getattr(_DEFAULT_OPTIONS, field_name))
  307. # We have to manually take a reference to the input text here so that it
  308. # outlives the parse output. If we let ctypes do it automatically on function
  309. # call, it creates a temporary buffer which is destroyed when the call
  310. # completes, and then the original_text pointers point into invalid memory.
  311. text_ptr = ctypes.c_char_p(text.encode('utf-8'))
  312. output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
  313. try:
  314. yield output
  315. finally:
  316. _destroy_output(ctypes.byref(options), output)
  317. _DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')
  318. _parse_with_options = _dll.gumbo_parse_with_options
  319. _parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
  320. _parse_with_options.restype = _Ptr(Output)
  321. _tag_from_original_text = _dll.gumbo_tag_from_original_text
  322. _tag_from_original_text.argtypes = [_Ptr(StringPiece)]
  323. _tag_from_original_text.restype = None
  324. _normalize_svg_tagname = _dll.gumbo_normalize_svg_tagname
  325. _normalize_svg_tagname.argtypes = [_Ptr(StringPiece)]
  326. _normalize_svg_tagname.restype = ctypes.c_char_p
  327. _destroy_output = _dll.gumbo_destroy_output
  328. _destroy_output.argtypes = [_Ptr(Options), _Ptr(Output)]
  329. _destroy_output.restype = None
  330. _tagname = _dll.gumbo_normalized_tagname
  331. _tagname.argtypes = [Tag]
  332. _tagname.restype = ctypes.c_char_p
  333. _tag_enum = _dll.gumbo_tag_enum
  334. _tag_enum.argtypes = [ctypes.c_char_p]
  335. _tag_enum.restype = Tag
  336. __all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
  337. 'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
  338. 'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
  339. 'Options', 'Output', 'parse']