test_utils.cc 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
  16. #include "test_utils.h"
  17. #include "error.h"
  18. #include "util.h"
  19. int GetChildCount(GumboNode* node) {
  20. if (node->type == GUMBO_NODE_DOCUMENT) {
  21. return node->v.document.children.length;
  22. } else {
  23. return node->v.element.children.length;
  24. }
  25. }
  26. GumboTag GetTag(GumboNode* node) { return node->v.element.tag; }
  27. GumboNode* GetChild(GumboNode* parent, int index) {
  28. if (parent->type == GUMBO_NODE_DOCUMENT) {
  29. return static_cast<GumboNode*>(parent->v.document.children.data[index]);
  30. } else {
  31. return static_cast<GumboNode*>(parent->v.element.children.data[index]);
  32. }
  33. }
  34. int GetAttributeCount(GumboNode* node) {
  35. return node->v.element.attributes.length;
  36. }
  37. GumboAttribute* GetAttribute(GumboNode* node, int index) {
  38. return static_cast<GumboAttribute*>(node->v.element.attributes.data[index]);
  39. }
  40. // Convenience function to do some basic assertions on the structure of the
  41. // document (nodes are elements, nodes have the right tags) and then return
  42. // the body node.
  43. void GetAndAssertBody(GumboNode* root, GumboNode** body) {
  44. GumboNode* html = NULL;
  45. for (int i = 0; i < GetChildCount(root); ++i) {
  46. GumboNode* child = GetChild(root, i);
  47. if (child->type != GUMBO_NODE_ELEMENT) {
  48. ASSERT_EQ(GUMBO_NODE_COMMENT, child->type);
  49. continue;
  50. }
  51. ASSERT_TRUE(html == NULL);
  52. html = child;
  53. }
  54. ASSERT_TRUE(html != NULL);
  55. ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type);
  56. EXPECT_EQ(GUMBO_TAG_HTML, GetTag(html));
  57. // There may be comment/whitespace nodes; this walks through the children of
  58. // <html> and assigns head/body based on them, or assert-fails if there are
  59. // fewer/more than 2 such nodes.
  60. GumboNode* head = NULL;
  61. *body = NULL;
  62. for (int i = 0; i < GetChildCount(html); ++i) {
  63. GumboNode* child = GetChild(html, i);
  64. if (child->type != GUMBO_NODE_ELEMENT) {
  65. continue;
  66. }
  67. if (!head) {
  68. head = child;
  69. EXPECT_EQ(GUMBO_TAG_HEAD, GetTag(head));
  70. } else if (!(*body)) {
  71. *body = child;
  72. EXPECT_EQ(GUMBO_TAG_BODY, GetTag(*body));
  73. } else {
  74. ASSERT_TRUE("More than two elements found inside <html>" != NULL);
  75. }
  76. }
  77. EXPECT_TRUE(head != NULL);
  78. ASSERT_TRUE(*body != NULL);
  79. }
  80. void SanityCheckPointers(
  81. const char* input, size_t input_length, const GumboNode* node, int depth) {
  82. ASSERT_GE(input_length, (size_t) 0);
  83. ASSERT_TRUE(node != NULL);
  84. // There are some truly pathological HTML documents out there - the
  85. // integration tests for this include one where the DOM "tree" is actually a
  86. // linked list 27,500 nodes deep - and so we need a limit on the recursion
  87. // depth here to avoid blowing the stack. Alternatively, we could externalize
  88. // the stack and use an iterative algorithm, but that gets us very little for
  89. // the additional programming complexity.
  90. if (node->type == GUMBO_NODE_DOCUMENT || depth > 400) {
  91. // Don't sanity-check the document as well...we start with the root.
  92. return;
  93. }
  94. if (node->type == GUMBO_NODE_ELEMENT) {
  95. const GumboElement* element = &node->v.element;
  96. // Sanity checks on original* pointers, making sure they fall within the
  97. // original input.
  98. if (element->original_tag.data && element->original_tag.length) {
  99. EXPECT_GE(element->original_tag.data, input);
  100. EXPECT_LT(element->original_tag.data, input + input_length);
  101. EXPECT_LE(element->original_tag.length, input_length);
  102. }
  103. if (element->original_end_tag.data && element->original_tag.length) {
  104. EXPECT_GE(element->original_end_tag.data, input);
  105. EXPECT_LT(element->original_end_tag.data, input + input_length);
  106. EXPECT_LE(element->original_end_tag.length, input_length);
  107. }
  108. EXPECT_GE(element->start_pos.offset, 0);
  109. EXPECT_LE(element->start_pos.offset, input_length);
  110. EXPECT_GE(element->end_pos.offset, 0);
  111. EXPECT_LE(element->end_pos.offset, input_length);
  112. const GumboVector* children = &element->children;
  113. for (int i = 0; i < children->length; ++i) {
  114. const GumboNode* child = static_cast<const GumboNode*>(children->data[i]);
  115. // Checks on parent/child links.
  116. ASSERT_TRUE(child != NULL);
  117. EXPECT_EQ(node, child->parent);
  118. EXPECT_EQ(i, child->index_within_parent);
  119. SanityCheckPointers(input, input_length, child, depth + 1);
  120. }
  121. } else {
  122. const GumboText* text = &node->v.text;
  123. EXPECT_GE(text->original_text.data, input);
  124. EXPECT_LT(text->original_text.data, input + input_length);
  125. EXPECT_LE(text->original_text.length, input_length);
  126. EXPECT_GE(text->start_pos.offset, 0);
  127. EXPECT_LT(text->start_pos.offset, input_length);
  128. }
  129. }
  130. // Custom allocator machinery to sanity check for memory leaks. Normally we can
  131. // use heapcheck/valgrind/ASAN for this, but they only give the
  132. // results when the program terminates. This means that if the parser is run in
  133. // a loop (say, a MapReduce) and there's a leak, it may end up exhausting memory
  134. // before it can catch the particular document responsible for the leak. These
  135. // allocators let us check each document individually for leaks.
  136. static void* LeakDetectingMalloc(void* userdata, size_t size) {
  137. MallocStats* stats = static_cast<MallocStats*>(userdata);
  138. stats->bytes_allocated += size;
  139. ++stats->objects_allocated;
  140. // Arbitrary limit of 2G on allocation; parsing any reasonable document
  141. // shouldn't take more than that.
  142. assert(stats->bytes_allocated < (1 << 31));
  143. void* obj = malloc(size);
  144. // gumbo_debug("Allocated %u bytes at %x.\n", size, obj);
  145. return obj;
  146. }
  147. static void LeakDetectingFree(void* userdata, void* ptr) {
  148. MallocStats* stats = static_cast<MallocStats*>(userdata);
  149. if (ptr) {
  150. ++stats->objects_freed;
  151. // gumbo_debug("Freed %x.\n");
  152. free(ptr);
  153. }
  154. }
  155. void InitLeakDetection(GumboOptions* options, MallocStats* stats) {
  156. stats->bytes_allocated = 0;
  157. stats->objects_allocated = 0;
  158. stats->objects_freed = 0;
  159. options->allocator = LeakDetectingMalloc;
  160. options->deallocator = LeakDetectingFree;
  161. options->userdata = stats;
  162. }
  163. GumboTest::GumboTest()
  164. : options_(kGumboDefaultOptions), errors_are_expected_(false), text_("") {
  165. InitLeakDetection(&options_, &malloc_stats_);
  166. options_.max_errors = 100;
  167. parser_._options = &options_;
  168. parser_._output = static_cast<GumboOutput*>(
  169. gumbo_parser_allocate(&parser_, sizeof(GumboOutput)));
  170. gumbo_init_errors(&parser_);
  171. }
  172. GumboTest::~GumboTest() {
  173. if (!errors_are_expected_) {
  174. // TODO(jdtang): A googlemock matcher may be a more appropriate solution for
  175. // this; we only want to pretty-print errors that are not an expected
  176. // output of the test.
  177. for (int i = 0; i < parser_._output->errors.length && i < 1; ++i) {
  178. gumbo_print_caret_diagnostic(&parser_,
  179. static_cast<GumboError*>(parser_._output->errors.data[i]), text_);
  180. }
  181. }
  182. gumbo_destroy_errors(&parser_);
  183. gumbo_parser_deallocate(&parser_, parser_._output);
  184. EXPECT_EQ(malloc_stats_.objects_allocated, malloc_stats_.objects_freed);
  185. }