char_ref.cc 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. // Copyright 2011 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Author: jdtang@google.com (Jonathan Tang)
  16. //
  17. // GUnit char_ref tests. These are quick smoke tests, mostly to identify
  18. // crashing bugs so that they can be fixed without having to debug
  19. // multi-language tests. As such, they focus on coverage rather than
  20. // completeness. For testing the full spec, use char_ref_py_tests, which share
  21. // their testdata with the Python html5lib library.
  22. #include "char_ref.h"
  23. #include <stdio.h>
  24. #include <string.h>
  25. #include "gtest/gtest.h"
  26. #include "test_utils.h"
  27. #include "utf8.h"
  28. namespace {
  29. class CharRefTest : public GumboTest {
  30. protected:
  31. bool ConsumeCharRef(const char* input) {
  32. return ConsumeCharRef(input, ' ', false);
  33. }
  34. bool ConsumeCharRef(
  35. const char* input, int additional_allowed_char, bool is_in_attribute) {
  36. text_ = input;
  37. utf8iterator_init(&parser_, input, strlen(input), &iter_);
  38. bool result = consume_char_ref(
  39. &parser_, &iter_, additional_allowed_char, is_in_attribute, &output_);
  40. fflush(stdout);
  41. return result;
  42. }
  43. Utf8Iterator iter_;
  44. OneOrTwoCodepoints output_;
  45. };
  46. TEST_F(CharRefTest, Whitespace) {
  47. EXPECT_TRUE(ConsumeCharRef(" &nbsp;"));
  48. EXPECT_EQ(kGumboNoChar, output_.first);
  49. EXPECT_EQ(kGumboNoChar, output_.second);
  50. }
  51. TEST_F(CharRefTest, NumericHex) {
  52. EXPECT_TRUE(ConsumeCharRef("&#x12ab;"));
  53. EXPECT_EQ(0x12ab, output_.first);
  54. EXPECT_EQ(kGumboNoChar, output_.second);
  55. }
  56. TEST_F(CharRefTest, NumericDecimal) {
  57. EXPECT_TRUE(ConsumeCharRef("&#1234;"));
  58. EXPECT_EQ(1234, output_.first);
  59. EXPECT_EQ(kGumboNoChar, output_.second);
  60. }
  61. TEST_F(CharRefTest, NumericInvalidDigit) {
  62. errors_are_expected_ = true;
  63. EXPECT_FALSE(ConsumeCharRef("&#google"));
  64. EXPECT_EQ(kGumboNoChar, output_.first);
  65. EXPECT_EQ(kGumboNoChar, output_.second);
  66. EXPECT_EQ('&', utf8iterator_current(&iter_));
  67. }
  68. TEST_F(CharRefTest, NumericNoSemicolon) {
  69. errors_are_expected_ = true;
  70. EXPECT_FALSE(ConsumeCharRef("&#1234google"));
  71. EXPECT_EQ(1234, output_.first);
  72. EXPECT_EQ(kGumboNoChar, output_.second);
  73. EXPECT_EQ('g', utf8iterator_current(&iter_));
  74. }
  75. TEST_F(CharRefTest, NumericReplacement) {
  76. errors_are_expected_ = true;
  77. EXPECT_FALSE(ConsumeCharRef("&#X82"));
  78. // Low quotation mark character.
  79. EXPECT_EQ(0x201A, output_.first);
  80. EXPECT_EQ(kGumboNoChar, output_.second);
  81. }
  82. TEST_F(CharRefTest, NumericInvalid) {
  83. errors_are_expected_ = true;
  84. EXPECT_FALSE(ConsumeCharRef("&#xDA00"));
  85. EXPECT_EQ(0xFFFD, output_.first);
  86. EXPECT_EQ(kGumboNoChar, output_.second);
  87. }
  88. TEST_F(CharRefTest, NumericUtfInvalid) {
  89. errors_are_expected_ = true;
  90. EXPECT_FALSE(ConsumeCharRef("&#x007"));
  91. EXPECT_EQ(0x7, output_.first);
  92. EXPECT_EQ(kGumboNoChar, output_.second);
  93. }
  94. TEST_F(CharRefTest, NamedReplacement) {
  95. EXPECT_TRUE(ConsumeCharRef("&lt;"));
  96. EXPECT_EQ('<', output_.first);
  97. EXPECT_EQ(kGumboNoChar, output_.second);
  98. }
  99. TEST_F(CharRefTest, NamedReplacementNoSemicolon) {
  100. errors_are_expected_ = true;
  101. EXPECT_FALSE(ConsumeCharRef("&gt"));
  102. EXPECT_EQ('>', output_.first);
  103. EXPECT_EQ(kGumboNoChar, output_.second);
  104. }
  105. TEST_F(CharRefTest, NamedReplacementWithInvalidUtf8) {
  106. errors_are_expected_ = true;
  107. EXPECT_TRUE(ConsumeCharRef("&\xc3\xa5"));
  108. EXPECT_EQ(kGumboNoChar, output_.first);
  109. EXPECT_EQ(kGumboNoChar, output_.second);
  110. }
  111. TEST_F(CharRefTest, NamedReplacementInvalid) {
  112. errors_are_expected_ = true;
  113. EXPECT_FALSE(ConsumeCharRef("&google;"));
  114. EXPECT_EQ(kGumboNoChar, output_.first);
  115. EXPECT_EQ(kGumboNoChar, output_.second);
  116. EXPECT_EQ('&', utf8iterator_current(&iter_));
  117. }
  // Disabled test, kept commented out (the reason it is disabled is not
  // recorded here):
  //
  // TEST_F(CharRefTest, NamedReplacementInvalidNoSemicolon) {
  //   EXPECT_FALSE(ConsumeCharRef("&google"));
  //   EXPECT_EQ(kGumboNoChar, output_.first);
  //   EXPECT_EQ(kGumboNoChar, output_.second);
  //   EXPECT_EQ('&', utf8iterator_current(&iter_));
  // }
  124. TEST_F(CharRefTest, AdditionalAllowedChar) {
  125. EXPECT_TRUE(ConsumeCharRef("&\"", '"', false));
  126. EXPECT_EQ(kGumboNoChar, output_.first);
  127. EXPECT_EQ(kGumboNoChar, output_.second);
  128. EXPECT_EQ('&', utf8iterator_current(&iter_));
  129. }
  130. TEST_F(CharRefTest, InAttribute) {
  131. EXPECT_TRUE(ConsumeCharRef("&noted", ' ', true));
  132. EXPECT_EQ(kGumboNoChar, output_.first);
  133. EXPECT_EQ(kGumboNoChar, output_.second);
  134. EXPECT_EQ('&', utf8iterator_current(&iter_));
  135. }
  136. TEST_F(CharRefTest, MultiChars) {
  137. EXPECT_TRUE(ConsumeCharRef("&notindot;"));
  138. EXPECT_EQ(0x22F5, output_.first);
  139. EXPECT_EQ(0x0338, output_.second);
  140. }
  141. TEST_F(CharRefTest, CharAfter) {
  142. EXPECT_TRUE(ConsumeCharRef("&lt;x"));
  143. EXPECT_EQ('<', output_.first);
  144. EXPECT_EQ(kGumboNoChar, output_.second);
  145. EXPECT_EQ('x', utf8iterator_current(&iter_));
  146. }
  147. } // namespace