tokenizer.cc 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673
  1. // Copyright 2011 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Author: jdtang@google.com (Jonathan Tang)
  16. #include "tokenizer.h"
  17. #include <stdio.h>
  18. #include "gtest/gtest.h"
  19. #include "test_utils.h"
  20. extern const char* kGumboTagNames[];
  21. namespace {
  22. // Tests for tokenizer.c
  23. class GumboTokenizerTest : public GumboTest {
  24. protected:
  25. GumboTokenizerTest() { gumbo_tokenizer_state_init(&parser_, "", 0); }
  26. virtual ~GumboTokenizerTest() {
  27. gumbo_tokenizer_state_destroy(&parser_);
  28. gumbo_token_destroy(&parser_, &token_);
  29. }
  30. void SetInput(const char* input) {
  31. text_ = input;
  32. gumbo_tokenizer_state_destroy(&parser_);
  33. gumbo_tokenizer_state_init(&parser_, input, strlen(input));
  34. }
  35. void Advance(int num_tokens) {
  36. for (int i = 0; i < num_tokens; ++i) {
  37. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  38. gumbo_token_destroy(&parser_, &token_);
  39. }
  40. }
  41. GumboToken token_;
  42. };
  43. TEST(GumboTagEnumTest, TagEnumIncludesAllTags) {
  44. EXPECT_EQ(150, GUMBO_TAG_UNKNOWN);
  45. EXPECT_STREQ("", kGumboTagNames[GUMBO_TAG_UNKNOWN]);
  46. }
  47. TEST_F(GumboTokenizerTest, PartialTag) {
  48. SetInput("<a");
  49. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  50. EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
  51. }
  52. TEST_F(GumboTokenizerTest, PartialTagWithAttributes) {
  53. SetInput("<a href=foo /");
  54. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  55. EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
  56. }
  57. TEST_F(GumboTokenizerTest, LexCharToken) {
  58. SetInput("a");
  59. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  60. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  61. EXPECT_EQ(1, token_.position.column);
  62. EXPECT_EQ(1, token_.position.line);
  63. EXPECT_EQ(0, token_.position.offset);
  64. EXPECT_EQ('a', *token_.original_text.data);
  65. EXPECT_EQ(1, token_.original_text.length);
  66. EXPECT_EQ('a', token_.v.character);
  67. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  68. EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
  69. EXPECT_EQ(1, token_.position.offset);
  70. }
  71. TEST_F(GumboTokenizerTest, LexCharRef) {
  72. SetInput("&nbsp; Text");
  73. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  74. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  75. EXPECT_EQ(1, token_.position.column);
  76. EXPECT_EQ(1, token_.position.line);
  77. EXPECT_EQ(0, token_.position.offset);
  78. EXPECT_EQ('&', *token_.original_text.data);
  79. EXPECT_EQ(6, token_.original_text.length);
  80. EXPECT_EQ(0xA0, token_.v.character);
  81. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  82. EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  83. EXPECT_EQ(' ', *token_.original_text.data);
  84. EXPECT_EQ(' ', token_.v.character);
  85. }
  86. TEST_F(GumboTokenizerTest, LexCharRef_NotCharRef) {
  87. SetInput("&xyz");
  88. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  89. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  90. EXPECT_EQ(0, token_.position.offset);
  91. EXPECT_EQ('&', token_.v.character);
  92. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  93. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  94. EXPECT_EQ(1, token_.position.offset);
  95. EXPECT_EQ('x', token_.v.character);
  96. }
  97. TEST_F(GumboTokenizerTest, LeadingWhitespace) {
  98. SetInput(
  99. "<div>\n"
  100. " <span class=foo>");
  101. Advance(4);
  102. EXPECT_TRUE(gumbo_lex(&parser_, &token_)); // <span>
  103. GumboTokenStartTag* start_tag = &token_.v.start_tag;
  104. EXPECT_EQ(GUMBO_TAG_SPAN, start_tag->tag);
  105. EXPECT_EQ(2, token_.position.line);
  106. EXPECT_EQ(3, token_.position.column);
  107. ASSERT_EQ(1, start_tag->attributes.length);
  108. GumboAttribute* clas =
  109. static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
  110. EXPECT_STREQ("class", clas->name);
  111. EXPECT_EQ("class", ToString(clas->original_name));
  112. EXPECT_EQ(2, clas->name_start.line);
  113. EXPECT_EQ(9, clas->name_start.column);
  114. EXPECT_EQ(14, clas->name_end.column);
  115. EXPECT_STREQ("foo", clas->value);
  116. EXPECT_EQ("foo", ToString(clas->original_value));
  117. EXPECT_EQ(15, clas->value_start.column);
  118. EXPECT_EQ(18, clas->value_end.column);
  119. }
  120. TEST_F(GumboTokenizerTest, Doctype) {
  121. SetInput("<!doctype html>");
  122. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  123. ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  124. EXPECT_EQ(0, token_.position.offset);
  125. GumboTokenDocType* doc_type = &token_.v.doc_type;
  126. EXPECT_FALSE(doc_type->force_quirks);
  127. EXPECT_FALSE(doc_type->has_public_identifier);
  128. EXPECT_FALSE(doc_type->has_system_identifier);
  129. EXPECT_STREQ("html", doc_type->name);
  130. EXPECT_STREQ("", doc_type->public_identifier);
  131. EXPECT_STREQ("", doc_type->system_identifier);
  132. }
  133. TEST_F(GumboTokenizerTest, DoctypePublic) {
  134. SetInput(
  135. "<!DOCTYPE html PUBLIC "
  136. "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" "
  137. "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>");
  138. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  139. ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  140. EXPECT_EQ(0, token_.position.offset);
  141. GumboTokenDocType* doc_type = &token_.v.doc_type;
  142. EXPECT_FALSE(doc_type->force_quirks);
  143. EXPECT_TRUE(doc_type->has_public_identifier);
  144. EXPECT_TRUE(doc_type->has_system_identifier);
  145. EXPECT_STREQ("html", doc_type->name);
  146. EXPECT_STREQ(
  147. "-//W3C//DTD XHTML 1.0 Transitional//EN", doc_type->public_identifier);
  148. EXPECT_STREQ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
  149. doc_type->system_identifier);
  150. }
  151. TEST_F(GumboTokenizerTest, DoctypeSystem) {
  152. SetInput("<!DOCtype root_element SYSTEM \"DTD_location\">");
  153. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  154. ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  155. EXPECT_EQ(0, token_.position.offset);
  156. GumboTokenDocType* doc_type = &token_.v.doc_type;
  157. EXPECT_FALSE(doc_type->force_quirks);
  158. EXPECT_FALSE(doc_type->has_public_identifier);
  159. EXPECT_TRUE(doc_type->has_system_identifier);
  160. EXPECT_STREQ("root_element", doc_type->name);
  161. EXPECT_STREQ("DTD_location", doc_type->system_identifier);
  162. }
  163. TEST_F(GumboTokenizerTest, DoctypeUnterminated) {
  164. SetInput("<!DOCTYPE a PUBLIC''");
  165. EXPECT_FALSE(gumbo_lex(&parser_, &token_));
  166. ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  167. EXPECT_EQ(0, token_.position.offset);
  168. GumboTokenDocType* doc_type = &token_.v.doc_type;
  169. EXPECT_TRUE(doc_type->force_quirks);
  170. EXPECT_TRUE(doc_type->has_public_identifier);
  171. EXPECT_FALSE(doc_type->has_system_identifier);
  172. EXPECT_STREQ("a", doc_type->name);
  173. EXPECT_STREQ("", doc_type->system_identifier);
  174. }
  175. TEST_F(GumboTokenizerTest, RawtextEnd) {
  176. SetInput("<title>x ignores <tag></title>");
  177. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  178. EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  179. EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);
  180. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RAWTEXT);
  181. gumbo_token_destroy(&parser_, &token_);
  182. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  183. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  184. EXPECT_EQ('x', token_.v.character);
  185. gumbo_token_destroy(&parser_, &token_);
  186. Advance(9);
  187. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  188. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  189. EXPECT_EQ('<', token_.v.character);
  190. gumbo_token_destroy(&parser_, &token_);
  191. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  192. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  193. EXPECT_EQ('t', token_.v.character);
  194. gumbo_token_destroy(&parser_, &token_);
  195. Advance(3);
  196. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  197. EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  198. EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
  199. }
  200. TEST_F(GumboTokenizerTest, RCDataEnd) {
  201. SetInput("<title>x</title>");
  202. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  203. EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  204. EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);
  205. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
  206. gumbo_token_destroy(&parser_, &token_);
  207. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  208. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  209. EXPECT_EQ('x', token_.v.character);
  210. gumbo_token_destroy(&parser_, &token_);
  211. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  212. EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  213. EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
  214. }
  215. TEST_F(GumboTokenizerTest, ScriptEnd) {
  216. SetInput("<script>x = '\"></';</script>");
  217. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  218. EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  219. EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);
  220. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  221. gumbo_token_destroy(&parser_, &token_);
  222. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  223. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  224. EXPECT_EQ('x', token_.v.character);
  225. gumbo_token_destroy(&parser_, &token_);
  226. Advance(6);
  227. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  228. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  229. EXPECT_EQ('<', token_.v.character);
  230. gumbo_token_destroy(&parser_, &token_);
  231. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  232. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  233. EXPECT_EQ('/', token_.v.character);
  234. gumbo_token_destroy(&parser_, &token_);
  235. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  236. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  237. EXPECT_EQ('\'', token_.v.character);
  238. gumbo_token_destroy(&parser_, &token_);
  239. Advance(1);
  240. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  241. EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  242. EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
  243. }
  244. TEST_F(GumboTokenizerTest, ScriptEscapedEnd) {
  245. SetInput("<title>x</title>");
  246. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  247. EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  248. EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);
  249. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT_ESCAPED);
  250. gumbo_token_destroy(&parser_, &token_);
  251. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  252. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  253. EXPECT_EQ('x', token_.v.character);
  254. gumbo_token_destroy(&parser_, &token_);
  255. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  256. EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  257. EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
  258. }
  259. TEST_F(GumboTokenizerTest, ScriptCommentEscaped) {
  260. SetInput(
  261. "<script><!-- var foo = x < 7 + '</div>-- <A href=\"foo\"></a>';\n"
  262. "-->\n"
  263. "</script>");
  264. Advance(1);
  265. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  266. Advance(15);
  267. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  268. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  269. EXPECT_EQ('x', token_.v.character);
  270. gumbo_token_destroy(&parser_, &token_);
  271. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  272. EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  273. EXPECT_EQ(' ', token_.v.character);
  274. gumbo_token_destroy(&parser_, &token_);
  275. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  276. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  277. EXPECT_EQ('<', token_.v.character);
  278. gumbo_token_destroy(&parser_, &token_);
  279. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  280. EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
  281. EXPECT_EQ(' ', token_.v.character);
  282. gumbo_token_destroy(&parser_, &token_);
  283. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  284. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  285. EXPECT_EQ('7', token_.v.character);
  286. gumbo_token_destroy(&parser_, &token_);
  287. Advance(4);
  288. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  289. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  290. EXPECT_EQ('<', token_.v.character);
  291. gumbo_token_destroy(&parser_, &token_);
  292. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  293. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  294. EXPECT_EQ('/', token_.v.character);
  295. gumbo_token_destroy(&parser_, &token_);
  296. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  297. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  298. EXPECT_EQ('d', token_.v.character);
  299. gumbo_token_destroy(&parser_, &token_);
  300. Advance(25);
  301. }
  302. TEST_F(GumboTokenizerTest, ScriptEscapedEmbeddedLessThan) {
  303. SetInput("<script>/*<![CDATA[*/ x<7 /*]]>*/</script>");
  304. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  305. EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  306. EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);
  307. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  308. gumbo_token_destroy(&parser_, &token_);
  309. Advance(14);
  310. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  311. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  312. EXPECT_EQ('x', token_.v.character);
  313. gumbo_token_destroy(&parser_, &token_);
  314. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  315. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  316. EXPECT_EQ('<', token_.v.character);
  317. gumbo_token_destroy(&parser_, &token_);
  318. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  319. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  320. EXPECT_EQ('7', token_.v.character);
  321. gumbo_token_destroy(&parser_, &token_);
  322. Advance(8);
  323. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  324. EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  325. EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
  326. }
  327. TEST_F(GumboTokenizerTest, ScriptHasTagEmbedded) {
  328. SetInput("<script>var foo = '</div>';</script>");
  329. Advance(1);
  330. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  331. Advance(11);
  332. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  333. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  334. EXPECT_EQ('<', token_.v.character);
  335. gumbo_token_destroy(&parser_, &token_);
  336. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  337. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  338. EXPECT_EQ('/', token_.v.character);
  339. gumbo_token_destroy(&parser_, &token_);
  340. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  341. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  342. EXPECT_EQ('d', token_.v.character);
  343. gumbo_token_destroy(&parser_, &token_);
  344. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  345. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  346. EXPECT_EQ('i', token_.v.character);
  347. }
  348. TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) {
  349. SetInput(
  350. "<script><!--var foo = '<a href=\"foo\"></a>\n"
  351. "<sCrIpt>i--<f</script>'-->;</script>");
  352. Advance(1);
  353. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
  354. Advance(34);
  355. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  356. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  357. EXPECT_EQ('<', token_.v.character);
  358. gumbo_token_destroy(&parser_, &token_);
  359. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  360. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  361. EXPECT_EQ('s', token_.v.character);
  362. gumbo_token_destroy(&parser_, &token_);
  363. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  364. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  365. EXPECT_EQ('C', token_.v.character);
  366. gumbo_token_destroy(&parser_, &token_);
  367. Advance(20);
  368. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  369. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  370. EXPECT_EQ('-', token_.v.character);
  371. gumbo_token_destroy(&parser_, &token_);
  372. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  373. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  374. EXPECT_EQ('-', token_.v.character);
  375. gumbo_token_destroy(&parser_, &token_);
  376. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  377. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  378. EXPECT_EQ('>', token_.v.character);
  379. }
  380. TEST_F(GumboTokenizerTest, CData) {
  381. // SetInput uses strlen and so can't handle nulls.
  382. text_ = "<![CDATA[\0filler\0text\0]]>";
  383. gumbo_tokenizer_state_destroy(&parser_);
  384. gumbo_tokenizer_state_init(
  385. &parser_, text_, sizeof("<![CDATA[\0filler\0text\0]]>") - 1);
  386. gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);
  387. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  388. EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type);
  389. EXPECT_EQ(0, token_.v.character);
  390. gumbo_token_destroy(&parser_, &token_);
  391. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  392. EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
  393. EXPECT_EQ('f', token_.v.character);
  394. }
  395. TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) {
  396. SetInput("<style>/* For <head> */</style>");
  397. Advance(1);
  398. gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
  399. Advance(7);
  400. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  401. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  402. EXPECT_EQ('<', token_.v.character);
  403. gumbo_token_destroy(&parser_, &token_);
  404. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  405. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  406. EXPECT_EQ('h', token_.v.character);
  407. gumbo_token_destroy(&parser_, &token_);
  408. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  409. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  410. EXPECT_EQ('e', token_.v.character);
  411. }
  412. TEST_F(GumboTokenizerTest, PreWithNewlines) {
  413. SetInput("<!DOCTYPE html><pre>\r\na</pre>");
  414. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  415. ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
  416. EXPECT_EQ(0, token_.position.offset);
  417. gumbo_token_destroy(&parser_, &token_);
  418. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  419. ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  420. EXPECT_EQ("<pre>", ToString(token_.original_text));
  421. EXPECT_EQ(15, token_.position.offset);
  422. }
  423. TEST_F(GumboTokenizerTest, SelfClosingStartTag) {
  424. SetInput("<br />");
  425. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  426. ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  427. EXPECT_EQ(0, token_.position.offset);
  428. EXPECT_EQ("<br />", ToString(token_.original_text));
  429. GumboTokenStartTag* start_tag = &token_.v.start_tag;
  430. EXPECT_EQ(GUMBO_TAG_BR, start_tag->tag);
  431. EXPECT_EQ(0, start_tag->attributes.length);
  432. EXPECT_TRUE(start_tag->is_self_closing);
  433. }
  434. TEST_F(GumboTokenizerTest, OpenTagWithAttributes) {
  435. SetInput("<a href ='/search?q=foo&amp;hl=en' id=link>");
  436. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  437. ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  438. GumboTokenStartTag* start_tag = &token_.v.start_tag;
  439. EXPECT_EQ(GUMBO_TAG_A, start_tag->tag);
  440. EXPECT_FALSE(start_tag->is_self_closing);
  441. ASSERT_EQ(2, start_tag->attributes.length);
  442. GumboAttribute* href =
  443. static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
  444. EXPECT_STREQ("href", href->name);
  445. EXPECT_EQ("href", ToString(href->original_name));
  446. EXPECT_STREQ("/search?q=foo&hl=en", href->value);
  447. EXPECT_EQ("'/search?q=foo&amp;hl=en'", ToString(href->original_value));
  448. GumboAttribute* id =
  449. static_cast<GumboAttribute*>(start_tag->attributes.data[1]);
  450. EXPECT_STREQ("id", id->name);
  451. EXPECT_EQ("id", ToString(id->original_name));
  452. EXPECT_STREQ("link", id->value);
  453. EXPECT_EQ("link", ToString(id->original_value));
  454. }
  455. TEST_F(GumboTokenizerTest, BogusComment1) {
  456. SetInput("<?xml is bogus-comment>Text");
  457. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  458. ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
  459. EXPECT_STREQ("?xml is bogus-comment", token_.v.text);
  460. gumbo_token_destroy(&parser_, &token_);
  461. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  462. EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  463. EXPECT_EQ('T', token_.v.character);
  464. errors_are_expected_ = true;
  465. }
  466. TEST_F(GumboTokenizerTest, BogusComment2) {
  467. SetInput("</#bogus-comment");
  468. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  469. ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
  470. EXPECT_STREQ("#bogus-comment", token_.v.text);
  471. gumbo_token_destroy(&parser_, &token_);
  472. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  473. EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
  474. errors_are_expected_ = true;
  475. }
  476. TEST_F(GumboTokenizerTest, MultilineAttribute) {
  477. SetInput(
  478. "<foo long_attr=\"SomeCode;\n"
  479. " calls_a_big_long_function();\n"
  480. " return true;\" />");
  481. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  482. ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  483. GumboTokenStartTag* start_tag = &token_.v.start_tag;
  484. EXPECT_EQ(GUMBO_TAG_UNKNOWN, start_tag->tag);
  485. EXPECT_TRUE(start_tag->is_self_closing);
  486. ASSERT_EQ(1, start_tag->attributes.length);
  487. GumboAttribute* long_attr =
  488. static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
  489. EXPECT_STREQ("long_attr", long_attr->name);
  490. EXPECT_EQ("long_attr", ToString(long_attr->original_name));
  491. EXPECT_STREQ(
  492. "SomeCode;\n"
  493. " calls_a_big_long_function();\n"
  494. " return true;",
  495. long_attr->value);
  496. }
  497. TEST_F(GumboTokenizerTest, DoubleAmpersand) {
  498. SetInput("<span jsif=\"foo && bar\">");
  499. EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  500. ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  501. GumboTokenStartTag* start_tag = &token_.v.start_tag;
  502. EXPECT_EQ(GUMBO_TAG_SPAN, start_tag->tag);
  503. EXPECT_FALSE(start_tag->is_self_closing);
  504. ASSERT_EQ(1, start_tag->attributes.length);
  505. GumboAttribute* jsif =
  506. static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
  507. EXPECT_STREQ("jsif", jsif->name);
  508. EXPECT_EQ("jsif", ToString(jsif->original_name));
  509. EXPECT_STREQ("foo && bar", jsif->value);
  510. EXPECT_EQ("\"foo && bar\"", ToString(jsif->original_value));
  511. }
  512. TEST_F(GumboTokenizerTest, MatchedTagPair) {
  513. SetInput("<div id=dash<-Dash data-test=\"bar\">a</div>");
  514. ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  515. ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
  516. EXPECT_EQ(0, token_.position.offset);
  517. GumboTokenStartTag* start_tag = &token_.v.start_tag;
  518. EXPECT_EQ(GUMBO_TAG_DIV, start_tag->tag);
  519. EXPECT_FALSE(start_tag->is_self_closing);
  520. ASSERT_EQ(2, start_tag->attributes.length);
  521. GumboAttribute* id =
  522. static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
  523. EXPECT_STREQ("id", id->name);
  524. EXPECT_EQ("id", ToString(id->original_name));
  525. EXPECT_EQ(1, id->name_start.line);
  526. EXPECT_EQ(5, id->name_start.offset);
  527. EXPECT_EQ(6, id->name_start.column);
  528. EXPECT_EQ(8, id->name_end.column);
  529. EXPECT_STREQ("dash<-Dash", id->value);
  530. EXPECT_EQ("dash<-Dash", ToString(id->original_value));
  531. EXPECT_EQ(9, id->value_start.column);
  532. EXPECT_EQ(19, id->value_end.column);
  533. GumboAttribute* data_attr =
  534. static_cast<GumboAttribute*>(start_tag->attributes.data[1]);
  535. EXPECT_STREQ("data-test", data_attr->name);
  536. EXPECT_EQ("data-test", ToString(data_attr->original_name));
  537. EXPECT_EQ(20, data_attr->name_start.column);
  538. EXPECT_EQ(29, data_attr->name_end.column);
  539. EXPECT_STREQ("bar", data_attr->value);
  540. EXPECT_EQ("\"bar\"", ToString(data_attr->original_value));
  541. EXPECT_EQ(30, data_attr->value_start.column);
  542. EXPECT_EQ(35, data_attr->value_end.column);
  543. gumbo_token_destroy(&parser_, &token_);
  544. ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  545. ASSERT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
  546. EXPECT_EQ(35, token_.position.offset);
  547. EXPECT_EQ('a', token_.v.character);
  548. gumbo_token_destroy(&parser_, &token_);
  549. ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  550. ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  551. EXPECT_EQ(GUMBO_TAG_DIV, token_.v.end_tag);
  552. errors_are_expected_ = true;
  553. }
  554. TEST_F(GumboTokenizerTest, BogusEndTag) {
  555. // According to the spec, the correct parse of this is an end tag token for
  556. // "<div<>" (notice the ending bracket) with the attribute "th=th" (ignored
  557. // because end tags don't take attributes), with the tokenizer passing through
  558. // the self-closing tag state in the process.
  559. SetInput("</div</th>");
  560. ASSERT_TRUE(gumbo_lex(&parser_, &token_));
  561. ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
  562. EXPECT_EQ(0, token_.position.offset);
  563. EXPECT_EQ(GUMBO_TAG_UNKNOWN, token_.v.end_tag);
  564. EXPECT_EQ("</div</th>", ToString(token_.original_text));
  565. errors_are_expected_ = true;
  566. }
  567. } // namespace