| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673 |
- // Copyright 2011 Google Inc. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- // Author: jdtang@google.com (Jonathan Tang)
- #include "tokenizer.h"
- #include <stdio.h>
- #include "gtest/gtest.h"
- #include "test_utils.h"
- extern const char* kGumboTagNames[];
- namespace {
- // Tests for tokenizer.c
- class GumboTokenizerTest : public GumboTest {
- protected:
- GumboTokenizerTest() { gumbo_tokenizer_state_init(&parser_, "", 0); }
- virtual ~GumboTokenizerTest() {
- gumbo_tokenizer_state_destroy(&parser_);
- gumbo_token_destroy(&parser_, &token_);
- }
- void SetInput(const char* input) {
- text_ = input;
- gumbo_tokenizer_state_destroy(&parser_);
- gumbo_tokenizer_state_init(&parser_, input, strlen(input));
- }
- void Advance(int num_tokens) {
- for (int i = 0; i < num_tokens; ++i) {
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- gumbo_token_destroy(&parser_, &token_);
- }
- }
- GumboToken token_;
- };
- TEST(GumboTagEnumTest, TagEnumIncludesAllTags) {
- EXPECT_EQ(150, GUMBO_TAG_UNKNOWN);
- EXPECT_STREQ("", kGumboTagNames[GUMBO_TAG_UNKNOWN]);
- }
- TEST_F(GumboTokenizerTest, PartialTag) {
- SetInput("<a");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
- }
- TEST_F(GumboTokenizerTest, PartialTagWithAttributes) {
- SetInput("<a href=foo /");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
- }
- TEST_F(GumboTokenizerTest, LexCharToken) {
- SetInput("a");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ(1, token_.position.column);
- EXPECT_EQ(1, token_.position.line);
- EXPECT_EQ(0, token_.position.offset);
- EXPECT_EQ('a', *token_.original_text.data);
- EXPECT_EQ(1, token_.original_text.length);
- EXPECT_EQ('a', token_.v.character);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
- EXPECT_EQ(1, token_.position.offset);
- }
- TEST_F(GumboTokenizerTest, LexCharRef) {
- SetInput(" Text");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ(1, token_.position.column);
- EXPECT_EQ(1, token_.position.line);
- EXPECT_EQ(0, token_.position.offset);
- EXPECT_EQ('&', *token_.original_text.data);
- EXPECT_EQ(6, token_.original_text.length);
- EXPECT_EQ(0xA0, token_.v.character);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
- EXPECT_EQ(' ', *token_.original_text.data);
- EXPECT_EQ(' ', token_.v.character);
- }
- TEST_F(GumboTokenizerTest, LexCharRef_NotCharRef) {
- SetInput("&xyz");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- EXPECT_EQ('&', token_.v.character);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ(1, token_.position.offset);
- EXPECT_EQ('x', token_.v.character);
- }
- TEST_F(GumboTokenizerTest, LeadingWhitespace) {
- SetInput(
- "<div>\n"
- " <span class=foo>");
- Advance(4);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_)); // <span>
- GumboTokenStartTag* start_tag = &token_.v.start_tag;
- EXPECT_EQ(GUMBO_TAG_SPAN, start_tag->tag);
- EXPECT_EQ(2, token_.position.line);
- EXPECT_EQ(3, token_.position.column);
- ASSERT_EQ(1, start_tag->attributes.length);
- GumboAttribute* clas =
- static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
- EXPECT_STREQ("class", clas->name);
- EXPECT_EQ("class", ToString(clas->original_name));
- EXPECT_EQ(2, clas->name_start.line);
- EXPECT_EQ(9, clas->name_start.column);
- EXPECT_EQ(14, clas->name_end.column);
- EXPECT_STREQ("foo", clas->value);
- EXPECT_EQ("foo", ToString(clas->original_value));
- EXPECT_EQ(15, clas->value_start.column);
- EXPECT_EQ(18, clas->value_end.column);
- }
- TEST_F(GumboTokenizerTest, Doctype) {
- SetInput("<!doctype html>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- GumboTokenDocType* doc_type = &token_.v.doc_type;
- EXPECT_FALSE(doc_type->force_quirks);
- EXPECT_FALSE(doc_type->has_public_identifier);
- EXPECT_FALSE(doc_type->has_system_identifier);
- EXPECT_STREQ("html", doc_type->name);
- EXPECT_STREQ("", doc_type->public_identifier);
- EXPECT_STREQ("", doc_type->system_identifier);
- }
- TEST_F(GumboTokenizerTest, DoctypePublic) {
- SetInput(
- "<!DOCTYPE html PUBLIC "
- "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" "
- "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- GumboTokenDocType* doc_type = &token_.v.doc_type;
- EXPECT_FALSE(doc_type->force_quirks);
- EXPECT_TRUE(doc_type->has_public_identifier);
- EXPECT_TRUE(doc_type->has_system_identifier);
- EXPECT_STREQ("html", doc_type->name);
- EXPECT_STREQ(
- "-//W3C//DTD XHTML 1.0 Transitional//EN", doc_type->public_identifier);
- EXPECT_STREQ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
- doc_type->system_identifier);
- }
- TEST_F(GumboTokenizerTest, DoctypeSystem) {
- SetInput("<!DOCtype root_element SYSTEM \"DTD_location\">");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- GumboTokenDocType* doc_type = &token_.v.doc_type;
- EXPECT_FALSE(doc_type->force_quirks);
- EXPECT_FALSE(doc_type->has_public_identifier);
- EXPECT_TRUE(doc_type->has_system_identifier);
- EXPECT_STREQ("root_element", doc_type->name);
- EXPECT_STREQ("DTD_location", doc_type->system_identifier);
- }
- TEST_F(GumboTokenizerTest, DoctypeUnterminated) {
- SetInput("<!DOCTYPE a PUBLIC''");
- EXPECT_FALSE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- GumboTokenDocType* doc_type = &token_.v.doc_type;
- EXPECT_TRUE(doc_type->force_quirks);
- EXPECT_TRUE(doc_type->has_public_identifier);
- EXPECT_FALSE(doc_type->has_system_identifier);
- EXPECT_STREQ("a", doc_type->name);
- EXPECT_STREQ("", doc_type->system_identifier);
- }
- TEST_F(GumboTokenizerTest, RawtextEnd) {
- SetInput("<title>x ignores <tag></title>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RAWTEXT);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('x', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(9);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('t', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(3);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
- }
- TEST_F(GumboTokenizerTest, RCDataEnd) {
- SetInput("<title>x</title>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('x', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
- }
- TEST_F(GumboTokenizerTest, ScriptEnd) {
- SetInput("<script>x = '\"></';</script>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('x', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(6);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('/', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('\'', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(1);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
- }
- TEST_F(GumboTokenizerTest, ScriptEscapedEnd) {
- SetInput("<title>x</title>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.start_tag.tag);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT_ESCAPED);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('x', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_TITLE, token_.v.end_tag);
- }
- TEST_F(GumboTokenizerTest, ScriptCommentEscaped) {
- SetInput(
- "<script><!-- var foo = x < 7 + '</div>-- <A href=\"foo\"></a>';\n"
- "-->\n"
- "</script>");
- Advance(1);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
- Advance(15);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('x', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
- EXPECT_EQ(' ', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_WHITESPACE, token_.type);
- EXPECT_EQ(' ', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('7', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(4);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('/', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('d', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(25);
- }
- TEST_F(GumboTokenizerTest, ScriptEscapedEmbeddedLessThan) {
- SetInput("<script>/*<![CDATA[*/ x<7 /*]]>*/</script>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.start_tag.tag);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
- gumbo_token_destroy(&parser_, &token_);
- Advance(14);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('x', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('7', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(8);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_SCRIPT, token_.v.end_tag);
- }
- TEST_F(GumboTokenizerTest, ScriptHasTagEmbedded) {
- SetInput("<script>var foo = '</div>';</script>");
- Advance(1);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
- Advance(11);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('/', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('d', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('i', token_.v.character);
- }
- TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) {
- SetInput(
- "<script><!--var foo = '<a href=\"foo\"></a>\n"
- "<sCrIpt>i--<f</script>'-->;</script>");
- Advance(1);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_SCRIPT);
- Advance(34);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('s', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('C', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- Advance(20);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('-', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('-', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('>', token_.v.character);
- }
- TEST_F(GumboTokenizerTest, CData) {
- // SetInput uses strlen and so can't handle nulls.
- text_ = "<![CDATA[\0filler\0text\0]]>";
- gumbo_tokenizer_state_destroy(&parser_);
- gumbo_tokenizer_state_init(
- &parser_, text_, sizeof("<![CDATA[\0filler\0text\0]]>") - 1);
- gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type);
- EXPECT_EQ(0, token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
- EXPECT_EQ('f', token_.v.character);
- }
- TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) {
- SetInput("<style>/* For <head> */</style>");
- Advance(1);
- gumbo_tokenizer_set_state(&parser_, GUMBO_LEX_RCDATA);
- Advance(7);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('<', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('h', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('e', token_.v.character);
- }
- TEST_F(GumboTokenizerTest, PreWithNewlines) {
- SetInput("<!DOCTYPE html><pre>\r\na</pre>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_DOCTYPE, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ("<pre>", ToString(token_.original_text));
- EXPECT_EQ(15, token_.position.offset);
- }
- TEST_F(GumboTokenizerTest, SelfClosingStartTag) {
- SetInput("<br />");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- EXPECT_EQ("<br />", ToString(token_.original_text));
- GumboTokenStartTag* start_tag = &token_.v.start_tag;
- EXPECT_EQ(GUMBO_TAG_BR, start_tag->tag);
- EXPECT_EQ(0, start_tag->attributes.length);
- EXPECT_TRUE(start_tag->is_self_closing);
- }
- TEST_F(GumboTokenizerTest, OpenTagWithAttributes) {
- SetInput("<a href ='/search?q=foo&hl=en' id=link>");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- GumboTokenStartTag* start_tag = &token_.v.start_tag;
- EXPECT_EQ(GUMBO_TAG_A, start_tag->tag);
- EXPECT_FALSE(start_tag->is_self_closing);
- ASSERT_EQ(2, start_tag->attributes.length);
- GumboAttribute* href =
- static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
- EXPECT_STREQ("href", href->name);
- EXPECT_EQ("href", ToString(href->original_name));
- EXPECT_STREQ("/search?q=foo&hl=en", href->value);
- EXPECT_EQ("'/search?q=foo&hl=en'", ToString(href->original_value));
- GumboAttribute* id =
- static_cast<GumboAttribute*>(start_tag->attributes.data[1]);
- EXPECT_STREQ("id", id->name);
- EXPECT_EQ("id", ToString(id->original_name));
- EXPECT_STREQ("link", id->value);
- EXPECT_EQ("link", ToString(id->original_value));
- }
- TEST_F(GumboTokenizerTest, BogusComment1) {
- SetInput("<?xml is bogus-comment>Text");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
- EXPECT_STREQ("?xml is bogus-comment", token_.v.text);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ('T', token_.v.character);
- errors_are_expected_ = true;
- }
- TEST_F(GumboTokenizerTest, BogusComment2) {
- SetInput("</#bogus-comment");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_COMMENT, token_.type);
- EXPECT_STREQ("#bogus-comment", token_.v.text);
- gumbo_token_destroy(&parser_, &token_);
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- EXPECT_EQ(GUMBO_TOKEN_EOF, token_.type);
- errors_are_expected_ = true;
- }
- TEST_F(GumboTokenizerTest, MultilineAttribute) {
- SetInput(
- "<foo long_attr=\"SomeCode;\n"
- " calls_a_big_long_function();\n"
- " return true;\" />");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- GumboTokenStartTag* start_tag = &token_.v.start_tag;
- EXPECT_EQ(GUMBO_TAG_UNKNOWN, start_tag->tag);
- EXPECT_TRUE(start_tag->is_self_closing);
- ASSERT_EQ(1, start_tag->attributes.length);
- GumboAttribute* long_attr =
- static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
- EXPECT_STREQ("long_attr", long_attr->name);
- EXPECT_EQ("long_attr", ToString(long_attr->original_name));
- EXPECT_STREQ(
- "SomeCode;\n"
- " calls_a_big_long_function();\n"
- " return true;",
- long_attr->value);
- }
- TEST_F(GumboTokenizerTest, DoubleAmpersand) {
- SetInput("<span jsif=\"foo && bar\">");
- EXPECT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- GumboTokenStartTag* start_tag = &token_.v.start_tag;
- EXPECT_EQ(GUMBO_TAG_SPAN, start_tag->tag);
- EXPECT_FALSE(start_tag->is_self_closing);
- ASSERT_EQ(1, start_tag->attributes.length);
- GumboAttribute* jsif =
- static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
- EXPECT_STREQ("jsif", jsif->name);
- EXPECT_EQ("jsif", ToString(jsif->original_name));
- EXPECT_STREQ("foo && bar", jsif->value);
- EXPECT_EQ("\"foo && bar\"", ToString(jsif->original_value));
- }
- TEST_F(GumboTokenizerTest, MatchedTagPair) {
- SetInput("<div id=dash<-Dash data-test=\"bar\">a</div>");
- ASSERT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_START_TAG, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- GumboTokenStartTag* start_tag = &token_.v.start_tag;
- EXPECT_EQ(GUMBO_TAG_DIV, start_tag->tag);
- EXPECT_FALSE(start_tag->is_self_closing);
- ASSERT_EQ(2, start_tag->attributes.length);
- GumboAttribute* id =
- static_cast<GumboAttribute*>(start_tag->attributes.data[0]);
- EXPECT_STREQ("id", id->name);
- EXPECT_EQ("id", ToString(id->original_name));
- EXPECT_EQ(1, id->name_start.line);
- EXPECT_EQ(5, id->name_start.offset);
- EXPECT_EQ(6, id->name_start.column);
- EXPECT_EQ(8, id->name_end.column);
- EXPECT_STREQ("dash<-Dash", id->value);
- EXPECT_EQ("dash<-Dash", ToString(id->original_value));
- EXPECT_EQ(9, id->value_start.column);
- EXPECT_EQ(19, id->value_end.column);
- GumboAttribute* data_attr =
- static_cast<GumboAttribute*>(start_tag->attributes.data[1]);
- EXPECT_STREQ("data-test", data_attr->name);
- EXPECT_EQ("data-test", ToString(data_attr->original_name));
- EXPECT_EQ(20, data_attr->name_start.column);
- EXPECT_EQ(29, data_attr->name_end.column);
- EXPECT_STREQ("bar", data_attr->value);
- EXPECT_EQ("\"bar\"", ToString(data_attr->original_value));
- EXPECT_EQ(30, data_attr->value_start.column);
- EXPECT_EQ(35, data_attr->value_end.column);
- gumbo_token_destroy(&parser_, &token_);
- ASSERT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_CHARACTER, token_.type);
- EXPECT_EQ(35, token_.position.offset);
- EXPECT_EQ('a', token_.v.character);
- gumbo_token_destroy(&parser_, &token_);
- ASSERT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
- EXPECT_EQ(GUMBO_TAG_DIV, token_.v.end_tag);
- errors_are_expected_ = true;
- }
- TEST_F(GumboTokenizerTest, BogusEndTag) {
- // According to the spec, the correct parse of this is an end tag token for
- // "<div<>" (notice the ending bracket) with the attribute "th=th" (ignored
- // because end tags don't take attributes), with the tokenizer passing through
- // the self-closing tag state in the process.
- SetInput("</div</th>");
- ASSERT_TRUE(gumbo_lex(&parser_, &token_));
- ASSERT_EQ(GUMBO_TOKEN_END_TAG, token_.type);
- EXPECT_EQ(0, token_.position.offset);
- EXPECT_EQ(GUMBO_TAG_UNKNOWN, token_.v.end_tag);
- EXPECT_EQ("</div</th>", ToString(token_.original_text));
- errors_are_expected_ = true;
- }
- } // namespace
|