| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599 |
- // Copyright 2011 Google Inc. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- // Author: jdtang@google.com (Jonathan Tang)
- #include "utf8.h"
- #include <string.h>
- #include "gtest/gtest.h"
- #include "error.h"
- #include "gumbo.h"
- #include "parser.h"
- #include "test_utils.h"
- namespace {
- // Tests for utf8.c
- // Fixture shared by every UTF-8 iterator test.  It owns the iterator under
- // test (input_) and uses text_ and parser_, which are not declared here and
- // so come from the GumboTest base class.
- class Utf8Test : public GumboTest {
-  protected:
-   // Moves the iterator forward num_chars times.  A count of zero or less
-   // leaves the iterator untouched.
-   void Advance(int num_chars) {
-     int remaining = num_chars;
-     while (remaining > 0) {
-       utf8iterator_next(&input_);
-       --remaining;
-     }
-   }
-   // Re-initializes the iterator over a NUL-terminated string.  The length is
-   // taken with strlen, so inputs containing an embedded NUL must call
-   // utf8iterator_init directly instead.
-   void ResetText(const char* text) {
-     const size_t length = strlen(text);
-     text_ = text;
-     utf8iterator_init(&parser_, text, length, &input_);
-   }
-   // Returns the first entry of the parser's error list.  Only meaningful
-   // once at least one error has been recorded; no bounds check is done.
-   GumboError* GetFirstError() {
-     return static_cast<GumboError*>(parser_._output->errors.data[0]);
-   }
-   // Returns how many errors the parser output currently holds.
-   int GetNumErrors() { return parser_._output->errors.length; }
-   // The iterator exercised by each test.
-   Utf8Iterator input_;
- };
- // An empty buffer has nothing to decode: the very first read yields -1, the
- // value these tests use for end of input.
- TEST_F(Utf8Test, EmptyString) {
-   ResetText("");
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // Even with no input, the reported position is the start of the document:
- // line 1, column 1, byte offset 0.
- TEST_F(Utf8Test, GetPosition_EmptyString) {
-   ResetText("");
-   GumboSourcePosition start;
-   utf8iterator_get_position(&input_, &start);
-   EXPECT_EQ(1, start.line);
-   EXPECT_EQ(1, start.column);
-   EXPECT_EQ(0, start.offset);
- }
- // A NUL byte inside the buffer is decoded as codepoint 0 and does not end the
- // input; the character after it is still reachable.
- TEST_F(Utf8Test, Null) {
-   // ResetText measures the input with strlen, which would stop at the NUL, so
-   // the iterator is initialized by hand with an explicit length of 2.
-   text_ = "\0f";
-   utf8iterator_init(&parser_, text_, 2, &input_);
-   EXPECT_EQ(0, utf8iterator_current(&input_));
-   EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ('f', utf8iterator_current(&input_));
-   EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_));
- }
- // A single ASCII character decodes to itself without errors, sits at the
- // start position, and is followed directly by end of input.
- TEST_F(Utf8Test, OneByteChar) {
-   ResetText("a");
-   EXPECT_EQ(0, GetNumErrors());
-   EXPECT_EQ('a', utf8iterator_current(&input_));
-   EXPECT_EQ('a', *utf8iterator_get_char_pointer(&input_));
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(1, where.column);
-   EXPECT_EQ(0, where.offset);
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // \x85 = 10000101 is a continuation byte with no lead byte before it.  It is
- // reported as one invalid-UTF-8 error and decoded as U+FFFD, while the char
- // pointer still refers to the original byte.
- TEST_F(Utf8Test, ContinuationByte) {
-   ResetText("\x85");
-   EXPECT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   EXPECT_EQ('\x85', *utf8iterator_get_char_pointer(&input_));
-   errors_are_expected_ = true;
-   GumboError* first_error = GetFirstError();
-   EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, first_error->type);
-   EXPECT_EQ('\x85', *first_error->original_text);
-   EXPECT_EQ(0x85, first_error->v.codepoint);
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // Each malformed byte becomes its own U+FFFD: the two stray continuation
- // bytes, the \xC2 lead byte that is followed by 'x' instead of a
- // continuation byte, and the trailing \x9A.  That is four errors in total.
- TEST_F(Utf8Test, MultipleContinuationBytes) {
-   ResetText("a\x85\xA0\xC2x\x9A");
-   EXPECT_EQ('a', utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ('x', utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
-   // One more step past the end before the error count is checked.
-   Advance(1);
-   EXPECT_EQ(4, GetNumErrors());
- }
- // \xC0\x75 = 11000000 01110101.  The \xC0 byte is rejected on its own with a
- // single error located at the start of the input; the 0x75 after it is then
- // read as an ordinary one-byte character.
- TEST_F(Utf8Test, OverlongEncoding) {
-   ResetText("\xC0\x75");
-   ASSERT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   EXPECT_EQ('\xC0', *utf8iterator_get_char_pointer(&input_));
-   errors_are_expected_ = true;
-   GumboError* first_error = GetFirstError();
-   EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, first_error->type);
-   EXPECT_EQ(1, first_error->position.line);
-   EXPECT_EQ(1, first_error->position.column);
-   EXPECT_EQ(0, first_error->position.offset);
-   EXPECT_EQ('\xC0', *first_error->original_text);
-   EXPECT_EQ(0xC0, first_error->v.codepoint);
-   Advance(1);
-   EXPECT_EQ(0x75, utf8iterator_current(&input_));
-   EXPECT_EQ('\x75', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // \xC0\x85 = 11000000 10000101.  Both bytes decode to U+FFFD, one after the
- // other; the first recorded error is the one for \xC0 at the start of the
- // input.
- TEST_F(Utf8Test, OverlongEncodingWithContinuationByte) {
-   ResetText("\xC0\x85");
-   ASSERT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   EXPECT_EQ('\xC0', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   errors_are_expected_ = true;
-   GumboError* first_error = GetFirstError();
-   EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, first_error->type);
-   EXPECT_EQ(1, first_error->position.line);
-   EXPECT_EQ(1, first_error->position.column);
-   EXPECT_EQ(0, first_error->position.offset);
-   EXPECT_EQ('\xC0', *first_error->original_text);
-   EXPECT_EQ(0xC0, first_error->v.codepoint);
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // A valid two-byte sequence counts as one column but two bytes of offset.
- TEST_F(Utf8Test, TwoByteChar) {
-   // \xC3\xA5 = 11000011 10100101.
-   ResetText("\xC3\xA5o");
-   EXPECT_EQ(0, GetNumErrors());
-   // Codepoint = 000 11100101 = 0xE5.
-   EXPECT_EQ(0xE5, utf8iterator_current(&input_));
-   EXPECT_EQ('\xC3', *utf8iterator_get_char_pointer(&input_));
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(1, where.column);
-   EXPECT_EQ(0, where.offset);
-   // The 'o' that follows is the second column but starts at byte offset 2.
-   Advance(1);
-   EXPECT_EQ('o', utf8iterator_current(&input_));
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(2, where.column);
-   EXPECT_EQ(2, where.offset);
- }
- // A second valid two-byte sequence, this time at the very end of the input.
- TEST_F(Utf8Test, TwoByteChar2) {
-   // \xC2\xA5 = 11000010 10100101.
-   ResetText("\xC2\xA5");
-   EXPECT_EQ(0, GetNumErrors());
-   // Codepoint = 000 10100101 = 0xA5.
-   EXPECT_EQ(0xA5, utf8iterator_current(&input_));
-   EXPECT_EQ('\xC2', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // A valid three-byte sequence followed by a stray continuation byte.  The
- // error for the stray byte is only recorded once the iterator reaches it.
- TEST_F(Utf8Test, ThreeByteChar) {
-   // \xE3\xA7\xA7 = 11100011 10100111 10100111
-   ResetText("\xE3\xA7\xA7\xB0");
-   EXPECT_EQ(0, GetNumErrors());
-   // Codepoint = 00111001 11100111 = 0x39E7
-   EXPECT_EQ(0x39E7, utf8iterator_current(&input_));
-   EXPECT_EQ('\xE3', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   EXPECT_EQ('\xB0', *utf8iterator_get_char_pointer(&input_));
-   // The stray byte is the second column and lies at byte offset 3.
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(2, where.column);
-   EXPECT_EQ(3, where.offset);
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // A two-byte character directly followed by a valid four-byte character.
- TEST_F(Utf8Test, FourByteChar) {
-   // \xC3\x9A = 11000011 10011010
-   // \xF1\xA7\xA7\xA7 = 11110001 10100111 10100111 10100111
-   ResetText("\xC3\x9A\xF1\xA7\xA7\xA7");
-   // Codepoint = 000 11011010 = 0xDA.
-   EXPECT_EQ(0xDA, utf8iterator_current(&input_));
-   EXPECT_EQ('\xC3', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   // Codepoint = 00110 01111001 11100111 = 0x679E7.
-   EXPECT_EQ(0x679E7, utf8iterator_current(&input_));
-   EXPECT_EQ('\xF1', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // The lead byte \xF1 (11110001) announces a four-byte sequence, but only two
- // continuation bytes (\xA7 = 10100111) follow before the ASCII '-'.  The
- // incomplete sequence yields a single error and a single U+FFFD, and the
- // next character read is the '-'.
- TEST_F(Utf8Test, FourByteCharWithoutContinuationChars) {
-   ResetText("\xF1\xA7\xA7-");
-   EXPECT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   EXPECT_EQ('\xF1', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ('-', utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // Five bytes (\xF6 plus four continuation bytes) are not accepted as one
- // character.  The first byte is already an error, and four steps later the
- // iterator sits on the last \xA7, which is also decoded as U+FFFD.
- TEST_F(Utf8Test, FiveByteCharIsError) {
-   ResetText("\xF6\xA7\xA7\xA7\xA7x");
-   EXPECT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(4);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ('x', utf8iterator_current(&input_));
- }
- // Six bytes (\xF8 plus five continuation bytes) are not accepted as one
- // character.  The first byte is already an error, and five steps later the
- // iterator sits on the last \xA7, which is also decoded as U+FFFD.
- TEST_F(Utf8Test, SixByteCharIsError) {
-   ResetText("\xF8\xA7\xA7\xA7\xA7\xA7x");
-   EXPECT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(5);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ('x', utf8iterator_current(&input_));
- }
- // Seven bytes (\xFC plus six continuation bytes) are not accepted as one
- // character.  The first byte is already an error, and six steps later the
- // iterator sits on the last \xA7, which is also decoded as U+FFFD.
- TEST_F(Utf8Test, SevenByteCharIsError) {
-   ResetText("\xFC\xA7\xA7\xA7\xA7\xA7\xA7x");
-   EXPECT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(6);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ('x', utf8iterator_current(&input_));
- }
- TEST_F(Utf8Test, 0xFFIsError) {
- ResetText("\xFFx");
- EXPECT_EQ(1, GetNumErrors());
- EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
- utf8iterator_next(&input_);
- EXPECT_EQ('x', utf8iterator_current(&input_));
- }
- TEST_F(Utf8Test, InvalidControlCharIsError) {
- ResetText("\x1Bx");
- EXPECT_EQ(1, GetNumErrors());
- EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
- utf8iterator_next(&input_);
- EXPECT_EQ('x', utf8iterator_current(&input_));
- }
- // The input ends in the middle of a multi-byte sequence.  This is reported
- // as a truncation error (not an invalid-byte error) anchored at the lead
- // byte, and the error's codepoint field holds the raw bytes seen, 0xF1A7.
- TEST_F(Utf8Test, TruncatedInput) {
-   ResetText("\xF1\xA7");
-   EXPECT_EQ(1, GetNumErrors());
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   errors_are_expected_ = true;
-   GumboError* first_error = GetFirstError();
-   EXPECT_EQ(GUMBO_ERR_UTF8_TRUNCATED, first_error->type);
-   EXPECT_EQ(1, first_error->position.line);
-   EXPECT_EQ(1, first_error->position.column);
-   EXPECT_EQ(0, first_error->position.offset);
-   EXPECT_EQ('\xF1', *first_error->original_text);
-   EXPECT_EQ(0xF1A7, first_error->v.codepoint);
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // Walks a byte string that mixes ASCII, stray continuation bytes, incomplete
- // three-byte sequences and one complete three-byte sequence.
- TEST_F(Utf8Test, Html5SpecExample) {
-   // This example has since been removed from the spec, and the spec has been
-   // changed to reference the Unicode Standard 6.2, 5.22 "Best practices for
-   // U+FFFD substitution."
-   ResetText("\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98");
-   EXPECT_EQ('A', utf8iterator_current(&input_));
-   Advance(1);
-   // The stray continuation bytes \x98 and \xBA each become their own U+FFFD.
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ('B', utf8iterator_current(&input_));
-   Advance(1);
-   // The incomplete sequence \xE2\x98 collapses into a single U+FFFD.
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ('C', utf8iterator_current(&input_));
-   Advance(1);
-   // \xE2\x98\xBA = 11100010 10011000 10111010
-   // Codepoint = 00100110 00111010 = 0x263A
-   EXPECT_EQ(0x263A, utf8iterator_current(&input_));
-   Advance(1);
-   // The trailing \xE2\x98 is cut off by the end of the input.
-   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
-   Advance(1);
- }
- // Stepping past the end of the input repeatedly keeps yielding -1.
- TEST_F(Utf8Test, MultipleEOFReads) {
-   ResetText("a");
-   // Two steps on a one-character input: the second is already past the end.
-   utf8iterator_next(&input_);
-   utf8iterator_next(&input_);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
-   Advance(1);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // For pure ASCII input the column is one more than the byte offset, and the
- // input ends right after the last character.
- TEST_F(Utf8Test, AsciiOnly) {
-   ResetText("hello");
-   Advance(4);
-   EXPECT_EQ('o', utf8iterator_current(&input_));
-   EXPECT_EQ('o', *utf8iterator_get_char_pointer(&input_));
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(5, where.column);
-   EXPECT_EQ(4, where.offset);
-   utf8iterator_next(&input_);
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // Checks the reported position on a newline and on the character after it.
- TEST_F(Utf8Test, NewlinePosition) {
-   ResetText("a\nnewline");
-   utf8iterator_next(&input_);
-   // Newline itself should register as being at the end of a line.
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(2, where.column);
-   EXPECT_EQ(1, where.offset);
-   // The next character should be at the next line.
-   utf8iterator_next(&input_);
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(2, where.line);
-   EXPECT_EQ(1, where.column);
-   EXPECT_EQ(2, where.offset);
- }
- // A tab at the very start of a line moves the following character to
- // column 8 (presumably the default tab stop), while the byte offset only
- // grows by one.
- TEST_F(Utf8Test, TabPositionFreshTabstop) {
-   ResetText("a\n\ttab");
-   // Skip everything up to and including the tab.
-   const char kPrefix[] = "a\n\t";
-   Advance(sizeof(kPrefix) - 1);
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(2, where.line);
-   EXPECT_EQ(8, where.column);
-   EXPECT_EQ(3, where.offset);
- }
- // A tab in the middle of a line also moves the following character to
- // column 8, even though five characters already precede the tab.
- TEST_F(Utf8Test, TabPositionMidTabstop) {
-   ResetText("a tab\tinline");
-   // Skip everything up to and including the tab.
-   const char kPrefix[] = "a tab\t";
-   Advance(sizeof(kPrefix) - 1);
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(8, where.column);
-   EXPECT_EQ(6, where.offset);
- }
- // With options_.tab_stop set to 4, a leading tab moves the following
- // character to column 4.
- TEST_F(Utf8Test, ConfigurableTabstop) {
-   options_.tab_stop = 4;
-   ResetText("a\n\ttab");
-   // Skip everything up to and including the tab.
-   const char kPrefix[] = "a\n\t";
-   Advance(sizeof(kPrefix) - 1);
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(2, where.line);
-   EXPECT_EQ(4, where.column);
-   EXPECT_EQ(3, where.offset);
- }
- // A "\r\n" pair is presented as a single '\n'; the iterator's char pointer
- // refers to the '\n' byte, not to the '\r' before it.
- TEST_F(Utf8Test, CRLF) {
-   ResetText("Windows\r\nlinefeeds");
-   const char kPrefix[] = "Windows";
-   Advance(sizeof(kPrefix) - 1);
-   EXPECT_EQ('\n', utf8iterator_current(&input_));
-   EXPECT_EQ('\n', *utf8iterator_get_char_pointer(&input_));
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   // The carriage return should be ignored in column calculations, treating
-   // the CRLF combination as one character.
-   EXPECT_EQ(8, where.column);
-   // However, it should not be ignored in computing offsets, which are often
-   // used by other tools to index into the original buffer.  We don't expect
-   // other unicode-aware tools to have the same \r\n handling as HTML5.
-   EXPECT_EQ(8, where.offset);
- }
- // A lone '\r' is presented as '\n' and starts a new line for the character
- // that follows it.
- TEST_F(Utf8Test, CarriageReturn) {
-   ResetText("Mac\rlinefeeds");
-   const char kPrefix[] = "Mac";
-   Advance(sizeof(kPrefix) - 1);
-   EXPECT_EQ('\n', utf8iterator_current(&input_));
-   // We don't change the original pointer, which is part of the const input
-   // buffer.  original_text pointers will see a carriage return as
-   // originally written.
-   EXPECT_EQ('\r', *utf8iterator_get_char_pointer(&input_));
-   GumboSourcePosition where;
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(1, where.line);
-   EXPECT_EQ(4, where.column);
-   EXPECT_EQ(3, where.offset);
-   // The character after the carriage return opens line 2.
-   utf8iterator_next(&input_);
-   EXPECT_EQ('l', utf8iterator_current(&input_));
-   EXPECT_EQ('l', *utf8iterator_get_char_pointer(&input_));
-   utf8iterator_get_position(&input_, &where);
-   EXPECT_EQ(2, where.line);
-   EXPECT_EQ(1, where.column);
-   EXPECT_EQ(4, where.offset);
- }
- // A successful match consumes the matched text, leaving the iterator on the
- // first character after it.
- TEST_F(Utf8Test, Matches) {
-   ResetText("\xC2\xA5goobar");
-   // Step over the two-byte character in front of "goo".
-   utf8iterator_next(&input_);
-   const bool kCaseSensitive = true;
-   EXPECT_TRUE(
-       utf8iterator_maybe_consume_match(&input_, "goo", 3, kCaseSensitive));
-   EXPECT_EQ('b', utf8iterator_current(&input_));
- }
- // A pattern longer than the remaining input does not match, and the
- // iterator stays where it was.
- TEST_F(Utf8Test, MatchesOverflow) {
-   ResetText("goo");
-   const bool kCaseSensitive = true;
-   EXPECT_FALSE(
-       utf8iterator_maybe_consume_match(&input_, "goobar", 6, kCaseSensitive));
-   EXPECT_EQ('g', utf8iterator_current(&input_));
- }
- // A match that covers the whole input leaves the iterator at end of input.
- TEST_F(Utf8Test, MatchesEof) {
-   ResetText("goo");
-   const bool kCaseSensitive = true;
-   EXPECT_TRUE(
-       utf8iterator_maybe_consume_match(&input_, "goo", 3, kCaseSensitive));
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // With case sensitivity on, "gooBAR" does not match "goobar" and nothing is
- // consumed.
- TEST_F(Utf8Test, MatchesCaseSensitivity) {
-   ResetText("gooBAR");
-   const bool kCaseSensitive = true;
-   EXPECT_FALSE(
-       utf8iterator_maybe_consume_match(&input_, "goobar", 6, kCaseSensitive));
-   EXPECT_EQ('g', utf8iterator_current(&input_));
- }
- // With case sensitivity off, "gooBAR" matches "goobar" and the whole input
- // is consumed.
- TEST_F(Utf8Test, MatchesCaseInsensitive) {
-   ResetText("gooBAR");
-   const bool kCaseSensitive = false;
-   EXPECT_TRUE(
-       utf8iterator_maybe_consume_match(&input_, "goobar", 6, kCaseSensitive));
-   EXPECT_EQ(-1, utf8iterator_current(&input_));
- }
- // After a case-insensitive match, a NUL byte directly behind the matched
- // text is still read as codepoint 0 and the character after it is reachable.
- TEST_F(Utf8Test, MatchFollowedByNullByte) {
-   // ResetText measures the input with strlen, which would stop at the NUL, so
-   // the iterator is initialized by hand with an explicit length of 7.
-   text_ = "CDATA\0f";
-   utf8iterator_init(&parser_, text_, 7, &input_);
-   const bool kCaseSensitive = false;
-   EXPECT_TRUE(utf8iterator_maybe_consume_match(
-       &input_, "cdata", sizeof("cdata") - 1, kCaseSensitive));
-   EXPECT_EQ(0, utf8iterator_current(&input_));
-   EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_));
-   Advance(1);
-   EXPECT_EQ('f', utf8iterator_current(&input_));
-   EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_));
- }
- TEST_F(Utf8Test, MarkReset) {
- ResetText("this is a test");
- Advance(5);
- EXPECT_EQ('i', utf8iterator_current(&input_));
- utf8iterator_mark(&input_);
- Advance(3);
- EXPECT_EQ('a', utf8iterator_current(&input_));
- GumboError error;
- utf8iterator_fill_error_at_mark(&input_, &error);
- EXPECT_EQ('i', *error.original_text);
- EXPECT_EQ(1, error.position.line);
- EXPECT_EQ(6, error.position.column);
- EXPECT_EQ(5, error.position.offset);
- utf8iterator_reset(&input_);
- EXPECT_EQ('i', utf8iterator_current(&input_));
- EXPECT_EQ('i', *utf8iterator_get_char_pointer(&input_));
- GumboSourcePosition position;
- utf8iterator_get_position(&input_, &position);
- EXPECT_EQ(1, error.position.line);
- EXPECT_EQ(6, error.position.column);
- EXPECT_EQ(5, error.position.offset);
- }
- } // namespace
|