utf8.cc 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. // Copyright 2011 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Author: jdtang@google.com (Jonathan Tang)
  16. #include "utf8.h"
  17. #include <string.h>
  18. #include "gtest/gtest.h"
  19. #include "error.h"
  20. #include "gumbo.h"
  21. #include "parser.h"
  22. #include "test_utils.h"
  23. namespace {
  24. // Tests for utf8.c
  25. class Utf8Test : public GumboTest {
  26. protected:
  27. void Advance(int num_chars) {
  28. for (int i = 0; i < num_chars; ++i) {
  29. utf8iterator_next(&input_);
  30. }
  31. }
  32. void ResetText(const char* text) {
  33. text_ = text;
  34. utf8iterator_init(&parser_, text, strlen(text), &input_);
  35. }
  36. GumboError* GetFirstError() {
  37. return static_cast<GumboError*>(parser_._output->errors.data[0]);
  38. }
  39. int GetNumErrors() { return parser_._output->errors.length; }
  40. Utf8Iterator input_;
  41. };
  42. TEST_F(Utf8Test, EmptyString) {
  43. ResetText("");
  44. EXPECT_EQ(-1, utf8iterator_current(&input_));
  45. }
  46. TEST_F(Utf8Test, GetPosition_EmptyString) {
  47. ResetText("");
  48. GumboSourcePosition pos;
  49. utf8iterator_get_position(&input_, &pos);
  50. EXPECT_EQ(1, pos.line);
  51. EXPECT_EQ(1, pos.column);
  52. EXPECT_EQ(0, pos.offset);
  53. }
  54. TEST_F(Utf8Test, Null) {
  55. // Can't use ResetText, as the implicit strlen will choke on the null.
  56. text_ = "\0f";
  57. utf8iterator_init(&parser_, text_, 2, &input_);
  58. EXPECT_EQ(0, utf8iterator_current(&input_));
  59. EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_));
  60. utf8iterator_next(&input_);
  61. EXPECT_EQ('f', utf8iterator_current(&input_));
  62. EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_));
  63. }
  64. TEST_F(Utf8Test, OneByteChar) {
  65. ResetText("a");
  66. EXPECT_EQ(0, GetNumErrors());
  67. EXPECT_EQ('a', utf8iterator_current(&input_));
  68. EXPECT_EQ('a', *utf8iterator_get_char_pointer(&input_));
  69. GumboSourcePosition pos;
  70. utf8iterator_get_position(&input_, &pos);
  71. EXPECT_EQ(1, pos.line);
  72. EXPECT_EQ(1, pos.column);
  73. EXPECT_EQ(0, pos.offset);
  74. utf8iterator_next(&input_);
  75. EXPECT_EQ(-1, utf8iterator_current(&input_));
  76. }
  77. TEST_F(Utf8Test, ContinuationByte) {
  78. ResetText("\x85");
  79. EXPECT_EQ(1, GetNumErrors());
  80. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  81. EXPECT_EQ('\x85', *utf8iterator_get_char_pointer(&input_));
  82. errors_are_expected_ = true;
  83. GumboError* error = GetFirstError();
  84. EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, error->type);
  85. EXPECT_EQ('\x85', *error->original_text);
  86. EXPECT_EQ(0x85, error->v.codepoint);
  87. utf8iterator_next(&input_);
  88. EXPECT_EQ(-1, utf8iterator_current(&input_));
  89. }
  90. TEST_F(Utf8Test, MultipleContinuationBytes) {
  91. ResetText("a\x85\xA0\xC2x\x9A");
  92. EXPECT_EQ('a', utf8iterator_current(&input_));
  93. utf8iterator_next(&input_);
  94. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  95. utf8iterator_next(&input_);
  96. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  97. utf8iterator_next(&input_);
  98. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  99. utf8iterator_next(&input_);
  100. EXPECT_EQ('x', utf8iterator_current(&input_));
  101. utf8iterator_next(&input_);
  102. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  103. utf8iterator_next(&input_);
  104. EXPECT_EQ(-1, utf8iterator_current(&input_));
  105. utf8iterator_next(&input_);
  106. EXPECT_EQ(4, GetNumErrors());
  107. }
  108. TEST_F(Utf8Test, OverlongEncoding) {
  109. // \xC0\x75 = 11000000 01110101.
  110. ResetText("\xC0\x75");
  111. ASSERT_EQ(1, GetNumErrors());
  112. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  113. EXPECT_EQ('\xC0', *utf8iterator_get_char_pointer(&input_));
  114. errors_are_expected_ = true;
  115. GumboError* error = GetFirstError();
  116. EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, error->type);
  117. EXPECT_EQ(1, error->position.line);
  118. EXPECT_EQ(1, error->position.column);
  119. EXPECT_EQ(0, error->position.offset);
  120. EXPECT_EQ('\xC0', *error->original_text);
  121. EXPECT_EQ(0xC0, error->v.codepoint);
  122. utf8iterator_next(&input_);
  123. EXPECT_EQ(0x75, utf8iterator_current(&input_));
  124. EXPECT_EQ('\x75', *utf8iterator_get_char_pointer(&input_));
  125. utf8iterator_next(&input_);
  126. EXPECT_EQ(-1, utf8iterator_current(&input_));
  127. }
  128. TEST_F(Utf8Test, OverlongEncodingWithContinuationByte) {
  129. // \xC0\x85 = 11000000 10000101.
  130. ResetText("\xC0\x85");
  131. ASSERT_EQ(1, GetNumErrors());
  132. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  133. EXPECT_EQ('\xC0', *utf8iterator_get_char_pointer(&input_));
  134. utf8iterator_next(&input_);
  135. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  136. errors_are_expected_ = true;
  137. GumboError* error = GetFirstError();
  138. EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, error->type);
  139. EXPECT_EQ(1, error->position.line);
  140. EXPECT_EQ(1, error->position.column);
  141. EXPECT_EQ(0, error->position.offset);
  142. EXPECT_EQ('\xC0', *error->original_text);
  143. EXPECT_EQ(0xC0, error->v.codepoint);
  144. utf8iterator_next(&input_);
  145. EXPECT_EQ(-1, utf8iterator_current(&input_));
  146. }
  147. TEST_F(Utf8Test, TwoByteChar) {
  148. // \xC3\xA5 = 11000011 10100101.
  149. ResetText("\xC3\xA5o");
  150. EXPECT_EQ(0, GetNumErrors());
  151. // Codepoint = 000 11100101 = 0xE5.
  152. EXPECT_EQ(0xE5, utf8iterator_current(&input_));
  153. EXPECT_EQ('\xC3', *utf8iterator_get_char_pointer(&input_));
  154. GumboSourcePosition pos;
  155. utf8iterator_get_position(&input_, &pos);
  156. EXPECT_EQ(1, pos.line);
  157. EXPECT_EQ(1, pos.column);
  158. EXPECT_EQ(0, pos.offset);
  159. utf8iterator_next(&input_);
  160. EXPECT_EQ('o', utf8iterator_current(&input_));
  161. utf8iterator_get_position(&input_, &pos);
  162. EXPECT_EQ(1, pos.line);
  163. EXPECT_EQ(2, pos.column);
  164. EXPECT_EQ(2, pos.offset);
  165. }
  166. TEST_F(Utf8Test, TwoByteChar2) {
  167. // \xC2\xA5 = 11000010 10100101.
  168. ResetText("\xC2\xA5");
  169. EXPECT_EQ(0, GetNumErrors());
  170. // Codepoint = 000 10100101 = 0xA5.
  171. EXPECT_EQ(0xA5, utf8iterator_current(&input_));
  172. EXPECT_EQ('\xC2', *utf8iterator_get_char_pointer(&input_));
  173. utf8iterator_next(&input_);
  174. EXPECT_EQ(-1, utf8iterator_current(&input_));
  175. }
  176. TEST_F(Utf8Test, ThreeByteChar) {
  177. // \xE3\xA7\xA7 = 11100011 10100111 10100111
  178. ResetText("\xE3\xA7\xA7\xB0");
  179. EXPECT_EQ(0, GetNumErrors());
  180. // Codepoint = 00111001 11100111 = 0x39E7
  181. EXPECT_EQ(0x39E7, utf8iterator_current(&input_));
  182. EXPECT_EQ('\xE3', *utf8iterator_get_char_pointer(&input_));
  183. utf8iterator_next(&input_);
  184. EXPECT_EQ(1, GetNumErrors());
  185. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  186. EXPECT_EQ('\xB0', *utf8iterator_get_char_pointer(&input_));
  187. GumboSourcePosition pos;
  188. utf8iterator_get_position(&input_, &pos);
  189. EXPECT_EQ(1, pos.line);
  190. EXPECT_EQ(2, pos.column);
  191. EXPECT_EQ(3, pos.offset);
  192. utf8iterator_next(&input_);
  193. EXPECT_EQ(-1, utf8iterator_current(&input_));
  194. }
  195. TEST_F(Utf8Test, FourByteChar) {
  196. // \xC3\x9A = 11000011 10011010
  197. // \xF1\xA7\xA7\xA7 = 11110001 10100111 10100111 10100111
  198. ResetText("\xC3\x9A\xF1\xA7\xA7\xA7");
  199. // Codepoint = 000 11011010 = 0xDA.
  200. EXPECT_EQ(0xDA, utf8iterator_current(&input_));
  201. EXPECT_EQ('\xC3', *utf8iterator_get_char_pointer(&input_));
  202. utf8iterator_next(&input_);
  203. // Codepoint = 00110 01111001 11100111 = 0x679E7.
  204. EXPECT_EQ(0x679E7, utf8iterator_current(&input_));
  205. EXPECT_EQ('\xF1', *utf8iterator_get_char_pointer(&input_));
  206. utf8iterator_next(&input_);
  207. EXPECT_EQ(-1, utf8iterator_current(&input_));
  208. }
  209. TEST_F(Utf8Test, FourByteCharWithoutContinuationChars) {
  210. // \xF1\xA7\xA7\xA7 = 11110001 10100111 10100111 10100111
  211. ResetText("\xF1\xA7\xA7-");
  212. EXPECT_EQ(1, GetNumErrors());
  213. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  214. EXPECT_EQ('\xF1', *utf8iterator_get_char_pointer(&input_));
  215. utf8iterator_next(&input_);
  216. EXPECT_EQ('-', utf8iterator_current(&input_));
  217. utf8iterator_next(&input_);
  218. EXPECT_EQ(-1, utf8iterator_current(&input_));
  219. }
  220. TEST_F(Utf8Test, FiveByteCharIsError) {
  221. ResetText("\xF6\xA7\xA7\xA7\xA7x");
  222. EXPECT_EQ(1, GetNumErrors());
  223. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  224. utf8iterator_next(&input_);
  225. utf8iterator_next(&input_);
  226. utf8iterator_next(&input_);
  227. utf8iterator_next(&input_);
  228. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  229. utf8iterator_next(&input_);
  230. EXPECT_EQ('x', utf8iterator_current(&input_));
  231. }
  232. TEST_F(Utf8Test, SixByteCharIsError) {
  233. ResetText("\xF8\xA7\xA7\xA7\xA7\xA7x");
  234. EXPECT_EQ(1, GetNumErrors());
  235. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  236. utf8iterator_next(&input_);
  237. utf8iterator_next(&input_);
  238. utf8iterator_next(&input_);
  239. utf8iterator_next(&input_);
  240. utf8iterator_next(&input_);
  241. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  242. utf8iterator_next(&input_);
  243. EXPECT_EQ('x', utf8iterator_current(&input_));
  244. }
  245. TEST_F(Utf8Test, SevenByteCharIsError) {
  246. ResetText("\xFC\xA7\xA7\xA7\xA7\xA7\xA7x");
  247. EXPECT_EQ(1, GetNumErrors());
  248. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  249. utf8iterator_next(&input_);
  250. utf8iterator_next(&input_);
  251. utf8iterator_next(&input_);
  252. utf8iterator_next(&input_);
  253. utf8iterator_next(&input_);
  254. utf8iterator_next(&input_);
  255. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  256. utf8iterator_next(&input_);
  257. EXPECT_EQ('x', utf8iterator_current(&input_));
  258. }
  259. TEST_F(Utf8Test, 0xFFIsError) {
  260. ResetText("\xFFx");
  261. EXPECT_EQ(1, GetNumErrors());
  262. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  263. utf8iterator_next(&input_);
  264. EXPECT_EQ('x', utf8iterator_current(&input_));
  265. }
  266. TEST_F(Utf8Test, InvalidControlCharIsError) {
  267. ResetText("\x1Bx");
  268. EXPECT_EQ(1, GetNumErrors());
  269. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  270. utf8iterator_next(&input_);
  271. EXPECT_EQ('x', utf8iterator_current(&input_));
  272. }
  273. TEST_F(Utf8Test, TruncatedInput) {
  274. ResetText("\xF1\xA7");
  275. EXPECT_EQ(1, GetNumErrors());
  276. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  277. errors_are_expected_ = true;
  278. GumboError* error = GetFirstError();
  279. EXPECT_EQ(GUMBO_ERR_UTF8_TRUNCATED, error->type);
  280. EXPECT_EQ(1, error->position.line);
  281. EXPECT_EQ(1, error->position.column);
  282. EXPECT_EQ(0, error->position.offset);
  283. EXPECT_EQ('\xF1', *error->original_text);
  284. EXPECT_EQ(0xF1A7, error->v.codepoint);
  285. utf8iterator_next(&input_);
  286. EXPECT_EQ(-1, utf8iterator_current(&input_));
  287. }
  288. TEST_F(Utf8Test, Html5SpecExample) {
  289. // This example has since been removed from the spec, and the spec has been
  290. // changed to reference the Unicode Standard 6.2, 5.22 "Best practices for
  291. // U+FFFD substitution."
  292. ResetText("\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98");
  293. EXPECT_EQ('A', utf8iterator_current(&input_));
  294. utf8iterator_next(&input_);
  295. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  296. utf8iterator_next(&input_);
  297. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  298. utf8iterator_next(&input_);
  299. EXPECT_EQ('B', utf8iterator_current(&input_));
  300. utf8iterator_next(&input_);
  301. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  302. utf8iterator_next(&input_);
  303. EXPECT_EQ('C', utf8iterator_current(&input_));
  304. utf8iterator_next(&input_);
  305. // \xE2\x98\xBA = 11100010 10011000 10111010
  306. // Codepoint = 00100110 00111010 = 0x263A
  307. EXPECT_EQ(0x263A, utf8iterator_current(&input_));
  308. utf8iterator_next(&input_);
  309. EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
  310. utf8iterator_next(&input_);
  311. }
  312. TEST_F(Utf8Test, MultipleEOFReads) {
  313. ResetText("a");
  314. Advance(2);
  315. EXPECT_EQ(-1, utf8iterator_current(&input_));
  316. utf8iterator_next(&input_);
  317. EXPECT_EQ(-1, utf8iterator_current(&input_));
  318. }
  319. TEST_F(Utf8Test, AsciiOnly) {
  320. ResetText("hello");
  321. Advance(4);
  322. EXPECT_EQ('o', utf8iterator_current(&input_));
  323. EXPECT_EQ('o', *utf8iterator_get_char_pointer(&input_));
  324. GumboSourcePosition pos;
  325. utf8iterator_get_position(&input_, &pos);
  326. EXPECT_EQ(1, pos.line);
  327. EXPECT_EQ(5, pos.column);
  328. EXPECT_EQ(4, pos.offset);
  329. Advance(1);
  330. EXPECT_EQ(-1, utf8iterator_current(&input_));
  331. }
  332. TEST_F(Utf8Test, NewlinePosition) {
  333. ResetText("a\nnewline");
  334. Advance(1);
  335. // Newline itself should register as being at the end of a line.
  336. GumboSourcePosition pos;
  337. utf8iterator_get_position(&input_, &pos);
  338. EXPECT_EQ(1, pos.line);
  339. EXPECT_EQ(2, pos.column);
  340. EXPECT_EQ(1, pos.offset);
  341. // The next character should be at the next line.
  342. Advance(1);
  343. utf8iterator_get_position(&input_, &pos);
  344. EXPECT_EQ(2, pos.line);
  345. EXPECT_EQ(1, pos.column);
  346. EXPECT_EQ(2, pos.offset);
  347. }
  348. TEST_F(Utf8Test, TabPositionFreshTabstop) {
  349. ResetText("a\n\ttab");
  350. Advance(sizeof("a\n\t") - 1);
  351. GumboSourcePosition pos;
  352. utf8iterator_get_position(&input_, &pos);
  353. EXPECT_EQ(2, pos.line);
  354. EXPECT_EQ(8, pos.column);
  355. EXPECT_EQ(3, pos.offset);
  356. }
  357. TEST_F(Utf8Test, TabPositionMidTabstop) {
  358. ResetText("a tab\tinline");
  359. Advance(sizeof("a tab\t") - 1);
  360. GumboSourcePosition pos;
  361. utf8iterator_get_position(&input_, &pos);
  362. EXPECT_EQ(1, pos.line);
  363. EXPECT_EQ(8, pos.column);
  364. EXPECT_EQ(6, pos.offset);
  365. }
  366. TEST_F(Utf8Test, ConfigurableTabstop) {
  367. options_.tab_stop = 4;
  368. ResetText("a\n\ttab");
  369. Advance(sizeof("a\n\t") - 1);
  370. GumboSourcePosition pos;
  371. utf8iterator_get_position(&input_, &pos);
  372. EXPECT_EQ(2, pos.line);
  373. EXPECT_EQ(4, pos.column);
  374. EXPECT_EQ(3, pos.offset);
  375. }
  376. TEST_F(Utf8Test, CRLF) {
  377. ResetText("Windows\r\nlinefeeds");
  378. Advance(sizeof("Windows") - 1);
  379. EXPECT_EQ('\n', utf8iterator_current(&input_));
  380. EXPECT_EQ('\n', *utf8iterator_get_char_pointer(&input_));
  381. GumboSourcePosition pos;
  382. utf8iterator_get_position(&input_, &pos);
  383. EXPECT_EQ(1, pos.line);
  384. // The carriage return should be ignore in column calculations, treating the
  385. // CRLF combination as one character.
  386. EXPECT_EQ(8, pos.column);
  387. // However, it should not be ignored in computing offsets, which are often
  388. // used by other tools to index into the original buffer. We don't expect
  389. // other unicode-aware tools to have the same \r\n handling as HTML5.
  390. EXPECT_EQ(8, pos.offset);
  391. }
  392. TEST_F(Utf8Test, CarriageReturn) {
  393. ResetText("Mac\rlinefeeds");
  394. Advance(sizeof("Mac") - 1);
  395. EXPECT_EQ('\n', utf8iterator_current(&input_));
  396. // We don't change the original pointer, which is part of the const input
  397. // buffer. original_text pointers will see a carriage return as original
  398. // written.
  399. EXPECT_EQ('\r', *utf8iterator_get_char_pointer(&input_));
  400. GumboSourcePosition pos;
  401. utf8iterator_get_position(&input_, &pos);
  402. EXPECT_EQ(1, pos.line);
  403. EXPECT_EQ(4, pos.column);
  404. EXPECT_EQ(3, pos.offset);
  405. Advance(1);
  406. EXPECT_EQ('l', utf8iterator_current(&input_));
  407. EXPECT_EQ('l', *utf8iterator_get_char_pointer(&input_));
  408. utf8iterator_get_position(&input_, &pos);
  409. EXPECT_EQ(2, pos.line);
  410. EXPECT_EQ(1, pos.column);
  411. EXPECT_EQ(4, pos.offset);
  412. }
  413. TEST_F(Utf8Test, Matches) {
  414. ResetText("\xC2\xA5goobar");
  415. Advance(1);
  416. EXPECT_TRUE(utf8iterator_maybe_consume_match(&input_, "goo", 3, true));
  417. EXPECT_EQ('b', utf8iterator_current(&input_));
  418. }
  419. TEST_F(Utf8Test, MatchesOverflow) {
  420. ResetText("goo");
  421. EXPECT_FALSE(utf8iterator_maybe_consume_match(&input_, "goobar", 6, true));
  422. EXPECT_EQ('g', utf8iterator_current(&input_));
  423. }
  424. TEST_F(Utf8Test, MatchesEof) {
  425. ResetText("goo");
  426. EXPECT_TRUE(utf8iterator_maybe_consume_match(&input_, "goo", 3, true));
  427. EXPECT_EQ(-1, utf8iterator_current(&input_));
  428. }
  429. TEST_F(Utf8Test, MatchesCaseSensitivity) {
  430. ResetText("gooBAR");
  431. EXPECT_FALSE(utf8iterator_maybe_consume_match(&input_, "goobar", 6, true));
  432. EXPECT_EQ('g', utf8iterator_current(&input_));
  433. }
  434. TEST_F(Utf8Test, MatchesCaseInsensitive) {
  435. ResetText("gooBAR");
  436. EXPECT_TRUE(utf8iterator_maybe_consume_match(&input_, "goobar", 6, false));
  437. EXPECT_EQ(-1, utf8iterator_current(&input_));
  438. }
  439. TEST_F(Utf8Test, MatchFollowedByNullByte) {
  440. // Can't use ResetText, as the implicit strlen will choke on the null.
  441. text_ = "CDATA\0f";
  442. utf8iterator_init(&parser_, text_, 7, &input_);
  443. EXPECT_TRUE(utf8iterator_maybe_consume_match(
  444. &input_, "cdata", sizeof("cdata") - 1, false));
  445. EXPECT_EQ(0, utf8iterator_current(&input_));
  446. EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_));
  447. utf8iterator_next(&input_);
  448. EXPECT_EQ('f', utf8iterator_current(&input_));
  449. EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_));
  450. }
  451. TEST_F(Utf8Test, MarkReset) {
  452. ResetText("this is a test");
  453. Advance(5);
  454. EXPECT_EQ('i', utf8iterator_current(&input_));
  455. utf8iterator_mark(&input_);
  456. Advance(3);
  457. EXPECT_EQ('a', utf8iterator_current(&input_));
  458. GumboError error;
  459. utf8iterator_fill_error_at_mark(&input_, &error);
  460. EXPECT_EQ('i', *error.original_text);
  461. EXPECT_EQ(1, error.position.line);
  462. EXPECT_EQ(6, error.position.column);
  463. EXPECT_EQ(5, error.position.offset);
  464. utf8iterator_reset(&input_);
  465. EXPECT_EQ('i', utf8iterator_current(&input_));
  466. EXPECT_EQ('i', *utf8iterator_get_char_pointer(&input_));
  467. GumboSourcePosition position;
  468. utf8iterator_get_position(&input_, &position);
  469. EXPECT_EQ(1, error.position.line);
  470. EXPECT_EQ(6, error.position.column);
  471. EXPECT_EQ(5, error.position.offset);
  472. }
  473. } // namespace