validator_test.cc 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  11. #include "validator.h"
  12. #include "gmock/gmock.h" // for testing::ElementsAreArray
  13. #include "include_gunit.h"
  14. namespace tesseract {
  15. class TestableValidator : public Validator {
  16. public:
  17. static ViramaScript TestableMostFrequentViramaScript(const std::vector<char32> &utf32) {
  18. return MostFrequentViramaScript(utf32);
  19. }
  20. };
  21. // The majority of Validator is tested by the script-specific tests of its
  22. // subclasses, but the MostFrequentViramaScript function is worth a unittest.
  23. TEST(ValidatorTest, MostFrequentViramaScript) {
  24. // The most frequent virama script should come out correct, despite
  25. // distractions from other scripts.
  26. EXPECT_EQ(ViramaScript::kTelugu, TestableValidator::TestableMostFrequentViramaScript({0xc05}));
  27. // It is still Telugu surrounded by Latin.
  28. EXPECT_EQ(ViramaScript::kTelugu,
  29. TestableValidator::TestableMostFrequentViramaScript({'a', 0xc05, 'b', 'c'}));
  30. // But not still Telugu surrounded by Devanagari.
  31. EXPECT_EQ(ViramaScript::kDevanagari,
  32. TestableValidator::TestableMostFrequentViramaScript({0x905, 0xc05, 0x906, 0x907}));
  33. EXPECT_EQ(ViramaScript::kKannada,
  34. TestableValidator::TestableMostFrequentViramaScript({0xc85, 0xc05, 0xc86, 0xc87}));
  35. EXPECT_EQ(ViramaScript::kBengali,
  36. TestableValidator::TestableMostFrequentViramaScript({0x985, 0xc05, 0x986, 0x987}));
  37. // Danda and double Danda don't count as Devanagari, as they are common.
  38. EXPECT_EQ(ViramaScript::kTelugu,
  39. TestableValidator::TestableMostFrequentViramaScript({0x964, 0xc05, 0x965, 0x965}));
  40. }
  41. // ValidateCleanAndSegment doesn't modify the input by much, but its
  42. // transformation should be idempotent. (Doesn't change again if re-applied.)
  43. TEST(ValidatorTest, Idempotency) {
  44. std::vector<char32> str1({0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c});
  45. std::vector<char32> str2({0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''});
  46. std::vector<std::vector<char32>> result1, result2, result3, result4;
  47. EXPECT_TRUE(
  48. Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str1, &result1));
  49. EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result1[0],
  50. &result2));
  51. EXPECT_EQ(result1.size(), result2.size());
  52. EXPECT_THAT(result2[0], testing::ElementsAreArray(result1[0]));
  53. EXPECT_TRUE(
  54. Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str2, &result3));
  55. EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result3[0],
  56. &result4));
  57. EXPECT_EQ(result3.size(), result4.size());
  58. EXPECT_THAT(result4[0], testing::ElementsAreArray(result3[0]));
  59. }
  60. } // namespace tesseract