// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include_gunit.h"
#include "log.h" // for LOG
#include "matrix.h"
#include "normstrngs.h"
#include "pageres.h"
#include "ratngs.h"
#include "recodebeam.h"
#include "unicharcompress.h"
#include "unicharset_training_utils.h"
#include "helpers.h"

namespace tesseract {

// Number of characters to test beam search with.
const int kNumChars = 100;
// Amount of extra random data to pad with after.
const int kPadding = 64;

// Dictionary test data.
// The top choice is: "Gef s wordsright.".
// The desired phrase is "Gets words right.".
// There is a competing dictionary phrase: "Get swords right.".
// ... due to the following errors from the network:
// f stronger than t in "Get".
// weak space between Gef and s and between s and words.
// weak space between words and right.
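// In the score arrays below these show up as: "f" at 0.87 vs "t" at 0.12 in
// position 2, weak spaces at positions 3 and 5 (0.55 and 0.65, with the
// no-space 2nd choice at 0.42 and 0.25), and an almost-missing space at
// position 11 ("" at 0.95 vs " " at 0.05).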
const char *kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d",
                          "s", "", "r", "i", "g", "h", "t", ".", nullptr};
const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, 0.89, 0.99, 0.99,
                               0.99, 0.99, 0.95, 0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
const char *kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h",
                          "S", " ", "t", "I", "9", "b", "f", ",", nullptr};
const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, 0.10, 0.01, 0.01,
                               0.01, 0.01, 0.05, 0.01, 0.09, 0.09, 0.09, 0.05, 0.25};
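
// kZH* drives the DISABLED_ChiDictionary test (word segmentation with and
// without a dictionary); kVi* drives DISABLED_MultiCodeSequences, which uses
// decomposed Vietnamese text to exercise multi-code label sequences.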
const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
const char *kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};

const char *kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
const char *kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};

class RecodeBeamTest : public ::testing::Test {
protected:
  void SetUp() override {
    std::locale::global(std::locale(""));
    file::MakeTmpdir();
  }

  RecodeBeamTest() : lstm_dict_(&ccutil_) {}
  ~RecodeBeamTest() override {
    lstm_dict_.End();
  }

  // Loads and compresses the given unicharset.
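  // It also picks the unichar id that acts as the null (blank) label
  // (UNICHAR_BROKEN if the set has special codes, otherwise one past the last
  // id), records its encoded form in encoded_null_char_, and writes the full
  // encoding to the test tmpdir for inspection.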
  void LoadUnicharset(const std::string &unicharset_name) {
    std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
    std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
    std::string radical_data;
    CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
    CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
    unichar_null_char_ =
        ccutil_.unicharset.has_special_codes() ? UNICHAR_BROKEN : ccutil_.unicharset.size();
    std::string radical_str(radical_data.c_str());
    EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));
    RecodedCharID code;
    recoder_.EncodeUnichar(unichar_null_char_, &code);
    encoded_null_char_ = code(0);
    // Space should encode as itself.
    recoder_.EncodeUnichar(UNICHAR_SPACE, &code);
    EXPECT_EQ(UNICHAR_SPACE, code(0));
    std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
    std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
    std::string encoding_str(&encoding[0], encoding.size());
    CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
    LOG(INFO) << "Wrote encoding to: " << output_name << "\n";
  }

  // Loads the dictionary.
  void LoadDict(const std::string &lang) {
    std::string traineddata_name = lang + ".traineddata";
    std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
    lstm_dict_.SetupForLoad(nullptr);
    tesseract::TessdataManager mgr;
    mgr.Init(traineddata_file.c_str());
    lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
    lstm_dict_.FinishLoad();
  }

  // Expects the appropriate results from the compressed ccutil_.unicharset.
  void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output,
                     const std::vector<int> &transcription) {
    // Get the utf8 string of the transcription.
    std::string truth_utf8;
    for (int i : transcription) {
      truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
    }
    PointerVector<WERD_RES> words;
    ExpectCorrect(output, truth_utf8, nullptr, &words);
  }
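
  // Decodes output with RecodeBeamSearch and checks that all three extraction
  // paths (raw labels re-decoded through recoder_, ExtractBestPathAsUnicharIds
  // and ExtractBestPathAsWords) reproduce truth_utf8. The numeric arguments to
  // Decode below are the beam-search tuning constants (dictionary ratio and
  // certainty thresholds; see recodebeam.h for the exact parameter meanings).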
  void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output, const std::string &truth_utf8,
                     Dict *dict, PointerVector<WERD_RES> *words) {
    RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
    beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
    // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
    // beam_search.DebugBeams(ccutil_.unicharset);
    std::vector<int> labels, xcoords;
    beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
    LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
    // Now decode using recoder_.
    std::string decoded;
    int end = 1;
    for (unsigned start = 0; start < labels.size(); start = end) {
      RecodedCharID code;
      unsigned index = start;
      int uni_id = INVALID_UNICHAR_ID;
      do {
        code.Set(code.length(), labels[index++]);
        uni_id = recoder_.DecodeUnichar(code);
      } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
               (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
      EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
      // To the extent of truth_utf8, we expect decoded to match, but if
      // transcription is shorter, that is OK too, as we may just be testing
      // that we get a valid sequence when padded with random data.
      if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
        decoded += ccutil_.unicharset.id_to_unichar(uni_id);
      }
      end = index;
    }
    EXPECT_EQ(truth_utf8, decoded);
    // Check that ExtractBestPathAsUnicharIds does the same thing.
    std::vector<int> unichar_ids;
    std::vector<float> certainties, ratings;
    beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
                                            &ratings, &xcoords);
    std::string u_decoded;
    float total_rating = 0.0f;
    for (unsigned u = 0; u < unichar_ids.size(); ++u) {
      // To the extent of truth_utf8, we expect decoded to match, but if
      // transcription is shorter, that is OK too, as we may just be testing
      // that we get a valid sequence when padded with random data.
      if (u_decoded.size() < truth_utf8.size()) {
        const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
        total_rating += ratings[u];
        LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
                  << certainties[u] << ", r=" << ratings[u] << " r_sum="
                  << total_rating << " @" << xcoords[u] << "\n";
        if (str[0] == ' ') {
          total_rating = 0.0f;
        }
        u_decoded += str;
      }
    }
    EXPECT_EQ(truth_utf8, u_decoded);
    // Check that ExtractBestPathAsWords does the same thing.
    TBOX line_box(0, 0, 100, 10);
    for (int i = 0; i < 2; ++i) {
      beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
      std::string w_decoded;
      for (int w = 0; w < words->size(); ++w) {
        const WERD_RES *word = (*words)[w];
        if (w_decoded.size() < truth_utf8.size()) {
          if (!w_decoded.empty() && word->word->space()) {
            w_decoded += " ";
          }
          w_decoded += word->best_choice->unichar_string().c_str();
        }
        LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
                  << ", c=" << word->best_choice->certainty()
                  << ", r=" << word->best_choice->rating()
                  << ", perm=" << word->best_choice->permuter() << "\n";
      }
      std::string w_trunc(w_decoded.data(), truth_utf8.size());
      if (truth_utf8 != w_trunc) {
        tesseract::NormalizeUTF8String(
            tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize,
            tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
        w_trunc.assign(w_decoded.data(), truth_utf8.size());
      }
      EXPECT_EQ(truth_utf8, w_trunc);
    }
  }

  // Generates easy encoding of the given unichar_ids, and pads with at least
  // padding of random data.
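  // For illustration: a unichar whose recoding is the code sequence [c0, c1]
  // contributes the timesteps (c0=1, c1=1, null=1) on top of the low-level
  // random noise (plus an extra null between equal adjacent codes), so after
  // normalization the intended path is the clear winner at every timestep.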
  GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(const std::vector<int> &unichar_ids,
                                                      int padding) {
    int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
    int num_codes = recoder_.code_range();
    GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
    // Fill with random data.
    TRand random;
    for (int t = 0; t < width; ++t) {
      for (int i = 0; i < num_codes; ++i) {
        outputs(t, i) = random.UnsignedRand(0.25);
      }
    }
    int t = 0;
    for (int unichar_id : unichar_ids) {
      RecodedCharID code;
      int len = recoder_.EncodeUnichar(unichar_id, &code);
      EXPECT_NE(0, len);
      for (int j = 0; j < len; ++j) {
        // Make the desired answer a clear winner.
        if (j > 0 && code(j) == code(j - 1)) {
          // We will collapse adjacent equal codes, so put a null in between.
          outputs(t++, encoded_null_char_) = 1.0f;
        }
        outputs(t++, code(j)) = 1.0f;
      }
      // Put a null char in between unichars.
      outputs(t++, encoded_null_char_) = 1.0f;
    }
    // Normalize the probs.
    for (int t = 0; t < width; ++t) {
      double sum = 0.0;
      for (int i = 0; i < num_codes; ++i) {
        sum += outputs(t, i);
      }
      for (int i = 0; i < num_codes; ++i) {
        outputs(t, i) /= sum;
      }
    }
    return outputs;
  }

  // Encodes a utf8 string (character) as unichar_id, then recodes, and sets
  // the score for the appropriate sequence of codes, returning the ending t.
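  // When random is non-null it also emits a random number of duplicate scores
  // and interleaved nulls, mimicking the repeated frames and blanks a real CTC
  // network produces inside a multi-code sequence (this is what
  // DISABLED_MultiCodeSequences relies on).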
  int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random,
                 GENERIC_2D_ARRAY<float> *outputs) {
    int t = start_t;
    std::vector<int> unichar_ids;
    EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
    if (unichar_ids.empty() || utf8_str[0] == '\0') {
      unichar_ids.clear();
      unichar_ids.push_back(unichar_null_char_);
    }
    int num_ids = unichar_ids.size();
    for (int u = 0; u < num_ids; ++u) {
      RecodedCharID code;
      int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
      EXPECT_NE(0, len);
      for (int i = 0; i < len; ++i) {
        // Apply the desired score.
        (*outputs)(t++, code(i)) = score;
        if (random != nullptr &&
            t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
          int dups = static_cast<int>(random->UnsignedRand(3.0));
          for (int d = 0; d < dups; ++d) {
            // Duplicate the desired score.
            (*outputs)(t++, code(i)) = score;
          }
        }
      }
      if (random != nullptr &&
          t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
        int dups = static_cast<int>(random->UnsignedRand(3.0));
        for (int d = 0; d < dups; ++d) {
          // Add a random number of nulls as well.
          (*outputs)(t++, encoded_null_char_) = score;
        }
      }
    }
    return t;
  }

  // Generates an encoding of the given 4 arrays as synthetic network scores.
  // Uses scores1 for chars1 and scores2 for chars2, and everything else gets
  // the leftovers shared out equally. Note that the empty string encodes as
  // the null_char_.
  GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],
                                                   const char *chars2[], const float scores2[],
                                                   TRand *random) {
    int width = 0;
    while (chars1[width] != nullptr) {
      ++width;
    }
    int padding = width * RecodedCharID::kMaxCodeLen;
    int num_codes = recoder_.code_range();
    GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
    int t = 0;
    for (int i = 0; i < width; ++i) {
      // In case there is overlap in the codes between 1st and 2nd choice, it
      // is better to encode the 2nd choice first.
      int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
      int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
      // Advance t to the max end, setting everything else to the leftovers.
      int max_t = std::max(end_t1, end_t2);
      while (t < max_t) {
        double total_score = 0.0;
        for (int j = 0; j < num_codes; ++j) {
          total_score += outputs(t, j);
        }
        double null_remainder = (1.0 - total_score) / 2.0;
        double remainder = null_remainder / (num_codes - 2);
        if (outputs(t, encoded_null_char_) < null_remainder) {
          outputs(t, encoded_null_char_) += null_remainder;
        } else {
          remainder += remainder;
        }
        for (int j = 0; j < num_codes; ++j) {
          if (outputs(t, j) == 0.0f) {
            outputs(t, j) = remainder;
          }
        }
        ++t;
      }
    }
    // Fill the rest with null chars.
    while (t < width + padding) {
      outputs(t++, encoded_null_char_) = 1.0f;
    }
    return outputs;
  }
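
  // Fixture state: recoder_ maps unichar ids to and from the compressed code
  // sequences used by the beam search; unichar_null_char_ and
  // encoded_null_char_ hold the null (blank) label in unichar and code space;
  // ccutil_ owns the unicharset; lstm_dict_ is the optional dictionary used by
  // the dictionary tests.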
  UnicharCompress recoder_;
  int unichar_null_char_ = 0;
  int encoded_null_char_ = 0;
  CCUtil ccutil_;
  Dict lstm_dict_;
};
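
// The Does* tests all follow the same pattern: load a unicharset, build a
// transcription from the first kNumChars unichar ids after the special codes,
// generate easy padded outputs for it, and check that the beam search decodes
// it back exactly.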
TEST_F(RecodeBeamTest, DoesChinese) {
  LOG(INFO) << "Testing chi_tra"
            << "\n";
  LoadUnicharset("chi_tra.unicharset");
  // Correctly reproduce the first kNumChars characters from easy output.
  std::vector<int> transcription;
  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
    transcription.push_back(i);
  }
  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
  LOG(INFO) << "Testing chi_sim"
            << "\n";
  LoadUnicharset("chi_sim.unicharset");
  // Correctly reproduce the first kNumChars characters from easy output.
  transcription.clear();
  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
    transcription.push_back(i);
  }
  outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
}

TEST_F(RecodeBeamTest, DoesJapanese) {
  LOG(INFO) << "Testing jpn"
            << "\n";
  LoadUnicharset("jpn.unicharset");
  // Correctly reproduce the first kNumChars characters from easy output.
  std::vector<int> transcription;
  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
    transcription.push_back(i);
  }
  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
}

TEST_F(RecodeBeamTest, DoesKorean) {
  LOG(INFO) << "Testing kor"
            << "\n";
  LoadUnicharset("kor.unicharset");
  // Correctly reproduce the first kNumChars characters from easy output.
  std::vector<int> transcription;
  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
    transcription.push_back(i);
  }
  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
}

TEST_F(RecodeBeamTest, DoesKannada) {
  LOG(INFO) << "Testing kan"
            << "\n";
  LoadUnicharset("kan.unicharset");
  // Correctly reproduce the first kNumChars characters from easy output.
  std::vector<int> transcription;
  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
    transcription.push_back(i);
  }
  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
}

TEST_F(RecodeBeamTest, DoesMarathi) {
  LOG(INFO) << "Testing mar"
            << "\n";
  LoadUnicharset("mar.unicharset");
  // Correctly reproduce the first kNumChars characters from easy output.
  std::vector<int> transcription;
  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
    transcription.push_back(i);
  }
  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
}

TEST_F(RecodeBeamTest, DoesEnglish) {
  LOG(INFO) << "Testing eng"
            << "\n";
  LoadUnicharset("eng.unicharset");
  // Correctly reproduce the first kNumChars characters from easy output.
  std::vector<int> transcription;
  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
    transcription.push_back(i);
  }
  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
}
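
// The remaining tests are DISABLED_, so gtest skips them unless run with
// --gtest_also_run_disabled_tests. This one checks that, without a dictionary,
// decoding yields the raw top-choice string, and that loading eng_beam steers
// the result to "Gets words right." rather than the competing
// "Get swords right.".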
TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
  LOG(INFO) << "Testing eng dictionary"
            << "\n";
  LoadUnicharset("eng_beam.unicharset");
  GENERIC_2D_ARRAY<float> outputs =
      GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);
  std::string default_str;
  for (int i = 0; kGWRTops[i] != nullptr; ++i) {
    default_str += kGWRTops[i];
  }
  PointerVector<WERD_RES> words;
  ExpectCorrect(outputs, default_str, nullptr, &words);
  // Now try again with the dictionary.
  LoadDict("eng_beam");
  ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words);
}
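
// Without a dictionary every Chinese character comes out as its own
// top-choice word; with zh_hans loaded, 实学 and 学生 should be grouped into
// dictionary words while 储, 啬 and 投 remain top-choice singles.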
TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
  LOG(INFO) << "Testing zh_hans dictionary"
            << "\n";
  LoadUnicharset("zh_hans.unicharset");
  GENERIC_2D_ARRAY<float> outputs =
      GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
  PointerVector<WERD_RES> words;
  ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words);
  // Each is an individual word, with permuter = top choice.
  EXPECT_EQ(7, words.size());
  for (int w = 0; w < words.size(); ++w) {
    EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter());
  }
  // Now try again with the dictionary.
  LoadDict("zh_hans");
  ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words);
  // Number of words expected.
  const int kNumWords = 5;
  // Content of the words.
  const char *kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"};
  // Permuters of the words.
  const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM,
                                     TOP_CHOICE_PERM, SYSTEM_DAWG_PERM};
  EXPECT_EQ(kNumWords, words.size());
  for (int w = 0; w < kNumWords && w < words.size(); ++w) {
    EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());
    EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter());
  }
}

// Tests that a recoder built with decomposed unicode allows true CTC behavior:
// arbitrary duplicates and inserted nulls inside the multi-code sequence.
TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
  LOG(INFO) << "Testing duplicates in multi-code sequences"
            << "\n";
  LoadUnicharset("vie.d.unicharset");
  tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);
  TRand random;
  GENERIC_2D_ARRAY<float> outputs =
      GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
  PointerVector<WERD_RES> words;
  std::string truth_str;
  tesseract::NormalizeUTF8String(tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
                                 tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str);
  ExpectCorrect(outputs, truth_str, nullptr, &words);
}

} // namespace tesseract