baseapi_test.cc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. // (C) Copyright 2017, Google Inc.
  2. // Licensed under the Apache License, Version 2.0 (the "License");
  3. // you may not use this file except in compliance with the License.
  4. // You may obtain a copy of the License at
  5. // http://www.apache.org/licenses/LICENSE-2.0
  6. // Unless required by applicable law or agreed to in writing, software
  7. // distributed under the License is distributed on an "AS IS" BASIS,
  8. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. // See the License for the specific language governing permissions and
  10. // limitations under the License.
  11. #include "include_gunit.h"
  12. #include "cycletimer.h" // for CycleTimer
  13. #include "log.h" // for LOG
  14. #include "ocrblock.h" // for class BLOCK
  15. #include "pageres.h"
  16. #include <tesseract/baseapi.h>
  17. #include <allheaders.h>
  18. #include "gmock/gmock-matchers.h"
  19. #include <memory>
  20. #include <regex>
  21. #include <string>
  22. #include <vector>
  23. namespace tesseract {
  24. using ::testing::ContainsRegex;
  25. using ::testing::HasSubstr;
  // Parallel, nullptr-terminated tables: entry i of each table describes the
  // same test case (language code, input image, expected OCR text).

  // Language codes handed to TessBaseAPI::Init().
  static const char *langs[] = {
      "eng",
      "vie",
      "hin",
      "ara",
      nullptr,
  };

  // Image file names, one per entry of langs.
  static const char *image_files[] = {
      "HelloGoogle.tif",
      "viet.tif",
      "raaj.tif",
      "arabic.tif",
      nullptr,
  };

  // Ground-truth text for each image. Non-ASCII entries are spelled as UTF-8
  // byte escapes so that this source file stays pure ASCII.
  static const char *gt_text[] = {
      "Hello Google",
      // "ti" U+1EBF "ng"
      "\x74\x69\xe1\xba\xbf\x6e\x67",
      // U+0930 U+093E U+091C
      "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
      // U+0627 U+0644 U+0639 U+0631 U+0628 U+064A
      "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a",
      nullptr,
  };
  32. class FriendlyTessBaseAPI : public tesseract::TessBaseAPI {
  33. FRIEND_TEST(TesseractTest, LSTMGeometryTest);
  34. };
  35. std::string GetCleanedTextResult(tesseract::TessBaseAPI *tess, Image pix) {
  36. tess->SetImage(pix);
  37. char *result = tess->GetUTF8Text();
  38. std::string ocr_result = result;
  39. delete[] result;
  40. trim(ocr_result);
  41. return ocr_result;
  42. }
  43. // The fixture for testing Tesseract.
  44. class TesseractTest : public testing::Test {
  45. protected:
  46. static std::string TestDataNameToPath(const std::string &name) {
  47. return file::JoinPath(TESTING_DIR, name);
  48. }
  49. static std::string TessdataPath() {
  50. return TESSDATA_DIR;
  51. }
  52. };
  53. // Test static TessBaseAPI (like it is used by tesserocr).
  54. TEST_F(TesseractTest, StaticTessBaseAPI) {
  55. static tesseract::TessBaseAPI api;
  56. api.End();
  57. }
  58. // Tests that Tesseract gets exactly the right answer on phototest.
  59. TEST_F(TesseractTest, BasicTesseractTest) {
  60. tesseract::TessBaseAPI api;
  61. std::string truth_text;
  62. std::string ocr_text;
  63. if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
  64. Image src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str());
  65. CHECK(src_pix);
  66. ocr_text = GetCleanedTextResult(&api, src_pix);
  67. CHECK_OK(
  68. file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults()));
  69. trim(truth_text);
  70. EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
  71. src_pix.destroy();
  72. } else {
  73. // eng.traineddata not found.
  74. GTEST_SKIP();
  75. }
  76. }
  77. // Test that api.GetComponentImages() will return a set of images for
  78. // paragraphs even if text recognition was not run.
  79. TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) {
  80. tesseract::TessBaseAPI api;
  81. if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
  82. api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
  83. api.SetVariable("paragraph_debug_level", "3");
  84. #if 0 // TODO: b622.png is missing
  85. Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str());
  86. CHECK(src_pix);
  87. api.SetImage(src_pix);
  88. Boxa* para_boxes =
  89. api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr);
  90. EXPECT_TRUE(para_boxes != nullptr);
  91. Boxa* block_boxes =
  92. api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr);
  93. EXPECT_TRUE(block_boxes != nullptr);
  94. // TODO(eger): Get paragraphs out of this page pre-text.
  95. EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes));
  96. boxaDestroy(&block_boxes);
  97. boxaDestroy(&para_boxes);
  98. src_pix.destroy();
  99. #endif
  100. } else {
  101. // eng.traineddata not found.
  102. GTEST_SKIP();
  103. }
  104. }
  105. // We should get hOCR output and not seg fault, even if the api caller doesn't
  106. // call SetInputName().
  107. TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) {
  108. tesseract::TessBaseAPI api;
  109. if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
  110. // eng.traineddata not found.
  111. GTEST_SKIP();
  112. }
  113. Image src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
  114. CHECK(src_pix);
  115. api.SetImage(src_pix);
  116. char *result = api.GetHOCRText(0);
  117. EXPECT_TRUE(result != nullptr);
  118. EXPECT_THAT(result, HasSubstr("Hello"));
  119. EXPECT_THAT(result, HasSubstr("<div class='ocr_page'"));
  120. delete[] result;
  121. src_pix.destroy();
  122. }
  123. // hOCR output should contain baseline info for upright textlines.
  124. TEST_F(TesseractTest, HOCRContainsBaseline) {
  125. tesseract::TessBaseAPI api;
  126. if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
  127. // eng.traineddata not found.
  128. GTEST_SKIP();
  129. }
  130. Image src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
  131. CHECK(src_pix);
  132. api.SetInputName("HelloGoogle.tif");
  133. api.SetImage(src_pix);
  134. char *result = api.GetHOCRText(0);
  135. EXPECT_TRUE(result != nullptr);
  136. EXPECT_THAT(result, HasSubstr("Hello"));
  137. EXPECT_TRUE(std::regex_search(
  138. result, std::regex{"<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+"}));
  139. delete[] result;
  140. src_pix.destroy();
  141. }
  142. // Tests that Tesseract gets exactly the right answer on some page numbers.
  143. TEST_F(TesseractTest, AdaptToWordStrTest) {
  144. #ifdef DISABLED_LEGACY_ENGINE
  145. // Skip test because TessBaseAPI::AdaptToWordStr is missing.
  146. GTEST_SKIP();
  147. #else
  148. static const char *kTrainingPages[] = {"136.tif", "256.tif", "410.tif", "432.tif", "540.tif",
  149. "692.tif", "779.tif", "793.tif", "808.tif", "815.tif",
  150. "12.tif", "12.tif", nullptr};
  151. static const char *kTrainingText[] = {"1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0",
  152. "6 9 2", "7 7 9", "7 9 3", "8 0 8", "8 1 5",
  153. "1 2", "1 2", nullptr};
  154. static const char *kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr};
  155. static const char *kTestText[] = {"324", "433", "12", nullptr};
  156. tesseract::TessBaseAPI api;
  157. std::string truth_text;
  158. std::string ocr_text;
  159. if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
  160. // eng.traineddata not found.
  161. GTEST_SKIP();
  162. }
  163. api.SetVariable("matcher_sufficient_examples_for_prototyping", "1");
  164. api.SetVariable("classify_class_pruner_threshold", "220");
  165. // Train on the training text.
  166. for (int i = 0; kTrainingPages[i] != nullptr; ++i) {
  167. std::string image_file = TestDataNameToPath(kTrainingPages[i]);
  168. Image src_pix = pixRead(image_file.c_str());
  169. CHECK(src_pix);
  170. api.SetImage(src_pix);
  171. EXPECT_TRUE(api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i]))
  172. << "Failed to adapt to text \"" << kTrainingText[i] << "\" on image " << image_file;
  173. src_pix.destroy();
  174. }
  175. // Test the test text.
  176. api.SetVariable("tess_bn_matching", "1");
  177. api.SetPageSegMode(tesseract::PSM_SINGLE_WORD);
  178. for (int i = 0; kTestPages[i] != nullptr; ++i) {
  179. Image src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str());
  180. CHECK(src_pix);
  181. ocr_text = GetCleanedTextResult(&api, src_pix);
  182. trim(truth_text);
  183. EXPECT_STREQ(kTestText[i], ocr_text.c_str());
  184. src_pix.destroy();
  185. }
  186. #endif
  187. }
  188. // Tests that LSTM gets exactly the right answer on phototest.
  189. TEST_F(TesseractTest, BasicLSTMTest) {
  190. tesseract::TessBaseAPI api;
  191. std::string truth_text;
  192. std::string ocr_text;
  193. if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
  194. // eng.traineddata not found.
  195. GTEST_SKIP();
  196. }
  197. Image src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str());
  198. CHECK(src_pix);
  199. ocr_text = GetCleanedTextResult(&api, src_pix);
  200. CHECK_OK(
  201. file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults()));
  202. trim(truth_text);
  203. EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
  204. src_pix.destroy();
  205. }
  206. // Test that LSTM's character bounding boxes are properly converted to
  207. // Tesseract structures. Note that we can't guarantee that LSTM's
  208. // character boxes fall completely within Tesseract's word box because
  209. // the baseline denormalization/normalization transforms may introduce
  210. // errors due to float/int conversions (e.g., see OUTLINE::move() in
  211. // ccstruct/poutline.h) Instead, we do a loose check.
  212. TEST_F(TesseractTest, LSTMGeometryTest) {
  213. Image src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str());
  214. FriendlyTessBaseAPI api;
  215. if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
  216. // eng.traineddata not found.
  217. GTEST_SKIP();
  218. }
  219. api.SetImage(src_pix);
  220. ASSERT_EQ(api.Recognize(nullptr), 0);
  221. const PAGE_RES *page_res = api.GetPageRes();
  222. PAGE_RES_IT page_res_it(const_cast<PAGE_RES *>(page_res));
  223. page_res_it.restart_page();
  224. BLOCK *block = page_res_it.block()->block;
  225. CHECK(block);
  226. // extract word and character boxes for each word
  227. for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
  228. WERD_RES *word = page_res_it.word();
  229. CHECK(word);
  230. CHECK(word->best_choice);
  231. CHECK_GT(word->best_choice->length(), 0);
  232. CHECK(word->word);
  233. CHECK(word->box_word);
  234. // tesseract's word box
  235. TBOX tess_blob_box;
  236. tess_blob_box = word->word->bounding_box();
  237. tess_blob_box.rotate(block->re_rotation());
  238. // verify that each of LSTM's character boxes lies close to within
  239. // tesseract's word box
  240. for (int i = 0; i < word->box_word->length(); ++i) {
  241. TBOX lstm_blob_box = word->box_word->BlobBox(i);
  242. // LSTM character box should not spill out of tesseract word box
  243. // by more than a few pixels in any direction
  244. EXPECT_LT(tess_blob_box.left() - lstm_blob_box.left(), 5);
  245. EXPECT_LT(lstm_blob_box.right() - tess_blob_box.right(), 5);
  246. EXPECT_LT(tess_blob_box.bottom() - lstm_blob_box.bottom(), 5);
  247. EXPECT_LT(lstm_blob_box.top() - tess_blob_box.top(), 5);
  248. }
  249. }
  250. src_pix.destroy();
  251. }
  252. TEST_F(TesseractTest, InitConfigOnlyTest) {
  253. // Languages for testing initialization.
  254. const char *langs[] = {"eng", "chi_tra", "jpn", "vie"};
  255. std::unique_ptr<tesseract::TessBaseAPI> api;
  256. CycleTimer timer;
  257. for (auto &lang : langs) {
  258. api = std::make_unique<tesseract::TessBaseAPI>();
  259. timer.Restart();
  260. EXPECT_EQ(0, api->Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY));
  261. timer.Stop();
  262. LOG(INFO) << "Lang " << lang << " took " << timer.GetInMs() << "ms in regular init";
  263. }
  264. // Init variables to set for config-only initialization.
  265. std::vector<std::string> vars_vec, vars_values;
  266. vars_vec.emplace_back("tessedit_init_config_only");
  267. vars_values.emplace_back("1");
  268. LOG(INFO) << "Switching to config only initialization:";
  269. for (auto &lang : langs) {
  270. api = std::make_unique<tesseract::TessBaseAPI>();
  271. timer.Restart();
  272. EXPECT_EQ(0, api->Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY, nullptr, 0,
  273. &vars_vec, &vars_values, false));
  274. timer.Stop();
  275. LOG(INFO) << "Lang " << lang << " took " << timer.GetInMs() << "ms in config-only init";
  276. }
  277. }
  278. // Tests if two instances of Tesseract/LSTM can co-exist in the same thread.
  279. // NOTE: This is not an exhaustive test and current support for multiple
  280. // instances in Tesseract is fragile. This test is intended largely as a means
  281. // of detecting and guarding against the existing support being possibly broken
  282. // by future CLs. TessBaseAPI instances are initialized using the default
  283. // OEM_DEFAULT mode.
  284. TEST(TesseractInstanceTest, TestMultipleTessInstances) {
  285. int num_langs = 0;
  286. while (langs[num_langs] != nullptr) {
  287. ++num_langs;
  288. }
  289. const std::string kTessdataPath = TESSDATA_DIR;
  290. // Preload images and verify that OCR is correct on them individually.
  291. std::vector<Image > pix(num_langs);
  292. for (int i = 0; i < num_langs; ++i) {
  293. std::string tracestring = "Single instance test with lang = ";
  294. tracestring += langs[i];
  295. SCOPED_TRACE(tracestring);
  296. std::string path = file::JoinPath(TESTING_DIR, image_files[i]);
  297. pix[i] = pixRead(path.c_str());
  298. QCHECK(pix[i] != nullptr) << "Could not read " << path;
  299. tesseract::TessBaseAPI tess;
  300. EXPECT_EQ(0, tess.Init(kTessdataPath.c_str(), langs[i]));
  301. std::string ocr_result = GetCleanedTextResult(&tess, pix[i]);
  302. EXPECT_STREQ(gt_text[i], ocr_result.c_str());
  303. }
  304. // Process the images in all pairwise combinations of associated languages.
  305. std::string ocr_result[2];
  306. for (int i = 0; i < num_langs; ++i) {
  307. for (int j = i + 1; j < num_langs; ++j) {
  308. tesseract::TessBaseAPI tess1, tess2;
  309. tess1.Init(kTessdataPath.c_str(), langs[i]);
  310. tess2.Init(kTessdataPath.c_str(), langs[j]);
  311. ocr_result[0] = GetCleanedTextResult(&tess1, pix[i]);
  312. ocr_result[1] = GetCleanedTextResult(&tess2, pix[j]);
  313. EXPECT_FALSE(strcmp(gt_text[i], ocr_result[0].c_str()) ||
  314. strcmp(gt_text[j], ocr_result[1].c_str()))
  315. << "OCR failed on language pair " << langs[i] << "-" << langs[j];
  316. }
  317. }
  318. for (int i = 0; i < num_langs; ++i) {
  319. pix[i].destroy();
  320. }
  321. }
  322. // Tests whether Tesseract parameters are correctly set for the two instances.
  323. TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {
  324. std::string illegal_name = "an_illegal_name";
  325. std::string langs[2] = {"eng", "hin"};
  326. std::string int_param_name = "tessedit_pageseg_mode";
  327. int int_param[2] = {1, 2};
  328. std::string int_param_str[2] = {"1", "2"};
  329. std::string bool_param_name = "tessedit_ambigs_training";
  330. bool bool_param[2] = {false, true};
  331. std::string bool_param_str[2] = {"F", "T"};
  332. std::string str_param_name = "tessedit_char_blacklist";
  333. std::string str_param[2] = {"abc", "def"};
  334. std::string double_param_name = "segment_penalty_dict_frequent_word";
  335. std::string double_param_str[2] = {"0.01", "2"};
  336. double double_param[2] = {0.01, 2};
  337. const std::string kTessdataPath = TESSDATA_DIR;
  338. tesseract::TessBaseAPI tess1, tess2;
  339. for (int i = 0; i < 2; ++i) {
  340. tesseract::TessBaseAPI *api = (i == 0) ? &tess1 : &tess2;
  341. api->Init(kTessdataPath.c_str(), langs[i].c_str());
  342. api->SetVariable(illegal_name.c_str(), "none");
  343. api->SetVariable(int_param_name.c_str(), int_param_str[i].c_str());
  344. api->SetVariable(bool_param_name.c_str(), bool_param_str[i].c_str());
  345. api->SetVariable(str_param_name.c_str(), str_param[i].c_str());
  346. api->SetVariable(double_param_name.c_str(), double_param_str[i].c_str());
  347. }
  348. for (int i = 0; i < 2; ++i) {
  349. tesseract::TessBaseAPI *api = (i == 0) ? &tess1 : &tess2;
  350. EXPECT_FALSE(api->GetStringVariable(illegal_name.c_str()));
  351. int intvar;
  352. EXPECT_TRUE(api->GetIntVariable(int_param_name.c_str(), &intvar));
  353. EXPECT_EQ(int_param[i], intvar);
  354. bool boolvar;
  355. EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar));
  356. EXPECT_EQ(bool_param[i], boolvar);
  357. EXPECT_STREQ(str_param[i].c_str(), api->GetStringVariable(str_param_name.c_str()));
  358. double doublevar;
  359. EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar));
  360. EXPECT_EQ(double_param[i], doublevar);
  361. }
  362. }
  363. } // namespace tesseract