tessocr.cpp 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. // Copyright (C) 2020-2024 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz/config.h"
  23. #ifndef OCR_DISABLED
  24. #include <climits>
  25. #include "tesseract/baseapi.h"
  26. #include "tesseract/capi.h" // for ETEXT_DESC
  27. extern "C" {
  28. #include "allheaders.h"
  29. #include "tessocr.h"
  30. #include "leptonica-wrap.h"
  31. #if TESSERACT_MAJOR_VERSION >= 5
  32. static bool
  33. load_file(const char* filename, std::vector<char>* data)
  34. {
  35. bool result = false;
  36. FILE *fp = fopen(filename, "rb");
  37. if (fp == NULL)
  38. return false;
  39. fseek(fp, 0, SEEK_END);
  40. long size = ftell(fp);
  41. fseek(fp, 0, SEEK_SET);
  42. // Trying to open a directory on Linux sets size to LONG_MAX. Catch it here.
  43. if (size > 0 && size < LONG_MAX)
  44. {
  45. // reserve an extra byte in case caller wants to append a '\0' character
  46. data->reserve(size + 1);
  47. data->resize(size);
  48. result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
  49. }
  50. fclose(fp);
  51. return result;
  52. }
  53. static bool
  54. tess_file_reader(const char *fname, std::vector<char> *out)
  55. {
  56. /* FIXME: Look for inbuilt ones. */
  57. /* Then under TESSDATA */
  58. return load_file(fname, out);
  59. }
  60. #else
  61. static bool
  62. load_file(const char* filename, GenericVector<char>* data)
  63. {
  64. bool result = false;
  65. FILE *fp = fopen(filename, "rb");
  66. if (fp == NULL)
  67. return false;
  68. fseek(fp, 0, SEEK_END);
  69. long size = ftell(fp);
  70. fseek(fp, 0, SEEK_SET);
  71. // Trying to open a directory on Linux sets size to LONG_MAX. Catch it here.
  72. if (size > 0 && size < LONG_MAX)
  73. {
  74. // reserve an extra byte in case caller wants to append a '\0' character
  75. data->reserve(size + 1);
  76. data->resize_no_init(size);
  77. result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
  78. }
  79. fclose(fp);
  80. return result;
  81. }
  82. static bool
  83. tess_file_reader(const STRING& fname, GenericVector<char> *out)
  84. {
  85. /* FIXME: Look for inbuilt ones. */
  86. /* Then under TESSDATA */
  87. return load_file(fname.c_str(), out);
  88. }
  89. #endif
  90. void *ocr_init(fz_context *ctx, const char *language, const char *datadir)
  91. {
  92. tesseract::TessBaseAPI *api;
  93. fz_set_leptonica_mem(ctx);
  94. api = new tesseract::TessBaseAPI();
  95. if (api == NULL)
  96. {
  97. fz_clear_leptonica_mem(ctx);
  98. fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract base initialisation failed");
  99. }
  100. if (language == NULL || language[0] == 0)
  101. language = "eng";
  102. // Initialize tesseract-ocr with English, without specifying tessdata path
  103. if (api->Init(datadir, 0, /* data, data_size */
  104. language,
  105. tesseract::OcrEngineMode::OEM_DEFAULT,
  106. NULL, 0, /* configs, configs_size */
  107. NULL, NULL, /* vars_vec */
  108. false, /* set_only_non_debug_params */
  109. &tess_file_reader))
  110. {
  111. delete api;
  112. fz_clear_leptonica_mem(ctx);
  113. fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract language initialisation failed");
  114. }
  115. return api;
  116. }
  117. void ocr_fin(fz_context *ctx, void *api_)
  118. {
  119. tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)api_;
  120. if (api == NULL)
  121. return;
  122. api->End();
  123. delete api;
  124. fz_clear_leptonica_mem(ctx);
  125. }
  126. static inline int isbigendian(void)
  127. {
  128. static const int one = 1;
  129. return *(char*)&one == 0;
  130. }
  131. static Pix *
  132. ocr_set_image(fz_context *ctx, tesseract::TessBaseAPI *api, fz_pixmap *pix)
  133. {
  134. Pix *image = pixCreateHeader(pix->w, pix->h, 8);
  135. if (image == NULL)
  136. fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract image creation failed");
  137. pixSetData(image, (l_uint32 *)pix->samples);
  138. pixSetPadBits(image, 1);
  139. pixSetXRes(image, pix->xres);
  140. pixSetYRes(image, pix->yres);
  141. if (!isbigendian())
  142. {
  143. /* Frizzle the image */
  144. int x, y;
  145. uint32_t *d = (uint32_t *)pix->samples;
  146. for (y = pix->h; y > 0; y--)
  147. for (x = pix->w>>2; x > 0; x--)
  148. {
  149. uint32_t v = *d;
  150. ((uint8_t *)d)[0] = v>>24;
  151. ((uint8_t *)d)[1] = v>>16;
  152. ((uint8_t *)d)[2] = v>>8;
  153. ((uint8_t *)d)[3] = v;
  154. d++;
  155. }
  156. }
  157. /* pixWrite("test.pnm", image, IFF_PNM); */
  158. api->SetImage(image);
  159. return image;
  160. }
  161. static void
  162. ocr_clear_image(fz_context *ctx, Pix *image)
  163. {
  164. pixSetData(image, NULL);
  165. pixDestroy(&image);
  166. }
  167. typedef struct {
  168. fz_context *ctx;
  169. void *arg;
  170. int (*progress)(fz_context *, void *, int progress);
  171. } progress_arg;
  172. static bool
  173. do_cancel(void *arg, int dummy)
  174. {
  175. return true;
  176. }
  177. static bool
  178. progress_callback(ETEXT_DESC *monitor, int l, int r, int t, int b)
  179. {
  180. progress_arg *details = (progress_arg *)monitor->cancel_this;
  181. int cancel;
  182. if (!details->progress)
  183. return false;
  184. cancel = details->progress(details->ctx, details->arg, monitor->progress);
  185. if (cancel)
  186. monitor->cancel = do_cancel;
  187. return false;
  188. }
  189. void ocr_recognise(fz_context *ctx,
  190. void *api_,
  191. fz_pixmap *pix,
  192. void (*callback)(fz_context *ctx,
  193. void *arg,
  194. int unicode,
  195. const char *font_name,
  196. const int *line_bbox,
  197. const int *word_bbox,
  198. const int *char_bbox,
  199. int pointsize),
  200. int (*progress)(fz_context *ctx,
  201. void *arg,
  202. int progress),
  203. void *arg)
  204. {
  205. tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)api_;
  206. Pix *image;
  207. int code;
  208. int word_bbox[4];
  209. int char_bbox[4];
  210. int line_bbox[4];
  211. bool bold, italic, underlined, monospace, serif, smallcaps;
  212. int pointsize, font_id;
  213. const char* font_name;
  214. ETEXT_DESC monitor;
  215. progress_arg details;
  216. if (api == NULL)
  217. return;
  218. image = ocr_set_image(ctx, api, pix);
  219. monitor.cancel = nullptr;
  220. monitor.cancel_this = &details;
  221. details.ctx = ctx;
  222. details.arg = arg;
  223. details.progress = progress;
  224. monitor.progress_callback2 = progress_callback;
  225. code = api->Recognize(&monitor);
  226. if (code < 0)
  227. {
  228. ocr_clear_image(ctx, image);
  229. fz_throw(ctx, FZ_ERROR_LIBRARY, "OCR recognise failed");
  230. }
  231. if (!isbigendian())
  232. {
  233. /* Frizzle the image */
  234. int x, y;
  235. uint32_t *d = (uint32_t *)pix->samples;
  236. for (y = pix->h; y > 0; y--)
  237. for (x = pix->w>>2; x > 0; x--)
  238. {
  239. uint32_t v = *d;
  240. ((uint8_t *)d)[0] = v>>24;
  241. ((uint8_t *)d)[1] = v>>16;
  242. ((uint8_t *)d)[2] = v>>8;
  243. ((uint8_t *)d)[3] = v;
  244. d++;
  245. }
  246. }
  247. tesseract::ResultIterator *res_it = api->GetIterator();
  248. fz_try(ctx)
  249. {
  250. while (!res_it->Empty(tesseract::RIL_BLOCK))
  251. {
  252. if (res_it->Empty(tesseract::RIL_WORD))
  253. {
  254. res_it->Next(tesseract::RIL_WORD);
  255. continue;
  256. }
  257. res_it->BoundingBox(tesseract::RIL_TEXTLINE,
  258. line_bbox, line_bbox+1,
  259. line_bbox+2, line_bbox+3);
  260. res_it->BoundingBox(tesseract::RIL_WORD,
  261. word_bbox, word_bbox+1,
  262. word_bbox+2, word_bbox+3);
  263. font_name = res_it->WordFontAttributes(&bold,
  264. &italic,
  265. &underlined,
  266. &monospace,
  267. &serif,
  268. &smallcaps,
  269. &pointsize,
  270. &font_id);
  271. do
  272. {
  273. const char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
  274. if (graph && graph[0] != 0)
  275. {
  276. int unicode;
  277. res_it->BoundingBox(tesseract::RIL_SYMBOL,
  278. char_bbox, char_bbox+1,
  279. char_bbox+2, char_bbox+3);
  280. fz_chartorune(&unicode, graph);
  281. callback(ctx, arg, unicode, font_name, line_bbox, word_bbox, char_bbox, pointsize);
  282. }
  283. delete[] graph;
  284. res_it->Next(tesseract::RIL_SYMBOL);
  285. }
  286. while (!res_it->Empty(tesseract::RIL_BLOCK) &&
  287. !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
  288. }
  289. }
  290. fz_always(ctx)
  291. {
  292. delete res_it;
  293. ocr_clear_image(ctx, image);
  294. }
  295. fz_catch(ctx)
  296. fz_rethrow(ctx);
  297. }
  298. }
  299. #endif