| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351 |
- // Copyright (C) 2020-2024 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz/config.h"
- #ifndef OCR_DISABLED
- #include <climits>
- #include "tesseract/baseapi.h"
- #include "tesseract/capi.h" // for ETEXT_DESC
- extern "C" {
- #include "allheaders.h"
- #include "tessocr.h"
- #include "leptonica-wrap.h"
- #if TESSERACT_MAJOR_VERSION >= 5
/*
	Read the whole of 'filename' into 'data'.

	Returns true only if the file could be opened, reported a size
	strictly between 0 and LONG_MAX, and was read in full. An empty
	file therefore yields false. 'data' is only touched once the size
	check has passed.
*/
static bool
load_file(const char* filename, std::vector<char>* data)
{
	FILE *stream = fopen(filename, "rb");
	if (!stream)
		return false;

	/* Measure the file by seeking to its end, then rewind. */
	fseek(stream, 0, SEEK_END);
	const long length = ftell(stream);
	fseek(stream, 0, SEEK_SET);

	bool ok = false;
	// Trying to open a directory on Linux sets size to LONG_MAX. Catch it here.
	if (length > 0 && length < LONG_MAX)
	{
		// reserve an extra byte in case caller wants to append a '\0' character
		data->reserve(length + 1);
		data->resize(length);
		const size_t got = fread(data->data(), 1, length, stream);
		ok = (static_cast<long>(got) == length);
	}

	fclose(stream);
	return ok;
}
- static bool
- tess_file_reader(const char *fname, std::vector<char> *out)
- {
- /* FIXME: Look for inbuilt ones. */
- /* Then under TESSDATA */
- return load_file(fname, out);
- }
- #else
- static bool
- load_file(const char* filename, GenericVector<char>* data)
- {
- bool result = false;
- FILE *fp = fopen(filename, "rb");
- if (fp == NULL)
- return false;
- fseek(fp, 0, SEEK_END);
- long size = ftell(fp);
- fseek(fp, 0, SEEK_SET);
- // Trying to open a directory on Linux sets size to LONG_MAX. Catch it here.
- if (size > 0 && size < LONG_MAX)
- {
- // reserve an extra byte in case caller wants to append a '\0' character
- data->reserve(size + 1);
- data->resize_no_init(size);
- result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
- }
- fclose(fp);
- return result;
- }
- static bool
- tess_file_reader(const STRING& fname, GenericVector<char> *out)
- {
- /* FIXME: Look for inbuilt ones. */
- /* Then under TESSDATA */
- return load_file(fname.c_str(), out);
- }
- #endif
- void *ocr_init(fz_context *ctx, const char *language, const char *datadir)
- {
- tesseract::TessBaseAPI *api;
- fz_set_leptonica_mem(ctx);
- api = new tesseract::TessBaseAPI();
- if (api == NULL)
- {
- fz_clear_leptonica_mem(ctx);
- fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract base initialisation failed");
- }
- if (language == NULL || language[0] == 0)
- language = "eng";
- // Initialize tesseract-ocr with English, without specifying tessdata path
- if (api->Init(datadir, 0, /* data, data_size */
- language,
- tesseract::OcrEngineMode::OEM_DEFAULT,
- NULL, 0, /* configs, configs_size */
- NULL, NULL, /* vars_vec */
- false, /* set_only_non_debug_params */
- &tess_file_reader))
- {
- delete api;
- fz_clear_leptonica_mem(ctx);
- fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract language initialisation failed");
- }
- return api;
- }
- void ocr_fin(fz_context *ctx, void *api_)
- {
- tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)api_;
- if (api == NULL)
- return;
- api->End();
- delete api;
- fz_clear_leptonica_mem(ctx);
- }
/*
	Runtime byte-order probe: returns non-zero when the first byte in
	memory of the int value 1 is zero (i.e. the low-order byte is not
	stored first), and 0 otherwise.
*/
static inline int isbigendian(void)
{
	static const int probe = 1;
	const char *first_byte = (const char *)&probe;

	return first_byte[0] == 0;
}
- static Pix *
- ocr_set_image(fz_context *ctx, tesseract::TessBaseAPI *api, fz_pixmap *pix)
- {
- Pix *image = pixCreateHeader(pix->w, pix->h, 8);
- if (image == NULL)
- fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract image creation failed");
- pixSetData(image, (l_uint32 *)pix->samples);
- pixSetPadBits(image, 1);
- pixSetXRes(image, pix->xres);
- pixSetYRes(image, pix->yres);
- if (!isbigendian())
- {
- /* Frizzle the image */
- int x, y;
- uint32_t *d = (uint32_t *)pix->samples;
- for (y = pix->h; y > 0; y--)
- for (x = pix->w>>2; x > 0; x--)
- {
- uint32_t v = *d;
- ((uint8_t *)d)[0] = v>>24;
- ((uint8_t *)d)[1] = v>>16;
- ((uint8_t *)d)[2] = v>>8;
- ((uint8_t *)d)[3] = v;
- d++;
- }
- }
- /* pixWrite("test.pnm", image, IFF_PNM); */
- api->SetImage(image);
- return image;
- }
- static void
- ocr_clear_image(fz_context *ctx, Pix *image)
- {
- pixSetData(image, NULL);
- pixDestroy(&image);
- }
- typedef struct {
- fz_context *ctx;
- void *arg;
- int (*progress)(fz_context *, void *, int progress);
- } progress_arg;
/*
	Cancel function installed on the Tesseract monitor once the caller
	has asked to stop: it unconditionally answers true. Both arguments
	are ignored.
*/
static bool
do_cancel(void *arg, int dummy)
{
	(void)arg;
	(void)dummy;

	return true;
}
- static bool
- progress_callback(ETEXT_DESC *monitor, int l, int r, int t, int b)
- {
- progress_arg *details = (progress_arg *)monitor->cancel_this;
- int cancel;
- if (!details->progress)
- return false;
- cancel = details->progress(details->ctx, details->arg, monitor->progress);
- if (cancel)
- monitor->cancel = do_cancel;
- return false;
- }
- void ocr_recognise(fz_context *ctx,
- void *api_,
- fz_pixmap *pix,
- void (*callback)(fz_context *ctx,
- void *arg,
- int unicode,
- const char *font_name,
- const int *line_bbox,
- const int *word_bbox,
- const int *char_bbox,
- int pointsize),
- int (*progress)(fz_context *ctx,
- void *arg,
- int progress),
- void *arg)
- {
- tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)api_;
- Pix *image;
- int code;
- int word_bbox[4];
- int char_bbox[4];
- int line_bbox[4];
- bool bold, italic, underlined, monospace, serif, smallcaps;
- int pointsize, font_id;
- const char* font_name;
- ETEXT_DESC monitor;
- progress_arg details;
- if (api == NULL)
- return;
- image = ocr_set_image(ctx, api, pix);
- monitor.cancel = nullptr;
- monitor.cancel_this = &details;
- details.ctx = ctx;
- details.arg = arg;
- details.progress = progress;
- monitor.progress_callback2 = progress_callback;
- code = api->Recognize(&monitor);
- if (code < 0)
- {
- ocr_clear_image(ctx, image);
- fz_throw(ctx, FZ_ERROR_LIBRARY, "OCR recognise failed");
- }
- if (!isbigendian())
- {
- /* Frizzle the image */
- int x, y;
- uint32_t *d = (uint32_t *)pix->samples;
- for (y = pix->h; y > 0; y--)
- for (x = pix->w>>2; x > 0; x--)
- {
- uint32_t v = *d;
- ((uint8_t *)d)[0] = v>>24;
- ((uint8_t *)d)[1] = v>>16;
- ((uint8_t *)d)[2] = v>>8;
- ((uint8_t *)d)[3] = v;
- d++;
- }
- }
- tesseract::ResultIterator *res_it = api->GetIterator();
- fz_try(ctx)
- {
- while (!res_it->Empty(tesseract::RIL_BLOCK))
- {
- if (res_it->Empty(tesseract::RIL_WORD))
- {
- res_it->Next(tesseract::RIL_WORD);
- continue;
- }
- res_it->BoundingBox(tesseract::RIL_TEXTLINE,
- line_bbox, line_bbox+1,
- line_bbox+2, line_bbox+3);
- res_it->BoundingBox(tesseract::RIL_WORD,
- word_bbox, word_bbox+1,
- word_bbox+2, word_bbox+3);
- font_name = res_it->WordFontAttributes(&bold,
- &italic,
- &underlined,
- &monospace,
- &serif,
- &smallcaps,
- &pointsize,
- &font_id);
- do
- {
- const char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
- if (graph && graph[0] != 0)
- {
- int unicode;
- res_it->BoundingBox(tesseract::RIL_SYMBOL,
- char_bbox, char_bbox+1,
- char_bbox+2, char_bbox+3);
- fz_chartorune(&unicode, graph);
- callback(ctx, arg, unicode, font_name, line_bbox, word_bbox, char_bbox, pointsize);
- }
- delete[] graph;
- res_it->Next(tesseract::RIL_SYMBOL);
- }
- while (!res_it->Empty(tesseract::RIL_BLOCK) &&
- !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
- }
- }
- fz_always(ctx)
- {
- delete res_it;
- ocr_clear_image(ctx, image);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- }
- #endif
|