| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205 |
- // Copyright (C) 2004-2025 Artifex Software, Inc.
- //
- // This file is part of MuPDF.
- //
- // MuPDF is free software: you can redistribute it and/or modify it under the
- // terms of the GNU Affero General Public License as published by the Free
- // Software Foundation, either version 3 of the License, or (at your option)
- // any later version.
- //
- // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- // details.
- //
- // You should have received a copy of the GNU Affero General Public License
- // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- //
- // Alternative licensing terms are available from the licensor.
- // For commercial licensing, see <https://www.artifex.com/> or contact
- // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- // CA 94129, USA, for further information.
- #include "mupdf/fitz.h"
- #include <assert.h>
- #include <string.h>
- #include <errno.h>
- #undef DEBUG_OCR
- #ifndef OCR_DISABLED
- #include "tessocr.h"
- /*
- This device can be used in 2 modes, with or without a list.
- In both modes the OCR device is created with a target device. The
- caller runs the page to the device, and the device processes the calls
- and (eventually) calls through to the target.
- In both modes, all incoming calls are forwarded to an internal draw
- device to render the page, so the page rendering is always complete.
- The incoming calls are also forwarded (mostly, eventually) to the
- target. Where the 2 modes differ is in the timing/content of those
- forwarded calls.
- In the first mode (without a list), the device instantly forwards all
- non-text calls to the target. When the OCR device is closed, an OCR pass
- is performed, and the recovered text is forwarded to the target. All
- recovered text is listed as Courier, and ends up on top of the content.
- This is fine for text extraction and probably for most cases of document
- conversion. It's no good for correcting the unicode values within a
- document though.
- So, we have concocted a second way of working, using a display list. In
- this mode, as well as rendering every device call that comes in, it
- forwards them to a display list (and not the target). When the device
- is closed we OCR the text image, and store the results. We then play
- the list back through a 'rewrite' device to the target. The rewrite
- device rewrites the text objects with the correct unicode values. Any
- characters given by the OCR pass that aren't used by the rewrite step
- are then sent through as invisible text.
- This means that all the target device sees is the exact same graphical
- objects in the exact same order, but with corrected unicode values.
- Also, any text that appears in the document as a result of images or
- line art is sent through as 'invisible' text at the end, so it will work
- for cut/paste or search.
- Or, at least, that was the plan. Unfortunately, it turns out that
- Tesseract (with the LSTM engine (the most modern one)) is really bad at
- giving bounding boxes for characters. It seems that the neural network
- can say "hey, there is an 'X'", but it can't actually say where the X
- occurred within the word. So tesseract knows where the words are, and
- knows the order of the letters within the word, but basically guesses
- at bboxes for the letters.
- Because of this, we can't rely on character bboxes from tesseract to be
- correct. We have to work off the word bboxes alone, together with the
- order in which characters are passed to us.
- So, as Tesseract gives us data, we store the word bbox, together with
- the list of chars within that word.
- When we play the list back through the display device, we then have to
- rewrite text objects based on which word they are in. For the first
- version, we'll make the extremely dodgy assumption that characters
- come in the same order within the word.
- For future versions we may want to collect bboxes for each text char
- on our initial list building pass, collate those into matching 'words'
- and sort them accordingly.
- */
- typedef struct word_record_s {
- int len;
- fz_rect bbox;
- int n;
- int unicode[FZ_FLEXIBLE_ARRAY];
- } word_record;
- typedef struct fz_ocr_device_s
- {
- fz_device super;
- /* Progress monitoring */
- int (*progress)(fz_context *, void *, int progress);
- void *progress_arg;
- fz_device *target;
- fz_display_list *list;
- fz_device *list_dev;
- fz_device *draw_dev;
- fz_pixmap *pixmap;
- fz_rect mediabox;
- fz_matrix ctm;
- fz_rect word_bbox;
- fz_font *font;
- /* Current word */
- int char_max;
- int char_len;
- int *chars;
- /* Entire page */
- int words_max;
- int words_len;
- word_record **words;
- char *language;
- char *datadir;
- } fz_ocr_device;
- static void
- fz_ocr_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm,
- fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_fill_path(ctx, ocr->list_dev, path, even_odd, ctm, colorspace, color, alpha, color_params);
- fz_fill_path(ctx, ocr->draw_dev, path, even_odd, ctm, colorspace, color, alpha, color_params);
- }
- static void
- fz_ocr_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke,
- fz_matrix ctm, fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_stroke_path(ctx, ocr->list_dev, path, stroke, ctm, colorspace, color, alpha, color_params);
- fz_stroke_path(ctx, ocr->draw_dev, path, stroke, ctm, colorspace, color, alpha, color_params);
- }
- static void
- fz_ocr_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
- fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- if (ocr->list_dev != ocr->target)
- fz_fill_text(ctx, ocr->list_dev, text, ctm, colorspace, color, alpha, color_params);
- fz_fill_text(ctx, ocr->draw_dev, text, ctm, colorspace, color, alpha, color_params);
- }
- static void
- fz_ocr_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke,
- fz_matrix ctm, fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- if (ocr->list_dev != ocr->target)
- fz_stroke_text(ctx, ocr->list_dev, text, stroke, ctm, colorspace, color, alpha, color_params);
- fz_stroke_text(ctx, ocr->draw_dev, text, stroke, ctm, colorspace, color, alpha, color_params);
- }
- static void
- fz_ocr_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_fill_shade(ctx, ocr->list_dev, shade, ctm, alpha, color_params);
- fz_fill_shade(ctx, ocr->draw_dev, shade, ctm, alpha, color_params);
- }
- static void
- fz_ocr_fill_image(fz_context *ctx, fz_device *dev, fz_image *image, fz_matrix ctm, float alpha, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_fill_image(ctx, ocr->list_dev, image, ctm, alpha, color_params);
- fz_fill_image(ctx, ocr->draw_dev, image, ctm, alpha, color_params);
- }
- static void
- fz_ocr_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *image, fz_matrix ctm,
- fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_fill_image_mask(ctx, ocr->list_dev, image, ctm, colorspace, color, alpha, color_params);
- fz_fill_image_mask(ctx, ocr->draw_dev, image, ctm, colorspace, color, alpha, color_params);
- }
- static void
- fz_ocr_clip_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_rect scissor)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_clip_path(ctx, ocr->list_dev, path, even_odd, ctm, scissor);
- fz_clip_path(ctx, ocr->draw_dev, path, even_odd, ctm, scissor);
- }
- static void
- fz_ocr_clip_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_clip_stroke_path(ctx, ocr->list_dev, path, stroke, ctm, scissor);
- fz_clip_stroke_path(ctx, ocr->draw_dev, path, stroke, ctm, scissor);
- }
- static void
- fz_ocr_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- if (ocr->list_dev != ocr->target)
- fz_clip_text(ctx, ocr->list_dev, text, ctm, scissor);
- fz_clip_text(ctx, ocr->draw_dev, text, ctm, scissor);
- }
- static void
- fz_ocr_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- if (ocr->list_dev != ocr->target)
- fz_clip_stroke_text(ctx, ocr->list_dev, text, stroke, ctm, scissor);
- fz_clip_stroke_text(ctx, ocr->draw_dev, text, stroke, ctm, scissor);
- }
- static void
- fz_ocr_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- /* Ignore text is generally used when text has been sent as
- * part of other graphics - such as line art or images. As such
- * we'll pick up the 'true' unicode values of such text in the
- * OCR phase. We therefore send text to the list device (so
- * it can be rewritten), but not direct to the target. */
- if (ocr->list_dev != ocr->target)
- fz_ignore_text(ctx, ocr->list_dev, text, ctm);
- fz_ignore_text(ctx, ocr->draw_dev, text, ctm);
- }
- static void
- fz_ocr_clip_image_mask(fz_context *ctx, fz_device *dev, fz_image *image, fz_matrix ctm, fz_rect scissor)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_clip_image_mask(ctx, ocr->list_dev, image, ctm, scissor);
- fz_clip_image_mask(ctx, ocr->draw_dev, image, ctm, scissor);
- }
- static void
- fz_ocr_pop_clip(fz_context *ctx, fz_device *dev)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_pop_clip(ctx, ocr->list_dev);
- fz_pop_clip(ctx, ocr->draw_dev);
- }
- static void
- fz_ocr_begin_mask(fz_context *ctx, fz_device *dev, fz_rect rect, int luminosity, fz_colorspace *colorspace, const float *color, fz_color_params color_params)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_begin_mask(ctx, ocr->list_dev, rect, luminosity, colorspace, color, color_params);
- fz_begin_mask(ctx, ocr->draw_dev, rect, luminosity, colorspace, color, color_params);
- }
- static void
- fz_ocr_end_mask(fz_context *ctx, fz_device *dev, fz_function *tr)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_end_mask_tr(ctx, ocr->list_dev, tr);
- fz_end_mask_tr(ctx, ocr->draw_dev, tr);
- }
- static void
- fz_ocr_begin_group(fz_context *ctx, fz_device *dev, fz_rect rect, fz_colorspace *cs, int isolated, int knockout, int blendmode, float alpha)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_begin_group(ctx, ocr->list_dev, rect, cs, isolated, knockout, blendmode, alpha);
- fz_begin_group(ctx, ocr->draw_dev, rect, cs, isolated, knockout, blendmode, alpha);
- }
- static void
- fz_ocr_end_group(fz_context *ctx, fz_device *dev)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_end_group(ctx, ocr->list_dev);
- fz_end_group(ctx, ocr->draw_dev);
- }
- static int
- fz_ocr_begin_tile(fz_context *ctx, fz_device *dev, fz_rect area, fz_rect view, float xstep, float ystep, fz_matrix ctm, int id)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- /* Always pass 0 as tile id here so that neither device can
- * disagree about whether the contents need to be sent. */
- (void)fz_begin_tile_id(ctx, ocr->list_dev, area, view, xstep, ystep, ctm, 0);
- (void)fz_begin_tile_id(ctx, ocr->draw_dev, area, view, xstep, ystep, ctm, 0);
- return 0;
- }
- static void
- fz_ocr_end_tile(fz_context *ctx, fz_device *dev)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_end_tile(ctx, ocr->list_dev);
- fz_end_tile(ctx, ocr->draw_dev);
- }
- static void
- fz_ocr_render_flags(fz_context *ctx, fz_device *dev, int set, int clear)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_render_flags(ctx, ocr->list_dev, set, clear);
- fz_render_flags(ctx, ocr->draw_dev, set, clear);
- }
- static void
- fz_ocr_set_default_colorspaces(fz_context *ctx, fz_device *dev, fz_default_colorspaces *cs)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_set_default_colorspaces(ctx, ocr->list_dev, cs);
- fz_set_default_colorspaces(ctx, ocr->draw_dev, cs);
- }
- static void
- fz_ocr_begin_layer(fz_context *ctx, fz_device *dev, const char *layer_name)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_begin_layer(ctx, ocr->list_dev, layer_name);
- fz_begin_layer(ctx, ocr->draw_dev, layer_name);
- }
- static void
- fz_ocr_end_layer(fz_context *ctx, fz_device *dev)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- fz_end_layer(ctx, ocr->list_dev);
- fz_end_layer(ctx, ocr->draw_dev);
- }
- static void
- drop_ocr_device(fz_context *ctx, fz_ocr_device *ocr)
- {
- int i;
- if (ocr == NULL)
- return;
- if (ocr->list_dev != ocr->target)
- fz_drop_device(ctx, ocr->list_dev);
- fz_drop_display_list(ctx, ocr->list);
- fz_drop_device(ctx, ocr->draw_dev);
- fz_drop_pixmap(ctx, ocr->pixmap);
- for (i = 0; i < ocr->words_len; i++)
- fz_free(ctx, ocr->words[i]);
- fz_free(ctx, ocr->words);
- fz_free(ctx, ocr->chars);
- fz_free(ctx, ocr->language);
- fz_free(ctx, ocr->datadir);
- }
- static void
- flush_word(fz_context *ctx, fz_ocr_device *ocr)
- {
- float color = 1;
- fz_color_params params = { 0 };
- int i;
- fz_text *text = NULL;
- fz_matrix trm;
- float step;
- fz_rect char_bbox;
- if (ocr->char_len == 0)
- return;
- /* If we're not sending direct to the target device, then insert
- * all the chars we've found into a table so we can rewrite
- * the text objects that come from the list device on the fly.
- */
- if (ocr->list_dev != ocr->target)
- {
- word_record *word;
- if (ocr->words_len == ocr->words_max)
- {
- int new_max = ocr->words_max * 2;
- if (new_max == 0)
- new_max = 32;
- ocr->words = fz_realloc_array(ctx, ocr->words, new_max, word_record *);
- ocr->words_max = new_max;
- }
- word = fz_malloc_flexible(ctx, word_record, unicode, ocr->char_len);
- word->len = ocr->char_len;
- word->bbox = ocr->word_bbox;
- word->n = 0;
- memcpy(word->unicode, ocr->chars, ocr->char_len * sizeof(int));
- ocr->words[ocr->words_len++] = word;
- ocr->char_len = 0;
- return;
- }
- /* FIXME: Look at font-name. */
- /* All this is a bit horrid, because the detection of sizes for
- * the glyphs depends on the width of the glyphs. Use Courier
- * because it's monospaced. */
- if (ocr->font == NULL)
- ocr->font = fz_new_base14_font(ctx, "Courier");
- fz_var(text);
- fz_try(ctx)
- {
- text = fz_new_text(ctx);
- /* Divide the word box into equal lengths. */
- /* This falls down when we have words with chars of
- * different widths in, but it's acceptable for these
- * purposes. */
- /* FIXME: This assumes L2R motion of text. */
- step = (ocr->word_bbox.x1 - ocr->word_bbox.x0) / ocr->char_len;
- char_bbox.x1 = ocr->word_bbox.x0;
- char_bbox.y0 = ocr->word_bbox.y0;
- char_bbox.y1 = ocr->word_bbox.y1;
- for (i = 0; i < ocr->char_len; i++)
- {
- char_bbox.x0 = char_bbox.x1;
- char_bbox.x1 += step;
- /* Horrid constants that happen to work with Courier. */
- trm.a = 10.0f/6 * (char_bbox.x1 - char_bbox.x0);
- trm.b = 0;
- trm.c = 0;
- trm.d = 10.0f/6 * (char_bbox.y1 - char_bbox.y0);
- trm.e = char_bbox.x0;
- trm.f = char_bbox.y0;
- fz_show_glyph(ctx, text, ocr->font, trm,
- fz_encode_character(ctx, ocr->font, ocr->chars[i]), ocr->chars[i],
- 0, 0, FZ_BIDI_LTR, 0);
- }
- fz_fill_text(ctx, ocr->target, text, fz_identity,
- fz_device_gray(ctx), &color, 1, params);
- }
- fz_always(ctx)
- {
- fz_drop_text(ctx, text);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- ocr->char_len = 0;
- }
- static void
- char_callback(fz_context *ctx, void *arg, int unicode,
- const char *font_name,
- const int *line_bbox, const int *word_bbox,
- const int *char_bbox, int pointsize)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)arg;
- fz_rect bbox = { word_bbox[0]-1, word_bbox[1]-1, word_bbox[2]+1, word_bbox[3]+1 };
- if (bbox.x0 != ocr->word_bbox.x0 ||
- bbox.y0 != ocr->word_bbox.y0 ||
- bbox.x1 != ocr->word_bbox.x1 ||
- bbox.y1 != ocr->word_bbox.y1)
- {
- flush_word(ctx, ocr);
- ocr->word_bbox = bbox;
- }
- if (ocr->char_max == ocr->char_len)
- {
- int new_max = ocr->char_max * 2;
- if (new_max == 0)
- new_max = 32;
- ocr->chars = fz_realloc_array(ctx, ocr->chars, new_max, int);
- ocr->char_max = new_max;
- }
- ocr->chars[ocr->char_len++] = unicode;
- }
- typedef struct
- {
- fz_device super;
- fz_device *target;
- int words_len;
- word_record **words;
- int current;
- } fz_rewrite_device;
- static fz_text_span *
- fz_clone_text_span(fz_context *ctx, const fz_text_span *span)
- {
- fz_text_span *cspan;
- if (span == NULL)
- return NULL;
- cspan = fz_malloc_struct(ctx, fz_text_span);
- *cspan = *span;
- cspan->cap = cspan->len;
- cspan->items = fz_calloc_no_throw(ctx, cspan->len, sizeof(*cspan->items));
- if (cspan->items == NULL)
- {
- fz_free(ctx, cspan);
- errno = ENOMEM;
- fz_throw(ctx, FZ_ERROR_SYSTEM, "calloc (%zu x %zu bytes) failed", (size_t)cspan->len, sizeof(*cspan->items));
- }
- memcpy(cspan->items, span->items, sizeof(*cspan->items) * cspan->len);
- fz_keep_font(ctx, cspan->font);
- return cspan;
- }
#ifdef DEBUG_OCR
/* Print one code point: as a character if it is in the range 32..126,
 * otherwise as <hex>. When braced is non-zero the output is wrapped
 * in { }. */
static void
debug_codepoint(fz_context *ctx, int unicode, int braced)
{
	int printable = (unicode >= 32 && unicode < 127);

	if (braced)
	{
		if (printable)
			fz_write_printf(ctx, fz_stdout(ctx), "{%c}", unicode);
		else
			fz_write_printf(ctx, fz_stdout(ctx), "{<%04x>}", unicode);
	}
	else
	{
		if (printable)
			fz_write_printf(ctx, fz_stdout(ctx), "%c", unicode);
		else
			fz_write_printf(ctx, fz_stdout(ctx), "<%04x>", unicode);
	}
}

/* Dump a word record to stdout: its bbox, then its characters. The
 * first character that has not been used yet (index word->n) is
 * shown in braces. */
static void
debug_word(fz_context *ctx, word_record *word)
{
	int pos = 0;

	fz_write_printf(ctx, fz_stdout(ctx), " %g %g %g %g:",
			word->bbox.x0,
			word->bbox.y0,
			word->bbox.x1,
			word->bbox.y1);

	/* Characters already used. */
	while (pos < word->n)
	{
		debug_codepoint(ctx, word->unicode[pos], 0);
		pos++;
	}

	if (word->n < word->len)
	{
		/* The next character to be used, highlighted. */
		debug_codepoint(ctx, word->unicode[pos], 1);
		pos++;

		/* Whatever remains after it. */
		while (pos < word->len)
		{
			debug_codepoint(ctx, word->unicode[pos], 0);
			pos++;
		}
	}

	fz_write_printf(ctx, fz_stdout(ctx), "\n");
}
#endif
- static void
- rewrite_char(fz_context *ctx, fz_rewrite_device *dev, fz_matrix ctm, fz_text_item *item, fz_point vadv)
- {
- int i, start;
- fz_point p = { item->x, item->y };
- /* No point in trying to rewrite spaces! */
- if (item->ucs == 32)
- return;
- p = fz_transform_point(p, ctm);
- p.x += vadv.x/2;
- p.y += vadv.y/2;
- #ifdef DEBUG_OCR
- fz_write_printf(ctx, fz_stdout(ctx), "Looking for '%c' at %g %g\n", item->ucs, p.x, p.y);
- #endif
- start = dev->current;
- for (i = start; i < dev->words_len; i++)
- {
- #ifdef DEBUG_OCR
- debug_word(ctx, dev->words[i]);
- #endif
- if (dev->words[i]->n >= dev->words[i]->len)
- continue;
- if (dev->words[i]->bbox.x0 <= p.x &&
- dev->words[i]->bbox.x1 >= p.x &&
- dev->words[i]->bbox.y0 <= p.y &&
- dev->words[i]->bbox.y1 >= p.y)
- {
- item->ucs = dev->words[i]->unicode[dev->words[i]->n++];
- dev->current = i;
- return;
- }
- }
- for (i = 0; i < start; i++)
- {
- #ifdef DEBUG_OCR
- debug_word(ctx, dev->words[i]);
- #endif
- if (dev->words[i]->n >= dev->words[i]->len)
- continue;
- if (dev->words[i]->bbox.x0 <= p.x &&
- dev->words[i]->bbox.x1 >= p.x &&
- dev->words[i]->bbox.y0 <= p.y &&
- dev->words[i]->bbox.y1 >= p.y)
- {
- item->ucs = dev->words[i]->unicode[dev->words[i]->n++];
- dev->current = i;
- return;
- }
- }
- }
- static fz_text_span *
- rewrite_span(fz_context *ctx, fz_rewrite_device *dev, fz_matrix ctm, const fz_text_span *span)
- {
- fz_text_span *rspan = fz_clone_text_span(ctx, span);
- int wmode = span->wmode;
- int i;
- fz_point dir;
- fz_matrix trm = span->trm;
- trm.e = 0;
- trm.f = 0;
- trm = fz_concat(trm, ctm);
- if (wmode == 0)
- {
- dir.x = 1;
- dir.y = 0;
- }
- else
- {
- dir.x = 0;
- dir.y = -1;
- }
- dir = fz_transform_vector(dir, trm);
- /* And do the actual rewriting */
- for (i = 0; i < rspan->len; i++) {
- float advance = rspan->items[i].adv;
- fz_point vadv = { dir.x * advance, dir.y * advance };
- rewrite_char(ctx, dev, ctm, &rspan->items[i], vadv);
- }
- return rspan;
- }
- static fz_text *
- rewrite_text(fz_context *ctx, fz_rewrite_device *dev, fz_matrix ctm, const fz_text *text)
- {
- fz_text *rtext = fz_new_text(ctx);
- fz_text_span *span = text->head;
- fz_text_span **dspan = &rtext->head;
- fz_try(ctx)
- {
- while (span)
- {
- *dspan = rewrite_span(ctx, dev, ctm, span);
- rtext->tail = *dspan;
- dspan = &(*dspan)->next;
- span = span->next;
- }
- }
- fz_catch(ctx)
- {
- fz_drop_text(ctx, rtext);
- fz_rethrow(ctx);
- }
- return rtext;
- }
- static void
- rewrite_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_fill_path(ctx, rewrite->target, path, even_odd, ctm, cs, color, alpha, params);
- }
- static void
- rewrite_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_stroke_path(ctx, rewrite->target, path, stroke, ctm, cs, color, alpha, params);
- }
- static void
- rewrite_clip_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_rect scissor)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_clip_path(ctx, rewrite->target, path, even_odd, ctm, scissor);
- }
- static void
- rewrite_clip_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_clip_stroke_path(ctx, rewrite->target, path, stroke, ctm, scissor);
- }
- static void
- rewrite_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
- fz_try(ctx)
- fz_fill_text(ctx, rewrite->target, rtext, ctm, cs, color, alpha, params);
- fz_always(ctx)
- fz_drop_text(ctx, rtext);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- rewrite_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
- fz_try(ctx)
- fz_stroke_text(ctx, rewrite->target, rtext, stroke, ctm, cs, color, alpha, params);
- fz_always(ctx)
- fz_drop_text(ctx, rtext);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- rewrite_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
- fz_try(ctx)
- fz_clip_text(ctx, rewrite->target, rtext, ctm, scissor);
- fz_always(ctx)
- fz_drop_text(ctx, rtext);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- rewrite_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
- fz_try(ctx)
- fz_clip_stroke_text(ctx, rewrite->target, rtext, stroke, ctm, scissor);
- fz_always(ctx)
- fz_drop_text(ctx, rtext);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- rewrite_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
- fz_try(ctx)
- fz_ignore_text(ctx, rewrite->target, rtext, ctm);
- fz_always(ctx)
- fz_drop_text(ctx, rtext);
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- rewrite_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shd, fz_matrix ctm, float alpha, fz_color_params color_params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_fill_shade(ctx, rewrite->target, shd, ctm, alpha, color_params);
- }
- static void
- rewrite_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_fill_image(ctx, rewrite->target, img, ctm, alpha, color_params);
- }
- static void
- rewrite_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params color_params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_fill_image_mask(ctx, rewrite->target, img, ctm, cs, color, alpha, color_params);
- }
- static void
- rewrite_clip_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, fz_rect scissor)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_clip_image_mask(ctx, rewrite->target, img, ctm, scissor);
- }
- static void
- rewrite_pop_clip(fz_context *ctx, fz_device *dev)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_pop_clip(ctx, rewrite->target);
- }
- static void
- rewrite_begin_mask(fz_context *ctx, fz_device *dev, fz_rect area, int luminosity, fz_colorspace *cs, const float *bc, fz_color_params params)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_begin_mask(ctx, rewrite->target, area, luminosity, cs, bc, params);
- }
- static void
- rewrite_end_mask(fz_context *ctx, fz_device *dev, fz_function *tr)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_end_mask_tr(ctx, rewrite->target, tr);
- }
- static void
- rewrite_begin_group(fz_context *ctx, fz_device *dev, fz_rect area, fz_colorspace *cs, int isolated, int knockout, int blendmode, float alpha)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_begin_group(ctx, rewrite->target, area, cs, isolated, knockout, blendmode, alpha);
- }
- static void
- rewrite_end_group(fz_context *ctx, fz_device *dev)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_end_group(ctx, rewrite->target);
- }
- static int
- rewrite_begin_tile(fz_context *ctx, fz_device *dev, fz_rect area, fz_rect view, float xstep, float ystep, fz_matrix ctm, int id)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- return fz_begin_tile_id(ctx, rewrite->target, area, view, xstep, ystep, ctm, id);
- }
- static void
- rewrite_end_tile(fz_context *ctx, fz_device *dev)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_end_tile(ctx, rewrite->target);
- }
- static void
- rewrite_render_flags(fz_context *ctx, fz_device *dev, int set, int clear)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_render_flags(ctx, rewrite->target, set, clear);
- }
- static void
- rewrite_set_default_colorspaces(fz_context *ctx, fz_device *dev, fz_default_colorspaces *cs)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_set_default_colorspaces(ctx, rewrite->target, cs);
- }
- static void
- rewrite_begin_layer(fz_context *ctx, fz_device *dev, const char *layer_name)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_begin_layer(ctx, rewrite->target, layer_name);
- }
- static void
- rewrite_end_layer(fz_context *ctx, fz_device *dev)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_end_layer(ctx, rewrite->target);
- }
- static void
- rewrite_close(fz_context *ctx, fz_device *dev)
- {
- fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
- fz_font *font;
- fz_text *text = NULL;
- fz_matrix trm;
- int i, j;
- /* All this is a bit horrid, because the detection of sizes for
- * the glyphs depends on the width of the glyphs. Use Courier
- * because it's monospaced. */
- font = fz_new_base14_font(ctx, "Courier");
- fz_var(text);
- fz_try(ctx)
- {
- text = fz_new_text(ctx);
- for (i = 0; i < rewrite->words_len; i++)
- {
- word_record *word = rewrite->words[i];
- fz_rect char_bbox;
- float step;
- if (word->n >= word->len)
- continue;
- step = (word->bbox.x1 - word->bbox.x0) / word->len;
- char_bbox.x1 = word->bbox.x0;
- char_bbox.y0 = word->bbox.y0;
- char_bbox.y1 = word->bbox.y1;
- for (j = 0; j < word->len; j++)
- {
- char_bbox.x0 = char_bbox.x1;
- char_bbox.x1 += step;
- /* Horrid constants that happen to work with Courier. */
- trm.a = 10.0f/6 * (char_bbox.x1 - char_bbox.x0);
- trm.b = 0;
- trm.c = 0;
- trm.d = (char_bbox.y1 - char_bbox.y0);
- trm.e = char_bbox.x0;
- trm.f = char_bbox.y0;
- fz_show_glyph(ctx, text, font, trm,
- word->unicode[j], word->unicode[j],
- 0, 0, FZ_BIDI_LTR, 0);
- }
- }
- fz_ignore_text(ctx, rewrite->target, text, fz_identity);
- }
- fz_always(ctx)
- {
- fz_drop_text(ctx, text);
- fz_drop_font(ctx, font);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static fz_device *
- new_rewrite_device(fz_context *ctx, fz_device *target, word_record **words, int words_len)
- {
- fz_rewrite_device *rewrite;
- rewrite = fz_new_derived_device(ctx, fz_rewrite_device);
- rewrite->super.close_device = rewrite_close;
- rewrite->super.fill_path = rewrite_fill_path;
- rewrite->super.stroke_path = rewrite_stroke_path;
- rewrite->super.clip_path = rewrite_clip_path;
- rewrite->super.clip_stroke_path = rewrite_clip_stroke_path;
- rewrite->super.fill_text = rewrite_fill_text;
- rewrite->super.stroke_text = rewrite_stroke_text;
- rewrite->super.clip_text = rewrite_clip_text;
- rewrite->super.clip_stroke_text = rewrite_clip_stroke_text;
- rewrite->super.ignore_text = rewrite_ignore_text;
- rewrite->super.fill_shade = rewrite_fill_shade;
- rewrite->super.fill_image = rewrite_fill_image;
- rewrite->super.fill_image_mask = rewrite_fill_image_mask;
- rewrite->super.clip_image_mask = rewrite_clip_image_mask;
- rewrite->super.pop_clip = rewrite_pop_clip;
- rewrite->super.begin_mask = rewrite_begin_mask;
- rewrite->super.end_mask = rewrite_end_mask;
- rewrite->super.begin_group = rewrite_begin_group;
- rewrite->super.end_group = rewrite_end_group;
- rewrite->super.begin_tile = rewrite_begin_tile;
- rewrite->super.end_tile = rewrite_end_tile;
- rewrite->super.render_flags = rewrite_render_flags;
- rewrite->super.set_default_colorspaces = rewrite_set_default_colorspaces;
- rewrite->super.begin_layer = rewrite_begin_layer;
- rewrite->super.end_layer = rewrite_end_layer;
- rewrite->target = target;
- rewrite->words = words;
- rewrite->words_len = words_len;
- rewrite->current = 0;
- return &rewrite->super;
- }
- static int
- fz_ocr_progress(fz_context *ctx, void *arg, int prog)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)arg;
- if (ocr->progress == NULL)
- return 0;
- return ocr->progress(ctx, ocr->progress_arg, prog);
- }
- static void
- fz_ocr_close_device(fz_context *ctx, fz_device *dev)
- {
- fz_ocr_device *ocr = (fz_ocr_device *)dev;
- void *tessapi;
- fz_device *rewrite_device;
- fz_rect bbox;
- fz_close_device(ctx, ocr->draw_dev);
- /* Now run the OCR */
- tessapi = ocr_init(ctx, ocr->language, ocr->datadir);
- fz_try(ctx)
- {
- ocr_recognise(ctx, tessapi, ocr->pixmap, char_callback, &fz_ocr_progress, ocr);
- flush_word(ctx, ocr);
- }
- fz_always(ctx)
- ocr_fin(ctx, tessapi);
- fz_catch(ctx)
- fz_rethrow(ctx);
- /* If we're not using a list, we're done! */
- if (ocr->list_dev == ocr->target)
- return;
- fz_close_device(ctx, ocr->list_dev);
- bbox = fz_transform_rect(ocr->mediabox, ocr->ctm);
- rewrite_device = new_rewrite_device(ctx, ocr->target, ocr->words, ocr->words_len);
- fz_try(ctx)
- {
- fz_run_display_list(ctx, ocr->list, rewrite_device,
- fz_identity, bbox, NULL);
- }
- fz_always(ctx)
- {
- fz_close_device(ctx, rewrite_device);
- fz_drop_device(ctx, rewrite_device);
- }
- fz_catch(ctx)
- fz_rethrow(ctx);
- }
- static void
- fz_ocr_drop_device(fz_context *ctx, fz_device *dev)
- {
- drop_ocr_device(ctx, (fz_ocr_device *)dev);
- }
- #endif
- fz_device *
- fz_new_ocr_device(fz_context *ctx,
- fz_device *target,
- fz_matrix ctm,
- fz_rect mediabox,
- int with_list,
- const char *language,
- const char *datadir,
- int (*progress)(fz_context *, void *, int),
- void *progress_arg)
- {
- #ifdef OCR_DISABLED
- fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "OCR Disabled in this build");
- #else
- fz_ocr_device *dev;
- if (target == NULL)
- fz_throw(ctx, FZ_ERROR_ARGUMENT, "OCR devices require a target");
- dev = fz_new_derived_device(ctx, fz_ocr_device);
- dev->super.close_device = fz_ocr_close_device;
- dev->super.drop_device = fz_ocr_drop_device;
- dev->super.fill_path = fz_ocr_fill_path;
- dev->super.stroke_path = fz_ocr_stroke_path;
- dev->super.clip_path = fz_ocr_clip_path;
- dev->super.clip_stroke_path = fz_ocr_clip_stroke_path;
- dev->super.fill_text = fz_ocr_fill_text;
- dev->super.stroke_text = fz_ocr_stroke_text;
- dev->super.clip_text = fz_ocr_clip_text;
- dev->super.clip_stroke_text = fz_ocr_clip_stroke_text;
- dev->super.ignore_text = fz_ocr_ignore_text;
- dev->super.fill_shade = fz_ocr_fill_shade;
- dev->super.fill_image = fz_ocr_fill_image;
- dev->super.fill_image_mask = fz_ocr_fill_image_mask;
- dev->super.clip_image_mask = fz_ocr_clip_image_mask;
- dev->super.pop_clip = fz_ocr_pop_clip;
- dev->super.begin_mask = fz_ocr_begin_mask;
- dev->super.end_mask = fz_ocr_end_mask;
- dev->super.begin_group = fz_ocr_begin_group;
- dev->super.end_group = fz_ocr_end_group;
- dev->super.begin_tile = fz_ocr_begin_tile;
- dev->super.end_tile = fz_ocr_end_tile;
- dev->super.render_flags = fz_ocr_render_flags;
- dev->super.set_default_colorspaces = fz_ocr_set_default_colorspaces;
- dev->super.begin_layer = fz_ocr_begin_layer;
- dev->super.end_layer = fz_ocr_end_layer;
- dev->progress = progress;
- dev->progress_arg = progress_arg;
- fz_try(ctx)
- {
- fz_rect bbox;
- fz_irect ibox;
- fz_point res;
- dev->target = target;
- dev->mediabox = mediabox;
- dev->ctm = ctm;
- bbox = fz_transform_rect(mediabox, ctm);
- ibox = fz_round_rect(bbox);
- /* Fudge the width to be a multiple of 4. */
- ibox.x1 += (4-(ibox.x1-ibox.x0)) & 3;
- dev->pixmap = fz_new_pixmap_with_bbox(ctx, fz_device_gray(ctx),
- ibox, NULL, 0);
- fz_clear_pixmap(ctx, dev->pixmap);
- res = fz_transform_point_xy(72, 72, ctm);
- if (res.x < 0)
- res.x = -res.x;
- if (res.x < 1)
- res.x = 1;
- if (res.y < 0)
- res.y = -res.y;
- if (res.y < 1)
- res.y = 1;
- fz_set_pixmap_resolution(ctx, dev->pixmap, res.x, res.y);
- dev->language = fz_strdup(ctx, language ? language : "eng");
- dev->datadir = fz_strdup(ctx, datadir ? datadir : "");
- dev->draw_dev = fz_new_draw_device(ctx, fz_identity, dev->pixmap);
- if (with_list)
- {
- dev->list = fz_new_display_list(ctx, mediabox);
- dev->list_dev = fz_new_list_device(ctx, dev->list);
- } else
- dev->list_dev = dev->target;
- }
- fz_catch(ctx)
- {
- drop_ocr_device(ctx, dev);
- fz_rethrow(ctx);
- }
- return (fz_device*)dev;
- #endif
- }
|