output-docx.c 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #include "mupdf/fitz.h"
  23. #if FZ_ENABLE_DOCX_OUTPUT
  24. #include "glyphbox.h"
  25. #include "extract/extract.h"
  26. #include "extract/buffer.h"
  27. #include <assert.h>
  28. #include <errno.h>
  29. #include <string.h>
  30. typedef struct
  31. {
  32. fz_document_writer super;
  33. extract_alloc_t *alloc;
  34. /*
  35. * .ctx is needed for the callbacks we get from the Extract library, for
  36. * example s_realloc_fn(). Each of our main device callbacks sets .ctx on
  37. * entry, and resets back to NULL before returning.
  38. */
  39. fz_context *ctx;
  40. fz_output *output;
  41. extract_t *extract;
  42. int spacing;
  43. int rotation;
  44. int images;
  45. int mediabox_clip;
  46. fz_rect mediabox; /* As passed to writer_begin_page(). */
  47. char output_cache[1024];
  48. } fz_docx_writer;
  49. typedef struct
  50. {
  51. fz_device super;
  52. fz_docx_writer *writer;
  53. } fz_docx_device;
  54. static void dev_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm,
  55. fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
  56. {
  57. fz_docx_device *dev = (fz_docx_device*) dev_;
  58. fz_text_span *span;
  59. assert(!dev->writer->ctx);
  60. dev->writer->ctx = ctx;
  61. fz_try(ctx)
  62. {
  63. for (span = text->head; span; span = span->next)
  64. {
  65. int i;
  66. fz_matrix combined, trm;
  67. fz_rect bbox;
  68. combined = fz_concat(span->trm, ctm);
  69. bbox = span->font->bbox;
  70. if (extract_span_begin(
  71. dev->writer->extract,
  72. span->font->name,
  73. span->font->flags.is_bold,
  74. span->font->flags.is_italic,
  75. span->wmode,
  76. combined.a,
  77. combined.b,
  78. combined.c,
  79. combined.d,
  80. bbox.x0,
  81. bbox.y0,
  82. bbox.x1,
  83. bbox.y1))
  84. {
  85. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin span");
  86. }
  87. trm = span->trm;
  88. for (i=0; i<span->len; ++i)
  89. {
  90. fz_text_item *item = &span->items[i];
  91. float adv = 0;
  92. fz_rect bounds;
  93. trm.e = item->x;
  94. trm.f = item->y;
  95. combined = fz_concat(trm, ctm);
  96. if (dev->writer->mediabox_clip)
  97. if (fz_glyph_entirely_outside_box(ctx, &ctm, span, item, &dev->writer->mediabox))
  98. continue;
  99. if (span->items[i].gid >= 0)
  100. adv = span->items[i].adv;
  101. bounds = fz_bound_glyph(ctx, span->font, span->items[i].gid, combined);
  102. if (extract_add_char(dev->writer->extract, combined.e, combined.f, item->ucs, adv,
  103. bounds.x0, bounds.y0, bounds.x1, bounds.y1))
  104. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to add char");
  105. }
  106. if (extract_span_end(dev->writer->extract))
  107. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end span");
  108. }
  109. }
  110. fz_always(ctx)
  111. {
  112. dev->writer->ctx = NULL;
  113. }
  114. fz_catch(ctx)
  115. {
  116. fz_rethrow(ctx);
  117. }
  118. }
  119. static void dev_fill_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm,
  120. fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
  121. {
  122. dev_text(ctx, dev_, text, ctm, colorspace, color, alpha, color_params);
  123. }
  124. static void dev_stroke_text(fz_context *ctx, fz_device *dev_, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
  125. fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
  126. {
  127. dev_text(ctx, dev_, text, ctm, colorspace, color, alpha, color_params);
  128. }
  129. static void dev_clip_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, fz_rect scissor)
  130. {
  131. dev_text(ctx, dev_, text, ctm, NULL, NULL, 0 /*alpha*/, fz_default_color_params);
  132. }
  133. static void dev_clip_stroke_text(fz_context *ctx, fz_device *dev_, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
  134. {
  135. dev_text(ctx, dev_, text, ctm, NULL, 0, 0, fz_default_color_params);
  136. }
  137. static void
  138. dev_ignore_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm)
  139. {
  140. }
  141. static void writer_image_free(void *handle, void *image_data)
  142. {
  143. fz_docx_writer *writer = handle;
  144. fz_free(writer->ctx, image_data);
  145. }
  146. static void dev_fill_image(fz_context *ctx, fz_device *dev_, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
  147. {
  148. fz_docx_device *dev = (fz_docx_device*) dev_;
  149. const char *type = NULL;
  150. fz_compressed_buffer *compressed = fz_compressed_image_buffer(ctx, img);
  151. assert(!dev->writer->ctx);
  152. dev->writer->ctx = ctx;
  153. fz_try(ctx)
  154. {
  155. if (compressed)
  156. {
  157. if (0) { /* For alignment */ }
  158. else if (compressed->params.type == FZ_IMAGE_RAW) type = "raw";
  159. else if (compressed->params.type == FZ_IMAGE_FAX) type = "fax";
  160. else if (compressed->params.type == FZ_IMAGE_FLATE) type = "flate";
  161. else if (compressed->params.type == FZ_IMAGE_LZW) type = "lzw";
  162. else if (compressed->params.type == FZ_IMAGE_BROTLI) type = "brotli";
  163. else if (compressed->params.type == FZ_IMAGE_BMP) type = "bmp";
  164. else if (compressed->params.type == FZ_IMAGE_GIF) type = "gif";
  165. else if (compressed->params.type == FZ_IMAGE_JBIG2) type = "jbig2";
  166. else if (compressed->params.type == FZ_IMAGE_JPEG) type = "jpeg";
  167. else if (compressed->params.type == FZ_IMAGE_JPX) type = "jpx";
  168. else if (compressed->params.type == FZ_IMAGE_JXR) type = "jxr";
  169. else if (compressed->params.type == FZ_IMAGE_PNG) type = "png";
  170. else if (compressed->params.type == FZ_IMAGE_PNM) type = "pnm";
  171. else if (compressed->params.type == FZ_IMAGE_TIFF) type = "tiff";
  172. if (type)
  173. {
  174. /* Write out raw data. */
  175. unsigned char *data;
  176. size_t datasize = fz_buffer_extract(ctx, compressed->buffer, &data);
  177. if (extract_add_image(
  178. dev->writer->extract,
  179. type,
  180. ctm.e /*x*/,
  181. ctm.f /*y*/,
  182. img->w /*w*/,
  183. img->h /*h*/,
  184. data,
  185. datasize,
  186. writer_image_free,
  187. dev->writer
  188. ))
  189. {
  190. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to add image type=%s", type);
  191. }
  192. }
  193. else
  194. {
  195. /* We don't recognise this image type, so ignore. */
  196. }
  197. }
  198. else
  199. {
  200. /*
  201. * Compressed data not available, so we could write out
  202. * raw pixel values. But for now we ignore.
  203. */
  204. }
  205. }
  206. fz_always(ctx)
  207. {
  208. dev->writer->ctx = NULL;
  209. }
  210. fz_catch(ctx)
  211. {
  212. fz_rethrow(ctx);
  213. }
  214. }
  215. /*
  216. * Support for sending information to Extract when walking stroke/fill path
  217. * with fz_walk_path().
  218. */
  219. typedef struct
  220. {
  221. fz_path_walker walker;
  222. extract_t *extract;
  223. } walker_info_t;
  224. static void s_moveto(fz_context *ctx, void *arg, float x, float y)
  225. {
  226. extract_t* extract = arg;
  227. if (extract_moveto(extract, x, y))
  228. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_moveto() failed");
  229. }
  230. static void s_lineto(fz_context *ctx, void *arg, float x, float y)
  231. {
  232. extract_t* extract = arg;
  233. if (extract_lineto(extract, x, y))
  234. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_lineto() failed");
  235. }
  236. static void s_curveto(fz_context *ctx, void *arg, float x1, float y1,
  237. float x2, float y2, float x3, float y3)
  238. {
  239. /* We simply move to the end point of the curve so that subsequent
  240. (straight) lines will be handled correctly. */
  241. extract_t* extract = arg;
  242. if (extract_moveto(extract, x3, y3))
  243. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_moveto() failed");
  244. }
  245. static void s_closepath(fz_context *ctx, void *arg)
  246. {
  247. extract_t* extract = arg;
  248. if (extract_closepath(extract))
  249. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_closepath() failed");
  250. }
  251. /*
  252. * Calls extract_*() path functions on <path> using fz_walk_path() and the
  253. * above callbacks.
  254. */
  255. static void s_walk_path(fz_context *ctx, fz_docx_device *dev, extract_t *extract, const fz_path *path)
  256. {
  257. fz_path_walker walker;
  258. walker.moveto = s_moveto;
  259. walker.lineto = s_lineto;
  260. walker.curveto = s_curveto;
  261. walker.closepath = s_closepath;
  262. walker.quadto = NULL;
  263. walker.curvetov = NULL;
  264. walker.curvetoy = NULL;
  265. walker.rectto = NULL;
  266. assert(dev->writer->ctx == ctx);
  267. fz_walk_path(ctx, path, &walker, extract /*arg*/);
  268. }
  269. void dev_fill_path(fz_context *ctx, fz_device *dev_, const fz_path *path, int even_odd,
  270. fz_matrix matrix, fz_colorspace * colorspace, const float *color, float alpha,
  271. fz_color_params color_params)
  272. {
  273. fz_docx_device *dev = (fz_docx_device*) dev_;
  274. extract_t *extract = dev->writer->extract;
  275. assert(!dev->writer->ctx);
  276. dev->writer->ctx = ctx;
  277. fz_try(ctx)
  278. {
  279. if (extract_fill_begin(
  280. extract,
  281. matrix.a,
  282. matrix.b,
  283. matrix.c,
  284. matrix.d,
  285. matrix.e,
  286. matrix.f,
  287. color[0]
  288. ))
  289. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin fill");
  290. s_walk_path(ctx, dev, extract, path);
  291. if (extract_fill_end(extract))
  292. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_fill_end() failed");
  293. }
  294. fz_always(ctx)
  295. {
  296. dev->writer->ctx = NULL;
  297. }
  298. fz_catch(ctx)
  299. {
  300. fz_rethrow(ctx);
  301. }
  302. }
  303. static void
  304. dev_stroke_path(fz_context *ctx, fz_device *dev_, const fz_path *path,
  305. const fz_stroke_state *stroke, fz_matrix in_ctm,
  306. fz_colorspace *colorspace_in, const float *color, float alpha,
  307. fz_color_params color_params)
  308. {
  309. fz_docx_device *dev = (fz_docx_device*) dev_;
  310. extract_t *extract = dev->writer->extract;
  311. assert(!dev->writer->ctx);
  312. dev->writer->ctx = ctx;
  313. fz_try(ctx)
  314. {
  315. if (extract_stroke_begin(
  316. extract,
  317. in_ctm.a,
  318. in_ctm.b,
  319. in_ctm.c,
  320. in_ctm.d,
  321. in_ctm.e,
  322. in_ctm.f,
  323. stroke->linewidth,
  324. color[0]
  325. ))
  326. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin stroke");
  327. s_walk_path(ctx, dev, extract, path);
  328. if (extract_stroke_end(extract))
  329. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_stroke_end() failed");
  330. }
  331. fz_always(ctx)
  332. {
  333. dev->writer->ctx = NULL;
  334. }
  335. fz_catch(ctx)
  336. {
  337. fz_rethrow(ctx);
  338. }
  339. }
  340. static extract_struct_t
  341. fz_struct_to_extract(fz_structure type)
  342. {
  343. switch (type)
  344. {
  345. default:
  346. return extract_struct_INVALID;
  347. case FZ_STRUCTURE_DOCUMENT:
  348. return extract_struct_DOCUMENT;
  349. case FZ_STRUCTURE_PART:
  350. return extract_struct_PART;
  351. case FZ_STRUCTURE_ART:
  352. return extract_struct_ART;
  353. case FZ_STRUCTURE_SECT:
  354. return extract_struct_SECT;
  355. case FZ_STRUCTURE_DIV:
  356. return extract_struct_DIV;
  357. case FZ_STRUCTURE_BLOCKQUOTE:
  358. return extract_struct_BLOCKQUOTE;
  359. case FZ_STRUCTURE_CAPTION:
  360. return extract_struct_CAPTION;
  361. case FZ_STRUCTURE_TOC:
  362. return extract_struct_TOC;
  363. case FZ_STRUCTURE_TOCI:
  364. return extract_struct_TOCI;
  365. case FZ_STRUCTURE_INDEX:
  366. return extract_struct_INDEX;
  367. case FZ_STRUCTURE_NONSTRUCT:
  368. return extract_struct_NONSTRUCT;
  369. case FZ_STRUCTURE_PRIVATE:
  370. return extract_struct_PRIVATE;
  371. /* Grouping elements (PDF 2.0 - Table 364) */
  372. case FZ_STRUCTURE_DOCUMENTFRAGMENT:
  373. return extract_struct_DOCUMENTFRAGMENT;
  374. /* Grouping elements (PDF 2.0 - Table 365) */
  375. case FZ_STRUCTURE_ASIDE:
  376. return extract_struct_ASIDE;
  377. /* Grouping elements (PDF 2.0 - Table 366) */
  378. case FZ_STRUCTURE_TITLE:
  379. return extract_struct_TITLE;
  380. case FZ_STRUCTURE_FENOTE:
  381. return extract_struct_FENOTE;
  382. /* Grouping elements (PDF 2.0 - Table 367) */
  383. case FZ_STRUCTURE_SUB:
  384. return extract_struct_SUB;
  385. /* Paragraphlike elements (PDF 1.7 - Table 10.21) */
  386. case FZ_STRUCTURE_P:
  387. return extract_struct_P;
  388. case FZ_STRUCTURE_H:
  389. return extract_struct_H;
  390. case FZ_STRUCTURE_H1:
  391. return extract_struct_H1;
  392. case FZ_STRUCTURE_H2:
  393. return extract_struct_H2;
  394. case FZ_STRUCTURE_H3:
  395. return extract_struct_H3;
  396. case FZ_STRUCTURE_H4:
  397. return extract_struct_H4;
  398. case FZ_STRUCTURE_H5:
  399. return extract_struct_H5;
  400. case FZ_STRUCTURE_H6:
  401. return extract_struct_H6;
  402. /* List elements (PDF 1.7 - Table 10.23) */
  403. case FZ_STRUCTURE_LIST:
  404. return extract_struct_LIST;
  405. case FZ_STRUCTURE_LISTITEM:
  406. return extract_struct_LISTITEM;
  407. case FZ_STRUCTURE_LABEL:
  408. return extract_struct_LABEL;
  409. case FZ_STRUCTURE_LISTBODY:
  410. return extract_struct_LISTBODY;
  411. /* Table elements (PDF 1.7 - Table 10.24) */
  412. case FZ_STRUCTURE_TABLE:
  413. return extract_struct_TABLE;
  414. case FZ_STRUCTURE_TR:
  415. return extract_struct_TR;
  416. case FZ_STRUCTURE_TH:
  417. return extract_struct_TH;
  418. case FZ_STRUCTURE_TD:
  419. return extract_struct_TD;
  420. case FZ_STRUCTURE_THEAD:
  421. return extract_struct_THEAD;
  422. case FZ_STRUCTURE_TBODY:
  423. return extract_struct_TBODY;
  424. case FZ_STRUCTURE_TFOOT:
  425. return extract_struct_TFOOT;
  426. /* Inline elements (PDF 1.7 - Table 10.25) */
  427. case FZ_STRUCTURE_SPAN:
  428. return extract_struct_SPAN;
  429. case FZ_STRUCTURE_QUOTE:
  430. return extract_struct_QUOTE;
  431. case FZ_STRUCTURE_NOTE:
  432. return extract_struct_NOTE;
  433. case FZ_STRUCTURE_REFERENCE:
  434. return extract_struct_REFERENCE;
  435. case FZ_STRUCTURE_BIBENTRY:
  436. return extract_struct_BIBENTRY;
  437. case FZ_STRUCTURE_CODE:
  438. return extract_struct_CODE;
  439. case FZ_STRUCTURE_LINK:
  440. return extract_struct_LINK;
  441. case FZ_STRUCTURE_ANNOT:
  442. return extract_struct_ANNOT;
  443. /* Inline elements (PDF 2.0 - Table 368) */
  444. case FZ_STRUCTURE_EM:
  445. return extract_struct_EM;
  446. case FZ_STRUCTURE_STRONG:
  447. return extract_struct_STRONG;
  448. /* Ruby inline element (PDF 1.7 - Table 10.26) */
  449. case FZ_STRUCTURE_RUBY:
  450. return extract_struct_RUBY;
  451. case FZ_STRUCTURE_RB:
  452. return extract_struct_RB;
  453. case FZ_STRUCTURE_RT:
  454. return extract_struct_RT;
  455. case FZ_STRUCTURE_RP:
  456. return extract_struct_RP;
  457. /* Warichu inline element (PDF 1.7 - Table 10.26) */
  458. case FZ_STRUCTURE_WARICHU:
  459. return extract_struct_WARICHU;
  460. case FZ_STRUCTURE_WT:
  461. return extract_struct_WT;
  462. case FZ_STRUCTURE_WP:
  463. return extract_struct_WP;
  464. /* Illustration elements (PDF 1.7 - Table 10.27) */
  465. case FZ_STRUCTURE_FIGURE:
  466. return extract_struct_FIGURE;
  467. case FZ_STRUCTURE_FORMULA:
  468. return extract_struct_FORMULA;
  469. case FZ_STRUCTURE_FORM:
  470. return extract_struct_FORM;
  471. /* Artifact structure type (PDF 2.0 - Table 375) */
  472. case FZ_STRUCTURE_ARTIFACT:
  473. return extract_struct_ARTIFACT;
  474. }
  475. }
  476. static void
  477. dev_begin_structure(fz_context *ctx, fz_device *dev_, fz_structure standard, const char *raw, int idx)
  478. {
  479. fz_docx_device *dev = (fz_docx_device *)dev_;
  480. extract_t *extract = dev->writer->extract;
  481. assert(!dev->writer->ctx);
  482. dev->writer->ctx = ctx;
  483. fz_try(ctx)
  484. {
  485. if (extract_begin_struct(extract, fz_struct_to_extract(standard), idx, -1))
  486. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin struct");
  487. }
  488. fz_always(ctx)
  489. dev->writer->ctx = NULL;
  490. fz_catch(ctx)
  491. fz_rethrow(ctx);
  492. }
  493. static void
  494. dev_end_structure(fz_context *ctx, fz_device *dev_)
  495. {
  496. fz_docx_device *dev = (fz_docx_device *)dev_;
  497. extract_t *extract = dev->writer->extract;
  498. assert(!dev->writer->ctx);
  499. dev->writer->ctx = ctx;
  500. fz_try(ctx)
  501. {
  502. if (extract_end_struct(extract))
  503. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end struct");
  504. }
  505. fz_always(ctx)
  506. dev->writer->ctx = NULL;
  507. fz_catch(ctx)
  508. fz_rethrow(ctx);
  509. }
  510. static fz_device *writer_begin_page(fz_context *ctx, fz_document_writer *writer_, fz_rect mediabox)
  511. {
  512. fz_docx_writer *writer = (fz_docx_writer*) writer_;
  513. fz_docx_device *dev;
  514. assert(!writer->ctx);
  515. writer->ctx = ctx;
  516. writer->mediabox = mediabox;
  517. fz_var(dev);
  518. fz_try(ctx)
  519. {
  520. if (extract_page_begin(writer->extract, mediabox.x0, mediabox.y0, mediabox.x1, mediabox.y1))
  521. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin page");
  522. dev = fz_new_derived_device(ctx, fz_docx_device);
  523. dev->super.fill_text = dev_fill_text;
  524. dev->super.stroke_text = dev_stroke_text;
  525. dev->super.clip_text = dev_clip_text;
  526. dev->super.clip_stroke_text = dev_clip_stroke_text;
  527. dev->super.ignore_text = dev_ignore_text;
  528. dev->super.fill_image = dev_fill_image;
  529. dev->super.fill_path = dev_fill_path;
  530. dev->super.stroke_path = dev_stroke_path;
  531. dev->super.begin_structure = dev_begin_structure;
  532. dev->super.end_structure = dev_end_structure;
  533. dev->writer = writer;
  534. }
  535. fz_always(ctx)
  536. {
  537. writer->ctx = NULL;
  538. }
  539. fz_catch(ctx)
  540. {
  541. fz_rethrow(ctx);
  542. }
  543. return &dev->super;
  544. }
  545. static void writer_end_page(fz_context *ctx, fz_document_writer *writer_, fz_device *dev)
  546. {
  547. fz_docx_writer *writer = (fz_docx_writer*) writer_;
  548. assert(!writer->ctx);
  549. writer->ctx = ctx;
  550. fz_try(ctx)
  551. {
  552. fz_close_device(ctx, dev);
  553. if (extract_page_end(writer->extract))
  554. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end page");
  555. if (extract_process(writer->extract, writer->spacing, writer->rotation, writer->images))
  556. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to process page");
  557. }
  558. fz_always(ctx)
  559. {
  560. writer->ctx = NULL;
  561. fz_drop_device(ctx, dev);
  562. }
  563. fz_catch(ctx)
  564. {
  565. fz_rethrow(ctx);
  566. }
  567. }
  568. static int buffer_write(void *handle, const void *source, size_t numbytes, size_t *o_actual)
  569. /*
  570. * extract_buffer_t callback that calls fz_write_data(). <source> will be docx
  571. * archive data.
  572. */
  573. {
  574. int e = 0;
  575. fz_docx_writer *writer = handle;
  576. fz_var(e);
  577. fz_try(writer->ctx)
  578. {
  579. fz_write_data(writer->ctx, writer->output, source, numbytes);
  580. *o_actual = numbytes;
  581. }
  582. fz_catch(writer->ctx)
  583. {
  584. errno = EIO;
  585. e = -1;
  586. }
  587. return e;
  588. }
  589. static int buffer_cache(void *handle, void **o_cache, size_t *o_numbytes)
  590. /*
  591. * extract_buffer_t cache function. We simply return writer->output_cache.
  592. */
  593. {
  594. fz_docx_writer *writer = handle;
  595. *o_cache = writer->output_cache;
  596. *o_numbytes = sizeof(writer->output_cache);
  597. return 0;
  598. }
  599. static void writer_close(fz_context *ctx, fz_document_writer *writer_)
  600. {
  601. fz_docx_writer *writer = (fz_docx_writer*) writer_;
  602. extract_buffer_t *extract_buffer_output = NULL;
  603. fz_var(extract_buffer_output);
  604. fz_var(writer);
  605. assert(!writer->ctx);
  606. writer->ctx = ctx;
  607. fz_try(ctx)
  608. {
  609. /*
  610. * Write docx to writer->output. Need to create an
  611. * extract_buffer_t that writes to writer->output, for use by
  612. * extract_write().
  613. */
  614. if (extract_buffer_open(
  615. writer->alloc,
  616. writer,
  617. NULL /*fn_read*/,
  618. buffer_write,
  619. buffer_cache,
  620. NULL /*fn_close*/,
  621. &extract_buffer_output
  622. ))
  623. {
  624. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract_buffer_output: %s", strerror(errno));
  625. }
  626. if (extract_write(writer->extract, extract_buffer_output))
  627. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to generate docx content: %s", strerror(errno));
  628. if (extract_buffer_close(&extract_buffer_output))
  629. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to close extract_buffer: %s", strerror(errno));
  630. extract_end(&writer->extract);
  631. fz_close_output(ctx, writer->output);
  632. writer->ctx = NULL;
  633. }
  634. fz_catch(ctx)
  635. {
  636. /*
  637. * We don't call fz_close_output() because it can throw and in
  638. * this error case we can safely leave cleanup to our s_drop()
  639. * function's calls to fz_drop_output().
  640. */
  641. extract_buffer_close(&extract_buffer_output);
  642. extract_end(&writer->extract);
  643. writer->ctx = NULL;
  644. fz_rethrow(ctx);
  645. }
  646. }
  647. static void writer_drop(fz_context *ctx, fz_document_writer *writer_)
  648. {
  649. fz_docx_writer *writer = (fz_docx_writer*) writer_;
  650. fz_drop_output(ctx, writer->output);
  651. writer->output = NULL;
  652. assert(!writer->ctx);
  653. writer->ctx = ctx;
  654. extract_end(&writer->extract);
  655. extract_alloc_destroy(&writer->alloc);
  656. writer->ctx = NULL;
  657. }
  658. static int get_bool_option(fz_context *ctx, const char *options, const char *name, int default_)
  659. {
  660. const char *value;
  661. if (fz_has_option(ctx, options, name, &value))
  662. {
  663. if (fz_option_eq(value, "yes")) return 1;
  664. if (fz_option_eq(value, "no")) return 0;
  665. else fz_throw(ctx, FZ_ERROR_SYNTAX, "option '%s' should be yes or no in options='%s'", name, options);
  666. }
  667. else
  668. return default_;
  669. }
  670. static double get_double_option(fz_context *ctx, const char *options, const char *name, double default_)
  671. {
  672. const char *value;
  673. if (fz_has_option(ctx, options, name, &value))
  674. {
  675. double ret = atof(value);
  676. return ret;
  677. }
  678. else
  679. return default_;
  680. }
  681. static void *s_realloc_fn(void *state, void *prev, size_t size)
  682. {
  683. fz_docx_writer *writer = state;
  684. assert(writer);
  685. assert(writer->ctx);
  686. return fz_realloc_no_throw(writer->ctx, prev, size);
  687. }
  688. /* Will drop <out> if an error occurs. */
  689. static fz_document_writer *fz_new_docx_writer_internal(fz_context *ctx, fz_output *out,
  690. const char *options, extract_format_t format)
  691. {
  692. fz_docx_writer *writer = NULL;
  693. fz_var(writer);
  694. fz_try(ctx)
  695. {
  696. double space_guess = get_double_option(ctx, options, "space-guess", 0);
  697. writer = fz_new_derived_document_writer(
  698. ctx,
  699. fz_docx_writer,
  700. writer_begin_page,
  701. writer_end_page,
  702. writer_close,
  703. writer_drop
  704. );
  705. writer->ctx = ctx;
  706. writer->output = out;
  707. if (get_bool_option(ctx, options, "html", 0)) format = extract_format_HTML;
  708. if (get_bool_option(ctx, options, "text", 0)) format = extract_format_TEXT;
  709. if (get_bool_option(ctx, options, "json", 0)) format = extract_format_JSON;
  710. if (extract_alloc_create(s_realloc_fn, writer, &writer->alloc))
  711. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract_alloc instance");
  712. if (extract_begin(writer->alloc, format, &writer->extract))
  713. fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract instance");
  714. if (space_guess)
  715. extract_set_space_guess(writer->extract, space_guess);
  716. writer->spacing = get_bool_option(ctx, options, "spacing", 0);
  717. writer->rotation = get_bool_option(ctx, options, "rotation", 1);
  718. writer->images = get_bool_option(ctx, options, "images", 1);
  719. writer->mediabox_clip = get_bool_option(ctx, options, "mediabox-clip", 1);
  720. if (extract_set_layout_analysis(writer->extract, get_bool_option(ctx, options, "analyse", 0)))
  721. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_enable_analysis failed.");
  722. {
  723. const char* v;
  724. if (fz_has_option(ctx, options, "tables-csv-format", &v))
  725. {
  726. size_t len = strlen(v) + 1; /* Might include trailing options. */
  727. char* formatbuf = fz_malloc(ctx, len);
  728. fz_copy_option(ctx, v, formatbuf, len);
  729. fprintf(stderr, "tables-csv-format: %s\n", formatbuf);
  730. if (extract_tables_csv_format(writer->extract, formatbuf))
  731. {
  732. fz_free(ctx, formatbuf);
  733. fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_tables_csv_format() failed.");
  734. }
  735. fz_free(ctx, formatbuf);
  736. }
  737. }
  738. writer->ctx = NULL;
  739. }
  740. fz_catch(ctx)
  741. {
  742. /* fz_drop_document_writer() drops its output so we only need to call
  743. fz_drop_output() if we failed before creating the writer. */
  744. if (writer)
  745. {
  746. writer->ctx = ctx;
  747. fz_drop_document_writer(ctx, &writer->super);
  748. writer->ctx = NULL;
  749. }
  750. else
  751. fz_drop_output(ctx, out);
  752. fz_rethrow(ctx);
  753. }
  754. return &writer->super;
  755. }
  756. fz_document_writer *fz_new_docx_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
  757. {
  758. return fz_new_docx_writer_internal(ctx, out, options, extract_format_DOCX);
  759. }
  760. fz_document_writer *fz_new_docx_writer(fz_context *ctx, const char *path, const char *options)
  761. {
  762. /* No need to drop <out> if fz_new_docx_writer_internal() throws, because
  763. it always drops <out> if it fails. */
  764. fz_output *out = fz_new_output_with_path(ctx, path, 0 /*append*/);
  765. return fz_new_docx_writer_internal(ctx, out, options, extract_format_DOCX);
  766. }
  767. #if FZ_ENABLE_ODT_OUTPUT
  768. fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
  769. {
  770. return fz_new_docx_writer_internal(ctx, out, options, extract_format_ODT);
  771. }
  772. fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options)
  773. {
  774. /* No need to drop <out> if fz_new_docx_writer_internal() throws, because
  775. it always drops <out> if it fails. */
  776. fz_output *out = fz_new_output_with_path(ctx, path, 0 /*append*/);
  777. return fz_new_docx_writer_internal(ctx, out, options, extract_format_ODT);
  778. }
  779. #else
  780. fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
  781. {
  782. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "ODT writer not enabled");
  783. return NULL;
  784. }
  785. fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options)
  786. {
  787. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "ODT writer not enabled");
  788. return NULL;
  789. }
  790. #endif
  791. #else
  792. fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
  793. {
  794. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX/ODT writer not enabled");
  795. return NULL;
  796. }
  797. fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options)
  798. {
  799. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX/ODT writer not enabled");
  800. return NULL;
  801. }
  802. fz_document_writer *fz_new_docx_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
  803. {
  804. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX writer not enabled");
  805. return NULL;
  806. }
  807. fz_document_writer *fz_new_docx_writer(fz_context *ctx, const char *path, const char *options)
  808. {
  809. fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX writer not enabled");
  810. return NULL;
  811. }
  812. #endif