23 #include "allheaders.h" 172 static const int kBasicBufSize = 2048;
175 static const int kCharWidth = 2;
180 static const int kMaxBytesPerCodepoint = 20;
196 textonly_ = textonly;
200 void TessPDFRenderer::AppendPDFObjectDIY(
size_t objectsize) {
205 void TessPDFRenderer::AppendPDFObject(
const char *data) {
206 AppendPDFObjectDIY(strlen(data));
214 double kPrecision = 1000.0;
215 double a = round(x * kPrecision) / kPrecision;
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
//
// The coordinate differences are widened to long *before* they are squared.
// Squaring them as int and only converting the sum to long (as the return
// type would otherwise do) overflows once the points are far enough apart
// that a squared difference no longer fits in an int.
long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
234 int word_x1,
int word_y1,
int word_x2,
int word_y2,
235 int line_x1,
int line_y1,
int line_x2,
int line_y2,
236 double *x0,
double *y0,
double *length) {
238 Swap(&word_x1, &word_x2);
239 Swap(&word_y1, &word_y2);
// l2 is the squared length of the text-line baseline segment
// (line_x1, line_y1)-(line_x2, line_y2); dist2 returns a squared distance.
246 double l2 =
dist2(line_x1, line_y1, line_x2, line_y2);
// Orthogonal projection of the point (px, py) onto the line through the two
// baseline end points: t is the dot product of (p - line_end2) with the
// baseline direction vector, normalised by the squared baseline length, and
// (x, y) is the resulting foot point on that line.
// NOTE(review): l2 is zero for a degenerate baseline whose end points
// coincide; whether that case is guarded before this division is not visible
// in this excerpt - confirm.
251 double t = ((px - line_x2) * (line_x2 - line_x1) +
252 (py - line_y2) * (line_y2 - line_y1)) / l2;
253 x = line_x2 + t * (line_x2 - line_x1);
254 y = line_y2 + t * (line_y2 - line_y1);
256 word_length = sqrt(static_cast<double>(
dist2(word_x1, word_y1,
258 word_length = word_length * 72.0 / ppi;
260 y = height - (y * 72.0 / ppi);
264 *length = word_length;
276 int line_x1,
int line_y1,
int line_x2,
int line_y2,
277 double *a,
double *b,
double *c,
double *d) {
278 double theta = atan2(static_cast<double>(line_y1 - line_y2),
279 static_cast<double>(line_x2 - line_x1));
284 switch(writing_direction) {
305 int *line_x1,
int *line_y1,
306 int *line_x2,
int *line_y2) {
311 double rise = abs(y2 - y1) * 72 / ppi;
312 double run = abs(x2 - x1) * 72 / ppi;
313 if (rise < 2.0 && 2.0 < run)
314 *line_y1 = *line_y2 = (y1 + y2) / 2;
// Reject values that cannot be encoded: the UTF-16 surrogate range
// 0xD800-0xDFFF and anything above the last Unicode codepoint 0x10FFFF.
// NOTE(review): code is a signed int and negative values are not caught by
// this condition - confirm callers never pass one.
318 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
319 tprintf(
"Dropping invalid codepoint %d\n", code);
// Values below 0x10000 are written as a single UTF-16 code unit, formatted
// as at least four uppercase hexadecimal digits.
322 if (code < 0x10000) {
323 snprintf(utf16, kMaxBytesPerCodepoint,
"%04X", code);
// Surrogate-pair arithmetic (presumably the branch for codes >= 0x10000; the
// branch keyword itself is not visible in this excerpt): a is the offset
// above 0x10000, its upper 10 bits are added to 0xD800 and its lower 10 bits
// to 0xDC00, and both units are written back to back in hexadecimal.
325 int a = code - 0x010000;
326 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
327 int low_surrogate = (0x03FF & a) + 0xDC00;
328 snprintf(utf16, kMaxBytesPerCodepoint,
329 "%04X%04X", high_surrogate, low_surrogate);
335 double width,
double height) {
337 double ppi =
api->GetSourceYResolution();
340 double old_x = 0.0, old_y = 0.0;
341 int old_fontsize = 0;
344 bool new_block =
true;
355 pdf_str.add_str_double(
"",
prec(width));
357 pdf_str.add_str_double(
"",
prec(height));
358 pdf_str +=
" 0 0 cm";
360 pdf_str +=
" /Im1 Do";
369 ResultIterator *res_it =
api->GetIterator();
371 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
372 pdf_str +=
"BT\n3 Tr";
380 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
394 res_it->Orientation(&orientation, &writing_direction,
395 &textline_order, &deskew_angle);
397 switch (res_it->WordDirection()) {
405 writing_direction = old_writing_direction;
411 double x, y, word_length;
413 int word_x1, word_y1, word_x2, word_y2;
414 res_it->Baseline(
RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
416 word_x1, word_y1, word_x2, word_y2,
417 line_x1, line_y1, line_x2, line_y2,
418 &x, &y, &word_length);
421 if (writing_direction != old_writing_direction || new_block) {
423 line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
424 pdf_str.add_str_double(
" ",
prec(a));
425 pdf_str.add_str_double(
" ",
prec(b));
426 pdf_str.add_str_double(
" ",
prec(c));
427 pdf_str.add_str_double(
" ",
prec(d));
428 pdf_str.add_str_double(
" ",
prec(x));
429 pdf_str.add_str_double(
" ",
prec(y));
433 double dx = x - old_x;
434 double dy = y - old_y;
435 pdf_str.add_str_double(
" ",
prec(dx * a + dy * b));
436 pdf_str.add_str_double(
" ",
prec(dx * c + dy * d));
441 old_writing_direction = writing_direction;
448 bool bold, italic, underlined, monospace, serif, smallcaps;
450 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
451 &serif, &smallcaps, &fontsize, &font_id);
452 const int kDefaultFontsize = 8;
454 fontsize = kDefaultFontsize;
455 if (fontsize != old_fontsize) {
457 snprintf(textfont,
sizeof(textfont),
"/f-0-0 %d Tf ", fontsize);
459 old_fontsize = fontsize;
466 int pdf_word_len = 0;
468 const char *grapheme = res_it->GetUTF8Text(
RIL_SYMBOL);
469 if (grapheme && grapheme[0] !=
'\0') {
472 char utf16[kMaxBytesPerCodepoint];
473 for (
int i = 0; i < unicodes.
length(); i++) {
474 int code = unicodes[i];
484 if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
486 kCharWidth *
prec(100.0 * word_length / (fontsize * pdf_word_len));
487 pdf_str.add_str_double(
"", h_stretch);
493 if (last_word_in_line) {
496 if (last_word_in_block) {
500 char *ret =
new char[pdf_str.length() + 1];
501 strcpy(ret, pdf_str.string());
507 char buf[kBasicBufSize];
510 n = snprintf(buf,
sizeof(buf),
513 0xDE, 0xAD, 0xBE, 0xEB);
514 if (n >=
sizeof(buf))
return false;
515 AppendPDFObject(buf);
518 n = snprintf(buf,
sizeof(buf),
526 if (n >=
sizeof(buf))
return false;
527 AppendPDFObject(buf);
535 n = snprintf(buf,
sizeof(buf),
538 " /BaseFont /GlyphLessFont\n" 539 " /DescendantFonts [ %ld 0 R ]\n" 540 " /Encoding /Identity-H\n" 542 " /ToUnicode %ld 0 R\n" 549 if (n >=
sizeof(buf))
return false;
550 AppendPDFObject(buf);
553 n = snprintf(buf,
sizeof(buf),
556 " /BaseFont /GlyphLessFont\n" 557 " /CIDToGIDMap %ld 0 R\n" 560 " /Ordering (Identity)\n" 561 " /Registry (Adobe)\n" 564 " /FontDescriptor %ld 0 R\n" 565 " /Subtype /CIDFontType2\n" 573 if (n >=
sizeof(buf))
return false;
574 AppendPDFObject(buf);
577 const int kCIDToGIDMapSize = 2 * (1 << 16);
578 unsigned char *cidtogidmap =
new unsigned char[kCIDToGIDMapSize];
579 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
580 cidtogidmap[i] = (i % 2) ? 1 : 0;
583 unsigned char *comp =
584 zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
585 delete[] cidtogidmap;
586 n = snprintf(buf,
sizeof(buf),
589 " /Length %lu /Filter /FlateDecode\n" 593 if (n >=
sizeof(buf)) {
598 long objsize = strlen(buf);
599 AppendData(reinterpret_cast<char *>(comp), len);
602 const char *endstream_endobj =
606 objsize += strlen(endstream_endobj);
607 AppendPDFObjectDIY(objsize);
610 "/CIDInit /ProcSet findresource begin\n" 615 " /Registry (Adobe)\n" 619 "/CMapName /Adobe-Identify-UCS def\n" 621 "1 begincodespacerange\n" 623 "endcodespacerange\n" 625 "<0000> <FFFF> <0000>\n" 628 "CMapName currentdict /CMap defineresource pop\n" 633 n = snprintf(buf,
sizeof(buf),
635 "<< /Length %lu >>\n" 639 "endobj\n", (
unsigned long) strlen(stream), stream);
640 if (n >=
sizeof(buf))
return false;
641 AppendPDFObject(buf);
644 n = snprintf(buf,
sizeof(buf),
651 " /FontBBox [ 0 0 %d %d ]\n" 652 " /FontFile2 %ld 0 R\n" 653 " /FontName /GlyphLessFont\n" 656 " /Type /FontDescriptor\n" 665 if (n >=
sizeof(buf))
return false;
666 AppendPDFObject(buf);
668 n = snprintf(buf,
sizeof(buf),
"%s/pdf.ttf", datadir_);
669 if (n >=
sizeof(buf))
return false;
670 FILE *fp = fopen(buf,
"rb");
672 tprintf(
"Can not open file \"%s\"!\n", buf);
675 fseek(fp, 0, SEEK_END);
676 long int size = ftell(fp);
677 fseek(fp, 0, SEEK_SET);
678 char *buffer =
new char[size];
679 if (fread(buffer, 1, size, fp) != size) {
686 n = snprintf(buf,
sizeof(buf),
692 "stream\n", size, size);
693 if (n >=
sizeof(buf)) {
698 objsize = strlen(buf);
703 objsize += strlen(endstream_endobj);
704 AppendPDFObjectDIY(objsize);
708 bool TessPDFRenderer::imageToPDFObj(Pix *pix,
712 long int *pdf_object_size) {
714 char b0[kBasicBufSize];
715 char b1[kBasicBufSize];
716 char b2[kBasicBufSize];
717 if (!pdf_object_size || !pdf_object)
720 *pdf_object_size = 0;
724 L_Compressed_Data *cid = NULL;
725 const int kJpegQuality = 85;
729 if (pixGetSpp(pix) == 4 && format == IFF_PNG) {
730 Pix *p1 = pixAlphaBlendUniform(pix, 0xffffff00);
731 sad = pixGenerateCIData(p1, L_FLATE_ENCODE, 0, 0, &cid);
734 sad = l_generateCIDataForPdf(
filename, pix, kJpegQuality, &cid);
738 l_CIDataDestroy(&cid);
742 const char *group4 =
"";
746 filter =
"/FlateDecode";
749 filter =
"/DCTDecode";
752 filter =
"/CCITTFaxDecode";
756 filter =
"/JPXDecode";
759 l_CIDataDestroy(&cid);
766 const char *colorspace;
767 if (cid->ncolors > 0) {
768 n = snprintf(b0,
sizeof(b0),
769 " /ColorSpace [ /Indexed /DeviceRGB %d %s ]\n",
770 cid->ncolors - 1, cid->cmapdatahex);
771 if (n >=
sizeof(b0)) {
772 l_CIDataDestroy(&cid);
779 colorspace =
" /ColorSpace /DeviceGray\n";
782 colorspace =
" /ColorSpace /DeviceRGB\n";
785 l_CIDataDestroy(&cid);
790 int predictor = (cid->predictor) ? 14 : 1;
793 n = snprintf(b1,
sizeof(b1),
797 " /Subtype /Image\n",
798 objnum, (
unsigned long) cid->nbytescomp);
799 if (n >=
sizeof(b1)) {
800 l_CIDataDestroy(&cid);
804 n = snprintf(b2,
sizeof(b2),
807 " /BitsPerComponent %d\n" 815 " /BitsPerComponent %d\n" 819 cid->w, cid->h, cid->bps, filter, predictor, cid->spp,
820 group4, cid->w, cid->bps);
821 if (n >=
sizeof(b2)) {
822 l_CIDataDestroy(&cid);
830 size_t b1_len = strlen(b1);
831 size_t b2_len = strlen(b2);
832 size_t b3_len = strlen(b3);
833 size_t colorspace_len = strlen(colorspace);
836 b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
837 *pdf_object =
new char[*pdf_object_size];
839 char *p = *pdf_object;
840 memcpy(p, b1, b1_len);
842 memcpy(p, colorspace, colorspace_len);
844 memcpy(p, b2, b2_len);
846 memcpy(p, cid->datacomp, cid->nbytescomp);
847 p += cid->nbytescomp;
848 memcpy(p, b3, b3_len);
849 l_CIDataDestroy(&cid);
855 char buf[kBasicBufSize];
856 char buf2[kBasicBufSize];
857 Pix *pix =
api->GetInputImage();
859 int ppi =
api->GetSourceYResolution();
860 if (!pix || ppi <= 0)
862 double width = pixGetWidth(pix) * 72.0 / ppi;
863 double height = pixGetHeight(pix) * 72.0 / ppi;
865 snprintf(buf2,
sizeof(buf2),
"/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
866 const char *xobject = (textonly_) ?
"" : buf2;
869 n = snprintf(buf,
sizeof(buf),
874 " /MediaBox [0 0 %.2f %.2f]\n" 875 " /Contents %ld 0 R\n" 879 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" 880 " /Font << /f-0-0 %ld 0 R >>\n" 890 if (n >=
sizeof(buf))
return false;
892 AppendPDFObject(buf);
895 char* pdftext = GetPDFTextObjects(
api, width, height);
896 long pdftext_len = strlen(pdftext);
897 unsigned char *pdftext_casted =
reinterpret_cast<unsigned char *
>(pdftext);
899 unsigned char *comp_pdftext =
900 zlibCompress(pdftext_casted, pdftext_len, &len);
901 long comp_pdftext_len = len;
902 n = snprintf(buf,
sizeof(buf),
905 " /Length %ld /Filter /FlateDecode\n" 907 "stream\n", obj_, comp_pdftext_len);
908 if (n >=
sizeof(buf)) {
910 lept_free(comp_pdftext);
914 long objsize = strlen(buf);
915 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
916 objsize += comp_pdftext_len;
917 lept_free(comp_pdftext);
923 objsize += strlen(b2);
924 AppendPDFObjectDIY(objsize);
927 char *pdf_object = NULL;
928 if (!imageToPDFObj(pix,
filename, obj_, &pdf_object, &objsize)) {
932 AppendPDFObjectDIY(objsize);
941 char buf[kBasicBufSize];
950 const long int kPagesObjectNumber = 2;
951 offsets_[kPagesObjectNumber] = offsets_.
back();
952 n = snprintf(buf,
sizeof(buf),
956 " /Kids [ ", kPagesObjectNumber);
957 if (n >=
sizeof(buf))
return false;
959 size_t pages_objsize = strlen(buf);
960 for (
size_t i = 0; i < pages_.
size(); i++) {
961 n = snprintf(buf,
sizeof(buf),
962 "%ld 0 R ", pages_[i]);
963 if (n >=
sizeof(buf))
return false;
965 pages_objsize += strlen(buf);
967 n = snprintf(buf,
sizeof(buf),
971 "endobj\n", pages_.
size());
972 if (n >=
sizeof(buf))
return false;
974 pages_objsize += strlen(buf);
975 offsets_.
back() += pages_objsize;
978 STRING utf16_title =
"FEFF";
981 char utf16[kMaxBytesPerCodepoint];
982 for (
int i = 0; i < unicodes.
length(); i++) {
983 int code = unicodes[i];
985 utf16_title += utf16;
989 char* datestr = l_getFormattedDate();
990 n = snprintf(buf,
sizeof(buf),
993 " /Producer (Tesseract %s)\n" 994 " /CreationDate (D:%s)\n" 1000 if (n >=
sizeof(buf))
return false;
1001 AppendPDFObject(buf);
1002 n = snprintf(buf,
sizeof(buf),
1005 "0000000000 65535 f \n", obj_);
1006 if (n >=
sizeof(buf))
return false;
1008 for (
int i = 1; i < obj_; i++) {
1009 n = snprintf(buf,
sizeof(buf),
"%010ld 00000 n \n", offsets_[i]);
1010 if (n >=
sizeof(buf))
return false;
1013 n = snprintf(buf,
sizeof(buf),
1027 if (n >=
sizeof(buf))
return false;
void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2, double *a, double *b, double *c, double *d)
bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint])
struct TessBaseAPI TessBaseAPI
void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1, int word_x2, int word_y2, int line_x1, int line_y1, int line_x2, int line_y2, double *x0, double *y0, double *length)
TessPDFRenderer(const char *outputbase, const char *datadir)
long dist2(int x1, int y1, int x2, int y2)
virtual bool AddImageHandler(TessBaseAPI *api)
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
virtual bool BeginDocumentHandler()
virtual bool EndDocumentHandler()
const char * title() const
const char * c_str() const
void AppendString(const char *s)
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1, int *line_x2, int *line_y2)
void AppendData(const char *s, int len)
#define TESSERACT_VERSION_STR