tesseract  3.05.02
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir)
 
 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
int imagenum () const
 

Protected Member Functions

virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)
 
virtual bool EndDocumentHandler ()
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 186 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer() [1/2]

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir 
)

Definition at line 186 of file pdfrenderer.cpp.

187  : TessResultRenderer(outputbase, "pdf") {
188  TessPDFRenderer(outputbase, datadir, false);
189 }
TessPDFRenderer(const char *outputbase, const char *datadir)
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:32

◆ TessPDFRenderer() [2/2]

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly 
)

Definition at line 191 of file pdfrenderer.cpp.

193  : TessResultRenderer(outputbase, "pdf") {
194  obj_ = 0;
195  datadir_ = datadir;
196  textonly_ = textonly;
197  offsets_.push_back(0);
198 }
int push_back(T object)
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:32

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api )
protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 853 of file pdfrenderer.cpp.

853  {
854  size_t n;
855  char buf[kBasicBufSize];
856  char buf2[kBasicBufSize];
857  Pix *pix = api->GetInputImage();
858  char *filename = (char *)api->GetInputName();
859  int ppi = api->GetSourceYResolution();
860  if (!pix || ppi <= 0)
861  return false;
862  double width = pixGetWidth(pix) * 72.0 / ppi;
863  double height = pixGetHeight(pix) * 72.0 / ppi;
864 
865  snprintf(buf2, sizeof(buf2), "/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
866  const char *xobject = (textonly_) ? "" : buf2;
867 
868  // PAGE
869  n = snprintf(buf, sizeof(buf),
870  "%ld 0 obj\n"
871  "<<\n"
872  " /Type /Page\n"
873  " /Parent %ld 0 R\n"
874  " /MediaBox [0 0 %.2f %.2f]\n"
875  " /Contents %ld 0 R\n"
876  " /Resources\n"
877  " <<\n"
878  " %s"
879  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
880  " /Font << /f-0-0 %ld 0 R >>\n"
881  " >>\n"
882  ">>\n"
883  "endobj\n",
884  obj_,
885  2L, // Pages object
886  width, height,
887  obj_ + 1, // Contents object
888  xobject, // Image object
889  3L); // Type0 Font
890  if (n >= sizeof(buf)) return false;
891  pages_.push_back(obj_);
892  AppendPDFObject(buf);
893 
894  // CONTENTS
895  char* pdftext = GetPDFTextObjects(api, width, height);
896  long pdftext_len = strlen(pdftext);
897  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
898  size_t len;
899  unsigned char *comp_pdftext =
900  zlibCompress(pdftext_casted, pdftext_len, &len);
901  long comp_pdftext_len = len;
902  n = snprintf(buf, sizeof(buf),
903  "%ld 0 obj\n"
904  "<<\n"
905  " /Length %ld /Filter /FlateDecode\n"
906  ">>\n"
907  "stream\n", obj_, comp_pdftext_len);
908  if (n >= sizeof(buf)) {
909  delete[] pdftext;
910  lept_free(comp_pdftext);
911  return false;
912  }
913  AppendString(buf);
914  long objsize = strlen(buf);
915  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
916  objsize += comp_pdftext_len;
917  lept_free(comp_pdftext);
918  delete[] pdftext;
919  const char *b2 =
920  "endstream\n"
921  "endobj\n";
922  AppendString(b2);
923  objsize += strlen(b2);
924  AppendPDFObjectDIY(objsize);
925 
926  if (!textonly_) {
927  char *pdf_object = NULL;
928  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
929  return false;
930  }
931  AppendData(pdf_object, objsize);
932  AppendPDFObjectDIY(objsize);
933  delete[] pdf_object;
934  }
935  return true;
936 }
int push_back(T object)
void AppendString(const char *s)
Definition: renderer.cpp:101
void AppendData(const char *s, int len)
Definition: renderer.cpp:105

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 506 of file pdfrenderer.cpp.

506  {
507  char buf[kBasicBufSize];
508  size_t n;
509 
510  n = snprintf(buf, sizeof(buf),
511  "%%PDF-1.5\n"
512  "%%%c%c%c%c\n",
513  0xDE, 0xAD, 0xBE, 0xEB);
514  if (n >= sizeof(buf)) return false;
515  AppendPDFObject(buf);
516 
517  // CATALOG
518  n = snprintf(buf, sizeof(buf),
519  "1 0 obj\n"
520  "<<\n"
521  " /Type /Catalog\n"
522  " /Pages %ld 0 R\n"
523  ">>\n"
524  "endobj\n",
525  2L);
526  if (n >= sizeof(buf)) return false;
527  AppendPDFObject(buf);
528 
529  // We are reserving object #2 for the /Pages
530  // object, which I am going to create and write
531  // at the end of the PDF file.
532  AppendPDFObject("");
533 
534  // TYPE0 FONT
535  n = snprintf(buf, sizeof(buf),
536  "3 0 obj\n"
537  "<<\n"
538  " /BaseFont /GlyphLessFont\n"
539  " /DescendantFonts [ %ld 0 R ]\n"
540  " /Encoding /Identity-H\n"
541  " /Subtype /Type0\n"
542  " /ToUnicode %ld 0 R\n"
543  " /Type /Font\n"
544  ">>\n"
545  "endobj\n",
546  4L, // CIDFontType2 font
547  6L // ToUnicode
548  );
549  if (n >= sizeof(buf)) return false;
550  AppendPDFObject(buf);
551 
552  // CIDFONTTYPE2
553  n = snprintf(buf, sizeof(buf),
554  "4 0 obj\n"
555  "<<\n"
556  " /BaseFont /GlyphLessFont\n"
557  " /CIDToGIDMap %ld 0 R\n"
558  " /CIDSystemInfo\n"
559  " <<\n"
560  " /Ordering (Identity)\n"
561  " /Registry (Adobe)\n"
562  " /Supplement 0\n"
563  " >>\n"
564  " /FontDescriptor %ld 0 R\n"
565  " /Subtype /CIDFontType2\n"
566  " /Type /Font\n"
567  " /DW %d\n"
568  ">>\n"
569  "endobj\n",
570  5L, // CIDToGIDMap
571  7L, // Font descriptor
572  1000 / kCharWidth);
573  if (n >= sizeof(buf)) return false;
574  AppendPDFObject(buf);
575 
576  // CIDTOGIDMAP
577  const int kCIDToGIDMapSize = 2 * (1 << 16);
578  unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize];
579  for (int i = 0; i < kCIDToGIDMapSize; i++) {
580  cidtogidmap[i] = (i % 2) ? 1 : 0;
581  }
582  size_t len;
583  unsigned char *comp =
584  zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
585  delete[] cidtogidmap;
586  n = snprintf(buf, sizeof(buf),
587  "5 0 obj\n"
588  "<<\n"
589  " /Length %lu /Filter /FlateDecode\n"
590  ">>\n"
591  "stream\n",
592  (unsigned long)len);
593  if (n >= sizeof(buf)) {
594  lept_free(comp);
595  return false;
596  }
597  AppendString(buf);
598  long objsize = strlen(buf);
599  AppendData(reinterpret_cast<char *>(comp), len);
600  objsize += len;
601  lept_free(comp);
602  const char *endstream_endobj =
603  "endstream\n"
604  "endobj\n";
605  AppendString(endstream_endobj);
606  objsize += strlen(endstream_endobj);
607  AppendPDFObjectDIY(objsize);
608 
609  const char *stream =
610  "/CIDInit /ProcSet findresource begin\n"
611  "12 dict begin\n"
612  "begincmap\n"
613  "/CIDSystemInfo\n"
614  "<<\n"
615  " /Registry (Adobe)\n"
616  " /Ordering (UCS)\n"
617  " /Supplement 0\n"
618  ">> def\n"
619  "/CMapName /Adobe-Identify-UCS def\n"
620  "/CMapType 2 def\n"
621  "1 begincodespacerange\n"
622  "<0000> <FFFF>\n"
623  "endcodespacerange\n"
624  "1 beginbfrange\n"
625  "<0000> <FFFF> <0000>\n"
626  "endbfrange\n"
627  "endcmap\n"
628  "CMapName currentdict /CMap defineresource pop\n"
629  "end\n"
630  "end\n";
631 
632  // TOUNICODE
633  n = snprintf(buf, sizeof(buf),
634  "6 0 obj\n"
635  "<< /Length %lu >>\n"
636  "stream\n"
637  "%s"
638  "endstream\n"
639  "endobj\n", (unsigned long) strlen(stream), stream);
640  if (n >= sizeof(buf)) return false;
641  AppendPDFObject(buf);
642 
643  // FONT DESCRIPTOR
644  n = snprintf(buf, sizeof(buf),
645  "7 0 obj\n"
646  "<<\n"
647  " /Ascent %d\n"
648  " /CapHeight %d\n"
649  " /Descent -1\n" // Spec says must be negative
650  " /Flags 5\n" // FixedPitch + Symbolic
651  " /FontBBox [ 0 0 %d %d ]\n"
652  " /FontFile2 %ld 0 R\n"
653  " /FontName /GlyphLessFont\n"
654  " /ItalicAngle 0\n"
655  " /StemV 80\n"
656  " /Type /FontDescriptor\n"
657  ">>\n"
658  "endobj\n",
659  1000,
660  1000,
661  1000 / kCharWidth,
662  1000,
663  8L // Font data
664  );
665  if (n >= sizeof(buf)) return false;
666  AppendPDFObject(buf);
667 
668  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
669  if (n >= sizeof(buf)) return false;
670  FILE *fp = fopen(buf, "rb");
671  if (!fp) {
672  tprintf("Can not open file \"%s\"!\n", buf);
673  return false;
674  }
675  fseek(fp, 0, SEEK_END);
676  long int size = ftell(fp);
677  fseek(fp, 0, SEEK_SET);
678  char *buffer = new char[size];
679  if (fread(buffer, 1, size, fp) != size) {
680  fclose(fp);
681  delete[] buffer;
682  return false;
683  }
684  fclose(fp);
685  // FONTFILE2
686  n = snprintf(buf, sizeof(buf),
687  "8 0 obj\n"
688  "<<\n"
689  " /Length %ld\n"
690  " /Length1 %ld\n"
691  ">>\n"
692  "stream\n", size, size);
693  if (n >= sizeof(buf)) {
694  delete[] buffer;
695  return false;
696  }
697  AppendString(buf);
698  objsize = strlen(buf);
699  AppendData(buffer, size);
700  delete[] buffer;
701  objsize += size;
702  AppendString(endstream_endobj);
703  objsize += strlen(endstream_endobj);
704  AppendPDFObjectDIY(objsize);
705  return true;
706 }
#define tprintf(...)
Definition: tprintf.h:31
void AppendString(const char *s)
Definition: renderer.cpp:101
void AppendData(const char *s, int len)
Definition: renderer.cpp:105

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 939 of file pdfrenderer.cpp.

939  {
940  size_t n;
941  char buf[kBasicBufSize];
942 
943  // We reserved the /Pages object number early, so that the /Page
944  // objects could refer to their parent. We finally have enough
945  // information to go fill it in. Using lower level calls to manipulate
946  // the offset record in two spots, because we are placing objects
947  // out of order in the file.
948 
949  // PAGES
950  const long int kPagesObjectNumber = 2;
951  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
952  n = snprintf(buf, sizeof(buf),
953  "%ld 0 obj\n"
954  "<<\n"
955  " /Type /Pages\n"
956  " /Kids [ ", kPagesObjectNumber);
957  if (n >= sizeof(buf)) return false;
958  AppendString(buf);
959  size_t pages_objsize = strlen(buf);
960  for (size_t i = 0; i < pages_.size(); i++) {
961  n = snprintf(buf, sizeof(buf),
962  "%ld 0 R ", pages_[i]);
963  if (n >= sizeof(buf)) return false;
964  AppendString(buf);
965  pages_objsize += strlen(buf);
966  }
967  n = snprintf(buf, sizeof(buf),
968  "]\n"
969  " /Count %d\n"
970  ">>\n"
971  "endobj\n", pages_.size());
972  if (n >= sizeof(buf)) return false;
973  AppendString(buf);
974  pages_objsize += strlen(buf);
975  offsets_.back() += pages_objsize; // manipulation #2
976 
977  // INFO
978  STRING utf16_title = "FEFF"; // byte_order_marker
979  GenericVector<int> unicodes;
980  UNICHAR::UTF8ToUnicode(title(), &unicodes);
981  char utf16[kMaxBytesPerCodepoint];
982  for (int i = 0; i < unicodes.length(); i++) {
983  int code = unicodes[i];
984  if (CodepointToUtf16be(code, utf16)) {
985  utf16_title += utf16;
986  }
987  }
988 
989  char* datestr = l_getFormattedDate();
990  n = snprintf(buf, sizeof(buf),
991  "%ld 0 obj\n"
992  "<<\n"
993  " /Producer (Tesseract %s)\n"
994  " /CreationDate (D:%s)\n"
995  " /Title <%s>\n"
996  ">>\n"
997  "endobj\n",
998  obj_, TESSERACT_VERSION_STR, datestr, utf16_title.c_str());
999  lept_free(datestr);
1000  if (n >= sizeof(buf)) return false;
1001  AppendPDFObject(buf);
1002  n = snprintf(buf, sizeof(buf),
1003  "xref\n"
1004  "0 %ld\n"
1005  "0000000000 65535 f \n", obj_);
1006  if (n >= sizeof(buf)) return false;
1007  AppendString(buf);
1008  for (int i = 1; i < obj_; i++) {
1009  n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
1010  if (n >= sizeof(buf)) return false;
1011  AppendString(buf);
1012  }
1013  n = snprintf(buf, sizeof(buf),
1014  "trailer\n"
1015  "<<\n"
1016  " /Size %ld\n"
1017  " /Root %ld 0 R\n"
1018  " /Info %ld 0 R\n"
1019  ">>\n"
1020  "startxref\n"
1021  "%ld\n"
1022  "%%%%EOF\n",
1023  obj_,
1024  1L, // catalog
1025  obj_ - 1, // info
1026  offsets_.back());
1027  if (n >= sizeof(buf)) return false;
1028  AppendString(buf);
1029  return true;
1030 }
bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint])
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
T & back() const
const char * title() const
Definition: renderer.h:81
const char * c_str() const
Definition: strngs.cpp:212
Definition: strngs.h:44
void AppendString(const char *s)
Definition: renderer.cpp:101
int size() const
Definition: genericvector.h:72
int length() const
Definition: genericvector.h:79
#define TESSERACT_VERSION_STR
Definition: baseapi.h:23

The documentation for this class was generated from the following files: