tesseract  3.05.02
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 ~TessdataManager ()
 
int DebugLevel ()
 
bool Init (const char *data_file_name, int debug_level)
 
const STRING & GetDataFileName () const
 
FILE * GetDataFilePtr () const
 
bool SeekToStart (TessdataType tessdata_type)
 
inT64 GetEndOffset (TessdataType tessdata_type) const
 
void End ()
 
bool swap () const
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Static Public Member Functions

static bool WriteMetadata (inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
 
static bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
static void CopyFile (FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
 
static bool TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type, bool *text_file)
 
static bool TessdataTypeFromFileName (const char *filename, TessdataType *type, bool *text_file)
 

Detailed Description

Definition at line 133 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager()

tesseract::TessdataManager::TessdataManager ( )
inline

Definition at line 135 of file tessdatamanager.h.

135  {
136  data_file_ = NULL;
137  actual_tessdata_num_entries_ = 0;
138  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
139  offset_table_[i] = -1;
140  }
141  }

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
inline

Definition at line 142 of file tessdatamanager.h.

142 {}

Member Function Documentation

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)
static

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 127 of file tessdatamanager.cpp.

129  {
130  int i;
131  inT64 offset_table[TESSDATA_NUM_ENTRIES];
132  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
133  FILE *output_file = fopen(output_filename, "wb");
134  if (output_file == NULL) {
135  tprintf("Error opening %s for writing\n", output_filename);
136  return false;
137  }
138  // Leave some space for recording the offset_table.
139  if (fseek(output_file,
140  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
141  tprintf("Error seeking %s\n", output_filename);
142  fclose(output_file);
143  return false;
144  }
145 
147  bool text_file = false;
148  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
149 
150  // Load individual tessdata components from files.
151  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
153  kTessdataFileSuffixes[i], &type, &text_file));
154  STRING filename = language_data_path_prefix;
155  filename += kTessdataFileSuffixes[i];
156  file_ptr[i] = fopen(filename.string(), "rb");
157  if (file_ptr[i] != NULL) {
158  offset_table[type] = ftell(output_file);
159  CopyFile(file_ptr[i], output_file, text_file, -1);
160  fclose(file_ptr[i]);
161  }
162  }
163 
164  // Make sure that the required components are present.
165  if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
166  tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
167  fclose(output_file);
168  return false;
169  }
170  if (file_ptr[TESSDATA_INTTEMP] != NULL &&
171  (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
172  file_ptr[TESSDATA_NORMPROTO] == NULL)) {
173  tprintf("Error opening %spffmtable and/or %snormproto files"
174  " while %sinttemp file was present\n", language_data_path_prefix,
175  language_data_path_prefix, language_data_path_prefix);
176  fclose(output_file);
177  return false;
178  }
179 
180  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
181 }
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
long long int inT64
Definition: host.h:41
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CopyFile()

void tesseract::TessdataManager::CopyFile ( FILE *  input_file,
FILE *  output_file,
bool  newline_end,
inT64  num_bytes_to_copy 
)
static

Copies data from the given input file to the output_file provided. If num_bytes_to_copy is >= 0, only num_bytes_to_copy bytes are copied from the input file; otherwise all the data in the input file is copied.

Definition at line 74 of file tessdatamanager.cpp.

75  {
76  if (num_bytes_to_copy == 0) return;
77  int buffer_size = 1024;
78  if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
79  buffer_size = num_bytes_to_copy;
80  }
81  inT64 num_bytes_copied = 0;
82  char *chunk = new char[buffer_size];
83  int bytes_read;
84  char last_char = 0x0;
85  while ((bytes_read = fread(chunk, sizeof(char),
86  buffer_size, input_file))) {
87  fwrite(chunk, sizeof(char), bytes_read, output_file);
88  last_char = chunk[bytes_read-1];
89  if (num_bytes_to_copy > 0) {
90  num_bytes_copied += bytes_read;
91  if (num_bytes_copied == num_bytes_to_copy) break;
92  if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
93  buffer_size = num_bytes_to_copy - num_bytes_copied;
94  }
95  }
96  }
97  if (newline_end) ASSERT_HOST(last_char == '\n');
98  delete[] chunk;
99 }
long long int inT64
Definition: host.h:41
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ DebugLevel()

int tesseract::TessdataManager::DebugLevel ( )
inline

Definition at line 143 of file tessdatamanager.h.

143 { return debug_level_; }

◆ End()

void tesseract::TessdataManager::End ( )
inline

Closes data_file_ (if it was opened by Init()).

Definition at line 192 of file tessdatamanager.h.

192  {
193  if (data_file_ != NULL) {
194  fclose(data_file_);
195  data_file_ = NULL;
196  }
197  }

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 259 of file tessdatamanager.cpp.

259  {
261  bool text_file = false;
263  filename, &type, &text_file));
264  if (!SeekToStart(type)) return false;
265 
266  FILE *output_file = fopen(filename, "wb");
267  if (output_file == NULL) {
268  tprintf("Error opening %s\n", filename);
269  exit(1);
270  }
271  inT64 begin_offset = ftell(GetDataFilePtr());
272  inT64 end_offset = GetEndOffset(type);
274  GetDataFilePtr(), output_file, text_file,
275  end_offset - begin_offset + 1);
276  fclose(output_file);
277  return true;
278 }
inT64 GetEndOffset(TessdataType tessdata_type) const
long long int inT64
Definition: host.h:41
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
#define tprintf(...)
Definition: tprintf.h:31
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
bool SeekToStart(TessdataType tessdata_type)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetDataFileName()

const STRING& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 152 of file tessdatamanager.h.

152 { return data_file_name_; }

◆ GetDataFilePtr()

FILE* tesseract::TessdataManager::GetDataFilePtr ( ) const
inline

Returns data file pointer.

Definition at line 155 of file tessdatamanager.h.

155 { return data_file_; }

◆ GetEndOffset()

inT64 tesseract::TessdataManager::GetEndOffset ( TessdataType  tessdata_type) const
inline

Returns the end offset for the given tesseract data file type.

Definition at line 178 of file tessdatamanager.h.

178  {
179  int index = tessdata_type + 1;
180  while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
181  ++index; // skip tessdata types not present in the combined file
182  }
183  if (debug_level_) {
184  tprintf("TessdataManager: end offset for type %d is %lld\n",
185  tessdata_type,
186  (index == actual_tessdata_num_entries_) ? -1
187  : offset_table_[index]);
188  }
189  return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
190  }
#define tprintf(...)
Definition: tprintf.h:31

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name,
int  debug_level 
)

Opens the given data file and reads the offset table.

Returns
true on success.

Definition at line 36 of file tessdatamanager.cpp.

36  {
37  int i;
38  debug_level_ = debug_level;
39  data_file_name_ = data_file_name;
40  data_file_ = fopen(data_file_name, "rb");
41  if (data_file_ == NULL) {
42  tprintf("Error opening data file %s\n", data_file_name);
43  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
44  "to the parent directory of your \"tessdata\" directory.\n");
45  return false;
46  }
47  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
48  swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
49  if (swap_) {
50  ReverseN(&actual_tessdata_num_entries_,
51  sizeof(actual_tessdata_num_entries_));
52  }
53  if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
54  // For forward compatibility, truncate to the number we can handle.
55  actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
56  }
57  fread(offset_table_, sizeof(inT64),
58  actual_tessdata_num_entries_, data_file_);
59  if (swap_) {
60  for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
61  ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
62  }
63  }
64  if (debug_level_) {
65  tprintf("TessdataManager loaded %d types of tesseract data files.\n",
66  actual_tessdata_num_entries_);
67  for (i = 0; i < actual_tessdata_num_entries_; ++i) {
68  tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
69  }
70  }
71  return true;
72 }
long long int inT64
Definition: host.h:41
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 183 of file tessdatamanager.cpp.

186  {
187  int i;
188  inT64 offset_table[TESSDATA_NUM_ENTRIES];
190  bool text_file = false;
191  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
192  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
193  offset_table[i] = -1;
194  file_ptr[i] = NULL;
195  }
196  FILE *output_file = fopen(new_traineddata_filename, "wb");
197  if (output_file == NULL) {
198  tprintf("Error opening %s for writing\n", new_traineddata_filename);
199  return false;
200  }
201 
202  // Leave some space for recording the offset_table.
203  if (fseek(output_file,
204  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
205  fclose(output_file);
206  tprintf("Error seeking %s\n", new_traineddata_filename);
207  return false;
208  }
209 
210  // Open the files with the new components.
211  for (i = 0; i < num_new_components; ++i) {
212  if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
213  file_ptr[type] = fopen(component_filenames[i], "rb");
214  }
215 
216  // Write updated data to the output traineddata file.
217  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
218  if (file_ptr[i] != NULL) {
219  // Get the data from the opened component file.
220  offset_table[i] = ftell(output_file);
221  CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
222  fclose(file_ptr[i]);
223  } else {
224  // Get this data component from the loaded data file.
225  if (SeekToStart(static_cast<TessdataType>(i))) {
226  offset_table[i] = ftell(output_file);
227  CopyFile(data_file_, output_file, kTessdataFileIsText[i],
228  GetEndOffset(static_cast<TessdataType>(i)) -
229  ftell(data_file_) + 1);
230  }
231  }
232  }
233  const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
234  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
235 }
inT64 GetEndOffset(TessdataType tessdata_type) const
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
long long int inT64
Definition: host.h:41
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
bool SeekToStart(TessdataType tessdata_type)

◆ SeekToStart()

bool tesseract::TessdataManager::SeekToStart ( TessdataType  tessdata_type)
inline

Returns false if there is no data of the given type. Otherwise does a seek on the data_file_ to position the pointer at the start of the data of the given type.

Definition at line 162 of file tessdatamanager.h.

162  {
163  if (debug_level_) {
164  tprintf("TessdataManager: seek to offset %lld - start of tessdata"
165  "type %d (%s))\n", offset_table_[tessdata_type],
166  tessdata_type, kTessdataFileSuffixes[tessdata_type]);
167  }
168  if (offset_table_[tessdata_type] < 0) {
169  return false;
170  } else {
171  ASSERT_HOST(fseek(data_file_,
172  static_cast<size_t>(offset_table_[tessdata_type]),
173  SEEK_SET) == 0);
174  return true;
175  }
176  }
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 198 of file tessdatamanager.h.

198  {
199  return swap_;
200  }

◆ TessdataTypeFromFileName()

bool tesseract::TessdataManager::TessdataTypeFromFileName ( const char *  filename,
TessdataType *  type,
bool *  text_file 
)
static

Tries to determine tessdata component file suffix from filename, returns true on success.

Definition at line 251 of file tessdatamanager.cpp.

252  {
253  // Get the file suffix (extension)
254  const char *suffix = strrchr(filename, '.');
255  if (suffix == NULL || *(++suffix) == '\0') return false;
256  return TessdataTypeFromFileSuffix(suffix, type, text_file);
257 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)

◆ TessdataTypeFromFileSuffix()

bool tesseract::TessdataManager::TessdataTypeFromFileSuffix ( const char *  suffix,
TessdataType *  type,
bool *  text_file 
)
static

Fills type with TessdataType of the tessdata component represented by the given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. Sets *text_file to true if the component is in text format (e.g. unicharset, unichar ambigs, config, etc).

Returns
true if the tessdata component type could be determined from the given file name.

Definition at line 237 of file tessdatamanager.cpp.

238  {
239  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
240  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
241  *type = static_cast<TessdataType>(i);
242  *text_file = kTessdataFileIsText[i];
243  return true;
244  }
245  }
246  tprintf("TessdataManager can't determine which tessdata"
247  " component is represented by %s\n", suffix);
248  return false;
249 }
#define tprintf(...)
Definition: tprintf.h:31

◆ WriteMetadata()

bool tesseract::TessdataManager::WriteMetadata ( inT64 *  offset_table,
const char *  language_data_path_prefix,
FILE *  output_file 
)
static

Writes the number of entries and the given offset table to output_file. Returns false on error.

Definition at line 101 of file tessdatamanager.cpp.

103  {
104  inT32 num_entries = TESSDATA_NUM_ENTRIES;
105  bool result = true;
106  if (fseek(output_file, 0, SEEK_SET) != 0 ||
107  fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
108  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
109  output_file) != TESSDATA_NUM_ENTRIES) {
110  fclose(output_file);
111  result = false;
112  tprintf("WriteMetadata failed in TessdataManager!\n");
113  } else if (fclose(output_file)) {
114  result = false;
115  tprintf("WriteMetadata failed to close file!\n");
116  } else {
117  tprintf("TessdataManager combined tesseract data files.\n");
118  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
119  tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
120  language_data_path_prefix, kTessdataFileSuffixes[i],
121  offset_table[i]);
122  }
123  }
124  return result;
125 }
long long int inT64
Definition: host.h:41
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31

The documentation for this class was generated from the following files: