tesseract  3.05.02
tessdatamanager.h
Go to the documentation of this file.
1 // File: tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
22 
23 #include <stdio.h>
24 
25 #include "host.h"
26 #include "strngs.h"
27 #include "tprintf.h"
28 
// Suffix string for trained data files (presumably the extension of the
// combined file -- confirm against the callers in the .cpp).
29 static const char kTrainedDataSuffix[] = "traineddata";
30 
31 // When adding new tessdata types and file suffixes, please make sure to
32 // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
// The suffix constants below are declared in the same order in which
// kTessdataFileSuffixes lists them.
33 static const char kLangConfigFileSuffix[] = "config";
34 static const char kUnicharsetFileSuffix[] = "unicharset";
35 static const char kAmbigsFileSuffix[] = "unicharambigs";
36 static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
37 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
38 static const char kNormProtoFileSuffix[] = "normproto";
39 static const char kPuncDawgFileSuffix[] = "punc-dawg";
40 static const char kSystemDawgFileSuffix[] = "word-dawg";
41 static const char kNumberDawgFileSuffix[] = "number-dawg";
42 static const char kFreqDawgFileSuffix[] = "freq-dawg";
43 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
44 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
45 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
46 static const char kShapeTableFileSuffix[] = "shapetable";
47 static const char kBigramDawgFileSuffix[] = "bigram-dawg";
48 static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
49 static const char kParamsModelFileSuffix[] = "params-model";
50 
51 namespace tesseract {
52 
64  TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
71 
73 };
74 
// Table of component file suffixes. The number in each trailing comment is
// the entry's index; SeekToStart indexes this table with a TessdataType
// value, so the order here has to stay in step with the TessdataType enum.
79 static const char *const kTessdataFileSuffixes[] = {
80  kLangConfigFileSuffix, // 0
81  kUnicharsetFileSuffix, // 1
82  kAmbigsFileSuffix, // 2
83  kBuiltInTemplatesFileSuffix, // 3
84  kBuiltInCutoffsFileSuffix, // 4
85  kNormProtoFileSuffix, // 5
86  kPuncDawgFileSuffix, // 6
87  kSystemDawgFileSuffix, // 7
88  kNumberDawgFileSuffix, // 8
89  kFreqDawgFileSuffix, // 9
90  kFixedLengthDawgsFileSuffix, // 10 // deprecated
91  kCubeUnicharsetFileSuffix, // 11
92  kCubeSystemDawgFileSuffix, // 12
93  kShapeTableFileSuffix, // 13
94  kBigramDawgFileSuffix, // 14
95  kUnambigDawgFileSuffix, // 15
96  kParamsModelFileSuffix, // 16
97 };
98 
// Per-component flag, in the same index order as kTessdataFileSuffixes (the
// suffix that shares each index is noted on its line). true marks the
// component file as text; false presumably means binary -- confirm against
// the code that opens these files.
103 static const bool kTessdataFileIsText[] = {
104  true, // 0: config
105  true, // 1: unicharset
106  true, // 2: unicharambigs
107  false, // 3: inttemp
108  true, // 4: pffmtable
109  true, // 5: normproto
110  false, // 6: punc-dawg
111  false, // 7: word-dawg
112  false, // 8: number-dawg
113  false, // 9: freq-dawg
114  false, // 10: fixed-length-dawgs (deprecated)
115  true, // 11: cube-unicharset
116  false, // 12: cube-word-dawg
117  false, // 13: shapetable
118  false, // 14: bigram-dawg
119  false, // 15: unambig-dawg
120  true, // 16: params-model
121 };
122 
// NOTE(review): not referenced anywhere in the visible part of this header.
// Presumably an upper sanity bound on the number of entries read from a
// data file -- confirm against its use in the .cpp.
130 static const int kMaxNumTessdataEntries = 1000;
131 
132 
134  public:
136  data_file_ = NULL;
137  actual_tessdata_num_entries_ = 0;
138  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
139  offset_table_[i] = -1;
140  }
141  }
143  int DebugLevel() { return debug_level_; }
144 
149  bool Init(const char *data_file_name, int debug_level);
150 
151  // Return the name of the underlying data file.
152  const STRING &GetDataFileName() const { return data_file_name_; }
153 
155  inline FILE *GetDataFilePtr() const { return data_file_; }
156 
162  inline bool SeekToStart(TessdataType tessdata_type) {
163  if (debug_level_) {
164  tprintf("TessdataManager: seek to offset %lld - start of tessdata"
165  "type %d (%s))\n", offset_table_[tessdata_type],
166  tessdata_type, kTessdataFileSuffixes[tessdata_type]);
167  }
168  if (offset_table_[tessdata_type] < 0) {
169  return false;
170  } else {
171  ASSERT_HOST(fseek(data_file_,
172  static_cast<size_t>(offset_table_[tessdata_type]),
173  SEEK_SET) == 0);
174  return true;
175  }
176  }
178  inline inT64 GetEndOffset(TessdataType tessdata_type) const {
179  int index = tessdata_type + 1;
180  while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
181  ++index; // skip tessdata types not present in the combined file
182  }
183  if (debug_level_) {
184  tprintf("TessdataManager: end offset for type %d is %lld\n",
185  tessdata_type,
186  (index == actual_tessdata_num_entries_) ? -1
187  : offset_table_[index]);
188  }
189  return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
190  }
192  inline void End() {
193  if (data_file_ != NULL) {
194  fclose(data_file_);
195  data_file_ = NULL;
196  }
197  }
198  bool swap() const {
199  return swap_;
200  }
201 
205  static bool WriteMetadata(inT64 *offset_table,
206  const char *language_data_path_prefix,
207  FILE *output_file);
208 
214  static bool CombineDataFiles(const char *language_data_path_prefix,
215  const char *output_filename);
216 
222  bool OverwriteComponents(const char *new_traineddata_filename,
223  char **component_filenames,
224  int num_new_components);
225 
236  bool ExtractToFile(const char *filename);
237 
243  static void CopyFile(FILE *input_file, FILE *output_file,
244  bool newline_end, inT64 num_bytes_to_copy);
245 
254  static bool TessdataTypeFromFileSuffix(const char *suffix,
255  TessdataType *type,
256  bool *text_file);
257 
262  static bool TessdataTypeFromFileName(const char *filename,
263  TessdataType *type,
264  bool *text_file);
265 
266  private:
267 
272  static FILE *GetFilePtr(const char *language_data_path_prefix,
273  const char *file_suffix, bool text_file);
274 
// Start offset of each tessdata type's entry in the data file, indexed by
// TessdataType. The constructor fills it with -1; GetEndOffset treats -1 as
// "type not present" and SeekToStart refuses any negative value.
279  inT64 offset_table_[TESSDATA_NUM_ENTRIES];
// Number of leading offset_table_ entries that GetEndOffset scans; set to 0
// by the constructor (presumably the entry count found in the loaded file --
// confirm in Init).
288  inT32 actual_tessdata_num_entries_;
289  STRING data_file_name_; // name of the data file.
// Handle on the data file; NULL after construction and after End().
290  FILE *data_file_;
// Non-zero enables the tprintf tracing in SeekToStart and GetEndOffset.
// NOTE(review): not initialised by the visible constructor body.
291  int debug_level_;
292  // True if the bytes need swapping.
293  bool swap_;
294 };
295 
296 
297 } // namespace tesseract
298 
299 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
inT64 GetEndOffset(TessdataType tessdata_type) const
const STRING & GetDataFileName() const
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
bool ExtractToFile(const char *filename)
static bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool Init(const char *data_file_name, int debug_level)
long long int inT64
Definition: host.h:41
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
bool SeekToStart(TessdataType tessdata_type)
#define ASSERT_HOST(x)
Definition: errcode.h:84