tesseract  3.05.02
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "tessdatamanager.h"
25 
26 #include <stdio.h>
27 
28 #include "helpers.h"
29 #include "serialis.h"
30 #include "strngs.h"
31 #include "tprintf.h"
32 #include "params.h"
33 
34 namespace tesseract {
35 
36 bool TessdataManager::Init(const char *data_file_name, int debug_level) {
37  int i;
38  debug_level_ = debug_level;
39  data_file_name_ = data_file_name;
40  data_file_ = fopen(data_file_name, "rb");
41  if (data_file_ == NULL) {
42  tprintf("Error opening data file %s\n", data_file_name);
43  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
44  "to the parent directory of your \"tessdata\" directory.\n");
45  return false;
46  }
47  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
48  swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
49  if (swap_) {
50  ReverseN(&actual_tessdata_num_entries_,
51  sizeof(actual_tessdata_num_entries_));
52  }
53  if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
54  // For forward compatibility, truncate to the number we can handle.
55  actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
56  }
57  fread(offset_table_, sizeof(inT64),
58  actual_tessdata_num_entries_, data_file_);
59  if (swap_) {
60  for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
61  ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
62  }
63  }
64  if (debug_level_) {
65  tprintf("TessdataManager loaded %d types of tesseract data files.\n",
66  actual_tessdata_num_entries_);
67  for (i = 0; i < actual_tessdata_num_entries_; ++i) {
68  tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
69  }
70  }
71  return true;
72 }
73 
74 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
75  bool newline_end, inT64 num_bytes_to_copy) {
76  if (num_bytes_to_copy == 0) return;
77  int buffer_size = 1024;
78  if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
79  buffer_size = num_bytes_to_copy;
80  }
81  inT64 num_bytes_copied = 0;
82  char *chunk = new char[buffer_size];
83  int bytes_read;
84  char last_char = 0x0;
85  while ((bytes_read = fread(chunk, sizeof(char),
86  buffer_size, input_file))) {
87  fwrite(chunk, sizeof(char), bytes_read, output_file);
88  last_char = chunk[bytes_read-1];
89  if (num_bytes_to_copy > 0) {
90  num_bytes_copied += bytes_read;
91  if (num_bytes_copied == num_bytes_to_copy) break;
92  if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
93  buffer_size = num_bytes_to_copy - num_bytes_copied;
94  }
95  }
96  }
97  if (newline_end) ASSERT_HOST(last_char == '\n');
98  delete[] chunk;
99 }
100 
102  const char * language_data_path_prefix,
103  FILE *output_file) {
104  inT32 num_entries = TESSDATA_NUM_ENTRIES;
105  bool result = true;
106  if (fseek(output_file, 0, SEEK_SET) != 0 ||
107  fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
108  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
109  output_file) != TESSDATA_NUM_ENTRIES) {
110  fclose(output_file);
111  result = false;
112  tprintf("WriteMetadata failed in TessdataManager!\n");
113  } else if (fclose(output_file)) {
114  result = false;
115  tprintf("WriteMetadata failed to close file!\n");
116  } else {
117  tprintf("TessdataManager combined tesseract data files.\n");
118  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
119  tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
120  language_data_path_prefix, kTessdataFileSuffixes[i],
121  offset_table[i]);
122  }
123  }
124  return result;
125 }
126 
128  const char *language_data_path_prefix,
129  const char *output_filename) {
130  int i;
131  inT64 offset_table[TESSDATA_NUM_ENTRIES];
132  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
133  FILE *output_file = fopen(output_filename, "wb");
134  if (output_file == NULL) {
135  tprintf("Error opening %s for writing\n", output_filename);
136  return false;
137  }
138  // Leave some space for recording the offset_table.
139  if (fseek(output_file,
140  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
141  tprintf("Error seeking %s\n", output_filename);
142  fclose(output_file);
143  return false;
144  }
145 
147  bool text_file = false;
148  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
149 
150  // Load individual tessdata components from files.
151  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
153  kTessdataFileSuffixes[i], &type, &text_file));
154  STRING filename = language_data_path_prefix;
155  filename += kTessdataFileSuffixes[i];
156  file_ptr[i] = fopen(filename.string(), "rb");
157  if (file_ptr[i] != NULL) {
158  offset_table[type] = ftell(output_file);
159  CopyFile(file_ptr[i], output_file, text_file, -1);
160  fclose(file_ptr[i]);
161  }
162  }
163 
164  // Make sure that the required components are present.
165  if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
166  tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
167  fclose(output_file);
168  return false;
169  }
170  if (file_ptr[TESSDATA_INTTEMP] != NULL &&
171  (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
172  file_ptr[TESSDATA_NORMPROTO] == NULL)) {
173  tprintf("Error opening %spffmtable and/or %snormproto files"
174  " while %sinttemp file was present\n", language_data_path_prefix,
175  language_data_path_prefix, language_data_path_prefix);
176  fclose(output_file);
177  return false;
178  }
179 
180  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
181 }
182 
184  const char *new_traineddata_filename,
185  char **component_filenames,
186  int num_new_components) {
187  int i;
188  inT64 offset_table[TESSDATA_NUM_ENTRIES];
190  bool text_file = false;
191  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
192  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
193  offset_table[i] = -1;
194  file_ptr[i] = NULL;
195  }
196  FILE *output_file = fopen(new_traineddata_filename, "wb");
197  if (output_file == NULL) {
198  tprintf("Error opening %s for writing\n", new_traineddata_filename);
199  return false;
200  }
201 
202  // Leave some space for recording the offset_table.
203  if (fseek(output_file,
204  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
205  fclose(output_file);
206  tprintf("Error seeking %s\n", new_traineddata_filename);
207  return false;
208  }
209 
210  // Open the files with the new components.
211  for (i = 0; i < num_new_components; ++i) {
212  if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
213  file_ptr[type] = fopen(component_filenames[i], "rb");
214  }
215 
216  // Write updated data to the output traineddata file.
217  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
218  if (file_ptr[i] != NULL) {
219  // Get the data from the opened component file.
220  offset_table[i] = ftell(output_file);
221  CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
222  fclose(file_ptr[i]);
223  } else {
224  // Get this data component from the loaded data file.
225  if (SeekToStart(static_cast<TessdataType>(i))) {
226  offset_table[i] = ftell(output_file);
227  CopyFile(data_file_, output_file, kTessdataFileIsText[i],
228  GetEndOffset(static_cast<TessdataType>(i)) -
229  ftell(data_file_) + 1);
230  }
231  }
232  }
233  const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
234  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
235 }
236 
238  const char *suffix, TessdataType *type, bool *text_file) {
239  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
240  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
241  *type = static_cast<TessdataType>(i);
242  *text_file = kTessdataFileIsText[i];
243  return true;
244  }
245  }
246  tprintf("TessdataManager can't determine which tessdata"
247  " component is represented by %s\n", suffix);
248  return false;
249 }
250 
252  const char *filename, TessdataType *type, bool *text_file) {
253  // Get the file suffix (extension)
254  const char *suffix = strrchr(filename, '.');
255  if (suffix == NULL || *(++suffix) == '\0') return false;
256  return TessdataTypeFromFileSuffix(suffix, type, text_file);
257 }
258 
261  bool text_file = false;
263  filename, &type, &text_file));
264  if (!SeekToStart(type)) return false;
265 
266  FILE *output_file = fopen(filename, "wb");
267  if (output_file == NULL) {
268  tprintf("Error opening %s\n", filename);
269  exit(1);
270  }
271  inT64 begin_offset = ftell(GetDataFilePtr());
272  inT64 end_offset = GetEndOffset(type);
274  GetDataFilePtr(), output_file, text_file,
275  end_offset - begin_offset + 1);
276  fclose(output_file);
277  return true;
278 }
279 
280 } // namespace tesseract
inT64 GetEndOffset(TessdataType tessdata_type) const
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
bool ExtractToFile(const char *filename)
static bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool Init(const char *data_file_name, int debug_level)
long long int inT64
Definition: host.h:41
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
bool SeekToStart(TessdataType tessdata_type)
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
#define ASSERT_HOST(x)
Definition: errcode.h:84