21 #pragma warning(disable:4244) // Conversion warnings 38 debug_level_ = debug_level;
39 data_file_name_ = data_file_name;
40 data_file_ = fopen(data_file_name,
"rb");
41 if (data_file_ == NULL) {
42 tprintf(
"Error opening data file %s\n", data_file_name);
43 tprintf(
"Please make sure the TESSDATA_PREFIX environment variable is set " 44 "to the parent directory of your \"tessdata\" directory.\n");
47 fread(&actual_tessdata_num_entries_,
sizeof(
inT32), 1, data_file_);
48 swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
50 ReverseN(&actual_tessdata_num_entries_,
51 sizeof(actual_tessdata_num_entries_));
57 fread(offset_table_,
sizeof(
inT64),
58 actual_tessdata_num_entries_, data_file_);
60 for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
61 ReverseN(&offset_table_[i],
sizeof(offset_table_[i]));
65 tprintf(
"TessdataManager loaded %d types of tesseract data files.\n",
66 actual_tessdata_num_entries_);
67 for (i = 0; i < actual_tessdata_num_entries_; ++i) {
68 tprintf(
"Offset for type %d is %lld\n", i, offset_table_[i]);
75 bool newline_end,
inT64 num_bytes_to_copy) {
76 if (num_bytes_to_copy == 0)
return;
77 int buffer_size = 1024;
78 if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
79 buffer_size = num_bytes_to_copy;
81 inT64 num_bytes_copied = 0;
82 char *chunk =
new char[buffer_size];
85 while ((bytes_read = fread(chunk,
sizeof(
char),
86 buffer_size, input_file))) {
87 fwrite(chunk,
sizeof(
char), bytes_read, output_file);
88 last_char = chunk[bytes_read-1];
89 if (num_bytes_to_copy > 0) {
90 num_bytes_copied += bytes_read;
91 if (num_bytes_copied == num_bytes_to_copy)
break;
92 if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
93 buffer_size = num_bytes_to_copy - num_bytes_copied;
102 const char * language_data_path_prefix,
106 if (fseek(output_file, 0, SEEK_SET) != 0 ||
107 fwrite(&num_entries,
sizeof(
inT32), 1, output_file) != 1 ||
112 tprintf(
"WriteMetadata failed in TessdataManager!\n");
113 }
else if (fclose(output_file)) {
115 tprintf(
"WriteMetadata failed to close file!\n");
117 tprintf(
"TessdataManager combined tesseract data files.\n");
119 tprintf(
"Offset for type %2d (%s%-22s) is %lld\n", i,
120 language_data_path_prefix, kTessdataFileSuffixes[i],
128 const char *language_data_path_prefix,
129 const char *output_filename) {
133 FILE *output_file = fopen(output_filename,
"wb");
134 if (output_file == NULL) {
135 tprintf(
"Error opening %s for writing\n", output_filename);
139 if (fseek(output_file,
141 tprintf(
"Error seeking %s\n", output_filename);
147 bool text_file =
false;
153 kTessdataFileSuffixes[i], &type, &text_file));
155 filename += kTessdataFileSuffixes[i];
156 file_ptr[i] = fopen(
filename.string(),
"rb");
157 if (file_ptr[i] != NULL) {
158 offset_table[type] = ftell(output_file);
159 CopyFile(file_ptr[i], output_file, text_file, -1);
166 tprintf(
"Error opening %sunicharset file\n", language_data_path_prefix);
173 tprintf(
"Error opening %spffmtable and/or %snormproto files" 174 " while %sinttemp file was present\n", language_data_path_prefix,
175 language_data_path_prefix, language_data_path_prefix);
180 return WriteMetadata(offset_table, language_data_path_prefix, output_file);
184 const char *new_traineddata_filename,
185 char **component_filenames,
186 int num_new_components) {
190 bool text_file =
false;
193 offset_table[i] = -1;
196 FILE *output_file = fopen(new_traineddata_filename,
"wb");
197 if (output_file == NULL) {
198 tprintf(
"Error opening %s for writing\n", new_traineddata_filename);
203 if (fseek(output_file,
206 tprintf(
"Error seeking %s\n", new_traineddata_filename);
211 for (i = 0; i < num_new_components; ++i) {
213 file_ptr[type] = fopen(component_filenames[i],
"rb");
218 if (file_ptr[i] != NULL) {
220 offset_table[i] = ftell(output_file);
221 CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
226 offset_table[i] = ftell(output_file);
227 CopyFile(data_file_, output_file, kTessdataFileIsText[i],
229 ftell(data_file_) + 1);
233 const char *language_data_path_prefix = strchr(new_traineddata_filename,
'.');
234 return WriteMetadata(offset_table, language_data_path_prefix, output_file);
238 const char *suffix,
TessdataType *type,
bool *text_file) {
240 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
242 *text_file = kTessdataFileIsText[i];
246 tprintf(
"TessdataManager can't determine which tessdata" 247 " component is represented by %s\n", suffix);
254 const char *suffix = strrchr(
filename,
'.');
255 if (suffix == NULL || *(++suffix) ==
'\0')
return false;
261 bool text_file =
false;
266 FILE *output_file = fopen(
filename,
"wb");
267 if (output_file == NULL) {
275 end_offset - begin_offset + 1);
inT64 GetEndOffset(TessdataType tessdata_type) const
FILE * GetDataFilePtr() const
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
bool ExtractToFile(const char *filename)
static bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool Init(const char *data_file_name, int debug_level)
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
bool SeekToStart(TessdataType tessdata_type)
void ReverseN(void *ptr, int num_bytes)