tesseract  3.05.02
combine_tessdata.cpp
Go to the documentation of this file.
1 // File: combine_tessdata
3 // Description: Creates a unified traineddata file from several
4 // data files produced by the training process.
5 // Author: Daria Antonova
6 // Created: Wed Jun 03 11:26:43 PST 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include "tessdatamanager.h"
22 
23 // Main program to combine/extract/overwrite tessdata components
24 // in [lang].traineddata files.
25 //
26 // To combine all the individual tessdata components (unicharset, DAWGs,
27 // classifier templates, ambiguities, language configs) located at, say,
28 // /home/$USER/temp/eng.* run:
29 //
30 // combine_tessdata /home/$USER/temp/eng.
31 //
32 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
33 //
34 // Specify option -e if you would like to extract individual components
35 // from a combined traineddata file. For example, to extract the language config
36 // file and the unicharset from tessdata/eng.traineddata run:
37 //
38 // combine_tessdata -e tessdata/eng.traineddata
39 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
40 //
41 // The desired config file and unicharset will be written to
42 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
43 //
44 // Specify option -o to overwrite individual components of the given
45 // [lang].traineddata file. For example, to overwrite language config
46 // and unichar ambiguities files in tessdata/eng.traineddata use:
47 //
48 // combine_tessdata -o tessdata/eng.traineddata
49 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
50 //
51 // As a result, tessdata/eng.traineddata will contain the new language config
52 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
53 //
54 // Note: the file names of the files to extract to and to overwrite from should
55 // have the appropriate file suffixes (extensions) indicating their tessdata
56 // component type (.unicharset for the unicharset, .unicharambigs for unichar
57 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
58 //
59 // Specify option -u to unpack all the components to the specified path:
60 //
61 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
62 //
63 // This will create /home/$USER/temp/eng.* files with individual tessdata
64 // components from tessdata/eng.traineddata.
65 //
66 int main(int argc, char **argv) {
67  int i;
68  if (argc == 2) {
69  printf("Combining tessdata files\n");
70  STRING lang = argv[1];
71  char* last = &argv[1][strlen(argv[1])-1];
72  if (*last != '.')
73  lang += '.';
74  STRING output_file = lang;
75  output_file += kTrainedDataSuffix;
77  lang.string(), output_file.string())) {
78  printf("Error combining tessdata files into %s\n",
79  output_file.string());
80  } else {
81  printf("Output %s created successfully.\n", output_file.string());
82  }
83  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
84  strcmp(argv[1], "-u") == 0)) {
85  // Initialize TessdataManager with the data in the given traineddata file.
87  tm.Init(argv[2], 0);
88  printf("Extracting tessdata components from %s\n", argv[2]);
89  if (strcmp(argv[1], "-e") == 0) {
90  for (i = 3; i < argc; ++i) {
91  if (tm.ExtractToFile(argv[i])) {
92  printf("Wrote %s\n", argv[i]);
93  } else {
94  printf("Not extracting %s, since this component"
95  " is not present\n", argv[i]);
96  }
97  }
98  } else { // extract all the components
99  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
100  STRING filename = argv[3];
101  char* last = &argv[3][strlen(argv[3])-1];
102  if (*last != '.')
103  filename += '.';
104  filename += tesseract::kTessdataFileSuffixes[i];
105  if (tm.ExtractToFile(filename.string())) {
106  printf("Wrote %s\n", filename.string());
107  }
108  }
109  }
110  tm.End();
111  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
112  // Rename the current traineddata file to a temporary name.
113  const char *new_traineddata_filename = argv[2];
114  STRING traineddata_filename = new_traineddata_filename;
115  traineddata_filename += ".__tmp__";
116  if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
117  tprintf("Failed to create a temporary file %s\n",
118  traineddata_filename.string());
119  exit(1);
120  }
121 
122  // Initialize TessdataManager with the data in the given traineddata file.
124  tm.Init(traineddata_filename.string(), 0);
125 
126  // Write the updated traineddata file.
127  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
128  tm.End();
129  } else {
130  printf("Usage for combining tessdata components:\n"
131  " %s language_data_path_prefix\n"
132  " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
133  printf("Usage for extracting tessdata components:\n"
134  " %s -e traineddata_file [output_component_file...]\n"
135  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
136  argv[0], argv[0]);
137  printf("Usage for overwriting tessdata components:\n"
138  " %s -o traineddata_file [input_component_file...]\n"
139  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
140  argv[0], argv[0]);
141  printf("Usage for unpacking all tessdata components:\n"
142  " %s -u traineddata_file output_path_prefix\n"
143  " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
144  return 1;
145  }
146 }
LIST last(LIST var_list)
Definition: oldlist.cpp:271
bool ExtractToFile(const char *filename)
static bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
const char * string() const
Definition: strngs.cpp:201
bool Init(const char *data_file_name, int debug_level)
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
int main(int argc, char **argv)