tesseract  3.05.02
pango_font_info.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pango_font_info.cpp
3  * Description: Font-related objects and helper functions
4  * Author: Ranjith Unnikrishnan
5  * Created: Mon Nov 18 2013
6  *
7  * (C) Copyright 2013, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #if (defined __MINGW32__) || (defined __CYGWIN__)
26 // workaround for stdlib.h and putenv
27 #undef __STRICT_ANSI__
28 #endif
29 
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <string.h>
33 #ifndef _MSC_VER
34 #include <sys/param.h>
35 #endif
36 #include <algorithm>
37 
38 #include "pango_font_info.h"
39 #include "commandlineflags.h"
40 #include "fileio.h"
41 #include "normstrngs.h"
42 #include "tlog.h"
43 #include "unichar.h"
44 #include "util.h"
45 #include "pango/pango.h"
46 #include "pango/pangocairo.h"
47 #include "pango/pangofc-font.h"
48 
49 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
50  "Overrides fontconfig default temporary dir");
51 
52 #ifndef USE_STD_NAMESPACE
53 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
54 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
55  "Overrides --fonts_dir and sets the known universe of fonts to"
56  "the list in legacy_fonts.h");
57 
58 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
59  "Overrides system default font location");
60 #else
61 using std::pair;
62 STRING_PARAM_FLAG(fonts_dir, "",
63  "If empty it use system default. Otherwise it overrides"
64  " system default font location");
65 #endif
66 
67 namespace tesseract {
68 
69 // Default assumed output resolution. Required only for providing font metrics
70 // in pixels.
71 const int kDefaultResolution = 300;
72 
73 string PangoFontInfo::fonts_dir_;
74 string PangoFontInfo::cache_dir_;
75 
76 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
77  Clear();
78 }
79 
80 PangoFontInfo::PangoFontInfo(const string& desc)
81  : desc_(NULL), resolution_(kDefaultResolution) {
82  if (!ParseFontDescriptionName(desc)) {
83  tprintf("ERROR: Could not parse %s\n", desc.c_str());
84  Clear();
85  }
86 }
87 
88 void PangoFontInfo::Clear() {
89  font_size_ = 0;
90  family_name_.clear();
91  font_type_ = UNKNOWN;
92  if (desc_) {
93  pango_font_description_free(desc_);
94  desc_ = NULL;
95  }
96 }
97 
98 PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); }
99 
101  if (!desc_) return "";
102  char* desc_str = pango_font_description_to_string(desc_);
103  string desc_name(desc_str);
104  g_free(desc_str);
105  return desc_name;
106 }
107 
108 // If not already initialized, initializes FontConfig by setting its
109 // environment variable and creating a fonts.conf file that points to the
110 // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
111 /* static */
113  if (fonts_dir_.empty()) {
114  HardInitFontConfig(FLAGS_fonts_dir.c_str(),
115  FLAGS_fontconfig_tmpdir.c_str());
116  }
117 }
118 
119 // Re-initializes font config, whether or not already initialized.
120 // If already initialized, any existing cache is deleted, just to be sure.
121 /* static */
122 void PangoFontInfo::HardInitFontConfig(const string& fonts_dir,
123  const string& cache_dir) {
124  if (!cache_dir_.empty()) {
126  File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str());
127  }
128  const int MAX_FONTCONF_FILESIZE = 1024;
129  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
130  cache_dir_ = cache_dir;
131  fonts_dir_ = fonts_dir;
132  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
133  "<?xml version=\"1.0\"?>\n"
134  "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
135  "<fontconfig>\n"
136  "<dir>%s</dir>\n"
137  "<cachedir>%s</cachedir>\n"
138  "<config></config>\n"
139  "</fontconfig>",
140  fonts_dir.c_str(), cache_dir_.c_str());
141  string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf");
142  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
143 #ifdef _WIN32
144  std::string env("FONTCONFIG_PATH=");
145  env.append(cache_dir_.c_str());
146  putenv(env.c_str());
147  putenv("LANG=en_US.utf8");
148 #else
149  setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true);
150  // Fix the locale so that the reported font names are consistent.
151  setenv("LANG", "en_US.utf8", true);
152 #endif // _WIN32
153 
154  if (FcInitReinitialize() != FcTrue) {
155  tprintf("FcInitiReinitialize failed!!\n");
156  }
158  // Clear Pango's font cache too.
159  pango_cairo_font_map_set_default(NULL);
160 }
161 
162 static void ListFontFamilies(PangoFontFamily*** families,
163  int* n_families) {
165  PangoFontMap* font_map = pango_cairo_font_map_get_default();
167  pango_font_map_list_families(font_map, families, n_families);
168 }
169 
170 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
171  Clear();
172  const char* family = pango_font_description_get_family(desc);
173  if (!family) {
174  char* desc_str = pango_font_description_to_string(desc);
175  tprintf("WARNING: Could not parse family name from description: '%s'\n",
176  desc_str);
177  g_free(desc_str);
178  return false;
179  }
180  family_name_ = string(family);
181  desc_ = pango_font_description_copy(desc);
182 
183  // Set font size in points
184  font_size_ = pango_font_description_get_size(desc);
185  if (!pango_font_description_get_size_is_absolute(desc)) {
186  font_size_ /= PANGO_SCALE;
187  }
188 
189  return true;
190 }
191 
192 bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
193  PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
194  bool success = ParseFontDescription(desc);
195  pango_font_description_free(desc);
196  return success;
197 }
198 
199 // Returns the PangoFont structure corresponding to the closest available font
200 // in the font map. Note that if the font is wholly missing, this could
201 // correspond to a completely different font family and face.
202 PangoFont* PangoFontInfo::ToPangoFont() const {
204  PangoFontMap* font_map = pango_cairo_font_map_get_default();
205  PangoContext* context = pango_context_new();
206  pango_cairo_context_set_resolution(context, resolution_);
207  pango_context_set_font_map(context, font_map);
208  PangoFont* font = NULL;
209  {
211  font = pango_font_map_load_font(font_map, context, desc_);
212  }
213  g_object_unref(context);
214  return font;
215 }
216 
217 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
218  PangoFont* font = ToPangoFont();
219  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
220  for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
221  it != UNICHAR::end(utf8_text, byte_length);
222  ++it) {
223  if (IsWhitespace(*it) || pango_is_zero_width(*it))
224  continue;
225  if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
226  char tmp[5];
227  int len = it.get_utf8(tmp);
228  tmp[len] = '\0';
229  tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
230  return false;
231  }
232  }
233  return true;
234 }
235 
// This variant of strncpy permits src and dest to overlap, provided dest does
// not start after src: bytes are copied front to back, first byte first.
// Copies at most n bytes from src, stopping before its terminating nul, and
// then fills whatever remains of the n bytes in dest with nul bytes.
// Writes nothing when n is zero. Returns dest.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* ret = dest;

  // Test before copying: n == 0 must not write (and must not wrap the
  // unsigned counter), and an empty src must not be read past its nul.
  while (n && *src) {
    *dest = *src;
    ++dest;
    ++src;
    --n;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n) {
    *dest = '\0';
    ++dest;
    --n;
  }
  return ret;
}
257 
258 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
259  PangoFont* font = ToPangoFont();
260  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
261  int num_dropped_chars = 0;
262  // Maintain two iterators that point into the string. For space efficiency, we
263  // will repeatedly copy one covered UTF8 character from one to the other, and
264  // at the end resize the string to the right length.
265  char* out = const_cast<char*>(utf8_text->c_str());
266  const UNICHAR::const_iterator it_begin =
267  UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
268  const UNICHAR::const_iterator it_end =
269  UNICHAR::end(utf8_text->c_str(), utf8_text->length());
270  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
271  // Skip bad utf-8.
272  if (!it.is_legal()) {
273  ++it; // One suitable error message will still be issued.
274  continue;
275  }
276  int unicode = *it;
277  int utf8_len = it.utf8_len();
278  const char* utf8_char = it.utf8_data();
279  // Move it forward before the data gets modified.
280  ++it;
281  if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
282  pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
283  if (TLOG_IS_ON(2)) {
284  UNICHAR unichar(unicode);
285  char* str = unichar.utf8_str();
286  tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
287  delete[] str;
288  }
289  ++num_dropped_chars;
290  continue;
291  }
292  my_strnmove(out, utf8_char, utf8_len);
293  out += utf8_len;
294  }
295  utf8_text->resize(out - utf8_text->c_str());
296  return num_dropped_chars;
297 }
298 
299 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char,
300  int* x_bearing, int* x_advance) const {
301  // Convert to equivalent PangoFont structure
302  PangoFont* font = ToPangoFont();
303  // Find the glyph index in the font for the supplied utf8 character.
304  int total_advance = 0;
305  int min_bearing = 0;
306  // Handle multi-unicode strings by reporting the left-most position of the
307  // x-bearing, and right-most position of the x-advance if the string were to
308  // be rendered.
309  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
310  utf8_char.length());
311  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
312  utf8_char.length());
313  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
314  PangoGlyph glyph_index = pango_fc_font_get_glyph(
315  reinterpret_cast<PangoFcFont*>(font), *it);
316  if (!glyph_index) {
317  // Glyph for given unicode character doesn't exist in font.
318  return false;
319  }
320  // Find the ink glyph extents for the glyph
321  PangoRectangle ink_rect, logical_rect;
322  pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
323  pango_extents_to_pixels(&ink_rect, NULL);
324  pango_extents_to_pixels(&logical_rect, NULL);
325 
326  int bearing = total_advance + PANGO_LBEARING(ink_rect);
327  if (it == it_begin || bearing < min_bearing) {
328  min_bearing = bearing;
329  }
330  total_advance += PANGO_RBEARING(logical_rect);
331  }
332  *x_bearing = min_bearing;
333  *x_advance = total_advance;
334  return true;
335 }
336 
337 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
338  vector<string> graphemes;
339  return CanRenderString(utf8_word, len, &graphemes);
340 }
341 
342 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
343  vector<string>* graphemes) const {
344  if (graphemes) graphemes->clear();
345  // We check for font coverage of the text first, as otherwise Pango could
346  // (undesirably) fall back to another font that does have the required
347  // coverage.
348  if (!CoversUTF8Text(utf8_word, len)) {
349  return false;
350  }
351  // U+25CC dotted circle character that often (but not always) gets rendered
352  // when there is an illegal grapheme sequence.
353  const char32 kDottedCircleGlyph = 9676;
354  bool bad_glyph = false;
355  PangoFontMap* font_map = pango_cairo_font_map_get_default();
356  PangoContext* context = pango_context_new();
357  pango_context_set_font_map(context, font_map);
358  PangoLayout* layout;
359  {
360  // Pango is not relasing the cached layout.
362  layout = pango_layout_new(context);
363  }
364  if (desc_) {
365  pango_layout_set_font_description(layout, desc_);
366  } else {
367  PangoFontDescription *desc = pango_font_description_from_string(
368  DescriptionName().c_str());
369  pango_layout_set_font_description(layout, desc);
370  pango_font_description_free(desc);
371  }
372  pango_layout_set_text(layout, utf8_word, len);
373  PangoLayoutIter* run_iter = NULL;
374  { // Fontconfig caches some information here that is not freed before exit.
376  run_iter = pango_layout_get_iter(layout);
377  }
378  do {
379  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
380  if (!run) {
381  tlog(2, "Found end of line NULL run marker\n");
382  continue;
383  }
384  PangoGlyph dotted_circle_glyph;
385  PangoFont* font = run->item->analysis.font;
386 
387 #ifdef _WIN32 // Fixme! Leaks memory and breaks unittests.
388  PangoGlyphString* glyphs = pango_glyph_string_new();
389  char s[] = "\xc2\xa7";
390  pango_shape(s, sizeof(s), &(run->item->analysis), glyphs);
391  dotted_circle_glyph = glyphs->glyphs[0].glyph;
392 #else
393  dotted_circle_glyph = pango_fc_font_get_glyph(
394  reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
395 #endif
396 
397  if (TLOG_IS_ON(2)) {
398  PangoFontDescription* desc = pango_font_describe(font);
399  char* desc_str = pango_font_description_to_string(desc);
400  tlog(2, "Desc of font in run: %s\n", desc_str);
401  g_free(desc_str);
402  pango_font_description_free(desc);
403  }
404 
405  PangoGlyphItemIter cluster_iter;
406  gboolean have_cluster;
407  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
408  run, utf8_word);
409  have_cluster && !bad_glyph;
410  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
411  const int start_byte_index = cluster_iter.start_index;
412  const int end_byte_index = cluster_iter.end_index;
413  int start_glyph_index = cluster_iter.start_glyph;
414  int end_glyph_index = cluster_iter.end_glyph;
415  string cluster_text = string(utf8_word + start_byte_index,
416  end_byte_index - start_byte_index);
417  if (graphemes) graphemes->push_back(cluster_text);
418  if (IsUTF8Whitespace(cluster_text.c_str())) {
419  tlog(2, "Skipping whitespace\n");
420  continue;
421  }
422  if (TLOG_IS_ON(2)) {
423  printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
424  start_byte_index, end_byte_index,
425  start_glyph_index, end_glyph_index);
426  }
427  for (int i = start_glyph_index,
428  step = (end_glyph_index > start_glyph_index) ? 1 : -1;
429  !bad_glyph && i != end_glyph_index; i+= step) {
430  const bool unknown_glyph =
431  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
432  PANGO_GLYPH_UNKNOWN_FLAG);
433  const bool illegal_glyph =
434  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
435  dotted_circle_glyph);
436  bad_glyph = unknown_glyph || illegal_glyph;
437  if (TLOG_IS_ON(2)) {
438  printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
439  bad_glyph ? 1 : 0);
440  }
441  }
442  if (TLOG_IS_ON(2)) {
443  printf(" '%s'\n", cluster_text.c_str());
444  }
445  if (bad_glyph)
446  tlog(1, "Found illegal glyph!\n");
447  }
448  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
449 
450  pango_layout_iter_free(run_iter);
451  g_object_unref(context);
452  g_object_unref(layout);
453  if (bad_glyph && graphemes) graphemes->clear();
454  return !bad_glyph;
455 }
456 
457 
458 // ------------------------ FontUtils ------------------------------------
459 vector<string> FontUtils::available_fonts_; // cache list
460 
461 // Returns whether the specified font description is available in the fonts
462 // directory.
463 //
464 // The generated list of font families and faces includes "synthesized" font
465 // faces that are not truly loadable. Pango versions >=1.18 have a
466 // pango_font_face_is_synthesized method that can be used to prune the list.
467 // Until then, we are restricted to using a hack where we try to load the font
468 // from the font_map, and then check what we loaded to see if it has the
469 // description we expected. If it is not, then the font is deemed unavailable.
470 /* static */
471 bool FontUtils::IsAvailableFont(const char* input_query_desc,
472  string* best_match) {
473  string query_desc(input_query_desc);
474 #if (PANGO_VERSION <= 12005)
475  // Strip commas and any ' Medium' substring in the name.
476  query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','),
477  query_desc.end());
478  const string kMediumStr = " Medium";
479  std::size_t found = query_desc.find(kMediumStr);
480  if (found != std::string::npos) {
481  query_desc.erase(found, kMediumStr.length());
482  }
483 #endif
484  PangoFontDescription *desc = pango_font_description_from_string(
485  query_desc.c_str());
486  PangoFont* selected_font = NULL;
487  {
489  PangoFontMap* font_map = pango_cairo_font_map_get_default();
490  PangoContext* context = pango_context_new();
491  pango_context_set_font_map(context, font_map);
492  {
494  selected_font = pango_font_map_load_font(font_map, context, desc);
495  }
496  g_object_unref(context);
497  }
498  if (selected_font == NULL) {
499  pango_font_description_free(desc);
500  return false;
501  }
502  PangoFontDescription* selected_desc = pango_font_describe(selected_font);
503 
504  bool equal = pango_font_description_equal(desc, selected_desc);
505  tlog(3, "query weight = %d \t selected weight =%d\n",
506  pango_font_description_get_weight(desc),
507  pango_font_description_get_weight(selected_desc));
508 
509  char* selected_desc_str = pango_font_description_to_string(selected_desc);
510  tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(),
511  selected_desc_str);
512  if (!equal && best_match != NULL) {
513  *best_match = selected_desc_str;
514  // Clip the ending ' 0' if there is one. It seems that, if there is no
515  // point size on the end of the fontname, then Pango always appends ' 0'.
516  int len = best_match->size();
517  if (len > 2 && best_match->at(len - 1) == '0' &&
518  best_match->at(len - 2) == ' ') {
519  *best_match = best_match->substr(0, len - 2);
520  }
521  }
522  g_free(selected_desc_str);
523  pango_font_description_free(selected_desc);
524  g_object_unref(selected_font);
525  pango_font_description_free(desc);
526  return equal;
527 }
528 
// Returns true if query is exactly one of the family names "Sans", "Serif"
// or "Monospace" (case-sensitive), which font listing skips.
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kSkippedFamilies[] = {"Sans", "Serif", "Monospace"};
  const size_t num_skipped =
      sizeof(kSkippedFamilies) / sizeof(kSkippedFamilies[0]);
  for (size_t idx = 0; idx < num_skipped; ++idx) {
    if (strcmp(kSkippedFamilies[idx], query) == 0) {
      return true;
    }
  }
  return false;
}
539 
540 // Outputs description names of available fonts.
541 /* static */
542 const vector<string>& FontUtils::ListAvailableFonts() {
543  if (!available_fonts_.empty()) {
544  return available_fonts_;
545  }
546 #ifndef USE_STD_NAMESPACE
547  if (FLAGS_use_only_legacy_fonts) {
548  // Restrict view to list of fonts in legacy_fonts.h
549  tprintf("Using list of legacy fonts only\n");
550  const int kNumFontLists = 4;
551  for (int i = 0; i < kNumFontLists; ++i) {
552  for (int j = 0; kFontlists[i][j] != NULL; ++j) {
553  available_fonts_.push_back(kFontlists[i][j]);
554  }
555  }
556  return available_fonts_;
557  }
558 #endif
559 
560  PangoFontFamily** families = 0;
561  int n_families = 0;
562  ListFontFamilies(&families, &n_families);
563  for (int i = 0; i < n_families; ++i) {
564  const char* family_name = pango_font_family_get_name(families[i]);
565  tlog(2, "Listing family %s\n", family_name);
566  if (ShouldIgnoreFontFamilyName(family_name)) {
567  continue;
568  }
569 
570  int n_faces;
571  PangoFontFace** faces = NULL;
572  pango_font_family_list_faces(families[i], &faces, &n_faces);
573  for (int j = 0; j < n_faces; ++j) {
574  PangoFontDescription* desc = pango_font_face_describe(faces[j]);
575  char* desc_str = pango_font_description_to_string(desc);
576  if (IsAvailableFont(desc_str)) {
577  available_fonts_.push_back(desc_str);
578  }
579  pango_font_description_free(desc);
580  g_free(desc_str);
581  }
582  g_free(faces);
583  }
584  g_free(families);
585  sort(available_fonts_.begin(), available_fonts_.end());
586  return available_fonts_;
587 }
588 
589 
590 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
591  vector<bool>* unichar_bitmap) {
592  const int kMinUnicodeValue = 33;
593  const int kMaxUnicodeValue = 0x10FFFF;
594  unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
595  // Mark off characters that the font can render.
596  for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
597  if (IsInterchangeValid(i)) {
598  (*unichar_bitmap)[i]
599  = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
600  }
601  }
602 }
603 
604 /* static */
605 void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) {
606  const vector<string>& all_fonts = ListAvailableFonts();
607  return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
608 }
609 
610 /* static */
611 void FontUtils::GetAllRenderableCharacters(const string& font_name,
612  vector<bool>* unichar_bitmap) {
613  PangoFontInfo font_info(font_name);
614  PangoCoverage* coverage = pango_font_get_coverage(
615  font_info.ToPangoFont(), NULL);
616  CharCoverageMapToBitmap(coverage, unichar_bitmap);
617 }
618 
619 /* static */
620 void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts,
621  vector<bool>* unichar_bitmap) {
622  // Form the union of coverage maps from the fonts
623  PangoCoverage* all_coverage = pango_coverage_new();
624  tlog(1, "Processing %d fonts\n", fonts.size());
625  for (int i = 0; i < fonts.size(); ++i) {
626  PangoFontInfo font_info(fonts[i]);
627  PangoCoverage* coverage = pango_font_get_coverage(
628  font_info.ToPangoFont(), NULL);
629  // Mark off characters that any font can render.
630  pango_coverage_max(all_coverage, coverage);
631  }
632  CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
633  pango_coverage_unref(all_coverage);
634 }
635 
636 
637 // Utilities written to be backward compatible with StringRender
638 
639 /* static */
640 int FontUtils::FontScore(const TessHashMap<char32, inT64>& ch_map,
641  const string& fontname, int* raw_score,
642  vector<bool>* ch_flags) {
643  PangoFontInfo font_info;
644  if (!font_info.ParseFontDescriptionName(fontname)) {
645  tprintf("ERROR: Could not parse %s\n", fontname.c_str());
646  }
647  PangoFont* font = font_info.ToPangoFont();
648  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
649 
650  if (ch_flags) {
651  ch_flags->clear();
652  ch_flags->reserve(ch_map.size());
653  }
654  *raw_score = 0;
655  int ok_chars = 0;
656  for (TessHashMap<char32, inT64>::const_iterator it = ch_map.begin();
657  it != ch_map.end(); ++it) {
658  bool covered = (IsWhitespace(it->first) ||
659  (pango_coverage_get(coverage, it->first)
660  == PANGO_COVERAGE_EXACT));
661  if (covered) {
662  ++(*raw_score);
663  ok_chars += it->second;
664  }
665  if (ch_flags) {
666  ch_flags->push_back(covered);
667  }
668  }
669  return ok_chars;
670 }
671 
672 
673 /* static */
674 string FontUtils::BestFonts(const TessHashMap<char32, inT64>& ch_map,
675  vector<pair<const char*, vector<bool> > >* fonts) {
676  const double kMinOKFraction = 0.99;
677  // Weighted fraction of characters that must be renderable in a font to make
678  // it OK even if the raw count is not good.
679  const double kMinWeightedFraction = 0.99995;
680 
681  fonts->clear();
682  vector<vector<bool> > font_flags;
683  vector<int> font_scores;
684  vector<int> raw_scores;
685  int most_ok_chars = 0;
686  int best_raw_score = 0;
687  const vector<string>& font_names = FontUtils::ListAvailableFonts();
688  for (int i = 0; i < font_names.size(); ++i) {
689  vector<bool> ch_flags;
690  int raw_score = 0;
691  int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
692  most_ok_chars = MAX(ok_chars, most_ok_chars);
693  best_raw_score = MAX(raw_score, best_raw_score);
694 
695  font_flags.push_back(ch_flags);
696  font_scores.push_back(ok_chars);
697  raw_scores.push_back(raw_score);
698  }
699 
700  // Now select the fonts with a score above a threshold fraction
701  // of both the raw and weighted best scores. To prevent bogus fonts being
702  // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
703  // BOTH weighted and raw scores.
704  // In low character-count scripts, the issue is more getting enough fonts,
705  // when only 1 or 2 might have all those rare dingbats etc in them, so we
706  // allow a font with a very high weighted (coverage) score
707  // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
708  int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
709  int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
710  int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
711 
712  string font_list;
713  for (int i = 0; i < font_names.size(); ++i) {
714  int score = font_scores[i];
715  int raw_score = raw_scores[i];
716  if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
717  score >= override_enough) {
718  fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i]));
719  tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
720  font_names[i].c_str(),
721  100.0 * score / most_ok_chars,
722  raw_score, 100.0 * raw_score / best_raw_score);
723  font_list += font_names[i];
724  font_list += "\n";
725  } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
726  tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
727  font_names[i].c_str(),
728  100.0 * score / most_ok_chars,
729  raw_score, 100.0 * raw_score / best_raw_score);
730  }
731  }
732  return font_list;
733 }
734 
735 /* static */
736 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
737  string* font_name, vector<string>* graphemes) {
738  return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
739  graphemes);
740 }
741 
742 /* static */
743 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
744  const vector<string>& all_fonts,
745  string* font_name, vector<string>* graphemes) {
746  if (font_name) font_name->clear();
747  if (graphemes) graphemes->clear();
748  for (int i = 0; i < all_fonts.size(); ++i) {
749  PangoFontInfo font;
750  vector<string> found_graphemes;
751  ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
752  "Could not parse font desc name %s\n",
753  all_fonts[i].c_str());
754  if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
755  if (graphemes) graphemes->swap(found_graphemes);
756  if (font_name) *font_name = all_fonts[i];
757  return true;
758  }
759  }
760  return false;
761 }
762 
763 // PangoFontInfo is reinitialized, so clear the static list of fonts.
764 /* static */
765 void FontUtils::ReInit() { available_fonts_.clear(); }
766 
767 } // namespace tesseract
static bool DeleteMatchingFiles(const char *pattern)
Definition: fileio.cpp:113
bool GetSpacingProperties(const string &utf8_char, int *x_bearing, int *x_advance) const
bool CanRenderString(const char *utf8_word, int len, std::vector< string > *graphemes) const
bool ParseFontDescriptionName(const string &name)
static string BestFonts(const TessHashMap< char32, inT64 > &ch_map, std::vector< std::pair< const char *, std::vector< bool > > > *font_flag)
#define tlog(level,...)
Definition: tlog.h:33
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:182
#define TLOG_IS_ON(level)
Definition: tlog.h:39
static int FontScore(const TessHashMap< char32, inT64 > &ch_map, const string &fontname, int *raw_score, std::vector< bool > *ch_flags)
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:176
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
BOOL_PARAM_FLAG(use_only_legacy_fonts, false, "Overrides --fonts_dir and sets the known universe of fonts to" "the list in legacy_fonts.h")
int DropUncoveredChars(string *utf8_text) const
static void GetAllRenderableCharacters(std::vector< bool > *unichar_bitmap)
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:90
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
const int kDefaultResolution
static bool SelectFont(const char *utf8_word, const int utf8_len, string *font_name, std::vector< string > *graphemes)
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:208
#define MAX(x, y)
Definition: ndminx.h:24
#define tprintf(...)
Definition: tprintf.h:31
int utf8_len() const
Definition: unichar.cpp:186
static bool IsAvailableFont(const char *font_desc)
static string JoinPath(const string &prefix, const string &suffix)
Definition: fileio.cpp:83
static void HardInitFontConfig(const string &fonts_dir, const string &cache_dir)
char * utf8_str() const
Definition: unichar.cpp:125
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:53
signed int char32
Definition: normstrngs.h:27
static const std::vector< string > & ListAvailableFonts()
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:63
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir")