tesseract
3.05.02
paragraphs.h
Go to the documentation of this file.
1
/**********************************************************************
2
* File: paragraphs.h
3
* Description: Paragraph Detection data structures.
4
* Author: David Eger
5
* Created: 25 February 2011
6
*
7
* (C) Copyright 2011, Google Inc.
8
** Licensed under the Apache License, Version 2.0 (the "License");
9
** you may not use this file except in compliance with the License.
10
** You may obtain a copy of the License at
11
** http://www.apache.org/licenses/LICENSE-2.0
12
** Unless required by applicable law or agreed to in writing, software
13
** distributed under the License is distributed on an "AS IS" BASIS,
14
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
** See the License for the specific language governing permissions and
16
** limitations under the License.
17
*
18
**********************************************************************/
19
20
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
21
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
22
23
#include "
rect.h
"
24
#include "
ocrpara.h
"
25
#include "
genericvector.h
"
26
#include "
strngs.h
"
27
28
29
class
WERD
;
30
class
UNICHARSET
;
31
32
namespace
tesseract
{
33
34
class
MutableIterator;
35
36
// This structure captures all information needed about a text line for the
37
// purposes of paragraph detection. It is meant to be exceedingly light-weight
38
// so that we can easily test paragraph detection independent of the rest of
39
// Tesseract.
40
class
RowInfo
{
41
public
:
42
// Constant data derived from Tesseract output.
43
STRING
text
;
// the full UTF-8 text of the line.
44
bool
ltr
;
// whether the majority of the text is left-to-right
45
// TODO(eger) make this more fine-grained.
46
47
bool
has_leaders
;
// does the line contain leader dots (.....)?
48
bool
has_drop_cap
;
// does the line have a drop cap?
49
int
pix_ldistance
;
// distance to the left pblock boundary in pixels
50
int
pix_rdistance
;
// distance to the right pblock boundary in pixels
51
float
pix_xheight
;
// guessed xheight for the line
52
int
average_interword_space
;
// average space between words in pixels.
53
54
int
num_words
;
55
TBOX
lword_box
;
// in normalized (horiz text rows) space
56
TBOX
rword_box
;
// in normalized (horiz text rows) space
57
58
STRING
lword_text
;
// the UTF-8 text of the leftmost werd
59
STRING
rword_text
;
// the UTF-8 text of the rightmost werd
60
61
// The text of a paragraph typically starts with the start of an idea and
62
// ends with the end of an idea. Here we define paragraph as something that
63
// may have a first line indent and a body indent which may be different.
64
// Typical words that start an idea are:
65
// 1. Words in western scripts that start with
66
// a capital letter, for example "The"
67
// 2. Bulleted or numbered list items, for
68
// example "2."
69
// Typical words which end an idea are words ending in punctuation marks. In
70
// this vocabulary, each list item is represented as a paragraph.
71
bool
lword_indicates_list_item
;
72
bool
lword_likely_starts_idea
;
73
bool
lword_likely_ends_idea
;
74
75
bool
rword_indicates_list_item
;
76
bool
rword_likely_starts_idea
;
77
bool
rword_likely_ends_idea
;
78
};
79
80
// Main entry point for Paragraph Detection Algorithm.
81
//
82
// Given a set of equally spaced textlines (described by row_infos),
83
// Split them into paragraphs. See http://goto/paragraphstalk
84
//
85
// Output:
86
// row_owners - one pointer for each row, to the paragraph it belongs to.
87
// paragraphs - this is the actual list of PARA objects.
88
// models - the list of paragraph models referenced by the PARA objects.
89
// caller is responsible for deleting the models.
90
void
DetectParagraphs
(
int
debug_level,
91
GenericVector<RowInfo>
*row_infos,
92
GenericVector<PARA *>
*row_owners,
93
PARA_LIST *paragraphs,
94
GenericVector<ParagraphModel *>
*models);
95
96
// Given a MutableIterator to the start of a block, run DetectParagraphs on
97
// that block and commit the results to the underlying ROW and BLOCK structs,
98
// saving the ParagraphModels in models. Caller owns the models.
99
// We use unicharset during the function to answer questions such as "is the
100
// first letter of this word upper case?"
101
void
DetectParagraphs
(
int
debug_level,
102
bool
after_text_recognition,
103
const
MutableIterator
*block_start,
104
GenericVector<ParagraphModel *>
*models);
105
106
}
// namespace tesseract
107
108
#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
tesseract::MutableIterator
Definition:
mutableiterator.h:44
strngs.h
tesseract::RowInfo::text
STRING text
Definition:
paragraphs.h:43
GenericVector
Definition:
baseapi.h:41
tesseract::RowInfo::lword_likely_starts_idea
bool lword_likely_starts_idea
Definition:
paragraphs.h:72
tesseract::DetectParagraphs
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
Definition:
paragraphs.cpp:2265
rect.h
tesseract::RowInfo::pix_xheight
float pix_xheight
Definition:
paragraphs.h:51
tesseract::RowInfo::pix_ldistance
int pix_ldistance
Definition:
paragraphs.h:49
tesseract::RowInfo::has_drop_cap
bool has_drop_cap
Definition:
paragraphs.h:48
tesseract::RowInfo::average_interword_space
int average_interword_space
Definition:
paragraphs.h:52
tesseract::RowInfo::lword_text
STRING lword_text
Definition:
paragraphs.h:58
genericvector.h
UNICHARSET
Definition:
unicharset.h:139
tesseract::RowInfo::rword_indicates_list_item
bool rword_indicates_list_item
Definition:
paragraphs.h:75
tesseract::RowInfo::lword_likely_ends_idea
bool lword_likely_ends_idea
Definition:
paragraphs.h:73
WERD
Definition:
werd.h:60
tesseract
Definition:
baseapi.cpp:81
tesseract::RowInfo::has_leaders
bool has_leaders
Definition:
paragraphs.h:47
tesseract::RowInfo::num_words
int num_words
Definition:
paragraphs.h:54
tesseract::RowInfo::rword_box
TBOX rword_box
Definition:
paragraphs.h:56
ocrpara.h
tesseract::RowInfo::lword_box
TBOX lword_box
Definition:
paragraphs.h:55
STRING
Definition:
strngs.h:44
tesseract::RowInfo::rword_text
STRING rword_text
Definition:
paragraphs.h:59
tesseract::RowInfo::rword_likely_starts_idea
bool rword_likely_starts_idea
Definition:
paragraphs.h:76
tesseract::RowInfo
Definition:
paragraphs.h:40
TBOX
Definition:
rect.h:30
tesseract::RowInfo::pix_rdistance
int pix_rdistance
Definition:
paragraphs.h:50
tesseract::RowInfo::lword_indicates_list_item
bool lword_indicates_list_item
Definition:
paragraphs.h:71
tesseract::RowInfo::rword_likely_ends_idea
bool rword_likely_ends_idea
Definition:
paragraphs.h:77
tesseract::RowInfo::ltr
bool ltr
Definition:
paragraphs.h:44
ccmain
paragraphs.h
Generated on Mon Oct 29 2018 11:27:44 for tesseract by
1.8.14