tesseract  3.05.02
tesseract::TableFinder Class Reference

#include <tablefind.h>

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORD & bleft () const
 
const ICOORD & tright () const
 
ScrollView * MakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColSegmentGrid (ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
 
void WriteToPix (const FCOORD &reskew)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 131 of file tablefind.h.

Constructor & Destructor Documentation

◆ TableFinder()

tesseract::TableFinder::TableFinder ( )

Definition at line 164 of file tablefind.cpp.

◆ ~TableFinder()

tesseract::TableFinder::~TableFinder ( )

Definition at line 172 of file tablefind.cpp.

172  {
173  // ColPartitions and ColSegments created by this class for storage in grids
174  // need to be deleted explicitly.
175  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
176  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
177  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
178  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
179  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
180 }
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:467
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420

Member Function Documentation

◆ AdjustTableBoundaries()

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1494 of file tablefind.cpp.

1494  {
1495  // Iterate the table regions in the grid
1496  ColSegment_CLIST adjusted_tables;
1497  ColSegment_C_IT it(&adjusted_tables);
1498  ColSegmentGridSearch gsearch(&table_grid_);
1499  gsearch.StartFullSearch();
1500  ColSegment* table = NULL;
1501  while ((table = gsearch.NextFullSearch()) != NULL) {
1502  const TBOX& table_box = table->bounding_box();
1503  TBOX grown_box = table_box;
1504  GrowTableBox(table_box, &grown_box);
1505  // To prevent a table from expanding again, do not insert the
1506  // modified box back to the grid. Instead move it to a list and
1507  // and remove it from the grid. The list is moved later back to the grid.
1508  if (!grown_box.null_box()) {
1509  ColSegment* col = new ColSegment();
1510  col->InsertBox(grown_box);
1511  it.add_after_then_move(col);
1512  }
1513  gsearch.RemoveBBox();
1514  delete table;
1515  }
1516  // clear table grid to move final tables in it
1517  // TODO(nbeato): table_grid_ should already be empty. The above loop
1518  // removed everything. Maybe just assert it is empty?
1519  table_grid_.Clear();
1520  it.move_to_first();
1521  // move back final tables to table_grid_
1522  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1523  ColSegment* seg = it.extract();
1524  table_grid_.InsertBBox(true, true, seg);
1525  }
1526 }
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1528
bool null_box() const
Definition: rect.h:46
ColSegmentGrid table_grid_
Definition: tablefind.h:428
void Clear()
Definition: bbgrid.h:458
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
Definition: rect.h:30
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121

◆ AllowBlob()

bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX & blob) const
protected

Definition at line 510 of file tablefind.cpp.

510  {
511  const TBOX& box = blob.bounding_box();
512  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
513  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
514  const int median_area = global_median_xheight_ * global_median_blob_width_;
515  const double kAreaRequired = median_area * kAllowBlobArea;
516  // Keep comparisons strictly greater to disallow 0!
517  return box.height() > kHeightRequired &&
518  box.width() > kWidthRequired &&
519  box.area() > kAreaRequired;
520 }
const double kAllowBlobHeight
Definition: tablefind.cpp:59
const TBOX & bounding_box() const
Definition: blobbox.h:215
const double kAllowBlobArea
Definition: tablefind.cpp:61
inT16 width() const
Definition: rect.h:111
inT32 area() const
Definition: rect.h:118
inT16 height() const
Definition: rect.h:104
const double kAllowBlobWidth
Definition: tablefind.cpp:60
Definition: rect.h:30

◆ AllowTextPartition()

bool tesseract::TableFinder::AllowTextPartition ( const ColPartition & part) const
protected

Definition at line 497 of file tablefind.cpp.

497  {
498  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
499  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
500  const int median_area = global_median_xheight_ * global_median_blob_width_;
501  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
502  // Keep comparisons strictly greater to disallow 0!
503  return part.median_size() > kHeightRequired &&
504  part.median_width() > kWidthRequired &&
505  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
506 }
const double kAllowTextWidth
Definition: tablefind.cpp:53
const double kAllowTextArea
Definition: tablefind.cpp:54
const double kAllowTextHeight
Definition: tablefind.cpp:52

◆ BelongToOneTable()

bool tesseract::TableFinder::BelongToOneTable ( const TBOX & box1,
const TBOX & box2 
)
protected

Definition at line 1452 of file tablefind.cpp.

1452  {
1453  // Check the obvious case. Most likely not true because overlapping boxes
1454  // should already be merged, but seems like a good thing to do in case things
1455  // change.
1456  if (box1.overlap(box2))
1457  return true;
1458  // Check for ColPartitions spanning both table regions
1459  TBOX bbox = box1.bounding_union(box2);
1460  // Start a rect search on bbox
1461  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1462  rectsearch(&clean_part_grid_);
1463  rectsearch.StartRectSearch(bbox);
1464  ColPartition* part = NULL;
1465  while ((part = rectsearch.NextRectSearch()) != NULL) {
1466  const TBOX& part_box = part->bounding_box();
1467  // return true if a colpartition spanning both table regions is found
1468  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1469  !part->IsImageType())
1470  return true;
1471  }
1472  return false;
1473 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
bool overlap(const TBOX &box) const
Definition: rect.h:345
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
Definition: rect.h:30

◆ bleft()

const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 395 of file tablefind.cpp.

395  {
396  return clean_part_grid_.bleft();
397 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const ICOORD & bleft() const
Definition: bbgrid.h:72

◆ ConsecutiveBoxes()

bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX & b1,
const TBOX & b2 
)
protected

Definition at line 576 of file tablefind.cpp.

576  {
577  int x_margin = 20;
578  int y_margin = 5;
579  return (abs(b1.left() - b2.left()) < x_margin) &&
580  (abs(b1.right() - b2.right()) < x_margin) &&
581  (abs(b1.top()-b2.bottom()) < y_margin ||
582  abs(b2.top()-b1.bottom()) < y_margin);
583 }
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
inT16 top() const
Definition: rect.h:54
inT16 right() const
Definition: rect.h:75

◆ DeleteSingleColumnTables()

void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1711 of file tablefind.cpp.

1711  {
1712  int page_width = tright().x() - bleft().x();
1713  ASSERT_HOST(page_width > 0);
1714  // create an integer array to hold projection on x-axis
1715  int* table_xprojection = new int[page_width];
1716  // Iterate through all tables in the table grid
1717  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1718  table_search(&table_grid_);
1719  table_search.StartFullSearch();
1720  ColSegment* table;
1721  while ((table = table_search.NextFullSearch()) != NULL) {
1722  TBOX table_box = table->bounding_box();
1723  // reset the projection array
1724  for (int i = 0; i < page_width; i++) {
1725  table_xprojection[i] = 0;
1726  }
1727  // Start a rect search on table_box
1728  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1729  rectsearch(&clean_part_grid_);
1730  rectsearch.SetUniqueMode(true);
1731  rectsearch.StartRectSearch(table_box);
1732  ColPartition* part;
1733  while ((part = rectsearch.NextRectSearch()) != NULL) {
1734  if (!part->IsTextType())
1735  continue; // Do not consider non-text partitions
1736  if (part->flow() == BTFT_LEADER)
1737  continue; // Assume leaders are in tables
1738  TBOX part_box = part->bounding_box();
1739  // Do not consider partitions partially covered by the table
1740  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1741  continue;
1742  BLOBNBOX_CLIST* part_boxes = part->boxes();
1743  BLOBNBOX_C_IT pit(part_boxes);
1744 
1745  // Make sure overlapping blobs don't artificially inflate the number
1746  // of rows in the table. This happens frequently with things such as
1747  // decimals and split characters. Do this by assuming the column
1748  // partition is sorted mostly left to right and just clip
1749  // bounding boxes by the previous box's extent.
1750  int next_position_to_write = 0;
1751 
1752  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1753  BLOBNBOX *pblob = pit.data();
1754  // ignore blob height for the purpose of projection since we
1755  // are only interested in finding valleys
1756  int xstart = pblob->bounding_box().left();
1757  int xend = pblob->bounding_box().right();
1758 
1759  xstart = MAX(xstart, next_position_to_write);
1760  for (int i = xstart; i < xend; i++)
1761  table_xprojection[i - bleft().x()]++;
1762  next_position_to_write = xend;
1763  }
1764  }
1765  // Find largest valley between two reasonable peaks in the table
1766  if (!GapInXProjection(table_xprojection, page_width)) {
1767  table_search.RemoveBBox();
1768  delete table;
1769  }
1770  }
1771  delete[] table_xprojection;
1772 }
const TBOX & bounding_box() const
Definition: blobbox.h:215
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
ColSegmentGrid table_grid_
Definition: tablefind.h:428
inT16 x() const
access function
Definition: points.h:52
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
inT16 left() const
Definition: rect.h:68
const ICOORD & bleft() const
Definition: tablefind.cpp:395
#define MAX(x, y)
Definition: ndminx.h:24
const ICOORD & tright() const
Definition: tablefind.cpp:398
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1776
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ DisplayColPartitionConnections()

void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1958 of file tablefind.cpp.

1961  {
1962 #ifndef GRAPHICS_DISABLED
1963  // Iterate the ColPartitions in the grid.
1964  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1965  gsearch(grid);
1966  gsearch.StartFullSearch();
1967  ColPartition* part = NULL;
1968  while ((part = gsearch.NextFullSearch()) != NULL) {
1969  const TBOX& box = part->bounding_box();
1970  int left_x = box.left();
1971  int right_x = box.right();
1972  int top_y = box.top();
1973  int bottom_y = box.bottom();
1974 
1975  ColPartition* upper_part = part->nearest_neighbor_above();
1976  if (upper_part) {
1977  const TBOX& upper_box = upper_part->bounding_box();
1978  int mid_x = (left_x + right_x) / 2;
1979  int mid_y = (top_y + bottom_y) / 2;
1980  int other_x = (upper_box.left() + upper_box.right()) / 2;
1981  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1982  win->Brush(ScrollView::NONE);
1983  win->Pen(color);
1984  win->Line(mid_x, mid_y, other_x, other_y);
1985  }
1986  ColPartition* lower_part = part->nearest_neighbor_below();
1987  if (lower_part) {
1988  const TBOX& lower_box = lower_part->bounding_box();
1989  int mid_x = (left_x + right_x) / 2;
1990  int mid_y = (top_y + bottom_y) / 2;
1991  int other_x = (lower_box.left() + lower_box.right()) / 2;
1992  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1993  win->Brush(ScrollView::NONE);
1994  win->Pen(color);
1995  win->Line(mid_x, mid_y, other_x, other_y);
1996  }
1997  }
1998  win->UpdateWindow();
1999 #endif
2000 }
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
void Brush(Color color)
Definition: scrollview.cpp:732
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
void Pen(Color color)
Definition: scrollview.cpp:726
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void UpdateWindow()
Definition: scrollview.cpp:710

◆ DisplayColPartitions() [1/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1924 of file tablefind.cpp.

1927  {
1928 #ifndef GRAPHICS_DISABLED
1929  ScrollView::Color color = default_color;
1930  // Iterate the ColPartitions in the grid.
1931  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1932  gsearch(grid);
1933  gsearch.StartFullSearch();
1934  ColPartition* part = NULL;
1935  while ((part = gsearch.NextFullSearch()) != NULL) {
1936  color = default_color;
1937  if (part->type() == PT_TABLE)
1938  color = table_color;
1939 
1940  const TBOX& box = part->bounding_box();
1941  int left_x = box.left();
1942  int right_x = box.right();
1943  int top_y = box.top();
1944  int bottom_y = box.bottom();
1945  win->Brush(ScrollView::NONE);
1946  win->Pen(color);
1947  win->Rectangle(left_x, bottom_y, right_x, top_y);
1948  }
1949  win->UpdateWindow();
1950 #endif
1951 }
Definition: capi.h:97
void Brush(Color color)
Definition: scrollview.cpp:732
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
void Pen(Color color)
Definition: scrollview.cpp:726
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void UpdateWindow()
Definition: scrollview.cpp:710

◆ DisplayColPartitions() [2/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1952 of file tablefind.cpp.

1954  {
1955  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1956 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1924

◆ DisplayColSegmentGrid()

void tesseract::TableFinder::DisplayColSegmentGrid ( ScrollView * win,
ColSegmentGrid * grid,
ScrollView::Color  color 
)
protected

Definition at line 1899 of file tablefind.cpp.

1900  {
1901 #ifndef GRAPHICS_DISABLED
1902  // Iterate the ColPartitions in the grid.
1903  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1904  gsearch(grid);
1905  gsearch.StartFullSearch();
1906  ColSegment* seg = NULL;
1907  while ((seg = gsearch.NextFullSearch()) != NULL) {
1908  const TBOX& box = seg->bounding_box();
1909  int left_x = box.left();
1910  int right_x = box.right();
1911  int top_y = box.top();
1912  int bottom_y = box.bottom();
1913  win->Brush(ScrollView::NONE);
1914  win->Pen(color);
1915  win->Rectangle(left_x, bottom_y, right_x, top_y);
1916  }
1917  win->UpdateWindow();
1918 #endif
1919 }
void Brush(Color color)
Definition: scrollview.cpp:732
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
void Pen(Color color)
Definition: scrollview.cpp:726
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void UpdateWindow()
Definition: scrollview.cpp:710

◆ DisplayColSegments()

void tesseract::TableFinder::DisplayColSegments ( ScrollView * win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1879 of file tablefind.cpp.

1881  {
1882 #ifndef GRAPHICS_DISABLED
1883  win->Pen(color);
1884  win->Brush(ScrollView::NONE);
1885  ColSegment_IT it(segments);
1886  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1887  ColSegment* col = it.data();
1888  const TBOX& box = col->bounding_box();
1889  int left_x = box.left();
1890  int right_x = box.right();
1891  int top_y = box.top();
1892  int bottom_y = box.bottom();
1893  win->Rectangle(left_x, bottom_y, right_x, top_y);
1894  }
1895  win->UpdateWindow();
1896 #endif
1897 }
void Brush(Color color)
Definition: scrollview.cpp:732
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
void Pen(Color color)
Definition: scrollview.cpp:726
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void UpdateWindow()
Definition: scrollview.cpp:710

◆ FilterFalseAlarms()

void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 996 of file tablefind.cpp.

996  {
997  FilterParagraphEndings();
998  FilterHeaderAndFooter();
999  // TODO(nbeato): Fully justified text as non-table?
1000 }

◆ FilterHeaderAndFooter()

void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1082 of file tablefind.cpp.

1082  {
1083  // Consider top-most text colpartition as header and bottom most as footer
1084  ColPartition* header = NULL;
1085  ColPartition* footer = NULL;
1086  int max_top = MIN_INT32;
1087  int min_bottom = MAX_INT32;
1088  ColPartitionGridSearch gsearch(&clean_part_grid_);
1089  gsearch.StartFullSearch();
1090  ColPartition* part = NULL;
1091  while ((part = gsearch.NextFullSearch()) != NULL) {
1092  if (!part->IsTextType())
1093  continue; // Consider only text partitions
1094  int top = part->bounding_box().top();
1095  int bottom = part->bounding_box().bottom();
1096  if (top > max_top) {
1097  max_top = top;
1098  header = part;
1099  }
1100  if (bottom < min_bottom) {
1101  min_bottom = bottom;
1102  footer = part;
1103  }
1104  }
1105  if (header)
1106  header->clear_table_type();
1107  if (footer)
1108  footer->clear_table_type();
1109 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define MIN_INT32
Definition: host.h:61
#define MAX_INT32
Definition: host.h:53

◆ FilterParagraphEndings()

void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 1002 of file tablefind.cpp.

1002  {
1003  // Detect last line of paragraph
1004  // Iterate the ColPartitions in the grid.
1005  ColPartitionGridSearch gsearch(&clean_part_grid_);
1006  gsearch.StartFullSearch();
1007  ColPartition* part = NULL;
1008  while ((part = gsearch.NextFullSearch()) != NULL) {
1009  if (part->type() != PT_TABLE)
1010  continue; // Consider only table partitions
1011 
1012  // Paragraph ending should have flowing text above it.
1013  ColPartition* upper_part = part->nearest_neighbor_above();
1014  if (!upper_part)
1015  continue;
1016  if (upper_part->type() != PT_FLOWING_TEXT)
1017  continue;
1018  if (upper_part->bounding_box().width() <
1019  2 * part->bounding_box().width())
1020  continue;
1021  // Check if its the last line of a paragraph.
1022  // In most cases, a paragraph ending should be left-aligned to text line
1023  // above it. Sometimes, it could be a 2 line paragraph, in which case
1024  // the line above it is indented.
1025  // To account for that, check if the partition center is to
1026  // the left of the one above it.
1027  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1028  int upper_mid = (upper_part->bounding_box().left() +
1029  upper_part->bounding_box().right()) / 2;
1030  int current_spacing = 0; // spacing of the current line to margin
1031  int upper_spacing = 0; // spacing of the previous line to the margin
1032  if (left_to_right_language_) {
1033  // Left to right languages, use mid - left to figure out the distance
1034  // the middle is from the left margin.
1035  int left = MIN(part->bounding_box().left(),
1036  upper_part->bounding_box().left());
1037  current_spacing = mid - left;
1038  upper_spacing = upper_mid - left;
1039  } else {
1040  // Right to left languages, use right - mid to figure out the distance
1041  // the middle is from the right margin.
1042  int right = MAX(part->bounding_box().right(),
1043  upper_part->bounding_box().right());
1044  current_spacing = right - mid;
1045  upper_spacing = right - upper_mid;
1046  }
1047  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1048  continue;
1049 
1050  // Paragraphs should have similar fonts.
1051  if (!part->MatchingSizes(*upper_part) ||
1052  !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1053  kStrokeWidthConstantTolerance)) {
1054  continue;
1055  }
1056 
1057  // The last line of a paragraph should be left aligned.
1058  // TODO(nbeato): This would be untrue if the text was right aligned.
1059  // How often is that?
1060  if (part->space_to_left() >
1061  kMaxParagraphEndingLeftSpaceMultiple * part->median_size())
1062  continue;
1063  // The line above it should be right aligned (assuming justified format).
1064  // Since we can't assume justified text, we compare whitespace to text.
1065  // The above line should have majority spanning text (or the current
1066  // line could have fit on the previous line). So compare
1067  // whitespace to text.
1068  if (upper_part->bounding_box().width() <
1069  kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
1070  continue;
1071 
1072  // Ledding above the line should be less than ledding below
1073  if (part->space_above() >= part->space_below() ||
1074  part->space_above() > 2 * global_median_ledding_)
1075  continue;
1076 
1077  // If all checks failed, it is probably text.
1078  part->clear_table_type();
1079  }
1080 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define MIN(x, y)
Definition: ndminx.h:28
Definition: capi.h:97
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:125
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:129
#define MAX(x, y)
Definition: ndminx.h:24
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:135
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:143
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:144

◆ FindNeighbors()

void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 774 of file tablefind.cpp.

774  {
775  ColPartitionGridSearch gsearch(&clean_part_grid_);
776  gsearch.StartFullSearch();
777  ColPartition* part = NULL;
778  while ((part = gsearch.NextFullSearch()) != NULL) {
779  // TODO(nbeato): Rename this function, meaning is different now.
780  // IT is finding nearest neighbors its own way
781  //SetVerticalSpacing(part);
782 
783  ColPartition* upper = part->SingletonPartner(true);
784  if (upper)
785  part->set_nearest_neighbor_above(upper);
786 
787  ColPartition* lower = part->SingletonPartner(false);
788  if (lower)
789  part->set_nearest_neighbor_below(lower);
790  }
791 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418

◆ GapInXProjection()

bool tesseract::TableFinder::GapInXProjection ( int *  xprojection,
int  length 
)
protected

Definition at line 1776 of file tablefind.cpp.

1776  {
1777  // Find peak value of the histogram
1778  int peak_value = 0;
1779  for (int i = 0; i < length; i++) {
1780  if (xprojection[i] > peak_value) {
1781  peak_value = xprojection[i];
1782  }
1783  }
1784  // Peak value represents the maximum number of horizontally
1785  // overlapping colpartitions, so this can be considered as the
1786  // number of rows in the table
1787  if (peak_value < kMinRowsInTable)
1788  return false;
1789  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1790  if (peak_value >= kLargeTableRowCount)
1791  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1792  // Threshold the histogram
1793  for (int i = 0; i < length; i++) {
1794  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1795  }
1796  // Find the largest run of zeros between two ones
1797  int largest_gap = 0;
1798  int run_start = -1;
1799  for (int i = 1; i < length; i++) {
1800  // detect start of a run of zeros
1801  if (xprojection[i - 1] && !xprojection[i]) {
1802  run_start = i;
1803  }
1804  // detect end of a run of zeros and update the value of largest gap
1805  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1806  int gap = i - run_start;
1807  if (gap > largest_gap)
1808  largest_gap = gap;
1809  run_start = -1;
1810  }
1811  }
1812  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1813 }
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:109
const int kLargeTableRowCount
Definition: tablefind.cpp:112
const int kMinRowsInTable
Definition: tablefind.cpp:115
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:110
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:139

◆ GetColumnBlocks()

void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 531 of file tablefind.cpp.

532  {
533  for (int i = 0; i < gridheight(); ++i) {
534  ColPartitionSet* columns = all_columns[i];
535  if (columns != NULL) {
536  ColSegment_LIST new_blocks;
537  // Get boxes from the current vertical position on the grid
538  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
539  // Merge the new_blocks boxes into column_blocks if they are well-aligned
540  GroupColumnBlocks(&new_blocks, column_blocks);
541  }
542  }
543 }
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:546
int gridheight() const
Definition: tablefind.cpp:392

◆ GetTableColumns()

void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1281 of file tablefind.cpp.

1281  {
1282  ColSegment_IT it(table_columns);
1283  // Iterate the ColPartitions in the grid.
1284  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1285  gsearch(&clean_part_grid_);
1286  gsearch.StartFullSearch();
1287  ColPartition* part;
1288  while ((part = gsearch.NextFullSearch()) != NULL) {
1289  if (part->inside_table_column() || part->type() != PT_TABLE)
1290  continue; // prevent a partition to be assigned to multiple columns
1291  const TBOX& box = part->bounding_box();
1292  ColSegment* col = new ColSegment();
1293  col->InsertBox(box);
1294  part->set_inside_table_column(true);
1295  // Start a search below the current cell to find bottom neighbours
1296  // Note: a full search will always process things above it first, so
1297  // this should be starting at the highest cell and working its way down.
1298  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1299  vsearch(&clean_part_grid_);
1300  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1301  ColPartition* neighbor = NULL;
1302  bool found_neighbours = false;
1303  while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
1304  // only consider neighbors not assigned to any column yet
1305  if (neighbor->inside_table_column())
1306  continue;
1307  // Horizontal lines should not break the flow
1308  if (neighbor->IsHorizontalLine())
1309  continue;
1310  // presence of a non-table neighbor marks the end of current
1311  // table column
1312  if (neighbor->type() != PT_TABLE)
1313  break;
1314  // add the neighbor partition to the table column
1315  const TBOX& neighbor_box = neighbor->bounding_box();
1316  col->InsertBox(neighbor_box);
1317  neighbor->set_inside_table_column(true);
1318  found_neighbours = true;
1319  }
1320  if (found_neighbours) {
1321  it.add_after_then_move(col);
1322  } else {
1323  part->set_inside_table_column(false);
1324  delete col;
1325  }
1326  }
1327 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
Definition: capi.h:97
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75

◆ GetTableRegions()

void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1331 of file tablefind.cpp.

1332  {
1333  ColSegment_IT cit(table_columns);
1334  ColSegment_IT rit(table_regions);
1335  // Iterate through column blocks
1336  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1337  gsearch(&col_seg_grid_);
1338  gsearch.StartFullSearch();
1339  ColSegment* part;
1340  int page_height = tright().y() - bleft().y();
1341  ASSERT_HOST(page_height > 0);
1342  // create a bool array to hold projection on y-axis
1343  bool* table_region = new bool[page_height];
1344  while ((part = gsearch.NextFullSearch()) != NULL) {
1345  const TBOX& part_box = part->bounding_box();
1346  // reset the projection array
1347  for (int i = 0; i < page_height; i++) {
1348  table_region[i] = false;
1349  }
1350  // iterate through all table columns to find regions in the current
1351  // page column block
1352  cit.move_to_first();
1353  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1354  TBOX col_box = cit.data()->bounding_box();
1355  // find intersection region of table column and page column
1356  TBOX intersection_box = col_box.intersection(part_box);
1357  // project table column on the y-axis
1358  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1359  table_region[i - bleft().y()] = true;
1360  }
1361  }
1362  // set x-limits of table regions to page column width
1363  TBOX current_table_box;
1364  current_table_box.set_left(part_box.left());
1365  current_table_box.set_right(part_box.right());
1366  // go through the y-axis projection to find runs of table
1367  // regions. Each run makes one table region.
1368  for (int i = 1; i < page_height; i++) {
1369  // detect start of a table region
1370  if (!table_region[i - 1] && table_region[i]) {
1371  current_table_box.set_bottom(i + bleft().y());
1372  }
1373  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1374  // detect end of a table region
1375  if (table_region[i - 1] && !table_region[i]) {
1376  current_table_box.set_top(i + bleft().y());
1377  if (!current_table_box.null_box()) {
1378  ColSegment* seg = new ColSegment();
1379  seg->InsertBox(current_table_box);
1380  rit.add_after_then_move(seg);
1381  }
1382  }
1383  }
1384  }
1385  delete[] table_region;
1386 }
void set_bottom(int y)
Definition: rect.h:64
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
bool null_box() const
Definition: rect.h:46
void set_left(int x)
Definition: rect.h:71
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
const ICOORD & bleft() const
Definition: tablefind.cpp:395
void set_right(int x)
Definition: rect.h:78
inT16 top() const
Definition: rect.h:54
void set_top(int y)
Definition: rect.h:57
const ICOORD & tright() const
Definition: tablefind.cpp:398
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 y() const
access_function
Definition: points.h:56

◆ gridheight()

int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 392 of file tablefind.cpp.

392  {
393  return clean_part_grid_.gridheight();
394 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridheight() const
Definition: bbgrid.h:69

◆ GridMergeColumnBlocks()

void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1203 of file tablefind.cpp.

1203  {
1204  int margin = gridsize();
1205 
1206  // Iterate the Column Blocks in the grid.
1207  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1208  gsearch(&col_seg_grid_);
1209  gsearch.StartFullSearch();
1210  ColSegment* seg;
1211  while ((seg = gsearch.NextFullSearch()) != NULL) {
1212  if (seg->type() != COL_TEXT)
1213  continue; // only consider text blocks for split detection
1214  bool neighbor_found = false;
1215  bool modified = false; // Modified at least once
1216  // keep expanding current box as long as neighboring table columns
1217  // are found above or below it.
1218  do {
1219  TBOX box = seg->bounding_box();
1220  // slightly expand the search region vertically
1221  int top_range = MIN(box.top() + margin, tright().y());
1222  int bottom_range = MAX(box.bottom() - margin, bleft().y());
1223  box.set_top(top_range);
1224  box.set_bottom(bottom_range);
1225  neighbor_found = false;
1226  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1227  rectsearch(&col_seg_grid_);
1228  rectsearch.StartRectSearch(box);
1229  ColSegment* neighbor = NULL;
1230  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1231  if (neighbor == seg)
1232  continue;
1233  const TBOX& neighbor_box = neighbor->bounding_box();
1234  // If the neighbor box significantly overlaps with the current
1235  // box (due to the expansion of the current box in the
1236  // previous iteration of this loop), remove the neighbor box
1237  // and expand the current box to include it.
1238  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1239  seg->InsertBox(neighbor_box);
1240  modified = true;
1241  rectsearch.RemoveBBox();
1242  gsearch.RepositionIterator();
1243  delete neighbor;
1244  continue;
1245  }
1246  // Only expand if the neighbor box is of table type
1247  if (neighbor->type() != COL_TABLE)
1248  continue;
1249  // Insert the neighbor box into the current column block
1250  if (neighbor_box.major_x_overlap(box) &&
1251  !box.contains(neighbor_box)) {
1252  seg->InsertBox(neighbor_box);
1253  neighbor_found = true;
1254  modified = true;
1255  rectsearch.RemoveBBox();
1256  gsearch.RepositionIterator();
1257  delete neighbor;
1258  }
1259  }
1260  } while (neighbor_found);
1261  if (modified) {
1262  // Because the box has changed, it has to be removed first.
1263  gsearch.RemoveBBox();
1264  col_seg_grid_.InsertBBox(true, true, seg);
1265  gsearch.RepositionIterator();
1266  }
1267  }
1268 }
void set_bottom(int y)
Definition: rect.h:64
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
#define MIN(x, y)
Definition: ndminx.h:28
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
inT16 bottom() const
Definition: rect.h:61
bool contains(const FCOORD pt) const
Definition: rect.h:323
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
const ICOORD & bleft() const
Definition: tablefind.cpp:395
#define MAX(x, y)
Definition: ndminx.h:24
inT16 top() const
Definition: rect.h:54
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
void set_top(int y)
Definition: rect.h:57
const ICOORD & tright() const
Definition: tablefind.cpp:398
Definition: rect.h:30
inT16 y() const
access_function
Definition: points.h:56

◆ GridMergeTableRegions()

void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1394 of file tablefind.cpp.

1394  {
1395  // Iterate the table regions in the grid.
1396  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1397  gsearch(&table_grid_);
1398  gsearch.StartFullSearch();
1399  ColSegment* seg = NULL;
1400  while ((seg = gsearch.NextFullSearch()) != NULL) {
1401  bool neighbor_found = false;
1402  bool modified = false; // Modified at least once
1403  do {
1404  // Start a rectangle search x-bounded by the image and y by the table
1405  const TBOX& box = seg->bounding_box();
1406  TBOX search_region(box);
1407  search_region.set_left(bleft().x());
1408  search_region.set_right(tright().x());
1409  neighbor_found = false;
1410  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1411  rectsearch(&table_grid_);
1412  rectsearch.StartRectSearch(search_region);
1413  ColSegment* neighbor = NULL;
1414  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1415  if (neighbor == seg)
1416  continue;
1417  const TBOX& neighbor_box = neighbor->bounding_box();
1418  // Check if a neighbor box has a large overlap with the table
1419  // region. This may happen as a result of merging two table
1420  // regions in the previous iteration.
1421  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1422  seg->InsertBox(neighbor_box);
1423  rectsearch.RemoveBBox();
1424  gsearch.RepositionIterator();
1425  delete neighbor;
1426  modified = true;
1427  continue;
1428  }
1429  // Check if two table regions belong together based on a common
1430  // horizontal ruling line
1431  if (BelongToOneTable(box, neighbor_box)) {
1432  seg->InsertBox(neighbor_box);
1433  neighbor_found = true;
1434  modified = true;
1435  rectsearch.RemoveBBox();
1436  gsearch.RepositionIterator();
1437  delete neighbor;
1438  }
1439  }
1440  } while (neighbor_found);
1441  if (modified) {
1442  // Because the box has changed, it has to be removed first.
1443  gsearch.RemoveBBox();
1444  table_grid_.InsertBBox(true, true, seg);
1445  gsearch.RepositionIterator();
1446  }
1447  }
1448 }
ColSegmentGrid table_grid_
Definition: tablefind.h:428
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1452
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
const ICOORD & bleft() const
Definition: tablefind.cpp:395
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
const ICOORD & tright() const
Definition: tablefind.cpp:398
Definition: rect.h:30

◆ gridsize()

int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 386 of file tablefind.cpp.

386  {
387  return clean_part_grid_.gridsize();
388 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridsize() const
Definition: bbgrid.h:63

◆ gridwidth()

int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 389 of file tablefind.cpp.

389  {
390  return clean_part_grid_.gridwidth();
391 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridwidth() const
Definition: bbgrid.h:66

◆ GroupColumnBlocks()

void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 546 of file tablefind.cpp.

547  {
548  ColSegment_IT src_it(new_blocks);
549  ColSegment_IT dest_it(column_blocks);
550  // iterate through the source list
551  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
552  ColSegment* src_seg = src_it.data();
553  const TBOX& src_box = src_seg->bounding_box();
554  bool match_found = false;
555  // iterate through the destination list to find a matching column block
556  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
557  ColSegment* dest_seg = dest_it.data();
558  TBOX dest_box = dest_seg->bounding_box();
559  if (ConsecutiveBoxes(src_box, dest_box)) {
560  // If matching block is found, insert the current block into it
561  // and delete the source block
562  dest_seg->InsertBox(src_box);
563  match_found = true;
564  delete src_it.extract();
565  break;
566  }
567  }
568  // If no match is found, just append the source block to column_blocks
569  if (!match_found) {
570  dest_it.add_after_then_move(src_it.extract());
571  }
572  }
573 }
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:576
Definition: rect.h:30

◆ GrowTableBox()

void tesseract::TableFinder::GrowTableBox ( const TBOX & table_box,
TBOX * result_box 
)
protected

Definition at line 1528 of file tablefind.cpp.

1528  {
1529  // TODO(nbeato): The growing code is a bit excessive right now.
1530  // By removing these lines, the partitions considered need
1531  // to have some overlap or be special cases. These lines could
1532  // be added again once a check is put in place to make sure that
1533  // growing tables don't stomp on a lot of non-table partitions.
1534 
1535  // search for horizontal ruling lines within the vertical margin
1536  // int vertical_margin = kRulingVerticalMargin * gridsize();
1537  TBOX search_box = table_box;
1538  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1539  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1540  // search_box.set_top(top);
1541  // search_box.set_bottom(bottom);
1542 
1543  GrowTableToIncludePartials(table_box, search_box, result_box);
1544  GrowTableToIncludeLines(table_box, search_box, result_box);
1545  IncludeLeftOutColumnHeaders(result_box);
1546 }
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1550
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1578
Definition: rect.h:30
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1672

◆ GrowTableToIncludeLines()

void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX & table_box,
const TBOX & search_range,
TBOX * result_box 
)
protected

Definition at line 1578 of file tablefind.cpp.

1580  {
1581  ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
1582  rsearch.SetUniqueMode(true);
1583  rsearch.StartRectSearch(search_range);
1584  ColPartition* part = NULL;
1585  while ((part = rsearch.NextRectSearch()) != NULL) {
1586  // TODO(nbeato) This should also do vertical, but column
1587  // boundaries are breaking things. This function needs to be
1588  // updated to allow vertical lines as well.
1589  if (!part->IsLineType())
1590  continue;
1591  // Avoid the following function call if the result of the
1592  // function is irrelevant.
1593  const TBOX& part_box = part->bounding_box();
1594  if (result_box->contains(part_box))
1595  continue;
1596  // Include a partially overlapping horizontal line only if the
1597  // extra ColPartitions that will be included due to expansion
1598  // have large side spacing w.r.t. columns containing them.
1599  if (HLineBelongsToTable(*part, table_box))
1600  *result_box = result_box->bounding_union(part_box);
1601  // TODO(nbeato): Vertical
1602  }
1603 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
bool contains(const FCOORD pt) const
Definition: rect.h:323
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1608
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
Definition: rect.h:30

◆ GrowTableToIncludePartials()

void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX & table_box,
const TBOX & search_range,
TBOX * result_box 
)
protected

Definition at line 1550 of file tablefind.cpp.

1552  {
1553  // Rulings are in a different grid, so search 2 grids for rulings, text,
1554  // and table partitions that are not entirely within the new box.
1555  for (int i = 0; i < 2; ++i) {
1556  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1557  &leader_and_ruling_grid_;
1558  ColPartitionGridSearch rectsearch(grid);
1559  rectsearch.StartRectSearch(search_range);
1560  ColPartition* part = NULL;
1561  while ((part = rectsearch.NextRectSearch()) != NULL) {
1562  // Only include text and table types.
1563  if (part->IsImageType())
1564  continue;
1565  const TBOX& part_box = part->bounding_box();
1566  // Include partition in the table if more than half of it
1567  // is covered by the table
1568  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1569  *result_box = result_box->bounding_union(part_box);
1570  continue;
1571  }
1572  }
1573  }
1574 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
Definition: rect.h:30

◆ HasLeaderAdjacent()

bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition & part)
protected

Definition at line 954 of file tablefind.cpp.

954  {
955  if (part.flow() == BTFT_LEADER)
956  return true;
957  // Search range is left and right bounded by an offset of the
958  // median xheight. This offset is to allow some tolerance to the
959  // the leaders on the page in the event that the alignment is still
960  // a bit off.
961  const TBOX& box = part.bounding_box();
962  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
963  const int top = box.top() + search_size;
964  const int bottom = box.bottom() - search_size;
965  ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
966  for (int direction = 0; direction < 2; ++direction) {
967  bool right_to_left = (direction == 0);
968  int x = right_to_left ? box.right() : box.left();
969  hsearch.StartSideSearch(x, bottom, top);
970  ColPartition* leader = NULL;
971  while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
972  // The leader could be a horizontal ruling in the grid.
973  // Make sure it is actually a leader.
974  if (leader->flow() != BTFT_LEADER)
975  continue;
976  // This should not happen, they are in different grids.
977  ASSERT_HOST(&part != leader);
978  // Make sure the leader shares a page column with the partition,
979  // otherwise we are spreading across columns.
980  if (!part.IsInSameColumnAs(*leader))
981  break;
982  // There should be a significant vertical overlap
983  if (!leader->VSignificantCoreOverlap(part))
984  continue;
985  // Leader passed all tests, so it is adjacent.
986  return true;
987  }
988  }
989  // No leaders are adjacent to the given partition.
990  return false;
991 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:120

◆ HasWideOrNoInterWordGap()

bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition * part) const
protected

Definition at line 865 of file tablefind.cpp.

865  {
866  // Should only get text partitions.
867  ASSERT_HOST(part->IsTextType());
868  // Blob access
869  BLOBNBOX_CLIST* part_boxes = part->boxes();
870  BLOBNBOX_C_IT it(part_boxes);
871  // Check if this is a relatively small partition (such as a single word)
872  if (part->bounding_box().width() <
873  kMinBoxesInTextPartition * part->median_size() &&
874  part_boxes->length() < kMinBoxesInTextPartition)
875  return true;
876 
877  // Variables used to compute inter-blob spacing.
878  int current_x0 = -1;
879  int current_x1 = -1;
880  int previous_x1 = -1;
881  // Stores the maximum gap detected.
882  int largest_partition_gap_found = -1;
883  // Text partition gap limits. If this is text (and not a table),
884  // there should be at least one gap larger than min_gap and no gap
885  // larger than max_gap.
886  const double max_gap = kMaxGapInTextPartition * part->median_size();
887  const double min_gap = kMinMaxGapInTextPartition * part->median_size();
888 
889  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
890  BLOBNBOX* blob = it.data();
891  current_x0 = blob->bounding_box().left();
892  current_x1 = blob->bounding_box().right();
893  if (previous_x1 != -1) {
894  int gap = current_x0 - previous_x1;
895 
896  // TODO(nbeato): Boxes may overlap? Huh?
897  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
898  // on the top right of the page are filtered out with this line.
899  // Note 2: Iterating over blobs in a partition, so we are looking for
900  // spacing between the words.
901  if (gap < 0) {
902  // More likely case, the blobs slightly overlap. This can happen
903  // with diacritics (accents) or broken alphabet symbols (characters).
904  // Merge boxes together by taking max of right sides.
905  if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
906  previous_x1 = MAX(previous_x1, current_x1);
907  continue;
908  }
909  // Extreme case, blobs overlap significantly in the same partition...
910  // This should not happen often (if at all), but it does.
911  // TODO(nbeato): investigate cases when this happens.
912  else {
913  // The behavior before was to completely ignore this case.
914  }
915  }
916 
917  // If a large enough gap is found, mark it as a table cell (return true)
918  if (gap > max_gap)
919  return true;
920  if (gap > largest_partition_gap_found)
921  largest_partition_gap_found = gap;
922  }
923  previous_x1 = current_x1;
924  }
925  // Since no large gap was found, return false if the partition is too
926  // long to be a data cell
927  if (part->bounding_box().width() >
928  kMaxBoxesInDataPartition * part->median_size() ||
929  part_boxes->length() > kMaxBoxesInDataPartition)
930  return false;
931 
932  // A partition may be a single blob. In this case, it's an isolated symbol
933  // or non-text (such as a ruling or image).
934  // Detect these as table partitions? Shouldn't this be case by case?
935  // The behavior before was to ignore this, making max_partition_gap < 0
936  // and implicitly return true. Just making it explicit.
937  if (largest_partition_gap_found == -1)
938  return true;
939 
940  // return true if the maximum gap found is smaller than the minimum allowed
941  // max_gap in a text partition. This indicates that there is no significant
942  // space in the partition, hence it is likely a single word.
943  return largest_partition_gap_found < min_gap;
944 }
const TBOX & bounding_box() const
Definition: blobbox.h:215
const double kMaxGapInTextPartition
Definition: tablefind.cpp:72
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:69
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:80
inT16 left() const
Definition: rect.h:68
#define MAX(x, y)
Definition: ndminx.h:24
inT16 right() const
Definition: rect.h:75
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:76
#define ASSERT_HOST(x)
Definition: errcode.h:84
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:66

◆ HLineBelongsToTable()

bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition & part,
const TBOX & table_box 
)
protected

Definition at line 1608 of file tablefind.cpp.

1609  {
1610  if (!part.IsHorizontalLine())
1611  return false;
1612  const TBOX& part_box = part.bounding_box();
1613  if (!part_box.major_x_overlap(table_box))
1614  return false;
1615  // Do not consider top-most horizontal line since it usually
1616  // originates from noise.
1617  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1618  // have neighbors solved.
1619  // if (!part.nearest_neighbor_above())
1620  // return false;
1621  const TBOX bbox = part_box.bounding_union(table_box);
1622  // In the "unioned table" box (the table extents expanded by the line),
1623  // keep track of how many partitions have significant padding to the left
1624  // and right. If more than half of the partitions covered by the new table
1625  // have significant spacing, the line belongs to the table and the table
1626  // grows to include all of the partitions.
1627  int num_extra_partitions = 0;
1628  int extra_space_to_right = 0;
1629  int extra_space_to_left = 0;
1630  // Rulings are in a different grid, so search 2 grids for rulings, text,
1631  // and table partitions that are introduced by the new box.
1632  for (int i = 0; i < 2; ++i) {
1633  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1634  &leader_and_ruling_grid_;
1635  // Start a rect search on bbox
1636  ColPartitionGridSearch rectsearch(grid);
1637  rectsearch.SetUniqueMode(true);
1638  rectsearch.StartRectSearch(bbox);
1639  ColPartition* extra_part = NULL;
1640  while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
1641  // ColPartition already in table
1642  const TBOX& extra_part_box = extra_part->bounding_box();
1643  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1644  continue;
1645  // Non-text ColPartitions do not contribute
1646  if (extra_part->IsImageType())
1647  continue;
1648  // Consider this partition.
1649  num_extra_partitions++;
1650  // presence of a table cell is a strong hint, so just increment the scores
1651  // without looking at the spacing.
1652  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1653  extra_space_to_right++;
1654  extra_space_to_left++;
1655  continue;
1656  }
1657  int space_threshold = kSideSpaceMargin * part.median_size();
1658  if (extra_part->space_to_right() > space_threshold)
1659  extra_space_to_right++;
1660  if (extra_part->space_to_left() > space_threshold)
1661  extra_space_to_left++;
1662  }
1663  }
1664  // tprintf("%d %d %d\n",
1665  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1666  return (extra_space_to_right > num_extra_partitions / 2) ||
1667  (extra_space_to_left > num_extra_partitions / 2);
1668 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
Definition: capi.h:97
const int kSideSpaceMargin
Definition: tablefind.cpp:105
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
Definition: rect.h:30

◆ IncludeLeftOutColumnHeaders()

void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX * table_box)
protected

Definition at line 1672 of file tablefind.cpp.

1672  {
1673  // Start a search above the current table to look for column headers
1674  ColPartitionGridSearch vsearch(&clean_part_grid_);
1675  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1676  table_box->top());
1677  ColPartition* neighbor = NULL;
1678  ColPartition* previous_neighbor = NULL;
1679  while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1680  // Max distance to find a table heading.
1681  const int max_distance = kMaxColumnHeaderDistance *
1682  neighbor->median_size();
1683  int table_top = table_box->top();
1684  const TBOX& box = neighbor->bounding_box();
1685  // Do not continue if the next box is way above
1686  if (box.bottom() - table_top > max_distance)
1687  break;
1688  // Unconditionally include partitions of type TABLE or LINE
1689  // TODO(faisal): add some reasonable conditions here
1690  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1691  table_box->set_top(box.top());
1692  previous_neighbor = NULL;
1693  continue;
1694  }
1695  // If there are two text partitions, one above the other, without a table
1696  // cell on their left or right side, consider them a barrier and quit
1697  if (previous_neighbor == NULL) {
1698  previous_neighbor = neighbor;
1699  } else {
1700  const TBOX& previous_box = previous_neighbor->bounding_box();
1701  if (!box.major_y_overlap(previous_box))
1702  break;
1703  }
1704  }
1705 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
Definition: capi.h:97
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
inT16 top() const
Definition: rect.h:54
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:88
void set_top(int y)
Definition: rect.h:57
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429

◆ Init()

void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD & bottom_left,
const ICOORD & top_right 
)

Definition at line 186 of file tablefind.cpp.

187  {
188  // Initialize clean partitions list and grid
189  clean_part_grid_.Init(grid_size, bottom_left, top_right);
190  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
191  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
192  col_seg_grid_.Init(grid_size, bottom_left, top_right);
193  table_grid_.Init(grid_size, bottom_left, top_right);
194 }
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:447

◆ InitializePartitions()

void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 587 of file tablefind.cpp.

587  {
588  FindNeighbors();
589  SetPartitionSpacings(&clean_part_grid_, all_columns);
590  SetGlobalSpacings(&clean_part_grid_);
591 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:717
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:594

◆ InsertCleanPartitions()

void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid * grid,
TO_BLOCK * block 
)

Definition at line 198 of file tablefind.cpp.

199  {
200  // Calculate stats. This lets us filter partitions in AllowTextPartition()
201  // and filter blobs in AllowBlob().
202  SetGlobalSpacings(grid);
203 
204  // Iterate the ColPartitions in the grid.
205  ColPartitionGridSearch gsearch(grid);
206  gsearch.SetUniqueMode(true);
207  gsearch.StartFullSearch();
208  ColPartition* part = NULL;
209  while ((part = gsearch.NextFullSearch()) != NULL) {
210  // Reject partitions with nothing useful inside of them.
211  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
212  continue;
213  ColPartition* clean_part = part->ShallowCopy();
214  ColPartition* leader_part = NULL;
215  if (part->IsLineType()) {
216  InsertRulingPartition(clean_part);
217  continue;
218  }
219  // Insert all non-text partitions to clean_parts
220  if (!part->IsTextType()) {
221  InsertImagePartition(clean_part);
222  continue;
223  }
224  // Insert text colpartitions after removing noisy components from them
225  // The leaders are split into a separate grid.
226  BLOBNBOX_CLIST* part_boxes = part->boxes();
227  BLOBNBOX_C_IT pit(part_boxes);
228  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
229  BLOBNBOX *pblob = pit.data();
230  // Bad blobs... happens in UNLV set.
231  // news.3G1, page 17 (around x=6)
232  if (!AllowBlob(*pblob))
233  continue;
234  if (pblob->flow() == BTFT_LEADER) {
235  if (leader_part == NULL) {
236  leader_part = part->ShallowCopy();
237  leader_part->set_flow(BTFT_LEADER);
238  }
239  leader_part->AddBox(pblob);
240  } else if (pblob->region_type() != BRT_NOISE) {
241  clean_part->AddBox(pblob);
242  }
243  }
244  clean_part->ComputeLimits();
245  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
246  InsertTextPartition(clean_part);
247  SplitAndInsertFragmentedTextPartition(fragmented);
248  if (leader_part != NULL) {
249  // TODO(nbeato): Note that ComputeLimits does not update the column
250  // information. So the leader may appear to span more columns than it
251  // really does later on when IsInSameColumnAs gets called to test
252  // for adjacent leaders.
253  leader_part->ComputeLimits();
254  InsertLeaderPartition(leader_part);
255  }
256  }
257 
258  // Make the partition partners better for upper and lower neighbors.
261 }
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:510
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:418
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:426
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
BlobTextFlowType flow() const
Definition: blobbox.h:280
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:283
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:429
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:717
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:402
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:444
void RefinePartitionPartners(bool get_desperate)
BlobRegionType region_type() const
Definition: blobbox.h:268

◆ InsertFragmentedTextPartition()

void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 410 of file tablefind.cpp.

410  {
411  ASSERT_HOST(part != NULL);
412  if (AllowTextPartition(*part)) {
413  fragmented_text_grid_.InsertBBox(true, true, part);
414  } else {
415  delete part;
416  }
417 }
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:497
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ InsertImagePartition()

void tesseract::TableFinder::InsertImagePartition ( ColPartition part)
protected

Definition at line 429 of file tablefind.cpp.

429  {
430  // NOTE: If images are placed into a different grid in the future,
431  // the function SetPartitionSpacings needs to be updated. It should
432  // be the only thing that cares about image partitions.
433  clean_part_grid_.InsertBBox(true, true, part);
434 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489

◆ InsertLeaderPartition()

void tesseract::TableFinder::InsertLeaderPartition ( ColPartition part)
protected

Definition at line 418 of file tablefind.cpp.

418  {
419  ASSERT_HOST(part != NULL);
420  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
421  leader_and_ruling_grid_.InsertBBox(true, true, part);
422  } else {
423  delete part;
424  }
425 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ InsertRulingPartition()

void tesseract::TableFinder::InsertRulingPartition ( ColPartition part)
protected

Definition at line 426 of file tablefind.cpp.

426  {
427  leader_and_ruling_grid_.InsertBBox(true, true, part);
428 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489

◆ InsertTextPartition()

void tesseract::TableFinder::InsertTextPartition ( ColPartition part)
protected

Definition at line 402 of file tablefind.cpp.

402  {
403  ASSERT_HOST(part != NULL);
404  if (AllowTextPartition(*part)) {
405  clean_part_grid_.InsertBBox(true, true, part);
406  } else {
407  delete part;
408  }
409 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:497
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ LocateTables()

void tesseract::TableFinder::LocateTables ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb,
const FCOORD reskew 
)

Definition at line 264 of file tablefind.cpp.

267  {
268  // initialize spacing, neighbors, and columns
269  InitializePartitions(all_columns);
270 
271 #ifndef GRAPHICS_DISABLED
272  if (textord_show_tables) {
273  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
279 
280  table_win = MakeWindow(100, 300, "Fragmented Text");
282  }
283 #endif // GRAPHICS_DISABLED
284 
285  // mark, filter, and smooth candidate table partitions
287 
288  // Make single-column blocks from good_columns_ partitions. col_segments are
289  // moved to a grid later which takes the ownership
290  ColSegment_LIST column_blocks;
291  GetColumnBlocks(all_columns, &column_blocks);
292  // Set the ratio of candidate table partitions in each column
293  SetColumnsType(&column_blocks);
294 
295  // Move column segments to col_seg_grid_
296  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
297 
298  // Detect split in column layout that might have occurred due to the
299  // presence of a table. In such a case, merge the corresponding columns.
301 
302  // Group horizontally overlapping table partitions into table columns.
303  // table_columns created here get deleted at the end of this method.
304  ColSegment_LIST table_columns;
305  GetTableColumns(&table_columns);
306 
307  // Within each column, mark the range table regions occupy based on the
308  // table columns detected. table_regions are moved to a grid later which
309  // takes the ownership
310  ColSegment_LIST table_regions;
311  GetTableRegions(&table_columns, &table_regions);
312 
313 #ifndef GRAPHICS_DISABLED
315  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
316  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
317  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
318  }
319 #endif // GRAPHICS_DISABLED
320 
321  // Merge table regions across columns for tables spanning multiple
322  // columns
323  MoveColSegmentsToGrid(&table_regions, &table_grid_);
325 
326  // Adjust table boundaries by including nearby horizontal lines and left
327  // out column headers
330 
332  // Remove false alarms consisting of a single column
334 
335 #ifndef GRAPHICS_DISABLED
336  if (textord_show_tables) {
337  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
339  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
340  table_grid_.DisplayBoxes(table_win);
341  }
342 #endif // GRAPHICS_DISABLED
343 
344  // Find table grid structure and reject tables that are malformed.
345  RecognizeTables();
347  RecognizeTables();
348 
349 #ifndef GRAPHICS_DISABLED
350  if (textord_show_tables) {
351  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
354  table_grid_.DisplayBoxes(table_win);
355  }
356 #endif // GRAPHICS_DISABLED
357  } else {
358  // Remove false alarms consisting of a single column
359  // TODO(nbeato): verify this is a NOP after structured table rejection.
360  // Right now it isn't. If the recognize function is doing what it is
361  // supposed to do, this function is obsolete.
363 
364 #ifndef GRAPHICS_DISABLED
365  if (textord_show_tables) {
366  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
369  table_grid_.DisplayBoxes(table_win);
370  }
371 #endif // GRAPHICS_DISABLED
372  }
373 
375  WriteToPix(reskew);
376 
377  // Merge all colpartitions in table regions to make them a single
378  // colpartition and revert types of isolated table cells not
379  // assigned to any table to their original types.
380  MakeTableBlocks(grid, all_columns, width_cb);
381 }
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
bool textord_show_tables
Definition: tablefind.cpp:147
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1184
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:531
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1331
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:2079
ColSegmentGrid table_grid_
Definition: tablefind.h:428
bool textord_tablefind_show_mark
Definition: tablefind.cpp:149
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:616
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1281
bool textord_tablefind_recognize_tables
Definition: tablefind.cpp:153
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1924
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1958
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
bool textord_dump_table_images
Definition: tablefind.cpp:146
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:587
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:526
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1879
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1151
void WriteToPix(const FCOORD &reskew)
Definition: tablefind.cpp:2006

◆ MakeTableBlocks()

void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb 
)
protected

Definition at line 2079 of file tablefind.cpp.

2081  {
2082  // Since we have table blocks already, remove table tags from all
2083  // colpartitions
2084  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2085  gsearch(grid);
2086  gsearch.StartFullSearch();
2087  ColPartition* part = NULL;
2088 
2089  while ((part = gsearch.NextFullSearch()) != NULL) {
2090  if (part->type() == PT_TABLE) {
2091  part->clear_table_type();
2092  }
2093  }
2094  // Now make a single colpartition out of each table block and remove
2095  // all colpartitions contained within a table
2096  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2097  table_search(&table_grid_);
2098  table_search.StartFullSearch();
2099  ColSegment* table;
2100  while ((table = table_search.NextFullSearch()) != NULL) {
2101  const TBOX& table_box = table->bounding_box();
2102  // Start a rect search on table_box
2103  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2104  rectsearch(grid);
2105  rectsearch.StartRectSearch(table_box);
2106  ColPartition* part;
2107  ColPartition* table_partition = NULL;
2108  while ((part = rectsearch.NextRectSearch()) != NULL) {
2109  // Do not consider image partitions
2110  if (!part->IsTextType())
2111  continue;
2112  TBOX part_box = part->bounding_box();
2113  // Include partition in the table if more than half of it
2114  // is covered by the table
2115  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2116  rectsearch.RemoveBBox();
2117  if (table_partition) {
2118  table_partition->Absorb(part, width_cb);
2119  } else {
2120  table_partition = part;
2121  }
2122  }
2123  }
2124  // Insert table colpartition back to part_grid_
2125  if (table_partition) {
2126  // To match the columns used when transforming to blocks, the new table
2127  // partition must have its first and last column set at the grid y that
2128  // corresponds to its bottom.
2129  const TBOX& table_box = table_partition->bounding_box();
2130  int grid_x, grid_y;
2131  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2132  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2133  table_partition->set_table_type();
2134  table_partition->set_blob_type(BRT_TEXT);
2135  table_partition->set_flow(BTFT_CHAIN);
2136  table_partition->SetBlobTypes();
2137  grid->InsertBBox(true, true, table_partition);
2138  }
2139  }
2140 }
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
ColSegmentGrid table_grid_
Definition: tablefind.h:428
Definition: capi.h:97
inT16 bottom() const
Definition: rect.h:61
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
inT16 left() const
Definition: rect.h:68
Definition: rect.h:30

◆ MakeWindow()

ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 526 of file tablefind.cpp.

526  {
527  return clean_part_grid_.MakeWindow(x, y, window_name);
528 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:592

◆ MarkPartitionsUsingLocalInformation()

void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 835 of file tablefind.cpp.

835  {
836  // Iterate the ColPartitions in the grid.
837  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
838  gsearch(&clean_part_grid_);
839  gsearch.StartFullSearch();
840  ColPartition* part = NULL;
841  while ((part = gsearch.NextFullSearch()) != NULL) {
842  if (!part->IsTextType()) // Only consider text partitions
843  continue;
844  // Only consider partitions in dominant font size or smaller
845  if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
846  continue;
847  // Mark partitions with a large gap, or no significant gap as
848  // table partitions.
849  // Comments: It produces several false alarms at:
850  // - last line of a paragraph (fixed)
851  // - single word section headings
852  // - page headers and footers
853  // - numbered equations
854  // - line drawing regions
855  // TODO(faisal): detect and fix above-mentioned cases
856  if (HasWideOrNoInterWordGap(part) ||
857  HasLeaderAdjacent(*part)) {
858  part->set_table_type();
859  }
860  }
861 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const double kMaxTableCellXheight
Definition: tablefind.cpp:84
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:954
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:865

◆ MarkTablePartitions()

void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 797 of file tablefind.cpp.

797  {
800  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
804  }
807  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
811  }
814  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
818  }
821  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
825  }
826 }
bool textord_show_tables
Definition: tablefind.cpp:147
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:835
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
bool textord_tablefind_show_mark
Definition: tablefind.cpp:149
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1924
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:526

◆ MoveColSegmentsToGrid()

void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid col_seg_grid 
)
protected

Definition at line 1184 of file tablefind.cpp.

1185  {
1186  ColSegment_IT it(segments);
1187  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1188  ColSegment* seg = it.extract();
1189  col_seg_grid->InsertBBox(true, true, seg);
1190  }
1191 }

◆ RecognizeTables()

void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1825 of file tablefind.cpp.

1825  {
1826  ScrollView* table_win = NULL;
1827  if (textord_show_tables) {
1828  table_win = MakeWindow(0, 0, "Table Structure");
1831  // table_grid_.DisplayBoxes(table_win);
1832  }
1833 
1834 
1835  TableRecognizer recognizer;
1836  recognizer.Init();
1837  recognizer.set_line_grid(&leader_and_ruling_grid_);
1838  recognizer.set_text_grid(&fragmented_text_grid_);
1839  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1840  recognizer.set_min_height(1.5 * gridheight());
1841  // Loop over all of the tables and try to fit them.
1842  // Store the good tables here.
1843  ColSegment_CLIST good_tables;
1844  ColSegment_C_IT good_it(&good_tables);
1845 
1847  gsearch.StartFullSearch();
1848  ColSegment* found_table = NULL;
1849  while ((found_table = gsearch.NextFullSearch()) != NULL) {
1850  gsearch.RemoveBBox();
1851 
1852  // The goal is to make the tables persistent in a list.
1853  // When that happens, this will move into the search loop.
1854  const TBOX& found_box = found_table->bounding_box();
1855  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1856 
1857  // Process a table. Good tables are inserted into the grid again later on
1858  // We can't change boxes in the grid while it is running a search.
1859  if (table_structure != NULL) {
1860  if (textord_show_tables) {
1861  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1862  }
1863  found_table->set_bounding_box(table_structure->bounding_box());
1864  delete table_structure;
1865  good_it.add_after_then_move(found_table);
1866  } else {
1867  delete found_table;
1868  }
1869  }
1870  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1871 
1872  // At this point, the grid is empty. We can safely insert the good tables
1873  // back into grid.
1874  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1875  table_grid_.InsertBBox(true, true, good_it.extract());
1876 }
bool textord_show_tables
Definition: tablefind.cpp:147
ColSegmentGrid table_grid_
Definition: tablefind.h:428
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1924
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
int gridheight() const
Definition: tablefind.cpp:392
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:526
Definition: rect.h:30
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121

◆ set_global_median_blob_width()

void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 767 of file tablefind.cpp.

767  {
769 }

◆ set_global_median_ledding()

void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 770 of file tablefind.cpp.

770  {
771  global_median_ledding_ = ledding;
772 }

◆ set_global_median_xheight()

void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 764 of file tablefind.cpp.

764  {
765  global_median_xheight_ = xheight;
766 }

◆ set_left_to_right_language()

void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 182 of file tablefind.cpp.

182  {
183  left_to_right_language_ = order;
184 }

◆ set_resolution()

void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 138 of file tablefind.h.

138  {
139  resolution_ = resolution;
140  }

◆ SetColumnsType()

void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1151 of file tablefind.cpp.

1151  {
1152  ColSegment_IT it(column_blocks);
1153  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1154  ColSegment* seg = it.data();
1155  TBOX box = seg->bounding_box();
1156  int num_table_cells = 0;
1157  int num_text_cells = 0;
1158  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1159  rsearch(&clean_part_grid_);
1160  rsearch.SetUniqueMode(true);
1161  rsearch.StartRectSearch(box);
1162  ColPartition* part = NULL;
1163  while ((part = rsearch.NextRectSearch()) != NULL) {
1164  if (part->type() == PT_TABLE) {
1165  num_table_cells++;
1166  } else if (part->type() == PT_FLOWING_TEXT) {
1167  num_text_cells++;
1168  }
1169  }
1170  // If a column block has no text or table partition in it, it is not needed
1171  // for table detection.
1172  if (!num_table_cells && !num_text_cells) {
1173  delete it.extract();
1174  } else {
1175  seg->set_num_table_cells(num_table_cells);
1176  seg->set_num_text_cells(num_text_cells);
1177  // set column type based on the ratio of table to text cells
1178  seg->set_type();
1179  }
1180  }
1181 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
Definition: capi.h:97
Definition: rect.h:30

◆ SetGlobalSpacings()

void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid grid)
protected

Definition at line 717 of file tablefind.cpp.

717  {
718  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
719  STATS width_stats(0, kMaxBlobWidth + 1);
720  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
721  // Iterate the ColPartitions in the grid.
722  ColPartitionGridSearch gsearch(grid);
723  gsearch.SetUniqueMode(true);
724  gsearch.StartFullSearch();
725  ColPartition* part = NULL;
726  while ((part = gsearch.NextFullSearch()) != NULL) {
727  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
728  // ComputeLimits needs to get called somewhere outside of TableFinder
729  // to make sure the partitions are properly initialized.
730  // When this is called, SmoothPartitionPartners dies in an assert after
731  // table find runs. Alternative solution.
732  // part->ComputeLimits();
733  if (part->IsTextType()) {
734  // xheight_stats.add(part->median_size(), part->boxes_count());
735  // width_stats.add(part->median_width(), part->boxes_count());
736 
737  // This loop can be removed when above issues are fixed.
738  // Replace it with the 2 lines commented out above.
739  BLOBNBOX_C_IT it(part->boxes());
740  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
741  xheight_stats.add(it.data()->bounding_box().height(), 1);
742  width_stats.add(it.data()->bounding_box().width(), 1);
743  }
744 
745  ledding_stats.add(part->space_above(), 1);
746  ledding_stats.add(part->space_below(), 1);
747  }
748  }
749  // Set estimates based on median of statistics obtained
750  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
751  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
752  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
753  #ifndef GRAPHICS_DISABLED
755  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
756  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
757  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
758  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
759  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
760  }
761  #endif // GRAPHICS_DISABLED
762 }
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:770
const int kMaxBlobWidth
Definition: tablefind.cpp:43
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
bool textord_tablefind_show_stats
Definition: tablefind.cpp:151
const int kMaxVerticalSpacing
Definition: tablefind.cpp:41
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:767
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:764
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:526
Definition: statistc.h:33

◆ SetPartitionSpacings()

void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid grid,
ColPartitionSet **  all_columns 
)
staticprotected

Definition at line 594 of file tablefind.cpp.

595  {
596  // Iterate the ColPartitions in the grid.
597  ColPartitionGridSearch gsearch(grid);
598  gsearch.StartFullSearch();
599  ColPartition* part = NULL;
600  while ((part = gsearch.NextFullSearch()) != NULL) {
601  ColPartitionSet* columns = all_columns[gsearch.GridY()];
602  TBOX box = part->bounding_box();
603  int y = part->MidY();
604  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
605  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
606  // set distance from left column as space to the left
607  if (left_column) {
608  int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
609  part->set_space_to_left(left_space);
610  }
611  // set distance from right column as space to the right
612  if (right_column) {
613  int right_space = MAX(0, right_column->RightAtY(y) - box.right());
614  part->set_space_to_right(right_space);
615  }
616 
617  // Look for images that may be closer.
618  // NOTE: used to be part_grid_, might cause issues now
619  ColPartitionGridSearch hsearch(grid);
620  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
621  ColPartition* neighbor = NULL;
622  while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
623  if (neighbor->type() == PT_PULLOUT_IMAGE ||
624  neighbor->type() == PT_FLOWING_IMAGE ||
625  neighbor->type() == PT_HEADING_IMAGE) {
626  int right = neighbor->bounding_box().right();
627  if (right < box.left()) {
628  int space = MIN(box.left() - right, part->space_to_left());
629  part->set_space_to_left(space);
630  }
631  }
632  }
633  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
634  neighbor = NULL;
635  while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
636  if (neighbor->type() == PT_PULLOUT_IMAGE ||
637  neighbor->type() == PT_FLOWING_IMAGE ||
638  neighbor->type() == PT_HEADING_IMAGE) {
639  int left = neighbor->bounding_box().left();
640  if (left > box.right()) {
641  int space = MIN(left - box.right(), part->space_to_right());
642  part->set_space_to_right(space);
643  }
644  }
645  }
646 
647  ColPartition* upper_part = part->SingletonPartner(true);
648  if (upper_part) {
649  int space = MAX(0, upper_part->bounding_box().bottom() -
650  part->bounding_box().bottom());
651  part->set_space_above(space);
652  } else {
653  // TODO(nbeato): What constitutes a good value?
654  // 0 is the default value when not set, explicitly noting it needs to
655  // be something else.
656  part->set_space_above(MAX_INT32);
657  }
658 
659  ColPartition* lower_part = part->SingletonPartner(false);
660  if (lower_part) {
661  int space = MAX(0, part->bounding_box().bottom() -
662  lower_part->bounding_box().bottom());
663  part->set_space_below(space);
664  } else {
665  // TODO(nbeato): What constitutes a good value?
666  // 0 is the default value when not set, explicitly noting it needs to
667  // be something else.
668  part->set_space_below(MAX_INT32);
669  }
670  }
671 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
#define MIN(x, y)
Definition: ndminx.h:28
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
#define MAX(x, y)
Definition: ndminx.h:24
#define MAX_INT32
Definition: host.h:53
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75

◆ SetVerticalSpacing()

void tesseract::TableFinder::SetVerticalSpacing ( ColPartition part)
protected

Definition at line 674 of file tablefind.cpp.

674  {
675  TBOX box = part->bounding_box();
676  int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
677  int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
678  box.set_top(top_range);
679  box.set_bottom(bottom_range);
680 
681  TBOX part_box = part->bounding_box();
682  // Start a rect search
683  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
684  rectsearch(&clean_part_grid_);
685  rectsearch.StartRectSearch(box);
686  ColPartition* neighbor;
687  int min_space_above = kMaxVerticalSpacing;
688  int min_space_below = kMaxVerticalSpacing;
689  ColPartition* above_neighbor = NULL;
690  ColPartition* below_neighbor = NULL;
691  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
692  if (neighbor == part)
693  continue;
694  TBOX neighbor_box = neighbor->bounding_box();
695  if (neighbor_box.major_x_overlap(part_box)) {
696  int gap = abs(part->median_bottom() - neighbor->median_bottom());
697  // If neighbor is below current partition
698  if (neighbor_box.top() < part_box.bottom() &&
699  gap < min_space_below) {
700  min_space_below = gap;
701  below_neighbor = neighbor;
702  } // If neighbor is above current partition
703  else if (part_box.top() < neighbor_box.bottom() &&
704  gap < min_space_above) {
705  min_space_above = gap;
706  above_neighbor = neighbor;
707  }
708  }
709  }
710  part->set_space_above(min_space_above);
711  part->set_space_below(min_space_below);
712  part->set_nearest_neighbor_above(above_neighbor);
713  part->set_nearest_neighbor_below(below_neighbor);
714 }
void set_bottom(int y)
Definition: rect.h:64
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define MIN(x, y)
Definition: ndminx.h:28
const int kMaxVerticalSpacing
Definition: tablefind.cpp:41
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
inT16 bottom() const
Definition: rect.h:61
const ICOORD & bleft() const
Definition: tablefind.cpp:395
#define MAX(x, y)
Definition: ndminx.h:24
inT16 top() const
Definition: rect.h:54
void set_top(int y)
Definition: rect.h:57
const ICOORD & tright() const
Definition: tablefind.cpp:398
Definition: rect.h:30
inT16 y() const
access_function
Definition: points.h:56

◆ SmoothTablePartitionRuns()

void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1116 of file tablefind.cpp.

1116  {
1117  // Iterate the ColPartitions in the grid.
1119  gsearch.StartFullSearch();
1120  ColPartition* part = NULL;
1121  while ((part = gsearch.NextFullSearch()) != NULL) {
1122  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1123  continue; // Consider only text partitions
1124  ColPartition* upper_part = part->nearest_neighbor_above();
1125  ColPartition* lower_part = part->nearest_neighbor_below();
1126  if (!upper_part || !lower_part)
1127  continue;
1128  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1129  part->set_table_type();
1130  }
1131 
1132  // Pass 2, do the opposite. If both the upper and lower neighbors
1133  // exist and are not tables, this probably shouldn't be a table.
1134  gsearch.StartFullSearch();
1135  part = NULL;
1136  while ((part = gsearch.NextFullSearch()) != NULL) {
1137  if (part->type() != PT_TABLE)
1138  continue; // Consider only table partitions
1139  ColPartition* upper_part = part->nearest_neighbor_above();
1140  ColPartition* lower_part = part->nearest_neighbor_below();
1141 
1142  // table can't be by itself
1143  if ((upper_part && upper_part->type() != PT_TABLE) &&
1144  (lower_part && lower_part->type() != PT_TABLE)) {
1145  part->clear_table_type();
1146  }
1147  }
1148 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
Definition: capi.h:97

◆ SplitAndInsertFragmentedTextPartition()

void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition * part)
protected

Definition at line 444 of file tablefind.cpp.

444  {
445  ASSERT_HOST(part != NULL);
446  // Bye bye empty partitions!
447  if (part->boxes()->empty()) {
448  delete part;
449  return;
450  }
451 
452  // The AllowBlob function prevents this.
453  ASSERT_HOST(part->median_width() > 0);
454  const double kThreshold = part->median_width() * kSplitPartitionSize;
455 
456  ColPartition* right_part = part;
457  bool found_split = true;
458  while (found_split) {
459  found_split = false;
460  BLOBNBOX_C_IT box_it(right_part->boxes());
461  // Blobs are sorted left side first. If blobs overlap,
462  // the previous blob may have a "more right" right side.
463  // Account for this by always keeping the largest "right"
464  // so far.
465  int previous_right = MIN_INT32;
466 
467  // Look for the next split in the partition.
468  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
469  const TBOX& box = box_it.data()->bounding_box();
470  if (previous_right != MIN_INT32 &&
471  box.left() - previous_right > kThreshold) {
472  // We have a split position. Split the partition in two pieces.
473  // Insert the left piece in the grid and keep processing the right.
474  int mid_x = (box.left() + previous_right) / 2;
475  ColPartition* left_part = right_part;
476  right_part = left_part->SplitAt(mid_x);
477 
478  InsertFragmentedTextPartition(left_part);
479  found_split = true;
480  break;
481  }
482 
483  // The right side of the previous blobs.
484  previous_right = MAX(previous_right, box.right());
485  }
486  }
487  // When a split is not found, the right part is minimized
488  // as much as possible, so process it.
489  InsertFragmentedTextPartition(right_part);
490 }
const double kSplitPartitionSize
Definition: tablefind.cpp:47
inT16 left() const
Definition: rect.h:68
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:410
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN_INT32
Definition: host.h:61
TBOX
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ tright()

const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 398 of file tablefind.cpp.

398  {
399  return clean_part_grid_.tright();
400 }
const ICOORD & tright() const
Definition: bbgrid.h:75
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418

◆ WriteToPix()

void tesseract::TableFinder::WriteToPix ( const FCOORD & reskew)
protected

Definition at line 2006 of file tablefind.cpp.

2006  {
2007  // Input file must be named test1.tif
2008  PIX* pix = pixRead("test1.tif");
2009  if (!pix) {
2010  tprintf("Input file test1.tif not found.\n");
2011  return;
2012  }
2013  int img_height = pixGetHeight(pix);
2014  int img_width = pixGetWidth(pix);
2015  // Maximum number of text or table partitions
2016  int num_boxes = 10;
2017  BOXA* text_box_array = boxaCreate(num_boxes);
2018  BOXA* table_box_array = boxaCreate(num_boxes);
2019  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2020  gsearch(&clean_part_grid_);
2021  gsearch.StartFullSearch();
2022  ColPartition* part;
2023  // load colpartitions into text_box_array and table_box_array
2024  while ((part = gsearch.NextFullSearch()) != NULL) {
2025  TBOX box = part->bounding_box();
2026  box.rotate_large(reskew);
2027  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2028  box.right() - box.left(),
2029  box.top() - box.bottom());
2030  if (part->type() == PT_TABLE)
2031  boxaAddBox(table_box_array, lept_box, L_INSERT);
2032  else
2033  boxaAddBox(text_box_array, lept_box, L_INSERT);
2034  }
2035  // draw colpartitions on the output image
2036  PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
2037  out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
2038 
2039  BOXA* table_array = boxaCreate(num_boxes);
2040  // text file containing detected table bounding boxes
2041  FILE* fptr = fopen("tess-table.txt", "wb");
2042  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2043  table_search(&table_grid_);
2044  table_search.StartFullSearch();
2045  ColSegment* table;
2046  // load table boxes to table_array and write them to text file as well
2047  while ((table = table_search.NextFullSearch()) != NULL) {
2048  TBOX box = table->bounding_box();
2049  box.rotate_large(reskew);
2050  // Since deskewing introduces negative coordinates, reskewing
2051  // might not completely recover from that since both steps enlarge
2052  // the actual box. Hence a box that undergoes deskewing/reskewing
2053  // may go out of image boundaries. Crop a table box if needed to
2054  // contain it inside the image dimensions.
2055  box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
2056  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2057  box.right() - box.left(),
2058  box.top() - box.bottom());
2059  boxaAddBox(table_array, lept_box, L_INSERT);
2060  fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
2061  img_height - box.top(), box.right(), img_height - box.bottom());
2062  }
2063  fclose(fptr);
2064  // paint table boxes on the debug image
2065  out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
2066 
2067  pixWrite("out.png", out, IFF_PNG);
2068  // memory cleanup
2069  boxaDestroy(&text_box_array);
2070  boxaDestroy(&table_box_array);
2071  boxaDestroy(&table_array);
2072  pixDestroy(&pix);
2073  pixDestroy(&out);
2074 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ColSegmentGrid table_grid_
Definition: tablefind.h:428
Definition: capi.h:97
inT16 bottom() const
Definition: rect.h:61
inT16 left() const
Definition: rect.h:68
#define tprintf(...)
Definition: tprintf.h:31
inT16 top() const
Definition: rect.h:54
void rotate_large(const FCOORD &vec)
Definition: rect.cpp:72
TBOX
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87

Member Data Documentation

◆ clean_part_grid_

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 418 of file tablefind.h.

◆ col_seg_grid_

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 426 of file tablefind.h.

◆ fragmented_text_grid_

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 424 of file tablefind.h.

◆ global_median_blob_width_

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 412 of file tablefind.h.

◆ global_median_ledding_

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 414 of file tablefind.h.

◆ global_median_xheight_

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 410 of file tablefind.h.

◆ leader_and_ruling_grid_

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 420 of file tablefind.h.

◆ left_to_right_language_

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 430 of file tablefind.h.

◆ resolution_

int tesseract::TableFinder::resolution_
protected

Definition at line 408 of file tablefind.h.

◆ table_grid_

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 428 of file tablefind.h.


The documentation for this class was generated from the following files: