tesseract  3.05.02
bitvector.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: bitvector.cpp
5 // Description: Class replacement for BITVECTOR.
6 // Author: Ray Smith
7 // Created: Mon Jan 10 17:45:01 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "bitvector.h"
23 #include <string.h>
24 #include "helpers.h"
25 #include "ndminx.h"
26 
27 namespace tesseract {
28 
29 // Fast lookup table to get the first least significant set bit in a byte.
30 // For zero, the table has 255, but since it is a special case, most code
31 // that uses this table will check for zero before looking up lsb_index_.
32 const uinT8 BitVector::lsb_index_[256] = {
33  255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
34  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
35  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
36  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
37  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
38  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
39  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
40  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
41  7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
42  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
43  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
44  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
45  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
46  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
47  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
48  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
49 };
50 
51 // Fast lookup table to get the residual bits after zeroing the first (lowest)
52 // set bit in a byte.
53 const uinT8 BitVector::lsb_eroded_[256] = {
54  0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6,
55  0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
56  0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16,
57  0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,
58  0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26,
59  0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,
60  0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36,
61  0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,
62  0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46,
63  0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,
64  0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56,
65  0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,
66  0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66,
67  0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,
68  0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76,
69  0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,
70  0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86,
71  0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,
72  0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96,
73  0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,
74  0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6,
75  0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,
76  0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6,
77  0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,
78  0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6,
79  0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,
80  0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6,
81  0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,
82  0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6,
83  0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,
84  0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6,
85  0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe
86 };
87 
88 // Fast lookup table to give the number of set bits in a byte.
89 const int BitVector::hamming_table_[256] = {
90  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
91  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
93  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
94  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
95  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
97  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
98  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
99  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
100  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
101  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
102  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
103  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
104  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
105  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
106 };
107 
108 
109 BitVector::BitVector() : bit_size_(0), array_(NULL) {}
110 
111 BitVector::BitVector(int length) : bit_size_(length) {
112  array_ = new uinT32[WordLength()];
113  SetAllFalse();
114 }
115 
116 BitVector::BitVector(const BitVector& src) : bit_size_(src.bit_size_) {
117  array_ = new uinT32[WordLength()];
118  memcpy(array_, src.array_, ByteLength());
119 }
120 
122  Alloc(src.bit_size_);
123  memcpy(array_, src.array_, ByteLength());
124  return *this;
125 }
126 
128  delete [] array_;
129 }
130 
131 // Initializes the array to length * false.
132 void BitVector::Init(int length) {
133  Alloc(length);
134  SetAllFalse();
135 }
136 
137 // Writes to the given file. Returns false in case of error.
138 bool BitVector::Serialize(FILE* fp) const {
139  if (fwrite(&bit_size_, sizeof(bit_size_), 1, fp) != 1) return false;
140  int wordlen = WordLength();
141  if (static_cast<int>(fwrite(array_, sizeof(*array_), wordlen, fp)) != wordlen)
142  return false;
143  return true;
144 }
145 
146 // Reads from the given file. Returns false in case of error.
147 // If swap is true, assumes a big/little-endian swap is needed.
148 bool BitVector::DeSerialize(bool swap, FILE* fp) {
149  uinT32 new_bit_size;
150  if (fread(&new_bit_size, sizeof(new_bit_size), 1, fp) != 1) return false;
151  if (swap) {
152  ReverseN(&new_bit_size, sizeof(new_bit_size));
153  }
154  Alloc(new_bit_size);
155  int wordlen = WordLength();
156  if (static_cast<int>(fread(array_, sizeof(*array_), wordlen, fp)) != wordlen)
157  return false;
158  if (swap) {
159  for (int i = 0; i < wordlen; ++i)
160  ReverseN(&array_[i], sizeof(array_[i]));
161  }
162  return true;
163 }
164 
166  memset(array_, 0, ByteLength());
167 }
169  memset(array_, ~0, ByteLength());
170 }
171 
172 // Returns the index of the next set bit after the given index.
173 // Useful for quickly iterating through the set bits in a sparse vector.
174 int BitVector::NextSetBit(int prev_bit) const {
175  // Move on to the next bit.
176  int next_bit = prev_bit + 1;
177  if (next_bit >= bit_size_) return -1;
178  // Check the remains of the word containing the next_bit first.
179  int next_word = WordIndex(next_bit);
180  int bit_index = next_word * kBitFactor;
181  int word_end = bit_index + kBitFactor;
182  uinT32 word = array_[next_word];
183  uinT8 byte = word & 0xff;
184  while (bit_index < word_end) {
185  if (bit_index + 8 > next_bit && byte != 0) {
186  while (bit_index + lsb_index_[byte] < next_bit && byte != 0)
187  byte = lsb_eroded_[byte];
188  if (byte != 0)
189  return bit_index + lsb_index_[byte];
190  }
191  word >>= 8;
192  bit_index += 8;
193  byte = word & 0xff;
194  }
195  // next_word didn't contain a 1, so find the next word with set bit.
196  ++next_word;
197  int wordlen = WordLength();
198  while (next_word < wordlen && (word = array_[next_word]) == 0) {
199  ++next_word;
200  bit_index += kBitFactor;
201  }
202  if (bit_index >= bit_size_) return -1;
203  // Find the first non-zero byte within the word.
204  while ((word & 0xff) == 0) {
205  word >>= 8;
206  bit_index += 8;
207  }
208  return bit_index + lsb_index_[word & 0xff];
209 }
210 
211 // Returns the number of set bits in the vector.
213  int wordlen = WordLength();
214  int total_bits = 0;
215  for (int w = 0; w < wordlen; ++w) {
216  uinT32 word = array_[w];
217  for (int i = 0; i < 4; ++i) {
218  total_bits += hamming_table_[word & 0xff];
219  word >>= 8;
220  }
221  }
222  return total_bits;
223 }
224 
225 // Logical in-place operations on whole bit vectors. Tries to do something
226 // sensible if they aren't the same size, but they should be really.
227 void BitVector::operator|=(const BitVector& other) {
228  int length = MIN(WordLength(), other.WordLength());
229  for (int w = 0; w < length; ++w)
230  array_[w] |= other.array_[w];
231 }
232 void BitVector::operator&=(const BitVector& other) {
233  int length = MIN(WordLength(), other.WordLength());
234  for (int w = 0; w < length; ++w)
235  array_[w] &= other.array_[w];
236  for (int w = WordLength() - 1; w >= length; --w)
237  array_[w] = 0;
238 }
239 void BitVector::operator^=(const BitVector& other) {
240  int length = MIN(WordLength(), other.WordLength());
241  for (int w = 0; w < length; ++w)
242  array_[w] ^= other.array_[w];
243 }
244 // Set subtraction *this = v1 - v2.
245 void BitVector::SetSubtract(const BitVector& v1, const BitVector& v2) {
246  Alloc(v1.size());
247  int length = MIN(v1.WordLength(), v2.WordLength());
248  for (int w = 0; w < length; ++w)
249  array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]);
250  for (int w = WordLength() - 1; w >= length; --w)
251  array_[w] = v1.array_[w];
252 }
253 
254 // Allocates memory for a vector of the given length.
255 // Reallocates if the array is a different size, larger or smaller.
256 void BitVector::Alloc(int length) {
257  int initial_wordlength = WordLength();
258  bit_size_ = length;
259  int new_wordlength = WordLength();
260  if (new_wordlength != initial_wordlength) {
261  delete [] array_;
262  array_ = new uinT32[new_wordlength];
263  }
264 }
265 
266 
267 } // namespace tesseract.
268 
269 
int size() const
Definition: bitvector.h:57
int NumSetBits() const
Definition: bitvector.cpp:212
void Init(int length)
Definition: bitvector.cpp:132
#define MIN(x, y)
Definition: ndminx.h:28
unsigned char uinT8
Definition: host.h:32
void operator|=(const BitVector &other)
Definition: bitvector.cpp:227
void operator^=(const BitVector &other)
Definition: bitvector.cpp:239
static const uinT8 lsb_index_[256]
Definition: bitvector.h:39
bool Serialize(FILE *fp) const
Definition: bitvector.cpp:138
bool DeSerialize(bool swap, FILE *fp)
Definition: bitvector.cpp:148
unsigned int uinT32
Definition: host.h:36
static const uinT8 lsb_eroded_[256]
Definition: bitvector.h:42
BitVector & operator=(const BitVector &src)
Definition: bitvector.cpp:121
int NextSetBit(int prev_bit) const
Definition: bitvector.cpp:174
void SetSubtract(const BitVector &v1, const BitVector &v2)
Definition: bitvector.cpp:245
static const int hamming_table_[256]
Definition: bitvector.h:44
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
void operator &=(const BitVector &other)