tesseract  3.05.02
openclwrapper.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 #ifdef _WIN32
11 #include <io.h>
12 #else
13 #include <sys/types.h>
14 #include <unistd.h>
15 #endif
16 #include <float.h>
17 
18 #include "openclwrapper.h"
19 #include "oclkernels.h"
20 
21 // for micro-benchmark
22 #include "otsuthr.h"
23 #include "thresholder.h"
24 
25 #if ON_APPLE
26 #include <mach/mach_time.h>
27 #include <stdio.h>
28 #endif
29 
30 #define CALLOC LEPT_CALLOC
31 #define FREE LEPT_FREE
32 
33 #ifdef USE_OPENCL
34 
36 GPUEnv OpenclDevice::gpuEnv;
37 
38 bool OpenclDevice::deviceIsSelected = false;
39 ds_device OpenclDevice::selectedDevice;
40 
41 int OpenclDevice::isInited = 0;
42 
43 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
44 
45 static const l_uint32 lmask32[] = {
46  0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
47  0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
48  0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
49  0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
50  0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
51  0xfffffffe, 0xffffffff};
52 
53 static const l_uint32 rmask32[] = {
54  0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
55  0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
56  0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
57  0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
58  0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
59  0x7fffffff, 0xffffffff};
60 
61 static cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
62 static cl_mem pixThBuffer; //output from thresholdtopix calculation
63 static cl_int clStatus;
64 static KernelEnv rEnv;
65 
66 #define DS_TAG_VERSION "<version>"
67 #define DS_TAG_VERSION_END "</version>"
68 #define DS_TAG_DEVICE "<device>"
69 #define DS_TAG_DEVICE_END "</device>"
70 #define DS_TAG_SCORE "<score>"
71 #define DS_TAG_SCORE_END "</score>"
72 #define DS_TAG_DEVICE_TYPE "<type>"
73 #define DS_TAG_DEVICE_TYPE_END "</type>"
74 #define DS_TAG_DEVICE_NAME "<name>"
75 #define DS_TAG_DEVICE_NAME_END "</name>"
76 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
77 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
78 
79 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
80 
81 #define DS_DEVICE_NAME_LENGTH 256
82 
83 typedef enum { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY } ds_evaluation_type;
84 
85 typedef struct {
86  unsigned int numDevices;
87  ds_device *devices;
88  const char *version;
89 } ds_profile;
90 
91 typedef enum {
92  DS_SUCCESS = 0,
93  DS_INVALID_PROFILE = 1000,
94  DS_MEMORY_ERROR,
95  DS_INVALID_PERF_EVALUATOR_TYPE,
96  DS_INVALID_PERF_EVALUATOR,
97  DS_PERF_EVALUATOR_ERROR,
98  DS_FILE_ERROR,
99  DS_UNKNOWN_DEVICE_TYPE,
100  DS_PROFILE_FILE_ERROR,
101  DS_SCORE_SERIALIZER_ERROR,
102  DS_SCORE_DESERIALIZER_ERROR
103 } ds_status;
104 
105 // Pointer to a function that calculates the score of a device (ex:
106 // device->score) update the data size of score. The encoding and the format
107 // of the score data is implementation defined. The function should return
108 // DS_SUCCESS if there's no error to be reported.
109 typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data);
110 
111 // deallocate memory used by score
112 typedef ds_status (*ds_score_release)(void *score);
113 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
114  ds_status status = DS_SUCCESS;
115  if (profile != NULL) {
116  if (profile->devices != NULL && sr != NULL) {
117  unsigned int i;
118  for (i = 0; i < profile->numDevices; i++) {
119  free(profile->devices[i].oclDeviceName);
120  free(profile->devices[i].oclDriverVersion);
121  status = sr(profile->devices[i].score);
122  if (status != DS_SUCCESS) break;
123  }
124  free(profile->devices);
125  }
126  free(profile);
127  }
128  return status;
129 }
130 
131 static ds_status initDSProfile(ds_profile **p, const char *version) {
132  int numDevices;
133  cl_uint numPlatforms;
134  cl_platform_id *platforms = NULL;
135  cl_device_id *devices = NULL;
136  ds_status status = DS_SUCCESS;
137  unsigned int next;
138  unsigned int i;
139 
140  if (p == NULL) return DS_INVALID_PROFILE;
141 
142  ds_profile *profile = (ds_profile *)malloc(sizeof(ds_profile));
143  if (profile == NULL) return DS_MEMORY_ERROR;
144 
145  memset(profile, 0, sizeof(ds_profile));
146 
147  clGetPlatformIDs(0, NULL, &numPlatforms);
148 
149  if (numPlatforms > 0) {
150  platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
151  if (platforms == NULL) {
152  status = DS_MEMORY_ERROR;
153  goto cleanup;
154  }
155  clGetPlatformIDs(numPlatforms, platforms, NULL);
156  }
157 
158  numDevices = 0;
159  for (i = 0; i < (unsigned int)numPlatforms; i++) {
160  cl_uint num;
161  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num);
162  numDevices += num;
163  }
164 
165  if (numDevices > 0) {
166  devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
167  if (devices == NULL) {
168  status = DS_MEMORY_ERROR;
169  goto cleanup;
170  }
171  }
172 
173  profile->numDevices =
174  numDevices + 1; // +1 to numDevices to include the native CPU
175  profile->devices =
176  (ds_device *)malloc(profile->numDevices * sizeof(ds_device));
177  if (profile->devices == NULL) {
178  profile->numDevices = 0;
179  status = DS_MEMORY_ERROR;
180  goto cleanup;
181  }
182  memset(profile->devices, 0, profile->numDevices * sizeof(ds_device));
183 
184  next = 0;
185  for (i = 0; i < (unsigned int)numPlatforms; i++) {
186  cl_uint num;
187  unsigned j;
188  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num);
189  for (j = 0; j < num; j++, next++) {
190  char buffer[DS_DEVICE_NAME_LENGTH];
191  size_t length;
192 
193  profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
194  profile->devices[next].oclDeviceID = devices[j];
195 
196  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
197  DS_DEVICE_NAME_LENGTH, &buffer, NULL);
198  length = strlen(buffer);
199  profile->devices[next].oclDeviceName = (char *)malloc(length + 1);
200  memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
201 
202  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
203  DS_DEVICE_NAME_LENGTH, &buffer, NULL);
204  length = strlen(buffer);
205  profile->devices[next].oclDriverVersion = (char *)malloc(length + 1);
206  memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
207  }
208  }
209  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
210  profile->version = version;
211 
212 cleanup:
213  free(platforms);
214  free(devices);
215  if (status == DS_SUCCESS) {
216  *p = profile;
217  } else {
218  if (profile) {
219  free(profile->devices);
220  free(profile);
221  }
222  }
223  return status;
224 }
225 
226 static ds_status profileDevices(ds_profile *profile,
227  const ds_evaluation_type type,
228  ds_perf_evaluator evaluator,
229  void *evaluatorData, unsigned int *numUpdates) {
230  ds_status status = DS_SUCCESS;
231  unsigned int i;
232  unsigned int updates = 0;
233 
234  if (profile == NULL) {
235  return DS_INVALID_PROFILE;
236  }
237  if (evaluator == NULL) {
238  return DS_INVALID_PERF_EVALUATOR;
239  }
240 
241  for (i = 0; i < profile->numDevices; i++) {
242  ds_status evaluatorStatus;
243 
244  switch (type) {
245  case DS_EVALUATE_NEW_ONLY:
246  if (profile->devices[i].score != NULL) break;
247  // else fall through
248  case DS_EVALUATE_ALL:
249  evaluatorStatus = evaluator(profile->devices + i, evaluatorData);
250  if (evaluatorStatus != DS_SUCCESS) {
251  status = evaluatorStatus;
252  return status;
253  }
254  updates++;
255  break;
256  default:
257  return DS_INVALID_PERF_EVALUATOR_TYPE;
258  break;
259  };
260  }
261  if (numUpdates) *numUpdates = updates;
262  return status;
263 }
264 
265 static const char *findString(const char *contentStart, const char *contentEnd,
266  const char *string) {
267  size_t stringLength;
268  const char *currentPosition;
269  const char *found = NULL;
270  stringLength = strlen(string);
271  currentPosition = contentStart;
272  for (currentPosition = contentStart; currentPosition < contentEnd;
273  currentPosition++) {
274  if (*currentPosition == string[0]) {
275  if (currentPosition + stringLength < contentEnd) {
276  if (strncmp(currentPosition, string, stringLength) == 0) {
277  found = currentPosition;
278  break;
279  }
280  }
281  }
282  }
283  return found;
284 }
285 
286 static ds_status readProFile(const char *fileName, char **content,
287  size_t *contentSize) {
288  size_t size = 0;
289 
290  *contentSize = 0;
291  *content = NULL;
292 
293  FILE *input = fopen(fileName, "rb");
294  if (input == NULL) {
295  return DS_FILE_ERROR;
296  }
297 
298  fseek(input, 0L, SEEK_END);
299  size = ftell(input);
300  rewind(input);
301  char *binary = (char *)malloc(size);
302  if (binary == NULL) {
303  fclose(input);
304  return DS_FILE_ERROR;
305  }
306  fread(binary, sizeof(char), size, input);
307  fclose(input);
308 
309  *contentSize = size;
310  *content = binary;
311  return DS_SUCCESS;
312 }
313 
314 typedef ds_status (*ds_score_deserializer)(ds_device *device,
315  const unsigned char *serializedScore,
316  unsigned int serializedScoreSize);
317 
318 static ds_status readProfileFromFile(ds_profile *profile,
319  ds_score_deserializer deserializer,
320  const char *file) {
321  ds_status status = DS_SUCCESS;
322  char *contentStart = NULL;
323  const char *contentEnd = NULL;
324  size_t contentSize;
325 
326  if (profile == NULL) return DS_INVALID_PROFILE;
327 
328  status = readProFile(file, &contentStart, &contentSize);
329  if (status == DS_SUCCESS) {
330  const char *currentPosition;
331  const char *dataStart;
332  const char *dataEnd;
333 
334  contentEnd = contentStart + contentSize;
335  currentPosition = contentStart;
336 
337  // parse the version string
338  dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
339  if (dataStart == NULL) {
340  status = DS_PROFILE_FILE_ERROR;
341  goto cleanup;
342  }
343  dataStart += strlen(DS_TAG_VERSION);
344 
345  dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
346  if (dataEnd == NULL) {
347  status = DS_PROFILE_FILE_ERROR;
348  goto cleanup;
349  }
350 
351  size_t versionStringLength = strlen(profile->version);
352  if (versionStringLength + dataStart != dataEnd ||
353  strncmp(profile->version, dataStart, versionStringLength) != 0) {
354  // version mismatch
355  status = DS_PROFILE_FILE_ERROR;
356  goto cleanup;
357  }
358  currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
359 
360  // parse the device information
361  while (1) {
362  unsigned int i;
363 
364  const char *deviceTypeStart;
365  const char *deviceTypeEnd;
366  ds_device_type deviceType;
367 
368  const char *deviceNameStart;
369  const char *deviceNameEnd;
370 
371  const char *deviceScoreStart;
372  const char *deviceScoreEnd;
373 
374  const char *deviceDriverStart;
375  const char *deviceDriverEnd;
376 
377  dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
378  if (dataStart == NULL) {
379  // nothing useful remain, quit...
380  break;
381  }
382  dataStart += strlen(DS_TAG_DEVICE);
383  dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
384  if (dataEnd == NULL) {
385  status = DS_PROFILE_FILE_ERROR;
386  goto cleanup;
387  }
388 
389  // parse the device type
390  deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
391  if (deviceTypeStart == NULL) {
392  status = DS_PROFILE_FILE_ERROR;
393  goto cleanup;
394  }
395  deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
396  deviceTypeEnd =
397  findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
398  if (deviceTypeEnd == NULL) {
399  status = DS_PROFILE_FILE_ERROR;
400  goto cleanup;
401  }
402  memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
403 
404  // parse the device name
405  if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
406  deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
407  if (deviceNameStart == NULL) {
408  status = DS_PROFILE_FILE_ERROR;
409  goto cleanup;
410  }
411  deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
412  deviceNameEnd =
413  findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
414  if (deviceNameEnd == NULL) {
415  status = DS_PROFILE_FILE_ERROR;
416  goto cleanup;
417  }
418 
419  deviceDriverStart =
420  findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
421  if (deviceDriverStart == NULL) {
422  status = DS_PROFILE_FILE_ERROR;
423  goto cleanup;
424  }
425  deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
426  deviceDriverEnd = findString(deviceDriverStart, contentEnd,
427  DS_TAG_DEVICE_DRIVER_VERSION_END);
428  if (deviceDriverEnd == NULL) {
429  status = DS_PROFILE_FILE_ERROR;
430  goto cleanup;
431  }
432 
433  // check if this device is on the system
434  for (i = 0; i < profile->numDevices; i++) {
435  if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
436  size_t actualDeviceNameLength;
437  size_t driverVersionLength;
438 
439  actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
440  driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
441  if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
442  deviceDriverStart + driverVersionLength == deviceDriverEnd &&
443  strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
444  actualDeviceNameLength) == 0 &&
445  strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
446  driverVersionLength) == 0) {
447  deviceScoreStart =
448  findString(dataStart, contentEnd, DS_TAG_SCORE);
449  if (deviceNameStart == NULL) {
450  status = DS_PROFILE_FILE_ERROR;
451  goto cleanup;
452  }
453  deviceScoreStart += strlen(DS_TAG_SCORE);
454  deviceScoreEnd =
455  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
456  status = deserializer(profile->devices + i,
457  (const unsigned char *)deviceScoreStart,
458  deviceScoreEnd - deviceScoreStart);
459  if (status != DS_SUCCESS) {
460  goto cleanup;
461  }
462  }
463  }
464  }
465  } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
466  for (i = 0; i < profile->numDevices; i++) {
467  if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
468  deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
469  if (deviceScoreStart == NULL) {
470  status = DS_PROFILE_FILE_ERROR;
471  goto cleanup;
472  }
473  deviceScoreStart += strlen(DS_TAG_SCORE);
474  deviceScoreEnd =
475  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
476  status = deserializer(profile->devices + i,
477  (const unsigned char *)deviceScoreStart,
478  deviceScoreEnd - deviceScoreStart);
479  if (status != DS_SUCCESS) {
480  goto cleanup;
481  }
482  }
483  }
484  }
485 
486  // skip over the current one to find the next device
487  currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
488  }
489  }
490 cleanup:
491  free(contentStart);
492  return status;
493 }
494 
495 typedef ds_status (*ds_score_serializer)(ds_device *device,
496  void **serializedScore,
497  unsigned int *serializedScoreSize);
498 static ds_status writeProfileToFile(ds_profile *profile,
499  ds_score_serializer serializer,
500  const char *file) {
501  ds_status status = DS_SUCCESS;
502 
503  if (profile == NULL) return DS_INVALID_PROFILE;
504 
505  FILE *profileFile = fopen(file, "wb");
506  if (profileFile == NULL) {
507  status = DS_FILE_ERROR;
508  } else {
509  unsigned int i;
510 
511  // write version string
512  fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
513  fwrite(profile->version, sizeof(char), strlen(profile->version),
514  profileFile);
515  fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END),
516  profileFile);
517  fwrite("\n", sizeof(char), 1, profileFile);
518 
519  for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
520  void *serializedScore;
521  unsigned int serializedScoreSize;
522 
523  fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
524 
525  fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE),
526  profileFile);
527  fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
528  fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char),
529  strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
530 
531  switch (profile->devices[i].type) {
532  case DS_DEVICE_NATIVE_CPU: {
533  // There's no need to emit a device name for the native CPU device.
534  /*
535  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
536  profileFile);
537  fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
538  strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
539  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
540  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
541  */
542  } break;
543  case DS_DEVICE_OPENCL_DEVICE: {
544  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
545  profileFile);
546  fwrite(profile->devices[i].oclDeviceName, sizeof(char),
547  strlen(profile->devices[i].oclDeviceName), profileFile);
548  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
549  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
550 
551  fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char),
552  strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
553  fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
554  strlen(profile->devices[i].oclDriverVersion), profileFile);
555  fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
556  strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
557  } break;
558  default:
559  status = DS_UNKNOWN_DEVICE_TYPE;
560  break;
561  };
562 
563  fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
564  status = serializer(profile->devices + i, &serializedScore,
565  &serializedScoreSize);
566  if (status == DS_SUCCESS && serializedScore != NULL &&
567  serializedScoreSize > 0) {
568  fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
569  free(serializedScore);
570  }
571  fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END),
572  profileFile);
573  fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END),
574  profileFile);
575  fwrite("\n", sizeof(char), 1, profileFile);
576  }
577  fclose(profileFile);
578  }
579  return status;
580 }
581 
582 // substitute invalid characters in device name with _
583 static void legalizeFileName( char *fileName) {
584  //printf("fileName: %s\n", fileName);
585  const char *invalidChars =
586  "/\?:*\"><| "; // space is valid but can cause headaches
587  // for each invalid char
588  for (unsigned i = 0; i < strlen(invalidChars); i++) {
589  char invalidStr[4];
590  invalidStr[0] = invalidChars[i];
591  invalidStr[1] = '\0';
592  //printf("eliminating %s\n", invalidStr);
593  //char *pos = strstr(fileName, invalidStr);
594  // initial ./ is valid for present directory
595  //if (*pos == '.') pos++;
596  //if (*pos == '/') pos++;
597  for (char *pos = strstr(fileName, invalidStr); pos != NULL;
598  pos = strstr(pos + 1, invalidStr)) {
599  // printf("\tfound: %s, ", pos);
600  pos[0] = '_';
601  // printf("fileName: %s\n", fileName);
602  }
603  }
604 }
605 
606 static void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
607  //printf("[DS] populateGPUEnvFromDevice\n");
608  size_t size;
609  gpuInfo->mnIsUserCreated = 1;
610  // device
611  gpuInfo->mpDevID = device;
612  gpuInfo->mpArryDevsID = new cl_device_id[1];
613  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
614  clStatus =
615  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
616  sizeof(cl_device_type), &gpuInfo->mDevType, &size);
617  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
618  // platform
619  clStatus =
620  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
621  sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
622  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
623  // context
624  cl_context_properties props[3];
625  props[0] = CL_CONTEXT_PLATFORM;
626  props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
627  props[2] = 0;
628  gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL,
629  NULL, &clStatus);
630  CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
631  // queue
632  cl_command_queue_properties queueProperties = 0;
633  gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
634  CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
635 }
636 
637 int OpenclDevice::LoadOpencl()
638 {
639 #ifdef WIN32
640  HINSTANCE HOpenclDll = NULL;
641  void *OpenclDll = NULL;
642  // fprintf(stderr, " LoadOpenclDllxx... \n");
643  OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
644  OpenclDll = LoadLibrary("openCL.dll");
645  if (!static_cast<HINSTANCE>(OpenclDll)) {
646  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
647  FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
648  return 0;
649  }
650  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
651 #endif
652  return 1;
653 }
654 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
655 {
656  envInfo->mpkContext = gpuEnv.mpContext;
657  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
658  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
659 
660  return 1;
661 }
662 
663 static cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer,
664  size_t nElements, cl_mem_flags flags,
665  cl_int *pStatus)
666 {
667  cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
668  nElements * sizeof(l_uint32), hostbuffer, pStatus);
669 
670  return membuffer;
671 }
672 
673 static
674 Pix *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, Pix *pixd, Pix *pixs,
675  int elements, cl_mem_flags flags, bool memcopy = false,
676  bool sync = true) {
677  PROCNAME("mapOutputCLBuffer");
678  if (!pixd) {
679  if (memcopy) {
680  if ((pixd = pixCreateTemplate(pixs)) == NULL)
681  (Pix *)ERROR_PTR("pixd not made", procName, NULL);
682  } else {
683  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
684  pixGetDepth(pixs))) == NULL)
685  (Pix *)ERROR_PTR("pixd not made", procName, NULL);
686  }
687  }
688  l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(
689  rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
690  elements * sizeof(l_uint32), 0, NULL, NULL, NULL);
691 
692  if (memcopy) {
693  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
694  } else {
695  pixSetData(pixd, pValues);
696  }
697 
698  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, NULL,
699  NULL);
700 
701  if (sync) {
702  clFinish(rEnv.mpkCmdQueue);
703  }
704 
705  return pixd;
706 }
707 
708 void OpenclDevice::releaseMorphCLBuffers()
709 {
710  if (pixdCLIntermediate != NULL) clReleaseMemObject(pixdCLIntermediate);
711  if (pixsCLBuffer != NULL) clReleaseMemObject(pixsCLBuffer);
712  if (pixdCLBuffer != NULL) clReleaseMemObject(pixdCLBuffer);
713  if (pixThBuffer != NULL) clReleaseMemObject(pixThBuffer);
714  pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = NULL;
715 }
716 
717 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs)
718 {
719  SetKernelEnv( &rEnv );
720 
721  if (pixThBuffer != NULL) {
722  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl * h,
723  CL_MEM_ALLOC_HOST_PTR, &clStatus);
724 
725  // Get the output from ThresholdToPix operation
726  clStatus =
727  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
728  sizeof(l_uint32) * wpl * h, 0, NULL, NULL);
729  }
730  else
731  {
732  //Get data from the source image
733  l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
734  memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
735 
736  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
737  }
738 
739  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl * h,
740  CL_MEM_ALLOC_HOST_PTR, &clStatus);
741 
742  pixdCLIntermediate = allocateZeroCopyBuffer(
743  rEnv, NULL, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
744 
745  return (int)clStatus;
746 }
747 
748 int OpenclDevice::InitEnv()
749 {
750 //PERF_COUNT_START("OD::InitEnv")
751 // printf("[OD] OpenclDevice::InitEnv()\n");
752 #ifdef SAL_WIN32
753  while( 1 )
754  {
755  if( 1 == LoadOpencl() )
756  break;
757  }
758 PERF_COUNT_SUB("LoadOpencl")
759 #endif
760  // sets up environment, compiles programs
761 
762  InitOpenclRunEnv_DeviceSelection( 0 );
763 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
764 //PERF_COUNT_END
765  return 1;
766 }
767 
768 int OpenclDevice::ReleaseOpenclRunEnv()
769 {
770  ReleaseOpenclEnv( &gpuEnv );
771 #ifdef SAL_WIN32
772  FreeOpenclDll();
773 #endif
774  return 1;
775 }
776 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
777 {
778  if ( kCount < 1 )
779  fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
780  strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
781  gpuEnv.mnKernelCount++;
782  return 0;
783 }
784 int OpenclDevice::RegistOpenclKernel()
785 {
786  if ( !gpuEnv.mnIsUserCreated )
787  memset( &gpuEnv, 0, sizeof(gpuEnv) );
788 
789  gpuEnv.mnFileCount = 0; //argc;
790  gpuEnv.mnKernelCount = 0UL;
791 
792  AddKernelConfig( 1, (const char*) "oclAverageSub1" );
793  return 0;
794 }
795 
796 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
797 //PERF_COUNT_START("InitOpenclRunEnv_DS")
798  if (!isInited) {
799  // after programs compiled, selects best device
800  ds_device bestDevice_DS = getDeviceSelection( );
801 //PERF_COUNT_SUB("called getDeviceSelection()")
802  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
803  // overwrite global static GPUEnv with new device
804  if (selectedDeviceIsOpenCL() ) {
805  //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
806  populateGPUEnvFromDevice( &gpuEnv, bestDevice );
807  gpuEnv.mnFileCount = 0; //argc;
808  gpuEnv.mnKernelCount = 0UL;
809 //PERF_COUNT_SUB("populate gpuEnv")
810  CompileKernelFile(&gpuEnv, "");
811 //PERF_COUNT_SUB("CompileKernelFile")
812  } else {
813  //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
814  }
815  isInited = 1;
816  }
817 //PERF_COUNT_END
818  return 0;
819 }
820 
821 
822 OpenclDevice::OpenclDevice()
823 {
824  //InitEnv();
825 }
826 
827 OpenclDevice::~OpenclDevice()
828 {
829  //ReleaseOpenclRunEnv();
830 }
831 
832 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
833 {
834  int i = 0;
835  int clStatus = 0;
836 
837  if ( !isInited )
838  {
839  return 1;
840  }
841 
842  for ( i = 0; i < gpuEnv.mnFileCount; i++ )
843  {
844  if ( gpuEnv.mpArryPrograms[i] )
845  {
846  clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
847  CHECK_OPENCL( clStatus, "clReleaseProgram" );
848  gpuEnv.mpArryPrograms[i] = NULL;
849  }
850  }
851  if ( gpuEnv.mpCmdQueue )
852  {
853  clReleaseCommandQueue( gpuEnv.mpCmdQueue );
854  gpuEnv.mpCmdQueue = NULL;
855  }
856  if ( gpuEnv.mpContext )
857  {
858  clReleaseContext( gpuEnv.mpContext );
859  gpuEnv.mpContext = NULL;
860  }
861  isInited = 0;
862  gpuInfo->mnIsUserCreated = 0;
863  delete[] gpuInfo->mpArryDevsID;
864  return 1;
865 }
866 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
867 {
868  unsigned int i = 0;
869  cl_int clStatus;
870  int status = 0;
871  char *str = NULL;
872  FILE *fd = NULL;
873  char fileName[256] = {0}, cl_name[128] = {0};
874  char deviceName[1024];
875  clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
876  sizeof(deviceName), deviceName, NULL);
877  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
878  str = (char *)strstr(clFileName, (char *)".cl");
879  memcpy(cl_name, clFileName, str - clFileName);
880  cl_name[str - clFileName] = '\0';
881  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
882  legalizeFileName(fileName);
883  fd = fopen(fileName, "rb");
884  status = (fd != NULL) ? 1 : 0;
885  if (fd != NULL) {
886  *fhandle = fd;
887  }
888  return status;
889 
890 }
891 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
892 {
893  int i;
894  for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
895  {
896  if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
897  {
898  if (gpuEnvCached->mpArryPrograms[i] != NULL) {
899  return 1;
900  }
901  }
902  }
903 
904  return 0;
905 }
906 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
907 {
908  FILE *output = NULL;
909  output = fopen(fileName, "wb");
910  if (output == NULL) {
911  return 0;
912  }
913 
914  fwrite( birary, sizeof(char), numBytes, output );
915  fclose( output );
916 
917  return 1;
918 
919 }
920 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
921 {
922  unsigned int i = 0;
923  cl_int clStatus;
924  size_t *binarySizes;
925  cl_uint numDevices;
926  cl_device_id *mpArryDevsID;
927  char **binaries, *str = NULL;
928 
929  clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
930  sizeof(numDevices), &numDevices, NULL);
931  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
932 
933  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
934  if (mpArryDevsID == NULL) {
935  return 0;
936  }
937  /* grab the handles to all of the devices in the program. */
938  clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
939  sizeof(cl_device_id) * numDevices, mpArryDevsID,
940  NULL);
941  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
942 
943  /* figure out the sizes of each of the binaries. */
944  binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
945 
946  clStatus =
947  clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
948  sizeof(size_t) * numDevices, binarySizes, NULL);
949  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
950 
951  /* copy over all of the generated binaries. */
952  binaries = (char**) malloc( sizeof(char *) * numDevices );
953  if (binaries == NULL) {
954  return 0;
955  }
956 
957  for ( i = 0; i < numDevices; i++ )
958  {
959  if ( binarySizes[i] != 0 )
960  {
961  binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
962  if (binaries[i] == NULL) {
963  return 0;
964  }
965  }
966  else
967  {
968  binaries[i] = NULL;
969  }
970  }
971 
972  clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
973  sizeof(char *) * numDevices, binaries, NULL);
974  CHECK_OPENCL(clStatus,"clGetProgramInfo");
975 
976  /* dump out each binary into its own separate file. */
977  for ( i = 0; i < numDevices; i++ )
978  {
979  char fileName[256] = { 0 }, cl_name[128] = { 0 };
980 
981  if ( binarySizes[i] != 0 )
982  {
983  char deviceName[1024];
984  clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
985  sizeof(deviceName), deviceName, NULL);
986  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
987 
988  str = (char*) strstr( clFileName, (char*) ".cl" );
989  memcpy( cl_name, clFileName, str - clFileName );
990  cl_name[str - clFileName] = '\0';
991  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
992  legalizeFileName(fileName);
993  if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
994  {
995  printf("[OD] write binary[%s] failed\n", fileName);
996  return 0;
997  } //else
998  printf("[OD] write binary[%s] successfully\n", fileName);
999  }
1000  }
1001 
1002  // Release all resouces and memory
1003  for ( i = 0; i < numDevices; i++ )
1004  {
1005  free(binaries[i]);
1006  binaries[i] = NULL;
1007  }
1008 
1009  free(binaries);
1010  binaries = NULL;
1011 
1012  free(binarySizes);
1013  binarySizes = NULL;
1014 
1015  free(mpArryDevsID);
1016  mpArryDevsID = NULL;
1017 
1018  return 1;
1019 }
1020 
1021 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
1022 {
1023 //PERF_COUNT_START("CompileKernelFile")
1024  cl_int clStatus = 0;
1025  size_t length;
1026  char *buildLog = NULL, *binary;
1027  const char *source;
1028  size_t source_size[1];
1029  int b_error, binary_status, binaryExisted, idx;
1030  cl_uint numDevices;
1031  cl_device_id *mpArryDevsID;
1032  FILE *fd, *fd1;
1033  const char* filename = "kernel.cl";
1034  //fprintf(stderr, "[OD] CompileKernelFile ... \n");
1035  if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
1036  {
1037  return 1;
1038  }
1039 
1040  idx = gpuInfo->mnFileCount;
1041 
1042  source = kernel_src;
1043 
1044  source_size[0] = strlen( source );
1045  binaryExisted = 0;
1046  binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
1047 //PERF_COUNT_SUB("BinaryGenerated")
1048  if ( binaryExisted == 1 )
1049  {
1050  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
1051  sizeof(numDevices), &numDevices, NULL);
1052  CHECK_OPENCL(clStatus, "clGetContextInfo");
1053 
1054  mpArryDevsID = (cl_device_id *)malloc(sizeof(cl_device_id) * numDevices);
1055  if (mpArryDevsID == NULL) {
1056  return 0;
1057  }
1058 //PERF_COUNT_SUB("get numDevices")
1059  b_error = 0;
1060  length = 0;
1061  b_error |= fseek( fd, 0, SEEK_END ) < 0;
1062  b_error |= ( length = ftell(fd) ) <= 0;
1063  b_error |= fseek( fd, 0, SEEK_SET ) < 0;
1064  if ( b_error )
1065  {
1066  return 0;
1067  }
1068 
1069  binary = (char*) malloc( length + 2 );
1070  if ( !binary )
1071  {
1072  return 0;
1073  }
1074 
1075  memset( binary, 0, length + 2 );
1076  b_error |= fread( binary, 1, length, fd ) != length;
1077 
1078 
1079  fclose( fd );
1080 //PERF_COUNT_SUB("read file")
1081  fd = NULL;
1082  // grab the handles to all of the devices in the context.
1083  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
1084  sizeof(cl_device_id) * numDevices,
1085  mpArryDevsID, NULL);
1086  CHECK_OPENCL( clStatus, "clGetContextInfo" );
1087 //PERF_COUNT_SUB("get devices")
1088  //fprintf(stderr, "[OD] Create kernel from binary\n");
1089  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
1090  mpArryDevsID, &length, (const unsigned char**) &binary,
1091  &binary_status, &clStatus );
1092  CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
1093 //PERF_COUNT_SUB("clCreateProgramWithBinary")
1094  free( binary );
1095  free( mpArryDevsID );
1096  mpArryDevsID = NULL;
1097  // PERF_COUNT_SUB("binaryExisted")
1098  }
1099  else
1100  {
1101  // create a CL program using the kernel source
1102  //fprintf(stderr, "[OD] Create kernel from source\n");
1103  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
1104  source_size, &clStatus);
1105  CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
1106 //PERF_COUNT_SUB("!binaryExisted")
1107  }
1108 
1109  if (gpuInfo->mpArryPrograms[idx] == (cl_program) NULL) {
1110  return 0;
1111  }
1112 
1113  //char options[512];
1114  // create a cl program executable for all the devices specified
1115  //printf("[OD] BuildProgram.\n");
1116 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
1117  if (!gpuInfo->mnIsUserCreated)
1118  {
1119  clStatus =
1120  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1121  buildOption, NULL, NULL);
1122  // PERF_COUNT_SUB("clBuildProgram notUserCreated")
1123  }
1124  else
1125  {
1126  clStatus =
1127  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1128  buildOption, NULL, NULL);
1129  // PERF_COUNT_SUB("clBuildProgram isUserCreated")
1130  }
1132  if ( clStatus != CL_SUCCESS )
1133  {
1134  printf ("BuildProgram error!\n");
1135  if ( !gpuInfo->mnIsUserCreated )
1136  {
1137  clStatus = clGetProgramBuildInfo(
1138  gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1139  CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
1140  }
1141  else
1142  {
1143  clStatus = clGetProgramBuildInfo(
1144  gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1145  CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
1146  }
1147  if ( clStatus != CL_SUCCESS )
1148  {
1149  printf("opencl create build log fail\n");
1150  return 0;
1151  }
1152  buildLog = (char*) malloc( length );
1153  if (buildLog == (char *)NULL) {
1154  return 0;
1155  }
1156  if ( !gpuInfo->mnIsUserCreated )
1157  {
1158  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1159  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1160  }
1161  else
1162  {
1163  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1164  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1165  }
1166  if ( clStatus != CL_SUCCESS )
1167  {
1168  printf("opencl program build info fail\n");
1169  return 0;
1170  }
1171 
1172  fd1 = fopen( "kernel-build.log", "w+" );
1173  if (fd1 != NULL) {
1174  fwrite(buildLog, sizeof(char), length, fd1);
1175  fclose(fd1);
1176  }
1177 
1178  free( buildLog );
1179 //PERF_COUNT_SUB("build error log")
1180  return 0;
1181  }
1182 
1183  strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
1184 //PERF_COUNT_SUB("strcpy")
1185  if ( binaryExisted == 0 ) {
1186  GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
1187  PERF_COUNT_SUB("GenerateBinFromKernelSource")
1188  }
1189 
1190  gpuInfo->mnFileCount += 1;
1191 //PERF_COUNT_END
1192  return 1;
1193 }
1194 
1195 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
1196 {
1197 PERF_COUNT_START("pixReadFromTiffKernel")
1198  cl_int clStatus;
1199  KernelEnv rEnv;
1200  size_t globalThreads[2];
1201  size_t localThreads[2];
1202  int gsize;
1203  cl_mem valuesCl;
1204  cl_mem outputCl;
1205 
1206  //global and local work dimensions for Horizontal pass
1207  gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1208  globalThreads[0] = gsize;
1209  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1210  globalThreads[1] = gsize;
1211  localThreads[0] = GROUPSIZE_X;
1212  localThreads[1] = GROUPSIZE_Y;
1213 
1214  SetKernelEnv( &rEnv );
1215 
1216  l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
1217  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
1218  CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1219 
1220  //Allocate input and output OCL buffers
1221  valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1222  outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1223 
1224  //Kernel arguments
1225  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1226  CHECK_OPENCL( clStatus, "clSetKernelArg");
1227  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1228  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1229  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1230  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1231  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1232  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1233  clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1234  CHECK_OPENCL( clStatus, "clSetKernelArg");
1235 
1236  //Kernel enqueue
1237 PERF_COUNT_SUB("before")
1238 clStatus =
1239  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL,
1240  globalThreads, localThreads, 0, NULL, NULL);
1241 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1242 
1243 /* map results back from gpu */
1244 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
1245  0, w * h * sizeof(l_uint32), 0, NULL, NULL,
1246  &clStatus);
1247 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1248 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
1249 
1250 // Sync
1251 clFinish(rEnv.mpkCmdQueue);
1252 PERF_COUNT_SUB("kernel & map")
1254  return pResult;
1255 }
1256 
1257 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1258 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h)
1259 {
1260  size_t globalThreads[2];
1261  cl_mem pixtemp;
1262  cl_int status;
1263  int gsize;
1264  size_t localThreads[2];
1265 
1266  //Horizontal pass
1267  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1268  globalThreads[0] = gsize;
1269  globalThreads[1] = GROUPSIZE_HMORY;
1270  localThreads[0] = GROUPSIZE_HMORX;
1271  localThreads[1] = GROUPSIZE_HMORY;
1272 
1273  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
1274  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1275 
1276  status = clSetKernelArg(rEnv.mpkKernel,
1277  0,
1278  sizeof(cl_mem),
1279  &pixsCLBuffer);
1280  status = clSetKernelArg(rEnv.mpkKernel,
1281  1,
1282  sizeof(cl_mem),
1283  &pixdCLBuffer);
1284  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1285  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1286 
1287  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1288  NULL, globalThreads, localThreads, 0,
1289  NULL, NULL);
1290 
1291  //Swap source and dest buffers
1292  pixtemp = pixsCLBuffer;
1293  pixsCLBuffer = pixdCLBuffer;
1294  pixdCLBuffer = pixtemp;
1295 
1296  //Vertical
1297  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1298  globalThreads[0] = gsize;
1299  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1300  globalThreads[1] = gsize;
1301  localThreads[0] = GROUPSIZE_X;
1302  localThreads[1] = GROUPSIZE_Y;
1303 
1304  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
1305  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1306 
1307  status = clSetKernelArg(rEnv.mpkKernel,
1308  0,
1309  sizeof(cl_mem),
1310  &pixsCLBuffer);
1311  status = clSetKernelArg(rEnv.mpkKernel,
1312  1,
1313  sizeof(cl_mem),
1314  &pixdCLBuffer);
1315  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1316  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1317  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1318  NULL, globalThreads, localThreads, 0,
1319  NULL, NULL);
1320 
1321  return status;
1322 }
1323 
1324 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1325 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h)
1326 {
1327  size_t globalThreads[2];
1328  cl_mem pixtemp;
1329  cl_int status;
1330  int gsize;
1331  l_uint32 fwmask, lwmask;
1332  size_t localThreads[2];
1333 
1334  lwmask = lmask32[31 - 2];
1335  fwmask = rmask32[31 - 2];
1336 
1337  //Horizontal pass
1338  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1339  globalThreads[0] = gsize;
1340  globalThreads[1] = GROUPSIZE_HMORY;
1341  localThreads[0] = GROUPSIZE_HMORX;
1342  localThreads[1] = GROUPSIZE_HMORY;
1343 
1344  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
1345  CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1346 
1347  status = clSetKernelArg(rEnv.mpkKernel,
1348  0,
1349  sizeof(cl_mem),
1350  &pixsCLBuffer);
1351  status = clSetKernelArg(rEnv.mpkKernel,
1352  1,
1353  sizeof(cl_mem),
1354  &pixdCLBuffer);
1355  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1356  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1357 
1358  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1359  NULL, globalThreads, localThreads, 0,
1360  NULL, NULL);
1361 
1362  //Swap source and dest buffers
1363  pixtemp = pixsCLBuffer;
1364  pixsCLBuffer = pixdCLBuffer;
1365  pixdCLBuffer = pixtemp;
1366 
1367  //Vertical
1368  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1369  globalThreads[0] = gsize;
1370  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1371  globalThreads[1] = gsize;
1372  localThreads[0] = GROUPSIZE_X;
1373  localThreads[1] = GROUPSIZE_Y;
1374 
1375  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
1376  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1377 
1378  status = clSetKernelArg(rEnv.mpkKernel,
1379  0,
1380  sizeof(cl_mem),
1381  &pixsCLBuffer);
1382  status = clSetKernelArg(rEnv.mpkKernel,
1383  1,
1384  sizeof(cl_mem),
1385  &pixdCLBuffer);
1386  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1387  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1388  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1389  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1390  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1391  NULL, globalThreads, localThreads, 0,
1392  NULL, NULL);
1393 
1394  return status;
1395 }
1396 
1397 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
1398 static cl_int
1399 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1400 {
1401  l_int32 xp, yp, xn, yn;
1402  SEL* sel;
1403  size_t globalThreads[2];
1404  cl_mem pixtemp;
1405  cl_int status;
1406  int gsize;
1407  size_t localThreads[2];
1408  char isEven;
1409 
1410  OpenclDevice::SetKernelEnv( &rEnv );
1411 
1412  if (hsize == 5 && vsize == 5)
1413  {
1414  //Specific case for 5x5
1415  status = pixDilateCL_55(wpl, h);
1416  return status;
1417  }
1418 
1419  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1420 
1421  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1422  selDestroy(&sel);
1423  //global and local work dimensions for Horizontal pass
1424  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1425  globalThreads[0] = gsize;
1426  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1427  globalThreads[1] = gsize;
1428  localThreads[0] = GROUPSIZE_X;
1429  localThreads[1] = GROUPSIZE_Y;
1430 
1431  if (xp > 31 || xn > 31)
1432  {
1433  // Generic case.
1434  rEnv.mpkKernel =
1435  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1436  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1437 
1438  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1439  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1440  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1441  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1442  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1443  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1444  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1445  NULL, globalThreads, localThreads, 0,
1446  NULL, NULL);
1447 
1448  if (yp > 0 || yn > 0) {
1449  pixtemp = pixsCLBuffer;
1450  pixsCLBuffer = pixdCLBuffer;
1451  pixdCLBuffer = pixtemp;
1452  }
1453  }
1454  else if (xp > 0 || xn > 0 )
1455  {
1456  // Specific Horizontal pass kernel for half width < 32
1457  rEnv.mpkKernel =
1458  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1459  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1460  isEven = (xp != xn);
1461 
1462  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1463  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1464  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1465  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1466  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1467  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1468  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1469  NULL, globalThreads, localThreads, 0,
1470  NULL, NULL);
1471 
1472  if (yp > 0 || yn > 0) {
1473  pixtemp = pixsCLBuffer;
1474  pixsCLBuffer = pixdCLBuffer;
1475  pixdCLBuffer = pixtemp;
1476  }
1477  }
1478 
1479  if (yp > 0 || yn > 0)
1480  {
1481  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
1482  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1483 
1484  status = clSetKernelArg(rEnv.mpkKernel,
1485  0,
1486  sizeof(cl_mem),
1487  &pixsCLBuffer);
1488  status = clSetKernelArg(rEnv.mpkKernel,
1489  1,
1490  sizeof(cl_mem),
1491  &pixdCLBuffer);
1492  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1493  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1494  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1495  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1496  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1497  NULL, globalThreads, localThreads, 0,
1498  NULL, NULL);
1499  }
1500 
1501  return status;
1502 }
1503 
1504 //Morphology Erode operation. Invokes the relevant OpenCL kernels
1505 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1506  l_int32 xp, yp, xn, yn;
1507  SEL *sel;
1508  size_t globalThreads[2];
1509  size_t localThreads[2];
1510  cl_mem pixtemp;
1511  cl_int status;
1512  int gsize;
1513  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1514  l_uint32 rwmask, lwmask;
1515  char isEven;
1516 
1517  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1518 
1519  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1520  selDestroy(&sel);
1521  OpenclDevice::SetKernelEnv(&rEnv);
1522 
1523  if (hsize == 5 && vsize == 5 && isAsymmetric) {
1524  // Specific kernel for 5x5
1525  status = pixErodeCL_55(wpl, h);
1526  return status;
1527  }
1528 
1529  lwmask = lmask32[31 - (xn & 31)];
1530  rwmask = rmask32[31 - (xp & 31)];
1531 
1532  // global and local work dimensions for Horizontal pass
1533  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1534  globalThreads[0] = gsize;
1535  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1536  globalThreads[1] = gsize;
1537  localThreads[0] = GROUPSIZE_X;
1538  localThreads[1] = GROUPSIZE_Y;
1539 
1540  // Horizontal Pass
1541  if (xp > 31 || xn > 31) {
1542  // Generic case.
1543  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1544 
1545  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1546  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1547  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1548  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1549  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1550  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1551  status =
1552  clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1553  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1554  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1555  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1556  NULL, globalThreads, localThreads, 0,
1557  NULL, NULL);
1558 
1559  if (yp > 0 || yn > 0) {
1560  pixtemp = pixsCLBuffer;
1561  pixsCLBuffer = pixdCLBuffer;
1562  pixdCLBuffer = pixtemp;
1563  }
1564  } else if (xp > 0 || xn > 0) {
1565  rEnv.mpkKernel =
1566  clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1567  isEven = (xp != xn);
1568 
1569  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1570  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1571  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1572  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1573  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1574  status =
1575  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1576  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1577  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1578  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1579  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1580  NULL, globalThreads, localThreads, 0,
1581  NULL, NULL);
1582 
1583  if (yp > 0 || yn > 0) {
1584  pixtemp = pixsCLBuffer;
1585  pixsCLBuffer = pixdCLBuffer;
1586  pixdCLBuffer = pixtemp;
1587  }
1588  }
1589 
1590  // Vertical Pass
1591  if (yp > 0 || yn > 0) {
1592  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1593  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1594 
1595  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1596  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1597  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1598  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1599  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1600  status =
1601  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1602  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1603  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1604  NULL, globalThreads, localThreads, 0,
1605  NULL, NULL);
1606  }
1607 
1608  return status;
1609 }
1610 
1611 //Morphology Open operation. Invokes the relevant OpenCL kernels
1612 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1613 {
1614  cl_int status;
1615  cl_mem pixtemp;
1616 
1617  //Erode followed by Dilate
1618  status = pixErodeCL(hsize, vsize, wpl, h);
1619 
1620  pixtemp = pixsCLBuffer;
1621  pixsCLBuffer = pixdCLBuffer;
1622  pixdCLBuffer = pixtemp;
1623 
1624  status = pixDilateCL(hsize, vsize, wpl, h);
1625 
1626  return status;
1627 }
1628 
1629 //Morphology Close operation. Invokes the relevant OpenCL kernels
1630 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1631 {
1632  cl_int status;
1633  cl_mem pixtemp;
1634 
1635  //Dilate followed by Erode
1636  status = pixDilateCL(hsize, vsize, wpl, h);
1637 
1638  pixtemp = pixsCLBuffer;
1639  pixsCLBuffer = pixdCLBuffer;
1640  pixdCLBuffer = pixtemp;
1641 
1642  status = pixErodeCL(hsize, vsize, wpl, h);
1643 
1644  return status;
1645 }
1646 
1647 //output = buffer1 & ~(buffer2)
1648 static
1649 cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1650  cl_mem buffer2, cl_mem outBuffer = NULL) {
1651  cl_int status;
1652  size_t globalThreads[2];
1653  int gsize;
1654  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1655 
1656  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1657  globalThreads[0] = gsize;
1658  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1659  globalThreads[1] = gsize;
1660 
1661  if (outBuffer != NULL) {
1662  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract", &status);
1663  CHECK_OPENCL(status, "clCreateKernel pixSubtract");
1664  } else {
1665  rEnv.mpkKernel =
1666  clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1667  CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1668  }
1669 
1670  // Enqueue a kernel run call.
1671  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1672  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1673  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1674  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1675  if (outBuffer != NULL) {
1676  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outBuffer);
1677  }
1678  status =
1679  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL,
1680  globalThreads, localThreads, 0, NULL, NULL);
1681 
1682  return status;
1683 }
1684 
1685 // OpenCL implementation of Get Lines from pix function
1686 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1687 void OpenclDevice::pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline,
1688  Pix **pix_hline, Pix **pixClosed,
1689  bool getpixClosed, l_int32 close_hsize,
1690  l_int32 close_vsize, l_int32 open_hsize,
1691  l_int32 open_vsize, l_int32 line_hsize,
1692  l_int32 line_vsize) {
1693  l_uint32 wpl, h;
1694  cl_mem pixtemp;
1695 
1696  wpl = pixGetWpl(pixs);
1697  h = pixGetHeight(pixs);
1698 
1699  // First step : Close Morph operation: Dilate followed by Erode
1700  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1701 
1702  // Copy the Close output to CPU buffer
1703  if (getpixClosed) {
1704  *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1705  wpl * h, CL_MAP_READ, true, false);
1706  }
1707 
1708  // Store the output of close operation in an intermediate buffer
1709  // this will be later used for pixsubtract
1710  clStatus =
1711  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1712  0, sizeof(int) * wpl * h, 0, NULL, NULL);
1713 
1714  // Second step: Open Operation - Erode followed by Dilate
1715  pixtemp = pixsCLBuffer;
1716  pixsCLBuffer = pixdCLBuffer;
1717  pixdCLBuffer = pixtemp;
1718 
1719  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1720 
1721  // Third step: Subtract : (Close - Open)
1722  pixtemp = pixsCLBuffer;
1723  pixsCLBuffer = pixdCLBuffer;
1724  pixdCLBuffer = pixdCLIntermediate;
1725  pixdCLIntermediate = pixtemp;
1726 
1727  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1728 
1729  // Store the output of Hollow operation in an intermediate buffer
1730  // this will be later used
1731  clStatus =
1732  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1733  0, sizeof(int) * wpl * h, 0, NULL, NULL);
1734 
1735  pixtemp = pixsCLBuffer;
1736  pixsCLBuffer = pixdCLBuffer;
1737  pixdCLBuffer = pixtemp;
1738 
1739  // Fourth step: Get vertical line
1740  // pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
1741  clStatus = pixOpenCL(1, line_vsize, wpl, h);
1742 
1743  // Copy the vertical line output to CPU buffer
1744  *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1745  CL_MAP_READ, true, false);
1746 
1747  pixtemp = pixsCLBuffer;
1748  pixsCLBuffer = pixdCLIntermediate;
1749  pixdCLIntermediate = pixtemp;
1750 
1751  // Fifth step: Get horizontal line
1752  // pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
1753  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1754 
1755  // Copy the horizontal line output to CPU buffer
1756  *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1757  CL_MAP_READ, true, true);
1758 
1759  return;
1760 }
1761 
1762 /*************************************************************************
1763  * HistogramRect
1764  * Otsu Thresholding Operations
1765  * histogramAllChannels is laid out as all channel 0, then all channel 1...
1766  * only supports 1 or 4 channels (bytes_per_pixel)
1767  ************************************************************************/
1768 int OpenclDevice::HistogramRectOCL(unsigned char *imageData,
1769  int bytes_per_pixel, int bytes_per_line,
1770  int left, // always 0
1771  int top, // always 0
1772  int width, int height, int kHistogramSize,
1773  int *histogramAllChannels) {
1774  PERF_COUNT_START("HistogramRectOCL")
1775  cl_int clStatus;
1776  int retVal = 0;
1777  KernelEnv histKern;
1778  SetKernelEnv(&histKern);
1779  KernelEnv histRedKern;
1780  SetKernelEnv(&histRedKern);
1781  /* map imagedata to device as read only */
1782  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1783  // coherent which we don't need.
1784  // faster option would be to allocate initial image buffer
1785  // using a garlic bus memory type
1786  cl_mem imageBuffer = clCreateBuffer(
1787  histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1788  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1789  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1790 
1791  /* setup work group size parameters */
1792  int block_size = 256;
1793  cl_uint numCUs;
1794  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1795  sizeof(numCUs), &numCUs, NULL);
1796  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1797 
1798  int requestedOccupancy = 10;
1799  int numWorkGroups = numCUs * requestedOccupancy;
1800  int numThreads = block_size * numWorkGroups;
1801  size_t local_work_size[] = {static_cast<size_t>(block_size)};
1802  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1803  size_t red_global_work_size[] = {
1804  static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1805 
1806  /* map histogramAllChannels as write only */
1807 
1808  cl_mem histogramBuffer = clCreateBuffer(
1809  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1810  kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels,
1811  &clStatus);
1812  CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1813 
1814  /* intermediate histogram buffer */
1815  int histRed = 256;
1816  int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1817 
1818  cl_mem tmpHistogramBuffer =
1819  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1820  tmpHistogramBins * sizeof(cl_uint), NULL, &clStatus);
1821  CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1822 
1823  /* atomic sync buffer */
1824  int *zeroBuffer = new int[1];
1825  zeroBuffer[0] = 0;
1826  cl_mem atomicSyncBuffer = clCreateBuffer(
1827  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1828  sizeof(cl_int), zeroBuffer, &clStatus);
1829  CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1830  delete[] zeroBuffer;
1831  // Create kernel objects based on bytes_per_pixel
1832  if (bytes_per_pixel == 1) {
1833  histKern.mpkKernel = clCreateKernel(
1834  histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1835  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1836 
1837  histRedKern.mpkKernel =
1838  clCreateKernel(histRedKern.mpkProgram,
1839  "kernel_HistogramRectOneChannelReduction", &clStatus);
1840  CHECK_OPENCL(clStatus,
1841  "clCreateKernel kernel_HistogramRectOneChannelReduction");
1842  } else {
1843  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
1844  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1845 
1846  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
1847  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1848  }
1849 
1850  void *ptr;
1851 
1852  //Initialize tmpHistogramBuffer buffer
1853  ptr = clEnqueueMapBuffer(
1854  histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1855  tmpHistogramBins * sizeof(cl_uint), 0, NULL, NULL, &clStatus);
1856  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1857 
1858  memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
1859  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1860  NULL, NULL);
1861 
1862  /* set kernel 1 arguments */
1863  clStatus =
1864  clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1865  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
1866  cl_uint numPixels = width*height;
1867  clStatus =
1868  clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1869  CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
1870  clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem),
1871  &tmpHistogramBuffer);
1872  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
1873 
1874  /* set kernel 2 arguments */
1875  int n = numThreads/bytes_per_pixel;
1876  clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1877  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
1878  clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem),
1879  &tmpHistogramBuffer);
1880  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
1881  clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem),
1882  &histogramBuffer);
1883  CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
1884 
1885  /* launch histogram */
1886 PERF_COUNT_SUB("before")
1887 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1888  NULL, global_work_size, local_work_size, 0,
1889  NULL, NULL);
1890 CHECK_OPENCL(clStatus,
1891  "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1892 clFinish(histKern.mpkCmdQueue);
1893 if (clStatus != 0) {
1894  retVal = -1;
1895  }
1896  /* launch histogram */
1897  clStatus = clEnqueueNDRangeKernel(
1898  histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, NULL,
1899  red_global_work_size, local_work_size, 0, NULL, NULL);
1900  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
1901  clFinish( histRedKern.mpkCmdQueue );
1902  if (clStatus != 0) {
1903  retVal = -1;
1904  }
1905  PERF_COUNT_SUB("redKernel")
1906 
1907  /* map results back from gpu */
1908  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1909  CL_MAP_READ, 0,
1910  kHistogramSize * bytes_per_pixel * sizeof(int), 0,
1911  NULL, NULL, &clStatus);
1912  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
1913  if (clStatus != 0) {
1914  retVal = -1;
1915  }
1916  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1917  NULL, NULL);
1918 
1919  clReleaseMemObject(histogramBuffer);
1920  clReleaseMemObject(imageBuffer);
1921 PERF_COUNT_SUB("after")
1923 return retVal;
1924 }
1925 
1926 /*************************************************************************
1927  * Threshold the rectangle, taking everything except the image buffer pointer
1928  * from the class, using thresholds/hi_values to the output IMAGE.
1929  * only supports 1 or 4 channels
1930  ************************************************************************/
1931 int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData,
1932  int bytes_per_pixel, int bytes_per_line,
1933  int *thresholds, int *hi_values,
1934  Pix **pix, int height, int width,
1935  int top, int left) {
1936  PERF_COUNT_START("ThresholdRectToPixOCL")
1937  int retVal = 0;
1938  /* create pix result buffer */
1939  *pix = pixCreate(width, height, 1);
1940  uint32_t *pixData = pixGetData(*pix);
1941  int wpl = pixGetWpl(*pix);
1942  int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1943 
1944  cl_int clStatus;
1945  KernelEnv rEnv;
1946  SetKernelEnv(&rEnv);
1947 
1948  /* setup work group size parameters */
1949  int block_size = 256;
1950  cl_uint numCUs = 6;
1951  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1952  sizeof(numCUs), &numCUs, NULL);
1953  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1954 
1955  int requestedOccupancy = 10;
1956  int numWorkGroups = numCUs * requestedOccupancy;
1957  int numThreads = block_size * numWorkGroups;
1958  size_t local_work_size[] = {(size_t)block_size};
1959  size_t global_work_size[] = {(size_t)numThreads};
1960 
1961  /* map imagedata to device as read only */
1962  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1963  // coherent which we don't need.
1964  // faster option would be to allocate initial image buffer
1965  // using a garlic bus memory type
1966  cl_mem imageBuffer = clCreateBuffer(
1967  rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1968  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1969  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1970 
1971  /* map pix as write only */
1972  pixThBuffer =
1973  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1974  pixSize, pixData, &clStatus);
1975  CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1976 
1977  /* map thresholds and hi_values */
1978  cl_mem thresholdsBuffer =
1979  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1980  bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1981  CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1982  cl_mem hiValuesBuffer =
1983  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1984  bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1985  CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1986 
1987  /* compile kernel */
1988  if (bytes_per_pixel == 4) {
1989  rEnv.mpkKernel =
1990  clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1991  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1992  } else {
1993  rEnv.mpkKernel = clCreateKernel(
1994  rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1995  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1996  }
1997 
1998  /* set kernel arguments */
1999  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
2000  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
2001  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
2002  CHECK_OPENCL(clStatus, "clSetKernelArg height");
2003  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
2004  CHECK_OPENCL(clStatus, "clSetKernelArg width");
2005  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
2006  CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
2007  clStatus =
2008  clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
2009  CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
2010  clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
2011  CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
2012  clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
2013  CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
2014 
2015  /* launch kernel & wait */
2016  PERF_COUNT_SUB("before")
2017  clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
2018  NULL, global_work_size, local_work_size,
2019  0, NULL, NULL);
2020  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
2021  clFinish(rEnv.mpkCmdQueue);
2022  PERF_COUNT_SUB("kernel")
2023  if (clStatus != 0) {
2024  printf("Setting return value to -1\n");
2025  retVal = -1;
2026  }
2027  /* map results back from gpu */
2028  void *ptr =
2029  clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
2030  pixSize, 0, NULL, NULL, &clStatus);
2031  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
2032  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL,
2033  NULL);
2034 
2035  clReleaseMemObject(imageBuffer);
2036  clReleaseMemObject(thresholdsBuffer);
2037  clReleaseMemObject(hiValuesBuffer);
2038 
2039  PERF_COUNT_SUB("after")
2041  return retVal;
2042 }
2043 
2044 
2045 
2046 /******************************************************************************
2047  * Data Types for Device Selection
2048  *****************************************************************************/
2049 
2050 typedef struct _TessScoreEvaluationInputData {
2051  int height;
2052  int width;
2053  int numChannels;
2054  unsigned char *imageData;
2055  Pix *pix;
2056 } TessScoreEvaluationInputData;
2057 
2058 static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
2059  srand(1);
2060  // 8.5x11 inches @ 300dpi rounded to clean multiples
2061  int height = 3328; // %256
2062  int width = 2560; // %512
2063  int numChannels = 4;
2064  input->height = height;
2065  input->width = width;
2066  input->numChannels = numChannels;
2067  unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
2068  input->imageData = (unsigned char *) &imageData4[0];
2069 
2070  // zero out image
2071  unsigned char pixelWhite[4] = { 0, 0, 0, 255};
2072  unsigned char pixelBlack[4] = {255, 255, 255, 255};
2073  for (int p = 0; p < height*width; p++) {
2074  //unsigned char tmp[4] = imageData4[0];
2075  imageData4[p][0] = pixelWhite[0];
2076  imageData4[p][1] = pixelWhite[1];
2077  imageData4[p][2] = pixelWhite[2];
2078  imageData4[p][3] = pixelWhite[3];
2079  }
2080  // random lines to be eliminated
2081  int maxLineWidth = 64; // pixels wide
2082  int numLines = 10;
2083  // vertical lines
2084  for (int i = 0; i < numLines; i++) {
2085  int lineWidth = rand()%maxLineWidth;
2086  int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
2087  //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
2088  for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2089  for (int col = 0; col < height; col++) {
2090  //imageData4[row*width+col] = pixelBlack;
2091  imageData4[row*width+col][0] = pixelBlack[0];
2092  imageData4[row*width+col][1] = pixelBlack[1];
2093  imageData4[row*width+col][2] = pixelBlack[2];
2094  imageData4[row*width+col][3] = pixelBlack[3];
2095  }
2096  }
2097  }
2098  // horizontal lines
2099  for (int i = 0; i < numLines; i++) {
2100  int lineWidth = rand()%maxLineWidth;
2101  int horLinePos = lineWidth + rand()%(height-2*lineWidth);
2102  //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
2103  for (int row = 0; row < width; row++) {
2104  for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2105  //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
2106  //imageData4[row*width+col] = pixelBlack;
2107  imageData4[row*width+col][0] = pixelBlack[0];
2108  imageData4[row*width+col][1] = pixelBlack[1];
2109  imageData4[row*width+col][2] = pixelBlack[2];
2110  imageData4[row*width+col][3] = pixelBlack[3];
2111  }
2112  }
2113  }
2114  // spots (noise, squares)
2115  float fractionBlack = 0.1; // how much of the image should be blackened
2116  int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
2117  for (int i = 0; i < numSpots; i++) {
2118  int lineWidth = rand()%maxLineWidth;
2119  int col = lineWidth + rand()%(width-2*lineWidth);
2120  int row = lineWidth + rand()%(height-2*lineWidth);
2121  //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
2122  for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
2123  for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
2124  //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
2125  //imageData4[row*width+col] = pixelBlack;
2126  imageData4[r*width+c][0] = pixelBlack[0];
2127  imageData4[r*width+c][1] = pixelBlack[1];
2128  imageData4[r*width+c][2] = pixelBlack[2];
2129  imageData4[r*width+c][3] = pixelBlack[3];
2130  }
2131  }
2132  }
2133 
2134  input->pix = pixCreate(input->width, input->height, 1);
2135 }
2136 
2137 typedef struct _TessDeviceScore {
2138  float time; // small time means faster device
2139  bool clError; // were there any opencl errors
2140  bool valid; // was the correct response generated
2141 } TessDeviceScore;
2142 
2143 /******************************************************************************
2144  * Micro Benchmarks for Device Selection
2145  *****************************************************************************/
2146 
2147 static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2148  double time = 0;
2149 #if ON_WINDOWS
2150  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2151  QueryPerformanceFrequency(&freq);
2152 #elif ON_APPLE
2153  mach_timebase_info_data_t info = {0, 0};
2154  mach_timebase_info(&info);
2155  long long start, stop;
2156 #else
2157  timespec time_funct_start, time_funct_end;
2158 #endif
2159  // input data
2160  l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
2161 
2162  // function call
2163  if (type == DS_DEVICE_OPENCL_DEVICE) {
2164 #if ON_WINDOWS
2165  QueryPerformanceCounter(&time_funct_start);
2166 #elif ON_APPLE
2167  start = mach_absolute_time();
2168 #else
2169  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2170 #endif
2171 
2172  OpenclDevice::gpuEnv = *env;
2173  int wpl = pixGetWpl(input.pix);
2174  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2175  wpl, NULL);
2176 #if ON_WINDOWS
2177  QueryPerformanceCounter(&time_funct_end);
2178  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2179 #elif ON_APPLE
2180  stop = mach_absolute_time();
2181  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2182 #else
2183  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2184  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2185 #endif
2186 
2187  } else {
2188 #if ON_WINDOWS
2189  QueryPerformanceCounter(&time_funct_start);
2190 #elif ON_APPLE
2191  start = mach_absolute_time();
2192 #else
2193  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2194 #endif
2195  Pix *pix = pixCreate(input.width, input.height, 32);
2196  l_uint32 *pixData = pixGetData(pix);
2197  int i, j;
2198  int idx = 0;
2199  for (i = 0; i < input.height ; i++) {
2200  for (j = 0; j < input.width; j++) {
2201  l_uint32 tiffword = tiffdata[i * input.width + j];
2202  l_int32 rval = ((tiffword) & 0xff);
2203  l_int32 gval = (((tiffword) >> 8) & 0xff);
2204  l_int32 bval = (((tiffword) >> 16) & 0xff);
2205  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2206  pixData[idx] = value;
2207  idx++;
2208  }
2209  }
2210 #if ON_WINDOWS
2211  QueryPerformanceCounter(&time_funct_end);
2212  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2213 #elif ON_APPLE
2214  stop = mach_absolute_time();
2215  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2216 #else
2217  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2218  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2219 #endif
2220  pixDestroy(&pix);
2221  }
2222 
2223 
2224  // cleanup
2225 
2226  return time;
2227 }
2228 
2229 static double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2230  double time;
2231 #if ON_WINDOWS
2232  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2233  QueryPerformanceFrequency(&freq);
2234 #elif ON_APPLE
2235  mach_timebase_info_data_t info = {0, 0};
2236  mach_timebase_info(&info);
2237  long long start, stop;
2238 #else
2239  timespec time_funct_start, time_funct_end;
2240 #endif
2241 
2242  int left = 0;
2243  int top = 0;
2244  int kHistogramSize = 256;
2245  int bytes_per_line = input.width*input.numChannels;
2246  int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
2247  // function call
2248  if (type == DS_DEVICE_OPENCL_DEVICE) {
2249 #if ON_WINDOWS
2250  QueryPerformanceCounter(&time_funct_start);
2251 #elif ON_APPLE
2252  start = mach_absolute_time();
2253 #else
2254  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2255 #endif
2256 
2257  OpenclDevice::gpuEnv = *env;
2258  int retVal = OpenclDevice::HistogramRectOCL(
2259  input.imageData, input.numChannels, bytes_per_line, top, left,
2260  input.width, input.height, kHistogramSize, histogramAllChannels);
2261 
2262 #if ON_WINDOWS
2263  QueryPerformanceCounter(&time_funct_end);
2264  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2265 #elif ON_APPLE
2266  stop = mach_absolute_time();
2267  if (retVal == 0) {
2268  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2269  } else {
2270  time = FLT_MAX;
2271  }
2272 #else
2273  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2274  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2275 #endif
2276  } else {
2277  int *histogram = new int[kHistogramSize];
2278 #if ON_WINDOWS
2279  QueryPerformanceCounter(&time_funct_start);
2280 #elif ON_APPLE
2281  start = mach_absolute_time();
2282 #else
2283  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2284 #endif
2285  for (int ch = 0; ch < input.numChannels; ++ch) {
2286  tesseract::HistogramRect(input.pix, input.numChannels, left, top,
2287  input.width, input.height, histogram);
2288  }
2289 #if ON_WINDOWS
2290  QueryPerformanceCounter(&time_funct_end);
2291  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2292 #elif ON_APPLE
2293  stop = mach_absolute_time();
2294  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2295 #else
2296  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2297  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2298 #endif
2299  delete[] histogram;
2300  }
2301 
2302  // cleanup
2303  delete[] histogramAllChannels;
2304  return time;
2305 }
2306 
2307 //Reproducing the ThresholdRectToPix native version
2308 static void ThresholdRectToPix_Native(const unsigned char* imagedata,
2309  int bytes_per_pixel,
2310  int bytes_per_line,
2311  const int* thresholds,
2312  const int* hi_values,
2313  Pix** pix) {
2314  int top = 0;
2315  int left = 0;
2316  int width = pixGetWidth(*pix);
2317  int height = pixGetHeight(*pix);
2318 
2319  *pix = pixCreate(width, height, 1);
2320  uint32_t *pixdata = pixGetData(*pix);
2321  int wpl = pixGetWpl(*pix);
2322  const unsigned char* srcdata = imagedata + top * bytes_per_line +
2323  left * bytes_per_pixel;
2324  for (int y = 0; y < height; ++y) {
2325  const uint8_t *linedata = srcdata;
2326  uint32_t *pixline = pixdata + y * wpl;
2327  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2328  bool white_result = true;
2329  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2330  if (hi_values[ch] >= 0 &&
2331  (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2332  white_result = false;
2333  break;
2334  }
2335  }
2336  if (white_result)
2337  CLEAR_DATA_BIT(pixline, x);
2338  else
2339  SET_DATA_BIT(pixline, x);
2340  }
2341  srcdata += bytes_per_line;
2342  }
2343 }
2344 
2345 static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2346  double time;
2347 #if ON_WINDOWS
2348  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2349  QueryPerformanceFrequency(&freq);
2350 #elif ON_APPLE
2351  mach_timebase_info_data_t info = {0, 0};
2352  mach_timebase_info(&info);
2353  long long start, stop;
2354 #else
2355  timespec time_funct_start, time_funct_end;
2356 #endif
2357 
2358  // input data
2359  unsigned char pixelHi = (unsigned char)255;
2360  int* thresholds = new int[4];
2361  thresholds[0] = pixelHi/2;
2362  thresholds[1] = pixelHi/2;
2363  thresholds[2] = pixelHi/2;
2364  thresholds[3] = pixelHi/2;
2365  int *hi_values = new int[4];
2366  thresholds[0] = pixelHi;
2367  thresholds[1] = pixelHi;
2368  thresholds[2] = pixelHi;
2369  thresholds[3] = pixelHi;
2370  //Pix* pix = pixCreate(width, height, 1);
2371  int top = 0;
2372  int left = 0;
2373  int bytes_per_line = input.width*input.numChannels;
2374 
2375  // function call
2376  if (type == DS_DEVICE_OPENCL_DEVICE) {
2377 #if ON_WINDOWS
2378  QueryPerformanceCounter(&time_funct_start);
2379 #elif ON_APPLE
2380  start = mach_absolute_time();
2381 #else
2382  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2383 #endif
2384 
2385  OpenclDevice::gpuEnv = *env;
2386  int retVal = OpenclDevice::ThresholdRectToPixOCL(
2387  input.imageData, input.numChannels, bytes_per_line, thresholds,
2388  hi_values, &input.pix, input.height, input.width, top, left);
2389 
2390 #if ON_WINDOWS
2391  QueryPerformanceCounter(&time_funct_end);
2392  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2393 #elif ON_APPLE
2394  stop = mach_absolute_time();
2395  if (retVal == 0) {
2396  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2397  ;
2398  } else {
2399  time = FLT_MAX;
2400  }
2401 
2402 #else
2403  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2404  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2405 #endif
2406  } else {
2407 
2408 
2409  tesseract::ImageThresholder thresholder;
2410  thresholder.SetImage( input.pix );
2411 #if ON_WINDOWS
2412  QueryPerformanceCounter(&time_funct_start);
2413 #elif ON_APPLE
2414  start = mach_absolute_time();
2415 #else
2416  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2417 #endif
2418  ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
2419  thresholds, hi_values, &input.pix );
2420 
2421 #if ON_WINDOWS
2422  QueryPerformanceCounter(&time_funct_end);
2423  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2424 #elif ON_APPLE
2425  stop = mach_absolute_time();
2426  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2427 #else
2428  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2429  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2430 #endif
2431  }
2432 
2433  // cleanup
2434  delete[] thresholds;
2435  delete[] hi_values;
2436  return time;
2437 }
2438 
2439 static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2440 
2441  double time = 0;
2442 #if ON_WINDOWS
2443  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2444  QueryPerformanceFrequency(&freq);
2445 #elif ON_APPLE
2446  mach_timebase_info_data_t info = {0, 0};
2447  mach_timebase_info(&info);
2448  long long start, stop;
2449 #else
2450  timespec time_funct_start, time_funct_end;
2451 #endif
2452 
2453  // input data
2454  int resolution = 300;
2455  int wpl = pixGetWpl(input.pix);
2456  int kThinLineFraction = 20; // tess constant
2457  int kMinLineLengthFraction = 4; // tess constant
2458  int max_line_width = resolution / kThinLineFraction;
2459  int min_line_length = resolution / kMinLineLengthFraction;
2460  int closing_brick = max_line_width / 3;
2461 
2462  // function call
2463  if (type == DS_DEVICE_OPENCL_DEVICE) {
2464 #if ON_WINDOWS
2465  QueryPerformanceCounter(&time_funct_start);
2466 #elif ON_APPLE
2467  start = mach_absolute_time();
2468 #else
2469  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2470 #endif
2471  OpenclDevice::gpuEnv = *env;
2472  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2473  Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
2474  OpenclDevice::pixGetLinesCL(
2475  NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2476  closing_brick, closing_brick, max_line_width, max_line_width,
2477  min_line_length, min_line_length);
2478 
2479  OpenclDevice::releaseMorphCLBuffers();
2480 
2481 #if ON_WINDOWS
2482  QueryPerformanceCounter(&time_funct_end);
2483  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2484 #elif ON_APPLE
2485  stop = mach_absolute_time();
2486  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2487 #else
2488  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2489  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2490 #endif
2491  } else {
2492 #if ON_WINDOWS
2493  QueryPerformanceCounter(&time_funct_start);
2494 #elif ON_APPLE
2495  start = mach_absolute_time();
2496 #else
2497  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2498 #endif
2499 
2500  // native serial code
2501  Pix *src_pix = input.pix;
2502  Pix *pix_closed =
2503  pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
2504  Pix *pix_solid =
2505  pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
2506  Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
2507  pixDestroy(&pix_solid);
2508  Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
2509  Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
2510  pixDestroy(&pix_hollow);
2511 
2512 #if ON_WINDOWS
2513  QueryPerformanceCounter(&time_funct_end);
2514  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2515 #elif ON_APPLE
2516  stop = mach_absolute_time();
2517  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2518 #else
2519  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2520  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2521 #endif
2522  }
2523 
2524  return time;
2525 }
2526 
2527 
2528 
2529 /******************************************************************************
2530  * Device Selection
2531  *****************************************************************************/
2532 
2533 #include "stdlib.h"
2534 
2535 // encode score object as byte string
2536 static ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
2537  *serializedScoreSize = sizeof(TessDeviceScore);
2538  *serializedScore = new unsigned char[*serializedScoreSize];
2539  memcpy(*serializedScore, device->score, *serializedScoreSize);
2540  return DS_SUCCESS;
2541 }
2542 
2543 // parses byte string and stores in score object
2544 static ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
2545  // check that serializedScoreSize == sizeof(TessDeviceScore);
2546  device->score = new TessDeviceScore;
2547  memcpy(device->score, serializedScore, serializedScoreSize);
2548  return DS_SUCCESS;
2549 }
2550 
2551 static ds_status releaseScore(void *score) {
2552  delete (TessDeviceScore *)score;
2553  return DS_SUCCESS;
2554 }
2555 
2556 // evaluate devices
2557 static ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
2558  // overwrite statuc gpuEnv w/ current device
2559  // so native opencl calls can be used; they use static gpuEnv
2560  printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
2561  GPUEnv *env = NULL;
2562  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2563  env = new GPUEnv;
2564  //printf("[DS] populating tmp GPUEnv from device\n");
2565  populateGPUEnvFromDevice( env, device->oclDeviceID);
2566  env->mnFileCount = 0; //argc;
2567  env->mnKernelCount = 0UL;
2568  //printf("[DS] compiling kernels for tmp GPUEnv\n");
2569  OpenclDevice::gpuEnv = *env;
2570  OpenclDevice::CompileKernelFile(env, "");
2571  }
2572 
2573  TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
2574 
2575  // pixReadTiff
2576  double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
2577 
2578  // HistogramRect
2579  double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
2580 
2581  // ThresholdRectToPix
2582  double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
2583 
2584  // getLineMasks
2585  double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
2586 
2587 
2588  // weigh times (% of cpu time)
2589  // these weights should be the % execution time that the native cpu code took
2590  float composeRGBPixelWeight = 1.2f;
2591  float histogramRectWeight = 2.4f;
2592  float thresholdRectToPixWeight = 4.5f;
2593  float getLineMasksMorphWeight = 5.0f;
2594 
2595  float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2596  histogramRectWeight * histogramRectTime +
2597  thresholdRectToPixWeight * thresholdRectToPixTime +
2598  getLineMasksMorphWeight * getLineMasksMorphTime;
2599  device->score = new TessDeviceScore;
2600  ((TessDeviceScore *)device->score)->time = weightedTime;
2601 
2602  printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
2603  printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
2604  printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
2605  printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
2606  printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
2607  printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
2608  return DS_SUCCESS;
2609 }
2610 
2611 // initial call to select device
2612 ds_device OpenclDevice::getDeviceSelection( ) {
2613  if (!deviceIsSelected) {
2614  PERF_COUNT_START("getDeviceSelection")
2615  // check if opencl is available at runtime
2616  if (1 == LoadOpencl()) {
2617  // opencl is available
2618  // PERF_COUNT_SUB("LoadOpencl")
2619  // setup devices
2620  ds_status status;
2621  ds_profile *profile;
2622  status = initDSProfile(&profile, "v0.1");
2623  PERF_COUNT_SUB("initDSProfile")
2624  // try reading scores from file
2625  const char *fileName = "tesseract_opencl_profile_devices.dat";
2626  status = readProfileFromFile(profile, deserializeScore, fileName);
2627  if (status != DS_SUCCESS) {
2628  // need to run evaluation
2629  printf("[DS] Profile file not available (%s); performing profiling.\n",
2630  fileName);
2631 
2632  // create input data
2633  TessScoreEvaluationInputData input;
2634  populateTessScoreEvaluationInputData(&input);
2635  // PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
2636  // perform evaluations
2637  unsigned int numUpdates;
2638  status = profileDevices(profile, DS_EVALUATE_ALL,
2639  evaluateScoreForDevice, &input, &numUpdates);
2640  PERF_COUNT_SUB("profileDevices")
2641  // write scores to file
2642  if (status == DS_SUCCESS) {
2643  status = writeProfileToFile(profile, serializeScore, fileName);
2644  PERF_COUNT_SUB("writeProfileToFile")
2645  if (status == DS_SUCCESS) {
2646  printf("[DS] Scores written to file (%s).\n", fileName);
2647  } else {
2648  printf(
2649  "[DS] Error saving scores to file (%s); scores not written to "
2650  "file.\n",
2651  fileName);
2652  }
2653  } else {
2654  printf(
2655  "[DS] Unable to evaluate performance; scores not written to "
2656  "file.\n");
2657  }
2658  } else {
2659  PERF_COUNT_SUB("readProfileFromFile")
2660  printf("[DS] Profile read from file (%s).\n", fileName);
2661  }
2662 
2663  // we now have device scores either from file or evaluation
2664  // select fastest using custom Tesseract selection algorithm
2665  float bestTime = FLT_MAX; // begin search with worst possible time
2666  int bestDeviceIdx = -1;
2667  for (unsigned d = 0; d < profile->numDevices; d++) {
2668  ds_device device = profile->devices[d];
2669  TessDeviceScore score = *(TessDeviceScore *)device.score;
2670 
2671  float time = score.time;
2672  printf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2673  device.oclDeviceName, time);
2674  if (time < bestTime) {
2675  bestTime = time;
2676  bestDeviceIdx = d;
2677  }
2678  }
2679  printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2680  profile->devices[bestDeviceIdx].oclDeviceName,
2681  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2682  ? "OpenCL"
2683  : "Native");
2684  // cleanup
2685  // TODO: call destructor for profile object?
2686 
2687  bool overridden = false;
2688  char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2689  if (overrideDeviceStr != NULL) {
2690  int overrideDeviceIdx = atoi(overrideDeviceStr);
2691  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2692  printf(
2693  "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2694  "%i)\n",
2695  overrideDeviceStr, overrideDeviceIdx);
2696  bestDeviceIdx = overrideDeviceIdx - 1;
2697  overridden = true;
2698  } else {
2699  printf(
2700  "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2701  "valid devices).\n",
2702  overrideDeviceStr, profile->numDevices);
2703  }
2704  }
2705 
2706  if (overridden) {
2707  printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2708  profile->devices[bestDeviceIdx].oclDeviceName,
2709  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2710  ? "OpenCL"
2711  : "Native");
2712  }
2713  selectedDevice = profile->devices[bestDeviceIdx];
2714  // cleanup
2715  releaseDSProfile(profile, releaseScore);
2716  } else {
2717  // opencl isn't available at runtime, select native cpu device
2718  printf("[DS] OpenCL runtime not available.\n");
2719  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2720  selectedDevice.oclDeviceName = "(null)";
2721  selectedDevice.score = NULL;
2722  selectedDevice.oclDeviceID = NULL;
2723  selectedDevice.oclDriverVersion = NULL;
2724  }
2725  deviceIsSelected = true;
2726  PERF_COUNT_SUB("select from Profile")
2728  }
2729  // PERF_COUNT_END
2730  return selectedDevice;
2731 }
2732 
2733 
2734 bool OpenclDevice::selectedDeviceIsOpenCL() {
2735  ds_device device = getDeviceSelection();
2736  return (device.type == DS_DEVICE_OPENCL_DEVICE);
2737 }
2738 
2739 #endif
#define PERF_COUNT_SUB(SUB)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:43
const int kHistogramSize
Definition: otsuthr.h:27
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:151
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:41
#define PERF_COUNT_START(FUNCT_NAME)
#define PERF_COUNT_END
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
const char * kernel_src
Definition: oclkernels.h:21