13 #include <sys/types.h> 26 #include <mach/mach_time.h> 30 #define CALLOC LEPT_CALLOC 31 #define FREE LEPT_FREE 36 GPUEnv OpenclDevice::gpuEnv;
38 bool OpenclDevice::deviceIsSelected =
false;
39 ds_device OpenclDevice::selectedDevice;
41 int OpenclDevice::isInited = 0;
43 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
45 static const l_uint32 lmask32[] = {
46 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
47 0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
48 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
49 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
50 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
51 0xfffffffe, 0xffffffff};
53 static const l_uint32 rmask32[] = {
54 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
55 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
56 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
57 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
58 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
59 0x7fffffff, 0xffffffff};
61 static cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate;
62 static cl_mem pixThBuffer;
63 static cl_int clStatus;
64 static KernelEnv rEnv;
66 #define DS_TAG_VERSION "<version>" 67 #define DS_TAG_VERSION_END "</version>" 68 #define DS_TAG_DEVICE "<device>" 69 #define DS_TAG_DEVICE_END "</device>" 70 #define DS_TAG_SCORE "<score>" 71 #define DS_TAG_SCORE_END "</score>" 72 #define DS_TAG_DEVICE_TYPE "<type>" 73 #define DS_TAG_DEVICE_TYPE_END "</type>" 74 #define DS_TAG_DEVICE_NAME "<name>" 75 #define DS_TAG_DEVICE_NAME_END "</name>" 76 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>" 77 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>" 79 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu" 81 #define DS_DEVICE_NAME_LENGTH 256 83 typedef enum { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY } ds_evaluation_type;
86 unsigned int numDevices;
93 DS_INVALID_PROFILE = 1000,
95 DS_INVALID_PERF_EVALUATOR_TYPE,
96 DS_INVALID_PERF_EVALUATOR,
97 DS_PERF_EVALUATOR_ERROR,
99 DS_UNKNOWN_DEVICE_TYPE,
100 DS_PROFILE_FILE_ERROR,
101 DS_SCORE_SERIALIZER_ERROR,
102 DS_SCORE_DESERIALIZER_ERROR
109 typedef ds_status (*ds_perf_evaluator)(ds_device *device,
void *data);
112 typedef ds_status (*ds_score_release)(
void *score);
113 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
114 ds_status status = DS_SUCCESS;
115 if (profile != NULL) {
116 if (profile->devices != NULL && sr != NULL) {
118 for (i = 0; i < profile->numDevices; i++) {
119 free(profile->devices[i].oclDeviceName);
120 free(profile->devices[i].oclDriverVersion);
121 status = sr(profile->devices[i].score);
122 if (status != DS_SUCCESS)
break;
124 free(profile->devices);
131 static ds_status initDSProfile(ds_profile **p,
const char *version) {
133 cl_uint numPlatforms;
134 cl_platform_id *platforms = NULL;
135 cl_device_id *devices = NULL;
136 ds_status status = DS_SUCCESS;
140 if (p == NULL)
return DS_INVALID_PROFILE;
142 ds_profile *profile = (ds_profile *)malloc(
sizeof(ds_profile));
143 if (profile == NULL)
return DS_MEMORY_ERROR;
145 memset(profile, 0,
sizeof(ds_profile));
147 clGetPlatformIDs(0, NULL, &numPlatforms);
149 if (numPlatforms > 0) {
150 platforms = (cl_platform_id *)malloc(numPlatforms *
sizeof(cl_platform_id));
151 if (platforms == NULL) {
152 status = DS_MEMORY_ERROR;
155 clGetPlatformIDs(numPlatforms, platforms, NULL);
159 for (i = 0; i < (
unsigned int)numPlatforms; i++) {
161 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num);
165 if (numDevices > 0) {
166 devices = (cl_device_id *)malloc(numDevices *
sizeof(cl_device_id));
167 if (devices == NULL) {
168 status = DS_MEMORY_ERROR;
173 profile->numDevices =
176 (ds_device *)malloc(profile->numDevices *
sizeof(ds_device));
177 if (profile->devices == NULL) {
178 profile->numDevices = 0;
179 status = DS_MEMORY_ERROR;
182 memset(profile->devices, 0, profile->numDevices *
sizeof(ds_device));
185 for (i = 0; i < (
unsigned int)numPlatforms; i++) {
188 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num);
189 for (j = 0; j < num; j++, next++) {
190 char buffer[DS_DEVICE_NAME_LENGTH];
193 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
194 profile->devices[next].oclDeviceID = devices[j];
196 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
197 DS_DEVICE_NAME_LENGTH, &buffer, NULL);
198 length = strlen(buffer);
199 profile->devices[next].oclDeviceName = (
char *)malloc(length + 1);
200 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
202 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
203 DS_DEVICE_NAME_LENGTH, &buffer, NULL);
204 length = strlen(buffer);
205 profile->devices[next].oclDriverVersion = (
char *)malloc(length + 1);
206 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
209 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
210 profile->version = version;
215 if (status == DS_SUCCESS) {
219 free(profile->devices);
226 static ds_status profileDevices(ds_profile *profile,
227 const ds_evaluation_type type,
228 ds_perf_evaluator evaluator,
229 void *evaluatorData,
unsigned int *numUpdates) {
230 ds_status status = DS_SUCCESS;
232 unsigned int updates = 0;
234 if (profile == NULL) {
235 return DS_INVALID_PROFILE;
237 if (evaluator == NULL) {
238 return DS_INVALID_PERF_EVALUATOR;
241 for (i = 0; i < profile->numDevices; i++) {
242 ds_status evaluatorStatus;
245 case DS_EVALUATE_NEW_ONLY:
246 if (profile->devices[i].score != NULL)
break;
248 case DS_EVALUATE_ALL:
249 evaluatorStatus = evaluator(profile->devices + i, evaluatorData);
250 if (evaluatorStatus != DS_SUCCESS) {
251 status = evaluatorStatus;
257 return DS_INVALID_PERF_EVALUATOR_TYPE;
261 if (numUpdates) *numUpdates = updates;
265 static const char *findString(
const char *contentStart,
const char *contentEnd,
266 const char *
string) {
268 const char *currentPosition;
269 const char *found = NULL;
270 stringLength = strlen(
string);
271 currentPosition = contentStart;
272 for (currentPosition = contentStart; currentPosition < contentEnd;
274 if (*currentPosition ==
string[0]) {
275 if (currentPosition + stringLength < contentEnd) {
276 if (strncmp(currentPosition,
string, stringLength) == 0) {
277 found = currentPosition;
286 static ds_status readProFile(
const char *fileName,
char **content,
287 size_t *contentSize) {
293 FILE *input = fopen(fileName,
"rb");
295 return DS_FILE_ERROR;
298 fseek(input, 0L, SEEK_END);
301 char *binary = (
char *)malloc(size);
302 if (binary == NULL) {
304 return DS_FILE_ERROR;
306 fread(binary,
sizeof(
char), size, input);
314 typedef ds_status (*ds_score_deserializer)(ds_device *device,
315 const unsigned char *serializedScore,
316 unsigned int serializedScoreSize);
318 static ds_status readProfileFromFile(ds_profile *profile,
319 ds_score_deserializer deserializer,
321 ds_status status = DS_SUCCESS;
322 char *contentStart = NULL;
323 const char *contentEnd = NULL;
326 if (profile == NULL)
return DS_INVALID_PROFILE;
328 status = readProFile(file, &contentStart, &contentSize);
329 if (status == DS_SUCCESS) {
330 const char *currentPosition;
331 const char *dataStart;
334 contentEnd = contentStart + contentSize;
335 currentPosition = contentStart;
338 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
339 if (dataStart == NULL) {
340 status = DS_PROFILE_FILE_ERROR;
343 dataStart += strlen(DS_TAG_VERSION);
345 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
346 if (dataEnd == NULL) {
347 status = DS_PROFILE_FILE_ERROR;
351 size_t versionStringLength = strlen(profile->version);
352 if (versionStringLength + dataStart != dataEnd ||
353 strncmp(profile->version, dataStart, versionStringLength) != 0) {
355 status = DS_PROFILE_FILE_ERROR;
358 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
364 const char *deviceTypeStart;
365 const char *deviceTypeEnd;
366 ds_device_type deviceType;
368 const char *deviceNameStart;
369 const char *deviceNameEnd;
371 const char *deviceScoreStart;
372 const char *deviceScoreEnd;
374 const char *deviceDriverStart;
375 const char *deviceDriverEnd;
377 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
378 if (dataStart == NULL) {
382 dataStart += strlen(DS_TAG_DEVICE);
383 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
384 if (dataEnd == NULL) {
385 status = DS_PROFILE_FILE_ERROR;
390 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
391 if (deviceTypeStart == NULL) {
392 status = DS_PROFILE_FILE_ERROR;
395 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
397 findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
398 if (deviceTypeEnd == NULL) {
399 status = DS_PROFILE_FILE_ERROR;
402 memcpy(&deviceType, deviceTypeStart,
sizeof(ds_device_type));
405 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
406 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
407 if (deviceNameStart == NULL) {
408 status = DS_PROFILE_FILE_ERROR;
411 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
413 findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
414 if (deviceNameEnd == NULL) {
415 status = DS_PROFILE_FILE_ERROR;
420 findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
421 if (deviceDriverStart == NULL) {
422 status = DS_PROFILE_FILE_ERROR;
425 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
426 deviceDriverEnd = findString(deviceDriverStart, contentEnd,
427 DS_TAG_DEVICE_DRIVER_VERSION_END);
428 if (deviceDriverEnd == NULL) {
429 status = DS_PROFILE_FILE_ERROR;
434 for (i = 0; i < profile->numDevices; i++) {
435 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
436 size_t actualDeviceNameLength;
437 size_t driverVersionLength;
439 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
440 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
441 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
442 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
443 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
444 actualDeviceNameLength) == 0 &&
445 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
446 driverVersionLength) == 0) {
448 findString(dataStart, contentEnd, DS_TAG_SCORE);
449 if (deviceNameStart == NULL) {
450 status = DS_PROFILE_FILE_ERROR;
453 deviceScoreStart += strlen(DS_TAG_SCORE);
455 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
456 status = deserializer(profile->devices + i,
457 (
const unsigned char *)deviceScoreStart,
458 deviceScoreEnd - deviceScoreStart);
459 if (status != DS_SUCCESS) {
465 }
else if (deviceType == DS_DEVICE_NATIVE_CPU) {
466 for (i = 0; i < profile->numDevices; i++) {
467 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
468 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
469 if (deviceScoreStart == NULL) {
470 status = DS_PROFILE_FILE_ERROR;
473 deviceScoreStart += strlen(DS_TAG_SCORE);
475 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
476 status = deserializer(profile->devices + i,
477 (
const unsigned char *)deviceScoreStart,
478 deviceScoreEnd - deviceScoreStart);
479 if (status != DS_SUCCESS) {
487 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
495 typedef ds_status (*ds_score_serializer)(ds_device *device,
496 void **serializedScore,
497 unsigned int *serializedScoreSize);
498 static ds_status writeProfileToFile(ds_profile *profile,
499 ds_score_serializer serializer,
501 ds_status status = DS_SUCCESS;
503 if (profile == NULL)
return DS_INVALID_PROFILE;
505 FILE *profileFile = fopen(file,
"wb");
506 if (profileFile == NULL) {
507 status = DS_FILE_ERROR;
512 fwrite(DS_TAG_VERSION,
sizeof(
char), strlen(DS_TAG_VERSION), profileFile);
513 fwrite(profile->version,
sizeof(
char), strlen(profile->version),
515 fwrite(DS_TAG_VERSION_END,
sizeof(
char), strlen(DS_TAG_VERSION_END),
517 fwrite(
"\n",
sizeof(
char), 1, profileFile);
519 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
520 void *serializedScore;
521 unsigned int serializedScoreSize;
523 fwrite(DS_TAG_DEVICE,
sizeof(
char), strlen(DS_TAG_DEVICE), profileFile);
525 fwrite(DS_TAG_DEVICE_TYPE,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE),
527 fwrite(&profile->devices[i].type,
sizeof(ds_device_type), 1, profileFile);
528 fwrite(DS_TAG_DEVICE_TYPE_END,
sizeof(
char),
529 strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
531 switch (profile->devices[i].type) {
532 case DS_DEVICE_NATIVE_CPU: {
543 case DS_DEVICE_OPENCL_DEVICE: {
544 fwrite(DS_TAG_DEVICE_NAME,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME),
546 fwrite(profile->devices[i].oclDeviceName,
sizeof(
char),
547 strlen(profile->devices[i].oclDeviceName), profileFile);
548 fwrite(DS_TAG_DEVICE_NAME_END,
sizeof(
char),
549 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
551 fwrite(DS_TAG_DEVICE_DRIVER_VERSION,
sizeof(
char),
552 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
553 fwrite(profile->devices[i].oclDriverVersion,
sizeof(
char),
554 strlen(profile->devices[i].oclDriverVersion), profileFile);
555 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END,
sizeof(
char),
556 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
559 status = DS_UNKNOWN_DEVICE_TYPE;
563 fwrite(DS_TAG_SCORE,
sizeof(
char), strlen(DS_TAG_SCORE), profileFile);
564 status = serializer(profile->devices + i, &serializedScore,
565 &serializedScoreSize);
566 if (status == DS_SUCCESS && serializedScore != NULL &&
567 serializedScoreSize > 0) {
568 fwrite(serializedScore,
sizeof(
char), serializedScoreSize, profileFile);
569 free(serializedScore);
571 fwrite(DS_TAG_SCORE_END,
sizeof(
char), strlen(DS_TAG_SCORE_END),
573 fwrite(DS_TAG_DEVICE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_END),
575 fwrite(
"\n",
sizeof(
char), 1, profileFile);
583 static void legalizeFileName(
char *fileName) {
585 const char *invalidChars =
588 for (
unsigned i = 0; i < strlen(invalidChars); i++) {
590 invalidStr[0] = invalidChars[i];
591 invalidStr[1] =
'\0';
597 for (
char *pos = strstr(fileName, invalidStr); pos != NULL;
598 pos = strstr(pos + 1, invalidStr)) {
606 static void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
609 gpuInfo->mnIsUserCreated = 1;
611 gpuInfo->mpDevID = device;
612 gpuInfo->mpArryDevsID =
new cl_device_id[1];
613 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
615 clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
616 sizeof(cl_device_type), &gpuInfo->mDevType, &size);
617 CHECK_OPENCL( clStatus,
"populateGPUEnv::getDeviceInfo(TYPE)");
620 clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
621 sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
622 CHECK_OPENCL( clStatus,
"populateGPUEnv::getDeviceInfo(PLATFORM)");
624 cl_context_properties props[3];
625 props[0] = CL_CONTEXT_PLATFORM;
626 props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
628 gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL,
630 CHECK_OPENCL( clStatus,
"populateGPUEnv::createContext");
632 cl_command_queue_properties queueProperties = 0;
633 gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
634 CHECK_OPENCL( clStatus,
"populateGPUEnv::createCommandQueue");
637 int OpenclDevice::LoadOpencl()
640 HINSTANCE HOpenclDll = NULL;
641 void *OpenclDll = NULL;
643 OpenclDll =
static_cast<HINSTANCE
>(HOpenclDll);
644 OpenclDll = LoadLibrary(
"openCL.dll");
645 if (!static_cast<HINSTANCE>(OpenclDll)) {
646 fprintf(stderr,
"[OD] Load opencl.dll failed!\n");
647 FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
650 fprintf(stderr,
"[OD] Load opencl.dll successful!\n");
654 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
656 envInfo->mpkContext = gpuEnv.mpContext;
657 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
658 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
663 static cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer,
664 size_t nElements, cl_mem_flags flags,
667 cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
668 nElements *
sizeof(l_uint32), hostbuffer, pStatus);
674 Pix *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, Pix *pixd, Pix *pixs,
675 int elements, cl_mem_flags flags,
bool memcopy =
false,
677 PROCNAME(
"mapOutputCLBuffer");
680 if ((pixd = pixCreateTemplate(pixs)) == NULL)
681 (Pix *)ERROR_PTR(
"pixd not made", procName, NULL);
683 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
684 pixGetDepth(pixs))) == NULL)
685 (Pix *)ERROR_PTR(
"pixd not made", procName, NULL);
688 l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(
689 rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
690 elements *
sizeof(l_uint32), 0, NULL, NULL, NULL);
693 memcpy(pixGetData(pixd), pValues, elements *
sizeof(l_uint32));
695 pixSetData(pixd, pValues);
698 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, NULL,
702 clFinish(rEnv.mpkCmdQueue);
708 void OpenclDevice::releaseMorphCLBuffers()
710 if (pixdCLIntermediate != NULL) clReleaseMemObject(pixdCLIntermediate);
711 if (pixsCLBuffer != NULL) clReleaseMemObject(pixsCLBuffer);
712 if (pixdCLBuffer != NULL) clReleaseMemObject(pixdCLBuffer);
713 if (pixThBuffer != NULL) clReleaseMemObject(pixThBuffer);
714 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = NULL;
717 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs)
719 SetKernelEnv( &rEnv );
721 if (pixThBuffer != NULL) {
722 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl * h,
723 CL_MEM_ALLOC_HOST_PTR, &clStatus);
727 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
728 sizeof(l_uint32) * wpl * h, 0, NULL, NULL);
733 l_uint32* srcdata = (l_uint32*) malloc(wpl*h*
sizeof(l_uint32));
734 memcpy(srcdata, pixGetData(pixs), wpl*h*
sizeof(l_uint32));
736 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
739 pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl * h,
740 CL_MEM_ALLOC_HOST_PTR, &clStatus);
742 pixdCLIntermediate = allocateZeroCopyBuffer(
743 rEnv, NULL, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
745 return (
int)clStatus;
748 int OpenclDevice::InitEnv()
755 if( 1 == LoadOpencl() )
762 InitOpenclRunEnv_DeviceSelection( 0 );
768 int OpenclDevice::ReleaseOpenclRunEnv()
770 ReleaseOpenclEnv( &gpuEnv );
776 inline int OpenclDevice::AddKernelConfig(
int kCount,
const char *kName )
779 fprintf(stderr,
"Error: ( KCount < 1 ) AddKernelConfig\n" );
780 strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
781 gpuEnv.mnKernelCount++;
784 int OpenclDevice::RegistOpenclKernel()
786 if ( !gpuEnv.mnIsUserCreated )
787 memset( &gpuEnv, 0,
sizeof(gpuEnv) );
789 gpuEnv.mnFileCount = 0;
790 gpuEnv.mnKernelCount = 0UL;
792 AddKernelConfig( 1, (
const char*)
"oclAverageSub1" );
796 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(
int argc ) {
800 ds_device bestDevice_DS = getDeviceSelection( );
802 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
804 if (selectedDeviceIsOpenCL() ) {
806 populateGPUEnvFromDevice( &gpuEnv, bestDevice );
807 gpuEnv.mnFileCount = 0;
808 gpuEnv.mnKernelCount = 0UL;
810 CompileKernelFile(&gpuEnv,
"");
822 OpenclDevice::OpenclDevice()
827 OpenclDevice::~OpenclDevice()
832 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
842 for ( i = 0; i < gpuEnv.mnFileCount; i++ )
844 if ( gpuEnv.mpArryPrograms[i] )
846 clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
847 CHECK_OPENCL( clStatus,
"clReleaseProgram" );
848 gpuEnv.mpArryPrograms[i] = NULL;
851 if ( gpuEnv.mpCmdQueue )
853 clReleaseCommandQueue( gpuEnv.mpCmdQueue );
854 gpuEnv.mpCmdQueue = NULL;
856 if ( gpuEnv.mpContext )
858 clReleaseContext( gpuEnv.mpContext );
859 gpuEnv.mpContext = NULL;
862 gpuInfo->mnIsUserCreated = 0;
863 delete[] gpuInfo->mpArryDevsID;
866 int OpenclDevice::BinaryGenerated(
const char * clFileName, FILE ** fhandle )
873 char fileName[256] = {0}, cl_name[128] = {0};
874 char deviceName[1024];
875 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
876 sizeof(deviceName), deviceName, NULL);
877 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
878 str = (
char *)strstr(clFileName, (
char *)
".cl");
879 memcpy(cl_name, clFileName, str - clFileName);
880 cl_name[str - clFileName] =
'\0';
881 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
882 legalizeFileName(fileName);
883 fd = fopen(fileName,
"rb");
884 status = (fd != NULL) ? 1 : 0;
891 int OpenclDevice::CachedOfKernerPrg(
const GPUEnv *gpuEnvCached,
const char * clFileName )
894 for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
896 if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
898 if (gpuEnvCached->mpArryPrograms[i] != NULL) {
906 int OpenclDevice::WriteBinaryToFile(
const char* fileName,
const char* birary,
size_t numBytes )
909 output = fopen(fileName,
"wb");
910 if (output == NULL) {
914 fwrite( birary,
sizeof(
char), numBytes, output );
920 int OpenclDevice::GeneratBinFromKernelSource( cl_program program,
const char * clFileName )
926 cl_device_id *mpArryDevsID;
927 char **binaries, *str = NULL;
929 clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
930 sizeof(numDevices), &numDevices, NULL);
931 CHECK_OPENCL( clStatus,
"clGetProgramInfo" );
933 mpArryDevsID = (cl_device_id*) malloc(
sizeof(cl_device_id) * numDevices );
934 if (mpArryDevsID == NULL) {
938 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
939 sizeof(cl_device_id) * numDevices, mpArryDevsID,
941 CHECK_OPENCL( clStatus,
"clGetProgramInfo" );
944 binarySizes = (
size_t*) malloc(
sizeof(
size_t) * numDevices );
947 clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
948 sizeof(
size_t) * numDevices, binarySizes, NULL);
949 CHECK_OPENCL( clStatus,
"clGetProgramInfo" );
952 binaries = (
char**) malloc(
sizeof(
char *) * numDevices );
953 if (binaries == NULL) {
957 for ( i = 0; i < numDevices; i++ )
959 if ( binarySizes[i] != 0 )
961 binaries[i] = (
char*) malloc(
sizeof(
char) * binarySizes[i] );
962 if (binaries[i] == NULL) {
972 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
973 sizeof(
char *) * numDevices, binaries, NULL);
974 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
977 for ( i = 0; i < numDevices; i++ )
979 char fileName[256] = { 0 }, cl_name[128] = { 0 };
981 if ( binarySizes[i] != 0 )
983 char deviceName[1024];
984 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
985 sizeof(deviceName), deviceName, NULL);
986 CHECK_OPENCL( clStatus,
"clGetDeviceInfo" );
988 str = (
char*) strstr( clFileName, (
char*)
".cl" );
989 memcpy( cl_name, clFileName, str - clFileName );
990 cl_name[str - clFileName] =
'\0';
991 sprintf( fileName,
"%s-%s.bin", cl_name, deviceName );
992 legalizeFileName(fileName);
993 if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
995 printf(
"[OD] write binary[%s] failed\n", fileName);
998 printf(
"[OD] write binary[%s] successfully\n", fileName);
1003 for ( i = 0; i < numDevices; i++ )
1016 mpArryDevsID = NULL;
1021 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo,
const char *buildOption )
1024 cl_int clStatus = 0;
1026 char *buildLog = NULL, *binary;
1028 size_t source_size[1];
1029 int b_error, binary_status, binaryExisted, idx;
1031 cl_device_id *mpArryDevsID;
1033 const char*
filename =
"kernel.cl";
1035 if ( CachedOfKernerPrg(gpuInfo,
filename) == 1 )
1040 idx = gpuInfo->mnFileCount;
1044 source_size[0] = strlen( source );
1046 binaryExisted = BinaryGenerated(
filename, &fd );
1048 if ( binaryExisted == 1 )
1050 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
1051 sizeof(numDevices), &numDevices, NULL);
1052 CHECK_OPENCL(clStatus,
"clGetContextInfo");
1054 mpArryDevsID = (cl_device_id *)malloc(
sizeof(cl_device_id) * numDevices);
1055 if (mpArryDevsID == NULL) {
1061 b_error |= fseek( fd, 0, SEEK_END ) < 0;
1062 b_error |= ( length = ftell(fd) ) <= 0;
1063 b_error |= fseek( fd, 0, SEEK_SET ) < 0;
1069 binary = (
char*) malloc( length + 2 );
1075 memset( binary, 0, length + 2 );
1076 b_error |= fread( binary, 1, length, fd ) != length;
1083 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
1084 sizeof(cl_device_id) * numDevices,
1085 mpArryDevsID, NULL);
1086 CHECK_OPENCL( clStatus,
"clGetContextInfo" );
1089 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
1090 mpArryDevsID, &length, (
const unsigned char**) &binary,
1091 &binary_status, &clStatus );
1092 CHECK_OPENCL( clStatus,
"clCreateProgramWithBinary" );
1095 free( mpArryDevsID );
1096 mpArryDevsID = NULL;
1103 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
1104 source_size, &clStatus);
1105 CHECK_OPENCL( clStatus,
"clCreateProgramWithSource" );
1109 if (gpuInfo->mpArryPrograms[idx] == (cl_program) NULL) {
1117 if (!gpuInfo->mnIsUserCreated)
1120 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1121 buildOption, NULL, NULL);
1127 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1128 buildOption, NULL, NULL);
1132 if ( clStatus != CL_SUCCESS )
1134 printf (
"BuildProgram error!\n");
1135 if ( !gpuInfo->mnIsUserCreated )
1137 clStatus = clGetProgramBuildInfo(
1138 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1139 CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
1143 clStatus = clGetProgramBuildInfo(
1144 gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1145 CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
1147 if ( clStatus != CL_SUCCESS )
1149 printf(
"opencl create build log fail\n");
1152 buildLog = (
char*) malloc( length );
1153 if (buildLog == (
char *)NULL) {
1156 if ( !gpuInfo->mnIsUserCreated )
1158 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1159 CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1163 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1164 CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1166 if ( clStatus != CL_SUCCESS )
1168 printf(
"opencl program build info fail\n");
1172 fd1 = fopen(
"kernel-build.log",
"w+" );
1174 fwrite(buildLog,
sizeof(
char), length, fd1);
1183 strcpy( gpuInfo->mArryKnelSrcFile[idx],
filename );
1185 if ( binaryExisted == 0 ) {
1186 GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx],
filename );
1190 gpuInfo->mnFileCount += 1;
1195 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
1200 size_t globalThreads[2];
1201 size_t localThreads[2];
1207 gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1208 globalThreads[0] = gsize;
1209 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1210 globalThreads[1] = gsize;
1211 localThreads[0] = GROUPSIZE_X;
1212 localThreads[1] = GROUPSIZE_Y;
1214 SetKernelEnv( &rEnv );
1216 l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
1217 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
1218 CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1221 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1222 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1225 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1226 CHECK_OPENCL( clStatus, "clSetKernelArg");
1227 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1228 CHECK_OPENCL( clStatus, "clSetKernelArg" );
1229 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1230 CHECK_OPENCL( clStatus, "clSetKernelArg" );
1231 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1232 CHECK_OPENCL( clStatus, "clSetKernelArg" );
1233 clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1234 CHECK_OPENCL( clStatus, "clSetKernelArg");
1239 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL,
1240 globalThreads, localThreads, 0, NULL, NULL);
1241 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1244 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
1245 0, w * h * sizeof(l_uint32), 0, NULL, NULL,
1247 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1248 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
1251 clFinish(rEnv.mpkCmdQueue);
1258 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h)
1260 size_t globalThreads[2];
1264 size_t localThreads[2];
1267 gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1268 globalThreads[0] = gsize;
1269 globalThreads[1] = GROUPSIZE_HMORY;
1270 localThreads[0] = GROUPSIZE_HMORX;
1271 localThreads[1] = GROUPSIZE_HMORY;
1273 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram,
"morphoDilateHor_5x5", &status );
1274 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_5x5");
1276 status = clSetKernelArg(rEnv.mpkKernel,
1280 status = clSetKernelArg(rEnv.mpkKernel,
1284 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1285 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1287 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1288 NULL, globalThreads, localThreads, 0,
1292 pixtemp = pixsCLBuffer;
1293 pixsCLBuffer = pixdCLBuffer;
1294 pixdCLBuffer = pixtemp;
1297 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1298 globalThreads[0] = gsize;
1299 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1300 globalThreads[1] = gsize;
1301 localThreads[0] = GROUPSIZE_X;
1302 localThreads[1] = GROUPSIZE_Y;
1304 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram,
"morphoDilateVer_5x5", &status );
1305 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer_5x5");
1307 status = clSetKernelArg(rEnv.mpkKernel,
1311 status = clSetKernelArg(rEnv.mpkKernel,
1315 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1316 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1317 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1318 NULL, globalThreads, localThreads, 0,
1325 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h)
1327 size_t globalThreads[2];
1331 l_uint32 fwmask, lwmask;
1332 size_t localThreads[2];
1334 lwmask = lmask32[31 - 2];
1335 fwmask = rmask32[31 - 2];
1338 gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1339 globalThreads[0] = gsize;
1340 globalThreads[1] = GROUPSIZE_HMORY;
1341 localThreads[0] = GROUPSIZE_HMORX;
1342 localThreads[1] = GROUPSIZE_HMORY;
1344 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram,
"morphoErodeHor_5x5", &status );
1345 CHECK_OPENCL(status,
"clCreateKernel morphoErodeHor_5x5");
1347 status = clSetKernelArg(rEnv.mpkKernel,
1351 status = clSetKernelArg(rEnv.mpkKernel,
1355 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1356 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1358 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1359 NULL, globalThreads, localThreads, 0,
1363 pixtemp = pixsCLBuffer;
1364 pixsCLBuffer = pixdCLBuffer;
1365 pixdCLBuffer = pixtemp;
1368 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1369 globalThreads[0] = gsize;
1370 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1371 globalThreads[1] = gsize;
1372 localThreads[0] = GROUPSIZE_X;
1373 localThreads[1] = GROUPSIZE_Y;
1375 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram,
"morphoErodeVer_5x5", &status );
1376 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer_5x5");
1378 status = clSetKernelArg(rEnv.mpkKernel,
1382 status = clSetKernelArg(rEnv.mpkKernel,
1386 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1387 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1388 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(fwmask), &fwmask);
1389 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(lwmask), &lwmask);
1390 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1391 NULL, globalThreads, localThreads, 0,
1399 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1401 l_int32 xp, yp, xn, yn;
1403 size_t globalThreads[2];
1407 size_t localThreads[2];
1410 OpenclDevice::SetKernelEnv( &rEnv );
1412 if (hsize == 5 && vsize == 5)
1415 status = pixDilateCL_55(wpl, h);
1419 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1421 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1424 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1425 globalThreads[0] = gsize;
1426 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1427 globalThreads[1] = gsize;
1428 localThreads[0] = GROUPSIZE_X;
1429 localThreads[1] = GROUPSIZE_Y;
1431 if (xp > 31 || xn > 31)
1435 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor", &status);
1436 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor");
1438 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1439 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1440 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1441 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1442 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1443 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1444 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1445 NULL, globalThreads, localThreads, 0,
1448 if (yp > 0 || yn > 0) {
1449 pixtemp = pixsCLBuffer;
1450 pixsCLBuffer = pixdCLBuffer;
1451 pixdCLBuffer = pixtemp;
1454 else if (xp > 0 || xn > 0 )
1458 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_32word", &status);
1459 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_32word");
1460 isEven = (xp != xn);
1462 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1463 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1464 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1465 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1466 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1467 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isEven), &isEven);
1468 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1469 NULL, globalThreads, localThreads, 0,
1472 if (yp > 0 || yn > 0) {
1473 pixtemp = pixsCLBuffer;
1474 pixsCLBuffer = pixdCLBuffer;
1475 pixdCLBuffer = pixtemp;
1479 if (yp > 0 || yn > 0)
1481 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram,
"morphoDilateVer", &status );
1482 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer");
1484 status = clSetKernelArg(rEnv.mpkKernel,
1488 status = clSetKernelArg(rEnv.mpkKernel,
1492 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1493 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1494 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1495 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(yn), &yn);
1496 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1497 NULL, globalThreads, localThreads, 0,
1505 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1506 l_int32 xp, yp, xn, yn;
1508 size_t globalThreads[2];
1509 size_t localThreads[2];
1513 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1514 l_uint32 rwmask, lwmask;
1517 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1519 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1521 OpenclDevice::SetKernelEnv(&rEnv);
1523 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1525 status = pixErodeCL_55(wpl, h);
1529 lwmask = lmask32[31 - (xn & 31)];
1530 rwmask = rmask32[31 - (xp & 31)];
1533 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1534 globalThreads[0] = gsize;
1535 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1536 globalThreads[1] = gsize;
1537 localThreads[0] = GROUPSIZE_X;
1538 localThreads[1] = GROUPSIZE_Y;
1541 if (xp > 31 || xn > 31) {
1543 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor", &status);
1545 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1546 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1547 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1548 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1549 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1550 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1552 clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(isAsymmetric), &isAsymmetric);
1553 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(rwmask), &rwmask);
1554 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(lwmask), &lwmask);
1555 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1556 NULL, globalThreads, localThreads, 0,
1559 if (yp > 0 || yn > 0) {
1560 pixtemp = pixsCLBuffer;
1561 pixsCLBuffer = pixdCLBuffer;
1562 pixdCLBuffer = pixtemp;
1564 }
else if (xp > 0 || xn > 0) {
1566 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_32word", &status);
1567 isEven = (xp != xn);
1569 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1570 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1571 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1572 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1573 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1575 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1576 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(rwmask), &rwmask);
1577 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(lwmask), &lwmask);
1578 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(isEven), &isEven);
1579 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1580 NULL, globalThreads, localThreads, 0,
1583 if (yp > 0 || yn > 0) {
1584 pixtemp = pixsCLBuffer;
1585 pixsCLBuffer = pixdCLBuffer;
1586 pixdCLBuffer = pixtemp;
1591 if (yp > 0 || yn > 0) {
1592 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer", &status);
1593 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer");
1595 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1596 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1597 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1598 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1599 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1601 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1602 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(yn), &yn);
1603 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1604 NULL, globalThreads, localThreads, 0,
1612 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1618 status = pixErodeCL(hsize, vsize, wpl, h);
1620 pixtemp = pixsCLBuffer;
1621 pixsCLBuffer = pixdCLBuffer;
1622 pixdCLBuffer = pixtemp;
1624 status = pixDilateCL(hsize, vsize, wpl, h);
1630 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1636 status = pixDilateCL(hsize, vsize, wpl, h);
1638 pixtemp = pixsCLBuffer;
1639 pixsCLBuffer = pixdCLBuffer;
1640 pixdCLBuffer = pixtemp;
1642 status = pixErodeCL(hsize, vsize, wpl, h);
1649 cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1650 cl_mem buffer2, cl_mem outBuffer = NULL) {
1652 size_t globalThreads[2];
1654 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1656 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1657 globalThreads[0] = gsize;
1658 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1659 globalThreads[1] = gsize;
1661 if (outBuffer != NULL) {
1662 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"pixSubtract", &status);
1663 CHECK_OPENCL(status,
"clCreateKernel pixSubtract");
1666 clCreateKernel(rEnv.mpkProgram,
"pixSubtract_inplace", &status);
1667 CHECK_OPENCL(status,
"clCreateKernel pixSubtract_inplace");
1671 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &buffer1);
1672 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &buffer2);
1673 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1674 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1675 if (outBuffer != NULL) {
1676 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outBuffer);
1679 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL,
1680 globalThreads, localThreads, 0, NULL, NULL);
1687 void OpenclDevice::pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline,
1688 Pix **pix_hline, Pix **pixClosed,
1689 bool getpixClosed, l_int32 close_hsize,
1690 l_int32 close_vsize, l_int32 open_hsize,
1691 l_int32 open_vsize, l_int32 line_hsize,
1692 l_int32 line_vsize) {
1696 wpl = pixGetWpl(pixs);
1697 h = pixGetHeight(pixs);
1700 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1704 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1705 wpl * h, CL_MAP_READ,
true,
false);
1711 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1712 0,
sizeof(
int) * wpl * h, 0, NULL, NULL);
1715 pixtemp = pixsCLBuffer;
1716 pixsCLBuffer = pixdCLBuffer;
1717 pixdCLBuffer = pixtemp;
1719 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1722 pixtemp = pixsCLBuffer;
1723 pixsCLBuffer = pixdCLBuffer;
1724 pixdCLBuffer = pixdCLIntermediate;
1725 pixdCLIntermediate = pixtemp;
1727 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1732 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1733 0,
sizeof(
int) * wpl * h, 0, NULL, NULL);
1735 pixtemp = pixsCLBuffer;
1736 pixsCLBuffer = pixdCLBuffer;
1737 pixdCLBuffer = pixtemp;
1741 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1744 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1745 CL_MAP_READ,
true,
false);
1747 pixtemp = pixsCLBuffer;
1748 pixsCLBuffer = pixdCLIntermediate;
1749 pixdCLIntermediate = pixtemp;
1753 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1756 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1757 CL_MAP_READ,
true,
true);
1768 int OpenclDevice::HistogramRectOCL(
unsigned char *imageData,
1769 int bytes_per_pixel,
int bytes_per_line,
1773 int *histogramAllChannels) {
1778 SetKernelEnv(&histKern);
1779 KernelEnv histRedKern;
1780 SetKernelEnv(&histRedKern);
1786 cl_mem imageBuffer = clCreateBuffer(
1787 histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1788 width * height * bytes_per_pixel * sizeof(
char), imageData, &clStatus);
1789 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1792 int block_size = 256;
1794 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1795 sizeof(numCUs), &numCUs, NULL);
1796 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1798 int requestedOccupancy = 10;
1799 int numWorkGroups = numCUs * requestedOccupancy;
1800 int numThreads = block_size * numWorkGroups;
1801 size_t local_work_size[] = {
static_cast<size_t>(block_size)};
1802 size_t global_work_size[] = {
static_cast<size_t>(numThreads)};
1803 size_t red_global_work_size[] = {
1804 static_cast<size_t>(block_size *
kHistogramSize * bytes_per_pixel)};
1808 cl_mem histogramBuffer = clCreateBuffer(
1809 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1810 kHistogramSize * bytes_per_pixel *
sizeof(
int), histogramAllChannels,
1812 CHECK_OPENCL(clStatus,
"clCreateBuffer histogramBuffer");
1816 int tmpHistogramBins =
kHistogramSize * bytes_per_pixel * histRed;
1818 cl_mem tmpHistogramBuffer =
1819 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1820 tmpHistogramBins *
sizeof(cl_uint), NULL, &clStatus);
1821 CHECK_OPENCL(clStatus,
"clCreateBuffer tmpHistogramBuffer");
1824 int *zeroBuffer =
new int[1];
1826 cl_mem atomicSyncBuffer = clCreateBuffer(
1827 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1828 sizeof(cl_int), zeroBuffer, &clStatus);
1829 CHECK_OPENCL(clStatus,
"clCreateBuffer atomicSyncBuffer");
1830 delete[] zeroBuffer;
1832 if (bytes_per_pixel == 1) {
1833 histKern.mpkKernel = clCreateKernel(
1834 histKern.mpkProgram,
"kernel_HistogramRectOneChannel", &clStatus);
1835 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannel");
1837 histRedKern.mpkKernel =
1838 clCreateKernel(histRedKern.mpkProgram,
1839 "kernel_HistogramRectOneChannelReduction", &clStatus);
1840 CHECK_OPENCL(clStatus,
1841 "clCreateKernel kernel_HistogramRectOneChannelReduction");
1843 histKern.mpkKernel = clCreateKernel( histKern.mpkProgram,
"kernel_HistogramRectAllChannels", &clStatus );
1844 CHECK_OPENCL( clStatus,
"clCreateKernel kernel_HistogramRectAllChannels");
1846 histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram,
"kernel_HistogramRectAllChannelsReduction", &clStatus );
1847 CHECK_OPENCL( clStatus,
"clCreateKernel kernel_HistogramRectAllChannelsReduction");
1853 ptr = clEnqueueMapBuffer(
1854 histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1855 tmpHistogramBins *
sizeof(cl_uint), 0, NULL, NULL, &clStatus);
1856 CHECK_OPENCL( clStatus,
"clEnqueueMapBuffer tmpHistogramBuffer");
1858 memset(ptr, 0, tmpHistogramBins*
sizeof(cl_uint));
1859 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1864 clSetKernelArg(histKern.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1865 CHECK_OPENCL( clStatus,
"clSetKernelArg imageBuffer");
1866 cl_uint numPixels = width*height;
1868 clSetKernelArg(histKern.mpkKernel, 1,
sizeof(cl_uint), &numPixels);
1869 CHECK_OPENCL( clStatus,
"clSetKernelArg numPixels" );
1870 clStatus = clSetKernelArg(histKern.mpkKernel, 2,
sizeof(cl_mem),
1871 &tmpHistogramBuffer);
1872 CHECK_OPENCL( clStatus,
"clSetKernelArg tmpHistogramBuffer");
1875 int n = numThreads/bytes_per_pixel;
1876 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0,
sizeof(cl_int), &n);
1877 CHECK_OPENCL( clStatus,
"clSetKernelArg imageBuffer");
1878 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1,
sizeof(cl_mem),
1879 &tmpHistogramBuffer);
1880 CHECK_OPENCL( clStatus,
"clSetKernelArg tmpHistogramBuffer");
1881 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2,
sizeof(cl_mem),
1883 CHECK_OPENCL( clStatus,
"clSetKernelArg histogramBuffer");
1887 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1888 NULL, global_work_size, local_work_size, 0,
1890 CHECK_OPENCL(clStatus,
1891 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1892 clFinish(histKern.mpkCmdQueue);
1893 if (clStatus != 0) {
1897 clStatus = clEnqueueNDRangeKernel(
1898 histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, NULL,
1899 red_global_work_size, local_work_size, 0, NULL, NULL);
1900 CHECK_OPENCL( clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
1901 clFinish( histRedKern.mpkCmdQueue );
1902 if (clStatus != 0) {
1908 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1911 NULL, NULL, &clStatus);
1912 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
1913 if (clStatus != 0) {
1916 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1919 clReleaseMemObject(histogramBuffer);
1920 clReleaseMemObject(imageBuffer);
1931 int OpenclDevice::ThresholdRectToPixOCL(
unsigned char *imageData,
1932 int bytes_per_pixel,
int bytes_per_line,
1933 int *thresholds,
int *hi_values,
1934 Pix **pix,
int height,
int width,
1935 int top,
int left) {
1939 *pix = pixCreate(width, height, 1);
1940 uint32_t *pixData = pixGetData(*pix);
1941 int wpl = pixGetWpl(*pix);
1942 int pixSize = wpl * height * sizeof(uint32_t);
1946 SetKernelEnv(&rEnv);
1949 int block_size = 256;
1951 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1952 sizeof(numCUs), &numCUs, NULL);
1953 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1955 int requestedOccupancy = 10;
1956 int numWorkGroups = numCUs * requestedOccupancy;
1957 int numThreads = block_size * numWorkGroups;
1958 size_t local_work_size[] = {(size_t)block_size};
1959 size_t global_work_size[] = {(size_t)numThreads};
1966 cl_mem imageBuffer = clCreateBuffer(
1967 rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1968 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1969 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1973 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1974 pixSize, pixData, &clStatus);
1975 CHECK_OPENCL(clStatus,
"clCreateBuffer pix");
1978 cl_mem thresholdsBuffer =
1979 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1980 bytes_per_pixel *
sizeof(
int), thresholds, &clStatus);
1981 CHECK_OPENCL(clStatus,
"clCreateBuffer thresholdBuffer");
1982 cl_mem hiValuesBuffer =
1983 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1984 bytes_per_pixel *
sizeof(
int), hi_values, &clStatus);
1985 CHECK_OPENCL(clStatus,
"clCreateBuffer hiValuesBuffer");
1988 if (bytes_per_pixel == 4) {
1990 clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix", &clStatus);
1991 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix");
1993 rEnv.mpkKernel = clCreateKernel(
1994 rEnv.mpkProgram,
"kernel_ThresholdRectToPix_OneChan", &clStatus);
1995 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix_OneChan");
1999 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
2000 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
2001 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(
int), &height);
2002 CHECK_OPENCL(clStatus,
"clSetKernelArg height");
2003 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(
int), &width);
2004 CHECK_OPENCL(clStatus,
"clSetKernelArg width");
2005 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(
int), &wpl);
2006 CHECK_OPENCL(clStatus,
"clSetKernelArg wpl");
2008 clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &thresholdsBuffer);
2009 CHECK_OPENCL(clStatus,
"clSetKernelArg thresholdsBuffer");
2010 clStatus = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(cl_mem), &hiValuesBuffer);
2011 CHECK_OPENCL(clStatus,
"clSetKernelArg hiValuesBuffer");
2012 clStatus = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(cl_mem), &pixThBuffer);
2013 CHECK_OPENCL(clStatus,
"clSetKernelArg pixThBuffer");
2017 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
2018 NULL, global_work_size, local_work_size,
2020 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
2021 clFinish(rEnv.mpkCmdQueue);
2023 if (clStatus != 0) {
2024 printf(
"Setting return value to -1\n");
2029 clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
2030 pixSize, 0, NULL, NULL, &clStatus);
2031 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
2032 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL,
2035 clReleaseMemObject(imageBuffer);
2036 clReleaseMemObject(thresholdsBuffer);
2037 clReleaseMemObject(hiValuesBuffer);
2050 typedef struct _TessScoreEvaluationInputData {
2054 unsigned char *imageData;
2056 } TessScoreEvaluationInputData;
2058 static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
2063 int numChannels = 4;
2064 input->height = height;
2065 input->width = width;
2066 input->numChannels = numChannels;
2067 unsigned char (*imageData4)[4] = (
unsigned char (*)[4]) malloc(height*width*numChannels*
sizeof(
unsigned char));
2068 input->imageData = (
unsigned char *) &imageData4[0];
2071 unsigned char pixelWhite[4] = { 0, 0, 0, 255};
2072 unsigned char pixelBlack[4] = {255, 255, 255, 255};
2073 for (
int p = 0; p < height*width; p++) {
2075 imageData4[p][0] = pixelWhite[0];
2076 imageData4[p][1] = pixelWhite[1];
2077 imageData4[p][2] = pixelWhite[2];
2078 imageData4[p][3] = pixelWhite[3];
2081 int maxLineWidth = 64;
2084 for (
int i = 0; i < numLines; i++) {
2085 int lineWidth = rand()%maxLineWidth;
2086 int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
2088 for (
int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2089 for (
int col = 0; col < height; col++) {
2091 imageData4[row*width+col][0] = pixelBlack[0];
2092 imageData4[row*width+col][1] = pixelBlack[1];
2093 imageData4[row*width+col][2] = pixelBlack[2];
2094 imageData4[row*width+col][3] = pixelBlack[3];
2099 for (
int i = 0; i < numLines; i++) {
2100 int lineWidth = rand()%maxLineWidth;
2101 int horLinePos = lineWidth + rand()%(height-2*lineWidth);
2103 for (
int row = 0; row < width; row++) {
2104 for (
int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) {
2107 imageData4[row*width+col][0] = pixelBlack[0];
2108 imageData4[row*width+col][1] = pixelBlack[1];
2109 imageData4[row*width+col][2] = pixelBlack[2];
2110 imageData4[row*width+col][3] = pixelBlack[3];
2115 float fractionBlack = 0.1;
2116 int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
2117 for (
int i = 0; i < numSpots; i++) {
2118 int lineWidth = rand()%maxLineWidth;
2119 int col = lineWidth + rand()%(width-2*lineWidth);
2120 int row = lineWidth + rand()%(height-2*lineWidth);
2122 for (
int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
2123 for (
int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
2126 imageData4[r*width+c][0] = pixelBlack[0];
2127 imageData4[r*width+c][1] = pixelBlack[1];
2128 imageData4[r*width+c][2] = pixelBlack[2];
2129 imageData4[r*width+c][3] = pixelBlack[3];
2134 input->pix = pixCreate(input->width, input->height, 1);
2137 typedef struct _TessDeviceScore {
2147 static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2150 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2151 QueryPerformanceFrequency(&freq);
2153 mach_timebase_info_data_t info = {0, 0};
2154 mach_timebase_info(&info);
2155 long long start, stop;
2157 timespec time_funct_start, time_funct_end;
2160 l_uint32 *tiffdata = (l_uint32 *)input.imageData;
2163 if (type == DS_DEVICE_OPENCL_DEVICE) {
2165 QueryPerformanceCounter(&time_funct_start);
2167 start = mach_absolute_time();
2169 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2172 OpenclDevice::gpuEnv = *env;
2173 int wpl = pixGetWpl(input.pix);
2174 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2177 QueryPerformanceCounter(&time_funct_end);
2178 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2180 stop = mach_absolute_time();
2181 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2183 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2184 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2189 QueryPerformanceCounter(&time_funct_start);
2191 start = mach_absolute_time();
2193 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2195 Pix *pix = pixCreate(input.width, input.height, 32);
2196 l_uint32 *pixData = pixGetData(pix);
2199 for (i = 0; i < input.height ; i++) {
2200 for (j = 0; j < input.width; j++) {
2201 l_uint32 tiffword = tiffdata[i * input.width + j];
2202 l_int32 rval = ((tiffword) & 0xff);
2203 l_int32 gval = (((tiffword) >> 8) & 0xff);
2204 l_int32 bval = (((tiffword) >> 16) & 0xff);
2205 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2206 pixData[idx] = value;
2211 QueryPerformanceCounter(&time_funct_end);
2212 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2214 stop = mach_absolute_time();
2215 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2217 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2218 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2229 static double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2232 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2233 QueryPerformanceFrequency(&freq);
2235 mach_timebase_info_data_t info = {0, 0};
2236 mach_timebase_info(&info);
2237 long long start, stop;
2239 timespec time_funct_start, time_funct_end;
2245 int bytes_per_line = input.width*input.numChannels;
2246 int *histogramAllChannels =
new int[
kHistogramSize*input.numChannels];
2248 if (type == DS_DEVICE_OPENCL_DEVICE) {
2250 QueryPerformanceCounter(&time_funct_start);
2252 start = mach_absolute_time();
2254 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2257 OpenclDevice::gpuEnv = *env;
2258 int retVal = OpenclDevice::HistogramRectOCL(
2259 input.imageData, input.numChannels, bytes_per_line, top, left,
2260 input.width, input.height,
kHistogramSize, histogramAllChannels);
2263 QueryPerformanceCounter(&time_funct_end);
2264 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2266 stop = mach_absolute_time();
2268 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2273 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2274 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2279 QueryPerformanceCounter(&time_funct_start);
2281 start = mach_absolute_time();
2283 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2285 for (
int ch = 0; ch < input.numChannels; ++ch) {
2287 input.width, input.height, histogram);
2290 QueryPerformanceCounter(&time_funct_end);
2291 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2293 stop = mach_absolute_time();
2294 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2296 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2297 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2303 delete[] histogramAllChannels;
2308 static void ThresholdRectToPix_Native(
const unsigned char* imagedata,
2309 int bytes_per_pixel,
2311 const int* thresholds,
2312 const int* hi_values,
2316 int width = pixGetWidth(*pix);
2317 int height = pixGetHeight(*pix);
2319 *pix = pixCreate(width, height, 1);
2320 uint32_t *pixdata = pixGetData(*pix);
2321 int wpl = pixGetWpl(*pix);
2322 const unsigned char* srcdata = imagedata + top * bytes_per_line +
2323 left * bytes_per_pixel;
2324 for (
int y = 0; y < height; ++y) {
2325 const uint8_t *linedata = srcdata;
2326 uint32_t *pixline = pixdata + y * wpl;
2327 for (
int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2328 bool white_result =
true;
2329 for (
int ch = 0; ch < bytes_per_pixel; ++ch) {
2330 if (hi_values[ch] >= 0 &&
2331 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2332 white_result =
false;
2337 CLEAR_DATA_BIT(pixline, x);
2339 SET_DATA_BIT(pixline, x);
2341 srcdata += bytes_per_line;
2345 static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2348 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2349 QueryPerformanceFrequency(&freq);
2351 mach_timebase_info_data_t info = {0, 0};
2352 mach_timebase_info(&info);
2353 long long start, stop;
2355 timespec time_funct_start, time_funct_end;
2359 unsigned char pixelHi = (
unsigned char)255;
2360 int* thresholds =
new int[4];
2361 thresholds[0] = pixelHi/2;
2362 thresholds[1] = pixelHi/2;
2363 thresholds[2] = pixelHi/2;
2364 thresholds[3] = pixelHi/2;
2365 int *hi_values =
new int[4];
2366 thresholds[0] = pixelHi;
2367 thresholds[1] = pixelHi;
2368 thresholds[2] = pixelHi;
2369 thresholds[3] = pixelHi;
2373 int bytes_per_line = input.width*input.numChannels;
2376 if (type == DS_DEVICE_OPENCL_DEVICE) {
2378 QueryPerformanceCounter(&time_funct_start);
2380 start = mach_absolute_time();
2382 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2385 OpenclDevice::gpuEnv = *env;
2386 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2387 input.imageData, input.numChannels, bytes_per_line, thresholds,
2388 hi_values, &input.pix, input.height, input.width, top, left);
2391 QueryPerformanceCounter(&time_funct_end);
2392 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2394 stop = mach_absolute_time();
2396 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2403 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2404 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2412 QueryPerformanceCounter(&time_funct_start);
2414 start = mach_absolute_time();
2416 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2418 ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
2419 thresholds, hi_values, &input.pix );
2422 QueryPerformanceCounter(&time_funct_end);
2423 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2425 stop = mach_absolute_time();
2426 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2428 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2429 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2434 delete[] thresholds;
2439 static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2443 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2444 QueryPerformanceFrequency(&freq);
2446 mach_timebase_info_data_t info = {0, 0};
2447 mach_timebase_info(&info);
2448 long long start, stop;
2450 timespec time_funct_start, time_funct_end;
2454 int resolution = 300;
2455 int wpl = pixGetWpl(input.pix);
2460 int closing_brick = max_line_width / 3;
2463 if (type == DS_DEVICE_OPENCL_DEVICE) {
2465 QueryPerformanceCounter(&time_funct_start);
2467 start = mach_absolute_time();
2469 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2471 OpenclDevice::gpuEnv = *env;
2472 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2473 Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
2474 OpenclDevice::pixGetLinesCL(
2475 NULL, input.pix, &pix_vline, &pix_hline, &pix_closed,
true,
2476 closing_brick, closing_brick, max_line_width, max_line_width,
2477 min_line_length, min_line_length);
2479 OpenclDevice::releaseMorphCLBuffers();
2482 QueryPerformanceCounter(&time_funct_end);
2483 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2485 stop = mach_absolute_time();
2486 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2488 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2489 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2493 QueryPerformanceCounter(&time_funct_start);
2495 start = mach_absolute_time();
2497 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2501 Pix *src_pix = input.pix;
2503 pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
2505 pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
2506 Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
2507 pixDestroy(&pix_solid);
2508 Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
2509 Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
2510 pixDestroy(&pix_hollow);
2513 QueryPerformanceCounter(&time_funct_end);
2514 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2516 stop = mach_absolute_time();
2517 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2519 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2520 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2536 static ds_status serializeScore( ds_device* device,
void **serializedScore,
unsigned int* serializedScoreSize ) {
2537 *serializedScoreSize =
sizeof(TessDeviceScore);
2538 *serializedScore =
new unsigned char[*serializedScoreSize];
2539 memcpy(*serializedScore, device->score, *serializedScoreSize);
2544 static ds_status deserializeScore( ds_device* device,
const unsigned char* serializedScore,
unsigned int serializedScoreSize ) {
2546 device->score =
new TessDeviceScore;
2547 memcpy(device->score, serializedScore, serializedScoreSize);
2551 static ds_status releaseScore(
void *score) {
2552 delete (TessDeviceScore *)score;
2557 static ds_status evaluateScoreForDevice( ds_device *device,
void *inputData) {
2560 printf(
"\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native" );
2562 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2565 populateGPUEnvFromDevice( env, device->oclDeviceID);
2566 env->mnFileCount = 0;
2567 env->mnKernelCount = 0UL;
2569 OpenclDevice::gpuEnv = *env;
2570 OpenclDevice::CompileKernelFile(env,
"");
2573 TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
2576 double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
2579 double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
2582 double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
2585 double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
2590 float composeRGBPixelWeight = 1.2f;
2591 float histogramRectWeight = 2.4f;
2592 float thresholdRectToPixWeight = 4.5f;
2593 float getLineMasksMorphWeight = 5.0f;
2595 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2596 histogramRectWeight * histogramRectTime +
2597 thresholdRectToPixWeight * thresholdRectToPixTime +
2598 getLineMasksMorphWeight * getLineMasksMorphTime;
2599 device->score =
new TessDeviceScore;
2600 ((TessDeviceScore *)device->score)->time = weightedTime;
2602 printf(
"[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native" );
2603 printf(
"[DS]%25s: %f (w=%.1f)\n",
"composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
2604 printf(
"[DS]%25s: %f (w=%.1f)\n",
"HistogramRect", histogramRectTime, histogramRectWeight );
2605 printf(
"[DS]%25s: %f (w=%.1f)\n",
"ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
2606 printf(
"[DS]%25s: %f (w=%.1f)\n",
"getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
2607 printf(
"[DS]%25s: %f\n",
"Score", ((TessDeviceScore *)device->score)->time );
2612 ds_device OpenclDevice::getDeviceSelection( ) {
2613 if (!deviceIsSelected) {
2616 if (1 == LoadOpencl()) {
2621 ds_profile *profile;
2622 status = initDSProfile(&profile,
"v0.1");
2625 const
char *fileName = "tesseract_opencl_profile_devices.dat";
2626 status = readProfileFromFile(profile, deserializeScore, fileName);
2627 if (status != DS_SUCCESS) {
2629 printf(
"[DS] Profile file not available (%s); performing profiling.\n",
2633 TessScoreEvaluationInputData input;
2634 populateTessScoreEvaluationInputData(&input);
2637 unsigned int numUpdates;
2638 status = profileDevices(profile, DS_EVALUATE_ALL,
2639 evaluateScoreForDevice, &input, &numUpdates);
2642 if (status == DS_SUCCESS) {
2643 status = writeProfileToFile(profile, serializeScore, fileName);
2645 if (status == DS_SUCCESS) {
2646 printf(
"[DS] Scores written to file (%s).\n", fileName);
2649 "[DS] Error saving scores to file (%s); scores not written to " 2655 "[DS] Unable to evaluate performance; scores not written to " 2660 printf("[DS] Profile read from file (%s).\n", fileName);
2665 float bestTime = FLT_MAX;
2666 int bestDeviceIdx = -1;
2667 for (
unsigned d = 0; d < profile->numDevices; d++) {
2668 ds_device device = profile->devices[d];
2669 TessDeviceScore score = *(TessDeviceScore *)device.score;
2671 float time = score.time;
2672 printf(
"[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2673 device.oclDeviceName, time);
2674 if (time < bestTime) {
2679 printf(
"[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2680 profile->devices[bestDeviceIdx].oclDeviceName,
2681 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2687 bool overridden =
false;
2688 char *overrideDeviceStr = getenv(
"TESSERACT_OPENCL_DEVICE");
2689 if (overrideDeviceStr != NULL) {
2690 int overrideDeviceIdx = atoi(overrideDeviceStr);
2691 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2693 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, " 2695 overrideDeviceStr, overrideDeviceIdx);
2696 bestDeviceIdx = overrideDeviceIdx - 1;
2700 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are " 2701 "valid devices).\n",
2702 overrideDeviceStr, profile->numDevices);
2707 printf(
"[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2708 profile->devices[bestDeviceIdx].oclDeviceName,
2709 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2713 selectedDevice = profile->devices[bestDeviceIdx];
2715 releaseDSProfile(profile, releaseScore);
2718 printf(
"[DS] OpenCL runtime not available.\n");
2719 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2720 selectedDevice.oclDeviceName =
"(null)";
2721 selectedDevice.score = NULL;
2722 selectedDevice.oclDeviceID = NULL;
2723 selectedDevice.oclDriverVersion = NULL;
2725 deviceIsSelected =
true;
2730 return selectedDevice;
2734 bool OpenclDevice::selectedDeviceIsOpenCL() {
2735 ds_device device = getDeviceSelection();
2736 return (device.type == DS_DEVICE_OPENCL_DEVICE);
#define PERF_COUNT_SUB(SUB)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
#define PERF_COUNT_START(FUNCT_NAME)
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)