PSFEstimationwithCPSO
util.hpp
00001 /*
00002  * util.hpp
00003  *
00004  *  Created on: 27/05/2012
00005  *  Author: Peter Frank Perroni (pfperroni@inf.ufpr.br)
00006  */
00007 
#include <CL/cl.h>
#include <CL/opencl.h>
#include <CL/cl_platform.h>
#include <CL/cl_ext.h>
#include <vector_types.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>
#include "config.hpp"
#include "debug.hpp"
00021 
00022 #ifndef _UTIL_HPP_
00023 #define _UTIL_HPP_
00024 
00025 using namespace std;
00026 
/*
 * STRINGIFY(tokens): turns its argument into a string literal.
 * The '#' operator stringifies the argument exactly as written at the
 * STRINGIFY call, i.e. without macro-expanding it first.
 */
#define STRINGIFY(tokens) #tokens

/*
 * OCL_CODE(...): same idea for an arbitrary, possibly comma-separated token
 * list; the whole argument list becomes one string literal.
 */
#define OCL_CODE(...) #__VA_ARGS__
00030 
/*
 * Floating-point "word" selection.
 *
 * Defining _DOUBLE_WORD_ before this point switches every alias below to its
 * double-precision flavour; otherwise the single-precision one is used.
 *
 *   WORD      host scalar type
 *   CL_WORD   matching OpenCL host scalar type
 *   WORD_MAX  largest finite value of WORD (from <float.h>)
 *   FITS_TYPE TFLOAT/TDOUBLE - not defined in this header
 *             (NOTE(review): presumably the CFITSIO datatype codes - confirm)
 *   FFT_TYPE  two-component vector type
 *             (NOTE(review): presumably from <vector_types.h> - confirm)
 */
#if defined(_DOUBLE_WORD_)
#  define WORD      double
#  define CL_WORD   cl_double
#  define WORD_MAX  DBL_MAX
#  define FITS_TYPE TDOUBLE
#  define FFT_TYPE  double2
#else
#  define WORD      float
#  define CL_WORD   cl_float
#  define WORD_MAX  FLT_MAX
#  define FITS_TYPE TFLOAT
#  define FFT_TYPE  float2
#endif

/* Byte sizes of one WORD and one FFT_TYPE element. */
#define SIZEOF_WORD    sizeof(WORD)
#define SIZEOF_FFTTYPE sizeof(FFT_TYPE)
00048 
/*
 * One kernel argument: the byte size of the value and a pointer to it.
 * _CALL_KERNEL forwards both fields, in this order, to clSetKernelArg.
 */
struct clParam {
        size_t sz;   /* size in bytes of the argument value */
        void *ptr;   /* address of the argument value */
};
00053 
/*
 * PROFILING(event): consumes an OpenCL event.
 *
 * Without _PROFILING_ the event is simply released.
 * With _PROFILING_ the macro first waits for the event, reads its start/end
 * profiling counters, adds (end - start) * 1.0e-6 to the accumulated
 * processing time (OpenCL reports these counters in nanoseconds, so the
 * accumulated value is in milliseconds) and then releases the event.
 *
 * `event` must be an lvalue, because its address is taken.
 */
#ifndef _PROFILING_
#define PROFILING(event) clReleaseEvent(event);
#else
#define PROFILING(event) { \
        CHECK_CL_STATE( clWaitForEvents(1, &event); ) \
        cl_ulong _start = 0, _end = 0; \
        CHECK_CL_STATE( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &_start, NULL); ) \
        CHECK_CL_STATE( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &_end, NULL); ) \
        /* Work around an OpenCL bug that may report start and end swapped. */ \
        if(_start > _end){ const cl_ulong _swapped = _start; _start = _end; _end = _swapped; } \
        /* Another OpenCL bug: one of the two counters is sometimes never collected */ \
        /* (re-reading the same property does not help), so such samples are skipped. */ \
        if(_start > 0 && _end > 0) Profiling::increaseProcessingTime((_end - _start) * 1.0e-6f); \
        clReleaseEvent(event); \
}
#endif
00070 
/*
 * clMemcpyDeviceToHost(queue, host_dst, dev_src, nbytes)
 * Blocking read (CL_TRUE) of nbytes bytes from offset 0 of the device buffer
 * dev_src into the host memory host_dst. The resulting event is handed to
 * PROFILING, which releases it.
 */
#define clMemcpyDeviceToHost(queue, host_dst, dev_src, nbytes) \
{ \
        cl_event _event; \
        CHECK_CL_STATE( clEnqueueReadBuffer(queue, dev_src, CL_TRUE, 0, nbytes, host_dst, 0, NULL, &_event) ) \
        PROFILING(_event); \
}
00076 
/*
 * clMemcpyDeviceToHostOffset(queue, host_dst, dev_src, dev_offset, nbytes)
 * Blocking read (CL_TRUE) of nbytes bytes, starting dev_offset bytes into the
 * device buffer dev_src, into the host memory host_dst. The resulting event
 * is handed to PROFILING, which releases it.
 */
#define clMemcpyDeviceToHostOffset(queue, host_dst, dev_src, dev_offset, nbytes) \
{ \
        cl_event _event; \
        CHECK_CL_STATE( clEnqueueReadBuffer(queue, dev_src, CL_TRUE, dev_offset, nbytes, host_dst, 0, NULL, &_event) ) \
        PROFILING(_event); \
}
00082 
/*
 * clMemcpyHostToDevice(queue, dev_dst, host_src, nbytes)
 * Blocking write (CL_TRUE) of nbytes bytes from the host memory host_src to
 * offset 0 of the device buffer dev_dst. The resulting event is handed to
 * PROFILING, which releases it.
 */
#define clMemcpyHostToDevice(queue, dev_dst, host_src, nbytes) \
{ \
        cl_event _event; \
        CHECK_CL_STATE( clEnqueueWriteBuffer(queue, dev_dst, CL_TRUE, 0, nbytes, host_src, 0, NULL, &_event) ) \
        PROFILING(_event); \
}
00088 
/*
 * clMemcpyHostToDeviceOffset(queue, dev_dst, dev_offset, host_src, nbytes)
 * Blocking write (CL_TRUE) of nbytes bytes from the host memory host_src to
 * the device buffer dev_dst, starting dev_offset bytes into it. The resulting
 * event is handed to PROFILING, which releases it.
 */
#define clMemcpyHostToDeviceOffset(queue, dev_dst, dev_offset, host_src, nbytes) \
{ \
        cl_event _event; \
        CHECK_CL_STATE( clEnqueueWriteBuffer(queue, dev_dst, CL_TRUE, dev_offset, nbytes, host_src, 0, NULL, &_event) ) \
        PROFILING(_event); \
}
00094 
/*
 * clMemcpyDeviceToDevice(queue, dev_dst, dev_src, nbytes, blocking)
 * Enqueues a copy of nbytes bytes from offset 0 of dev_src to offset 0 of
 * dev_dst. When `blocking` is true the macro waits for the copy event before
 * continuing. The event is then handed to PROFILING, which releases it.
 */
#define clMemcpyDeviceToDevice(queue, dev_dst, dev_src, nbytes, blocking) \
{ \
        cl_event _event; \
        CHECK_CL_STATE( clEnqueueCopyBuffer(queue, dev_src, dev_dst, 0, 0, nbytes, 0, NULL, &_event) ) \
        if(blocking) clWaitForEvents(1, &_event); \
        PROFILING(_event); \
}
00101 
/*
 * clMemcpyDeviceToDeviceOffset(queue, dev_dst, dst_offset, dev_src, src_offset, nbytes, blocking)
 * Enqueues a copy of nbytes bytes from dev_src (starting at src_offset bytes)
 * to dev_dst (starting at dst_offset bytes); both offsets are cast to size_t.
 * When `blocking` is true the macro waits for the copy event before
 * continuing. The event is then handed to PROFILING, which releases it.
 */
#define clMemcpyDeviceToDeviceOffset(queue, dev_dst, dst_offset, dev_src, src_offset, nbytes, blocking) \
{ \
        cl_event _event; \
        CHECK_CL_STATE( clEnqueueCopyBuffer(queue, dev_src, dev_dst, (size_t)(src_offset), (size_t)(dst_offset), nbytes, 0, NULL, &_event) ) \
        if(blocking) clWaitForEvents(1, &_event); \
        PROFILING(_event); \
}
00108 
/*
 * FINAL_REDUCE(queue, dev_partials, result)
 * Copies REDUCTION_NBLOCKS WORD values from the device buffer dev_partials to
 * the host and stores their sum in `result` (which must be assignable).
 */
#define FINAL_REDUCE(queue, dev_partials, result) \
{ \
        WORD _sum[REDUCTION_NBLOCKS]; \
        clMemcpyDeviceToHost(queue, _sum, dev_partials, REDUCTION_NBLOCKS * SIZEOF_WORD); \
        result = 0; \
        for(int _blk = 0; _blk != REDUCTION_NBLOCKS; ++_blk){ \
                result += _sum[_blk]; \
        } \
}
00115 
/*
 * FINAL_REDUCTIONS(command_queue, cl_src, result, n_reductions)
 * Copies n_reductions * REDUCTION_NBLOCKS WORD values from the device buffer
 * cl_src to the host and stores their sum in `result` (must be assignable).
 * `result` is set to 0 when the element count is not positive.
 *
 * Changes over the previous version:
 *  - n_reductions is parenthesized, so an expression such as `a + b` yields
 *    the intended element count;
 *  - the locals carry a private prefix; the former local `sz` initialized
 *    itself from garbage when the caller's argument was also called `sz`;
 *  - the staging buffer lives on the heap instead of in a variable-length
 *    stack array (a compiler extension in C++).
 */
#define FINAL_REDUCTIONS(command_queue, cl_src, result, n_reductions) { \
        const int _fr_count = (n_reductions) * (REDUCTION_NBLOCKS); \
        result = 0; \
        if(_fr_count > 0){ \
                std::vector<WORD> _fr_sum(_fr_count); \
                clMemcpyDeviceToHost(command_queue, &_fr_sum[0], cl_src, _fr_count * SIZEOF_WORD); \
                for(int _fr_i = 0; _fr_i < _fr_count; _fr_i++){ result += _fr_sum[_fr_i]; } \
        } \
}
00123 
/*
 * FINAL_REDUCE_FFT(command_queue, cl_src, result)
 * Copies REDUCTION_NBLOCKS FFT_TYPE values from the device buffer cl_src to
 * the host and stores the component-wise sum in result.x / result.y.
 *
 * The staging buffer is FFT_TYPE (it is read through .x/.y) and the copy
 * size uses SIZEOF_FFTTYPE. The previous version declared it as WORD, which
 * has no members and is half the size of one element.
 */
#define FINAL_REDUCE_FFT(command_queue, cl_src, result) { \
        FFT_TYPE _sum[REDUCTION_NBLOCKS]; \
        clMemcpyDeviceToHost(command_queue, _sum, cl_src, REDUCTION_NBLOCKS * SIZEOF_FFTTYPE); \
        (result).x = 0; (result).y = 0; \
        for(int _i=0; _i < REDUCTION_NBLOCKS; _i++){ (result).x += _sum[_i].x; (result).y += _sum[_i].y; } \
}
00130 
00131 
/*
 * HALT(condition): when _CHECK_HALT_ is defined, reports the stringified
 * condition together with __FILE__/__LINE__ through ERROR_DETAILS and
 * terminates the process with exit(1) if `condition` is true.
 * Without _CHECK_HALT_ the macro expands to nothing, so `condition` is not
 * evaluated at all.
 */
#if !defined(_CHECK_HALT_)
#define HALT(condition)
#else
#define HALT(condition) \
        if(condition) { \
                ERROR_DETAILS("Halt on condition (" << STRINGIFY(condition) << ")", __FILE__, __LINE__); \
                exit(1); \
        }
#endif
00141 
/*
 * VA_NUM_ARGS(...): expands to the number of arguments it received, for 1 to
 * 20 arguments. The arguments themselves are discarded, never evaluated.
 *
 * How it works: the arguments are followed by the descending list 20..1, and
 * VA_NUM_ARGS_IMPL returns whatever lands in its 21st position.
 * The trailing 0 is a sentinel that guarantees the `...` of the helper always
 * receives at least one argument, which the language requires before C++20.
 *
 * Limitations: an empty invocation yields 1, and more than 20 arguments
 * yield a wrong result.
 */
#define VA_NUM_ARGS_IMPL(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, \
                         a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, N, ...) N
#define VA_NUM_ARGS(...) \
        VA_NUM_ARGS_IMPL(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, \
                         10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
00144 
/*
 * CREATE_BUFFER(context, flags, _size, ptr_src, dest)
 * Creates an OpenCL buffer with clCreateBuffer and assigns it to `dest`.
 * A requested size <= 0 is replaced by 1 byte. The creation is traced, and a
 * NULL handle or an error code other than CL_SUCCESS is reported through
 * ERROR_DETAILS and terminates the process with exit(1).
 *
 * Changes over the previous version:
 *  - the locals carry a private prefix; the former local `size` initialized
 *    itself from garbage when the caller's size argument was also called
 *    `size`;
 *  - the byte count is kept as size_t (the type clCreateBuffer takes)
 *    instead of being truncated to int;
 *  - _size and ptr_src are parenthesized where they are used in expressions.
 * Note: _size is still evaluated twice.
 */
#define CREATE_BUFFER(context, flags, _size, ptr_src, dest) { \
        const size_t _cb_size = ((_size) <= 0) ? (size_t)1 : (size_t)(_size); \
        cl_int _cb_err = CL_SUCCESS; \
        dest = clCreateBuffer(context, flags, _cb_size, ptr_src, &_cb_err); \
        TRACE("Creating buffer " << STRINGIFY(dest) << " with flags=" << STRINGIFY(flags) << ", size=" << _cb_size << (((ptr_src)==NULL)?" and without startup value => ":" and with contents copied from ") << STRINGIFY(ptr_src) << "."); \
        if(dest == NULL || _cb_err != CL_SUCCESS){ \
                ERROR_DETAILS("Error creating buffer for " << STRINGIFY(dest), __FILE__, __LINE__); \
                exit(1); \
        }\
}
00155 
/*
 * CALL_KERNEL2D(queue, krn, globalD1, globalD2, localD1, localD2, argCount, ...)
 * Launches `krn` over a two-dimensional range: packs the global and local
 * work sizes into size_t[2] arrays and forwards everything to _CALL_KERNEL
 * with nDim = 2. The variadic part is the list of {size, pointer} kernel
 * arguments, argCount of them.
 */
#define CALL_KERNEL2D(queue, krn, globalD1, globalD2, localD1, localD2, argCount, ...) \
        { \
                const size_t globalWorkSize2D[2] = {(globalD1), (globalD2)}; \
                const size_t localWorkSize2D[2] = {(localD1), (localD2)}; \
                _CALL_KERNEL(queue, krn, 2, globalWorkSize2D, localWorkSize2D, argCount, __VA_ARGS__) \
        }
00162 
/*
 * CALL_KERNEL(queue, krn, globalSize, localSize, argCount, ...)
 * Launches `krn` over a one-dimensional range: wraps the global and local
 * work sizes into size_t[1] arrays and forwards everything to _CALL_KERNEL
 * with nDim = 1. The variadic part is the list of {size, pointer} kernel
 * arguments, argCount of them.
 */
#define CALL_KERNEL(queue, krn, globalSize, localSize, argCount, ...) \
        { \
                const size_t globalWorkSize1D[1] = {(globalSize)}; \
                const size_t localWorkSize1D[1] = {(localSize)}; \
                _CALL_KERNEL(queue, krn, 1, globalWorkSize1D, localWorkSize1D, argCount, __VA_ARGS__) \
        }
00169 
/*
 * _CALL_KERNEL(queue, krn, nDim, globalSizes, localSizes, argCount, ...)
 * Shared body of CALL_KERNEL / CALL_KERNEL2D. It is NOT wrapped in braces;
 * the callers provide the enclosing block.
 *
 * Steps, all performed while holding the OpenMP lock krn.mutex:
 *  1. build a clParam array from the variadic {size, pointer} initializers;
 *  2. bind each entry with clSetKernelArg(krn.kernel_instance, index, ...);
 *  3. enqueue the kernel with clEnqueueNDRangeKernel;
 *  4. hand the resulting event to PROFILING, which releases it.
 * Any OpenCL error is reported through ERROR and ends the process with
 * exit(1) (the lock is not released on that path).
 */
#define _CALL_KERNEL(queue, krn, nDim, globalSizes, localSizes, argCount, ...) \
        /* Mandatory to submit Only One Kernel at a time to the queue! */ \
        omp_set_lock(krn.mutex); \
        clParam _params[argCount] = {__VA_ARGS__}; \
        cl_int _err; \
        for(int _argIdx = 0; _argIdx != argCount; ++_argIdx){ \
                _err = clSetKernelArg(krn.kernel_instance, _argIdx, _params[_argIdx].sz, _params[_argIdx].ptr); \
                if (_err == CL_SUCCESS) continue; \
                ERROR("Error (" << _err << ") setting parameter [" << (_argIdx+1) << "] for kernel " << STRINGIFY(krn.kernel_instance)); \
                exit(1); \
        } \
        TRACE("Calling the kernel " << STRINGIFY(krn.kernel_instance) << " with globalWorkSize=" << globalSizes << " and localWorkSize=" << localSizes); \
        cl_event _event; \
        _err = clEnqueueNDRangeKernel(queue, krn.kernel_instance, nDim, NULL, globalSizes, localSizes, 0, NULL, &_event); \
        if (_err != CL_SUCCESS){ \
                ERROR("Error (" << _err << ") calling kernel: " << STRINGIFY(krn.kernel_instance)); \
                exit(1); \
        } \
        PROFILING(_event); \
        omp_unset_lock(krn.mutex);
00190 
00191 
/*
 * ADJUST_FFT(command_queue, cl_fft, sz)
 * Runs the kernels->cl_adjust_fft kernel on the device buffer cl_fft with the
 * kernel's second argument ("round") set to 0 and the element count sz as
 * third argument. `kernels` is taken from the caller's scope and cl_fft must
 * be an lvalue (its address is passed to the kernel).
 *
 * Work sizes: for sz < GENPHASE_BLOCKSZ both sizes equal sz; otherwise the
 * local size is GENPHASE_BLOCKSZ and the global size is sz rounded up to the
 * next multiple of it.
 *
 * sz is copied once into a local cl_int, as the *_VIA_GPU macros already do.
 * The previous version took `&sz` directly, which only worked when sz was an
 * int variable, and re-evaluated the unparenthesized argument several times.
 */
#define ADJUST_FFT(command_queue, cl_fft, sz) { \
        cl_int _fft_round = 0; \
        cl_int _fft_sz = (sz); \
        int _fft_global = (_fft_sz < (GENPHASE_BLOCKSZ)) ? _fft_sz : (int)(ceil(_fft_sz / (double)(GENPHASE_BLOCKSZ)) * (GENPHASE_BLOCKSZ)); \
        int _fft_local = (_fft_sz < (GENPHASE_BLOCKSZ)) ? _fft_sz : (GENPHASE_BLOCKSZ); \
        CALL_KERNEL(command_queue, kernels->cl_adjust_fft, _fft_global, _fft_local, 3, \
                {sizeof(cl_mem), (void*)&cl_fft}, \
                {sizeof(cl_int), (void*)&_fft_round}, \
                {sizeof(cl_int), (void*)&_fft_sz} \
        ); \
}
00202 
/*
 * ROUND_FFT(command_queue, cl_fft, sz)
 * Runs the kernels->cl_adjust_fft kernel on the device buffer cl_fft with the
 * kernel's second argument ("round") set to 1 and the element count sz as
 * third argument. `kernels` is taken from the caller's scope and cl_fft must
 * be an lvalue (its address is passed to the kernel).
 *
 * Work sizes: for sz < GENPHASE_BLOCKSZ both sizes equal sz; otherwise the
 * local size is GENPHASE_BLOCKSZ and the global size is sz rounded up to the
 * next multiple of it.
 *
 * sz is copied once into a local cl_int, as the *_VIA_GPU macros already do.
 * The previous version took `&sz` directly, which only worked when sz was an
 * int variable, and re-evaluated the unparenthesized argument several times.
 */
#define ROUND_FFT(command_queue, cl_fft, sz) { \
        cl_int _fft_round = 1; \
        cl_int _fft_sz = (sz); \
        int _fft_global = (_fft_sz < (GENPHASE_BLOCKSZ)) ? _fft_sz : (int)(ceil(_fft_sz / (double)(GENPHASE_BLOCKSZ)) * (GENPHASE_BLOCKSZ)); \
        int _fft_local = (_fft_sz < (GENPHASE_BLOCKSZ)) ? _fft_sz : (GENPHASE_BLOCKSZ); \
        CALL_KERNEL(command_queue, kernels->cl_adjust_fft, _fft_global, _fft_local, 3, \
                {sizeof(cl_mem), (void*)&cl_fft}, \
                {sizeof(cl_int), (void*)&_fft_round}, \
                {sizeof(cl_int), (void*)&_fft_sz} \
        ); \
}
00213 
/*
 * SYNC_QUEUE: blocks until every command enqueued on `command_queue` (a
 * variable that must exist in the caller's scope) has finished, via clFinish.
 * A result other than CL_SUCCESS is reported through ERROR and terminates
 * the process with exit(1).
 *
 * The message is passed to ERROR as a stream expression, like the other
 * ERROR call sites in this header. The previous version built it in an
 * `ostringstream` named `__oss`: that needs <sstream>, which this header
 * does not include, and uses an identifier reserved for the implementation.
 */
#define SYNC_QUEUE { \
        const cl_int _sync_status = clFinish(command_queue); \
        if(_sync_status != CL_SUCCESS){ \
                ERROR("Error: [" << (int)_sync_status << "] synchronizing the kernel!" << std::endl); \
                exit(1); \
        } \
}
00223 
/*
 * COMPLEX2REAL(z): magnitude sqrt(x^2 + y^2) of a complex value stored in the
 * members z.x and z.y. The previous version computed z.y + z.y instead of
 * z.y * z.y. Note that z is evaluated four times.
 */
#define COMPLEX2REAL(z) sqrt((z).x * (z).x + (z).y * (z).y)
00225 
/*
 * REAL2COMPLEX_GPU_VIA_CPU(command_queue, cl_src, cl_dest, sz)
 * Converts sz real WORD values held in the device buffer cl_src into sz
 * FFT_TYPE values (x = real value, y = 0) in the device buffer cl_dest,
 * doing the conversion on the host: read back, convert, upload.
 * Nothing is transferred when sz <= 0.
 *
 * Fixes over the previous version, which could not compile once expanded:
 *  - the conversion loop indexes the complex array (it wrote tmp_z.x);
 *  - the upload uses the real identifiers (it named tmp_Z and cl_DEST) in
 *    the (queue, dest, src, size) order of clMemcpyHostToDevice, i.e. cl_dest
 *    is the destination and the host array the source;
 *  - the staging arrays are heap-allocated instead of being variable-length
 *    stack arrays sized by the caller;
 *  - sz is evaluated once.
 */
#define REAL2COMPLEX_GPU_VIA_CPU(command_queue, cl_src, cl_dest, sz) { \
        const int _r2c_n = (sz); \
        if(_r2c_n > 0){ \
                std::vector<WORD> _r2c_real(_r2c_n); \
                std::vector<FFT_TYPE> _r2c_cplx(_r2c_n); \
                clMemcpyDeviceToHost(command_queue, &_r2c_real[0], cl_src, _r2c_n * SIZEOF_WORD); \
                for(int _r2c_i = 0; _r2c_i < _r2c_n; _r2c_i++){ \
                        _r2c_cplx[_r2c_i].x = _r2c_real[_r2c_i]; \
                        _r2c_cplx[_r2c_i].y = 0; \
                } \
                clMemcpyHostToDevice(command_queue, cl_dest, &_r2c_cplx[0], _r2c_n * SIZEOF_FFTTYPE); \
        } \
}
00233 
/*
 * REAL2COMPLEX_GPU_VIA_GPU(command_queue, cl_src, cl_dest, sz)
 * Runs the kernels->cl_real2complex kernel with the arguments
 * (cl_src, cl_dest, sz). `kernels` is taken from the caller's scope; cl_src
 * and cl_dest must be lvalues (their addresses are passed to the kernel).
 *
 * Work sizes: for sz < MATRIX_OP_BLOCKSZ both sizes equal sz; otherwise the
 * local size is MATRIX_OP_BLOCKSZ and the global size is sz rounded up to
 * the next multiple of it.
 *
 * sz is evaluated exactly once, into _r2c_sz, and that copy is used for the
 * size computations. The previous version re-used the unparenthesized
 * argument, so an expression such as `a + b` produced a wrong global size.
 */
#define REAL2COMPLEX_GPU_VIA_GPU(command_queue, cl_src, cl_dest, sz) { \
        cl_int _r2c_sz = (sz); \
        int _r2c_global = (_r2c_sz < (MATRIX_OP_BLOCKSZ)) ? _r2c_sz : (int)(ceil(_r2c_sz / (double)(MATRIX_OP_BLOCKSZ)) * (MATRIX_OP_BLOCKSZ)); \
        int _r2c_local = (_r2c_sz < (MATRIX_OP_BLOCKSZ)) ? _r2c_sz : (MATRIX_OP_BLOCKSZ); \
        CALL_KERNEL(command_queue, kernels->cl_real2complex, _r2c_global, _r2c_local, 3, \
                {sizeof(cl_mem), (void*)&cl_src}, \
                {sizeof(cl_mem), (void*)&cl_dest}, \
                {sizeof(cl_int), (void*)&_r2c_sz} \
        ); \
}
00244 
/*
 * COMPLEX2REAL_GPU_VIA_GPU(command_queue, cl_src, cl_dest, sz)
 * Runs the kernels->cl_complex2real kernel with the arguments
 * (cl_src, cl_dest, sz). `kernels` is taken from the caller's scope; cl_src
 * and cl_dest must be lvalues (their addresses are passed to the kernel).
 *
 * Work sizes: for sz < MATRIX_OP_BLOCKSZ both sizes equal sz; otherwise the
 * local size is MATRIX_OP_BLOCKSZ and the global size is sz rounded up to
 * the next multiple of it.
 *
 * sz is evaluated exactly once, into _c2r_sz, and that copy is used for the
 * size computations. The previous version re-used the unparenthesized
 * argument, so an expression such as `a + b` produced a wrong global size.
 */
#define COMPLEX2REAL_GPU_VIA_GPU(command_queue, cl_src, cl_dest, sz) { \
        cl_int _c2r_sz = (sz); \
        int _c2r_global = (_c2r_sz < (MATRIX_OP_BLOCKSZ)) ? _c2r_sz : (int)(ceil(_c2r_sz / (double)(MATRIX_OP_BLOCKSZ)) * (MATRIX_OP_BLOCKSZ)); \
        int _c2r_local = (_c2r_sz < (MATRIX_OP_BLOCKSZ)) ? _c2r_sz : (MATRIX_OP_BLOCKSZ); \
        CALL_KERNEL(command_queue, kernels->cl_complex2real, _c2r_global, _c2r_local, 3, \
                {sizeof(cl_mem), (void*)&cl_src}, \
                {sizeof(cl_mem), (void*)&cl_dest}, \
                {sizeof(cl_int), (void*)&_c2r_sz} \
        ); \
}
00255 
00256 #endif /* _UTIL_HPP_ */
00257 
 All Classes Functions