PSF Estimation with CPSO
/*
 * util.hpp
 *
 *  Created on: 27/05/2012
 *      Author: Peter Frank Perroni (pfperroni@inf.ufpr.br)
 */

#include <CL/cl.h>
#include <CL/opencl.h>
#include <CL/cl_platform.h>
#include <CL/cl_ext.h>
#include <vector_types.h>
#include <cmath>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <iostream>
#include <sstream>  // ostringstream, used by SYNC_QUEUE.
#include <omp.h>    // omp_set_lock/omp_unset_lock, used by _CALL_KERNEL.
#include "config.hpp"
#include "debug.hpp"

#ifndef _UTIL_HPP_
#define _UTIL_HPP_

using namespace std;

#define STRINGIFY(text) #text

#define OCL_CODE(...) #__VA_ARGS__

// Automatic adjustment of data types.
#ifdef _DOUBLE_WORD_
#define CL_WORD cl_double
#define WORD double
#define WORD_MAX DBL_MAX
#define FITS_TYPE TDOUBLE
#define FFT_TYPE double2
#else
#define WORD float
#define CL_WORD cl_float
#define WORD_MAX FLT_MAX
#define FITS_TYPE TFLOAT
#define FFT_TYPE float2
#endif

#define SIZEOF_WORD sizeof(WORD)
#define SIZEOF_FFTTYPE sizeof(FFT_TYPE)

// Kernel argument descriptor: size and pointer, as expected by clSetKernelArg.
typedef struct {
    size_t sz;
    void *ptr;
} clParam;

#ifdef _PROFILING_
#define PROFILING(event) { \
    CHECK_CL_STATE( clWaitForEvents(1, &event); ) \
    cl_ulong _start = 0, _end = 0, _latest; \
    CHECK_CL_STATE( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &_start, NULL); ) \
    CHECK_CL_STATE( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &_end, NULL); ) \
    /* Work around an OpenCL bug that swaps the start and end values. */ \
    if(_start > _end){ _latest = _start; _start = _end; _end = _latest; } \
    /* There is also a bug where OpenCL does not always collect one of the profiling values */ \
    /* (the data remains unavailable even after retrying the read on the missing property). */ \
    if(_start > 0 && _end > 0) Profiling::increaseProcessingTime((_end - _start) * 1.0e-6f); \
    clReleaseEvent(event); \
}
#else
#define PROFILING(event) clReleaseEvent(event);
#endif

#define clMemcpyDeviceToHost(command_queue, dest, src, size) { \
    cl_event _event; \
    CHECK_CL_STATE( clEnqueueReadBuffer(command_queue, src, CL_TRUE, 0, size, dest, 0, NULL, &_event) ) \
    PROFILING(_event); \
}

#define clMemcpyDeviceToHostOffset(command_queue, dest, src, src_offset, size) { \
    cl_event _event; \
    CHECK_CL_STATE( clEnqueueReadBuffer(command_queue, src, CL_TRUE, src_offset, size, dest, 0, NULL, &_event) ) \
    PROFILING(_event); \
}

#define clMemcpyHostToDevice(command_queue, dest, src, size) { \
    cl_event _event; \
    CHECK_CL_STATE( clEnqueueWriteBuffer(command_queue, dest, CL_TRUE, 0, size, src, 0, NULL, &_event) ) \
    PROFILING(_event); \
}

#define clMemcpyHostToDeviceOffset(command_queue, dest, dest_offset, src, size) { \
    cl_event _event; \
    CHECK_CL_STATE( clEnqueueWriteBuffer(command_queue, dest, CL_TRUE, dest_offset, size, src, 0, NULL, &_event) ) \
    PROFILING(_event); \
}

#define clMemcpyDeviceToDevice(command_queue, dest, src, size, block) { \
    cl_event _event; \
    CHECK_CL_STATE( clEnqueueCopyBuffer(command_queue, src, dest, 0, 0, size, 0, NULL, &_event) ) \
    if(block) clWaitForEvents(1, &_event); \
    PROFILING(_event); \
}

#define clMemcpyDeviceToDeviceOffset(command_queue, dest, dest_offset, src, src_offset, size, block) { \
    cl_event _event; \
    CHECK_CL_STATE( clEnqueueCopyBuffer(command_queue, src, dest, (size_t)(src_offset), (size_t)(dest_offset), size, 0, NULL, &_event) ) \
    if(block) clWaitForEvents(1, &_event); \
    PROFILING(_event); \
}

// Copy the per-block partial sums back to the host and accumulate them into result.
#define FINAL_REDUCE(command_queue, cl_src, result) { \
    WORD _sum[REDUCTION_NBLOCKS]; \
    clMemcpyDeviceToHost(command_queue, _sum, cl_src, REDUCTION_NBLOCKS * SIZEOF_WORD); \
    result = 0; \
    for(int _i=0; _i < REDUCTION_NBLOCKS; _i++){ result += _sum[_i]; } \
}

#define FINAL_REDUCTIONS(command_queue, cl_src, result, n_reductions) { \
    int sz = n_reductions * REDUCTION_NBLOCKS; \
    WORD _sum[sz]; \
    clMemcpyDeviceToHost(command_queue, _sum, cl_src, sz * SIZEOF_WORD); \
    result = 0; \
    for(int _i=0; _i < sz; _i++){ result += _sum[_i]; } \
}

// Complex variant: the partial sums are FFT_TYPE values, so the host buffer and
// the copy size must use the complex type, not WORD.
#define FINAL_REDUCE_FFT(command_queue, cl_src, result) { \
    FFT_TYPE _sum[REDUCTION_NBLOCKS]; \
    clMemcpyDeviceToHost(command_queue, _sum, cl_src, REDUCTION_NBLOCKS * SIZEOF_FFTTYPE); \
    result.x = 0; result.y = 0; \
    for(int _i=0; _i < REDUCTION_NBLOCKS; _i++){ result.x += _sum[_i].x; result.y += _sum[_i].y; } \
}


#ifdef _CHECK_HALT_
#define HALT(condition) \
    if(condition) { \
        ERROR_DETAILS("Halt on condition (" << STRINGIFY(condition) << ")", __FILE__, __LINE__); \
        exit(1); \
    }
#else
#define HALT(condition)
#endif

#define VA_NUM_ARGS(...) VA_NUM_ARGS_IMPL(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
#define VA_NUM_ARGS_IMPL(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, N, ...) N

#define CREATE_BUFFER(context, flags, _size, ptr_src, dest) { \
    int size = (_size <= 0) ? 1 : _size; \
    cl_int ciErrNum; \
    dest = clCreateBuffer(context, flags, size, ptr_src, &ciErrNum); \
    TRACE("Creating buffer " << STRINGIFY(dest) << " with flags=" << STRINGIFY(flags) << ", size=" << size << ((ptr_src==NULL)?" and without startup value => ":" and with contents copied from ") << STRINGIFY(ptr_src) << "."); \
    if(dest == NULL || ciErrNum != CL_SUCCESS){ \
        ERROR_DETAILS("Error creating buffer for " << STRINGIFY(dest), __FILE__, __LINE__); \
        exit(1); \
    } \
}

#define CALL_KERNEL2D(command_queue, kernel, globalWorkSizeD1, globalWorkSizeD2, localWorkSizeD1, localWorkSizeD2, nParams, ...) \
{ \
    const size_t globalWorkSize2D[2] = {(globalWorkSizeD1), (globalWorkSizeD2)}; \
    const size_t localWorkSize2D[2] = {(localWorkSizeD1), (localWorkSizeD2)}; \
    _CALL_KERNEL(command_queue, kernel, 2, globalWorkSize2D, localWorkSize2D, nParams, __VA_ARGS__) \
}

#define CALL_KERNEL(command_queue, kernel, globalWorkSize, localWorkSize, nParams, ...) \
{ \
    const size_t globalWorkSize1D[1] = {(globalWorkSize)}; \
    const size_t localWorkSize1D[1] = {(localWorkSize)}; \
    _CALL_KERNEL(command_queue, kernel, 1, globalWorkSize1D, localWorkSize1D, nParams, __VA_ARGS__) \
}

#define _CALL_KERNEL(command_queue, kernel, nDim, globalWorkSize, localWorkSize, nParams, ...) \
    omp_set_lock(kernel.mutex); /* Mandatory: submit only one kernel at a time to the queue! */ \
    cl_int _err; \
    clParam _params[nParams] = {__VA_ARGS__}; \
    for(int _i=0; _i < nParams; _i++){ \
        _err = clSetKernelArg(kernel.kernel_instance, _i, _params[_i].sz, _params[_i].ptr); \
        if (_err != CL_SUCCESS){ \
            ERROR("Error (" << _err << ") setting parameter [" << (_i+1) << "] for kernel " << STRINGIFY(kernel.kernel_instance)); \
            exit(1); \
        } \
    } \
    TRACE("Calling the kernel " << STRINGIFY(kernel.kernel_instance) << " with globalWorkSize=" << globalWorkSize << " and localWorkSize=" << localWorkSize); \
    cl_event _event; \
    _err = clEnqueueNDRangeKernel(command_queue, kernel.kernel_instance, nDim, NULL, globalWorkSize, localWorkSize, 0, NULL, &_event); \
    if (_err != CL_SUCCESS){ \
        ERROR("Error (" << _err << ") calling kernel: " << STRINGIFY(kernel.kernel_instance)); \
        exit(1); \
    } \
    PROFILING(_event); \
    omp_unset_lock(kernel.mutex);


#define ADJUST_FFT(command_queue, cl_fft, sz) { \
    int round = 0; \
    int globalWorkSize = (sz < GENPHASE_BLOCKSZ) ? sz : ceil(sz / (double)GENPHASE_BLOCKSZ) * GENPHASE_BLOCKSZ; \
    int localWorkSize = (sz < GENPHASE_BLOCKSZ) ? sz : GENPHASE_BLOCKSZ; \
    CALL_KERNEL(command_queue, kernels->cl_adjust_fft, globalWorkSize, localWorkSize, 3, \
        {sizeof(cl_mem), (void*)&cl_fft}, \
        {sizeof(cl_int), (void*)&round}, \
        {sizeof(cl_int), (void*)&sz} \
    ); \
}

#define ROUND_FFT(command_queue, cl_fft, sz) { \
    int round = 1; \
    int globalWorkSize = (sz < GENPHASE_BLOCKSZ) ? sz : ceil(sz / (double)GENPHASE_BLOCKSZ) * GENPHASE_BLOCKSZ; \
    int localWorkSize = (sz < GENPHASE_BLOCKSZ) ? sz : GENPHASE_BLOCKSZ; \
    CALL_KERNEL(command_queue, kernels->cl_adjust_fft, globalWorkSize, localWorkSize, 3, \
        {sizeof(cl_mem), (void*)&cl_fft}, \
        {sizeof(cl_int), (void*)&round}, \
        {sizeof(cl_int), (void*)&sz} \
    ); \
}

#define SYNC_QUEUE { \
    cl_int _status = CL_SUCCESS; \
    if((_status=clFinish(command_queue)) != CL_SUCCESS){ \
        ostringstream __oss; \
        __oss << "Error: [" << (int)_status << "] synchronizing the kernel!" << std::endl; \
        ERROR(__oss.str().c_str()); \
        exit(1); \
    } \
}

// Magnitude of a complex value (both components must be squared).
#define COMPLEX2REAL(z) sqrt(z.x * z.x + z.y * z.y)

#define REAL2COMPLEX_GPU_VIA_CPU(command_queue, cl_src, cl_dest, sz) { \
    WORD tmp_r[sz]; \
    FFT_TYPE tmp_z[sz]; \
    clMemcpyDeviceToHost(command_queue, tmp_r, cl_src, sz * SIZEOF_WORD); \
    for(int _i=0; _i < sz; _i++){ tmp_z[_i].x = tmp_r[_i]; tmp_z[_i].y = 0; } \
    clMemcpyHostToDevice(command_queue, cl_dest, tmp_z, sz * SIZEOF_FFTTYPE); \
}

#define REAL2COMPLEX_GPU_VIA_GPU(command_queue, cl_src, cl_dest, sz) { \
    int globalWorkSize = (sz < MATRIX_OP_BLOCKSZ) ? sz : ceil(sz / (double)MATRIX_OP_BLOCKSZ) * MATRIX_OP_BLOCKSZ; \
    int localWorkSize = (sz < MATRIX_OP_BLOCKSZ) ? sz : MATRIX_OP_BLOCKSZ; \
    int _sz = sz; \
    CALL_KERNEL(command_queue, kernels->cl_real2complex, globalWorkSize, localWorkSize, 3, \
        {sizeof(cl_mem), (void*)&cl_src}, \
        {sizeof(cl_mem), (void*)&cl_dest}, \
        {sizeof(cl_int), (void*)&_sz} \
    ); \
}

#define COMPLEX2REAL_GPU_VIA_GPU(command_queue, cl_src, cl_dest, sz) { \
    int globalWorkSize = (sz < MATRIX_OP_BLOCKSZ) ? sz : ceil(sz / (double)MATRIX_OP_BLOCKSZ) * MATRIX_OP_BLOCKSZ; \
    int localWorkSize = (sz < MATRIX_OP_BLOCKSZ) ? sz : MATRIX_OP_BLOCKSZ; \
    int _sz = sz; \
    CALL_KERNEL(command_queue, kernels->cl_complex2real, globalWorkSize, localWorkSize, 3, \
        {sizeof(cl_mem), (void*)&cl_src}, \
        {sizeof(cl_mem), (void*)&cl_dest}, \
        {sizeof(cl_int), (void*)&_sz} \
    ); \
}

#endif /* _UTIL_HPP_ */
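For context, the sketch below shows how the buffer, kernel-launch and reduction macros above could be combined on the host side. It is a minimal illustration, not code from the project: the wrapper struct exampleKernel_t (shaped after the kernel_instance and mutex members that _CALL_KERNEL expects), the constant EXAMPLE_BLOCKSZ, the function exampleReduce and the partial-reduction kernel it launches are all assumptions made for this example; only CREATE_BUFFER, CALL_KERNEL, SYNC_QUEUE, FINAL_REDUCE, REDUCTION_NBLOCKS, WORD and SIZEOF_WORD come from util.hpp itself.

#include <omp.h>
#include "util.hpp"

// Assumed shape of the kernel wrapper referenced by _CALL_KERNEL (hypothetical).
typedef struct {
    cl_kernel kernel_instance;
    omp_lock_t *mutex;
} exampleKernel_t;

// Hypothetical work-group size for the example reduction kernel.
#define EXAMPLE_BLOCKSZ 256

// Uploads a real-valued vector, launches an assumed partial-reduction kernel that
// writes REDUCTION_NBLOCKS partial sums, and finishes the accumulation on the host.
WORD exampleReduce(cl_context context, cl_command_queue command_queue,
                   exampleKernel_t reduceKernel, const WORD *host_data, int n) {
    cl_mem cl_data, cl_partial;

    // Device input buffer initialized from the host, plus one slot per reduction block.
    CREATE_BUFFER(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, n * SIZEOF_WORD, (void*)host_data, cl_data);
    CREATE_BUFFER(context, CL_MEM_READ_WRITE, REDUCTION_NBLOCKS * SIZEOF_WORD, NULL, cl_partial);

    // Kernel arguments are passed as {size, pointer} clParam pairs, in argument order.
    CALL_KERNEL(command_queue, reduceKernel, REDUCTION_NBLOCKS * EXAMPLE_BLOCKSZ, EXAMPLE_BLOCKSZ, 3,
        {sizeof(cl_mem), (void*)&cl_data},
        {sizeof(cl_mem), (void*)&cl_partial},
        {sizeof(cl_int), (void*)&n}
    );
    SYNC_QUEUE;  // SYNC_QUEUE expects a variable named command_queue in scope.

    // Copy the REDUCTION_NBLOCKS partial sums back and sum them on the host.
    WORD total;
    FINAL_REDUCE(command_queue, cl_partial, total);

    clReleaseMemObject(cl_data);
    clReleaseMemObject(cl_partial);
    return total;
}

The {size, pointer} pairs exist so that _CALL_KERNEL can apply clSetKernelArg in a loop while holding the per-kernel OpenMP lock, which is what enforces the "only one kernel submission at a time" rule noted in the macro.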