PSF Estimation with CPSO
|
00001 /* 00002 * kippe_aux.cpp 00003 * 00004 * Created on: 27/05/2012 00005 * Author: Peter Frank Perroni (pfperroni@inf.ufpr.br) 00006 */ 00007 00008 #include "cpso.hpp" 00009 00010 using namespace std; 00011 00012 map<cl_context, cl_mem*> CPSO::static_references; 00013 int *CPSO::diffraction_mask; 00014 omp_lock_t CPSO::mutex; 00015 omp_lock_t CPSO::mutex_fft; 00016 omp_lock_t Debug::mutex1; 00017 omp_lock_t Debug::mutex2; 00018 bool CPSO::lock_initialized = false; 00019 bool Debug::lock_initialized = false; 00020 char Debug::buffer[DEFAULT_BUFFER_SIZE]; 00021 int CPSO::seq = 0; 00022 00043 CPSO::CPSO(double* _zernikes, int *_phase_mask, int *_diffraction_mask, int _phase_size, int _image_size, int _n_zernikes, 00044 int _psf_range, WORD _w, WORD c1, WORD c2, WORD _reset_at, int _n_particles, int _n_swarms){ 00045 startup_locks(); 00046 00047 omp_set_lock(&mutex); 00048 UID = seq++; 00049 omp_unset_lock(&mutex); 00050 00051 buffer = new char[DEFAULT_BUFFER_SIZE]; 00052 queue = clFactory::getQueue(); 00053 context = queue->getContext(); 00054 command_queue = queue->getCommandQueue(); 00055 kernels = queue->kernels; 00056 has_startup_coefs = false; 00057 store_static_data(context, _n_zernikes, _phase_size, _image_size, _zernikes, _phase_mask, _diffraction_mask); 00058 allocate_data(_phase_size, _image_size, _n_zernikes, _psf_range, _w, c1, c2, _reset_at, _n_particles, _n_swarms); 00059 } 00060 00064 CPSO::~CPSO(){ 00065 // Release the internal buffer. 00066 delete buffer; 00067 } 00068 00085 void CPSO::store_static_data(cl_context _context, int _n_zernikes, int _phase_size, int _image_size, double* zernikes, 00086 int *phase_mask, int *_diffraction_mask){ 00087 // If the data was already stored into this context, leave this method. 
00088 if(static_references.count(_context) != 0){ 00089 return; 00090 } 00091 00092 int _img_area = _image_size * _image_size; 00093 int _size_fft = _phase_size * _phase_size; 00094 int _z_size = _n_zernikes * _size_fft; 00095 00096 WORD *_zernikes = new WORD[_z_size]; 00097 for(int i = 0; i < _z_size; i++) _zernikes[i] = zernikes[i]; 00098 cl_mem _cl_zernikes, _cl_phase_mask, _cl_diffraction_mask; 00099 CREATE_BUFFER(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _z_size * SIZEOF_WORD, _zernikes, _cl_zernikes); 00100 CREATE_BUFFER(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _size_fft * sizeof(int), phase_mask, _cl_phase_mask); 00101 CREATE_BUFFER(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _img_area * sizeof(int), _diffraction_mask, _cl_diffraction_mask);//img_areah 00102 00103 delete _zernikes; 00104 00105 cl_mem *static_data = new cl_mem[3]; 00106 static_data[0] = _cl_zernikes; 00107 static_data[1] = _cl_phase_mask; 00108 static_data[2] = _cl_diffraction_mask; 00109 static_references[_context] = static_data; 00110 00111 diffraction_mask = new int[_img_area]; 00112 memcpy(diffraction_mask, _diffraction_mask, _img_area * sizeof(int)); 00113 } 00114 00120 void CPSO::clear_static_data(){ 00121 cl_mem *static_data; 00122 map<cl_context, cl_mem*>::iterator iter; 00123 for(iter=static_references.begin(); iter != static_references.end(); iter++){ 00124 static_data = iter->second; 00125 clReleaseMemObject(static_data[0]); 00126 clReleaseMemObject(static_data[1]); 00127 clReleaseMemObject(static_data[2]); 00128 delete static_data; 00129 } 00130 destroy_locks(); 00131 delete diffraction_mask; 00132 } 00133 00137 void CPSO::startup_locks(){ 00138 if(!lock_initialized){ 00139 omp_init_lock(&mutex); 00140 omp_init_lock(&mutex_fft); 00141 Debug::startup_locks(); 00142 lock_initialized = true; 00143 } 00144 } 00145 00149 void CPSO::destroy_locks(){ 00150 if(lock_initialized){ 00151 omp_destroy_lock(&mutex); 00152 omp_destroy_lock(&mutex_fft); 00153 
Debug::destroy_locks(); 00154 lock_initialized = false; 00155 } 00156 } 00157 00172 void CPSO::allocate_data(int _phase_size, int image_size, int _n_zernikes, int _psf_range, 00173 WORD _w, WORD c1, WORD c2, WORD _reset_at, int _n_particles, int _n_swarms) { 00174 00175 // Set the simple Host variables. 00176 threads = DEFAULT_BLOCKSZ; 00177 psf_range = _psf_range; 00178 img_size = image_size; 00179 phase_size = _phase_size; 00180 n_zernikes = _n_zernikes; 00181 img_sizeh = (img_size / 2) + 1; 00182 img_area = img_size * img_size; 00183 img_areah = img_size * img_sizeh; 00184 size_fft = phase_size * phase_size; 00185 z_size = n_zernikes * size_fft; 00186 00187 n_particles = _n_particles; 00188 n_swarms = _n_swarms; 00189 int swarm_sizes[n_swarms]; 00190 float swarm_dim = n_zernikes / (float)n_swarms; 00191 for(int i=0; i < n_swarms; i++){ 00192 swarm_sizes[i] = (i + 1) * swarm_dim - 1; 00193 } 00194 reset_at = _reset_at; 00195 w = _w; 00196 00197 startup_coefs = new WORD[n_zernikes]; 00198 int n_psfs = n_particles * n_swarms; 00199 00200 // Allocate memory on device and set its contents. 00201 // For the PSF. 00202 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * size_fft * SIZEOF_FFTTYPE, NULL, cl_pupil); 00203 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * size_fft * SIZEOF_WORD, NULL, cl_phase);// Only necessary for validation purpose. 00204 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * img_area * SIZEOF_FFTTYPE, NULL, cl_fft_psfe); 00205 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * img_area * SIZEOF_FFTTYPE, NULL, cl_fft_conobj); 00206 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * REDUCTION_NBLOCKS * SIZEOF_WORD, NULL, cl_sum); 00207 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * img_area * SIZEOF_WORD, NULL, cl_cost); 00208 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_WORD, NULL, cl_mismatch); 00209 00210 // For the CPSO. 
00211 CREATE_BUFFER(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, n_swarms * sizeof(int), swarm_sizes, cl_swarm_dim); 00212 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * n_zernikes * SIZEOF_WORD, NULL, cl_coefs); 00213 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_particles * n_zernikes * SIZEOF_WORD, NULL, cl_speed); 00214 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * n_zernikes * SIZEOF_WORD, NULL, cl_pbest); 00215 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_zernikes * SIZEOF_WORD, NULL, cl_gbest); 00216 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_pbest_value); 00217 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_swarms * SIZEOF_WORD, NULL, cl_gbest_value); 00218 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * sizeof(int), NULL, cl_reset_search); 00219 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * sizeof(uint4), NULL, cl_rand_ctx); 00220 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_w); 00221 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_c1); 00222 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_c2); 00223 00224 // Remaining objects. 
00225 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_WORD, NULL, cl_best_conobj); 00226 CREATE_BUFFER(context, CL_MEM_READ_WRITE, size_fft * SIZEOF_WORD, NULL, cl_best_phase); 00227 CREATE_BUFFER(context, CL_MEM_READ_WRITE, size_fft * SIZEOF_WORD, NULL, cl_best_psf); 00228 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_WORD, NULL, cl_best_psfe); 00229 CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_zernikes * SIZEOF_WORD, NULL, cl_best_coefs); 00230 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_best_fft_psfe); 00231 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_best_fft_conobj); 00232 CREATE_BUFFER(context, CL_MEM_READ_ONLY, img_area * SIZEOF_FFTTYPE, NULL, cl_object); 00233 CREATE_BUFFER(context, CL_MEM_READ_ONLY, img_area * SIZEOF_FFTTYPE, NULL, cl_image); 00234 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_fft_object); 00235 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_fft_image); 00236 CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_fft_original_psf); 00237 CREATE_BUFFER(context, CL_MEM_READ_WRITE, z_size * SIZEOF_FFTTYPE, NULL, cl_debug_info); 00238 00239 // Get the references for the static values. 00240 cl_mem *static_data = static_references[context]; 00241 cl_zernikes = static_data[0]; 00242 cl_phase_mask = static_data[1]; 00243 cl_diffraction_mask = static_data[2]; 00244 00245 DEBUG(int, command_queue, "phase-mask", cl_phase_mask, phase_size, phase_size); 00246 DEBUG(int, command_queue, "difraction-mask", cl_diffraction_mask, img_size, img_size);//img_size, img_sizeh 00247 00248 // Initialize the FFT contexts. 
00249 vn_object = new viennacl::vector<WORD>(cl_object, img_area*2); 00250 vn_image = new viennacl::vector<WORD>(cl_image, img_area*2); 00251 vn_fft_object = new viennacl::vector<WORD>(cl_fft_object, img_area*2); 00252 vn_fft_image = new viennacl::vector<WORD>(cl_fft_image, img_area*2); 00253 vn_fft_original_psf = new viennacl::vector<WORD>(cl_fft_original_psf, img_area*2); 00254 vn_fft_conobj = new viennacl::vector<WORD>(cl_fft_conobj, img_area*2); 00255 00256 // Each particle will have its own PSF instance. 00257 psf = new PSF*[n_psfs]; 00258 for(int i=0; i < n_psfs; i++){ 00259 psf[i] = new PSF(context, command_queue, img_size, phase_size, n_zernikes); 00260 // Set the PSF references into the major matrixes. 00261 psf[i]->setCoefsPosition(cl_coefs, i * n_zernikes); 00262 psf[i]->setPupilPosition(cl_pupil, i * size_fft); 00263 } 00264 original_psf_fft = new FFT_TYPE[img_area]; 00265 00266 // Set CPSO search parameters. 00267 setW(w); 00268 setC1(c1); 00269 setC2(c2); 00270 00271 SYNC_QUEUE 00272 } 00273 00282 void CPSO::set_images(double *object, double *image) { 00283 has_psf_original = false; 00284 00285 // Create an array with values zero to clear the device memory. 00286 FFT_TYPE *zero = (FFT_TYPE*)buffer; 00287 memset(zero, 0, z_size * SIZEOF_FFTTYPE); 00288 00289 int i; 00290 WORD worst_value[n_swarms]; 00291 // Worst possible value is the maximum possible value (minimization problem). 00292 for(i=0; i < n_swarms; i++) worst_value[i] = WORD_MAX; 00293 00294 int n_coefs = n_particles * n_zernikes; 00295 WORD coefs[n_coefs * n_swarms]; 00296 if(has_startup_coefs){ 00297 // NOTE: The PSO results CANNOT be directly compared to the Simulated Annealing results! 00298 // This happens because the SA calculates Only 1 Zernike at a time and has many controls to accept 00299 // the coefficient or not, what makes it very difficult to force it to accept manual coefs. 
00300 // At same time, the PSO pupil is totally recalculated every time, whereas the SA one has 00301 // the intermediate values stored, what cause slight differences that will be cascade all 00302 // over the computation, resulting in a near but Not exact costs (PSO cost x SA cost). 00303 // Initialize 1 particle with the static startup coefficients provided. 00304 memcpy(coefs, startup_coefs, n_zernikes * SIZEOF_WORD); 00305 // Replicate these same coefs to all particles into the first swarm. 00306 for(i=1; i < n_particles; i++) memcpy(&coefs[i * n_zernikes], coefs, n_zernikes * SIZEOF_WORD); 00307 } 00308 else{ 00309 //generateRandomCoefs(coefs, n_coefs, psf_range); 00310 generateNormalDistrRandomCoefs(coefs, n_coefs, psf_range); 00311 // Generate random coefficients for every dimension in every particle into the first swarm. 00312 //for(i=0; i < n_coefs; i++) coefs[i] = getRandCoef(psf_range); 00313 } 00314 has_startup_coefs = false; 00315 00316 // Replicate the particle coefficients to all remaining swarms. 00317 for(i=1; i < n_swarms; i++) memcpy(&coefs[i * n_coefs], coefs, n_coefs * SIZEOF_WORD); 00318 00319 // Generate random speeds for every dimension in every particle. 00320 // Speeds should not be replicated for every swarm, since it will be used Only to update 00321 // the next position for the own swarm's dimension interval, thus the remaining intervals 00322 // would be useless. 00323 WORD speed[n_coefs]; 00324 // Max.speed is half search space. 00325 for(i=0; i < n_coefs; i++) speed[i] = getRandCoef(psf_range / 2); 00326 00327 // Create a random seed for every particle. 00328 int n_psfs = n_particles * n_swarms; 00329 uint4 rand_ctx[n_psfs]; 00330 for(i=0; i < n_psfs; i++) {rand_ctx[i].x = rand(); rand_ctx[i].y = rand(); rand_ctx[i].z = rand(); rand_ctx[i].w = rand(); } 00331 00332 //Regardless of the FITS data type (double or float), convert it to the type currently in use. 
00333 FFT_TYPE *_image = new FFT_TYPE[img_area]; 00334 for(i = 0; i < img_area; i++) {_image[i].x = image[i]; _image[i].y = 0;} 00335 FFT_TYPE *_object = new FFT_TYPE[img_area]; 00336 for(i = 0; i < img_area; i++) {_object[i].x = object[i]; _object[i].y = 0;} 00337 00338 // Copy the values to the device. 00339 // For the CPSO. 00340 clMemcpyHostToDevice(command_queue, cl_coefs, coefs, n_coefs * n_swarms * SIZEOF_WORD); 00341 clMemcpyHostToDevice(command_queue, cl_speed, speed, n_coefs * SIZEOF_WORD); 00342 // Set the personal best position as the initial position. 00343 clMemcpyHostToDevice(command_queue, cl_pbest, coefs, n_coefs * n_swarms * SIZEOF_WORD); 00344 // Set the global best position as zero, cause it must be calculated before used (thus there's no initial value). 00345 clMemcpyHostToDevice(command_queue, cl_gbest, zero, n_zernikes * SIZEOF_WORD); 00346 // Set the global best value as the worst possible value. 00347 clMemcpyHostToDevice(command_queue, cl_gbest_value, worst_value, n_swarms * SIZEOF_WORD); 00348 clMemcpyHostToDevice(command_queue, cl_reset_search, zero, n_psfs * sizeof(int)); 00349 clMemcpyHostToDevice(command_queue, cl_rand_ctx, rand_ctx, n_psfs * sizeof(uint4)); 00350 00351 // For the remaining objects. 00352 clMemcpyHostToDevice(command_queue, cl_debug_info, zero, z_size * SIZEOF_FFTTYPE); 00353 clMemcpyHostToDevice(command_queue, cl_object, _object, img_area * SIZEOF_FFTTYPE); 00354 clMemcpyHostToDevice(command_queue, cl_image, _image, img_area * SIZEOF_FFTTYPE); 00355 clMemcpyHostToDevice(command_queue, cl_fft_image, zero, img_area * SIZEOF_FFTTYPE); 00356 00357 delete _image; 00358 delete _object; 00359 00360 // Reset the number of evaluations. 00361 n_psf_evals = 0; 00362 max_evals = -1; 00363 convergence_stable_cycle = -1; 00364 00365 SYNC_QUEUE 00366 00367 // Calculate the Object's and the Image's FFTs. 
00368 omp_set_lock(&mutex_fft); 00369 viennacl::fft(*vn_object, *vn_fft_object); 00370 viennacl::fft(*vn_image, *vn_fft_image); 00371 omp_unset_lock(&mutex_fft); 00372 00373 DEBUG_2D(WORD, command_queue, "coefs", cl_coefs, n_particles, n_zernikes); 00374 DEBUG_COMPLEX(command_queue, "object", cl_object, img_size, img_size); 00375 DEBUG_COMPLEX(command_queue, "fft-object", cl_fft_object, img_size, img_size); 00376 DEBUG_COMPLEX(command_queue, "image", cl_image, img_size, img_size); 00377 DEBUG_COMPLEX(command_queue, "fft-image", cl_fft_image, img_size, img_size); 00378 } 00379 00385 void CPSO::startup(TimeTracker **trackers) { 00386 // Run the PSF calculation once for every particle, 00387 // obtaining a starting point for the search technique. 00388 runPsf(trackers, n_particles); 00389 00390 TRACK(if(trackers!=NULL)trackers[1]->resume()) 00391 int i, pos; 00392 WORD cost[n_particles * n_swarms]; 00393 WORD partial[n_particles * REDUCTION_NBLOCKS]; 00394 00395 // Finalize the reduction. 00396 clMemcpyDeviceToHost(command_queue, partial, cl_sum, n_particles * REDUCTION_NBLOCKS * SIZEOF_WORD); 00397 00398 // The CPSO startup calculation is composed of 'n_particles' PSFs, 00399 // whose results are replicated for all remaining swarms. 00400 for(i=0, pos=0; i < n_particles; i++){ 00401 cost[i] = 0; 00402 // Finalizes the reduction for every particle calculated. 00403 for(int j=0; j < REDUCTION_NBLOCKS; j++){ 00404 cost[i] += partial[pos++]; 00405 } 00406 } 00407 00408 // Repeat the cost for every swarm. 00409 for(i=1; i < n_swarms; i++) memcpy(&cost[i * n_particles], cost, n_particles * SIZEOF_WORD); 00410 clMemcpyHostToDevice(command_queue, cl_pbest_value, cost, n_swarms * n_particles * SIZEOF_WORD); 00411 00412 // Repeat the partial costs for every swarm, since it's required by the CPSO kernel. 
00413 for(i=0; i < n_swarms; i++){ 00414 clMemcpyHostToDeviceOffset(command_queue, cl_sum, i * n_particles * REDUCTION_NBLOCKS * SIZEOF_WORD, partial, n_particles * REDUCTION_NBLOCKS * SIZEOF_WORD); 00415 } 00416 TRACK(if(trackers!=NULL)trackers[1]->pause()) 00417 } 00418 00422 void CPSO::finalize_cl() { 00423 // Release OpenCl instances. 00424 clReleaseMemObject(cl_coefs); 00425 clReleaseMemObject(cl_pupil); 00426 clReleaseMemObject(cl_phase); 00427 clReleaseMemObject(cl_cost); 00428 clReleaseMemObject(cl_mismatch); 00429 clReleaseMemObject(cl_sum); 00430 clReleaseMemObject(cl_fft_conobj); 00431 clReleaseMemObject(cl_fft_psfe); 00432 00433 clReleaseMemObject(cl_swarm_dim); 00434 clReleaseMemObject(cl_speed); 00435 clReleaseMemObject(cl_pbest); 00436 clReleaseMemObject(cl_gbest); 00437 clReleaseMemObject(cl_pbest_value); 00438 clReleaseMemObject(cl_gbest_value); 00439 clReleaseMemObject(cl_reset_search); 00440 clReleaseMemObject(cl_rand_ctx); 00441 clReleaseMemObject(cl_w); 00442 clReleaseMemObject(cl_c1); 00443 clReleaseMemObject(cl_c2); 00444 00445 clReleaseMemObject(cl_object); 00446 clReleaseMemObject(cl_image); 00447 clReleaseMemObject(cl_debug_info); 00448 clReleaseMemObject(cl_fft_object); 00449 clReleaseMemObject(cl_fft_image); 00450 clReleaseMemObject(cl_best_phase); 00451 clReleaseMemObject(cl_best_psf); 00452 clReleaseMemObject(cl_best_psfe); 00453 clReleaseMemObject(cl_best_fft_psfe); 00454 clReleaseMemObject(cl_best_conobj); 00455 clReleaseMemObject(cl_fft_original_psf); 00456 clReleaseMemObject(cl_best_fft_conobj); 00457 clReleaseMemObject(cl_best_coefs); 00458 00459 // Dispose the command queue. 00460 clFactory::disposeQueue(queue); 00461 00462 // Delete the Host pointers. 00463 int n_psfs = n_particles * n_swarms; 00464 for(int i=0; i < n_psfs; i++){ 00465 delete psf[i]; 00466 } 00467 delete[] psf; 00468 00469 delete startup_coefs; 00470 delete original_psf_fft; 00471 00472 // Release the FFT contexts. 
00473 delete vn_object; 00474 delete vn_image; 00475 delete vn_fft_object; 00476 delete vn_fft_image; 00477 delete vn_fft_original_psf; 00478 delete vn_fft_conobj; 00479 } 00480 00489 void CPSO::run(TimeTracker **trackers, int n_cycles){ 00490 if(n_cycles <= 0){ 00491 return; 00492 } 00493 00494 // Initializes the CPSO calculation. 00495 startup(trackers); 00496 00497 // Begins the calculations. 00498 int j; 00499 ostringstream oss; 00500 WORD bkp_coefs[n_zernikes]; 00501 int n_psfs = n_swarms * n_particles; 00502 uint4 rand_ctx[n_psfs]; 00503 double all_costs[n_cycles], all_psf_diff[n_cycles]; 00504 int tmp, i = 0; 00505 do{ 00506 do{ 00507 if(i > 0){ 00508 // Calculates the current PSF. 00509 runPsf(trackers); 00510 } 00511 // Runs the CPSO to obtain the new coefficients. 00512 runCPSO(trackers, n_cycles); 00513 00514 TRACK(if(trackers!=NULL)trackers[1]->resume()) 00515 00516 all_costs[i] = getMinCost(); 00517 // Repeat the PSF difference is the cost has not reduced, because it's still the same PSF calculated. 00518 all_psf_diff[i] = (i==0 || all_costs[i]<all_costs[i-1]) ? calcPsfDifferences() : all_psf_diff[i-1]; 00519 00520 oss << "PSO#" << UID << " Cycle#" << i << " -> (cost " << all_costs[i] << ", psf_diff " << all_psf_diff[i] << ", " << getNPsfEvals() << " evals) ["; 00521 clMemcpyDeviceToHostOffset(command_queue, bkp_coefs, cl_pbest, getBestPsfPos() * n_zernikes * SIZEOF_WORD, n_zernikes * SIZEOF_WORD); 00522 for(j=0; j < n_zernikes; j++) oss << bkp_coefs[j] << ", "; 00523 oss << "]"; 00524 Debug::debug(oss.str().c_str()); 00525 oss.str(""); 00526 00527 // Refresh the particle's seeds to avoid the problem of 00528 // exhausting the random numbers for the seed, keeping a good randomization. 
00529 for(j=0; j < n_psfs; j++) {rand_ctx[j].x = rand(); rand_ctx[j].y = rand(); rand_ctx[j].z = rand(); rand_ctx[j].w = rand(); } 00530 clMemcpyHostToDevice(command_queue, cl_rand_ctx, rand_ctx, n_psfs * sizeof(uint4)); 00531 00532 i++; 00533 SYNC_QUEUE; 00534 00535 TRACK(if(trackers!=NULL)trackers[1]->pause()) 00536 }while(i % n_cycles != 0); 00537 00538 TRACK(if(trackers!=NULL)trackers[1]->resume()) 00539 00540 // Make the backup for the initial coefficients. 00541 clMemcpyDeviceToHost(command_queue, bkp_coefs, cl_coefs, n_zernikes * SIZEOF_WORD); 00542 00543 // Calculate the best search result. 00544 //---------------------------------- 00545 clMemcpyDeviceToDeviceOffset(command_queue, cl_coefs, 0, cl_pbest, getBestPsfPos() * n_zernikes * SIZEOF_WORD, n_zernikes * SIZEOF_WORD, true); 00546 tmp = n_psf_evals; 00547 00548 TRACK(if(trackers!=NULL)trackers[1]->pause()) 00549 00550 runPsf(trackers, 1); 00551 00552 TRACK(if(trackers!=NULL)trackers[1]->resume()) 00553 n_psf_evals = tmp; 00554 saveFirstResult(); 00555 TRACK(if(trackers!=NULL)trackers[1]->pause()) 00556 00557 TRACK(if(trackers!=NULL)trackers[5]->resume()) 00558 WORD _gbest_cost; 00559 DEBUG(WORD, command_queue, "sum-cost-gbest", cl_sum, 1, REDUCTION_NBLOCKS); 00560 FINAL_REDUCE(command_queue, cl_sum, _gbest_cost); 00561 CHECKSUM(WORD, command_queue, cl_cost, img_area, _gbest_cost);//img_areah 00562 gbest_cost = _gbest_cost; 00563 TRACK(if(trackers!=NULL)trackers[5]->pause()) 00564 //---------------------------------- 00565 00566 TRACK(if(trackers!=NULL)trackers[1]->resume()) 00567 oss << "Final GBest Cost: = " << gbest_cost; 00568 Debug::debug(oss.str().c_str()); 00569 oss.str(""); 00570 00571 // Finds the cycle where the convergence occurred. 
00572 double mean = calc_mean(all_costs, n_cycles); 00573 double stddev = calc_stddev(all_costs, n_cycles); 00574 for(j=0; j < n_cycles; j++){ 00575 if(all_costs[j] <= (mean + stddev)){ 00576 setStableCycle(j + 1); 00577 break; 00578 } 00579 } 00580 00581 // Restore the backup of the initial coefficients. 00582 clMemcpyHostToDevice(command_queue, cl_coefs, bkp_coefs, n_zernikes * SIZEOF_WORD); 00583 00584 // Restore the w to be able to run the CPSO again. 00585 setW(w); 00586 TRACK(if(trackers!=NULL)trackers[1]->pause()) 00587 00588 }while(getNPsfEvals() < max_evals); 00589 } 00590 00597 void CPSO::runPsf(TimeTracker **trackers){ 00598 runPsf(trackers, n_particles * n_swarms); 00599 } 00600 00607 void CPSO::runPsf(TimeTracker **trackers, int n_psfs){ 00608 generatePhase(trackers, n_psfs); 00609 makePsf(trackers, n_psfs); 00610 convolveObj(trackers, n_psfs); 00611 calcCost(trackers, n_psfs); 00612 n_psf_evals += n_psfs; 00613 } 00614 00621 void CPSO::runCPSO(TimeTracker **tracker, int n_cycles){ 00622 int max_threads = n_particles * n_swarms; 00623 00624 TRACK(if(tracker!=NULL)tracker[1]->resume()) 00625 CALL_KERNEL(command_queue, kernels->cl_cpso, max_threads, n_particles, 20, 00626 {sizeof(cl_mem), (void*)&cl_coefs}, 00627 {sizeof(cl_mem), (void*)&cl_speed}, 00628 {sizeof(cl_mem), (void*)&cl_pbest}, 00629 {sizeof(cl_mem), (void*)&cl_gbest}, 00630 {n_particles * SIZEOF_WORD, NULL}, 00631 {sizeof(cl_mem), (void*)&cl_pbest_value}, 00632 {sizeof(cl_mem), (void*)&cl_gbest_value}, 00633 {sizeof(cl_mem), (void*)&cl_reset_search}, 00634 {sizeof(cl_mem), (void*)&cl_swarm_dim}, 00635 {sizeof(cl_mem), (void*)&cl_sum}, 00636 {sizeof(cl_mem), (void*)&cl_rand_ctx}, 00637 {sizeof(cl_mem), (void*)&cl_w}, 00638 {sizeof(CL_WORD), (void*)&w}, 00639 {sizeof(cl_mem), (void*)&cl_c1}, 00640 {sizeof(cl_mem), (void*)&cl_c2}, 00641 {sizeof(cl_int), (void*)&n_particles}, 00642 {sizeof(cl_int), (void*)&n_swarms}, 00643 {sizeof(CL_WORD), (void*)&psf_range}, 00644 {sizeof(CL_WORD), 
(void*)&reset_at}, 00645 {sizeof(cl_int), (void*)&n_cycles} 00646 ); 00647 00648 SYNC_QUEUE 00649 00650 TRACK(if(tracker!=NULL)tracker[1]->pause()) 00651 } 00652 00659 void CPSO::generatePhase(TimeTracker **tracker, int n_psfs) { 00660 int max_threads = n_psfs * phase_size; 00661 00662 TRACK(if(tracker!=NULL)tracker[2]->resume()) 00663 CALL_KERNEL(command_queue, kernels->cl_generate_phase, max_threads, phase_size, 9, 00664 {sizeof(cl_mem), (void*)&cl_zernikes}, 00665 {sizeof(cl_mem), (void*)&cl_coefs}, 00666 {sizeof(cl_mem), (void*)&cl_phase_mask}, 00667 {sizeof(cl_mem), (void*)&cl_pupil}, 00668 {sizeof(cl_mem), (void*)&cl_phase}, // This parameter, Only for validation purposes. 00669 {SIZEOF_WORD*n_zernikes, NULL}, 00670 {sizeof(cl_int), (void*)&phase_size}, 00671 {sizeof(cl_int), (void*)&n_zernikes}, 00672 {sizeof(cl_int), (void*)&max_threads} 00673 ); 00674 SYNC_QUEUE 00675 DEBUG_COMPLEX3D(command_queue, "pupil", cl_pupil, phase_size, phase_size, n_psfs); 00676 00677 // Move the pupil information into every separate PSF instance. 00678 for(int i=0; i < n_psfs; i++){ 00679 psf[i]->refreshPupil(false); 00680 } 00681 SYNC_QUEUE 00682 00683 TRACK(if(tracker!=NULL)tracker[2]->pause()) 00684 } 00685 00692 void CPSO::makePsf(TimeTracker **tracker, int n_psfs) { 00693 int blocks_fft = ceil(size_fft / (float) MATRIX_OP_BLOCKSZ); 00694 int blocks_img = ceil(img_area / (float) MATRIX_OP_BLOCKSZ); 00695 int i; 00696 00697 TRACK(if(tracker!=NULL)tracker[3]->resume()); 00698 // Calculate the FFT of the pupils. 00699 for(i=0; i < n_psfs; i++){ 00700 omp_set_lock(&mutex_fft); 00701 psf[i]->pupilToFocusFft(); 00702 omp_unset_lock(&mutex_fft); 00703 } 00704 00705 for(i=0; i < n_psfs; i++){ 00706 DEBUG_COMPLEX(command_queue, "focus", psf[i]->cl_focus, phase_size, phase_size); 00707 } 00708 TRACK(if(tracker!=NULL)tracker[3]->pause()) 00709 00710 TRACK(if(tracker!=NULL)tracker[4]->resume()) 00711 // Get the actual PSF values. 
00712 for(i=0; i < n_psfs; i++){ 00713 CALL_KERNEL(command_queue, kernels->cl_power_spec, blocks_fft*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 3, 00714 {sizeof(cl_mem), (void*)&psf[i]->cl_focus}, 00715 {sizeof(cl_mem), (void*)&psf[i]->cl_psf}, 00716 {sizeof(cl_int), (void*)&size_fft} 00717 ); 00718 } 00719 SYNC_QUEUE 00720 for(i=0; i < n_psfs; i++){ 00721 DEBUG(WORD, command_queue, "psf", psf[i]->cl_psf, phase_size, phase_size); 00722 } 00723 TRACK(if(tracker!=NULL)tracker[4]->pause()) 00724 00725 TRACK(if(tracker!=NULL)tracker[5]->resume()) 00726 // Sum PSF values. 00727 for(i=0; i < n_psfs; i++){ 00728 CALL_KERNEL(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, REDUCTION_BLOCKSZ, 4, 00729 {sizeof(cl_mem), (void*)&psf[i]->cl_psf}, 00730 {sizeof(cl_mem), (void*)&psf[i]->cl_sum}, 00731 {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL}, 00732 {sizeof(cl_int), (void*)&size_fft} 00733 ); 00734 } 00735 SYNC_QUEUE 00736 WORD scale[n_psfs]; 00737 for(i=0; i < n_psfs; i++){ 00738 DEBUG(WORD, command_queue, "sum-psf", psf[i]->cl_sum, 1, REDUCTION_NBLOCKS); 00739 FINAL_REDUCE(command_queue, psf[i]->cl_sum, scale[i]); 00740 CHECKSUM(WORD, command_queue, psf[i]->cl_psf, size_fft, scale[i]); 00741 scale[i] = 1 / (WORD)scale[i]; 00742 } 00743 TRACK(if(tracker!=NULL)tracker[5]->pause()) 00744 00745 TRACK(if(tracker!=NULL)tracker[4]->resume()) 00746 // Scale the pupils with the PSFs sums. 
00747 for(i=0; i < n_psfs; i++){ 00748 CALL_KERNEL(command_queue, kernels->cl_multiply_complexarr, blocks_fft*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 4, 00749 {sizeof(cl_mem), (void*)&psf[i]->cl_pupil}, 00750 {sizeof(cl_mem), (void*)&psf[i]->cl_pupil}, 00751 {sizeof(cl_float), (void*)&scale[i]}, 00752 {sizeof(cl_int), (void*)&size_fft} 00753 ); 00754 } 00755 SYNC_QUEUE 00756 for(i=0; i < n_psfs; i++){ 00757 DEBUG_COMPLEX(command_queue, "norm-pupil", psf[i]->cl_pupil, phase_size, phase_size); 00758 } 00759 TRACK(if(tracker!=NULL)tracker[4]->pause()) 00760 00761 TRACK(if(tracker!=NULL)tracker[3]->resume()) 00762 // Calculate the FFT of the scaled pupils. 00763 for(i=0; i < n_psfs; i++){ 00764 omp_set_lock(&mutex_fft); 00765 psf[i]->pupilToFocusFft(); 00766 omp_unset_lock(&mutex_fft); 00767 } 00768 for(i=0; i < n_psfs; i++){ 00769 DEBUG_COMPLEX(command_queue, "norm-focus", psf[i]->cl_focus, phase_size, phase_size); 00770 } 00771 TRACK(if(tracker!=NULL)tracker[3]->pause()) 00772 00773 TRACK(if(tracker!=NULL)tracker[4]->resume()) 00774 // Get the actual PSF scaled values. 00775 for(i=0; i < n_psfs; i++){ 00776 CALL_KERNEL(command_queue, kernels->cl_power_spec, blocks_fft*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 3, 00777 {sizeof(cl_mem), (void*)&psf[i]->cl_focus}, 00778 {sizeof(cl_mem), (void*)&psf[i]->cl_psf}, 00779 {sizeof(cl_int), (void*)&size_fft} 00780 ); 00781 } 00782 SYNC_QUEUE 00783 for(i=0; i < n_psfs; i++){ 00784 DEBUG(WORD, command_queue, "norm-psf", psf[i]->cl_psf, phase_size, phase_size); 00785 } 00786 00787 // Extract the PSFs adjusted to the image width. 00788 for(i=0; i < n_psfs; i++){ 00789 if(img_size < phase_size){ 00790 // Extract the values according to the PSF_EXTRACT constant. 
00791 CALL_KERNEL(command_queue, kernels->cl_resize_psf, blocks_img*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 5, 00792 {sizeof(cl_mem), (void*)&psf[i]->cl_psf}, 00793 {sizeof(cl_mem), (void*)&psf[i]->cl_psfe}, 00794 {sizeof(cl_int), (void*)&phase_size}, 00795 {sizeof(cl_int), (void*)&img_size}, 00796 {sizeof(cl_int), (void*)&img_area} 00797 ); 00798 } 00799 else{ 00800 // Just copy. 00801 clMemcpyDeviceToDevice(command_queue, psf[i]->cl_psfe, psf[i]->cl_psf, img_area * SIZEOF_WORD, false); 00802 } 00803 } 00804 SYNC_QUEUE 00805 for(i=0; i < n_psfs; i++){ 00806 DEBUG(WORD, command_queue, "psfe", psf[i]->cl_psfe, img_size, img_size); 00807 } 00808 TRACK(if(tracker!=NULL)tracker[4]->pause()) 00809 00810 TRACK(if(tracker!=NULL)tracker[5]->resume()) 00811 // Sum the extracted PSF values. 00812 for(i=0; i < n_psfs; i++){ 00813 CALL_KERNEL(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, REDUCTION_BLOCKSZ, 4, 00814 {sizeof(cl_mem), (void*)&psf[i]->cl_psfe}, 00815 {sizeof(cl_mem), (void*)&psf[i]->cl_sum}, 00816 {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL}, 00817 {sizeof(cl_int), (void*)&img_area} 00818 ); 00819 } 00820 SYNC_QUEUE 00821 for(i=0; i < n_psfs; i++){ 00822 DEBUG(WORD, command_queue, "sum-psfe", psf[i]->cl_sum, 1, REDUCTION_NBLOCKS); 00823 FINAL_REDUCE(command_queue, psf[i]->cl_sum, scale[i]); 00824 CHECKSUM(WORD, command_queue, psf[i]->cl_psfe, img_area, scale[i]); 00825 scale[i] = 1.0 / scale[i]; 00826 } 00827 TRACK(if(tracker!=NULL)tracker[5]->pause()) 00828 00829 TRACK(if(tracker!=NULL)tracker[4]->resume()) 00830 // Scale the extracted PSFs with their own sums. 
00831 for(i=0; i < n_psfs; i++){ 00832 CALL_KERNEL(command_queue, kernels->cl_multiply_doublearr, blocks_img*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 4, 00833 {sizeof(cl_mem), (void*)& psf[i]->cl_psfe}, 00834 {sizeof(cl_mem), (void*)& psf[i]->cl_psfe}, 00835 {sizeof(cl_float), (void*)&scale[i]}, 00836 {sizeof(cl_int), (void*)&img_area} 00837 ); 00838 } 00839 SYNC_QUEUE 00840 for(i=0; i < n_psfs; i++){ 00841 DEBUG(WORD, command_queue, "norm-psfe", psf[i]->cl_psfe, img_size, img_size); 00842 } 00843 TRACK(if(tracker!=NULL)tracker[4]->pause()) 00844 } 00845 00852 void CPSO::convolveObj(TimeTracker **tracker, int n_psfs) { 00853 int max_threads = n_psfs * img_size; 00854 int i; 00855 00856 TRACK(if(tracker!=NULL)tracker[3]->resume()) 00857 // Move the real numbers to the FFT_TYPE format. 00858 for(i=0; i < n_psfs; i++){ 00859 REAL2COMPLEX_GPU_VIA_GPU(command_queue, psf[i]->cl_psfe, psf[i]->cl_fft_psfe, img_area); 00860 } 00861 SYNC_QUEUE 00862 for(i=0; i < n_psfs; i++){ 00863 DEBUG_COMPLEX(command_queue, "fft-psfe", psf[i]->cl_fft_psfe, img_size, img_size); //img_sizeh 00864 } 00865 00866 // Calculate the FFT of the PSFs. 00867 for(i=0; i < n_psfs; i++){ 00868 omp_set_lock(&mutex_fft); 00869 psf[i]->psfeFft(); 00870 omp_unset_lock(&mutex_fft); 00871 } 00872 00873 // Move the FFTs of the PSFs (calculated previously) back to the major array. 00874 for(i=0; i < n_psfs; i++){ 00875 clMemcpyDeviceToDeviceOffset(command_queue, cl_fft_psfe, i * img_area * SIZEOF_FFTTYPE, psf[i]->cl_fft_psfe, 0, img_area * SIZEOF_FFTTYPE, false); 00876 } 00877 SYNC_QUEUE 00878 DEBUG_COMPLEX3D(command_queue, "fft-psfe", cl_fft_psfe, img_size, img_size, n_psfs); //img_sizeh 00879 TRACK(if(tracker!=NULL)tracker[3]->pause()) 00880 00881 TRACK(if(tracker!=NULL)tracker[4]->resume()) 00882 // Convolve the object with the PSFs, generating 'n_psfs' convolved objects. 
00883 CALL_KERNEL(command_queue, kernels->cl_multiply_fftw_complex_arrays, max_threads, img_size, 5, 00884 {sizeof(cl_mem), (void*)&cl_fft_psfe}, 00885 {sizeof(cl_mem), (void*)&cl_fft_object}, 00886 {sizeof(cl_mem), (void*)&cl_fft_conobj}, 00887 {sizeof(cl_int), (void*)&img_size}, 00888 {sizeof(cl_int), (void*)&max_threads} 00889 ); 00890 SYNC_QUEUE 00891 DEBUG_COMPLEX3D(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size, n_psfs); 00892 TRACK(if(tracker!=NULL)tracker[4]->pause()) 00893 } 00894 00901 void CPSO::calcCost(TimeTracker **tracker, int n_psfs) { 00902 int max_threads = n_psfs * img_size; 00903 00904 TRACK(if(tracker!=NULL)tracker[4]->resume()) 00905 // Calculate the search cost as the difference between the convolved object and the image. 00906 // The result for every point is stored into cl_cost. 00907 WORD scale = img_area; 00908 CALL_KERNEL(command_queue, kernels->cl_calc_cost, max_threads, img_size, 7, 00909 {sizeof(cl_mem), (void*)&cl_fft_image}, 00910 {sizeof(cl_mem), (void*)&cl_fft_conobj}, 00911 {sizeof(cl_mem), (void*)&cl_cost}, 00912 {sizeof(cl_mem), (void*)&cl_diffraction_mask}, 00913 {sizeof(cl_int), (void*)&scale}, 00914 {sizeof(cl_int), (void*)&img_size}, 00915 {sizeof(cl_int), (void*)&max_threads} 00916 ); 00917 SYNC_QUEUE 00918 DEBUG_3D(WORD, command_queue, "cost", cl_cost, img_size, img_size, n_psfs); 00919 TRACK(if(tracker!=NULL)tracker[4]->pause()) 00920 00921 TRACK(if(tracker!=NULL)tracker[5]->resume()) 00922 // Sum cl_cost to obtain the final cost (one cost per convolved object). 
00923 CALL_KERNEL2D(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, n_psfs, REDUCTION_BLOCKSZ, 1, 4, 00924 {sizeof(cl_mem), (void*)&cl_cost}, 00925 {sizeof(cl_mem), (void*)&cl_sum}, 00926 {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL}, 00927 {sizeof(cl_int), (void*)&img_area} 00928 ); 00929 SYNC_QUEUE 00930 WORD chk_sum; 00931 DEBUG_2D(WORD, command_queue, "sum-cost", cl_sum, n_psfs, REDUCTION_NBLOCKS); 00932 FINAL_REDUCTIONS(command_queue, cl_sum, chk_sum, n_psfs); 00933 CHECKSUM(WORD, command_queue, cl_cost, img_area*n_psfs, chk_sum);//img_sizeh 00934 00935 TRACK(if(tracker!=NULL)tracker[5]->pause()) 00936 } 00937 00938 FFT_TYPE CPSO::calcDifference(TimeTracker **tracker, WORD *img, WORD *img_diff){ 00939 FFT_TYPE *fft_conobj = (FFT_TYPE*)buffer; 00940 for(int i = 0; i < img_area; i++) {fft_conobj[i].x = img[i]; fft_conobj[i].y = 0;} 00941 clMemcpyHostToDevice(command_queue, cl_fft_conobj, fft_conobj, img_area * SIZEOF_FFTTYPE); 00942 00943 TRACK(if(tracker!=NULL)tracker[3]->resume()) 00944 omp_set_lock(&mutex_fft); 00945 viennacl::inplace_fft(*vn_fft_conobj); // Inplace transform. 00946 omp_unset_lock(&mutex_fft); 00947 DEBUG_COMPLEX(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size); 00948 TRACK(if(tracker!=NULL)tracker[3]->pause()) 00949 00950 FFT_TYPE result = calcMismatch(tracker, cl_fft_conobj); 00951 clMemcpyDeviceToHost(command_queue, img_diff, cl_cost, img_area * SIZEOF_WORD); 00952 00953 return result; 00954 } 00955 00968 FFT_TYPE CPSO::calcMismatch(TimeTracker **tracker, cl_mem _cl_fft_conobj) { 00969 int max_threads = img_size; 00970 00971 if(_cl_fft_conobj != cl_fft_conobj){ 00972 clMemcpyDeviceToDevice(command_queue, cl_fft_conobj, _cl_fft_conobj, img_area * SIZEOF_FFTTYPE, true); 00973 DEBUG_COMPLEX(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size); 00974 } 00975 00976 // Backtransform the first convolved object Only. 
00977 TRACK(if(tracker!=NULL)tracker[3]->resume()) 00978 omp_set_lock(&mutex_fft); 00979 viennacl::inplace_ifft(*vn_fft_conobj); // Inplace inverse transform. 00980 omp_unset_lock(&mutex_fft); 00981 DEBUG_COMPLEX(command_queue, "i-fft-conobj", cl_fft_conobj, img_size, img_size); 00982 TRACK(if(tracker!=NULL)tracker[3]->pause()) 00983 00984 TRACK(if(tracker!=NULL)tracker[4]->resume()) 00985 // Calculate the mismatch and save the best convolved object. 00986 CALL_KERNEL(command_queue, kernels->cl_calc_mismatch, max_threads, img_size, 7, 00987 {sizeof(cl_mem), (void*)&cl_image}, 00988 {sizeof(cl_mem), (void*)&cl_fft_conobj}, 00989 {sizeof(cl_mem), (void*)&cl_best_conobj}, 00990 {sizeof(cl_mem), (void*)&cl_cost}, 00991 {sizeof(cl_mem), (void*)&cl_mismatch}, 00992 {sizeof(cl_int), (void*)&img_size}, 00993 {sizeof(cl_int), (void*)&max_threads} 00994 ); 00995 SYNC_QUEUE 00996 DEBUG_3D(WORD, command_queue, "cost", cl_cost, img_size, img_size, 1); 00997 TRACK(if(tracker!=NULL)tracker[4]->pause()) 00998 00999 // Make the sums. 
01000 FFT_TYPE mismatch; 01001 reduce_squares(tracker, 1, img_area, cl_cost, cl_sum, &mismatch.x); 01002 reduce_squares(tracker, 1, img_area, cl_mismatch, cl_sum, &mismatch.y); 01003 01004 return mismatch; 01005 } 01006 01020 void CPSO::reduce_squares(TimeTracker **tracker, int n_reductions, int reduction_width, cl_mem square, cl_mem sum, WORD* result){ 01021 TRACK(if(tracker!=NULL)tracker[5]->resume()); 01022 CALL_KERNEL2D(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, n_reductions, REDUCTION_BLOCKSZ, 1, 4, 01023 {sizeof(cl_mem), (void*)&square}, 01024 {sizeof(cl_mem), (void*)&sum}, 01025 {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL}, 01026 {sizeof(cl_int), (void*)&reduction_width} 01027 ); 01028 SYNC_QUEUE 01029 01030 for(int i=0; i < n_reductions; i++){ 01031 DEBUG(WORD, command_queue, "sum-square", sum, 1, REDUCTION_NBLOCKS); 01032 FINAL_REDUCE(command_queue, sum, result[i]); 01033 CHECKSUM(WORD, command_queue, square, reduction_width, result[i]); 01034 } 01035 TRACK(if(tracker!=NULL)tracker[5]->pause()); 01036 } 01037 01052 FFT_TYPE CPSO::validatePsf(double *psf_data){ 01053 // Move the PSF data to the PSF instance. 01054 PSF psf(context, command_queue, img_size, phase_size, 1); 01055 FFT_TYPE *_psf = (FFT_TYPE*)buffer; 01056 for(int i = 0; i < img_area; i++) {_psf[i].x = psf_data[i]; _psf[i].y = 0;} 01057 clMemcpyHostToDevice(command_queue, psf.cl_fft_psfe, _psf, img_area * SIZEOF_FFTTYPE); 01058 DEBUG_COMPLEX(command_queue, "psf-validate", psf.cl_fft_psfe, img_size, img_size); 01059 // Calculate the FFT of the PSF. 01060 psf.psfeFft(); 01061 DEBUG_COMPLEX(command_queue, "fft-psf-validate", psf.cl_fft_psfe, img_size, img_size); 01062 01063 // Convolve the internal object with the external PSF. 
01064 CALL_KERNEL(command_queue, kernels->cl_multiply_fftw_complex_arrays, img_size, img_size, 5, 01065 {sizeof(cl_mem), (void*)&psf.cl_fft_psfe}, 01066 {sizeof(cl_mem), (void*)&cl_fft_object}, 01067 {sizeof(cl_mem), (void*)&cl_fft_conobj}, 01068 {sizeof(cl_int), (void*)&img_size}, 01069 {sizeof(cl_int), (void*)&img_size} 01070 ); 01071 SYNC_QUEUE 01072 DEBUG_COMPLEX(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size); 01073 01074 // Return the mismatch between the internal object 01075 // convolved with the external PSF and the internal image. 01076 return calcMismatch(NULL, cl_fft_conobj); 01077 } 01078 01086 WORD CPSO::getRandCoef(double range) { 01087 WORD randn = ((rand() / (WORD)RAND_MAX) - 0.5) * 2 * range; 01088 if (randn > range) { 01089 randn = -2.0 * range + randn; 01090 } 01091 else if (randn < -range) { 01092 randn = 2 * range + randn; 01093 } 01094 HALT(randn > range || randn < -range); 01095 01096 return randn; 01097 } 01098 01106 void CPSO::setStartupCoefs(WORD *coefs){ 01107 has_startup_coefs = true; 01108 for(int i=0; i < n_zernikes; i++) startup_coefs[i] = coefs[i]; 01109 } 01110 01114 void CPSO::saveFirstResult(){ 01115 // Save the convolved objects's FFT before inverting it. 01116 clMemcpyDeviceToDevice(command_queue, cl_best_fft_conobj, cl_fft_conobj, img_area * SIZEOF_FFTTYPE, false); 01117 01118 // Invert the FFT of the first the convolved object. 01119 omp_set_lock(&mutex_fft); 01120 viennacl::inplace_ifft(*vn_fft_conobj); // Inplace transform. 01121 omp_unset_lock(&mutex_fft); 01122 01123 // Extract the real part of the fist convolved object and save it. 01124 CALL_KERNEL(command_queue, kernels->cl_real, img_area, img_size, 3, 01125 {sizeof(cl_mem), (void*)&cl_fft_conobj}, 01126 {sizeof(cl_mem), (void*)&cl_best_conobj}, // Save it here as the best convolved object. 01127 {sizeof(cl_int), (void*)&img_area} 01128 ); 01129 SYNC_QUEUE 01130 01131 // Save the others values of the first PSF as the best values. 
01132 clMemcpyDeviceToDevice(command_queue, cl_best_phase, cl_phase, size_fft * SIZEOF_WORD, false); 01133 clMemcpyDeviceToDevice(command_queue, cl_best_coefs, cl_coefs, n_zernikes * SIZEOF_WORD, false); 01134 clMemcpyDeviceToDevice(command_queue, cl_best_psf, psf[0]->cl_psf, img_area * SIZEOF_WORD, false); 01135 clMemcpyDeviceToDevice(command_queue, cl_best_psfe, psf[0]->cl_psfe, img_area * SIZEOF_WORD, false); 01136 clMemcpyDeviceToDevice(command_queue, cl_best_fft_psfe, psf[0]->cl_fft_psfe, img_area * SIZEOF_FFTTYPE, false); 01137 } 01138 01144 WORD CPSO::getMinCost(){ 01145 WORD ret = WORD_MAX; 01146 WORD *cost = (WORD*)buffer; 01147 clMemcpyDeviceToHost(command_queue, cost, cl_gbest_value, n_swarms * SIZEOF_WORD); 01148 for(int i=0; i < n_swarms; i++){ 01149 if(cost[i] < ret){ 01150 ret = cost[i]; 01151 } 01152 } 01153 01154 return ret; 01155 } 01156 01162 int CPSO::getBestPsfPos(){ 01163 int n_psfs = n_swarms * n_particles; 01164 WORD val = WORD_MAX, pos; 01165 WORD *cost = (WORD*)buffer; 01166 clMemcpyDeviceToHost(command_queue, cost, cl_pbest_value, n_psfs * SIZEOF_WORD); 01167 for(int i=0; i < n_psfs; i++){ 01168 if(cost[i] < val){ 01169 val = cost[i]; 01170 pos = i; 01171 } 01172 } 01173 return pos; 01174 } 01175 01181 void CPSO::getBestCoefs(WORD* _coefs){ 01182 clMemcpyDeviceToHost(command_queue, _coefs, cl_best_coefs, n_zernikes * SIZEOF_WORD); 01183 } 01184 01189 WORD CPSO::getGBestCost(){ 01190 return gbest_cost; 01191 } 01192 01196 void CPSO::commitBestValues() { 01197 // clMemcpyDeviceToDevice(cl_best_psf, cl_psf, size_fft * SIZEOF_WORD); 01198 // clMemcpyDeviceToDevice(cl_best_psfe, cl_psfe, img_area * SIZEOF_WORD); 01199 // clMemcpyDeviceToDevice(cl_best_conobj, cl_conobj, img_area * SIZEOF_WORD); 01200 } 01201 01207 void CPSO::getBestPhase(WORD *phase) { 01208 clMemcpyDeviceToHost(command_queue, phase, cl_best_phase, size_fft * SIZEOF_WORD); 01209 } 01210 01216 void CPSO::getBestPsf(WORD *psf) { 01217 clMemcpyDeviceToHost(command_queue, psf, 
cl_best_psf, size_fft * SIZEOF_WORD); 01218 } 01219 01225 void CPSO::getBestPsfe(WORD *psfe) { 01226 clMemcpyDeviceToHost(command_queue, psfe, cl_best_psfe, img_area * SIZEOF_WORD); 01227 } 01228 01234 void CPSO::getBestConvolvedObject(WORD *conobj) { 01235 clMemcpyDeviceToHost(command_queue, conobj, cl_best_conobj, img_area * SIZEOF_WORD); 01236 } 01237 01243 void CPSO::getBestConvolvedObjectFFT(FFT_TYPE *conobj_fft) { 01244 clMemcpyDeviceToHost(command_queue, conobj_fft, cl_best_fft_conobj, img_area * SIZEOF_FFTTYPE); 01245 } 01246 01252 void CPSO::getBestPsfeFFT(FFT_TYPE *psfe_fft) { 01253 clMemcpyDeviceToHost(command_queue, psfe_fft, cl_best_fft_psfe, img_area * SIZEOF_FFTTYPE); 01254 } 01255 01261 void CPSO::getObjectFFT(FFT_TYPE *obj_fft) { 01262 clMemcpyDeviceToHost(command_queue, obj_fft, cl_fft_object, img_area * SIZEOF_FFTTYPE); 01263 } 01264 01270 void CPSO::getImageFFT(FFT_TYPE *img_fft) { 01271 clMemcpyDeviceToHost(command_queue, img_fft, cl_fft_image, img_area * SIZEOF_FFTTYPE); 01272 } 01273 01279 void CPSO::setW(WORD _w){ 01280 replicateValue(_w, n_particles * n_swarms, cl_w); 01281 } 01282 01288 void CPSO::setC1(WORD _c1){ 01289 replicateValue(_c1, n_particles * n_swarms, cl_c1); 01290 } 01291 01297 void CPSO::setC2(WORD _c2){ 01298 replicateValue(_c2, n_particles * n_swarms, cl_c2); 01299 } 01300 01306 void CPSO::setOriginalPsf(WORD *original_psf){ 01307 FFT_TYPE *arr = (FFT_TYPE*)buffer; 01308 for(int i=0; i < img_area; i++) arr[i].x = original_psf[i]; 01309 clMemcpyHostToDevice(command_queue, cl_fft_original_psf, arr, img_area * SIZEOF_FFTTYPE); 01310 01311 // Invert the FFT of the first the original PSF. 01312 omp_set_lock(&mutex_fft); 01313 viennacl::inplace_fft(*vn_fft_original_psf); // Inplace transform. 
01314 omp_unset_lock(&mutex_fft); 01315 01316 clMemcpyDeviceToHost(command_queue, original_psf_fft, cl_fft_original_psf, img_area * SIZEOF_FFTTYPE); 01317 has_psf_original = true; 01318 } 01319 01325 double CPSO::calcPsfDifferences(){ 01326 if(!has_psf_original){ 01327 return -1; 01328 } 01329 01330 FFT_TYPE *arr = (FFT_TYPE*)buffer; 01331 clMemcpyDeviceToHost(command_queue, arr, psf[getBestPsfPos()]->cl_fft_psfe, img_area * SIZEOF_FFTTYPE); 01332 WORD x, y; 01333 double diff = 0; 01334 for(int i=0; i < img_area; i++){ 01335 x = original_psf_fft[i].x - arr[i].x; 01336 y = original_psf_fft[i].y - arr[i].y; 01337 diff += (diffraction_mask[i]) * (sqrt(x * x + y * y) / img_area); 01338 } 01339 return diff; 01340 } 01341 01349 void CPSO::replicateValue(WORD value, int sz, cl_mem cl_ref){ 01350 WORD *arr = (WORD*)buffer; 01351 for(int i=0; i < sz; i++) arr[i] = value; 01352 clMemcpyHostToDevice(command_queue, cl_ref, arr, sz * SIZEOF_WORD); 01353 } 01354 01358 void CPSO::lock(){ 01359 in_use = true; 01360 } 01361 01365 void CPSO::release() { 01366 in_use = false; 01367 } 01368 01372 bool CPSO::isInUse() { 01373 return in_use; 01374 } 01375 01379 void CPSO::copyToDeviceAsFloat(cl_command_queue command_queue, cl_mem dest, double *values, int size) { 01380 float *floatValue = (float*)buffer; 01381 for (int i = 0; i < size; i++) { 01382 floatValue[i] = values[i]; 01383 } 01384 clMemcpyHostToDevice(command_queue, dest, floatValue, size * sizeof(float)); 01385 } 01386 01390 void CPSO::copyToHostAsDouble(cl_command_queue command_queue, cl_mem src, double *values, int size) { 01391 float *floatValue = (float*)buffer; 01392 clMemcpyDeviceToHost(command_queue, floatValue, src, size * sizeof(float)); 01393 for (int i = 0; i < size; i++) { 01394 values[i] = floatValue[i]; 01395 } 01396 } 01397 01405 double CPSO::calc_mean(double *values, int size){ 01406 double mean = 0; 01407 for(int i=0; i < size; i++){ 01408 mean += values[i]; 01409 } 01410 return mean / size; 01411 } 01412 01420 
double CPSO::calc_variance(double *values, int size){ 01421 if(size == 1) return 0; 01422 01423 double mean = calc_mean(values, size); 01424 double var = 0; 01425 for(int i=0; i < size; i++){ 01426 var += (values[i] - mean) * (values[i] - mean); 01427 } 01428 return var / (size - 1); 01429 } 01430 01438 double CPSO::calc_stddev(double *values, int size){ 01439 return sqrt(calc_variance(values, size)); 01440 } 01441 01449 void CPSO::generateRandomCoefs(WORD *coefs, int n_zernikes, double range){ 01450 // The number of Zernike terms to be distorted is randomized 01451 // to simulate a real environment. 01452 int n_randcoefs = (rand() / (WORD)RAND_MAX) * n_zernikes; 01453 if(n_randcoefs == 0) n_randcoefs = 1; 01454 memset(coefs, 0, n_zernikes * SIZEOF_WORD); 01455 WORD val; 01456 for(int i=0; i < n_randcoefs; i++){ 01457 do{ 01458 val = randNormalDistribution(0, range / 4); 01459 }while(val > range); 01460 // The range used for the random coefficients is randomized to avoid 01461 // too drastic image distortions and to simulate a real environment. 01462 coefs[(int)((rand() / (WORD)RAND_MAX) * n_zernikes)] = val; 01463 } 01464 } 01465 01473 void CPSO::generateNormalDistrRandomCoefs(WORD *coefs, int n_zernikes, double range){ 01474 WORD val; 01475 for(int i=0; i < n_zernikes; i++){ 01476 do{ 01477 val = randNormalDistribution(0, range / 4); 01478 }while(val > range); 01479 coefs[i] = val; 01480 } 01481 } 01482 01490 double CPSO::randNormalDistribution(double mean, double std_dev){ 01491 return (mean + (rand()%2 ? -1.0 : 1.0) * 01492 std_dev * pow(-log(0.99999*((double)rand()/RAND_MAX)), 0.5) 01493 ); 01494 }