PSF Estimation with CPSO
cpso.cpp
00001 /*
00002  * cpso.cpp
00003  *
00004  *  Created on: 27/05/2012
00005  *  Author: Peter Frank Perroni (pfperroni@inf.ufpr.br)
00006  */
00007 
00008 #include "cpso.hpp"
00009 
00010 using namespace std;
00011 
// Storage for the static data members of CPSO and Debug.

// Maps each OpenCL context to a heap array of three device buffers
// ([0] zernikes, [1] phase mask, [2] diffraction mask), filled by store_static_data().
00012 map<cl_context, cl_mem*> CPSO::static_references;
// Host copy of the diffraction mask, allocated and filled by store_static_data().
00013 int *CPSO::diffraction_mask;
// Taken in the constructor while a UID is drawn from seq.
00014 omp_lock_t CPSO::mutex;
// Taken around the FFT calls (viennacl::fft, PSF::pupilToFocusFft) made in this file.
00015 omp_lock_t CPSO::mutex_fft;
// Locks belonging to Debug; how they are used is not visible in this file.
00016 omp_lock_t Debug::mutex1;
00017 omp_lock_t Debug::mutex2;
// True once CPSO::startup_locks() has initialized the CPSO locks.
00018 bool CPSO::lock_initialized = false;
// Debug's own flag; presumably managed by Debug::startup_locks()/destroy_locks() -- confirm.
00019 bool Debug::lock_initialized = false;
// Static character buffer of DEFAULT_BUFFER_SIZE bytes belonging to Debug.
00020 char Debug::buffer[DEFAULT_BUFFER_SIZE];
// Next UID to hand out; post-incremented by every CPSO constructor call.
00021 int CPSO::seq = 0;
00022 
00043 CPSO::CPSO(double* _zernikes, int *_phase_mask, int *_diffraction_mask, int _phase_size, int _image_size, int _n_zernikes,
00044                 int _psf_range, WORD _w, WORD c1, WORD c2, WORD _reset_at, int _n_particles, int _n_swarms){
00045         startup_locks();
00046 
00047         omp_set_lock(&mutex);
00048         UID = seq++;
00049         omp_unset_lock(&mutex);
00050 
00051         buffer = new char[DEFAULT_BUFFER_SIZE];
00052         queue = clFactory::getQueue();
00053         context = queue->getContext();
00054         command_queue = queue->getCommandQueue();
00055         kernels = queue->kernels;
00056         has_startup_coefs = false;
00057         store_static_data(context, _n_zernikes, _phase_size, _image_size, _zernikes, _phase_mask, _diffraction_mask);
00058         allocate_data(_phase_size, _image_size, _n_zernikes, _psf_range, _w, c1, c2, _reset_at, _n_particles, _n_swarms);
00059 }
00060 
00064 CPSO::~CPSO(){
00065         // Release the internal buffer.
00066         delete buffer;
00067 }
00068 
00085 void CPSO::store_static_data(cl_context _context, int _n_zernikes, int _phase_size, int _image_size, double* zernikes,
00086                 int *phase_mask, int *_diffraction_mask){
00087         // If the data was already stored into this context, leave this method.
00088         if(static_references.count(_context) != 0){
00089                 return;
00090         }
00091 
00092         int _img_area = _image_size * _image_size;
00093         int _size_fft = _phase_size * _phase_size;
00094         int _z_size = _n_zernikes * _size_fft;
00095 
00096         WORD *_zernikes = new WORD[_z_size];
00097         for(int i = 0; i < _z_size; i++) _zernikes[i] = zernikes[i];
00098         cl_mem _cl_zernikes, _cl_phase_mask, _cl_diffraction_mask;
00099         CREATE_BUFFER(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _z_size * SIZEOF_WORD, _zernikes, _cl_zernikes);
00100         CREATE_BUFFER(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _size_fft * sizeof(int), phase_mask, _cl_phase_mask);
00101         CREATE_BUFFER(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _img_area * sizeof(int), _diffraction_mask, _cl_diffraction_mask);//img_areah
00102 
00103         delete _zernikes;
00104 
00105         cl_mem *static_data = new cl_mem[3];
00106         static_data[0] = _cl_zernikes;
00107         static_data[1] = _cl_phase_mask;
00108         static_data[2] = _cl_diffraction_mask;
00109         static_references[_context] = static_data;
00110 
00111         diffraction_mask = new int[_img_area];
00112         memcpy(diffraction_mask, _diffraction_mask, _img_area * sizeof(int));
00113 }
00114 
00120 void CPSO::clear_static_data(){
00121         cl_mem *static_data;
00122         map<cl_context, cl_mem*>::iterator iter;
00123         for(iter=static_references.begin(); iter != static_references.end(); iter++){
00124                 static_data = iter->second;
00125                 clReleaseMemObject(static_data[0]);
00126                 clReleaseMemObject(static_data[1]);
00127                 clReleaseMemObject(static_data[2]);
00128                 delete static_data;
00129         }
00130         destroy_locks();
00131         delete diffraction_mask;
00132 }
00133 
00137 void CPSO::startup_locks(){
00138         if(!lock_initialized){
00139                 omp_init_lock(&mutex);
00140                 omp_init_lock(&mutex_fft);
00141                 Debug::startup_locks();
00142                 lock_initialized = true;
00143         }
00144 }
00145 
00149 void CPSO::destroy_locks(){
00150         if(lock_initialized){
00151                 omp_destroy_lock(&mutex);
00152                 omp_destroy_lock(&mutex_fft);
00153                 Debug::destroy_locks();
00154                 lock_initialized = false;
00155         }
00156 }
00157 
/**
 * Stores the problem dimensions and search parameters in the instance and
 * allocates every per-instance device buffer and host helper object.
 *
 * One PSF object is created for each of the n_particles * n_swarms particles;
 * each is pointed at its own slice of cl_coefs and cl_pupil. The static buffers
 * uploaded by store_static_data() are looked up through static_references, so
 * that method must have run for this context beforehand.
 *
 * @param _phase_size   Side of the square phase matrix.
 * @param image_size    Side of the square image.
 * @param _n_zernikes   Number of Zernike coefficients per particle.
 * @param _psf_range    Stored in psf_range.
 * @param _w            Stored in w and applied through setW().
 * @param c1            Applied through setC1().
 * @param c2            Applied through setC2().
 * @param _reset_at     Stored in reset_at.
 * @param _n_particles  Particles per swarm.
 * @param _n_swarms     Number of swarms.
 */
00172 void CPSO::allocate_data(int _phase_size, int image_size, int _n_zernikes, int _psf_range,
00173                 WORD _w, WORD c1, WORD c2, WORD _reset_at, int _n_particles, int _n_swarms) {
00174 
00175         // Set the simple Host variables.
00176         threads = DEFAULT_BLOCKSZ;
00177         psf_range = _psf_range;
00178         img_size = image_size;
00179         phase_size = _phase_size;
00180         n_zernikes = _n_zernikes;
00181         img_sizeh = (img_size / 2) + 1;
00182         img_area = img_size * img_size;
00183         img_areah = img_size * img_sizeh;
00184         size_fft = phase_size * phase_size;
00185         z_size = n_zernikes * size_fft;
00186 
00187         n_particles = _n_particles;
00188         n_swarms = _n_swarms;
        // One int per swarm, uploaded below as cl_swarm_dim. The float result is truncated to int;
        // presumably the index of the last Zernike dimension owned by swarm i -- confirm against the kernel.
00189         int swarm_sizes[n_swarms];
00190         float swarm_dim = n_zernikes / (float)n_swarms;
00191         for(int i=0; i < n_swarms; i++){
00192                 swarm_sizes[i] = (i + 1) * swarm_dim - 1;
00193         }
00194         reset_at = _reset_at;
00195         w = _w;
00196 
00197         startup_coefs = new WORD[n_zernikes];
        // Total number of particles, i.e. the number of PSFs evaluated per full pass.
00198         int n_psfs = n_particles * n_swarms;
00199 
00200         // Allocate memory on device and set its contents.
00201         // For the PSF.
00202         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * size_fft * SIZEOF_FFTTYPE, NULL, cl_pupil);
00203         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * size_fft * SIZEOF_WORD, NULL, cl_phase);// Only necessary for validation purpose.
00204         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * img_area * SIZEOF_FFTTYPE, NULL, cl_fft_psfe);
00205         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * img_area * SIZEOF_FFTTYPE, NULL, cl_fft_conobj);
00206         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * REDUCTION_NBLOCKS * SIZEOF_WORD, NULL, cl_sum);
00207         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * img_area * SIZEOF_WORD, NULL, cl_cost);
00208         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_WORD, NULL, cl_mismatch);
00209 
00210         // For the CPSO.
00211         CREATE_BUFFER(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, n_swarms * sizeof(int), swarm_sizes, cl_swarm_dim);
00212         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * n_zernikes * SIZEOF_WORD, NULL, cl_coefs);
        // Speeds are sized per particle only (not per swarm), unlike cl_coefs and cl_pbest.
00213         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_particles * n_zernikes * SIZEOF_WORD, NULL, cl_speed);
00214         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * n_zernikes * SIZEOF_WORD, NULL, cl_pbest);
00215         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_zernikes * SIZEOF_WORD, NULL, cl_gbest);
00216         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_pbest_value);
00217         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_swarms * SIZEOF_WORD, NULL, cl_gbest_value);
00218         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * sizeof(int), NULL, cl_reset_search);
00219         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * sizeof(uint4), NULL, cl_rand_ctx);
00220         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_w);
00221         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_c1);
00222         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_psfs * SIZEOF_WORD, NULL, cl_c2);
00223 
00224         // Remaining objects.
00225         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_WORD, NULL, cl_best_conobj);
00226         CREATE_BUFFER(context, CL_MEM_READ_WRITE, size_fft * SIZEOF_WORD, NULL, cl_best_phase);
00227         CREATE_BUFFER(context, CL_MEM_READ_WRITE, size_fft * SIZEOF_WORD, NULL, cl_best_psf);
00228         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_WORD, NULL, cl_best_psfe);
00229         CREATE_BUFFER(context, CL_MEM_READ_WRITE, n_zernikes * SIZEOF_WORD, NULL, cl_best_coefs);
00230         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_best_fft_psfe);
00231         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_best_fft_conobj);
00232         CREATE_BUFFER(context, CL_MEM_READ_ONLY, img_area * SIZEOF_FFTTYPE, NULL, cl_object);
00233         CREATE_BUFFER(context, CL_MEM_READ_ONLY, img_area * SIZEOF_FFTTYPE, NULL, cl_image);
00234         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_fft_object);
00235         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_fft_image);
00236         CREATE_BUFFER(context, CL_MEM_READ_WRITE, img_area * SIZEOF_FFTTYPE, NULL, cl_fft_original_psf);
00237         CREATE_BUFFER(context, CL_MEM_READ_WRITE, z_size * SIZEOF_FFTTYPE, NULL, cl_debug_info);
00238 
00239         // Get the references for the static values.
        // Slot order matches the one written by store_static_data().
00240         cl_mem *static_data = static_references[context];
00241         cl_zernikes = static_data[0];
00242         cl_phase_mask = static_data[1];
00243         cl_diffraction_mask = static_data[2];
00244 
00245         DEBUG(int, command_queue, "phase-mask", cl_phase_mask, phase_size, phase_size);
00246         DEBUG(int, command_queue, "difraction-mask", cl_diffraction_mask, img_size, img_size);//img_size, img_sizeh
00247 
00248         // Initialize the FFT contexts.
        // Each wrapper views an existing device buffer as img_area*2 WORD elements
        // (presumably interleaved real/imaginary parts of the complex values -- confirm).
00249         vn_object = new viennacl::vector<WORD>(cl_object, img_area*2);
00250         vn_image = new viennacl::vector<WORD>(cl_image, img_area*2);
00251         vn_fft_object = new viennacl::vector<WORD>(cl_fft_object, img_area*2);
00252         vn_fft_image = new viennacl::vector<WORD>(cl_fft_image, img_area*2);
00253         vn_fft_original_psf = new viennacl::vector<WORD>(cl_fft_original_psf, img_area*2);
00254         vn_fft_conobj = new viennacl::vector<WORD>(cl_fft_conobj, img_area*2);
00255 
00256         // Each particle will have its own PSF instance.
00257         psf = new PSF*[n_psfs];
00258         for(int i=0; i < n_psfs; i++){
00259                 psf[i] = new PSF(context, command_queue, img_size, phase_size, n_zernikes);
00260                 // Set the PSF references into the major matrixes.
00261                 psf[i]->setCoefsPosition(cl_coefs, i * n_zernikes);
00262                 psf[i]->setPupilPosition(cl_pupil, i * size_fft);
00263         }
00264         original_psf_fft = new FFT_TYPE[img_area];
00265 
00266         // Set CPSO search parameters.
00267         setW(w);
00268         setC1(c1);
00269         setC2(c2);
00270 
00271         SYNC_QUEUE
00272 }
00273 
00282 void CPSO::set_images(double *object, double *image) {
00283         has_psf_original = false;
00284 
00285         // Create an array with values zero to clear the device memory.
00286         FFT_TYPE *zero = (FFT_TYPE*)buffer;
00287         memset(zero, 0, z_size * SIZEOF_FFTTYPE);
00288 
00289         int i;
00290         WORD worst_value[n_swarms];
00291         // Worst possible value is the maximum possible value (minimization problem).
00292         for(i=0; i < n_swarms; i++) worst_value[i] = WORD_MAX;
00293 
00294         int n_coefs = n_particles * n_zernikes;
00295         WORD coefs[n_coefs * n_swarms];
00296         if(has_startup_coefs){
00297                 // NOTE: The PSO results CANNOT be directly compared to the Simulated Annealing results!
00298                 //       This happens because the SA calculates Only 1 Zernike at a time and has many controls to accept
00299                 //       the coefficient or not, what makes it very difficult to force it to accept manual coefs.
00300                 //       At same time, the PSO pupil is totally recalculated every time, whereas the SA one has
00301                 //       the intermediate values stored, what cause slight differences that will be cascade all
00302                 //       over the computation, resulting in a near but Not exact costs (PSO cost x SA cost).
00303                 // Initialize 1 particle with the static startup coefficients provided.
00304                 memcpy(coefs, startup_coefs, n_zernikes * SIZEOF_WORD);
00305                 // Replicate these same coefs to all particles into the first swarm.
00306                 for(i=1; i < n_particles; i++) memcpy(&coefs[i * n_zernikes], coefs, n_zernikes * SIZEOF_WORD);
00307         }
00308         else{
00309                 //generateRandomCoefs(coefs, n_coefs, psf_range);
00310                 generateNormalDistrRandomCoefs(coefs, n_coefs, psf_range);
00311         // Generate random coefficients for every dimension in every particle into the first swarm.
00312         //for(i=0; i < n_coefs; i++) coefs[i] = getRandCoef(psf_range);
00313         }
00314         has_startup_coefs = false;
00315 
00316         // Replicate the particle coefficients to all remaining swarms.
00317         for(i=1; i < n_swarms; i++) memcpy(&coefs[i * n_coefs], coefs, n_coefs * SIZEOF_WORD);
00318 
00319         // Generate random speeds for every dimension in every particle.
00320         // Speeds should not be replicated for every swarm, since it will be used Only to update
00321         // the next position for the own swarm's dimension interval, thus the remaining intervals
00322         // would be useless.
00323         WORD speed[n_coefs];
00324         // Max.speed is half search space.
00325         for(i=0; i < n_coefs; i++) speed[i] = getRandCoef(psf_range / 2);
00326 
00327         // Create a random seed for every particle.
00328         int n_psfs = n_particles * n_swarms;
00329         uint4 rand_ctx[n_psfs];
00330         for(i=0; i < n_psfs; i++) {rand_ctx[i].x = rand(); rand_ctx[i].y = rand(); rand_ctx[i].z = rand(); rand_ctx[i].w = rand(); }
00331 
00332         //Regardless of the FITS data type (double or float), convert it to the type currently in use.
00333         FFT_TYPE *_image = new FFT_TYPE[img_area];
00334         for(i = 0; i < img_area; i++) {_image[i].x = image[i]; _image[i].y = 0;}
00335         FFT_TYPE *_object = new FFT_TYPE[img_area];
00336         for(i = 0; i < img_area; i++) {_object[i].x = object[i]; _object[i].y = 0;}
00337 
00338         // Copy the values to the device.
00339         // For the CPSO.
00340         clMemcpyHostToDevice(command_queue, cl_coefs, coefs, n_coefs * n_swarms * SIZEOF_WORD);
00341         clMemcpyHostToDevice(command_queue, cl_speed, speed, n_coefs * SIZEOF_WORD);
00342         // Set the personal best position as the initial position.
00343         clMemcpyHostToDevice(command_queue, cl_pbest, coefs, n_coefs * n_swarms * SIZEOF_WORD);
00344         // Set the global best position as zero, cause it must be calculated before used (thus there's no initial value).
00345         clMemcpyHostToDevice(command_queue, cl_gbest, zero, n_zernikes * SIZEOF_WORD);
00346         // Set the global best value as the worst possible value.
00347         clMemcpyHostToDevice(command_queue, cl_gbest_value, worst_value, n_swarms * SIZEOF_WORD);
00348         clMemcpyHostToDevice(command_queue, cl_reset_search, zero, n_psfs * sizeof(int));
00349         clMemcpyHostToDevice(command_queue, cl_rand_ctx, rand_ctx, n_psfs * sizeof(uint4));
00350 
00351         // For the remaining objects.
00352         clMemcpyHostToDevice(command_queue, cl_debug_info, zero, z_size * SIZEOF_FFTTYPE);
00353         clMemcpyHostToDevice(command_queue, cl_object, _object, img_area * SIZEOF_FFTTYPE);
00354         clMemcpyHostToDevice(command_queue, cl_image, _image, img_area * SIZEOF_FFTTYPE);
00355         clMemcpyHostToDevice(command_queue, cl_fft_image, zero, img_area * SIZEOF_FFTTYPE);
00356 
00357         delete _image;
00358         delete _object;
00359 
00360         // Reset the number of evaluations.
00361         n_psf_evals = 0;
00362         max_evals = -1;
00363         convergence_stable_cycle = -1;
00364 
00365         SYNC_QUEUE
00366 
00367         // Calculate the Object's and the Image's FFTs.
00368         omp_set_lock(&mutex_fft);
00369         viennacl::fft(*vn_object, *vn_fft_object);
00370         viennacl::fft(*vn_image, *vn_fft_image);
00371         omp_unset_lock(&mutex_fft);
00372 
00373         DEBUG_2D(WORD, command_queue, "coefs", cl_coefs, n_particles, n_zernikes);
00374         DEBUG_COMPLEX(command_queue, "object", cl_object, img_size, img_size);
00375         DEBUG_COMPLEX(command_queue, "fft-object", cl_fft_object, img_size, img_size);
00376         DEBUG_COMPLEX(command_queue, "image", cl_image, img_size, img_size);
00377         DEBUG_COMPLEX(command_queue, "fft-image", cl_fft_image, img_size, img_size);
00378 }
00379 
00385 void CPSO::startup(TimeTracker **trackers) {
00386         // Run the PSF calculation once for every particle,
00387         // obtaining a starting point for the search technique.
00388         runPsf(trackers, n_particles);
00389 
00390         TRACK(if(trackers!=NULL)trackers[1]->resume())
00391         int i, pos;
00392         WORD cost[n_particles * n_swarms];
00393         WORD partial[n_particles * REDUCTION_NBLOCKS];
00394 
00395         // Finalize the reduction.
00396         clMemcpyDeviceToHost(command_queue, partial, cl_sum, n_particles * REDUCTION_NBLOCKS * SIZEOF_WORD);
00397 
00398         // The CPSO startup calculation is composed of 'n_particles' PSFs,
00399         // whose results are replicated for all remaining swarms.
00400         for(i=0, pos=0; i < n_particles; i++){
00401                 cost[i] = 0;
00402                 // Finalizes the reduction for every particle calculated.
00403                 for(int j=0; j < REDUCTION_NBLOCKS; j++){
00404                         cost[i] += partial[pos++];
00405                 }
00406         }
00407 
00408         // Repeat the cost for every swarm.
00409         for(i=1; i < n_swarms; i++) memcpy(&cost[i * n_particles], cost, n_particles * SIZEOF_WORD);
00410         clMemcpyHostToDevice(command_queue, cl_pbest_value, cost, n_swarms * n_particles * SIZEOF_WORD);
00411 
00412         // Repeat the partial costs for every swarm, since it's required by the CPSO kernel.
00413         for(i=0; i < n_swarms; i++){
00414                 clMemcpyHostToDeviceOffset(command_queue, cl_sum, i * n_particles * REDUCTION_NBLOCKS * SIZEOF_WORD, partial, n_particles * REDUCTION_NBLOCKS * SIZEOF_WORD);
00415         }
00416         TRACK(if(trackers!=NULL)trackers[1]->pause())
00417 }
00418 
00422 void CPSO::finalize_cl() {
00423         // Release OpenCl instances.
00424         clReleaseMemObject(cl_coefs);
00425         clReleaseMemObject(cl_pupil);
00426         clReleaseMemObject(cl_phase);
00427         clReleaseMemObject(cl_cost);
00428         clReleaseMemObject(cl_mismatch);
00429         clReleaseMemObject(cl_sum);
00430         clReleaseMemObject(cl_fft_conobj);
00431         clReleaseMemObject(cl_fft_psfe);
00432 
00433         clReleaseMemObject(cl_swarm_dim);
00434         clReleaseMemObject(cl_speed);
00435         clReleaseMemObject(cl_pbest);
00436         clReleaseMemObject(cl_gbest);
00437         clReleaseMemObject(cl_pbest_value);
00438         clReleaseMemObject(cl_gbest_value);
00439         clReleaseMemObject(cl_reset_search);
00440         clReleaseMemObject(cl_rand_ctx);
00441         clReleaseMemObject(cl_w);
00442         clReleaseMemObject(cl_c1);
00443         clReleaseMemObject(cl_c2);
00444 
00445         clReleaseMemObject(cl_object);
00446         clReleaseMemObject(cl_image);
00447         clReleaseMemObject(cl_debug_info);
00448         clReleaseMemObject(cl_fft_object);
00449         clReleaseMemObject(cl_fft_image);
00450         clReleaseMemObject(cl_best_phase);
00451         clReleaseMemObject(cl_best_psf);
00452         clReleaseMemObject(cl_best_psfe);
00453         clReleaseMemObject(cl_best_fft_psfe);
00454         clReleaseMemObject(cl_best_conobj);
00455         clReleaseMemObject(cl_fft_original_psf);
00456         clReleaseMemObject(cl_best_fft_conobj);
00457         clReleaseMemObject(cl_best_coefs);
00458 
00459         // Dispose the command queue.
00460         clFactory::disposeQueue(queue);
00461 
00462         // Delete the Host pointers.
00463         int n_psfs = n_particles * n_swarms;
00464         for(int i=0; i < n_psfs; i++){
00465                 delete psf[i];
00466         }
00467         delete[] psf;
00468 
00469         delete startup_coefs;
00470         delete original_psf_fft;
00471 
00472         // Release the FFT contexts.
00473         delete vn_object;
00474         delete vn_image;
00475         delete vn_fft_object;
00476         delete vn_fft_image;
00477         delete vn_fft_original_psf;
00478         delete vn_fft_conobj;
00479 }
00480 
00489 void CPSO::run(TimeTracker **trackers, int n_cycles){
00490         if(n_cycles <= 0){
00491                 return;
00492         }
00493 
00494         // Initializes the CPSO calculation.
00495         startup(trackers);
00496 
00497         // Begins the calculations.
00498         int j;
00499         ostringstream oss;
00500         WORD bkp_coefs[n_zernikes];
00501         int n_psfs = n_swarms * n_particles;
00502         uint4 rand_ctx[n_psfs];
00503         double all_costs[n_cycles], all_psf_diff[n_cycles];
00504         int tmp, i = 0;
00505         do{
00506                 do{
00507                         if(i > 0){
00508                                 // Calculates the current PSF.
00509                                 runPsf(trackers);
00510                         }
00511                         // Runs the CPSO to obtain the new coefficients.
00512                         runCPSO(trackers, n_cycles);
00513 
00514                         TRACK(if(trackers!=NULL)trackers[1]->resume())
00515 
00516                         all_costs[i] = getMinCost();
00517                         // Repeat the PSF difference is the cost has not reduced, because it's still the same PSF calculated.
00518                         all_psf_diff[i] = (i==0 || all_costs[i]<all_costs[i-1]) ? calcPsfDifferences() : all_psf_diff[i-1];
00519 
00520                         oss << "PSO#" << UID << " Cycle#" << i << " -> (cost " << all_costs[i] << ", psf_diff " << all_psf_diff[i] << ", " << getNPsfEvals() << " evals) [";
00521                         clMemcpyDeviceToHostOffset(command_queue, bkp_coefs, cl_pbest, getBestPsfPos() * n_zernikes * SIZEOF_WORD, n_zernikes * SIZEOF_WORD);
00522                         for(j=0; j < n_zernikes; j++) oss << bkp_coefs[j] << ", ";
00523                         oss << "]";
00524                         Debug::debug(oss.str().c_str());
00525                         oss.str("");
00526 
00527                         // Refresh the particle's seeds to avoid the problem of
00528                         // exhausting the random numbers for the seed, keeping a good randomization.
00529                         for(j=0; j < n_psfs; j++) {rand_ctx[j].x = rand(); rand_ctx[j].y = rand(); rand_ctx[j].z = rand(); rand_ctx[j].w = rand(); }
00530                         clMemcpyHostToDevice(command_queue, cl_rand_ctx, rand_ctx, n_psfs * sizeof(uint4));
00531 
00532                         i++;
00533                         SYNC_QUEUE;
00534 
00535                         TRACK(if(trackers!=NULL)trackers[1]->pause())
00536                 }while(i % n_cycles != 0);
00537 
00538                 TRACK(if(trackers!=NULL)trackers[1]->resume())
00539 
00540                 // Make the backup for the initial coefficients.
00541                 clMemcpyDeviceToHost(command_queue, bkp_coefs, cl_coefs, n_zernikes * SIZEOF_WORD);
00542 
00543                 // Calculate the best search result.
00544                 //----------------------------------
00545                 clMemcpyDeviceToDeviceOffset(command_queue, cl_coefs, 0, cl_pbest, getBestPsfPos() * n_zernikes * SIZEOF_WORD, n_zernikes * SIZEOF_WORD, true);
00546                 tmp = n_psf_evals;
00547 
00548                 TRACK(if(trackers!=NULL)trackers[1]->pause())
00549 
00550                 runPsf(trackers, 1);
00551 
00552                 TRACK(if(trackers!=NULL)trackers[1]->resume())
00553                 n_psf_evals = tmp;
00554                 saveFirstResult();
00555                 TRACK(if(trackers!=NULL)trackers[1]->pause())
00556 
00557                 TRACK(if(trackers!=NULL)trackers[5]->resume())
00558                 WORD _gbest_cost;
00559                 DEBUG(WORD, command_queue, "sum-cost-gbest",  cl_sum, 1, REDUCTION_NBLOCKS);
00560                 FINAL_REDUCE(command_queue,  cl_sum, _gbest_cost);
00561                 CHECKSUM(WORD, command_queue,  cl_cost, img_area, _gbest_cost);//img_areah
00562                 gbest_cost = _gbest_cost;
00563                 TRACK(if(trackers!=NULL)trackers[5]->pause())
00564                 //----------------------------------
00565 
00566                 TRACK(if(trackers!=NULL)trackers[1]->resume())
00567                 oss << "Final GBest Cost: = " << gbest_cost;
00568                 Debug::debug(oss.str().c_str());
00569                 oss.str("");
00570 
00571                 // Finds the cycle where the convergence occurred.
00572                 double mean = calc_mean(all_costs, n_cycles);
00573                 double stddev = calc_stddev(all_costs, n_cycles);
00574                 for(j=0; j < n_cycles; j++){
00575                         if(all_costs[j] <= (mean + stddev)){
00576                                 setStableCycle(j + 1);
00577                                 break;
00578                         }
00579                 }
00580 
00581                 // Restore the backup of the initial coefficients.
00582                 clMemcpyHostToDevice(command_queue, cl_coefs, bkp_coefs, n_zernikes * SIZEOF_WORD);
00583 
00584                 // Restore the w to be able to run the CPSO again.
00585                 setW(w);
00586                 TRACK(if(trackers!=NULL)trackers[1]->pause())
00587 
00588         }while(getNPsfEvals() < max_evals);
00589 }
00590 
00597 void CPSO::runPsf(TimeTracker **trackers){
00598         runPsf(trackers, n_particles * n_swarms);
00599 }
00600 
00607 void CPSO::runPsf(TimeTracker **trackers, int n_psfs){
00608         generatePhase(trackers, n_psfs);
00609         makePsf(trackers, n_psfs);
00610         convolveObj(trackers, n_psfs);
00611         calcCost(trackers, n_psfs);
00612         n_psf_evals += n_psfs;
00613 }
00614 
/**
 * Launches the cl_cpso kernel once over all particles of all swarms and waits
 * for the queue to finish.
 *
 * @param tracker   Optional time trackers (may be NULL); index 1 is resumed/paused here.
 * @param n_cycles  Passed to the kernel as its last argument.
 */
00621 void CPSO::runCPSO(TimeTracker **tracker, int n_cycles){
        // One work-item per particle; n_particles is passed next to it,
        // presumably as the work-group size (one group per swarm) -- confirm against CALL_KERNEL.
00622         int max_threads = n_particles * n_swarms;
00623 
00624         TRACK(if(tracker!=NULL)tracker[1]->resume())
        // 20 = number of {size, pointer} kernel arguments listed below.
        // The entry with a NULL pointer is presumably a local-memory allocation of that size -- confirm.
        // NOTE(review): psf_range is received as an int in allocate_data() but is passed here with
        // sizeof(CL_WORD); confirm the member's declared type matches.
00625         CALL_KERNEL(command_queue, kernels->cl_cpso, max_threads, n_particles, 20,
00626                 {sizeof(cl_mem), (void*)&cl_coefs},
00627                 {sizeof(cl_mem), (void*)&cl_speed},
00628                 {sizeof(cl_mem), (void*)&cl_pbest},
00629                 {sizeof(cl_mem), (void*)&cl_gbest},
00630                 {n_particles * SIZEOF_WORD, NULL},
00631                 {sizeof(cl_mem), (void*)&cl_pbest_value},
00632                 {sizeof(cl_mem), (void*)&cl_gbest_value},
00633                 {sizeof(cl_mem), (void*)&cl_reset_search},
00634                 {sizeof(cl_mem), (void*)&cl_swarm_dim},
00635                 {sizeof(cl_mem), (void*)&cl_sum},
00636                 {sizeof(cl_mem), (void*)&cl_rand_ctx},
00637                 {sizeof(cl_mem), (void*)&cl_w},
00638                 {sizeof(CL_WORD), (void*)&w},
00639                 {sizeof(cl_mem), (void*)&cl_c1},
00640                 {sizeof(cl_mem), (void*)&cl_c2},
00641                 {sizeof(cl_int), (void*)&n_particles},
00642                 {sizeof(cl_int), (void*)&n_swarms},
00643                 {sizeof(CL_WORD), (void*)&psf_range},
00644                 {sizeof(CL_WORD), (void*)&reset_at},
00645                 {sizeof(cl_int), (void*)&n_cycles}
00646     );
00647 
00648         SYNC_QUEUE
00649 
00650         TRACK(if(tracker!=NULL)tracker[1]->pause())
00651 }
00652 
/**
 * Launches the cl_generate_phase kernel for the first n_psfs particles, writing
 * into cl_pupil (and cl_phase), then asks each of those PSF objects to refresh
 * its pupil.
 *
 * @param tracker  Optional time trackers (may be NULL); index 2 is resumed/paused here.
 * @param n_psfs   Number of PSFs to generate the phase for.
 */
00659 void CPSO::generatePhase(TimeTracker **tracker, int n_psfs) {
        // phase_size work-items per PSF; phase_size is passed next to it,
        // presumably as the work-group size -- confirm against CALL_KERNEL.
00660         int max_threads = n_psfs * phase_size;
00661 
00662         TRACK(if(tracker!=NULL)tracker[2]->resume())
        // 9 = number of {size, pointer} kernel arguments listed below.
        // The entry with a NULL pointer is presumably a local-memory allocation of that size -- confirm.
00663         CALL_KERNEL(command_queue, kernels->cl_generate_phase, max_threads, phase_size, 9,
00664                 {sizeof(cl_mem), (void*)&cl_zernikes},
00665                 {sizeof(cl_mem), (void*)&cl_coefs},
00666                 {sizeof(cl_mem), (void*)&cl_phase_mask},
00667                 {sizeof(cl_mem), (void*)&cl_pupil},
00668                 {sizeof(cl_mem), (void*)&cl_phase}, // This parameter, Only for validation purposes.
00669                 {SIZEOF_WORD*n_zernikes, NULL},
00670                 {sizeof(cl_int), (void*)&phase_size},
00671                 {sizeof(cl_int), (void*)&n_zernikes},
00672                 {sizeof(cl_int), (void*)&max_threads}
00673     );
        // Wait for the kernel before the pupils are read back for debugging and refreshed.
00674         SYNC_QUEUE
00675         DEBUG_COMPLEX3D(command_queue, "pupil", cl_pupil, phase_size, phase_size, n_psfs);
00676 
00677         // Move the pupil information into every separate PSF instance.
00678         for(int i=0; i < n_psfs; i++){
00679                 psf[i]->refreshPupil(false);
00680         }
00681         SYNC_QUEUE
00682 
00683         TRACK(if(tracker!=NULL)tracker[2]->pause())
00684 }
00685 
00692 void CPSO::makePsf(TimeTracker **tracker, int n_psfs) {
00693         int blocks_fft = ceil(size_fft / (float) MATRIX_OP_BLOCKSZ);
00694         int blocks_img = ceil(img_area / (float) MATRIX_OP_BLOCKSZ);
00695         int i;
00696 
00697         TRACK(if(tracker!=NULL)tracker[3]->resume());
00698         // Calculate the FFT of the pupils.
00699         for(i=0; i < n_psfs; i++){
00700                 omp_set_lock(&mutex_fft);
00701                 psf[i]->pupilToFocusFft();
00702                 omp_unset_lock(&mutex_fft);
00703         }
00704 
00705         for(i=0; i < n_psfs; i++){
00706                 DEBUG_COMPLEX(command_queue, "focus", psf[i]->cl_focus, phase_size, phase_size);
00707         }
00708         TRACK(if(tracker!=NULL)tracker[3]->pause())
00709 
00710         TRACK(if(tracker!=NULL)tracker[4]->resume())
00711         // Get the actual PSF values.
00712         for(i=0; i < n_psfs; i++){
00713                 CALL_KERNEL(command_queue, kernels->cl_power_spec, blocks_fft*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 3,
00714                         {sizeof(cl_mem), (void*)&psf[i]->cl_focus},
00715                         {sizeof(cl_mem), (void*)&psf[i]->cl_psf},
00716                         {sizeof(cl_int), (void*)&size_fft}
00717                 );
00718         }
00719         SYNC_QUEUE
00720         for(i=0; i < n_psfs; i++){
00721                 DEBUG(WORD, command_queue, "psf", psf[i]->cl_psf, phase_size, phase_size);
00722         }
00723         TRACK(if(tracker!=NULL)tracker[4]->pause())
00724 
00725         TRACK(if(tracker!=NULL)tracker[5]->resume())
00726         // Sum PSF values.
00727         for(i=0; i < n_psfs; i++){
00728                 CALL_KERNEL(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, REDUCTION_BLOCKSZ, 4,
00729                         {sizeof(cl_mem), (void*)&psf[i]->cl_psf},
00730                         {sizeof(cl_mem), (void*)&psf[i]->cl_sum},
00731                         {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL},
00732                         {sizeof(cl_int), (void*)&size_fft}
00733                 );
00734         }
00735         SYNC_QUEUE
00736         WORD scale[n_psfs];
00737         for(i=0; i < n_psfs; i++){
00738                 DEBUG(WORD, command_queue, "sum-psf", psf[i]->cl_sum, 1, REDUCTION_NBLOCKS);
00739                 FINAL_REDUCE(command_queue, psf[i]->cl_sum, scale[i]);
00740                 CHECKSUM(WORD, command_queue, psf[i]->cl_psf, size_fft, scale[i]);
00741                 scale[i] = 1 / (WORD)scale[i];
00742         }
00743         TRACK(if(tracker!=NULL)tracker[5]->pause())
00744 
00745         TRACK(if(tracker!=NULL)tracker[4]->resume())
00746         // Scale the pupils with the PSFs sums.
00747         for(i=0; i < n_psfs; i++){
00748                 CALL_KERNEL(command_queue, kernels->cl_multiply_complexarr, blocks_fft*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 4,
00749                         {sizeof(cl_mem), (void*)&psf[i]->cl_pupil},
00750                         {sizeof(cl_mem), (void*)&psf[i]->cl_pupil},
00751                         {sizeof(cl_float), (void*)&scale[i]},
00752                         {sizeof(cl_int), (void*)&size_fft}
00753                 );
00754         }
00755         SYNC_QUEUE
00756         for(i=0; i < n_psfs; i++){
00757                 DEBUG_COMPLEX(command_queue, "norm-pupil", psf[i]->cl_pupil, phase_size, phase_size);
00758         }
00759         TRACK(if(tracker!=NULL)tracker[4]->pause())
00760 
00761         TRACK(if(tracker!=NULL)tracker[3]->resume())
00762         // Calculate the FFT of the scaled pupils.
00763         for(i=0; i < n_psfs; i++){
00764                 omp_set_lock(&mutex_fft);
00765                 psf[i]->pupilToFocusFft();
00766                 omp_unset_lock(&mutex_fft);
00767         }
00768         for(i=0; i < n_psfs; i++){
00769                 DEBUG_COMPLEX(command_queue, "norm-focus", psf[i]->cl_focus, phase_size, phase_size);
00770         }
00771         TRACK(if(tracker!=NULL)tracker[3]->pause())
00772 
00773         TRACK(if(tracker!=NULL)tracker[4]->resume())
00774         // Get the actual PSF scaled values.
00775         for(i=0; i < n_psfs; i++){
00776                 CALL_KERNEL(command_queue, kernels->cl_power_spec, blocks_fft*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 3,
00777                         {sizeof(cl_mem), (void*)&psf[i]->cl_focus},
00778                         {sizeof(cl_mem), (void*)&psf[i]->cl_psf},
00779                         {sizeof(cl_int), (void*)&size_fft}
00780                 );
00781         }
00782         SYNC_QUEUE
00783         for(i=0; i < n_psfs; i++){
00784                 DEBUG(WORD, command_queue, "norm-psf", psf[i]->cl_psf, phase_size, phase_size);
00785         }
00786 
00787         // Extract the PSFs adjusted to the image width.
00788         for(i=0; i < n_psfs; i++){
00789                 if(img_size < phase_size){
00790                         // Extract the values according to the PSF_EXTRACT constant.
00791                         CALL_KERNEL(command_queue, kernels->cl_resize_psf, blocks_img*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 5,
00792                                 {sizeof(cl_mem), (void*)&psf[i]->cl_psf},
00793                                 {sizeof(cl_mem), (void*)&psf[i]->cl_psfe},
00794                                 {sizeof(cl_int), (void*)&phase_size},
00795                                 {sizeof(cl_int), (void*)&img_size},
00796                                 {sizeof(cl_int), (void*)&img_area}
00797                         );
00798                 }
00799                 else{
00800                         // Just copy.
00801                         clMemcpyDeviceToDevice(command_queue, psf[i]->cl_psfe, psf[i]->cl_psf, img_area * SIZEOF_WORD, false);
00802                 }
00803         }
00804         SYNC_QUEUE
00805         for(i=0; i < n_psfs; i++){
00806                 DEBUG(WORD, command_queue, "psfe", psf[i]->cl_psfe, img_size, img_size);
00807         }
00808         TRACK(if(tracker!=NULL)tracker[4]->pause())
00809 
00810         TRACK(if(tracker!=NULL)tracker[5]->resume())
00811         // Sum the extracted PSF values.
00812         for(i=0; i < n_psfs; i++){
00813                 CALL_KERNEL(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, REDUCTION_BLOCKSZ, 4,
00814                         {sizeof(cl_mem), (void*)&psf[i]->cl_psfe},
00815                         {sizeof(cl_mem), (void*)&psf[i]->cl_sum},
00816                         {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL},
00817                         {sizeof(cl_int), (void*)&img_area}
00818                 );
00819         }
00820         SYNC_QUEUE
00821         for(i=0; i < n_psfs; i++){
00822                 DEBUG(WORD, command_queue, "sum-psfe",  psf[i]->cl_sum, 1, REDUCTION_NBLOCKS);
00823                 FINAL_REDUCE(command_queue,  psf[i]->cl_sum, scale[i]);
00824                 CHECKSUM(WORD, command_queue,  psf[i]->cl_psfe, img_area, scale[i]);
00825                 scale[i] = 1.0 / scale[i];
00826         }
00827         TRACK(if(tracker!=NULL)tracker[5]->pause())
00828 
00829         TRACK(if(tracker!=NULL)tracker[4]->resume())
00830         // Scale the extracted PSFs with their own sums.
00831         for(i=0; i < n_psfs; i++){
00832                 CALL_KERNEL(command_queue, kernels->cl_multiply_doublearr, blocks_img*MATRIX_OP_BLOCKSZ, MATRIX_OP_BLOCKSZ, 4,
00833                         {sizeof(cl_mem), (void*)& psf[i]->cl_psfe},
00834                         {sizeof(cl_mem), (void*)& psf[i]->cl_psfe},
00835                         {sizeof(cl_float), (void*)&scale[i]},
00836                         {sizeof(cl_int), (void*)&img_area}
00837                 );
00838         }
00839         SYNC_QUEUE
00840         for(i=0; i < n_psfs; i++){
00841                 DEBUG(WORD, command_queue, "norm-psfe",  psf[i]->cl_psfe, img_size, img_size);
00842         }
00843         TRACK(if(tracker!=NULL)tracker[4]->pause())
00844 }
00845 
/**
 * Convolves the internal object with the first 'n_psfs' extracted PSFs.
 *
 * For every PSF, psf[i]->cl_psfe is converted to the FFT_TYPE format, transformed
 * by psf[i]->psfeFft() and copied into slot i of cl_fft_psfe. A single kernel then
 * multiplies cl_fft_psfe by cl_fft_object and writes the products to cl_fft_conobj.
 *
 * @param tracker Optional array of time trackers (may be NULL). Slot 3 is resumed/paused
 *                around the FFT stage and slot 4 around the multiplication kernel.
 * @param n_psfs  Number of PSFs (entries of psf[]) to process.
 */
00852 void CPSO::convolveObj(TimeTracker **tracker, int n_psfs) {
              // Launch size used for the multiplication kernel below
              // (presumably the global work size, grouped by img_size -- confirm against CALL_KERNEL).
00853         int max_threads = n_psfs * img_size;
00854         int i;
00855 
00856         TRACK(if(tracker!=NULL)tracker[3]->resume())
00857         // Move the real numbers to the FFT_TYPE format.
00858         for(i=0; i < n_psfs; i++){
00859                 REAL2COMPLEX_GPU_VIA_GPU(command_queue, psf[i]->cl_psfe, psf[i]->cl_fft_psfe, img_area);
00860         }
00861         SYNC_QUEUE
00862         for(i=0; i < n_psfs; i++){
00863                 DEBUG_COMPLEX(command_queue, "fft-psfe", psf[i]->cl_fft_psfe, img_size, img_size); //img_sizeh
00864         }
00865 
00866         // Calculate the FFT of the PSFs.
              // Every FFT call is serialized through the class-wide mutex_fft lock.
00867         for(i=0; i < n_psfs; i++){
00868                 omp_set_lock(&mutex_fft);
00869                 psf[i]->psfeFft();
00870                 omp_unset_lock(&mutex_fft);
00871         }
00872 
00873         // Move the FFTs of the PSFs (calculated previously) back to the major array.
              // PSF i lands at byte offset i * img_area * SIZEOF_FFTTYPE of cl_fft_psfe.
00874         for(i=0; i < n_psfs; i++){
00875                 clMemcpyDeviceToDeviceOffset(command_queue, cl_fft_psfe, i * img_area * SIZEOF_FFTTYPE, psf[i]->cl_fft_psfe, 0, img_area * SIZEOF_FFTTYPE, false);
00876         }
00877         SYNC_QUEUE
00878         DEBUG_COMPLEX3D(command_queue, "fft-psfe", cl_fft_psfe, img_size, img_size, n_psfs); //img_sizeh
00879         TRACK(if(tracker!=NULL)tracker[3]->pause())
00880 
00881         TRACK(if(tracker!=NULL)tracker[4]->resume())
00882         // Convolve the object with the PSFs, generating 'n_psfs' convolved objects.
00883         CALL_KERNEL(command_queue, kernels->cl_multiply_fftw_complex_arrays, max_threads, img_size, 5,
00884                 {sizeof(cl_mem), (void*)&cl_fft_psfe},
00885                 {sizeof(cl_mem), (void*)&cl_fft_object},
00886                 {sizeof(cl_mem), (void*)&cl_fft_conobj},
00887                 {sizeof(cl_int), (void*)&img_size},
00888                 {sizeof(cl_int), (void*)&max_threads}
00889         );
00890         SYNC_QUEUE
00891         DEBUG_COMPLEX3D(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size, n_psfs);
00892         TRACK(if(tracker!=NULL)tracker[4]->pause())
00893 }
00894 
/**
 * Calculates one cost value per convolved object.
 *
 * The cl_calc_cost kernel compares cl_fft_conobj against cl_fft_image (using
 * cl_diffraction_mask) and stores the per-point results in cl_cost; cl_reduce then
 * sums cl_cost into cl_sum, one row of partial sums per convolved object.
 *
 * @param tracker Optional array of time trackers (may be NULL). Slot 4 is resumed/paused
 *                around the cost kernel and slot 5 around the reduction.
 * @param n_psfs  Number of convolved objects to evaluate.
 */
00901 void CPSO::calcCost(TimeTracker **tracker, int n_psfs) {
00902         int max_threads = n_psfs * img_size;
00903 
00904         TRACK(if(tracker!=NULL)tracker[4]->resume())
00905         // Calculate the search cost as the difference between the convolved object and the image.
00906         // The result for every point is stored into cl_cost.
              // NOTE(review): 'scale' is a WORD but is handed to the kernel with sizeof(cl_int);
              // other WORD scale factors in this file are passed with sizeof(cl_float).
              // Confirm the parameter type declared by the cl_calc_cost kernel.
00907         WORD scale = img_area;
00908         CALL_KERNEL(command_queue, kernels->cl_calc_cost, max_threads, img_size, 7,
00909                 {sizeof(cl_mem), (void*)&cl_fft_image},
00910                 {sizeof(cl_mem), (void*)&cl_fft_conobj},
00911                 {sizeof(cl_mem), (void*)&cl_cost},
00912                 {sizeof(cl_mem), (void*)&cl_diffraction_mask},
00913                 {sizeof(cl_int), (void*)&scale},
00914                 {sizeof(cl_int), (void*)&img_size},
00915                 {sizeof(cl_int), (void*)&max_threads}
00916         );
00917         SYNC_QUEUE
00918         DEBUG_3D(WORD, command_queue, "cost", cl_cost, img_size, img_size, n_psfs);
00919         TRACK(if(tracker!=NULL)tracker[4]->pause())
00920 
00921         TRACK(if(tracker!=NULL)tracker[5]->resume())
00922         // Sum cl_cost to obtain the final cost (one cost per convolved object).
00923         CALL_KERNEL2D(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, n_psfs, REDUCTION_BLOCKSZ, 1, 4,
00924                 {sizeof(cl_mem), (void*)&cl_cost},
00925                 {sizeof(cl_mem), (void*)&cl_sum},
00926                 {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL},
00927                 {sizeof(cl_int), (void*)&img_area}
00928         );
00929         SYNC_QUEUE
              // chk_sum is only consumed by the FINAL_REDUCTIONS / CHECKSUM macros below;
              // this method returns nothing, the results stay on the device in cl_sum.
00930         WORD chk_sum;
00931         DEBUG_2D(WORD, command_queue, "sum-cost",  cl_sum, n_psfs, REDUCTION_NBLOCKS);
00932         FINAL_REDUCTIONS(command_queue,  cl_sum, chk_sum, n_psfs);
00933         CHECKSUM(WORD, command_queue, cl_cost, img_area*n_psfs, chk_sum);//img_sizeh
00934 
00935         TRACK(if(tracker!=NULL)tracker[5]->pause())
00936 }
00937 
00938 FFT_TYPE CPSO::calcDifference(TimeTracker **tracker, WORD *img, WORD *img_diff){
00939         FFT_TYPE *fft_conobj = (FFT_TYPE*)buffer;
00940         for(int i = 0; i < img_area; i++) {fft_conobj[i].x = img[i]; fft_conobj[i].y = 0;}
00941         clMemcpyHostToDevice(command_queue, cl_fft_conobj, fft_conobj, img_area * SIZEOF_FFTTYPE);
00942 
00943         TRACK(if(tracker!=NULL)tracker[3]->resume())
00944         omp_set_lock(&mutex_fft);
00945         viennacl::inplace_fft(*vn_fft_conobj); // Inplace transform.
00946         omp_unset_lock(&mutex_fft);
00947         DEBUG_COMPLEX(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size);
00948         TRACK(if(tracker!=NULL)tracker[3]->pause())
00949 
00950         FFT_TYPE result = calcMismatch(tracker, cl_fft_conobj);
00951         clMemcpyDeviceToHost(command_queue, img_diff, cl_cost, img_area * SIZEOF_WORD);
00952 
00953         return result;
00954 }
00955 
/**
 * Calculates the mismatch of a convolved object (given as its FFT) against the internal image.
 *
 * If the given buffer is not cl_fft_conobj itself, it is first copied into cl_fft_conobj.
 * cl_fft_conobj is then inverse-transformed in place and the cl_calc_mismatch kernel is
 * run with cl_image, cl_fft_conobj, cl_best_conobj, cl_cost and cl_mismatch as arguments.
 * Finally cl_cost and cl_mismatch are each summed with reduce_squares().
 *
 * @param tracker        Optional array of time trackers (may be NULL); slot 3 is resumed/paused
 *                       around the inverse FFT and slot 4 around the mismatch kernel.
 * @param _cl_fft_conobj Device buffer holding the FFT of the convolved object to evaluate.
 * @return A pair where .x is the sum over cl_cost and .y is the sum over cl_mismatch.
 */
00968 FFT_TYPE CPSO::calcMismatch(TimeTracker **tracker, cl_mem _cl_fft_conobj) {
00969         int max_threads = img_size;
00970 
              // Only copy when the caller passed a buffer other than the internal one.
00971         if(_cl_fft_conobj != cl_fft_conobj){
00972                 clMemcpyDeviceToDevice(command_queue, cl_fft_conobj, _cl_fft_conobj, img_area * SIZEOF_FFTTYPE, true);
00973                 DEBUG_COMPLEX(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size);
00974         }
00975 
00976         // Back-transform only the first convolved object.
00977         TRACK(if(tracker!=NULL)tracker[3]->resume())
00978         omp_set_lock(&mutex_fft);
00979         viennacl::inplace_ifft(*vn_fft_conobj); // Inplace inverse transform.
00980         omp_unset_lock(&mutex_fft);
00981         DEBUG_COMPLEX(command_queue, "i-fft-conobj", cl_fft_conobj, img_size, img_size);
00982         TRACK(if(tracker!=NULL)tracker[3]->pause())
00983 
00984         TRACK(if(tracker!=NULL)tracker[4]->resume())
00985         // Calculate the mismatch and save the best convolved object.
00986         CALL_KERNEL(command_queue, kernels->cl_calc_mismatch, max_threads, img_size, 7,
00987                 {sizeof(cl_mem), (void*)&cl_image},
00988                 {sizeof(cl_mem), (void*)&cl_fft_conobj},
00989                 {sizeof(cl_mem), (void*)&cl_best_conobj},
00990                 {sizeof(cl_mem), (void*)&cl_cost},
00991                 {sizeof(cl_mem), (void*)&cl_mismatch},
00992                 {sizeof(cl_int), (void*)&img_size},
00993                 {sizeof(cl_int), (void*)&max_threads}
00994         );
00995         SYNC_QUEUE
00996         DEBUG_3D(WORD, command_queue, "cost", cl_cost, img_size, img_size, 1);
00997         TRACK(if(tracker!=NULL)tracker[4]->pause())
00998 
00999         // Make the sums.
              // Both reductions reuse cl_sum as scratch space, one after the other.
01000         FFT_TYPE mismatch;
01001         reduce_squares(tracker, 1, img_area, cl_cost, cl_sum, &mismatch.x);
01002         reduce_squares(tracker, 1, img_area, cl_mismatch, cl_sum, &mismatch.y);
01003 
01004         return mismatch;
01005 }
01006 
/**
 * Sums the contents of a device buffer with the cl_reduce kernel and finishes
 * the reduction on the host side through the FINAL_REDUCE macro.
 *
 * @param tracker         Optional array of time trackers (may be NULL); slot 5 is resumed/paused
 *                        around the whole method.
 * @param n_reductions    Second launch dimension of the reduction kernel and number of
 *                        entries written to 'result'.
 * @param reduction_width Number of elements passed to the kernel as the reduction length.
 * @param square          Device buffer to be summed.
 * @param sum             Device buffer receiving the partial sums.
 * @param result          Host array with at least 'n_reductions' entries receiving the totals.
 */
01020 void CPSO::reduce_squares(TimeTracker **tracker, int n_reductions, int reduction_width, cl_mem square, cl_mem sum, WORD* result){
01021         TRACK(if(tracker!=NULL)tracker[5]->resume());
01022         CALL_KERNEL2D(command_queue, kernels->cl_reduce, REDUCTION_NBLOCKS*REDUCTION_BLOCKSZ, n_reductions, REDUCTION_BLOCKSZ, 1, 4,
01023                 {sizeof(cl_mem), (void*)&square},
01024                 {sizeof(cl_mem), (void*)&sum},
01025                 {REDUCTION_BLOCKSZ * SIZEOF_WORD, NULL},
01026                 {sizeof(cl_int), (void*)&reduction_width}
01027         );
01028         SYNC_QUEUE
01029 
              // NOTE(review): every iteration passes the same 'sum' buffer with no per-reduction
              // offset, so for n_reductions > 1 each result[i] presumably repeats the first total.
              // The callers visible in this file only use n_reductions == 1 -- confirm against FINAL_REDUCE.
01030         for(int i=0; i < n_reductions; i++){
01031                 DEBUG(WORD, command_queue, "sum-square", sum, 1, REDUCTION_NBLOCKS);
01032                 FINAL_REDUCE(command_queue, sum, result[i]);
01033                 CHECKSUM(WORD, command_queue, square, reduction_width, result[i]);
01034         }
01035         TRACK(if(tracker!=NULL)tracker[5]->pause());
01036 }
01037 
01052 FFT_TYPE CPSO::validatePsf(double *psf_data){
01053         // Move the PSF data to the PSF instance.
01054         PSF psf(context, command_queue, img_size, phase_size, 1);
01055         FFT_TYPE *_psf = (FFT_TYPE*)buffer;
01056         for(int i = 0; i < img_area; i++) {_psf[i].x = psf_data[i]; _psf[i].y = 0;}
01057         clMemcpyHostToDevice(command_queue, psf.cl_fft_psfe, _psf, img_area * SIZEOF_FFTTYPE);
01058         DEBUG_COMPLEX(command_queue, "psf-validate", psf.cl_fft_psfe, img_size, img_size);
01059         // Calculate the FFT of the PSF.
01060         psf.psfeFft();
01061         DEBUG_COMPLEX(command_queue, "fft-psf-validate", psf.cl_fft_psfe, img_size, img_size);
01062 
01063         // Convolve the internal object with the external PSF.
01064         CALL_KERNEL(command_queue, kernels->cl_multiply_fftw_complex_arrays, img_size, img_size, 5,
01065                 {sizeof(cl_mem), (void*)&psf.cl_fft_psfe},
01066                 {sizeof(cl_mem), (void*)&cl_fft_object},
01067                 {sizeof(cl_mem), (void*)&cl_fft_conobj},
01068                 {sizeof(cl_int), (void*)&img_size},
01069                 {sizeof(cl_int), (void*)&img_size}
01070         );
01071         SYNC_QUEUE
01072         DEBUG_COMPLEX(command_queue, "fft-conobj", cl_fft_conobj, img_size, img_size);
01073 
01074         // Return the mismatch between the internal object
01075         // convolved with the external PSF and the internal image.
01076         return calcMismatch(NULL, cl_fft_conobj);
01077 }
01078 
01086 WORD CPSO::getRandCoef(double range) {
01087         WORD randn = ((rand() / (WORD)RAND_MAX) - 0.5) * 2 * range;
01088         if (randn > range) {
01089                 randn = -2.0 * range + randn;
01090         }
01091         else if (randn < -range) {
01092                 randn = 2 * range + randn;
01093         }
01094         HALT(randn > range || randn < -range);
01095 
01096         return randn;
01097 }
01098 
01106 void CPSO::setStartupCoefs(WORD *coefs){
01107         has_startup_coefs = true;
01108         for(int i=0; i < n_zernikes; i++) startup_coefs[i] = coefs[i];
01109 }
01110 
/**
 * Stores the results of the first PSF (psf[0]) as the current best values.
 *
 * The FFT of the convolved object is saved to cl_best_fft_conobj, then cl_fft_conobj is
 * inverse-transformed in place and its real part is written to cl_best_conobj. Finally
 * the phase, the coefficients and the PSF buffers of psf[0] are copied to the cl_best_* buffers.
 */
01114 void CPSO::saveFirstResult(){
01115         // Save the convolved object's FFT before inverting it.
              // NOTE(review): this copy is issued with the last argument 'false' right before
              // the buffer is transformed in place; this presumably relies on in-order
              // execution of command_queue -- confirm.
01116         clMemcpyDeviceToDevice(command_queue, cl_best_fft_conobj, cl_fft_conobj, img_area * SIZEOF_FFTTYPE, false);
01117 
01118         // Invert the FFT of the first convolved object.
01119         omp_set_lock(&mutex_fft);
01120         viennacl::inplace_ifft(*vn_fft_conobj); // Inplace transform.
01121         omp_unset_lock(&mutex_fft);
01122 
01123         // Extract the real part of the first convolved object and save it.
01124         CALL_KERNEL(command_queue, kernels->cl_real, img_area, img_size, 3,
01125                 {sizeof(cl_mem), (void*)&cl_fft_conobj},
01126                 {sizeof(cl_mem), (void*)&cl_best_conobj}, // Save it here as the best convolved object.
01127                 {sizeof(cl_int), (void*)&img_area}
01128         );
01129         SYNC_QUEUE
01130 
01131         // Save the other values of the first PSF as the best values.
              // NOTE(review): cl_best_psf receives img_area elements here, while getBestPsf()
              // reads size_fft elements from it -- confirm which size is intended.
01132         clMemcpyDeviceToDevice(command_queue, cl_best_phase, cl_phase, size_fft * SIZEOF_WORD, false);
01133         clMemcpyDeviceToDevice(command_queue, cl_best_coefs, cl_coefs, n_zernikes * SIZEOF_WORD, false);
01134         clMemcpyDeviceToDevice(command_queue, cl_best_psf, psf[0]->cl_psf, img_area * SIZEOF_WORD, false);
01135         clMemcpyDeviceToDevice(command_queue, cl_best_psfe, psf[0]->cl_psfe, img_area * SIZEOF_WORD, false);
01136         clMemcpyDeviceToDevice(command_queue, cl_best_fft_psfe, psf[0]->cl_fft_psfe, img_area * SIZEOF_FFTTYPE, false);
01137 }
01138 
01144 WORD CPSO::getMinCost(){
01145         WORD ret = WORD_MAX;
01146         WORD *cost = (WORD*)buffer;
01147         clMemcpyDeviceToHost(command_queue, cost, cl_gbest_value, n_swarms * SIZEOF_WORD);
01148         for(int i=0; i < n_swarms; i++){
01149                 if(cost[i] < ret){
01150                         ret = cost[i];
01151                 }
01152         }
01153 
01154         return ret;
01155 }
01156 
01162 int CPSO::getBestPsfPos(){
01163         int n_psfs = n_swarms * n_particles;
01164         WORD val = WORD_MAX, pos;
01165         WORD *cost = (WORD*)buffer;
01166         clMemcpyDeviceToHost(command_queue, cost, cl_pbest_value, n_psfs * SIZEOF_WORD);
01167         for(int i=0; i < n_psfs; i++){
01168                 if(cost[i] < val){
01169                         val = cost[i];
01170                         pos = i;
01171                 }
01172         }
01173         return pos;
01174 }
01175 
01181 void CPSO::getBestCoefs(WORD* _coefs){
01182         clMemcpyDeviceToHost(command_queue, _coefs, cl_best_coefs, n_zernikes * SIZEOF_WORD);
01183 }
01184 
01189 WORD CPSO::getGBestCost(){
01190         return gbest_cost;
01191 }
01192 
/**
 * Currently a no-op: the body consists only of commented-out device-to-device
 * copies into cl_best_psf, cl_best_psfe and cl_best_conobj.
 */
01196 void CPSO::commitBestValues() {
01197 //      clMemcpyDeviceToDevice(cl_best_psf, cl_psf, size_fft * SIZEOF_WORD);
01198 //      clMemcpyDeviceToDevice(cl_best_psfe, cl_psfe, img_area * SIZEOF_WORD);
01199 //      clMemcpyDeviceToDevice(cl_best_conobj, cl_conobj, img_area * SIZEOF_WORD);
01200 }
01201 
01207 void CPSO::getBestPhase(WORD *phase) {
01208         clMemcpyDeviceToHost(command_queue, phase, cl_best_phase, size_fft * SIZEOF_WORD);
01209 }
01210 
01216 void CPSO::getBestPsf(WORD *psf) {
01217         clMemcpyDeviceToHost(command_queue, psf, cl_best_psf, size_fft * SIZEOF_WORD);
01218 }
01219 
01225 void CPSO::getBestPsfe(WORD *psfe) {
01226         clMemcpyDeviceToHost(command_queue, psfe, cl_best_psfe, img_area * SIZEOF_WORD);
01227 }
01228 
01234 void CPSO::getBestConvolvedObject(WORD *conobj) {
01235         clMemcpyDeviceToHost(command_queue, conobj, cl_best_conobj, img_area * SIZEOF_WORD);
01236 }
01237 
01243 void CPSO::getBestConvolvedObjectFFT(FFT_TYPE *conobj_fft) {
01244         clMemcpyDeviceToHost(command_queue, conobj_fft, cl_best_fft_conobj, img_area * SIZEOF_FFTTYPE);
01245 }
01246 
01252 void CPSO::getBestPsfeFFT(FFT_TYPE *psfe_fft) {
01253         clMemcpyDeviceToHost(command_queue, psfe_fft, cl_best_fft_psfe, img_area * SIZEOF_FFTTYPE);
01254 }
01255 
01261 void CPSO::getObjectFFT(FFT_TYPE *obj_fft) {
01262         clMemcpyDeviceToHost(command_queue, obj_fft, cl_fft_object, img_area * SIZEOF_FFTTYPE);
01263 }
01264 
01270 void CPSO::getImageFFT(FFT_TYPE *img_fft) {
01271         clMemcpyDeviceToHost(command_queue, img_fft, cl_fft_image, img_area * SIZEOF_FFTTYPE);
01272 }
01273 
01279 void CPSO::setW(WORD _w){
01280         replicateValue(_w, n_particles * n_swarms, cl_w);
01281 }
01282 
01288 void CPSO::setC1(WORD _c1){
01289         replicateValue(_c1, n_particles * n_swarms, cl_c1);
01290 }
01291 
01297 void CPSO::setC2(WORD _c2){
01298         replicateValue(_c2, n_particles * n_swarms, cl_c2);
01299 }
01300 
01306 void CPSO::setOriginalPsf(WORD *original_psf){
01307         FFT_TYPE *arr = (FFT_TYPE*)buffer;
01308         for(int i=0; i < img_area; i++) arr[i].x = original_psf[i];
01309         clMemcpyHostToDevice(command_queue, cl_fft_original_psf, arr, img_area * SIZEOF_FFTTYPE);
01310 
01311         // Invert the FFT of the first the original PSF.
01312         omp_set_lock(&mutex_fft);
01313         viennacl::inplace_fft(*vn_fft_original_psf); // Inplace transform.
01314         omp_unset_lock(&mutex_fft);
01315 
01316         clMemcpyDeviceToHost(command_queue, original_psf_fft, cl_fft_original_psf, img_area * SIZEOF_FFTTYPE);
01317         has_psf_original = true;
01318 }
01319 
01325 double CPSO::calcPsfDifferences(){
01326         if(!has_psf_original){
01327                 return -1;
01328         }
01329 
01330         FFT_TYPE *arr = (FFT_TYPE*)buffer;
01331         clMemcpyDeviceToHost(command_queue, arr, psf[getBestPsfPos()]->cl_fft_psfe, img_area * SIZEOF_FFTTYPE);
01332         WORD x, y;
01333         double diff = 0;
01334         for(int i=0; i < img_area; i++){
01335                 x = original_psf_fft[i].x - arr[i].x;
01336                 y = original_psf_fft[i].y - arr[i].y;
01337                 diff += (diffraction_mask[i]) * (sqrt(x * x + y * y) / img_area);
01338         }
01339         return diff;
01340 }
01341 
01349 void CPSO::replicateValue(WORD value, int sz, cl_mem cl_ref){
01350         WORD *arr = (WORD*)buffer;
01351         for(int i=0; i < sz; i++) arr[i] = value;
01352         clMemcpyHostToDevice(command_queue, cl_ref, arr, sz * SIZEOF_WORD);
01353 }
01354 
01358 void CPSO::lock(){
01359         in_use = true;
01360 }
01361 
01365 void CPSO::release() {
01366         in_use = false;
01367 }
01368 
01372 bool CPSO::isInUse() {
01373         return in_use;
01374 }
01375 
01379 void CPSO::copyToDeviceAsFloat(cl_command_queue command_queue, cl_mem dest, double *values, int size) {
01380         float *floatValue = (float*)buffer;
01381         for (int i = 0; i < size; i++) {
01382                 floatValue[i] = values[i];
01383         }
01384         clMemcpyHostToDevice(command_queue, dest, floatValue, size * sizeof(float));
01385 }
01386 
01390 void CPSO::copyToHostAsDouble(cl_command_queue command_queue, cl_mem src, double *values, int size) {
01391         float *floatValue = (float*)buffer;
01392         clMemcpyDeviceToHost(command_queue, floatValue, src, size * sizeof(float));
01393         for (int i = 0; i < size; i++) {
01394                 values[i] = floatValue[i];
01395         }
01396 }
01397 
01405 double CPSO::calc_mean(double *values, int size){
01406         double mean = 0;
01407         for(int i=0; i < size; i++){
01408                 mean += values[i];
01409         }
01410         return mean / size;
01411 }
01412 
01420 double CPSO::calc_variance(double *values, int size){
01421         if(size == 1) return 0;
01422 
01423         double mean = calc_mean(values, size);
01424         double var = 0;
01425         for(int i=0; i < size; i++){
01426                 var += (values[i] - mean) * (values[i] - mean);
01427         }
01428         return var / (size - 1);
01429 }
01430 
01438 double CPSO::calc_stddev(double *values, int size){
01439         return sqrt(calc_variance(values, size));
01440 }
01441 
01449 void CPSO::generateRandomCoefs(WORD *coefs, int n_zernikes, double range){
01450         // The number of Zernike terms to be distorted is randomized
01451         // to simulate a real environment.
01452         int n_randcoefs = (rand() / (WORD)RAND_MAX) * n_zernikes;
01453         if(n_randcoefs == 0) n_randcoefs = 1;
01454         memset(coefs, 0, n_zernikes * SIZEOF_WORD);
01455         WORD val;
01456         for(int i=0; i < n_randcoefs; i++){
01457                 do{
01458                         val = randNormalDistribution(0, range / 4);
01459                 }while(val > range);
01460                 // The range used for the random coefficients is randomized to avoid
01461                 // too drastic image distortions and to simulate a real environment.
01462                 coefs[(int)((rand() / (WORD)RAND_MAX) * n_zernikes)] = val;
01463         }
01464 }
01465 
01473 void CPSO::generateNormalDistrRandomCoefs(WORD *coefs, int n_zernikes, double range){
01474         WORD val;
01475         for(int i=0; i < n_zernikes; i++){
01476                 do{
01477                         val = randNormalDistribution(0, range / 4);
01478                 }while(val > range);
01479                 coefs[i] = val;
01480         }
01481 }
01482 
01490 double CPSO::randNormalDistribution(double mean, double std_dev){
01491         return (mean + (rand()%2 ? -1.0 : 1.0) *
01492                         std_dev * pow(-log(0.99999*((double)rand()/RAND_MAX)), 0.5)
01493         );
01494 }
 All Classes Functions