#ifndef SYNTHESIS_CUDALBFGS_H
#define SYNTHESIS_CUDALBFGS_H

// NOTE: the original include directives lost their header names; the headers
// below are the ones the code in this file requires.
#include <cmath>
#include <iostream>
#include <vector>
#include <algorithm>
#include <chrono>
#include <casacore/casa/aips.h>
#include <casacore/casa/Arrays/Matrix.h>
#include <casacore/casa/Arrays/ArrayMath.h>
#include <casacore/casa/Arrays/IPosition.h>
#include <casacore/scimath/Mathematics/FFTServer.h>
//#include "CudaLBFGS/lbfgs.h"
#include "lbfgs.h"

#ifndef isnan
#define isnan(x) std::isnan(x)
#endif

namespace casa { //# NAMESPACE CASA - BEGIN

// Objective function for the CPU path of the LBFGS solver used by Asp
// (adaptive-scale-pixel) clean: for each Aspen (amplitude, scale) pair it
// builds a Gaussian, convolves it with the PSF via FFT, and evaluates the
// chi-square of the updated residual (cpu_f) and its gradient with respect
// to amplitude and scale (cpu_gradf).
class cpu_objfunc : public cpu_cost_function
{
private:
  int nX;
  int nY;
  unsigned int AspLen;
  casacore::Matrix<casacore::Float> itsMatDirty;
  casacore::Matrix<casacore::Complex> itsPsfFT;
  std::vector<casacore::IPosition> center;
  casacore::Matrix<casacore::Float> newResidual;
  casacore::Matrix<casacore::Float> AspConvPsf;
  casacore::Matrix<casacore::Float> dAspConvPsf;
  casacore::FFTServer<casacore::Float, casacore::Complex> fft;
  casacore::Matrix<casacore::Float> Asp;
  casacore::Matrix<casacore::Float> dAsp;

public:
  cpu_objfunc(const casacore::Matrix<casacore::Float>& dirty,
              const casacore::Matrix<casacore::Complex>& psf,
              const std::vector<casacore::IPosition>& positionOptimum) :
    cpu_cost_function(2),
    itsMatDirty(dirty),
    itsPsfFT(psf),
    center(positionOptimum)
  {
    nX = itsMatDirty.shape()(0);
    nY = itsMatDirty.shape()(1);
    AspLen = center.size();
    newResidual.resize(nX, nY);
    AspConvPsf.resize(nX, nY);
    dAspConvPsf.resize(nX, nY);
    fft = casacore::FFTServer<casacore::Float, casacore::Complex>(itsMatDirty.shape());
    Asp.resize(nX, nY);
    dAsp.resize(nX, nY);
  }

  void cpu_f(const floatdouble *h_x, floatdouble *h_y)
  {
    /*const floatdouble x0 = h_x[0];
    const floatdouble x1 = h_x[1];

    // f = (1-x0)^2 + 100 (x1-x0^2)^2
    const floatdouble a = (1.0 - x0);
    const floatdouble b = (x1 - x0 * x0);
    *h_y = (a*a) + 100.0f * (b*b);*/

    double fx = 0.0;
    double amp = 1;
    const int refi = nX/2;
    const int refj = nY/2;

    int minX = nX - 1;
    int maxX = 0;
    int minY = nY - 1;
    int maxY = 0;

    // First, get the amp * AspenConvPsf for each Aspen to update the residual.
    for (unsigned int k = 0; k < AspLen; k++)
    {
      amp = h_x[2*k];
      double scale = h_x[2*k+1];
      std::cout << "f: amp " << amp << " scale " << scale << std::endl;

      if (isnan(amp) || scale < 0.4) // GSL scale < 0
      {
        //std::cout << "nan? " << amp << " neg scale? " << scale << std::endl;
        // If the scale is small (< 0.4), make it 0 so Hogbom can be used instead and time is saved.
        scale = (scale = fabs(scale)) < 0.4 ? 0 : scale;
        //std::cout << "reset neg scale to " << scale << std::endl;

        if (scale <= 0)
        {
          *h_y = 0;
          return;
        }
      }

      // Generate a Gaussian for each Asp in the Aspen set.
      // x[0]: Amplitude0, x[1]: scale0
      // x[2]: Amplitude1, x[3]: scale1
      // x[2k]: Amplitude(k), x[2k+1]: scale(k)
      //casacore::Matrix<casacore::Float> Asp(nX, nY);
      Asp = 0.0;

      const double sigma5 = 5 * scale / 2;
      const int minI = std::max(0, (int)(center[k][0] - sigma5));
      const int maxI = std::min(nX-1, (int)(center[k][0] + sigma5));
      const int minJ = std::max(0, (int)(center[k][1] - sigma5));
      const int maxJ = std::min(nY-1, (int)(center[k][1] + sigma5));

      if (minI < minX) minX = minI;
      if (maxI > maxX) maxX = maxI;
      if (minJ < minY) minY = minJ;
      if (maxJ > maxY) maxY = maxJ;
      //std::cout << "minI " << minI << " minJ " << minJ << " minX " << minX << " minY " << minY << std::endl;

      for (int j = minJ; j <= maxJ; j++)
      {
        for (int i = minI; i <= maxI; i++)
        {
          const int px = i;
          const int py = j;
          Asp(i,j) = (1.0/(sqrt(2*M_PI)*fabs(scale)))*exp(-(pow(i-center[k][0],2) + pow(j-center[k][1],2))*0.5/pow(scale,2));
        }
      }

      casacore::Matrix<casacore::Complex> AspFT;
      fft.fft0(AspFT, Asp);

      casacore::Matrix<casacore::Complex> cWork;
      cWork = AspFT * itsPsfFT;

      /*start = std::chrono::high_resolution_clock::now();
      casacore::Matrix<casacore::Float> AspConvPsf(itsMatDirty.shape(), AspConvPsfPtr, casacore::TAKE_OVER);
      stop = std::chrono::high_resolution_clock::now();
      duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
      std::cout << "BFGS AspConvPsf takeover runtime " << duration.count() << " us" << std::endl;*/

      fft.fft0(AspConvPsf, cWork, false);
      fft.flip(AspConvPsf, false, false); //need this

      /*casacore::Bool ddelc;
      const casacore::Float *dptrc = AspConvPsf.getStorage(ddelc);
      float *ddptrc = const_cast<float*>(dptrc);
      Eigen::MatrixXf MAspConvPsf = Eigen::Map<Eigen::MatrixXf>(ddptrc, nX, nY);
      AmpAspConvPsfSum = AmpAspConvPsfSum + amp * MAspConvPsf; //optimumstrength*PsfConvAspen
      AspConvPsf.freeStorage(dptrc, ddelc);*/

      // The correct way is to accumulate the sum here, but since we only have one Aspen it doesn't matter.
      /*for (int j = minJ; j < maxJ; ++j)
      {
        for (int i = minI; i < maxI; ++i)
        {
          AmpAspConvPsfSum(i, j) += amp * AspConvPsf(i, j);
        }
      }*/

      // save this for my_df
      //MyP->setterAsp(Asp);
      //MyP->setterAspConvPsf(AspConvPsf);
    } // end get amp * AspenConvPsf

    // Update the residual using the current residual image and the latest Aspen.
    // Sanjay instead used Res = OrigDirty - (active-set Aspens * Psf) in 2004.
    // Both work, but the current approach is simpler and performs well too.
    /*casacore::Bool ddel;
    const casacore::Float *dptr = itsMatDirty.getStorage(ddel);
    float *ddptr = const_cast<float*>(dptr);
    Eigen::MatrixXf Mdirty = Eigen::Map<Eigen::MatrixXf>(ddptr, nX, nY);
    newResidual = Mdirty - AmpAspConvPsfSum;
    itsMatDirty.freeStorage(dptr, ddel);

    // generate ChiSq
    // returns the objective function value
    for (int j = minY; j < maxY; ++j)
    {
      for (int i = minX; i < maxX; ++i)
      {
        fx = fx + double(pow(newResidual(i, j), 2));
      }
    }*/

    //genie
    for (int j = minY; j < maxY; ++j)
    {
      for (int i = minX; i < maxX; ++i)
      {
        newResidual(i, j) = itsMatDirty(i, j) - amp * AspConvPsf(i, j);
        fx = fx + double(pow(newResidual(i, j), 2));
      }
    }
    //genie

    std::cout << "after fx " << fx << std::endl;

    // update newResidual back to the ParamObj
    //MyP->setterRes(newResidual);

    *h_y = fx;
  }

  void cpu_gradf(const floatdouble *h_x, floatdouble *h_grad)
  {
    /*const floatdouble x0 = h_x[0];
    const floatdouble x1 = h_x[1];

    // df/dx0 = -2 (1-x0) - 400 (x1-x0^2) x0
    // df/dx1 = 200 (x1 - x0^2)
    h_grad[0] = -2.0f * (1.0f - x0) - 400.0f * x0 * (x1 - x0*x0);
    h_grad[1] = 200.0f * (x1 - x0*x0);*/
    // Gradient. 0: amplitude; 1: scale.
    // Returns the gradient evaluated at x.
    for (unsigned int k = 0; k < AspLen; k++)
    {
      dAsp = 0.0;

      double amp = h_x[2*k];
      double scale = h_x[2*k+1];
      std::cout << "grad: amp " << amp << " scale " << scale << std::endl;

      if (isnan(amp) || scale < 0.4) // GSL scale < 0
      {
        //std::cout << "grad: nan? " << amp << " neg scale? " << scale << std::endl;
        scale = (scale = fabs(scale)) < 0.4 ? 0.4 : scale; // This cannot be 0 for df to work
        //std::cout << "reset neg scale to " << scale << std::endl;
      }

      const double sigma5 = 5 * scale / 2;
      const int minI = std::max(0, (int)(center[k][0] - sigma5));
      const int maxI = std::min(nX-1, (int)(center[k][0] + sigma5));
      const int minJ = std::max(0, (int)(center[k][1] - sigma5));
      const int maxJ = std::min(nY-1, (int)(center[k][1] + sigma5));

      for (int j = minJ; j <= maxJ; j++)
      {
        for (int i = minI; i <= maxI; i++)
        {
          const int px = i;
          const int py = j;
          //Asp(i,j) = (1.0/(sqrt(2*M_PI)*fabs(scale)))*exp(-(pow(i-center[k][0],2) + pow(j-center[k][1],2))*0.5/pow(scale,2));
          dAsp(i,j) = Asp(i,j) * (((pow(i-center[k][0],2) + pow(j-center[k][1],2)) / pow(scale,2) - 1) / fabs(scale)); // verified by python
        }
      }

      casacore::Matrix<casacore::Complex> dAspFT;
      fft.fft0(dAspFT, dAsp);

      casacore::Matrix<casacore::Complex> dcWork;
      dcWork = dAspFT * itsPsfFT;

      fft.fft0(dAspConvPsf, dcWork, false);
      fft.flip(dAspConvPsf, false, false); //need this

      // Reset grad to 0. This is important to get the correct optimization.
      double dA = 0.0;
      double dS = 0.0;
      //std::cout << "before grad " << 2*k << ": " << gsl_vector_get(grad, 2*k) << std::endl;
      //std::cout << "before grad " << 2*k+1 << ": " << gsl_vector_get(grad, 2*k+1) << std::endl;

      for (int j = minJ; j <= maxJ; j++)
      {
        for (int i = minI; i <= maxI; i++)
        {
          /*// generate derivative of amplitude
          GradAmp(i,j) = (-2) * newResidual(i,j) * AspConvPsf(i,j);
          // generate derivative of scale
          GradScale(i,j) = (-2) * amp * newResidual(i,j) * dAspConvPsf(i,j);
          dA += double(GradAmp(i,j));
          dS += double(GradScale(i,j));*/

          // derivative of amplitude
          dA += double((-2) * newResidual(i,j) * AspConvPsf(i,j));
          // derivative of scale
          dS += double((-2) * amp * newResidual(i,j) * dAspConvPsf(i,j));
        }
      }

      h_grad[2*k] = dA;
      h_grad[2*k+1] = dS;
    } // end of derivatives
  }

  void cpu_f_gradf(const floatdouble *h_x, floatdouble *h_f, floatdouble *h_gradf)
  {
    cpu_f(h_x, h_f);
    cpu_gradf(h_x, h_gradf);
  }
};

/*namespace gpu_rosenbrock_d
{
  __global__ void kernelF(const float *d_x, float *d_y)
  {
    const float &x0 = d_x[0];
    const float &x1 = d_x[1];

    // f = (1-x0)^2 + 100 (x1-x0^2)^2
    const float a = (1.0 - x0);
    const float b = (x1 - x0 * x0);
    *d_y = (a*a) + 100.0f * (b*b);
  }

  __global__ void kernelGradf(const float *d_x, float *d_grad)
  {
    const float x0 = d_x[0];
    const float x1 = d_x[1];

    // df/dx0 = -2 (1-x0) - 400 (x1-x0^2) x0
    // df/dx1 = 200 (x1 - x0^2)
    d_grad[0] = -2.0f * (1.0f - x0) - 400.0f * x0 * (x1 - x0*x0);
    d_grad[1] = 200.0f * (x1 - x0*x0);
  }
}

class gpu_rosenbrock : public cost_function
{
public:
  gpu_rosenbrock() : cost_function(2) {}

  void f(const float *d_x, float *d_y)
  {
    gpu_rosenbrock_d::kernelF<<<1, 1>>>(d_x, d_y);
  }

  void gradf(const float *d_x, float *d_grad)
  {
    gpu_rosenbrock_d::kernelGradf<<<1, 1>>>(d_x, d_grad);
  }

  void f_gradf(const float *d_x, float *d_f, float *d_grad)
  {
    f(d_x, d_f);
    gradf(d_x, d_grad);
  }
};*/

} // end namespace casa

#endif

/*int main(int argc, char **argv)
{
  // CPU
  cpu_rosenbrock rb1;
  lbfgs minimizer1(rb1);
  minimizer1.setGradientEpsilon(1e-3f);

  float x[2] = {2.0f, -1.0f};
  lbfgs::status stat = minimizer1.cpu_lbfgs(x);

  cout << "CPU Rosenbrock: " << x[0] << " " << x[1] << endl;
  cout << minimizer1.statusToString(stat).c_str() << endl;
  // GPU
  gpu_rosenbrock rb2;
  lbfgs minimizer2(rb2);

  x[0] = -4.0f;
  x[1] = 2.0f;

  float *d_x;
  cudaMalloc(&d_x, 2 * sizeof(float));
  cudaMemcpy(d_x, &x, 2 * sizeof(float), cudaMemcpyHostToDevice);

  minimizer2.minimize(d_x);
  cudaMemcpy(&x, d_x, 2 * sizeof(float), cudaMemcpyDeviceToHost);

  cout << "GPU Rosenbrock: " << x[0] << " " << x[1] << endl;

  return 0;
}*/
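
/* Hypothetical usage sketch (not compiled here): how cpu_objfunc might be
   driven by the same lbfgs minimizer as the Rosenbrock example above. The
   image size, toy delta-function PSF, Aspen centre, and starting point are
   placeholders, and the minimizer calls simply mirror the commented-out
   example; they are assumed to match the lbfgs API declared in "lbfgs.h".

int asp_example()
{
  const int n = 128;

  // Toy dirty image and PSF; a real caller would pass the current residual
  // image and the FFT of the measured PSF.
  casacore::Matrix<casacore::Float> dirty(n, n);
  dirty = 0.0f;
  casacore::Matrix<casacore::Float> psf(n, n);
  psf = 0.0f;
  psf(n/2, n/2) = 1.0f;

  casacore::FFTServer<casacore::Float, casacore::Complex> fft(dirty.shape());
  casacore::Matrix<casacore::Complex> psfFT;
  fft.fft0(psfFT, psf);

  // One Aspen centred on the peak of interest; two unknowns (amplitude, scale).
  std::vector<casacore::IPosition> centre;
  centre.push_back(casacore::IPosition(2, n/2, n/2));

  casa::cpu_objfunc objf(dirty, psfFT, centre);
  lbfgs minimizer(objf);
  minimizer.setGradientEpsilon(1e-3f);

  floatdouble x[2] = {1.0f, 2.0f}; // initial guess for amplitude and scale
  lbfgs::status stat = minimizer.cpu_lbfgs(x);

  std::cout << "Aspen amp " << x[0] << " scale " << x[1] << " ("
            << minimizer.statusToString(stat).c_str() << ")" << std::endl;
  return 0;
}
*/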