diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx index 7936bc00ff..ad71d98000 100644 --- a/doc/cpptraj.lyx +++ b/doc/cpptraj.lyx @@ -1174,6 +1174,10 @@ closest watershell \end_layout +\begin_layout LyX-Code +gist +\end_layout + \begin_layout Section General Concepts \end_layout @@ -22490,7 +22494,7 @@ gist \end_inset - [doorder] [doeij] [skipE] [refdens ] [temp ] + [doorder] [doeij] [skipS] [skipE] [refdens ] [temp ] \end_layout \begin_layout LyX-Code @@ -22543,6 +22547,10 @@ literal "true" ). \end_layout +\begin_layout Description +[skipS] Skip all entropy calculations. +\end_layout + \begin_layout Description \series bold @@ -22922,6 +22930,27 @@ refdens keyword, instead of allowing GIST to supply the default value. \end_layout +\begin_layout Standard +For GIST, a GPU accelerated version is available, in which the interaction + energy is calculated using CUDA. + When using the GPU accelerated version of GIST, the +\series bold +doeij +\series default +keyword is not available. + It is recommended to use a grid covering the entire box, when using the + GPU implementation. + You may also choose a smaller grid, but all interaction energies, i.e., each + atom with each atom, will always be calculated independent of the chosen + grid. + This ensures optimum performance when calculating the interaction energies. + Thus, the additional time required to calculate the order parameters ( +\series bold +doorder +\series default +) is negligible. +\end_layout + \begin_layout Subsubsection* \paragraph_spacing other 3 \noindent diff --git a/src/Action_GIST.cpp b/src/Action_GIST.cpp index c928fa26e3..109b58cf6f 100644 --- a/src/Action_GIST.cpp +++ b/src/Action_GIST.cpp @@ -15,6 +15,18 @@ const double Action_GIST::maxD_ = DBL_MAX; Action_GIST::Action_GIST() : +#ifdef CUDA + solvent_(NULL), + NBindex_c_(NULL), + molecule_c_(NULL), + paramsLJ_c_(NULL), + max_c_(NULL), + min_c_(NULL), + result_w_c_(NULL), + result_s_c_(NULL), + result_O_c_(NULL), + result_N_c_(NULL), +#endif gO_(0), gH_(0), Esw_(0), @@ -49,12 +61,17 @@ Action_GIST::Action_GIST() : {} void Action_GIST::Help() const { - mprintf("\t[doorder] [doeij] [skipE] [refdens ] [temp ]\n" + mprintf("\t[doorder] [doeij] [skipE] [skipS] [refdens ] [temp ]\n" "\t[noimage] [gridcntr ] [excludeions]\n" "\t[griddim ] [gridspacn ]\n" "\t[prefix ] [ext ] [out ]\n" "\t[info ]\n" - "Perform Grid Inhomogenous Solvation Theory calculation.\n"); + "Perform Grid Inhomogenous Solvation Theory calculation.\n" +#ifdef CUDA + "The option doeij is not available, when using the CUDA accelerated version,\n" + "as this would need way too much memory." +#endif + ); } Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int debugIn) @@ -99,6 +116,12 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb image_.InitImaging( !(actionArgs.hasKey("noimage")) ); doOrder_ = actionArgs.hasKey("doorder"); doEij_ = actionArgs.hasKey("doeij"); +#ifdef CUDA + if (this->doEij_) { + mprintf("Warning: 'doeij' cannot be specified when using CUDA. Setting is ignored"); + this->doEij_ = false; + } +#endif skipE_ = actionArgs.hasKey("skipE"); if (skipE_) { if (doEij_) { @@ -106,6 +129,9 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb return Action::ERR; } } + + this->skipS_ = actionArgs.hasKey("skipS"); + if (doEij_) { eijfile_ = init.DFL().AddCpptrajFile(prefix_ + "-Eww_ij.dat", "GIST Eij matrix file"); if (eijfile_ == 0) return Action::ERR; @@ -255,6 +281,14 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb EIJ_EN_.resize( numthreads ); } # endif + + #ifdef CUDA + if (this->skipE_ && this->doOrder_) { + mprintf("When the keyword \"skipE\" is supplied, \"doorder\" cannot be" + " chosen, as both calculations are done on the GPU at the same" + " time.\nIgnoring \"doorder!\"\n"); + } + #endif } //Box gbox; @@ -304,7 +338,12 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb "# Crystal Nguyen, Michael K. Gilson, and Tom Young, arXiv:1108.4876v1 (2011)\n" "# Crystal N. Nguyen, Tom Kurtzman Young, and Michael K. Gilson,\n" "# J. Chem. Phys. 137, 044101 (2012)\n" - "# Lazaridis, J. Phys. Chem. B 102, 3531–3541 (1998)\n"); + "# Lazaridis, J. Phys. Chem. B 102, 3531–3541 (1998)\n" +#ifdef CUDA + "#When using the GPU parallelized version of GIST, please cite:\n" + "# Johannes Kraml, Anna S. Kamenik, Franz Waibl, Michael Schauperl, Klaus R. Liedl, JCTC (2019)\n" +#endif + ); gist_init_.Stop(); return Action::OK; } @@ -321,6 +360,10 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) { return Action::ERR; } image_.SetupImaging( setup.CoordInfo().TrajBox().Type() ); + #ifdef CUDA + this->numberAtoms_ = setup.Top().Natom(); + this->solvent_ = new bool[this->numberAtoms_]; + #endif // Get molecule number for each solvent molecule //mol_nums_.clear(); @@ -340,6 +383,9 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) { { if (mol->IsSolvent()) { int o_idx = mol->BeginAtom(); + #ifdef CUDA + this->headAtomType_ = setup.Top()[o_idx].TypeIndex(); + #endif // Check that molecule has correct # of atoms unsigned int molNumAtoms = (unsigned int)mol->NumAtoms(); if (nMolAtoms_ == 0) { @@ -372,6 +418,12 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) { for (unsigned int IDX = 0; IDX != nMolAtoms_; IDX++) { A_idxs_.push_back( o_idx + IDX ); atom_voxel_.push_back( OFF_GRID_ ); + #ifdef CUDA + this->molecule_.push_back( setup.Top()[o_idx + IDX ].MolNum() ); + this->charges_.push_back( setup.Top()[o_idx + IDX ].Charge() ); + this->atomTypes_.push_back( setup.Top()[o_idx + IDX ].TypeIndex() ); + this->solvent_[ o_idx + IDX ] = true; + #endif } NsolventAtoms += nMolAtoms_; // If first solvent molecule, save charges. If not, check that charges match. @@ -407,6 +459,12 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) { A_idxs_.push_back( u_idx ); atom_voxel_.push_back( SOLUTE_ ); NsoluteAtoms++; + #ifdef CUDA + this->molecule_.push_back( setup.Top()[ u_idx ].MolNum() ); + this->charges_.push_back( setup.Top()[ u_idx ].Charge() ); + this->atomTypes_.push_back( setup.Top()[ u_idx ].TypeIndex() ); + this->solvent_[ u_idx ] = false; + #endif } } } @@ -436,6 +494,35 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) { mprintf("\tNo imaging will be performed for energy distance calculations.\n"); } +#ifdef CUDA + NonbondParmType nb = setup.Top().Nonbond(); + this->NBIndex_ = nb.NBindex(); + this->numberAtomTypes_ = nb.Ntypes(); + for (unsigned int i = 0; i < nb.NBarray().size(); ++i) { + this->lJParamsA_.push_back( (float) nb.NBarray().at(i).A() ); + this->lJParamsB_.push_back( (float) nb.NBarray().at(i).B() ); + } + + try { + allocateCuda(((void**)&this->NBindex_c_), this->NBIndex_.size() * sizeof(int)); + allocateCuda((void**)&this->max_c_, 3 * sizeof(float)); + allocateCuda((void**)&this->min_c_, 3 * sizeof(float)); + allocateCuda((void**)&this->result_w_c_, this->numberAtoms_ * sizeof(float)); + allocateCuda((void**)&this->result_s_c_, this->numberAtoms_ * sizeof(float)); + allocateCuda((void**)&this->result_O_c_, this->numberAtoms_ * 4 * sizeof(int)); + allocateCuda((void**)&this->result_N_c_, this->numberAtoms_ * sizeof(int)); + } catch (CudaException &e) { + mprinterr("Error: Could not allocate memory on GPU!\n"); + this->freeGPUMemory(); + return Action::ERR; + } + try { + this->copyToGPU(); + } catch (CudaException &e) { + return Action::ERR; + } +#endif + gist_setup_.Stop(); return Action::OK; } @@ -723,6 +810,8 @@ Action::RetType Action_GIST::DoAction(int frameNum, ActionFrame& frm) { OnGrid_idxs_.clear(); OnGrid_XYZ_.clear(); + // CUDA necessary information + size_t bin_i, bin_j, bin_k; Vec3 const& Origin = gO_->Bin().GridOrigin(); // Loop over each solvent molecule @@ -891,14 +980,20 @@ Action::RetType Action_GIST::DoAction(int frameNum, ActionFrame& frm) { } // END loop over each solvent molecule // Do energy calculation if requested + #ifndef CUDA gist_nonbond_.Start(); if (!skipE_) NonbondEnergy(frm.Frm(), *CurrentParm_); gist_nonbond_.Stop(); + // Do order calculation if requested gist_order_.Start(); if (doOrder_) Order(frm.Frm()); gist_order_.Stop(); + #else + if (! this->skipE_) + NonbondCuda(frm); + #endif gist_action_.Stop(); return Action::OK; @@ -961,62 +1056,65 @@ void Action_GIST::Print() { double Vvox = gO_->Bin().VoxelVolume(); mprintf(" GIST OUTPUT:\n"); + + // The variables are kept outside, so that they are declared for later use. // Calculate orientational entropy DataSet_GridFlt& dTSorient_dens = static_cast( *dTSorient_ ); Farray dTSorient_norm( MAX_GRID_PT_, 0.0 ); double dTSorienttot = 0; int nwtt = 0; double dTSo = 0; - // LOOP over all voxels - mprintf("\tCalculating orientational entropy:\n"); - ProgressBar oe_progress( MAX_GRID_PT_ ); - for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) { - oe_progress.Update( gr_pt ); - dTSorient_dens[gr_pt] = 0; - dTSorient_norm[gr_pt] = 0; - int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel. - nwtt += nw_total; - //mprintf("DEBUG1: %u nw_total %i\n", gr_pt, nw_total); - if (nw_total > 1) { - for (int n0 = 0; n0 < nw_total; n0++) - { - double NNr = 10000; - int q0 = n0 * 4; // Index into voxel_Q_ for n0 - for (int n1 = 0; n1 < nw_total; n1++) + if (! this->skipS_) { + // LOOP over all voxels + mprintf("\tCalculating orientational entropy:\n"); + ProgressBar oe_progress( MAX_GRID_PT_ ); + for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) { + oe_progress.Update( gr_pt ); + dTSorient_dens[gr_pt] = 0; + dTSorient_norm[gr_pt] = 0; + int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel. + nwtt += nw_total; + //mprintf("DEBUG1: %u nw_total %i\n", gr_pt, nw_total); + if (nw_total > 1) { + for (int n0 = 0; n0 < nw_total; n0++) { - if (n0 != n1) { - int q1 = n1 * 4; // Index into voxel_Q_ for n1 - double rR = 2.0 * acos( fabs(voxel_Q_[gr_pt][q1 ] * voxel_Q_[gr_pt][q0 ] + double NNr = 10000; + int q0 = n0 * 4; // Index into voxel_Q_ for n0 + for (int n1 = 0; n1 < nw_total; n1++) + { + if (n0 != n1) { + int q1 = n1 * 4; // Index into voxel_Q_ for n1 + double rR = 2.0 * acos( fabs(voxel_Q_[gr_pt][q1 ] * voxel_Q_[gr_pt][q0 ] + voxel_Q_[gr_pt][q1+1] * voxel_Q_[gr_pt][q0+1] + voxel_Q_[gr_pt][q1+2] * voxel_Q_[gr_pt][q0+2] + voxel_Q_[gr_pt][q1+3] * voxel_Q_[gr_pt][q0+3] )); // add fabs for quaternion distance calculation - //mprintf("DEBUG1: %g\n", rR); - if (rR > 0 && rR < NNr) NNr = rR; - } - } // END inner loop over all waters for this voxel - - if (NNr < 9999 && NNr > 0) { - double dbl = log(NNr*NNr*NNr*nw_total / (3.0*Constants::TWOPI)); - //mprintf("DEBUG1: dbl %f\n", dbl); - dTSorient_norm[gr_pt] += dbl; - dTSo += dbl; - } - } // END outer loop over all waters for this voxel - //mprintf("DEBUG1: dTSorient_norm %f\n", dTSorient_norm[gr_pt]); - dTSorient_norm[gr_pt] = Constants::GASK_KCAL * temperature_ * - ((dTSorient_norm[gr_pt]/nw_total) + Constants::EULER_MASC); - double dtso_norm_nw = (double)dTSorient_norm[gr_pt] * (double)nw_total; - dTSorient_dens[gr_pt] = (dtso_norm_nw / (NFRAME_ * Vvox)); - dTSorienttot += dTSorient_dens[gr_pt]; - //mprintf("DEBUG1: %f\n", dTSorienttot); - } - } // END loop over all grid points (voxels) - dTSorienttot *= Vvox; - infofile_->Printf("Maximum number of waters found in one voxel for %d frames = %d\n", - NFRAME_, max_nwat_); - infofile_->Printf("Total referenced orientational entropy of the grid:" - " dTSorient = %9.5f kcal/mol, Nf=%d\n", dTSorienttot, NFRAME_); + //mprintf("DEBUG1: %g\n", rR); + if (rR > 0 && rR < NNr) NNr = rR; + } + } // END inner loop over all waters for this voxel + if (NNr < 9999 && NNr > 0) { + double dbl = log(NNr*NNr*NNr*nw_total / (3.0*Constants::TWOPI)); + //mprintf("DEBUG1: dbl %f\n", dbl); + dTSorient_norm[gr_pt] += dbl; + dTSo += dbl; + } + } // END outer loop over all waters for this voxel + //mprintf("DEBUG1: dTSorient_norm %f\n", dTSorient_norm[gr_pt]); + dTSorient_norm[gr_pt] = Constants::GASK_KCAL * temperature_ * + ((dTSorient_norm[gr_pt]/nw_total) + Constants::EULER_MASC); + double dtso_norm_nw = (double)dTSorient_norm[gr_pt] * (double)nw_total; + dTSorient_dens[gr_pt] = (dtso_norm_nw / (NFRAME_ * Vvox)); + dTSorienttot += dTSorient_dens[gr_pt]; + //mprintf("DEBUG1: %f\n", dTSorienttot); + } + } // END loop over all grid points (voxels) + dTSorienttot *= Vvox; + infofile_->Printf("Maximum number of waters found in one voxel for %d frames = %d\n", + NFRAME_, max_nwat_); + infofile_->Printf("Total referenced orientational entropy of the grid:" + " dTSorient = %9.5f kcal/mol, Nf=%d\n", dTSorienttot, NFRAME_); + } // Compute translational entropy for each voxel double dTStranstot = 0.0; double dTSt = 0.0; @@ -1034,8 +1132,12 @@ void Action_GIST::Print() { DataSet_GridFlt& dTSsix = static_cast( *dTSsix_ ); Farray dTStrans_norm( MAX_GRID_PT_, 0.0 ); Farray dTSsix_norm( MAX_GRID_PT_, 0.0 ); + // Loop over all grid points - mprintf("\tCalculating translational entropy:\n"); + if (! this->skipS_) + mprintf("\tCalculating translational entropy:\n"); + else + mprintf("Calculating Densities:\n"); ProgressBar te_progress( MAX_GRID_PT_ ); for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) { te_progress.Update( gr_pt ); @@ -1043,126 +1145,126 @@ void Action_GIST::Print() { double W_dens = 1.0 * N_waters_[gr_pt] / (NFRAME_*Vvox); gO[gr_pt] = W_dens / BULK_DENS_; gH[gr_pt] = 1.0 * N_hydrogens_[gr_pt] / (NFRAME_*Vvox*2*BULK_DENS_); - - int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel. - for (int n0 = 0; n0 < nw_total; n0++) - { - double NNd = 10000; - double NNs = 10000; - int i0 = n0 * 3; // index into voxel_xyz_ for n0 - float VX = voxel_xyz_[gr_pt][i0 ]; - float VY = voxel_xyz_[gr_pt][i0+1]; - float VZ = voxel_xyz_[gr_pt][i0+2]; - int q0 = n0 * 4; // index into voxel_Q_ for n0 - float W4 = voxel_Q_[gr_pt][q0 ]; - float X4 = voxel_Q_[gr_pt][q0+1]; - float Y4 = voxel_Q_[gr_pt][q0+2]; - float Z4 = voxel_Q_[gr_pt][q0+3]; - // First do own voxel - for (int n1 = 0; n1 < nw_total; n1++) { - if ( n1 != n0) { - int i1 = n1 * 3; // index into voxel_xyz_ for n1 - double dx = (double)(VX - voxel_xyz_[gr_pt][i1 ]); - double dy = (double)(VY - voxel_xyz_[gr_pt][i1+1]); - double dz = (double)(VZ - voxel_xyz_[gr_pt][i1+2]); - double dd = dx*dx+dy*dy+dz*dz; - if (dd < NNd && dd > 0) { NNd = dd; } - int q1 = n1 * 4; // index into voxel_Q_ for n1 - double rR = 2 * acos( fabs(W4*voxel_Q_[gr_pt][q1 ] + - X4*voxel_Q_[gr_pt][q1+1] + - Y4*voxel_Q_[gr_pt][q1+2] + - Z4*voxel_Q_[gr_pt][q1+3] )); //add fabs for quaternion distance calculation - double ds = rR*rR + dd; - if (ds < NNs && ds > 0) { NNs = ds; } - } - } // END self loop over all waters for this voxel - //mprintf("DEBUG1: self NNd=%f NNs=%f\n", NNd, NNs); - // Determine which directions are possible. - bool cannotAddZ = (nz == 0 || ( gr_pt%nz == nz-1 )); - bool cannotAddY = ((nz == 0 || ny-1 == 0) || ( gr_pt%(nz*(ny-1)+(numplane*addx)) < nz)); - bool cannotAddX = (gr_pt >= addx * (nx-1) && gr_pt < addx * nx ); - bool cannotSubZ = (nz == 0 || gr_pt%nz == 0); - bool cannotSubY = ((nz == 0 || ny == 0) || (gr_pt%addx < nz)); - bool cannotSubX = ((nz == 0 || ny == 0) || (gr_pt < addx)); - bool boundary = ( cannotAddZ || cannotAddY || cannotAddX || - cannotSubZ || cannotSubY || cannotSubX ); - if (!boundary) { - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addy, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addy, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addy, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addy, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy + addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy - addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy + addx, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy - addx, NNd, NNs); - - // add the 8 more voxels for NNr searching - - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy + addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy - addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy + addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy - addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy + addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy - addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy + addz, NNd, NNs); - TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy - addz, NNd, NNs); - - - - NNd = sqrt(NNd); - NNs = sqrt(NNs); - - if (NNd < 3 && NNd > 0/*NNd < 9999 && NNd > 0*/) { - double dbl = log((NNd*NNd*NNd*NFRAME_*4*Constants::PI*BULK_DENS_)/3); - dTStrans_norm[gr_pt] += dbl; - dTSt += dbl; - dbl = log((NNs*NNs*NNs*NNs*NNs*NNs*NFRAME_*Constants::PI*BULK_DENS_)/48); - dTSsix_norm[gr_pt] += dbl; - dTSs += dbl; - //mprintf("DEBUG1: dbl=%f NNs=%f\n", dbl, NNs); + if (! this->skipS_) { + int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel. + for (int n0 = 0; n0 < nw_total; n0++) + { + double NNd = 10000; + double NNs = 10000; + int i0 = n0 * 3; // index into voxel_xyz_ for n0 + float VX = voxel_xyz_[gr_pt][i0 ]; + float VY = voxel_xyz_[gr_pt][i0+1]; + float VZ = voxel_xyz_[gr_pt][i0+2]; + int q0 = n0 * 4; // index into voxel_Q_ for n0 + float W4 = voxel_Q_[gr_pt][q0 ]; + float X4 = voxel_Q_[gr_pt][q0+1]; + float Y4 = voxel_Q_[gr_pt][q0+2]; + float Z4 = voxel_Q_[gr_pt][q0+3]; + // First do own voxel + for (int n1 = 0; n1 < nw_total; n1++) { + if ( n1 != n0) { + int i1 = n1 * 3; // index into voxel_xyz_ for n1 + double dx = (double)(VX - voxel_xyz_[gr_pt][i1 ]); + double dy = (double)(VY - voxel_xyz_[gr_pt][i1+1]); + double dz = (double)(VZ - voxel_xyz_[gr_pt][i1+2]); + double dd = dx*dx+dy*dy+dz*dz; + if (dd < NNd && dd > 0) { NNd = dd; } + int q1 = n1 * 4; // index into voxel_Q_ for n1 + double rR = 2 * acos( fabs(W4*voxel_Q_[gr_pt][q1 ] + + X4*voxel_Q_[gr_pt][q1+1] + + Y4*voxel_Q_[gr_pt][q1+2] + + Z4*voxel_Q_[gr_pt][q1+3] )); //add fabs for quaternion distance calculation + double ds = rR*rR + dd; + if (ds < NNs && ds > 0) { NNs = ds; } + } + } // END self loop over all waters for this voxel + //mprintf("DEBUG1: self NNd=%f NNs=%f\n", NNd, NNs); + // Determine which directions are possible. + bool cannotAddZ = (nz == 0 || ( gr_pt%nz == nz-1 )); + bool cannotAddY = ((nz == 0 || ny-1 == 0) || ( gr_pt%(nz*(ny-1)+(numplane*addx)) < nz)); + bool cannotAddX = (gr_pt >= addx * (nx-1) && gr_pt < addx * nx ); + bool cannotSubZ = (nz == 0 || gr_pt%nz == 0); + bool cannotSubY = ((nz == 0 || ny == 0) || (gr_pt%addx < nz)); + bool cannotSubX = ((nz == 0 || ny == 0) || (gr_pt < addx)); + bool boundary = ( cannotAddZ || cannotAddY || cannotAddX || + cannotSubZ || cannotSubY || cannotSubX ); + if (!boundary) { + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addy, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addy, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addy, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addy, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy + addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy - addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy + addx, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy - addx, NNd, NNs); + + // add the 8 more voxels for NNr searching + + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy + addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy - addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy + addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy - addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy + addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy - addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy + addz, NNd, NNs); + TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy - addz, NNd, NNs); + + + NNd = sqrt(NNd); + NNs = sqrt(NNs); + + if (NNd < 3 && NNd > 0/*NNd < 9999 && NNd > 0*/) { + double dbl = log((NNd*NNd*NNd*NFRAME_*4*Constants::PI*BULK_DENS_)/3); + dTStrans_norm[gr_pt] += dbl; + dTSt += dbl; + dbl = log((NNs*NNs*NNs*NNs*NNs*NNs*NFRAME_*Constants::PI*BULK_DENS_)/48); + dTSsix_norm[gr_pt] += dbl; + dTSs += dbl; + //mprintf("DEBUG1: dbl=%f NNs=%f\n", dbl, NNs); + } } + } // END loop over all waters for this voxel + if (dTStrans_norm[gr_pt] != 0) { + nwts += nw_total; + dTStrans_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTStrans_norm[gr_pt]/nw_total) + + Constants::EULER_MASC ); + dTSsix_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTSsix_norm[gr_pt]/nw_total) + + Constants::EULER_MASC ); } - } // END loop over all waters for this voxel - if (dTStrans_norm[gr_pt] != 0) { - nwts += nw_total; - dTStrans_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTStrans_norm[gr_pt]/nw_total) + - Constants::EULER_MASC ); - dTSsix_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTSsix_norm[gr_pt]/nw_total) + - Constants::EULER_MASC ); + double dtst_norm_nw = (double)dTStrans_norm[gr_pt] * (double)nw_total; + dTStrans[gr_pt] = (dtst_norm_nw / (NFRAME_*Vvox)); + double dtss_norm_nw = (double)dTSsix_norm[gr_pt] * (double)nw_total; + dTSsix[gr_pt] = (dtss_norm_nw / (NFRAME_*Vvox)); + dTStranstot += dTStrans[gr_pt]; + } // END loop over all grid points (voxels) + } + if (!this->skipS_) { + dTStranstot *= Vvox; + double dTSst = 0.0; + double dTStt = 0.0; + if (nwts > 0) { + dTSst = Constants::GASK_KCAL*temperature_*((dTSs/nwts) + Constants::EULER_MASC); + dTStt = Constants::GASK_KCAL*temperature_*((dTSt/nwts) + Constants::EULER_MASC); } - double dtst_norm_nw = (double)dTStrans_norm[gr_pt] * (double)nw_total; - dTStrans[gr_pt] = (dtst_norm_nw / (NFRAME_*Vvox)); - double dtss_norm_nw = (double)dTSsix_norm[gr_pt] * (double)nw_total; - dTSsix[gr_pt] = (dtss_norm_nw / (NFRAME_*Vvox)); - dTStranstot += dTStrans[gr_pt]; - } // END loop over all grid points (voxels) - - dTStranstot *= Vvox; - double dTSst = 0.0; - double dTStt = 0.0; - if (nwts > 0) { - dTSst = Constants::GASK_KCAL*temperature_*((dTSs/nwts) + Constants::EULER_MASC); - dTStt = Constants::GASK_KCAL*temperature_*((dTSt/nwts) + Constants::EULER_MASC); + double dTSot = Constants::GASK_KCAL*temperature_*((dTSo/nwtt) + Constants::EULER_MASC); + infofile_->Printf("watcount in vol = %d\n", nwtt); + infofile_->Printf("watcount in subvol = %d\n", nwts); + infofile_->Printf("Total referenced translational entropy of the grid:" + " dTStrans = %9.5f kcal/mol, Nf=%d\n", dTStranstot, NFRAME_); + infofile_->Printf("Total 6d if all one vox: %9.5f kcal/mol\n", dTSst); + infofile_->Printf("Total t if all one vox: %9.5f kcal/mol\n", dTStt); + infofile_->Printf("Total o if all one vox: %9.5f kcal/mol\n", dTSot); } - double dTSot = Constants::GASK_KCAL*temperature_*((dTSo/nwtt) + Constants::EULER_MASC); - infofile_->Printf("watcount in vol = %d\n", nwtt); - infofile_->Printf("watcount in subvol = %d\n", nwts); - infofile_->Printf("Total referenced translational entropy of the grid:" - " dTStrans = %9.5f kcal/mol, Nf=%d\n", dTStranstot, NFRAME_); - infofile_->Printf("Total 6d if all one vox: %9.5f kcal/mol\n", dTSst); - infofile_->Printf("Total t if all one vox: %9.5f kcal/mol\n", dTStt); - infofile_->Printf("Total o if all one vox: %9.5f kcal/mol\n", dTSot); - // Compute average voxel energy. Allocate these sets even if skipping energy // to be consistent with previous output. DataSet_GridFlt& Esw_dens = static_cast( *Esw_ ); @@ -1177,13 +1279,17 @@ void Action_GIST::Print() { Farray Eww_norm( MAX_GRID_PT_, 0.0 ); Farray neighbor_dens( MAX_GRID_PT_, 0.0 ); if (!skipE_) { + #ifndef CUDA Darray const& E_UV_VDW = E_UV_VDW_[0]; Darray const& E_UV_Elec = E_UV_Elec_[0]; Darray const& E_VV_VDW = E_VV_VDW_[0]; Darray const& E_VV_Elec = E_VV_Elec_[0]; + #endif Farray const& Neighbor = neighbor_[0]; + #ifndef CUDA // Sum values from other threads if necessary SumEVV(); + #endif static const double DEBYE_EA = 0.20822678; // 1 Debye in eA double Eswtot = 0.0; double Ewwtot = 0.0; @@ -1192,15 +1298,26 @@ void Action_GIST::Print() { for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) { E_progress.Update( gr_pt ); + //mprintf("DEBUG1: VV vdw=%f elec=%f\n", E_VV_VDW_[gr_pt], E_VV_Elec_[gr_pt]); int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel. if (nw_total > 1) { + #ifndef CUDA Esw_dens[gr_pt] = (E_UV_VDW[gr_pt] + E_UV_Elec[gr_pt]) / (NFRAME_ * Vvox); Esw_norm[gr_pt] = (E_UV_VDW[gr_pt] + E_UV_Elec[gr_pt]) / nw_total; Eww_dens[gr_pt] = (E_VV_VDW[gr_pt] + E_VV_Elec[gr_pt]) / (2 * NFRAME_ * Vvox); Eww_norm[gr_pt] = (E_VV_VDW[gr_pt] + E_VV_Elec[gr_pt]) / (2 * nw_total); + #else + double esw = this->Esw_->operator[](gr_pt); + double eww = this->Eww_->operator[](gr_pt); + Esw_dens[gr_pt] = esw / (this->NFRAME_ * Vvox); + Esw_norm[gr_pt] = esw / nw_total; + Eww_dens[gr_pt] = eww / (this->NFRAME_ * Vvox); + Eww_norm[gr_pt] = eww / nw_total; + #endif Eswtot += Esw_dens[gr_pt]; Ewwtot += Eww_dens[gr_pt]; + } else { Esw_dens[gr_pt]=0; Esw_norm[gr_pt]=0; @@ -1212,9 +1329,9 @@ void Action_GIST::Print() { if (nw_total > 0) { qtet[gr_pt] /= nw_total; //mprintf("DEBUG1: neighbor= %8.1f nw_total= %8i\n", neighbor[gr_pt], nw_total); - neighbor_norm[gr_pt] = 1.0 * Neighbor[gr_pt] / nw_total; + neighbor_norm[gr_pt] = (double)Neighbor[gr_pt] / nw_total; } - neighbor_dens[gr_pt] = 1.0 * Neighbor[gr_pt] / (NFRAME_ * Vvox); + neighbor_dens[gr_pt] = (double)Neighbor[gr_pt] / (NFRAME_ * Vvox); dipolex[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox); dipoley[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox); dipolez[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox); @@ -1227,6 +1344,17 @@ void Action_GIST::Print() { infofile_->Printf("Total water-solute energy of the grid: Esw = %9.5f kcal/mol\n", Eswtot); infofile_->Printf("Total unreferenced water-water energy of the grid: Eww = %9.5f kcal/mol\n", Ewwtot); + } else { + static const double DEBYE_EA = 0.20822678; + for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) + { + dipolex[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox); + dipoley[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox); + dipolez[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox); + pol[gr_pt] = sqrt( dipolex[gr_pt]*dipolex[gr_pt] + + dipoley[gr_pt]*dipoley[gr_pt] + + dipolez[gr_pt]*dipolez[gr_pt] ); + } } // Write the GIST output file. @@ -1300,4 +1428,175 @@ void Action_GIST::Print() { gist_order_.WriteTiming(2, "Order: ", gist_action_.Total()); gist_print_.WriteTiming(1, "Print:", total); mprintf("TIME:\tTotal: %.4f s\n", total); + #ifdef CUDA + this->freeGPUMemory(); + #endif } + +#ifdef CUDA +void Action_GIST::NonbondCuda(ActionFrame frm) { + // Simply to get the information for the energetic calculations + std::vector eww_result(this->numberAtoms_); + std::vector esw_result(this->numberAtoms_); + std::vector > order_indices; + this->gist_nonbond_.Start(); + + Matrix_3x3 ucell_m, recip_m; + float *recip = NULL; + float *ucell = NULL; + int boxinfo; + + // Check Boxinfo and write the necessary data into recip, ucell and boxinfo. + switch(this->image_.ImageType()) { + case NONORTHO: + recip = new float[9]; + ucell = new float[9]; + frm.Frm().BoxCrd().ToRecip(ucell_m, recip_m); + for (int i = 0; i < 9; ++i) { + ucell[i] = (float) ucell_m.Dptr()[i]; + recip[i] = (float) recip_m.Dptr()[i]; + } + boxinfo = 2; + break; + case ORTHO: + recip = new float[9]; + for (int i = 0; i < 3; ++i) { + recip[i] = (float) frm.Frm().BoxCrd()[i]; + } + ucell = NULL; + boxinfo = 1; + break; + case NOIMAGE: + recip = NULL; + ucell = NULL; + boxinfo = 0; + break; + default: + mprinterr("Error: Unexpected box information found."); + return; + } + + std::vector result_o = std::vector(4 * this->numberAtoms_); + std::vector result_n = std::vector(this->numberAtoms_); + // Call the GPU Wrapper, which subsequently calls the kernel, after setup operations. + // Must create arrays from the vectors, does that by getting the address of the first element of the vector. + std::vector > e_result = doActionCudaEnergy(frm.Frm().xAddress(), this->NBindex_c_, this->numberAtomTypes_, this->paramsLJ_c_, this->molecule_c_, boxinfo, recip, ucell, this->numberAtoms_, this->min_c_, + this->max_c_, this->headAtomType_,this->NeighborCut2_, &(result_o[0]), &(result_n[0]), this->result_w_c_, + this->result_s_c_, this->result_O_c_, this->result_N_c_, this->doOrder_); + eww_result = e_result.at(0); + esw_result = e_result.at(1); + + if (this->doOrder_) { + int counter = 0; + for (unsigned int i = 0; i < (4 * this->numberAtoms_); i += 4) { + ++counter; + std::vector temp; + for (unsigned int j = 0; j < 4; ++j) { + temp.push_back(result_o.at(i + j)); + } + order_indices.push_back(temp); + } + } + + delete[] recip; // Free memory + delete[] ucell; // Free memory + + for (unsigned int sidx = 0; sidx < NSOLVENT_; sidx++) { + int headAtomIndex = O_idxs_[sidx]; + size_t bin_i, bin_j, bin_k; + const double *vec = frm.Frm().XYZ(headAtomIndex); + int voxel = -1; + if (this->gO_->Bin().Calc(vec[0], vec[1], vec[2], bin_i, bin_j, bin_k)) { + voxel = this->gO_->CalcIndex(bin_i, bin_j, bin_k); + this->neighbor_.at(0).at(voxel) += result_n.at(headAtomIndex); + // This is not nice, as it assumes that O is set before the two Hydrogens + // might be the case, but is still not nice (in my opinion) + for (unsigned int IDX = 0; IDX != nMolAtoms_; IDX++) { + this->Esw_->UpdateVoxel(voxel, esw_result.at(headAtomIndex + IDX)); + this->Eww_->UpdateVoxel(voxel, eww_result.at(headAtomIndex + IDX)); + } + // Order calculation + if (this->doOrder_) { + double sum = 0; + Vec3 cent( frm.Frm().xAddress() + (headAtomIndex) * 3 ); + std::vector vectors; + switch(this->image_.ImageType()) { + case NONORTHO: + case ORTHO: + { + Matrix_3x3 ucell, recip; + frm.Frm().BoxCrd().ToRecip(ucell, recip); + Vec3 vec(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(0) * 3)); + vectors.push_back( MinImagedVec(vec, cent, ucell, recip)); + vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(1) * 3)); + vectors.push_back( MinImagedVec(vec, cent, ucell, recip)); + vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(2) * 3)); + vectors.push_back( MinImagedVec(vec, cent, ucell, recip)); + vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(3) * 3)); + vectors.push_back( MinImagedVec(vec, cent, ucell, recip)); + } + break; + default: + vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(0) * 3) ) - cent ); + vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(1) * 3) ) - cent ); + vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(2) * 3) ) - cent ); + vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(3) * 3) ) - cent ); + } + + for (int i = 0; i < 3; ++i) { + for (int j = i + 1; j < 4; ++j) { + double cosThet = (vectors.at(i) * vectors.at(j)) / sqrt(vectors.at(i).Magnitude2() * vectors.at(j).Magnitude2()); + sum += (cosThet + 1.0/3) * (cosThet + 1.0/3); + } + } + this->order_norm_->UpdateVoxel(voxel, 1.0 - (3.0/8.0) * sum); + } + } + + } + this->gist_nonbond_.Stop(); +} + +/** + * Frees all the Memory on the GPU. + */ +void Action_GIST::freeGPUMemory(void) { + freeCuda(this->NBindex_c_); + freeCuda(this->molecule_c_); + freeCuda(this->paramsLJ_c_); + freeCuda(this->max_c_); + freeCuda(this->min_c_); + freeCuda(this->result_w_c_); + freeCuda(this->result_s_c_); + freeCuda(this->result_O_c_); + freeCuda(this->result_N_c_); + this->NBindex_c_ = NULL; + this->molecule_c_ = NULL; + this->paramsLJ_c_ = NULL; + this->max_c_ = NULL; + this->min_c_ = NULL; + this->result_w_c_= NULL; + this->result_s_c_= NULL; + this->result_O_c_ = NULL; + this->result_N_c_ = NULL; +} + +/** + * Copies data from the CPU to the GPU. + * @throws: CudaException + */ +void Action_GIST::copyToGPU(void) { + try { + copyMemoryToDevice(&(this->NBIndex_[0]), this->NBindex_c_, this->NBIndex_.size() * sizeof(int)); + copyMemoryToDeviceStruct(&(this->charges_[0]), &(this->atomTypes_[0]), this->solvent_, &(this->molecule_[0]), this->numberAtoms_, &(this->molecule_c_), + &(this->lJParamsA_[0]), &(this->lJParamsB_[0]), this->lJParamsA_.size(), &(this->paramsLJ_c_)); + } catch (CudaException &ce) { + this->freeGPUMemory(); + mprinterr("Error: Could not copy data to the device.\n"); + throw ce; + } catch (std::exception &e) { + this->freeGPUMemory(); + throw e; + } +} +#endif diff --git a/src/Action_GIST.h b/src/Action_GIST.h index 575850dc59..d30dba6ac5 100644 --- a/src/Action_GIST.h +++ b/src/Action_GIST.h @@ -3,14 +3,21 @@ #include "Action.h" #include "ImagedAction.h" #include "Timer.h" +#ifdef CUDA +#include "cuda_kernels/GistCudaSetup.cuh" +#endif class DataSet_3D; class DataSet_MatrixFlt; + /// Class for applying Grid Inhomogenous Solvation Theory /** \author Daniel R. Roe */ class Action_GIST : public Action { public: Action_GIST(); + #ifdef CUDA + ~Action_GIST() {delete[] this->solvent_;} + #endif DispatchObject* Alloc() const { return (DispatchObject*)new Action_GIST(); } void Help() const; private: @@ -27,6 +34,39 @@ class Action_GIST : public Action { void Order(Frame const&); void SumEVV(); +#ifdef CUDA + // Additional data for GPU calculation + + std::vector lJParamsA_; + std::vector lJParamsB_; + std::vector charges_; + std::vector atomTypes_; + std::vector NBIndex_; + std::vector molecule_; + + unsigned int numberAtoms_; + int numberAtomTypes_; + int headAtomType_; + bool *solvent_; + + // Arrays on GPU + int *NBindex_c_; + void *molecule_c_; + void *paramsLJ_c_; + float *max_c_; + float *min_c_; + float *result_w_c_; + float *result_s_c_; + int *result_O_c_; + int *result_N_c_; + + // CUDA only functions + void freeGPUMemory(void); + void copyToGPU(void); + void NonbondCuda(ActionFrame); + +#endif + static const Vec3 x_lab_; static const Vec3 y_lab_; static const Vec3 z_lab_; @@ -121,5 +161,6 @@ class Action_GIST : public Action { bool doEij_; ///< If true do the i-j energy calc bool skipE_; ///< If true skip the nonbond energy calc bool includeIons_; ///< If true include ions in solute region. + bool skipS_; ///< If true does not calculate entropy }; #endif diff --git a/src/Version.h b/src/Version.h index b254030f4a..288607a342 100644 --- a/src/Version.h +++ b/src/Version.h @@ -12,7 +12,7 @@ * Whenever a number that precedes is incremented, all subsequent * numbers should be reset to 0. */ -#define CPPTRAJ_INTERNAL_VERSION "V4.24.0" +#define CPPTRAJ_INTERNAL_VERSION "V4.25.0" /// PYTRAJ relies on this #define CPPTRAJ_VERSION_STRING CPPTRAJ_INTERNAL_VERSION #endif diff --git a/src/cuda_kernels/CMakeLists.txt b/src/cuda_kernels/CMakeLists.txt index 8739581321..4961542e79 100644 --- a/src/cuda_kernels/CMakeLists.txt +++ b/src/cuda_kernels/CMakeLists.txt @@ -1,4 +1,4 @@ -set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu) +set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu) cuda_add_library(cpptraj_cuda_routines ${CPPTRAJ_CUDA_SOURCES}) diff --git a/src/cuda_kernels/GistCudaCalc.cu b/src/cuda_kernels/GistCudaCalc.cu new file mode 100644 index 0000000000..ddd08875a5 --- /dev/null +++ b/src/cuda_kernels/GistCudaCalc.cu @@ -0,0 +1,464 @@ +#include "GistCudaCalc.cuh" + +#include + +#define ELECTOAMBER_2 332.05221729 + + + /** + * Calculate the squared distance in an orthorhombic box. See cpptraj implementation. + * @param vec1: The first point of the distance calculation. + * @param vec2: The seconf point of the distance calculation + * @param box: The boxinfo of the object. + * @return: The minimal distance in an orthorhombic box. + */ +__device__ +float dist2_imageOrtho(float *vec1, float *vec2, BoxInfo box) { + if (box[0] == 0 || box[1] == 0 || box[2] == 0) { + return -1; + } + float x = vec1[0] - vec2[0]; + float y = vec1[1] - vec2[1]; + float z = vec1[2] - vec2[2]; + + if (x < 0) { + x *= -1; + } + if (y < 0) { + y *= -1; + } + if (z < 0) { + z *= -1; + } + + while ( x > box[0] ) { + x -= box[0]; + } + while ( y > box[1] ) { + y -= box[1]; + } + while ( z > box[2] ) { + z -= box[2]; + } + + float dist = box[0] - x; + if ( dist < x ) { + x = dist; + } + dist = box[1] - y; + if ( dist < y ) { + y = dist; + } + dist = box[2] - z; + if ( dist < z ) { + z = dist; + } + + return x * x + y * y + z * z; +} + +/** + * Calculate M * v. + * @param vec: The vector v. + * @param mat3x3: The matrix M. + * @param ret: The values to be returned. If null, returns into vec. + */ +__device__ +void scalarProd(float* vec, BoxInfo mat3x3, float *ret) { + if (ret != NULL) { + ret[0] = vec[0] * mat3x3[0] + vec[1] * mat3x3[1] + vec[2] * mat3x3[2]; + ret[1] = vec[0] * mat3x3[3] + vec[1] * mat3x3[4] + vec[2] * mat3x3[5]; + ret[2] = vec[0] * mat3x3[6] + vec[1] * mat3x3[7] + vec[2] * mat3x3[8]; + } else { + float x = vec[0]; + float y = vec[1]; + float z = vec[2]; + vec[0] = x * mat3x3[0] + y * mat3x3[1] + z * mat3x3[2]; + vec[1] = x * mat3x3[3] + y * mat3x3[4] + z * mat3x3[5]; + vec[2] = x * mat3x3[6] + y * mat3x3[7] + z * mat3x3[8]; + } +} + +/** + * Find the result_O squared distance in a non-orthogonal box. + * @param vec1: Position vector of point 1. + * @param vec2: Position vector of point 2. + * @param recip: The inverse of the ucell. + * @param ucell: The box matrix. + * @return: The minimal squared distance between two atoms, also considering the images. + */ +__device__ +float dist2_imageNonOrtho(float *vec1, float *vec2, BoxInfo recip, UnitCell ucell) { + float vecRecip1[3]; + float vecRecip2[3]; + scalarProd(vec1, recip, vecRecip1); + scalarProd(vec2, recip, vecRecip2); + float r_2 = dist2_imageNonOrthoRecip(vecRecip1, vecRecip2, ucell); + + return r_2; +} + +/** + * Calculate if the distance to an image is smaller than a given distance. + * @param f: The first vector in the reciprocal space. + * @param vec2Cartesian: The second vector in cartesian coordinates. + * @param nx: Which neighbour in x direction? + * @param ny: Which neighbour in y direction? + * @param nz: Which neighbour in z direction? + * @param ucell: The box matrix. + * @param finalMin: The already calculated minimum. + * @return: The new minimum, if it is smaller than finalMin, finalMin otherwise. + */ +__device__ +float calcIfDistIsSmaller(float *f, float *vec2Cartesian, int nx, int ny, int nz, UnitCell ucell, float finalMin) { + float fx = f[0] + nx; + float fy = f[1] + ny; + float fz = f[2] + nz; + // Bring f back in Cartesian coordinates + float x = fx * ucell[0] + fy * ucell[3] + fz * ucell[6]; + float y = fx * ucell[1] + fy * ucell[4] + fz * ucell[7]; + float z = fx * ucell[2] + fy * ucell[5] + fz * ucell[8]; + x -= vec2Cartesian[0]; + y -= vec2Cartesian[1]; + z -= vec2Cartesian[2]; + float min = x * x + y * y + z * z; + if ( min < finalMin || finalMin < 0) { + return min; + } + return finalMin; +} + +/** + * Calculate the distance in a non-orthorhombic box, if the vectors are already + * multiplied by the inverse box matrix. + * @param vec1: The first position vector. + * @param vec2: The second position vector. + * @param ucell: The box cell. + * @return: The minimal distance between images. + */ +__device__ +float dist2_imageNonOrthoRecip(float * vec1, float * vec2, UnitCell ucell) { + + // Bring the points back into the main unit cell + float fx = vec1[0] - floor(vec1[0]); + float fy = vec1[1] - floor(vec1[1]); + float fz = vec1[2] - floor(vec1[2]); + float f2x = vec2[0] - floor(vec2[0]); + float f2y = vec2[1] - floor(vec2[1]); + float f2z = vec2[2] - floor(vec2[2]); + + float vec[3] = {fx, fy, fz}; + + // Bring f2 back in cartesian coordinates + float xFactor = f2x * ucell[0] + f2y * ucell[3] + f2z * ucell[6]; + float yFactor = f2x * ucell[1] + f2y * ucell[4] + f2z * ucell[7]; + float zFactor = f2x * ucell[2] + f2y * ucell[5] + f2z * ucell[8]; + float vec2Real[3] = {xFactor, yFactor, zFactor}; + + // Now the different cases, and always store the minimum + // Define the finalMinimum as a negative value, since it can never + // actually be negative this is fine. + float finalMinimum = -1.0; + + // Run through all cells + for (int ix = -1; ix <= 1; ++ix) { + for (int iy = -1; iy <= 1; ++iy) { + for (int iz = -1; iz <= 1; ++iz) { + finalMinimum = calcIfDistIsSmaller(vec, vec2Real, ix, iy, iz, ucell, finalMinimum); + } + } + } + + return finalMinimum; +} + +/** + * Calculates the distance between two vectors, without imaging. + * @param vec1: The position vector of the first atom. + * @param vec2: The position vector of the second atom. + * @return: The squared distance between the two positions. + */ +__device__ +float dist2_noImage(float *vec1, float *vec2) { + float x = vec1[0] - vec2[0]; + float y = vec1[1] - vec2[1]; + float z = vec1[2] - vec2[2]; + + return x*x + y*y + z*z; +} + +/** + * Caclulate the total energy between two atoms. + * @param vec1: The position vector of atom 1. + * @param vec2: The position vector of atom 2. + * @param q1: The charge of atom 1. + * @param q2: The charge of atom 2. + * @param LJA: The lennard jones A parameter. + * @param LJB: The lennard jones B parameter. + * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise. + * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2, + * or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0. + * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise. + * @return: The total interaction energy between the two atoms. + */ +__device__ +float calcTotalEnergy(float q1, float q2, + float LJA, float LJB, float r_2) { +#ifdef DEBUG_GIST_CUDA + if (r_2 <= 0.000001 && r_2 >= -0.000001) { + printf("(%8.3f, %8.3f, %8.3f) (%8.3f, %8.3f, %8.3f) %d\n", vec1[0], vec1[1], vec1[2], vec2[0], vec2[1], vec2[2], boxinfo); + } +#endif + return calcVdWEnergy(r_2, LJA, LJB) + calcElectrostaticEnergy(r_2, q1, q2); +} + +/** + * Calculate the distance between two different points. + * @param vec1: The first vector to calculate the distance. + * @param vec2: The second vector to calculate the distance. + * @param recip_o_box: The boxinfo, either the box or the reciprocal. + * @param ucell: The unitcell of a box. + * @return: The squared distance between two points. + */ +__device__ +float calcDist(float *vec1, float *vec2, BoxInfo recip_o_box, + UnitCell ucell) { + float r_2 = 0; + switch(recip_o_box.boxinfo) { + case 0: + r_2 = dist2_noImage(vec1, vec2); + break; + case 1: + // Uses recip for box info as well; + r_2 = dist2_imageOrtho(vec1, vec2, recip_o_box); + break; + case 2: + r_2 = dist2_imageNonOrtho(vec1, vec2, recip_o_box, ucell); + break; + default: + r_2 = 0; + } + return r_2; +} + +/** + * Calculate the Van der Waals energy between two atoms. + * @param r_2: The squared distance between the two atoms. + * @param LJA: The A part of the lennard jones potential. + * @param LJB: The B part if the Lennard Jones potential. + * @return: The Van der Waals energy. + */ +__device__ +float calcVdWEnergy(float r_2, float LJA, float LJB) { + float r_6 = r_2 * r_2 * r_2; + float r_12 = r_6 * r_6; + float LJ = LJA / r_12 - LJB / r_6; + return LJ; +} + +/** + * Calculate the electrostatic energy between two different atoms. + * @param r_2: The square distance between the two atoms. + * @param q1: The charge of atom 1. + * @param q2: The charge of atom 2. + * @return: The electrostatic energy between the two atoms. + */ +__device__ +float calcElectrostaticEnergy(float r_2, float q1, float q2) { + double charge = q1 * q2 * ELECTOAMBER_2; + double r = sqrt(r_2); + float ELE = charge / r; + return ELE; +} + +/** + * Get the index into the lennard jones parameter array. + * @param a1: The atom type index of atom 1. + * @param a2: The atom type index of atom 2. + * @param NBindex: The arrays holding the indices into the LJ array. + * @param ntypes: The number of atom types. + * @return: The index into the parameter arrays. + */ +__device__ +int getLJIndex(int a1, int a2, int *NBindex, int ntypes) { + return NBindex[a1 * ntypes + a2]; +} + +/** + * Get the LJ parameters from a parameter array. + * @param a1: The atom type index of atom 1. + * @param a2: The atom type index of atom 2. + * @param NBindex: The indices into the parameter array. + * @param ntypes: The number of atom types. + * @param paramsLJ: The parameter array. + * @return: The LJ parameter belonging to the atom type pair a1, a2. + */ +__device__ +ParamsLJ getLJParam(int a1, int a2, int *NBindex, int ntypes, ParamsLJ *paramsLJ) { + int idx = getLJIndex(a1, a2, NBindex, ntypes); + if (idx < 0) { + return ParamsLJ(); + } + return paramsLJ[idx]; +} + +/** + * Checks if atom is on grid. + * @param vec: Position vector to the atom. + * @param min: The grid starting position. + * @param max: The end of the grid. + * @return: True if vector is on grid. + */ +__device__ +bool isOnGrid(float *vec, float *min, float *max) { + return ( ( (vec[0] >= min[0]) && (vec[0] <= max[0]) ) && + ( (vec[1] >= min[1]) && (vec[1] <= max[1]) ) && + ( (vec[2] >= min[2]) && (vec[2] <= max[2]) ) ); +} + +/** + * Calculate the energy on the GPU. + * @param coords: An array holding all the coordinates of all atoms. + * @param NBindex: An array holding indices into the LJ parameter arrays. + * @param ntypes: The number of atom types. + * @param paramsLJA: The A LJ parameters. + * @param paramsLJB: The B LJ parameters. + * @param charges: The charges of the atoms. + * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise. + * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2, + * or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0. + * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise. + * @param maxAtoms: The number of atoms in the system. + * @param a_types: The different atom types of the atoms. + * @param solvent: True if atom is a solvent atom, false otherwise. + * @param molecule: The number of the molecule this atom belong to. + * @param result_ww: The result of the water - water interactions. + * @param result_sw: The result of the solute - water interactions. + * @param min: The minimum values of the grid. + * @param max: The maximum values of the grid. + */ +__global__ +void cudaCalcEnergy(Coordinates *coords, int *NBindex, int ntypes, ParamsLJ *parameterLJ, AtomProperties *atomProps, + BoxInfo recip_o_box, UnitCell ucell, int maxAtoms, float *result_ww, float *result_sw, + float *min, float *max, int headAtomType, float neighbourCut2, int *result_O, int *result_N) { + + + int a1 = blockIdx.x * blockDim.x + threadIdx.x; + int a2 = blockIdx.y * blockDim.y + threadIdx.y; + if ( (a1 >= maxAtoms) || (a2 >= maxAtoms) || (a1 == a2)) { + return; + } + + AtomProperties atom1 = atomProps[a1]; + AtomProperties atom2 = atomProps[a2]; + + // Do not calculate if the two values are the same or they belong to the same molecule. + if ( (atom1.molecule != atom2.molecule)) { + + Coordinates t1 = coords[a1]; + Coordinates t2 = coords[a2]; + ParamsLJ lj = getLJParam(atom1.atomType, atom2.atomType, NBindex, ntypes, parameterLJ); + + float vec1[3] = {t1.x, t1.y, t1.z}; + float vec2[3] = {t2.x, t2.y, t2.z}; + float r_2 = calcDist(vec1, vec2, recip_o_box, ucell); + float energy = calcTotalEnergy(atom1.charge, atom2.charge, lj.A, lj.B, r_2); + + if (atom2.solvent != 0) { + atomicAdd(&(result_ww[a1]), energy * 0.5); + } else { + atomicAdd(&(result_sw[a1]), energy); + } + + } +} + +/** + * Calculate the energy on the GPU. This implementation is somewhat slower, + * but is able to calculate the order parameters as well as the rest. + * @param coords: An array holding all the coordinates of all atoms. + * @param NBindex: An array holding indices into the LJ parameter arrays. + * @param ntypes: The number of atom types. + * @param paramsLJA: The A LJ parameters. + * @param paramsLJB: The B LJ parameters. + * @param charges: The charges of the atoms. + * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise. + * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2, + * or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0. + * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise. + * @param maxAtoms: The number of atoms in the system. + * @param a_types: The different atom types of the atoms. + * @param solvent: True if atom is a solvent atom, false otherwise. + * @param molecule: The number of the molecule this atom belong to. + * @param result_ww: The result of the water - water interactions. + * @param result_sw: The result of the solute - water interactions. + * @param min: The minimum values of the grid. + * @param max: The maximum values of the grid. + */ +__global__ +void cudaCalcEnergySlow(Coordinates *coords, int *NBindex, int ntypes, ParamsLJ *parameterLJ, AtomProperties *atomProps, + BoxInfo recip_o_box, UnitCell ucell, int maxAtoms, float *result_ww, float *result_sw, + float *min, float *max, int headAtomType, float neighbourCut2, int *result_O, int *result_N) { + + int a1 = blockIdx.x * blockDim.x + threadIdx.x; + + if (a1 >= maxAtoms) { + return; + } + + AtomProperties atom1 = atomProps[a1]; + float distances[4] = {HUGE_C, HUGE_C, HUGE_C, HUGE_C}; + result_N[a1] = 0; + result_O[4 * a1 + 3] = 0; + result_O[4 * a1 + 2] = 0; + result_O[4 * a1 + 1] = 0; + result_O[4 * a1 ] = 0; + for (int a2 = 0; a2 < maxAtoms; ++a2) { + AtomProperties atom2 = atomProps[a2]; + // Do not calculate if the two values are the same or they belong to the same molecule. + if ( (a1 != a2) && (atom1.molecule != atom2.molecule)) { + Coordinates t1 = coords[a1]; + Coordinates t2 = coords[a2]; + ParamsLJ lj = getLJParam(atom1.atomType, atom2.atomType, NBindex, ntypes, parameterLJ); + float vec1[3] = {t1.x, t1.y, t1.z}; + float vec2[3] = {t2.x, t2.y, t2.z}; + float r_2 = calcDist(vec1, vec2, recip_o_box, ucell); + float energy = calcTotalEnergy(atom1.charge, atom2.charge, lj.A, lj.B, r_2); + if ((atom2.atomType == headAtomType) && atom2.solvent && atom1.solvent) { + if (r_2 < distances[0]) { + distances[3] = distances[2]; + distances[2] = distances[1]; + distances[1] = distances[0]; + distances[0] = r_2; + result_O[4 * a1 + 3] = result_O[4 * a1 + 2]; + result_O[4 * a1 + 2] = result_O[4 * a1 + 1]; + result_O[4 * a1 + 1] = result_O[4 * a1 ]; + result_O[4 * a1 ] = a2; + } else if (r_2 < distances[1]) { + distances[3] = distances[2]; + distances[2] = distances[1]; + distances[1] = r_2; + result_O[4 * a1 + 3] = result_O[4 * a1 + 2]; + result_O[4 * a1 + 2] = result_O[4 * a1 + 1]; + result_O[4 * a1 + 1] = a2; + } else if (r_2 < distances[2]) { + distances[3] = distances[2]; + distances[2] = r_2; + result_O[4 * a1 + 3] = result_O[4 * a1 + 2]; + result_O[4 * a1 + 2] = a2; + } else if (r_2 < distances[3]) { + distances[3] = r_2; + result_O[4 * a1 + 3] = a2; + } + if (r_2 < neighbourCut2) { + result_N[a1] += 1; + } + } + if (atom2.solvent != 0) { + result_ww[a1] += energy / 2.0; + } else { + result_sw[a1] += energy; + } + } + } +} \ No newline at end of file diff --git a/src/cuda_kernels/GistCudaCalc.cuh b/src/cuda_kernels/GistCudaCalc.cuh new file mode 100644 index 0000000000..9f46d9b798 --- /dev/null +++ b/src/cuda_kernels/GistCudaCalc.cuh @@ -0,0 +1,367 @@ +#ifndef GIST_CUDA_CALC_CUH +#define GIST_CUDA_CALC_CUH + +#define HUGE_C 1e30f +#define BLOCKSIZE 16 +#define SLOW_BLOCKSIZE 512 + +/** + * Coordinates class. + * + * Holds three different values for the coordinates, to achieve + * a faster access on the GPU (only a single read from the memory + * instead of three different reads, also the memory is better + * aligned). + * @author Johannes Kraml + */ +class Coordinates { +public: + float x; + float y; + float z; + + // Empty Constructor + __host__ __device__ + Coordinates(): x(0), y(0), z(0) {} + + /** + * Constructor using an array of three values. + * @param array: The array that holds the x, y and z coordinates + */ + __host__ __device__ + Coordinates(const double *array) { + this->x = array[0]; + this->y = array[1]; + this->z = array[2]; + } + + /** + * Copy Constructor. + * @param other: The other object. + */ + __host__ __device__ + Coordinates(const Coordinates &other) { + this->x = other.x; + this->y = other.y; + this->z = other.z; + } + + /** + * The assignment operator for an object of this class. + * @param other: The other Coordinate object. + * @return This object. + */ + __host__ __device__ + Coordinates &operator=(const Coordinates &other) { + this->x = other.x; + this->y = other.y; + this->z = other.z; + return *this; + } + +}; + +/** + * An implementation holding the A and B values for a + * Lennard-Jones van der Waals interaction energy calculation. + * @author Johannes Kraml + */ +class ParamsLJ{ +public: + float A; + float B; + + // Empty constructor + __host__ __device__ + ParamsLJ(): A(0), B(0) {} + + /** + * Constructor from an array of values. + * @param arr: The array that holds the A and B values. + */ + __host__ __device__ + ParamsLJ(float *arr) { + this->A = arr[0]; + this->B = arr[1]; + } + + /** + * Constructor using two different floats. + * @param A: The A value in the Lennard-Jones equation. + * @param B: The B value in the Lennard-Jones equation. + */ + __host__ __device__ + ParamsLJ(float A, float B) { + this->A = A; + this->B = B; + } + + /** + * Copy constructor for this class. + * @param other: The other object of this class + */ + __host__ __device__ + ParamsLJ(const ParamsLJ &other) { + this->A = other.A; + this->B = other.B; + } + + /** + * The assignment operator of this class. + * @param other: The other object of this type. + * @return this object. + */ + __host__ __device__ + ParamsLJ &operator=(const ParamsLJ &other) { + this->A = other.A; + this->B = other.B; + return *this; + } +}; + +/** + * The implementation of the Unit Cell on the GPU. + * @author Johannes Kraml + */ +class UnitCell { +public: + float array[9]; + + // The empty constuctor. + __host__ __device__ + UnitCell() {} + + /** + * Constructor using an array. + * @param arr: The array from which the values should be + * taken. + */ + __host__ __device__ + UnitCell(float *arr) { + this->array[0] = arr[0]; + this->array[1] = arr[1]; + this->array[2] = arr[2]; + this->array[3] = arr[3]; + this->array[4] = arr[4]; + this->array[5] = arr[5]; + this->array[6] = arr[6]; + this->array[7] = arr[7]; + this->array[8] = arr[8]; + } + + /** + * Copy constructor. + * @param other: The other object. + */ + __host__ __device__ + UnitCell(const UnitCell &other) { + this->array[0] = other.array[0]; + this->array[1] = other.array[1]; + this->array[2] = other.array[2]; + this->array[3] = other.array[3]; + this->array[4] = other.array[4]; + this->array[5] = other.array[5]; + this->array[6] = other.array[6]; + this->array[7] = other.array[7]; + this->array[8] = other.array[8]; + } + + /** + * Assignment operator. + * @param other: The other object. + * @return This object. + */ + __host__ __device__ + UnitCell &operator=(const UnitCell &other){ + this->array[0] = other.array[0]; + this->array[1] = other.array[1]; + this->array[2] = other.array[2]; + this->array[3] = other.array[3]; + this->array[4] = other.array[4]; + this->array[5] = other.array[5]; + this->array[6] = other.array[6]; + this->array[7] = other.array[7]; + this->array[8] = other.array[8]; + return *this; + } + + /** + * Access operator. + * @param idx: The index which should be accessed. + * + * @exceptsafe Not safe. + */ + __host__ __device__ + float operator[](int idx) { + if (idx >= 0 && idx < 9) { + return this->array[idx]; + } + return 1; + } + +}; + +/** + * Class holding the box information, i.e., which kind of box + * and the box dimensions. + */ +class BoxInfo { +public: + float array[9]; ///< box dimensions + int boxinfo; + + // Empty constructor + __host__ __device__ + BoxInfo(): boxinfo(0) {} + + /** + * Constructor using an array and the boxinfo. + * @param arr: The array holding the values for the box dimensions. + * @param boxinfo: Which kind of box this is. + */ + __host__ __device__ + BoxInfo(float *arr, int boxinfo) { + this->array[0] = arr[0]; + this->array[1] = arr[1]; + this->array[2] = arr[2]; + this->array[3] = arr[3]; + this->array[4] = arr[4]; + this->array[5] = arr[5]; + this->array[6] = arr[6]; + this->array[7] = arr[7]; + this->array[8] = arr[8]; + this->boxinfo = boxinfo; + } + + /** + * Copy constuctor. + * @param other: The other object. + */ + __host__ __device__ + BoxInfo(const BoxInfo &other) { + this->array[0] = other.array[0]; + this->array[1] = other.array[1]; + this->array[2] = other.array[2]; + this->array[3] = other.array[3]; + this->array[4] = other.array[4]; + this->array[5] = other.array[5]; + this->array[6] = other.array[6]; + this->array[7] = other.array[7]; + this->array[8] = other.array[8]; + this->boxinfo = other.boxinfo; + } + + /** + * Assignement operator. + * @param other: The other object. + * @return This object. + */ + __host__ __device__ + BoxInfo &operator=(const BoxInfo &other){ + this->array[0] = other.array[0]; + this->array[1] = other.array[1]; + this->array[2] = other.array[2]; + this->array[3] = other.array[3]; + this->array[4] = other.array[4]; + this->array[5] = other.array[5]; + this->array[6] = other.array[6]; + this->array[7] = other.array[7]; + this->array[8] = other.array[8]; + this->boxinfo = other.boxinfo; + return *this; + } + + /** + * Access operator. + * @param idx: The index at which to access the array. + * @return The value stored at that point in the array or 0 if none is found. + * @exceptsafe Is not exception safe. + */ + __host__ __device__ + float operator[](int idx) { + if (idx < 9 && idx >= 0) { + return this->array[idx]; + } + return 0; + } + +}; + +/** + * Class to store different Atom properties, like charge, atomType, solvent and the + * molecule this atom belongs to. + * @author Johannes Kraml + */ +class AtomProperties { +public: + float charge; + int atomType; + bool solvent; + int molecule; + + // Empty constructor + __host__ __device__ + AtomProperties() {} + + /** + * Constructor. + * @param charge: The charge of the atom. + * @param atomType: The atom type for access of the lennard-jones parameters. + * @param solvent: Is this atom a solvent atom? + * @param molecule: The molecule this atom belongs to. + */ + __host__ __device__ + AtomProperties(float charge, int atomType, bool solvent, int molecule) { + this->charge = charge; + this->atomType = atomType; + this->solvent = solvent; + this->molecule = molecule; + } + + /** + * Copy constructor. + * @param other: The other object. + */ + __host__ __device__ + AtomProperties(const AtomProperties &other) { + this->charge = other.charge; + this->atomType = other.atomType; + this->solvent = other.solvent; + this->molecule = other.molecule; + } + + /** + * Assignement operator. + * @param other: The other object. + * @return This object. + */ + __host__ __device__ + AtomProperties &operator=(const AtomProperties &other) { + this->charge = other.charge; + this->atomType = other.atomType; + this->solvent = other.solvent; + this->molecule = other.molecule; + return *this; + } +}; + + +// Device functions +__device__ float dist2_imageOrtho(float *, float *, BoxInfo); +__device__ void scalarProd(float* , BoxInfo , float *); +__device__ float dist2_imageNonOrtho(float *, float *, BoxInfo, UnitCell); +__device__ float calcIfDistIsSmaller(float *, float *, int , int , int , UnitCell, float ); +__device__ float dist2_imageNonOrthoRecip(float * , float * , UnitCell); +__device__ float dist2_noImage(float *, float *); +__device__ float calcTotalEnergy(float , float , float , float , float); +__device__ float calcVdWEnergy(float , float , float ); +__device__ float calcElectrostaticEnergy(float, float, float); +__device__ int getLJIndex(int , int , int *, int ); +__device__ ParamsLJ getLJParam(int , int , int *, int , ParamsLJ *); +__device__ bool isOnGrid(float *, float *, float *); +__device__ float calcDist(float *, float *, BoxInfo, UnitCell); + +// Global functions +__global__ void cudaCalcEnergy (Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *, float *, float *, int, float, int *, int *); +__global__ void cudaCalcEnergySlow(Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *, float *, float *, int, float, int *, int *); +#endif diff --git a/src/cuda_kernels/GistCudaSetup.cu b/src/cuda_kernels/GistCudaSetup.cu new file mode 100644 index 0000000000..d257da38b5 --- /dev/null +++ b/src/cuda_kernels/GistCudaSetup.cu @@ -0,0 +1,238 @@ +#include "GistCudaSetup.cuh" +#include "GistCudaCalc.cuh" + + +/** + * Allocate memory on the GPU. + * @param array: The pointer to the array, which will be allocated on the GPU. + * @param size: An integer giving the size of the array, which will be allocated. + * @throws: CudaException if a problem occurs. + */ +__host__ +void allocateCuda(void **array, int size) { + // Check if the array is actually free, if not, it will be freed + // (fun fact: checking is not necessary, one could also simply free the memory). + if ((*array) != NULL) { + cudaFree(*array); + } + // If something goes wrong, throw exception + if (cudaMalloc(array, size) != cudaSuccess) { + throw CudaException(); + } +} + +/** + * Copy memory from the CPU to the GPU. + * @param array: The array from which the values shall be copied. + * @param array_c: The array on the device, to which the values shall be copied. + * @param size: The size of the stuff which will be copied. + * @throws: CudaException if something goes wrong. + */ +__host__ +void copyMemoryToDevice(void *array, void *array_c, int size) { + // If something goes wrong, throw exception + // In this case only copying can go wrong. + if (cudaMemcpy(array_c, array, size, cudaMemcpyHostToDevice) != cudaSuccess) { + throw CudaException(); + } +} + +/** + * A simple helper function that copies a lot of stuff to the GPU (as structs). + * @param charge: An array holding the charges for the different atoms. + * @param atomtype: An array holding the integers for the atom types of the different atoms. + * @param solvent: An array of boolean values, holding the information whether a certain atom is solvent or solute. + * @param atomNumber: The total number of atoms. + * @param atomProps_c: A pointer to an array on the GPU, which will hold the atom properties. + * @param ljA: An array holding the lennard-jones parameter A for each atom type pair. + * @param ljB: An array holding the lennard-jones parameter B for each atom type pair. + * @param length: The length of the two aforementioned arrays (ljA & ljB). + * @param lJparams_c: A pointer to an array on the GPU, which will hold the lj parameters. + * @throws: CudaException if something bad happens. + */ +__host__ +void copyMemoryToDeviceStruct(float *charge, int *atomtype, bool *solvent, int *molecule, int atomNumber, void **atomProps_c, + float *ljA, float *ljB, int length, void **lJparams_c) { + // Check if the two arrays are free. Again, this could be removed (but will stay!) + if ((*atomProps_c) != NULL) { + cudaFree(*atomProps_c); + } + if ((*lJparams_c) != NULL) { + cudaFree(*lJparams_c); + } + // Allocate the necessary memory on the GPU. + if (cudaMalloc(atomProps_c, atomNumber * sizeof(AtomProperties)) != cudaSuccess) { + throw CudaException(); + } + if (cudaMalloc(lJparams_c, length * sizeof(ParamsLJ)) != cudaSuccess) { + throw CudaException(); + } + + // Create an array for the lennard-jones parameters. + ParamsLJ *ljp = (ParamsLJ *) malloc (length * sizeof(ParamsLJ)); + // Add the lennard-jones parameters to the array. + for (int i = 0; i < length; ++i) { + ljp[i] = ParamsLJ(ljA[i], ljB[i]); + } + + // Create an array for the atom properties. + AtomProperties *array = (AtomProperties *)malloc(atomNumber * sizeof(AtomProperties)); + // Add the properties into the array. + for (int i = 0; i < atomNumber; ++i) { + array[i] = AtomProperties(charge[i], atomtype[i], solvent[i], molecule[i]); + } + // Copy the memory from the host to the device. + if (cudaMemcpy((*atomProps_c), array, atomNumber * sizeof(AtomProperties), cudaMemcpyHostToDevice) != cudaSuccess) { + throw CudaException(); + } + if (cudaMemcpy((*lJparams_c), ljp, length * sizeof(ParamsLJ), cudaMemcpyHostToDevice) != cudaSuccess) { + throw CudaException(); + } + + // Free the two arrays (so that no memory leak occurs). + free(ljp); + free(array); +} + +/** + * Free an array on the CUDA capable device. + * @param array: The array you want to free. + */ +__host__ +void freeCuda(void *array) { + cudaFree(array); +} + + +// This is coded C-like, but uses exceptions. +/** + * This starts the cuda kernel, thus it is actually a quite long function. + */ +__host__ +std::vector > doActionCudaEnergy(const double *coords, int *NBindex_c, int ntypes, void *parameter, void *molecule_c, + int boxinfo, float *recip_o_box, float *ucell, int maxAtoms, float *min_c, float *max_c, int headAtomType, + float neighbourCut2, int *result_o, int *result_n, float *result_w_c, float *result_s_c, + int *result_O_c, int *result_N_c, bool doorder) { + Coordinates *coords_c = NULL; + float *recip_b_c = NULL; + float *ucell_c = NULL; + + + + float *result_A = (float *) calloc(maxAtoms, sizeof(float)); + float *result_s = (float *) calloc(maxAtoms, sizeof(float)); + // TODO: Fix this, test is actually a quite bad name here! + Coordinates *coord_array = (Coordinates *) calloc(maxAtoms, sizeof(Coordinates)); + + // Casting + AtomProperties *sender = (AtomProperties *) molecule_c; + ParamsLJ *lennardJonesParams = (ParamsLJ *) parameter; + + // Create Boxinfo and Unit cell. This is actually very important for the speed (otherwise + // there would be LOTS of access to non-local variables). + BoxInfo boxinf; + if (boxinfo != 0) { + boxinf = BoxInfo(recip_o_box, boxinfo); + } + UnitCell ucellN; + if (boxinfo == 2) { + ucellN = UnitCell(ucell); + } + + // Add the coordinates to the array. + // TODO: Fix Test here also! + for (int i = 0; i < maxAtoms; ++i) { + coord_array[i] = Coordinates(&coords[i * 3]); + } + + // vectors that will return the necessary information. + std::vector > result; + std::vector result_esw; + std::vector result_eww; + + // Allocate space on the GPU + if (cudaMalloc(&coords_c, maxAtoms * sizeof(Coordinates)) != cudaSuccess) { + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + + + // Copy the data to the GPU + if (cudaMemcpy(coords_c, coord_array, maxAtoms * sizeof(Coordinates), cudaMemcpyHostToDevice) != cudaSuccess) { + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + if (cudaMemcpy(result_w_c, result_A, maxAtoms * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) { + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + if (cudaMemcpy(result_s_c, result_s, maxAtoms * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) { + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + + // If the doorder calculation is used, it needs to calculate everything differently, so the slow version is used + // (this is about 10% slower). + if (doorder) { + cudaCalcEnergySlow<<< (maxAtoms + SLOW_BLOCKSIZE) / SLOW_BLOCKSIZE, SLOW_BLOCKSIZE >>> (coords_c, NBindex_c, ntypes, lennardJonesParams, sender, + boxinf, ucellN, maxAtoms, result_w_c, result_s_c, min_c, max_c, + headAtomType, neighbourCut2, result_O_c, result_N_c); + } else { + // Uses a 2D array, which is nice for memory access. + dim3 threadsPerBlock(BLOCKSIZE, BLOCKSIZE); + dim3 numBlocks((maxAtoms + threadsPerBlock.x) / threadsPerBlock.x, (maxAtoms + threadsPerBlock.y) / threadsPerBlock.y); + // The actual call of the device function + cudaCalcEnergy<<>> (coords_c, NBindex_c, ntypes, lennardJonesParams, sender, + boxinf, ucellN, maxAtoms, result_w_c, result_s_c, min_c, max_c, + headAtomType, neighbourCut2, result_O_c, result_N_c); + // Check if there was an error. + cudaError_t cudaError = cudaGetLastError(); + if (cudaError != cudaSuccess) { + printf("returned %s\n", cudaGetErrorString(cudaError)); + } + } + // Return the results of the calculation to the main memory + if (cudaMemcpy(result_A, result_w_c, maxAtoms * sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) { + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + + + if (cudaMemcpy(result_s, result_s_c, maxAtoms * sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) { + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + + + + if (cudaMemcpy(result_o, result_O_c, maxAtoms * 4 * sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) { + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + + if (cudaMemcpy(result_n, result_N_c, maxAtoms * sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) { + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + throw CudaException(); + } + + for (int i = 0; i < maxAtoms; ++i) { + result_eww.push_back(result_A[i]); + result_esw.push_back(result_s[i]); + } + + result.push_back(result_eww); + result.push_back(result_esw); + + // Free everything used in here. + cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c); + free(result_A); free(result_s); free(coord_array); + + return result; +} \ No newline at end of file diff --git a/src/cuda_kernels/GistCudaSetup.cuh b/src/cuda_kernels/GistCudaSetup.cuh new file mode 100644 index 0000000000..9e13948efd --- /dev/null +++ b/src/cuda_kernels/GistCudaSetup.cuh @@ -0,0 +1,27 @@ +#ifndef GIST_CUDA_SETUP_CUH +#define GIST_CUDA_SETUP_CUH + +#include +#include +#include + + + +// Exception classes +class CudaException : public std::exception { +public: + CudaException() { + } +}; + +// Function definitions +void allocateCuda(void **, int); +void copyMemoryToDevice(void *, void *, int); +void copyMemoryToDeviceStruct(float *, int *, bool *, int *, int, void **, float *, float *, int, void **); +void freeCuda(void *); +std::vector > doActionCudaEnergy(const double *, int *, int , void *, void *, + int , float *, float *, int , float *, + float *, int, float, int *, int *, float *, float *, + int *, int *, bool); + +#endif diff --git a/src/cuda_kernels/Makefile b/src/cuda_kernels/Makefile index 46b44bdb93..cf663ef027 100644 --- a/src/cuda_kernels/Makefile +++ b/src/cuda_kernels/Makefile @@ -10,7 +10,7 @@ TARGET = libcpptraj_cuda.a # Source files -CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu +CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu # Objects diff --git a/test/MasterTest.sh b/test/MasterTest.sh index a9fc070345..0566a8920e 100644 --- a/test/MasterTest.sh +++ b/test/MasterTest.sh @@ -963,6 +963,13 @@ CheckEnv() { 'fftw' ) TestLibrary "FFTW" "$CPPTRAJ_FFTW_FFT" ;; 'openmp' ) TestLibrary "OpenMP" "$CPPTRAJ_OPENMP" ;; 'singleensemble' ) TestLibrary "Single ensemble support" "$CPPTRAJ_SINGLE_ENS" ;; + 'cuda' ) TestLibrary "CUDA" "$CPPTRAJ_CUDA" ;; + 'notcuda' ) + if [ ! -z "$CPPTRAJ_CUDA" ]; then + echo " $DESCRIP cannot be run on CUDA." + ((CHECKERR++)) + fi + ;; 'pnetcdf' ) if [ ! -z "$DO_PARALLEL" ] ; then TestLibrary "Parallel NetCDF" "$CPPTRAJ_PNETCDFLIB" diff --git a/test/Test_GIST/RunTest.sh b/test/Test_GIST/RunTest.sh index 1feb9da292..28edac51bb 100755 --- a/test/Test_GIST/RunTest.sh +++ b/test/Test_GIST/RunTest.sh @@ -4,30 +4,34 @@ CleanFiles gist.in gist.out gist-*.dx ww_Eij.dat Eww_ij.dat \ Gist1-*.dx Gist1-*.dat Gist2-*.dx Gist2-*.dat +INPUT="-i gist.in" TESTNAME='GIST tests' Requires netcdf notparallel -INPUT="-i gist.in" - +UNITNAME='Eww Test' +CheckFor notcuda # doeij test with much smaller grid to save memory -cat > gist.in < gist.in < gist.in <