From b79a76583c5627ae68be4bedb2b527988b8aa4c3 Mon Sep 17 00:00:00 2001
From: jokr91 <hannes.kraml@gmail.com>
Date: Tue, 19 Nov 2019 15:42:18 +0100
Subject: [PATCH 1/4] Added GPU Compatibility to GIST

Added changes in the Makefile of the CUDA code, to include the correct
compilation of the GIST CUDA implementation.

The call to the GPU is done as an extra function, instead of the call to
NonbondEnergy, NonbondCuda is called. Since the NonbondCuda also
directly can calculate the order parameters, the function Order is also
replaced by the single call to NonbondCuda.
I added #ifdef statements around each change, so that the changes are
only effecting the code, when compiled with CUDA. If the code is
compiled without CUDA, the code is exactly the same as in the cpptraj
implementation.
Some further new functions were added to the implementation, so that the
NonbondCuda can be called easily from the rest of the code.
These functions include a possibility to free the GPUMemory
(freeGPUMemory) from the main program, a function to copy stuff from
the Host memory to the CUDA capable device (copyToGPU).
A few more variables were added, which are used for copying of data to
and from the GPU, and keeping track of a variety of atom parameters
(like charge, atom type, ...).

Most of the time, when working on the host memory, the vector class from
the STL is used, this is not true for the solvent_ array. The reason for
this is a feature concerning vector<bool>, which is not an array holding
boolean values, but a bit string, where at each bit, one true or false
value is stored. For copying the memory to the device, the use of this
functionality is not an option, thus the solvent_ array remains an
allocated array in the host memory.

The source files for the device code were implemented in the same way as
in GIGIST (compare www.github.com/liedllab/gigist). However, they were
all moved together into the cuda_kernels.

In the RunTest.sh, a different test for the CUDA version is supplied.
Actually, everything stays the same, except for the comparison with
Eww_ij, which is not calculated on the GPU, for memory reasons and thus
can also not be tested against. This is done by using the CheckEnv
function to recognize the cuda and notcuda keywords for the Requires and
CheckFor statements. The answer to why the Eww_ij is not implemented on
the GPU is quite simple: Consider the following example.
If a grid is constructed, consisting of 100 x 100 x 100 voxels, with a
grid spacing of 0.5 Angstrom, this is actually just a box of size 50 x
50 x 50 Angstrom. But if one would calculated the entire Eww_ij matrix
for this kind of calculation, a total of 10^12 data points would need to
be saved. This results from the calculation to compare each voxel with
each other. Given 100 x 100 x 100 voxels, this results in (10^6)^2 data
points, which evaluates to 10^12. At each data point, a value for the
energy has to be stored, when using ASCII-code, this will probably take
about 10 characters, resulting in a final file size of 10TB (and this
does not even consider the 4TB of memory that would be needed).
---
 src/Action_GIST.cpp                | 272 ++++++++++++++++-
 src/Action_GIST.h                  |  40 +++
 src/cuda_kernels/CMakeLists.txt    |   2 +-
 src/cuda_kernels/ConstantsG.cuh    |  69 +++++
 src/cuda_kernels/GistCudaCalc.cu   | 464 +++++++++++++++++++++++++++++
 src/cuda_kernels/GistCudaCalc.cuh  | 367 +++++++++++++++++++++++
 src/cuda_kernels/GistCudaSetup.cu  | 239 +++++++++++++++
 src/cuda_kernels/GistCudaSetup.cuh |  26 ++
 src/cuda_kernels/Makefile          |   2 +-
 test/MasterTest.sh                 |   7 +
 test/Test_GIST/RunTest.sh          |  28 +-
 11 files changed, 1498 insertions(+), 18 deletions(-)
 create mode 100644 src/cuda_kernels/ConstantsG.cuh
 create mode 100644 src/cuda_kernels/GistCudaCalc.cu
 create mode 100644 src/cuda_kernels/GistCudaCalc.cuh
 create mode 100644 src/cuda_kernels/GistCudaSetup.cu
 create mode 100644 src/cuda_kernels/GistCudaSetup.cuh
diff --git a/src/Action_GIST.cpp b/src/Action_GIST.cpp
index 196e8574f2..0a006ced1e 100644
--- a/src/Action_GIST.cpp
+++ b/src/Action_GIST.cpp
@@ -14,6 +14,18 @@
 const double Action_GIST::maxD_ = DBL_MAX;
 
 Action_GIST::Action_GIST() :
+#ifdef CUDA
+	solvent_(NULL),
+	NBindex_c_(NULL),
+	molecule_c_(NULL),
+	paramsLJ_c_(NULL),
+	max_c_(NULL),
+	min_c_(NULL),
+	result_w_c_(NULL),
+	result_s_c_(NULL),
+	result_O_c_(NULL),
+	result_N_c_(NULL),
+#endif
   gO_(0),
   gH_(0),
   Esw_(0),
@@ -53,7 +65,12 @@ void Action_GIST::Help() const {
           "\t[griddim <xval> <yval> <zval>] [gridspacn <spaceval>]\n"
           "\t[prefix <filename prefix>] [ext <grid extension>] [out <output>]\n"
           "\t[info <info>]\n"
-          "Perform Grid Inhomogenous Solvation Theory calculation.\n");
+          "Perform Grid Inhomogenous Solvation Theory calculation.\n"
+#ifdef CUDA
+          "The option doeij is not available, when using the CUDA accelerated version,\n"
+          "as this would need way too much memory."
+#endif
+          );
 }
 
 Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int debugIn)
@@ -98,6 +115,12 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
   image_.InitImaging( !(actionArgs.hasKey("noimage")) );
   doOrder_ = actionArgs.hasKey("doorder");
   doEij_ = actionArgs.hasKey("doeij");
+#ifdef CUDA
+  if (this->doEij_) {
+    mprintf("Warning: 'doeij' cannot be specified when using CUDA. Setting is ignored");
+    this->doEij_ = false;
+  }
+#endif
   skipE_ = actionArgs.hasKey("skipE");
   if (skipE_) {
     if (doEij_) {
@@ -303,7 +326,9 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
           "#    Crystal Nguyen, Michael K. Gilson, and Tom Young, arXiv:1108.4876v1 (2011)\n"
           "#    Crystal N. Nguyen, Tom Kurtzman Young, and Michael K. Gilson,\n"
           "#      J. Chem. Phys. 137, 044101 (2012)\n"
-          "#    Lazaridis, J. Phys. Chem. B 102, 3531–3541 (1998)\n");
+          "#    Lazaridis, J. Phys. Chem. B 102, 3531–3541 (1998)\n"
+          "#If you use the GPU parallelized version of GIST, please cite:\n"
+          "#    Johannes Kraml, Anna S. Kamenik, Franz Waibl, Michael Schauperl, Klaus R. Liedl, JCTC (2019)\n");
   gist_init_.Stop();
   return Action::OK;
 }
@@ -320,6 +345,10 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
     return Action::ERR;
   }
   image_.SetupImaging( setup.CoordInfo().TrajBox().Type() );
+  #ifdef CUDA
+  this->numberAtoms_ = setup.Top().Natom();
+  this->solvent_ = new bool[this->numberAtoms_];
+  #endif
 
   // Get molecule number for each solvent molecule
   //mol_nums_.clear();
@@ -339,6 +368,9 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
   {
     if (mol->IsSolvent()) {
       int o_idx = mol->BeginAtom();
+      #ifdef CUDA
+      this->headAtomType_ = setup.Top()[o_idx].TypeIndex();
+      #endif
       // Check that molecule has correct # of atoms
       unsigned int molNumAtoms = (unsigned int)mol->NumAtoms();
       if (nMolAtoms_ == 0) {
@@ -371,6 +403,12 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
       for (unsigned int IDX = 0; IDX != nMolAtoms_; IDX++) {
         A_idxs_.push_back( o_idx + IDX );
         atom_voxel_.push_back( OFF_GRID_ );
+        #ifdef CUDA
+        this->molecule_.push_back( setup.Top()[o_idx + IDX ].MolNum() );
+        this->charges_.push_back( setup.Top()[o_idx + IDX ].Charge() );
+        this->atomTypes_.push_back( setup.Top()[o_idx + IDX ].TypeIndex() );
+        this->solvent_[ o_idx + IDX ] = true;
+        #endif
       }
       NsolventAtoms += nMolAtoms_;
       // If first solvent molecule, save charges. If not, check that charges match.
@@ -406,6 +444,12 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
           A_idxs_.push_back( u_idx );
           atom_voxel_.push_back( SOLUTE_ );
           NsoluteAtoms++;
+          #ifdef CUDA
+          this->molecule_.push_back( setup.Top()[ u_idx ].MolNum() );
+          this->charges_.push_back( setup.Top()[ u_idx ].Charge() );
+          this->atomTypes_.push_back( setup.Top()[ u_idx ].TypeIndex() );
+          this->solvent_[ u_idx ] = false;
+          #endif
         }
       }
     }
@@ -435,6 +479,35 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
       mprintf("\tNo imaging will be performed for energy distance calculations.\n");
   }
 
+#ifdef CUDA
+  NonbondParmType nb = setup.Top().Nonbond();
+  this->NBIndex_ = nb.NBindex();
+  this->numberAtomTypes_ = nb.Ntypes();
+  for (unsigned int i = 0; i < nb.NBarray().size(); ++i) {
+    this->lJParamsA_.push_back( (float) nb.NBarray().at(i).A() );
+    this->lJParamsB_.push_back( (float) nb.NBarray().at(i).B() );
+  }
+
+  try {
+    allocateCuda(((void**)&this->NBindex_c_), this->NBIndex_.size() * sizeof(int));
+    allocateCuda((void**)&this->max_c_, 3 * sizeof(float));
+    allocateCuda((void**)&this->min_c_, 3 * sizeof(float));
+    allocateCuda((void**)&this->result_w_c_, this->numberAtoms_ * sizeof(float));
+    allocateCuda((void**)&this->result_s_c_, this->numberAtoms_ * sizeof(float));
+    allocateCuda((void**)&this->result_O_c_, this->numberAtoms_ * 4 * sizeof(int));
+    allocateCuda((void**)&this->result_N_c_, this->numberAtoms_ * sizeof(int));
+  } catch (CudaException &e) {
+    mprinterr("Error: Could not allocate memory on GPU!\n");
+    this->freeGPUMemory();
+    return Action::ERR;
+  }
+  try {
+    this->copyToGPU();
+  } catch (CudaException &e) {
+    return Action::ERR;
+  }
+#endif
+
   gist_setup_.Stop();
   return Action::OK;
 }
@@ -722,6 +795,8 @@ Action::RetType Action_GIST::DoAction(int frameNum, ActionFrame& frm) {
   OnGrid_idxs_.clear();
   OnGrid_XYZ_.clear();
 
+  // CUDA necessary information
+
   size_t bin_i, bin_j, bin_k;
   Vec3 const& Origin = gO_->Bin().GridOrigin();
   // Loop over each solvent molecule
@@ -890,14 +965,19 @@ Action::RetType Action_GIST::DoAction(int frameNum, ActionFrame& frm) {
   } // END loop over each solvent molecule
 
   // Do energy calculation if requested
+  #ifndef CUDA
   gist_nonbond_.Start();
   if (!skipE_) NonbondEnergy(frm.Frm(), *CurrentParm_);
   gist_nonbond_.Stop();
+ 
 
   // Do order calculation if requested
   gist_order_.Start();
   if (doOrder_) Order(frm.Frm());
   gist_order_.Stop();
+  #else
+  NonbondCuda(frm);
+  #endif
 
   gist_action_.Stop();
   return Action::OK;
@@ -1176,10 +1256,12 @@ void Action_GIST::Print() {
   Farray Eww_norm( MAX_GRID_PT_, 0.0 );
   Farray neighbor_dens( MAX_GRID_PT_, 0.0 );
   if (!skipE_) {
+    #ifndef CUDA
     Darray const& E_UV_VDW = E_UV_VDW_[0];
     Darray const& E_UV_Elec = E_UV_Elec_[0];
     Darray const& E_VV_VDW = E_VV_VDW_[0];
     Darray const& E_VV_Elec = E_VV_Elec_[0];
+    #endif
     Farray const& Neighbor = neighbor_[0];
     // Sum values from other threads if necessary
     SumEVV();
@@ -1191,15 +1273,26 @@ void Action_GIST::Print() {
     for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++)
     {
       E_progress.Update( gr_pt );
+      
       //mprintf("DEBUG1: VV vdw=%f elec=%f\n", E_VV_VDW_[gr_pt], E_VV_Elec_[gr_pt]);
       int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
       if (nw_total > 1) {
+        #ifndef CUDA
         Esw_dens[gr_pt] = (E_UV_VDW[gr_pt]  + E_UV_Elec[gr_pt]) / (NFRAME_ * Vvox);
         Esw_norm[gr_pt] = (E_UV_VDW[gr_pt]  + E_UV_Elec[gr_pt]) / nw_total;
         Eww_dens[gr_pt] = (E_VV_VDW[gr_pt]  + E_VV_Elec[gr_pt]) / (2 * NFRAME_ * Vvox);
         Eww_norm[gr_pt] = (E_VV_VDW[gr_pt]  + E_VV_Elec[gr_pt]) / (2 * nw_total);
+        #else
+        double esw = this->Esw_->operator[](gr_pt);
+        double eww = this->Eww_->operator[](gr_pt);
+        Esw_dens[gr_pt] = esw / (this->NFRAME_ * Vvox);
+        Esw_norm[gr_pt] = esw / nw_total;
+        Eww_dens[gr_pt] = eww / (this->NFRAME_ * Vvox);
+        Eww_norm[gr_pt] = eww / nw_total;
+        #endif
         Eswtot += Esw_dens[gr_pt];
         Ewwtot += Eww_dens[gr_pt];
+        
       } else {
         Esw_dens[gr_pt]=0;
         Esw_norm[gr_pt]=0;
@@ -1211,9 +1304,9 @@ void Action_GIST::Print() {
       if (nw_total > 0) {
         qtet[gr_pt] /= nw_total;
         //mprintf("DEBUG1: neighbor= %8.1f  nw_total= %8i\n", neighbor[gr_pt], nw_total);
-        neighbor_norm[gr_pt] = 1.0 * Neighbor[gr_pt] / nw_total;
+        neighbor_norm[gr_pt] = (double)Neighbor[gr_pt] / nw_total;
       }
-      neighbor_dens[gr_pt] = 1.0 * Neighbor[gr_pt] / (NFRAME_ * Vvox);
+      neighbor_dens[gr_pt] = (double)Neighbor[gr_pt] / (NFRAME_ * Vvox);
       dipolex[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
       dipoley[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
       dipolez[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
@@ -1299,4 +1392,175 @@ void Action_GIST::Print() {
   gist_order_.WriteTiming(2,   "Order: ", gist_action_.Total());
   gist_print_.WriteTiming(1,   "Print:", total);
   mprintf("TIME:\tTotal: %.4f s\n", total);
+	#ifdef CUDA
+	this->freeGPUMemory();
+	#endif
+}
+
+#ifdef CUDA
+void Action_GIST::NonbondCuda(ActionFrame frm) {
+  // Simply to get the information for the energetic calculations
+  std::vector<float> eww_result(this->numberAtoms_);
+  std::vector<float> esw_result(this->numberAtoms_);
+  std::vector<std::vector<int> > order_indices;
+  this->gist_nonbond_.Start();
+
+  Matrix_3x3 ucell_m, recip_m;
+  float *recip = NULL;
+  float *ucell = NULL;
+  int boxinfo;
+
+  // Check Boxinfo and write the necessary data into recip, ucell and boxinfo.
+  switch(this->image_.ImageType()) {
+    case NONORTHO:
+      recip = new float[9];
+      ucell = new float[9];
+      frm.Frm().BoxCrd().ToRecip(ucell_m, recip_m);
+      for (int i = 0; i < 9; ++i) {
+        ucell[i] = (float) ucell_m.Dptr()[i];
+        recip[i] = (float) recip_m.Dptr()[i];
+      }
+      boxinfo = 2;
+      break;
+    case ORTHO:
+      recip = new float[9];
+      for (int i = 0; i < 3; ++i) {
+        recip[i] = (float) frm.Frm().BoxCrd()[i];
+      }
+      ucell = NULL;
+      boxinfo = 1;
+      break;
+    case NOIMAGE:
+      recip = NULL;
+      ucell = NULL;
+      boxinfo = 0;
+      break;
+    default:
+      mprinterr("Error: Unexpected box information found.");
+      return;
+  }
+
+  std::vector<int> result_o = std::vector<int>(4 * this->numberAtoms_);
+  std::vector<int> result_n = std::vector<int>(this->numberAtoms_);
+  // Call the GPU Wrapper, which subsequently calls the kernel, after setup operations.
+  // Must create arrays from the vectors, does that by getting the address of the first element of the vector.
+  std::vector<std::vector<float> > e_result = doActionCudaEnergy(frm.Frm().xAddress(), this->NBindex_c_, this->numberAtomTypes_, this->paramsLJ_c_, this->molecule_c_, boxinfo, recip, ucell, this->numberAtoms_, this->min_c_, 
+                                                    this->max_c_, this->headAtomType_,this->NeighborCut2_, &(result_o[0]), &(result_n[0]), this->result_w_c_, 
+                                                    this->result_s_c_, this->result_O_c_, this->result_N_c_, this->doOrder_);
+  eww_result = e_result.at(0);
+  esw_result = e_result.at(1);
+
+  if (this->doOrder_) {
+    int counter = 0;
+    for (unsigned int i = 0; i < (4 * this->numberAtoms_); i += 4) {
+      ++counter;
+      std::vector<int> temp;
+      for (unsigned int j = 0; j < 4; ++j) {
+        temp.push_back(result_o.at(i + j));
+      }
+      order_indices.push_back(temp);
+    }
+  }
+
+  delete[] recip; // Free memory
+  delete[] ucell; // Free memory
+
+  for (unsigned int sidx = 0; sidx < NSOLVENT_; sidx++) {
+    int headAtomIndex = O_idxs_[sidx];
+    size_t bin_i, bin_j, bin_k;
+    const double *vec = frm.Frm().XYZ(headAtomIndex);
+    int voxel = -1;
+    if (this->gO_->Bin().Calc(vec[0], vec[1], vec[2], bin_i, bin_j, bin_k)) {
+      voxel = this->gO_->CalcIndex(bin_i, bin_j, bin_k);
+      this->neighbor_.at(0).at(voxel) += result_n.at(headAtomIndex);
+      // This is not nice, as it assumes that O is set before the two Hydrogens
+      // might be the case, but is still not nice (in my opinion)
+      for (unsigned int IDX = 0; IDX != nMolAtoms_; IDX++) {
+        this->Esw_->UpdateVoxel(voxel, esw_result.at(headAtomIndex + IDX));
+        this->Eww_->UpdateVoxel(voxel, eww_result.at(headAtomIndex + IDX));
+      }
+      // Order calculation
+      if (this->doOrder_) {
+        double sum = 0;
+        Vec3 cent( frm.Frm().xAddress() + (headAtomIndex) * 3 );
+        std::vector<Vec3> vectors;
+        switch(this->image_.ImageType()) {
+          case NONORTHO:
+          case ORTHO:
+            {
+              Matrix_3x3 ucell, recip;
+              frm.Frm().BoxCrd().ToRecip(ucell, recip);
+              Vec3 vec(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(0) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+              vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(1) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+              vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(2) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+              vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(3) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+            }
+            break;
+          default:
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(0) * 3) ) - cent );
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(1) * 3) ) - cent );
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(2) * 3) ) - cent );
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(3) * 3) ) - cent );
+        }
+        
+        for (int i = 0; i < 3; ++i) {
+          for (int j = i + 1; j < 4; ++j) {
+            double cosThet = (vectors.at(i) * vectors.at(j)) / sqrt(vectors.at(i).Magnitude2() * vectors.at(j).Magnitude2());
+            sum += (cosThet + 1.0/3) * (cosThet + 1.0/3);
+          }
+        }
+        this->order_norm_->UpdateVoxel(voxel, 1.0 - (3.0/8.0) * sum);
+      }
+    }
+    
+  }
+  this->gist_nonbond_.Stop();
+}
+
+/**
+ * Frees all the Memory on the GPU.
+ */
+void Action_GIST::freeGPUMemory(void) {
+  freeCuda(this->NBindex_c_);
+  freeCuda(this->molecule_c_);
+  freeCuda(this->paramsLJ_c_);
+  freeCuda(this->max_c_);
+  freeCuda(this->min_c_);
+  freeCuda(this->result_w_c_);
+  freeCuda(this->result_s_c_);
+  freeCuda(this->result_O_c_);
+  freeCuda(this->result_N_c_);
+  this->NBindex_c_   = NULL;
+  this->molecule_c_  = NULL;
+  this->paramsLJ_c_  = NULL;
+  this->max_c_     = NULL;
+  this->min_c_     = NULL;
+  this->result_w_c_= NULL;
+  this->result_s_c_= NULL;
+  this->result_O_c_  = NULL;
+  this->result_N_c_  = NULL;
 }
+
+/**
+ * Copies data from the CPU to the GPU.
+ * @throws: CudaException
+ */
+void Action_GIST::copyToGPU(void) {
+  try {
+    copyMemoryToDevice(&(this->NBIndex_[0]), this->NBindex_c_, this->NBIndex_.size() * sizeof(int));
+    copyMemoryToDeviceStruct(&(this->charges_[0]), &(this->atomTypes_[0]), this->solvent_, &(this->molecule_[0]), this->numberAtoms_, &(this->molecule_c_),
+                              &(this->lJParamsA_[0]), &(this->lJParamsB_[0]), this->lJParamsA_.size(), &(this->paramsLJ_c_));
+  } catch (CudaException &ce) {
+    this->freeGPUMemory();
+    mprinterr("Error: Could not copy data to the device.\n");
+    throw ce;
+  } catch (std::exception &e) {
+    this->freeGPUMemory();
+    throw e;
+  }
+}
+#endif
diff --git a/src/Action_GIST.h b/src/Action_GIST.h
index c6d0b0f217..62a64f76d8 100644
--- a/src/Action_GIST.h
+++ b/src/Action_GIST.h
@@ -5,12 +5,19 @@
 #include "DataSet_3D.h"
 #include "DataSet_MatrixFlt.h"
 #include "Timer.h"
+
+#ifdef CUDA
+#include "cuda_kernels/GistCudaSetup.cuh"
+#endif
 /// Class for applying Grid Inhomogenous Solvation Theory
 /** \author Daniel R. Roe
   */
 class Action_GIST : public Action {
   public:
     Action_GIST();
+		#ifdef CUDA
+		~Action_GIST() {delete[] this->solvent_;}
+		#endif
     DispatchObject* Alloc() const { return (DispatchObject*)new Action_GIST(); }
     void Help() const;
   private:
@@ -27,6 +34,39 @@ class Action_GIST : public Action {
     void Order(Frame const&);
     void SumEVV();
 
+#ifdef CUDA
+    // Additional data for GPU calculation
+
+    std::vector<float> lJParamsA_;
+    std::vector<float> lJParamsB_;
+    std::vector<float> charges_;
+    std::vector<int> atomTypes_;
+    std::vector<int> NBIndex_;
+    std::vector<int> molecule_;
+
+    unsigned int numberAtoms_;
+    int numberAtomTypes_;
+    int headAtomType_;
+    bool *solvent_;
+
+    // Arrays on GPU
+    int *NBindex_c_;
+    void *molecule_c_;
+    void *paramsLJ_c_;
+    float *max_c_;
+    float *min_c_;
+    float *result_w_c_;
+    float *result_s_c_;
+    int *result_O_c_;
+    int *result_N_c_;
+
+    // CUDA only functions
+    void freeGPUMemory(void);
+    void copyToGPU(void);
+    void NonbondCuda(ActionFrame);
+
+#endif
+
     static const Vec3 x_lab_;
     static const Vec3 y_lab_;
     static const Vec3 z_lab_;
diff --git a/src/cuda_kernels/CMakeLists.txt b/src/cuda_kernels/CMakeLists.txt
index 8739581321..4961542e79 100644
--- a/src/cuda_kernels/CMakeLists.txt
+++ b/src/cuda_kernels/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu)
+set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu)
 
 cuda_add_library(cpptraj_cuda_routines ${CPPTRAJ_CUDA_SOURCES})
 
diff --git a/src/cuda_kernels/ConstantsG.cuh b/src/cuda_kernels/ConstantsG.cuh
new file mode 100644
index 0000000000..79b9ccea7f
--- /dev/null
+++ b/src/cuda_kernels/ConstantsG.cuh
@@ -0,0 +1,69 @@
+/**
+ * This is simply taken from the cpptraj implementation by Daniel R. Roe and Mark J. Williamson.
+ * All constants stay double, as otherwise precision might be lost. However, on the GPU a nice
+ * balance between calculations in double and single precision are important (weighing mostly on
+ * single precision). Therfore, to achieve maximum parallelization, cast to float if needed.
+ */
+
+#ifndef CONSTANTS_CUH
+#define CONSTANTS_CUH
+
+namespace ConstantsG {
+  const double PI           = 3.141592653589793;
+  const double TWOPI        = 6.283185307179586;  
+  const double FOURPI       = 12.566370614359172;
+  const double FOURTHIRDSPI = 4.1887902047863909; 
+  const double FOURFIFTHSPI = 2.5132741228718345;
+  const double PIOVER2      = 1.5707963267948966; 
+  /// Convert degrees to radians
+  const double DEGRAD       = 0.017453292519943295; 
+  /// Convert radians to degrees
+  const double RADDEG       =   57.29577951308232;
+  /// For checking floating point zero
+  const double SMALL        = 0.00000000000001;
+  const double HUGEG        = 1E30;
+  /// Gas constant in J/mol*K
+  const double GASK_J       = 8.3144621;
+  /// Gas constant in kcal/mol*K
+  const double GASK_KCAL    = 0.0019872041;
+  /// Avogadro constant
+  const double NA = 6.02214129e23;
+  /// Speed of light (m/s)
+  const double C0 = 299792458;
+  /// Euler-Mascheroni constant
+  const double EULER_MASC = 0.5772156649;
+  /// Convert atomic mass unit (amu) to kg
+  const double AMU_TO_KG = 1.660539e-27;
+  /// Convert Angstroms to nanometers
+  const double ANG_TO_NM = 0.1;
+  /// Convert nanometers to Angstroms
+  const double NM_TO_ANG = 10.0;
+  /// Convert calories to Joules
+  const double CAL_TO_J = 4.184;
+  /// Convert Joules to calories
+  const double J_TO_CAL = 1.0 / CAL_TO_J;
+  /// Convert electron charge to Amber units (w/ prefactor)
+  /** NOTE: This value is actually very low precision, but is used since this
+    *       is the value used by LEaP. The actual conversion of Coulomb's
+    *       constant for use with Amber units (i.e. e-, Ang, and kcal/mol)
+    *       is (with NA representing Avogadro's number):
+    *         (8.987552e9 N*m^2/C^2) * (1.602177e-19 C/e-)^2 * (10^10 Ang/m) *
+    *         (1/4184 kcal/J) * NA = 332.06279350
+    *       The square root of this value is 18.222590197
+    */
+  const double ELECTOAMBER  = 18.2223;
+  /// Convert Amber charge to electron charge
+  const double AMBERTOELEC  = 1.0 / ELECTOAMBER;
+  /// Convert from Amber internal units of time (1/20.455 ps) to ps.
+  /** Amber operates in kcal/mol units for energy, amu for masses,
+    * and Angstoms for distances. For convenience when calculating KE from
+    * velocity, the velocities have a conversion factor built in; as a result
+    * the Amber unit of time is (1/20.455) ps. So e.g. to convert Amber
+    * velocities from internal units to Ang/ps multiply by 20.455. The number
+    * itself is derived from sqrt(1 / ((AMU_TO_KG * NA) / (1000 * CAL_TO_J))).
+    */
+const double AMBERTIME_TO_PS = 20.455;
+}
+
+
+#endif
\ No newline at end of file
diff --git a/src/cuda_kernels/GistCudaCalc.cu b/src/cuda_kernels/GistCudaCalc.cu
new file mode 100644
index 0000000000..b0eff73857
--- /dev/null
+++ b/src/cuda_kernels/GistCudaCalc.cu
@@ -0,0 +1,464 @@
+#include "GistCudaCalc.cuh"
+
+#include <stdio.h>
+
+#define ELECTOAMBER_2 332.05221729
+
+
+ /**
+  * Calculate the squared distance in an orthorhombic box. See cpptraj implementation.
+  * @param vec1: The first point of the distance calculation.
+  * @param vec2: The seconf point of the distance calculation
+  * @param box: The boxinfo of the object.
+  * @return: The minimal distance in an orthorhombic box.
+  */
+__device__ 
+float dist2_imageOrtho(float *vec1, float *vec2, BoxInfo box) {
+  if (box[0] == 0 || box[1] == 0 || box[2] == 0) {
+    return -1;
+  }
+  float x = vec1[0] - vec2[0];
+  float y = vec1[1] - vec2[1];
+  float z = vec1[2] - vec2[2];
+
+  if (x < 0) {
+    x *= -1;
+  }
+  if (y < 0) {
+    y *= -1;
+  }
+  if (z < 0) {
+    z *= -1;
+  }
+
+  while ( x > box[0] ) {
+    x -= box[0];
+  }
+  while ( y > box[1] ) {
+    y -= box[1];
+  }
+  while ( z > box[2] ) {
+    z -= box[2];
+  }
+
+  float dist = box[0] - x;
+  if ( dist < x ) {
+    x = dist;
+  }
+  dist = box[1] - y;
+  if  ( dist < y ) {
+    y = dist;
+  }
+  dist = box[2] - z;
+  if ( dist < z ) {
+    z = dist;
+  }
+
+  return x * x + y * y + z * z;
+}
+
+/**
+ * Calculate M * v.
+ * @param vec: The vector v.
+ * @param mat3x3: The matrix M.
+ * @param ret: The values to be returned. If null, returns into vec.
+ */
+__device__
+void scalarProd(float* vec, BoxInfo mat3x3, float *ret) {
+  if (ret != NULL) {
+    ret[0] = vec[0] * mat3x3[0] + vec[1] * mat3x3[1] + vec[2] * mat3x3[2];
+    ret[1] = vec[0] * mat3x3[3] + vec[1] * mat3x3[4] + vec[2] * mat3x3[5];
+    ret[2] = vec[0] * mat3x3[6] + vec[1] * mat3x3[7] + vec[2] * mat3x3[8];
+  } else {
+    float x = vec[0];
+    float y = vec[1];
+    float z = vec[2];
+    vec[0] = x * mat3x3[0] + y * mat3x3[1] + z * mat3x3[2];
+    vec[1] = x * mat3x3[3] + y * mat3x3[4] + z * mat3x3[5];
+    vec[2] = x * mat3x3[6] + y * mat3x3[7] + z * mat3x3[8];
+  }
+}
+
+/**
+ * Find the result_O squared distance in a non-orthogonal box.
+ * @param vec1: Position vector of point 1.
+ * @param vec2: Position vector of point 2.
+ * @param recip: The inverse of the ucell.
+ * @param ucell: The box matrix.
+ * @return: The minimal squared distance between two atoms, also considering the images.
+ */
+__device__
+float dist2_imageNonOrtho(float *vec1, float *vec2, BoxInfo recip, UnitCell ucell) {
+  float vecRecip1[3];
+  float vecRecip2[3];
+  scalarProd(vec1, recip, vecRecip1);
+  scalarProd(vec2, recip, vecRecip2);
+  float r_2 = dist2_imageNonOrthoRecip(vecRecip1, vecRecip2, ucell);
+  
+  return r_2;
+}
+
+/**
+ * Calculate if the distance to an image is smaller than a given distance.
+ * @param f: The first vector in the reciprocal space.
+ * @param vec2Cartesian: The second vector in cartesian coordinates.
+ * @param nx: Which neighbour in x direction?
+ * @param ny: Which neighbour in y direction?
+ * @param nz: Which neighbour in z direction?
+ * @param ucell: The box matrix.
+ * @param finalMin: The already calculated minimum.
+ * @return: The new minimum, if it is smaller than finalMin, finalMin otherwise.
+ */
+__device__
+float calcIfDistIsSmaller(float *f, float *vec2Cartesian, int nx, int ny, int nz, UnitCell ucell, float finalMin) {
+  float fx = f[0] + nx;
+  float fy = f[1] + ny;
+  float fz = f[2] + nz;
+  // Bring f back in Cartesian coordinates
+  float x = fx * ucell[0] + fy * ucell[3] + fz * ucell[6];
+  float y = fx * ucell[1] + fy * ucell[4] + fz * ucell[7];
+  float z = fx * ucell[2] + fy * ucell[5] + fz * ucell[8];
+  x -= vec2Cartesian[0];
+  y -= vec2Cartesian[1];
+  z -= vec2Cartesian[2];
+  float min = x * x + y * y + z * z;
+  if ( min < finalMin || finalMin < 0) {
+    return min;
+  }
+  return finalMin;
+}
+
+/**
+ * Calculate the distance in a non-orthorhombic box, if the vectors are already
+ * multiplied by the inverse box matrix.
+ * @param vec1: The first position vector.
+ * @param vec2: The second position vector.
+ * @param ucell: The box cell.
+ * @return: The minimal distance between images.
+ */
+__device__
+float dist2_imageNonOrthoRecip(float * vec1, float * vec2, UnitCell ucell) {
+    
+  // Bring the points back into the main unit cell
+  float fx = vec1[0] - floor(vec1[0]);
+  float fy = vec1[1] - floor(vec1[1]);
+  float fz = vec1[2] - floor(vec1[2]);
+  float f2x = vec2[0] - floor(vec2[0]);
+  float f2y = vec2[1] - floor(vec2[1]);
+  float f2z = vec2[2] - floor(vec2[2]);
+
+  float vec[3] = {fx, fy, fz};
+
+  // Bring f2 back in cartesian coordinates
+  float xFactor = f2x * ucell[0] + f2y * ucell[3] + f2z * ucell[6];
+  float yFactor = f2x * ucell[1] + f2y * ucell[4] + f2z * ucell[7];
+  float zFactor = f2x * ucell[2] + f2y * ucell[5] + f2z * ucell[8];
+  float vec2Real[3] = {xFactor, yFactor, zFactor};
+
+  // Now the different cases, and always store the minimum
+  // Define the finalMinimum as a negative value, since it can never
+  // actually be negative this is fine.
+  float finalMinimum = -1.0;
+
+  // Run through all cells
+  for (int ix = -1; ix <= 1; ++ix) {
+    for (int iy = -1; iy <= 1; ++iy) {
+      for (int iz = -1; iz <= 1; ++iz) {
+        finalMinimum = calcIfDistIsSmaller(vec, vec2Real, ix, iy, iz, ucell, finalMinimum);
+      }
+    }
+  }
+
+  return finalMinimum;
+}
+
+/**
+ * Calculates the distance between two vectors, without imaging.
+ * @param vec1: The position vector of the first atom.
+ * @param vec2: The position vector of the second atom.
+ * @return: The squared distance between the two positions.
+ */
+__device__
+float dist2_noImage(float *vec1, float *vec2) {
+  float x = vec1[0] - vec2[0];
+  float y = vec1[1] - vec2[1];
+  float z = vec1[2] - vec2[2];
+
+  return x*x + y*y + z*z;
+}
+
+/**
+ * Caclulate the total energy between two atoms.
+ * @param vec1: The position vector of atom 1.
+ * @param vec2: The position vector of atom 2.
+ * @param q1: The charge of atom 1.
+ * @param q2: The charge of atom 2.
+ * @param LJA: The lennard jones A parameter.
+ * @param LJB: The lennard jones B parameter.
+ * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise.
+ * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2,
+ *                        or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0.
+ * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise.
+ * @return: The total interaction energy between the two atoms.
+ */
+__device__
+float calcTotalEnergy(float q1, float q2, 
+                            float LJA, float LJB, float r_2) {
+#ifdef DEBUG_GIST_CUDA
+  if (r_2 <= 0.000001 && r_2 >= -0.000001) {
+    printf("(%8.3f, %8.3f, %8.3f) (%8.3f, %8.3f, %8.3f) %d\n", vec1[0], vec1[1], vec1[2], vec2[0], vec2[1], vec2[2], boxinfo);
+  }
+#endif
+  return calcVdWEnergy(r_2, LJA, LJB) + calcElectrostaticEnergy(r_2, q1, q2);
+}
+
+/**
+ * Calculate the distance between two different points.
+ * @param vec1: The first vector to calculate the distance.
+ * @param vec2: The second vector to calculate the distance.
+ * @param recip_o_box: The boxinfo, either the box or the reciprocal.
+ * @param ucell: The unitcell of a box.
+ * @return: The squared distance between two points.
+ */
+__device__
+float calcDist(float *vec1, float *vec2, BoxInfo recip_o_box,
+                    UnitCell ucell) {
+  float r_2 = 0;
+  switch(recip_o_box.boxinfo) {
+    case 0:
+      r_2 = dist2_noImage(vec1, vec2);
+      break;
+    case 1:
+      // Uses recip for box info as well;
+      r_2 = dist2_imageOrtho(vec1, vec2, recip_o_box);
+      break;
+    case 2:
+      r_2 = dist2_imageNonOrtho(vec1, vec2, recip_o_box, ucell);
+      break;
+    default:
+      r_2 = 0;
+  }
+  return r_2;
+}
+
+/**
+ * Calculate the Van der Waals energy between two atoms.
+ * @param r_2: The squared distance between the two atoms.
+ * @param LJA: The A part of the lennard jones potential.
+ * @param LJB: The B part if the Lennard Jones potential.
+ * @return: The Van der Waals energy.
+ */
+__device__
+float calcVdWEnergy(float r_2, float LJA, float LJB) {
+  float r_6 = r_2 * r_2 * r_2;
+  float r_12 = r_6 * r_6;
+  float LJ =  LJA / r_12 - LJB / r_6;
+  return LJ;
+}
+
+/**
+ * Calculate the electrostatic energy between two different atoms.
+ * @param r_2: The square distance between the two atoms.
+ * @param q1: The charge of atom 1.
+ * @param q2: The charge of atom 2.
+ * @return: The electrostatic energy between the two atoms.
+ */
+__device__
+float calcElectrostaticEnergy(float r_2, float q1, float q2) {
+  double charge = q1 * q2 * ELECTOAMBER_2;
+  double r = sqrt(r_2);
+  float ELE = charge / r;
+  return ELE;
+}
+
+/**
+ * Get the index into the lennard jones parameter array.
+ * @param a1: The atom type index of atom 1.
+ * @param a2: The atom type index of atom 2.
+ * @param NBindex: The arrays holding the indices into the LJ array.
+ * @param ntypes: The number of atom types.
+ * @return: The index into the parameter arrays.
+ */
+__device__
+int getLJIndex(int a1, int a2, int *NBindex, int ntypes) {
+  return NBindex[a1 * ntypes + a2];
+}
+
+/**
+ * Get the LJ parameters from a parameter array.
+ * @param a1: The atom type index of atom 1.
+ * @param a2: The atom type index of atom 2.
+ * @param NBindex: The indices into the parameter array.
+ * @param ntypes: The number of atom types.
+ * @param paramsLJ: The parameter array.
+ * @return: The LJ parameter belonging to the atom type pair a1, a2.
+ */
+__device__
+ParamsLJ getLJParam(int a1, int a2, int *NBindex, int ntypes, ParamsLJ *paramsLJ) {
+  int idx = getLJIndex(a1, a2, NBindex, ntypes);
+  if (idx < 0) {
+    return ParamsLJ();
+  }
+  return paramsLJ[idx];
+}
+
+/**
+ * Checks if atom is on grid.
+ * @param vec: Position vector to the atom.
+ * @param min: The grid starting position.
+ * @param max: The end of the grid.
+ * @return: True if vector is on grid.
+ */
+__device__
+bool isOnGrid(float *vec, float *min, float *max) {
+  return ( ( (vec[0] >= min[0]) && (vec[0] <= max[0]) ) &&
+           ( (vec[1] >= min[1]) && (vec[1] <= max[1]) ) &&
+           ( (vec[2] >= min[2]) && (vec[2] <= max[2]) ) );
+}
+
+/**
+ * Calculate the energy on the GPU.
+ * @param coords: An array holding all the coordinates of all atoms.
+ * @param NBindex: An array holding indices into the LJ parameter arrays.
+ * @param ntypes: The number of atom types.
+ * @param paramsLJA: The A LJ parameters.
+ * @param paramsLJB: The B LJ parameters.
+ * @param charges: The charges of the atoms.
+ * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise.
+ * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2,
+ *                        or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0.
+ * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise.
+ * @param maxAtoms: The number of atoms in the system.
+ * @param a_types: The different atom types of the atoms.
+ * @param solvent: True if atom is a solvent atom, false otherwise.
+ * @param molecule: The number of the molecule this atom belong to.
+ * @param result_ww: The result of the water - water interactions.
+ * @param result_sw: The result of the solute - water interactions.
+ * @param min: The minimum values of the grid.
+ * @param max: The maximum values of the grid.
+ */
+__global__
+void cudaCalcEnergy(Coordinates *coords, int *NBindex, int ntypes, ParamsLJ *parameterLJ, AtomProperties *atomProps, 
+                          BoxInfo recip_o_box, UnitCell ucell, int maxAtoms, float *result_ww, float *result_sw, 
+                          float *min, float *max, int headAtomType, float neighbourCut2, int *result_O, int *result_N) {
+  
+
+  int a1 = blockIdx.x * blockDim.x + threadIdx.x;
+  int a2 = blockIdx.y * blockDim.y + threadIdx.y;
+  if ( (a1 >= maxAtoms) || (a2 >= maxAtoms) || (a1 == a2)) {
+    return;
+  }
+
+  AtomProperties atom1 = atomProps[a1];
+  AtomProperties atom2 = atomProps[a2];
+
+  // Do not calculate if the two values are the same or they belong to the same molecule.
+  if ( (atom1.molecule != atom2.molecule)) {
+
+    Coordinates t1 = coords[a1];
+    Coordinates t2 = coords[a2];
+    ParamsLJ lj = getLJParam(atom1.atomType, atom2.atomType, NBindex, ntypes, parameterLJ);
+
+    float vec1[3] = {t1.x, t1.y, t1.z};
+    float vec2[3] = {t2.x, t2.y, t2.z};
+    float r_2 = calcDist(vec1, vec2, recip_o_box, ucell);
+    float energy = calcTotalEnergy(atom1.charge, atom2.charge, lj.A, lj.B, r_2);
+    
+    if (atom2.solvent != 0) {
+      atomicAdd(&(result_ww[a1]), energy * 0.5);
+    } else {
+      atomicAdd(&(result_sw[a1]), energy);
+    }
+  
+  }
+}
+
+/**
+ * Calculate the energy on the GPU. This implementation is somewhat slower,
+ * but is able to calculate the order parameters as well as the rest.
+ * @param coords: An array holding all the coordinates of all atoms.
+ * @param NBindex: An array holding indices into the LJ parameter arrays.
+ * @param ntypes: The number of atom types.
+ * @param paramsLJA: The A LJ parameters.
+ * @param paramsLJB: The B LJ parameters.
+ * @param charges: The charges of the atoms.
+ * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise.
+ * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2,
+ *                        or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0.
+ * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise.
+ * @param maxAtoms: The number of atoms in the system.
+ * @param a_types: The different atom types of the atoms.
+ * @param solvent: True if atom is a solvent atom, false otherwise.
+ * @param molecule: The number of the molecule this atom belong to.
+ * @param result_ww: The result of the water - water interactions.
+ * @param result_sw: The result of the solute - water interactions.
+ * @param min: The minimum values of the grid.
+ * @param max: The maximum values of the grid.
+ */
+__global__
+void cudaCalcEnergySlow(Coordinates *coords, int *NBindex, int ntypes, ParamsLJ *parameterLJ, AtomProperties *atomProps, 
+  BoxInfo recip_o_box, UnitCell ucell, int maxAtoms, float *result_ww, float *result_sw,
+  float *min, float *max, int headAtomType, float neighbourCut2, int *result_O, int *result_N) {
+  
+  int a1 = blockIdx.x * blockDim.x + threadIdx.x;
+  
+  if (a1 >= maxAtoms) {
+  return;
+  }
+  
+  AtomProperties atom1 = atomProps[a1];
+  float distances[4] = {HUGE_C, HUGE_C, HUGE_C, HUGE_C};
+  result_N[a1] = 0;
+  result_O[4 * a1 + 3] = 0;
+  result_O[4 * a1 + 2] = 0;
+  result_O[4 * a1 + 1] = 0;
+  result_O[4 * a1    ] = 0;
+  for (int a2 = 0; a2 < maxAtoms; ++a2) {
+     AtomProperties atom2 = atomProps[a2];
+    // Do not calculate if the two values are the same or they belong to the same molecule.
+    if ( (a1 != a2) && (atom1.molecule != atom2.molecule)) {
+      Coordinates t1 = coords[a1];
+      Coordinates t2 = coords[a2];
+      ParamsLJ lj = getLJParam(atom1.atomType, atom2.atomType, NBindex, ntypes, parameterLJ);
+      float vec1[3] = {t1.x, t1.y, t1.z};
+      float vec2[3] = {t2.x, t2.y, t2.z};
+      float r_2 = calcDist(vec1, vec2, recip_o_box, ucell);
+      float energy = calcTotalEnergy(atom1.charge, atom2.charge, lj.A, lj.B, r_2);
+      if ((atom2.atomType == headAtomType) && atom2.solvent && atom1.solvent) {
+        if (r_2 < distances[0]) {
+          distances[3] = distances[2];
+          distances[2] = distances[1];
+          distances[1] = distances[0];
+          distances[0] = r_2;
+          result_O[4 * a1 + 3] = result_O[4 * a1 + 2];
+          result_O[4 * a1 + 2] = result_O[4 * a1 + 1];
+          result_O[4 * a1 + 1] = result_O[4 * a1    ];
+          result_O[4 * a1    ] = a2;
+        } else if (r_2 < distances[1]) {
+          distances[3] = distances[2];
+          distances[2] = distances[1];
+          distances[1] = r_2;
+          result_O[4 * a1 + 3] = result_O[4 * a1 + 2];
+          result_O[4 * a1 + 2] = result_O[4 * a1 + 1];
+          result_O[4 * a1 + 1] = a2;
+        } else if (r_2 < distances[2]) {
+          distances[3] = distances[2];
+          distances[2] = r_2;
+          result_O[4 * a1 + 3] = result_O[4 * a1 + 2];
+          result_O[4 * a1 + 2] = a2;
+        } else if (r_2 < distances[3]) {
+          distances[3] = r_2;
+          result_O[4 * a1 + 3] = a2;
+        }
+        if (r_2 < neighbourCut2) {
+          result_N[a1] += 1;
+        }
+      }
+      if (atom2.solvent != 0) {
+        result_ww[a1] += energy / 2.0;
+      } else {
+        result_sw[a1] += energy;
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/cuda_kernels/GistCudaCalc.cuh b/src/cuda_kernels/GistCudaCalc.cuh
new file mode 100644
index 0000000000..ed881fa169
--- /dev/null
+++ b/src/cuda_kernels/GistCudaCalc.cuh
@@ -0,0 +1,367 @@
+#ifndef GIST_CUDA_CALC_CUH
+#define GIST_CUDA_CALC_CUH
+
+#define HUGE_C 1e200
+#define BLOCKSIZE 16
+#define SLOW_BLOCKSIZE 512
+
+/**
+ * Coordinates class.
+ *
+ * Holds three different values for the coordinates, to achieve
+ * a faster access on the GPU (only a single read from the memory
+ * instead of three different reads, also the memory is better
+ * aligned).
+ * @author Johannes Kraml
+ */
+class Coordinates {
+public:
+  float x;
+  float y;
+  float z;
+
+	// Empty Constructor
+  __host__ __device__
+  Coordinates(): x(0), y(0), z(0) {}
+
+	/**
+	 * Constructor using an array of three values.
+	 * @param array: The array that holds the x, y and z coordinates
+	 */
+  __host__ __device__
+  Coordinates(const double *array) {
+    this->x = array[0];
+    this->y = array[1];
+    this->z = array[2];
+  }
+
+	/**
+	 * Copy Constructor.
+	 * @param other: The other object.
+	 */
+  __host__ __device__
+  Coordinates(const Coordinates &other) {
+    this->x = other.x;
+    this->y = other.y;
+    this->z = other.z;
+  }
+
+	/**
+	 * The assignment operator for an object of this class.
+	 * @param other: The other Coordinate object.
+	 * @return This object.
+	 */
+  __host__ __device__
+  Coordinates &operator=(const Coordinates &other) {
+    this->x = other.x;
+    this->y = other.y;
+    this->z = other.z;
+    return *this;
+  }
+  
+};
+
+/**
+ * An implementation holding the A and B values for a
+ * Lennard-Jones van der Waals interaction energy calculation.
+ * @author Johannes Kraml
+ */
+class ParamsLJ{
+public:
+  float A;
+  float B;
+
+	// Empty constructor
+  __host__ __device__
+  ParamsLJ(): A(0), B(0) {}
+
+	/**
+	 * Constructor from an array of values.
+	 * @param arr: The array that holds the A and B values.
+	 */
+  __host__ __device__
+  ParamsLJ(float *arr) {
+    this->A = arr[0];
+    this->B = arr[1];
+  }
+
+	/**
+	 * Constructor using two different floats.
+	 * @param A: The A value in the Lennard-Jones equation.
+	 * @param B: The B value in the Lennard-Jones equation.
+	 */
+  __host__ __device__
+  ParamsLJ(float A, float B) {
+    this->A = A;
+    this->B = B;
+  }
+
+	/**
+	 * Copy constructor for this class.
+	 * @param other: The other object of this class
+	 */
+  __host__ __device__
+  ParamsLJ(const ParamsLJ &other) {
+    this->A = other.A;
+    this->B = other.B;
+  }
+
+	/**
+	 * The assignment operator of this class.
+	 * @param other: The other object of this type.
+	 * @return this object.
+	 */
+  __host__ __device__
+  ParamsLJ &operator=(const ParamsLJ &other) {
+    this->A = other.A;
+    this->B = other.B;
+    return *this;
+  }
+};
+
+/**
+ * The implementation of the Unit Cell on the GPU.
+ * @author Johannes Kraml
+ */
+class UnitCell {
+public:
+	float array[9];
+	
+	// The empty constuctor.
+  __host__ __device__
+  UnitCell() {}
+
+	/**
+	 * Constructor using an array.
+	 * @param arr: The array from which the values should be
+	 *             taken.
+	 */
+  __host__ __device__
+  UnitCell(float *arr) {
+    this->array[0] = arr[0];
+    this->array[1] = arr[1];
+    this->array[2] = arr[2];
+    this->array[3] = arr[3];
+    this->array[4] = arr[4];
+    this->array[5] = arr[5];
+    this->array[6] = arr[6];
+    this->array[7] = arr[7];
+    this->array[8] = arr[8];
+  }
+
+	/**
+	 * Copy constructor.
+	 * @param other: The other object.
+	 */
+  __host__ __device__
+  UnitCell(const UnitCell &other) {
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+  }
+
+	/**
+	 * Assignment operator.
+	 * @param other: The other object.
+	 * @return This object.
+	 */
+  __host__ __device__
+  UnitCell &operator=(const UnitCell &other){
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+    return *this;
+  }
+
+	/**
+	 * Access operator.
+	 * @param idx: The index which should be accessed.
+	 *
+	 * @exceptsafe Not safe.
+	 */
+  __host__ __device__
+  float operator[](int idx) {
+    if (idx >= 0 && idx < 9) {
+      return this->array[idx];
+    }
+    return 1;
+  }
+
+};
+
+/**
+ * Class holding the box information, i.e., which kind of box
+ * and the box dimensions.
+ */
+class BoxInfo {
+public:
+  float array[9]; ///< box dimensions
+	int boxinfo;
+	
+	// Empty constructor
+  __host__ __device__
+  BoxInfo(): boxinfo(0) {}
+
+	/**
+	 * Constructor using an array and the boxinfo.
+	 * @param arr: The array holding the values for the box dimensions.
+	 * @param boxinfo: Which kind of box this is.
+	 */
+  __host__ __device__
+  BoxInfo(float *arr, int boxinfo) {
+    this->array[0] = arr[0];
+    this->array[1] = arr[1];
+    this->array[2] = arr[2];
+    this->array[3] = arr[3];
+    this->array[4] = arr[4];
+    this->array[5] = arr[5];
+    this->array[6] = arr[6];
+    this->array[7] = arr[7];
+    this->array[8] = arr[8];
+    this->boxinfo = boxinfo;
+  }
+
+	/**
+	 * Copy constuctor.
+	 * @param other: The other object.
+	 */
+  __host__ __device__
+  BoxInfo(const BoxInfo &other) {
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+    this->boxinfo = other.boxinfo;
+  }
+
+	/**
+	 * Assignement operator.
+	 * @param other: The other object.
+	 * @return This object.
+	 */
+  __host__ __device__
+  BoxInfo &operator=(const BoxInfo &other){
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+    this->boxinfo = other.boxinfo;
+    return *this;
+  }
+
+	/**
+	 * Access operator.
+	 * @param idx: The index at which to access the array.
+	 * @return The value stored at that point in the array or 0 if none is found.
+	 * @exceptsafe Is not exception safe.
+	 */
+  __host__ __device__
+  float operator[](int idx) {
+    if (idx < 9 && idx >= 0) {
+      return this->array[idx];
+    }
+    return 0;
+  }
+
+};
+
+/**
+ * Class to store different Atom properties, like charge, atomType, solvent and the
+ * molecule this atom belongs to.
+ * @author Johannes Kraml
+ */
+class AtomProperties {
+public:
+  float charge;
+  int atomType;
+  bool solvent;
+  int molecule;
+
+	// Empty constructor
+  __host__ __device__
+  AtomProperties() {}
+
+	/**
+	 * Constructor.
+	 * @param charge: The charge of the atom.
+	 * @param atomType: The atom type for access of the lennard-jones parameters.
+	 * @param solvent: Is this atom a solvent atom?
+	 * @param molecule: The molecule this atom belongs to.
+	 */
+  __host__ __device__
+  AtomProperties(float charge, int atomType, bool solvent, int molecule) {
+    this->charge = charge;
+    this->atomType = atomType;
+    this->solvent = solvent;
+    this->molecule = molecule;
+  }
+
+	/**
+	 * Copy constructor.
+	 * @param other: The other object.
+	 */
+  __host__ __device__
+  AtomProperties(const AtomProperties &other) {
+    this->charge = other.charge;
+    this->atomType = other.atomType;
+    this->solvent = other.solvent;
+    this->molecule = other.molecule;
+  }
+
+	/**
+	 * Assignement operator.
+	 * @param other: The other object.
+	 * @return This object.
+	 */
+  __host__  __device__
+  AtomProperties &operator=(const AtomProperties &other) {
+    this->charge = other.charge;
+    this->atomType = other.atomType;
+    this->solvent = other.solvent;
+    this->molecule = other.molecule;
+    return *this;
+  }
+};
+
+
+// Device functions
+__device__ float dist2_imageOrtho(float *, float *, BoxInfo);
+__device__ void scalarProd(float* , BoxInfo , float *);
+__device__ float dist2_imageNonOrtho(float *, float *, BoxInfo, UnitCell);
+__device__ float calcIfDistIsSmaller(float *, float *, int , int , int , UnitCell, float );
+__device__ float dist2_imageNonOrthoRecip(float * , float * , UnitCell);
+__device__ float dist2_noImage(float *, float *);
+__device__ float calcTotalEnergy(float , float , float , float , float);
+__device__ float calcVdWEnergy(float , float , float );
+__device__ float calcElectrostaticEnergy(float, float, float);
+__device__ int getLJIndex(int , int , int *, int );
+__device__ ParamsLJ getLJParam(int , int , int *, int , ParamsLJ *);
+__device__ bool isOnGrid(float *, float *, float *);
+__device__ float calcDist(float *, float *, BoxInfo, UnitCell);
+
+// Global functions
+__global__ void cudaCalcEnergy    (Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *, float *, float *, int, float, int *, int *);
+__global__ void cudaCalcEnergySlow(Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *,	float *, float *, int, float, int *, int *);
+#endif
\ No newline at end of file
diff --git a/src/cuda_kernels/GistCudaSetup.cu b/src/cuda_kernels/GistCudaSetup.cu
new file mode 100644
index 0000000000..facdbdde17
--- /dev/null
+++ b/src/cuda_kernels/GistCudaSetup.cu
@@ -0,0 +1,239 @@
+#include "GistCudaSetup.cuh"
+#include "GistCudaCalc.cuh"
+#include <iostream>
+
+
+/**
+ * Allocate memory on the GPU.
+ * @param array: The pointer to the array, which will be allocated on the GPU.
+ * @param size: An integer giving the size of the array, which will be allocated.
+ * @throws: CudaException if a problem occurs.
+ */
+__host__
+void allocateCuda(void **array, int size) {
+  // Check if the array is actually free, if not, it will be freed 
+  // (fun fact: checking is not necessary, one could also simply free the memory).
+  if ((*array) != NULL) {
+    cudaFree(*array);
+  }
+  // If something goes wrong, throw exception
+  if (cudaMalloc(array, size) != cudaSuccess) {
+    throw CudaException();
+  }
+}
+
+/**
+ * Copy memory from the CPU to the GPU.
+ * @param array: The array from which the values shall be copied.
+ * @param array_c: The array on the device, to which the values shall be copied.
+ * @param size: The size of the stuff which will be copied.
+ * @throws: CudaException if something goes wrong.
+ */
+__host__
+void copyMemoryToDevice(void *array, void *array_c, int size) {
+  // If something goes wrong, throw exception
+  // In this case only copying can go wrong.
+  if (cudaMemcpy(array_c, array, size, cudaMemcpyHostToDevice) != cudaSuccess) {
+    throw CudaException();
+  }
+}
+
+/**
+ * A simple helper function that copies a lot of stuff to the GPU (as structs).
+ * @param charge: An array holding the charges for the different atoms.
+ * @param atomtype: An array holding the integers for the atom types of the different atoms.
+ * @param solvent: An array of boolean values, holding the information whether a certain atom is solvent or solute.
+ * @param atomNumber: The total number of atoms.
+ * @param atomProps_c: A pointer to an array on the GPU, which will hold the atom properties.
+ * @param ljA: An array holding the lennard-jones parameter A for each atom type pair.
+ * @param ljB: An array holding the lennard-jones parameter B for each atom type pair.
+ * @param length: The length of the two aforementioned arrays (ljA & ljB).
+ * @param lJparams_c: A pointer to an array on the GPU, which will hold the lj parameters.
+ * @throws: CudaException if something bad happens.
+ */
+__host__
+void copyMemoryToDeviceStruct(float *charge, int *atomtype, bool *solvent, int *molecule, int atomNumber, void **atomProps_c,
+                              float *ljA, float *ljB, int length, void **lJparams_c) {
+  // Check if the two arrays are free. Again, this could be removed (but will stay!)
+  if ((*atomProps_c) != NULL) {
+    cudaFree(*atomProps_c);
+  }
+  if ((*lJparams_c) != NULL) {
+    cudaFree(*lJparams_c);
+  }
+  // Allocate the necessary memory on the GPU.
+  if (cudaMalloc(atomProps_c, atomNumber * sizeof(AtomProperties)) != cudaSuccess) {
+    throw CudaException();
+  }
+  if (cudaMalloc(lJparams_c, length * sizeof(ParamsLJ)) != cudaSuccess) {
+    throw CudaException();
+  }
+
+  // Create an array for the lennard-jones parameters.
+  ParamsLJ *ljp = (ParamsLJ *) malloc (length * sizeof(ParamsLJ));
+  // Add the lennard-jones parameters to the array.
+  for (int i = 0; i < length; ++i) {
+    ljp[i] = ParamsLJ(ljA[i], ljB[i]);
+  }
+
+  // Create an array for the atom properties.
+  AtomProperties *array = (AtomProperties *)malloc(atomNumber * sizeof(AtomProperties));
+  // Add the properties into the array.
+  for (int i = 0; i < atomNumber; ++i) {
+    array[i] = AtomProperties(charge[i], atomtype[i], solvent[i], molecule[i]);
+  }
+  // Copy the memory from the host to the device.
+  if (cudaMemcpy((*atomProps_c), array, atomNumber * sizeof(AtomProperties), cudaMemcpyHostToDevice) != cudaSuccess) {
+    throw CudaException();
+  }
+  if (cudaMemcpy((*lJparams_c), ljp, length * sizeof(ParamsLJ), cudaMemcpyHostToDevice) != cudaSuccess) {
+    throw CudaException();
+  }
+
+  // Free the two arrays (so that no memory leak occurs).
+  free(ljp);
+  free(array);
+}
+
+/**
+ * Free an array on the CUDA capable device.
+ * @param array: The array you want to free.
+ */
+__host__
+void freeCuda(void *array) {
+  cudaFree(array);
+}
+
+
+// This is coded C-like, but uses exceptions.
+/**
+ * This starts the cuda kernel, thus it is actually a quite long function.
+ */
+__host__
+std::vector<std::vector<float> > doActionCudaEnergy(const double *coords, int *NBindex_c, int ntypes, void *parameter, void *molecule_c,
+                            int boxinfo, float *recip_o_box, float *ucell, int maxAtoms, float *min_c, float *max_c, int headAtomType, 
+                            float neighbourCut2, int *result_o, int *result_n, float *result_w_c, float *result_s_c,
+                            int *result_O_c, int *result_N_c, bool doorder) {
+  Coordinates *coords_c   = NULL;
+  float *recip_b_c  = NULL;
+  float *ucell_c    = NULL;
+  
+  
+
+  float *result_A = (float *) calloc(maxAtoms, sizeof(float));
+  float *result_s = (float *) calloc(maxAtoms, sizeof(float));
+  // TODO: Fix this, test is actually a quite bad name here!
+  Coordinates *coord_array = (Coordinates *) calloc(maxAtoms, sizeof(Coordinates));
+  
+  // Casting
+  AtomProperties *sender = (AtomProperties *) molecule_c;
+  ParamsLJ *lennardJonesParams = (ParamsLJ *) parameter;
+  
+  // Create Boxinfo and Unit cell. This is actually very important for the speed (otherwise
+  // there would be LOTS of access to non-local variables).
+  BoxInfo boxinf;
+  if (boxinfo != 0) {
+    boxinf = BoxInfo(recip_o_box, boxinfo);
+  }
+  UnitCell ucellN;
+  if (boxinfo == 2) {
+    ucellN = UnitCell(ucell);
+  }
+  
+  // Add the coordinates to the array.
+  // TODO: Fix Test here also!
+  for (int i = 0; i < maxAtoms; ++i) {
+    coord_array[i] = Coordinates(&coords[i * 3]);
+  }
+
+  // vectors that will return the necessary information.
+  std::vector<std::vector<float> > result;
+  std::vector<float> result_esw;
+  std::vector<float> result_eww;
+
+  // Allocate space on the GPU
+  if (cudaMalloc(&coords_c, maxAtoms * sizeof(Coordinates)) != cudaSuccess) {
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+
+  // Copy the data to the GPU
+  if (cudaMemcpy(coords_c, coord_array, maxAtoms * sizeof(Coordinates), cudaMemcpyHostToDevice) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+  if (cudaMemcpy(result_w_c, result_A, maxAtoms * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+  if (cudaMemcpy(result_s_c, result_s, maxAtoms * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+  // If the doorder calculation is used, it needs to calculate everything differently, so the slow version is used
+  // (this is about 10% slower).
+  if (doorder) {
+    cudaCalcEnergySlow<<< (maxAtoms + SLOW_BLOCKSIZE) / SLOW_BLOCKSIZE, SLOW_BLOCKSIZE >>> (coords_c, NBindex_c, ntypes, lennardJonesParams, sender,
+                                                                                            boxinf, ucellN, maxAtoms, result_w_c, result_s_c, min_c, max_c,
+                                                                                            headAtomType, neighbourCut2, result_O_c, result_N_c);
+  } else {
+    // Uses a 2D array, which is nice for memory access.
+    dim3 threadsPerBlock(BLOCKSIZE, BLOCKSIZE);
+    dim3 numBlocks((maxAtoms + threadsPerBlock.x) / threadsPerBlock.x, (maxAtoms + threadsPerBlock.y) / threadsPerBlock.y);
+    // The actual call of the device function
+    cudaCalcEnergy<<<numBlocks, threadsPerBlock>>> (coords_c, NBindex_c, ntypes, lennardJonesParams, sender,
+                                                                      boxinf, ucellN, maxAtoms, result_w_c, result_s_c, min_c, max_c,
+                                                                      headAtomType, neighbourCut2, result_O_c, result_N_c);
+    // Check if there was an error.
+    cudaError_t cudaError = cudaGetLastError();
+    if (cudaError != cudaSuccess) {
+      printf("returned %s\n", cudaGetErrorString(cudaError));
+    }
+  }
+  // Return the results of the calculation to the main memory
+  if (cudaMemcpy(result_A, result_w_c, maxAtoms * sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }  
+  
+
+  if (cudaMemcpy(result_s, result_s_c, maxAtoms * sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+
+  
+  if (cudaMemcpy(result_o, result_O_c, maxAtoms * 4 * sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+  
+  if (cudaMemcpy(result_n, result_N_c, maxAtoms * sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+  for (int i = 0; i < maxAtoms; ++i) {
+    result_eww.push_back(result_A[i]);
+    result_esw.push_back(result_s[i]);
+  }
+
+  result.push_back(result_eww);
+  result.push_back(result_esw);
+
+  // Free everything used in here.
+  cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+  free(result_A); free(result_s); free(coord_array);
+  
+  return result;
+}
\ No newline at end of file
diff --git a/src/cuda_kernels/GistCudaSetup.cuh b/src/cuda_kernels/GistCudaSetup.cuh
new file mode 100644
index 0000000000..d35be41875
--- /dev/null
+++ b/src/cuda_kernels/GistCudaSetup.cuh
@@ -0,0 +1,26 @@
+#ifndef GIST_CUDA_SETUP_CUH
+#define GIST_CUDA_SETUP_CUH
+
+#include <exception>
+#include <vector>
+#include <iostream>
+
+
+// Exception classes
+class CudaException : public std::exception {
+public:
+  CudaException() {
+  }
+};
+
+// Function definitions
+void allocateCuda(void **, int);
+void copyMemoryToDevice(void *, void *, int);
+void copyMemoryToDeviceStruct(float *, int *, bool *, int *, int, void **, float *, float *, int, void **);
+void freeCuda(void *);
+std::vector<std::vector<float> > doActionCudaEnergy(const double *, int *, int , void *, void *,
+                            int , float *, float *, int , float *, 
+                            float *, int, float, int *, int *, float *, float *,
+                            int *, int *, bool);
+
+#endif
diff --git a/src/cuda_kernels/Makefile b/src/cuda_kernels/Makefile
index 46b44bdb93..cf663ef027 100644
--- a/src/cuda_kernels/Makefile
+++ b/src/cuda_kernels/Makefile
@@ -10,7 +10,7 @@ TARGET        = libcpptraj_cuda.a
 
 # Source files
 
-CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu
+CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu
 
 # Objects
 
diff --git a/test/MasterTest.sh b/test/MasterTest.sh
index a9fc070345..0566a8920e 100644
--- a/test/MasterTest.sh
+++ b/test/MasterTest.sh
@@ -963,6 +963,13 @@ CheckEnv() {
       'fftw'      ) TestLibrary "FFTW"               "$CPPTRAJ_FFTW_FFT" ;;
       'openmp'    ) TestLibrary "OpenMP"             "$CPPTRAJ_OPENMP" ;;
       'singleensemble' ) TestLibrary "Single ensemble support" "$CPPTRAJ_SINGLE_ENS" ;;
+      'cuda'      ) TestLibrary "CUDA"               "$CPPTRAJ_CUDA" ;;
+      'notcuda'   )
+	if [ ! -z "$CPPTRAJ_CUDA" ]; then
+          echo "  $DESCRIP cannot be run on CUDA."
+	  ((CHECKERR++))
+        fi
+	;;
       'pnetcdf'   )
         if [ ! -z "$DO_PARALLEL" ] ; then
           TestLibrary "Parallel NetCDF" "$CPPTRAJ_PNETCDFLIB"
diff --git a/test/Test_GIST/RunTest.sh b/test/Test_GIST/RunTest.sh
index 1feb9da292..9ee00ca573 100755
--- a/test/Test_GIST/RunTest.sh
+++ b/test/Test_GIST/RunTest.sh
@@ -4,30 +4,34 @@
 
 CleanFiles gist.in gist.out gist-*.dx ww_Eij.dat Eww_ij.dat \
            Gist1-*.dx Gist1-*.dat Gist2-*.dx Gist2-*.dat
+INPUT="-i gist.in"
 TESTNAME='GIST tests'
 Requires netcdf notparallel
 
-INPUT="-i gist.in"
-
+UNITNAME='Eww Test'
+CheckFor notcuda
 # doeij test with much smaller grid to save memory
-cat > gist.in <<EOF
-parm ../tz2.ortho.parm7
-trajin ../tz2.ortho.nc 1 10
-autoimage origin
-gist doorder doeij refdens 0.033422885325 gridcntr 1.44 0.67 0.29 \
-     griddim 10 12 10 gridspacn 2.0 prefix Gist1
-go
+if [ $? -eq 0 ]; then
+  cat > gist.in <<EOF
+  parm ../tz2.ortho.parm7
+  trajin ../tz2.ortho.nc 1 10
+  autoimage origin
+  gist doorder doeij refdens 0.033422885325 gridcntr 1.44 0.67 0.29 \
+		griddim 10 12 10 gridspacn 2.0 prefix Gist1
+  go
 EOF
-RunCpptraj "GIST water-water interaction test"
-DoTest Gist1-Eww_ij.dat.save Gist1-Eww_ij.dat
+  RunCpptraj "GIST water-water interaction test"
+  DoTest Gist1-Eww_ij.dat.save Gist1-Eww_ij.dat
+fi
 
 # GIST test with finer grid for everything else
+UNITNAME='CPU and GPU tests'
 cat > gist.in <<EOF
 parm ../tz2.ortho.parm7
 trajin ../tz2.ortho.nc 1 10
 autoimage origin
 gist doorder refdens 0.033422885325 gridcntr 1.5 1.0 0.0 \
-     griddim 34 44 36 gridspacn 0.50 prefix Gist2 info Info.dat
+		griddim 34 44 36 gridspacn 0.50 prefix Gist2 info Info.dat
 go
 EOF
 RunCpptraj "GIST test"

From 86ba3b69fd40395c7e3863ab504112a45e8b812b Mon Sep 17 00:00:00 2001
From: jokr91 <hannes.kraml@gmail.com>
Date: Mon, 16 Dec 2019 09:35:21 +0100
Subject: [PATCH 2/4] Added skipS flag

A flag to skip the entropy calculation was added. Also, the indentation
was fixed for all files.
---
 src/Action_GIST.cpp | 393 ++++++++++++++++++++++++--------------------
 src/Action_GIST.h   |   1 +
 2 files changed, 215 insertions(+), 179 deletions(-)

diff --git a/src/Action_GIST.cpp b/src/Action_GIST.cpp
index 0a006ced1e..3b70fd2840 100644
--- a/src/Action_GIST.cpp
+++ b/src/Action_GIST.cpp
@@ -15,16 +15,16 @@ const double Action_GIST::maxD_ = DBL_MAX;
 
 Action_GIST::Action_GIST() :
 #ifdef CUDA
-	solvent_(NULL),
-	NBindex_c_(NULL),
-	molecule_c_(NULL),
-	paramsLJ_c_(NULL),
-	max_c_(NULL),
-	min_c_(NULL),
-	result_w_c_(NULL),
-	result_s_c_(NULL),
-	result_O_c_(NULL),
-	result_N_c_(NULL),
+  solvent_(NULL),
+  NBindex_c_(NULL),
+  molecule_c_(NULL),
+  paramsLJ_c_(NULL),
+  max_c_(NULL),
+  min_c_(NULL),
+  result_w_c_(NULL),
+  result_s_c_(NULL),
+  result_O_c_(NULL),
+  result_N_c_(NULL),
 #endif
   gO_(0),
   gH_(0),
@@ -60,7 +60,7 @@ Action_GIST::Action_GIST() :
 {}
 
 void Action_GIST::Help() const {
-  mprintf("\t[doorder] [doeij] [skipE] [refdens <rdval>] [temp <tval>]\n"
+  mprintf("\t[doorder] [doeij] [skipE] [skipS] [refdens <rdval>] [temp <tval>]\n"
           "\t[noimage] [gridcntr <xval> <yval> <zval>] [excludeions]\n"
           "\t[griddim <xval> <yval> <zval>] [gridspacn <spaceval>]\n"
           "\t[prefix <filename prefix>] [ext <grid extension>] [out <output>]\n"
@@ -128,6 +128,9 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
       return Action::ERR;
     }
   }
+
+  this->skipS_ = actionArgs.hasKey("skipS");
+
   if (doEij_) {
     eijfile_ = init.DFL().AddCpptrajFile(prefix_ + "-Eww_ij.dat", "GIST Eij matrix file");
     if (eijfile_ == 0) return Action::ERR;
@@ -277,6 +280,14 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
       EIJ_EN_.resize( numthreads );
     }
 #   endif
+
+    #ifdef CUDA
+    if (this->skipE_ && this->doorder) {
+      mprintf("When the keyword \"skipE\" is supplied, \"doorder\" cannot be"
+              " chosen, as both calculations are done on the GPU at the same"
+              " time.\nIgnoring \"doorder!\"\n");
+    }
+    #endif
   }
 
   //Box gbox;
@@ -327,8 +338,11 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
           "#    Crystal N. Nguyen, Tom Kurtzman Young, and Michael K. Gilson,\n"
           "#      J. Chem. Phys. 137, 044101 (2012)\n"
           "#    Lazaridis, J. Phys. Chem. B 102, 3531–3541 (1998)\n"
-          "#If you use the GPU parallelized version of GIST, please cite:\n"
-          "#    Johannes Kraml, Anna S. Kamenik, Franz Waibl, Michael Schauperl, Klaus R. Liedl, JCTC (2019)\n");
+#ifdef CUDA
+          "#When using the GPU parallelized version of GIST, please cite:\n"
+          "#    Johannes Kraml, Anna S. Kamenik, Franz Waibl, Michael Schauperl, Klaus R. Liedl, JCTC (2019)\n"
+#endif
+          );
   gist_init_.Stop();
   return Action::OK;
 }
@@ -976,7 +990,8 @@ Action::RetType Action_GIST::DoAction(int frameNum, ActionFrame& frm) {
   if (doOrder_) Order(frm.Frm());
   gist_order_.Stop();
   #else
-  NonbondCuda(frm);
+  if (! this->skipE_)
+    NonbondCuda(frm);
   #endif
 
   gist_action_.Stop();
@@ -1040,62 +1055,65 @@ void Action_GIST::Print() {
   double Vvox = gO_->Bin().VoxelVolume();
 
   mprintf("    GIST OUTPUT:\n");
+  
+  // The variables are kept outside, so that they are declared for later use.
   // Calculate orientational entropy
   DataSet_GridFlt& dTSorient_dens = static_cast<DataSet_GridFlt&>( *dTSorient_ );
   Farray dTSorient_norm( MAX_GRID_PT_, 0.0 );
   double dTSorienttot = 0;
   int nwtt = 0;
   double dTSo = 0;
-  // LOOP over all voxels
-  mprintf("\tCalculating orientational entropy:\n");
-  ProgressBar oe_progress( MAX_GRID_PT_ );
-  for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) {
-    oe_progress.Update( gr_pt );
-    dTSorient_dens[gr_pt] = 0;
-    dTSorient_norm[gr_pt] = 0;
-    int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
-    nwtt += nw_total;
-    //mprintf("DEBUG1: %u nw_total %i\n", gr_pt, nw_total);
-    if (nw_total > 1) {
-      for (int n0 = 0; n0 < nw_total; n0++)
-      {
-        double NNr = 10000;
-        int q0 = n0 * 4; // Index into voxel_Q_ for n0
-        for (int n1 = 0; n1 < nw_total; n1++)
+  if (! this->skipS_) {
+    // LOOP over all voxels
+    mprintf("\tCalculating orientational entropy:\n");
+    ProgressBar oe_progress( MAX_GRID_PT_ );
+    for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) {
+      oe_progress.Update( gr_pt );
+      dTSorient_dens[gr_pt] = 0;
+      dTSorient_norm[gr_pt] = 0;
+      int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
+      nwtt += nw_total;
+      //mprintf("DEBUG1: %u nw_total %i\n", gr_pt, nw_total);
+      if (nw_total > 1) {
+        for (int n0 = 0; n0 < nw_total; n0++)
         {
-          if (n0 != n1) {
-            int q1 = n1 * 4; // Index into voxel_Q_ for n1
-            double rR = 2.0 * acos(  fabs(voxel_Q_[gr_pt][q1  ] * voxel_Q_[gr_pt][q0  ]
+          double NNr = 10000;
+          int q0 = n0 * 4; // Index into voxel_Q_ for n0
+          for (int n1 = 0; n1 < nw_total; n1++)
+          {
+            if (n0 != n1) {
+              int q1 = n1 * 4; // Index into voxel_Q_ for n1
+              double rR = 2.0 * acos(  fabs(voxel_Q_[gr_pt][q1  ] * voxel_Q_[gr_pt][q0  ]
                                    + voxel_Q_[gr_pt][q1+1] * voxel_Q_[gr_pt][q0+1]
                                    + voxel_Q_[gr_pt][q1+2] * voxel_Q_[gr_pt][q0+2]
                                    + voxel_Q_[gr_pt][q1+3] * voxel_Q_[gr_pt][q0+3] )); // add fabs for quaternion distance calculation
-            //mprintf("DEBUG1: %g\n", rR);
-            if (rR > 0 && rR < NNr) NNr = rR;
-          }
-        } // END inner loop over all waters for this voxel
-
-        if (NNr < 9999 && NNr > 0) {
-          double dbl = log(NNr*NNr*NNr*nw_total / (3.0*Constants::TWOPI));
-          //mprintf("DEBUG1: dbl %f\n", dbl);
-          dTSorient_norm[gr_pt] += dbl;
-          dTSo += dbl;
-        }
-      } // END outer loop over all waters for this voxel
-      //mprintf("DEBUG1: dTSorient_norm %f\n", dTSorient_norm[gr_pt]);
-      dTSorient_norm[gr_pt] = Constants::GASK_KCAL * temperature_ * 
-                               ((dTSorient_norm[gr_pt]/nw_total) + Constants::EULER_MASC);
-      double dtso_norm_nw = (double)dTSorient_norm[gr_pt] * (double)nw_total;
-      dTSorient_dens[gr_pt] = (dtso_norm_nw / (NFRAME_ * Vvox));
-      dTSorienttot += dTSorient_dens[gr_pt];
-      //mprintf("DEBUG1: %f\n", dTSorienttot);
-    }
-  } // END loop over all grid points (voxels)
-  dTSorienttot *= Vvox;
-  infofile_->Printf("Maximum number of waters found in one voxel for %d frames = %d\n",
-                    NFRAME_, max_nwat_);
-  infofile_->Printf("Total referenced orientational entropy of the grid:"
-                    " dTSorient = %9.5f kcal/mol, Nf=%d\n", dTSorienttot, NFRAME_);
+              //mprintf("DEBUG1: %g\n", rR);
+              if (rR > 0 && rR < NNr) NNr = rR;
+            }
+          } // END inner loop over all waters for this voxel
 
+          if (NNr < 9999 && NNr > 0) {
+            double dbl = log(NNr*NNr*NNr*nw_total / (3.0*Constants::TWOPI));
+            //mprintf("DEBUG1: dbl %f\n", dbl);
+            dTSorient_norm[gr_pt] += dbl;
+            dTSo += dbl;
+          }
+        } // END outer loop over all waters for this voxel
+        //mprintf("DEBUG1: dTSorient_norm %f\n", dTSorient_norm[gr_pt]);
+        dTSorient_norm[gr_pt] = Constants::GASK_KCAL * temperature_ * 
+                                ((dTSorient_norm[gr_pt]/nw_total) + Constants::EULER_MASC);
+        double dtso_norm_nw = (double)dTSorient_norm[gr_pt] * (double)nw_total;
+        dTSorient_dens[gr_pt] = (dtso_norm_nw / (NFRAME_ * Vvox));
+        dTSorienttot += dTSorient_dens[gr_pt];
+        //mprintf("DEBUG1: %f\n", dTSorienttot);
+      }
+    } // END loop over all grid points (voxels)
+    dTSorienttot *= Vvox;
+    infofile_->Printf("Maximum number of waters found in one voxel for %d frames = %d\n",
+                      NFRAME_, max_nwat_);
+    infofile_->Printf("Total referenced orientational entropy of the grid:"
+                      " dTSorient = %9.5f kcal/mol, Nf=%d\n", dTSorienttot, NFRAME_);
+  }
   // Compute translational entropy for each voxel
   double dTStranstot = 0.0;
   double dTSt = 0.0;
@@ -1113,8 +1131,12 @@ void Action_GIST::Print() {
   DataSet_GridFlt& dTSsix = static_cast<DataSet_GridFlt&>( *dTSsix_ );
   Farray dTStrans_norm( MAX_GRID_PT_, 0.0 );
   Farray dTSsix_norm( MAX_GRID_PT_, 0.0 );
+  
   // Loop over all grid points
-  mprintf("\tCalculating translational entropy:\n");
+  if (! this->skipS_)
+    mprintf("\tCalculating translational entropy:\n");
+  else
+    mprintf("Calculating Densities:\n");
   ProgressBar te_progress( MAX_GRID_PT_ );
   for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) {
     te_progress.Update( gr_pt );
@@ -1122,126 +1144,126 @@ void Action_GIST::Print() {
     double W_dens = 1.0 * N_waters_[gr_pt] / (NFRAME_*Vvox);
     gO[gr_pt] = W_dens / BULK_DENS_;
     gH[gr_pt] = 1.0 * N_hydrogens_[gr_pt] / (NFRAME_*Vvox*2*BULK_DENS_);
-
-    int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
-    for (int n0 = 0; n0 < nw_total; n0++)
-    {
-      double NNd = 10000;
-      double NNs = 10000;
-      int i0 = n0 * 3; // index into voxel_xyz_ for n0
-      float VX = voxel_xyz_[gr_pt][i0  ];
-      float VY = voxel_xyz_[gr_pt][i0+1];
-      float VZ = voxel_xyz_[gr_pt][i0+2];
-      int q0 = n0 * 4;  // index into voxel_Q_ for n0
-      float W4 = voxel_Q_[gr_pt][q0  ];
-      float X4 = voxel_Q_[gr_pt][q0+1];
-      float Y4 = voxel_Q_[gr_pt][q0+2];
-      float Z4 = voxel_Q_[gr_pt][q0+3];
-      // First do own voxel
-      for (int n1 = 0; n1 < nw_total; n1++) {
-        if ( n1 != n0) {
-          int i1 = n1 * 3; // index into voxel_xyz_ for n1
-          double dx = (double)(VX - voxel_xyz_[gr_pt][i1  ]);
-          double dy = (double)(VY - voxel_xyz_[gr_pt][i1+1]);
-          double dz = (double)(VZ - voxel_xyz_[gr_pt][i1+2]);
-          double dd = dx*dx+dy*dy+dz*dz;
-          if (dd < NNd && dd > 0) { NNd = dd; }
-          int q1 = n1 * 4; // index into voxel_Q_ for n1
-          double rR = 2 * acos( fabs(W4*voxel_Q_[gr_pt][q1  ] +
-                                X4*voxel_Q_[gr_pt][q1+1] +
-                                Y4*voxel_Q_[gr_pt][q1+2] +
-                                Z4*voxel_Q_[gr_pt][q1+3] )); //add fabs for quaternion distance calculation
-          double ds = rR*rR + dd;
-          if (ds < NNs && ds > 0) { NNs = ds; }
-        }
-      } // END self loop over all waters for this voxel
-      //mprintf("DEBUG1: self NNd=%f NNs=%f\n", NNd, NNs);
-      // Determine which directions are possible.
-      bool cannotAddZ = (nz == 0 || ( gr_pt%nz == nz-1 ));
-      bool cannotAddY = ((nz == 0 || ny-1 == 0) || ( gr_pt%(nz*(ny-1)+(numplane*addx)) < nz));
-      bool cannotAddX = (gr_pt >= addx * (nx-1) && gr_pt < addx * nx );
-      bool cannotSubZ = (nz == 0 || gr_pt%nz == 0);
-      bool cannotSubY = ((nz == 0 || ny == 0) || (gr_pt%addx < nz));
-      bool cannotSubX = ((nz == 0 || ny == 0) || (gr_pt < addx));
-      bool boundary = ( cannotAddZ || cannotAddY || cannotAddX ||
-                        cannotSubZ || cannotSubY || cannotSubX );
-      if (!boundary) {
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy - addx, NNd, NNs);
-
-        // add the 8 more voxels for NNr searching
-
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy - addz, NNd, NNs);
-
-
-
-        NNd = sqrt(NNd);
-        NNs = sqrt(NNs);
-
-        if (NNd < 3 && NNd > 0/*NNd < 9999 && NNd > 0*/) {
-          double dbl = log((NNd*NNd*NNd*NFRAME_*4*Constants::PI*BULK_DENS_)/3);
-          dTStrans_norm[gr_pt] += dbl;
-          dTSt += dbl;
-          dbl = log((NNs*NNs*NNs*NNs*NNs*NNs*NFRAME_*Constants::PI*BULK_DENS_)/48);
-          dTSsix_norm[gr_pt] += dbl;
-          dTSs += dbl;
-          //mprintf("DEBUG1: dbl=%f NNs=%f\n", dbl, NNs);
+    if (! this->skipS_) {
+      int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
+      for (int n0 = 0; n0 < nw_total; n0++)
+      {
+        double NNd = 10000;
+        double NNs = 10000;
+        int i0 = n0 * 3; // index into voxel_xyz_ for n0
+        float VX = voxel_xyz_[gr_pt][i0  ];
+        float VY = voxel_xyz_[gr_pt][i0+1];
+        float VZ = voxel_xyz_[gr_pt][i0+2];
+        int q0 = n0 * 4;  // index into voxel_Q_ for n0
+        float W4 = voxel_Q_[gr_pt][q0  ];
+        float X4 = voxel_Q_[gr_pt][q0+1];
+        float Y4 = voxel_Q_[gr_pt][q0+2];
+        float Z4 = voxel_Q_[gr_pt][q0+3];
+        // First do own voxel
+        for (int n1 = 0; n1 < nw_total; n1++) {
+          if ( n1 != n0) {
+            int i1 = n1 * 3; // index into voxel_xyz_ for n1
+            double dx = (double)(VX - voxel_xyz_[gr_pt][i1  ]);
+            double dy = (double)(VY - voxel_xyz_[gr_pt][i1+1]);
+            double dz = (double)(VZ - voxel_xyz_[gr_pt][i1+2]);
+            double dd = dx*dx+dy*dy+dz*dz;
+            if (dd < NNd && dd > 0) { NNd = dd; }
+            int q1 = n1 * 4; // index into voxel_Q_ for n1
+            double rR = 2 * acos( fabs(W4*voxel_Q_[gr_pt][q1  ] +
+                                  X4*voxel_Q_[gr_pt][q1+1] +
+                                  Y4*voxel_Q_[gr_pt][q1+2] +
+                                  Z4*voxel_Q_[gr_pt][q1+3] )); //add fabs for quaternion distance calculation
+            double ds = rR*rR + dd;
+            if (ds < NNs && ds > 0) { NNs = ds; }
+          }
+        } // END self loop over all waters for this voxel
+        //mprintf("DEBUG1: self NNd=%f NNs=%f\n", NNd, NNs);
+        // Determine which directions are possible.
+        bool cannotAddZ = (nz == 0 || ( gr_pt%nz == nz-1 ));
+        bool cannotAddY = ((nz == 0 || ny-1 == 0) || ( gr_pt%(nz*(ny-1)+(numplane*addx)) < nz));
+        bool cannotAddX = (gr_pt >= addx * (nx-1) && gr_pt < addx * nx );
+        bool cannotSubZ = (nz == 0 || gr_pt%nz == 0);
+        bool cannotSubY = ((nz == 0 || ny == 0) || (gr_pt%addx < nz));
+        bool cannotSubX = ((nz == 0 || ny == 0) || (gr_pt < addx));
+        bool boundary = ( cannotAddZ || cannotAddY || cannotAddX ||
+                          cannotSubZ || cannotSubY || cannotSubX );
+        if (!boundary) {
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy - addx, NNd, NNs);
+
+          // add the 8 more voxels for NNr searching
+
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy - addz, NNd, NNs);
+
+
+          NNd = sqrt(NNd);
+          NNs = sqrt(NNs);
+
+          if (NNd < 3 && NNd > 0/*NNd < 9999 && NNd > 0*/) {
+            double dbl = log((NNd*NNd*NNd*NFRAME_*4*Constants::PI*BULK_DENS_)/3);
+            dTStrans_norm[gr_pt] += dbl;
+            dTSt += dbl;
+            dbl = log((NNs*NNs*NNs*NNs*NNs*NNs*NFRAME_*Constants::PI*BULK_DENS_)/48);
+            dTSsix_norm[gr_pt] += dbl;
+            dTSs += dbl;
+            //mprintf("DEBUG1: dbl=%f NNs=%f\n", dbl, NNs);
+          }
         }
+      } // END loop over all waters for this voxel
+      if (dTStrans_norm[gr_pt] != 0) {
+        nwts += nw_total;
+        dTStrans_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTStrans_norm[gr_pt]/nw_total) +
+                                                                  Constants::EULER_MASC );
+        dTSsix_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTSsix_norm[gr_pt]/nw_total) +
+                                                                Constants::EULER_MASC );
       }
-    } // END loop over all waters for this voxel
-    if (dTStrans_norm[gr_pt] != 0) {
-      nwts += nw_total;
-      dTStrans_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTStrans_norm[gr_pt]/nw_total) +
-                                                                 Constants::EULER_MASC );
-      dTSsix_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTSsix_norm[gr_pt]/nw_total) +
-                                                               Constants::EULER_MASC );
+      double dtst_norm_nw = (double)dTStrans_norm[gr_pt] * (double)nw_total;
+      dTStrans[gr_pt] = (dtst_norm_nw / (NFRAME_*Vvox));
+      double dtss_norm_nw = (double)dTSsix_norm[gr_pt] * (double)nw_total;
+      dTSsix[gr_pt] = (dtss_norm_nw / (NFRAME_*Vvox));
+      dTStranstot += dTStrans[gr_pt];
+    } // END loop over all grid points (voxels)
+  }
+  if (!this->skipS_) {
+    dTStranstot *= Vvox;
+    double dTSst = 0.0;
+    double dTStt = 0.0;
+    if (nwts > 0) {
+      dTSst = Constants::GASK_KCAL*temperature_*((dTSs/nwts) + Constants::EULER_MASC);
+      dTStt = Constants::GASK_KCAL*temperature_*((dTSt/nwts) + Constants::EULER_MASC);
     }
-    double dtst_norm_nw = (double)dTStrans_norm[gr_pt] * (double)nw_total;
-    dTStrans[gr_pt] = (dtst_norm_nw / (NFRAME_*Vvox));
-    double dtss_norm_nw = (double)dTSsix_norm[gr_pt] * (double)nw_total;
-    dTSsix[gr_pt] = (dtss_norm_nw / (NFRAME_*Vvox));
-    dTStranstot += dTStrans[gr_pt];
-  } // END loop over all grid points (voxels)
-
-  dTStranstot *= Vvox;
-  double dTSst = 0.0;
-  double dTStt = 0.0;
-  if (nwts > 0) {
-    dTSst = Constants::GASK_KCAL*temperature_*((dTSs/nwts) + Constants::EULER_MASC);
-    dTStt = Constants::GASK_KCAL*temperature_*((dTSt/nwts) + Constants::EULER_MASC);
+    double dTSot = Constants::GASK_KCAL*temperature_*((dTSo/nwtt) + Constants::EULER_MASC);
+    infofile_->Printf("watcount in vol = %d\n", nwtt);
+    infofile_->Printf("watcount in subvol = %d\n", nwts);
+    infofile_->Printf("Total referenced translational entropy of the grid:"
+                      " dTStrans = %9.5f kcal/mol, Nf=%d\n", dTStranstot, NFRAME_);
+    infofile_->Printf("Total 6d if all one vox: %9.5f kcal/mol\n", dTSst);
+    infofile_->Printf("Total t if all one vox: %9.5f kcal/mol\n", dTStt);
+    infofile_->Printf("Total o if all one vox: %9.5f kcal/mol\n", dTSot);
   }
-  double dTSot = Constants::GASK_KCAL*temperature_*((dTSo/nwtt) + Constants::EULER_MASC);
-  infofile_->Printf("watcount in vol = %d\n", nwtt);
-  infofile_->Printf("watcount in subvol = %d\n", nwts);
-  infofile_->Printf("Total referenced translational entropy of the grid:"
-                    " dTStrans = %9.5f kcal/mol, Nf=%d\n", dTStranstot, NFRAME_);
-  infofile_->Printf("Total 6d if all one vox: %9.5f kcal/mol\n", dTSst);
-  infofile_->Printf("Total t if all one vox: %9.5f kcal/mol\n", dTStt);
-  infofile_->Printf("Total o if all one vox: %9.5f kcal/mol\n", dTSot);
-
   // Compute average voxel energy. Allocate these sets even if skipping energy
   // to be consistent with previous output.
   DataSet_GridFlt& Esw_dens = static_cast<DataSet_GridFlt&>( *Esw_ );
@@ -1263,8 +1285,10 @@ void Action_GIST::Print() {
     Darray const& E_VV_Elec = E_VV_Elec_[0];
     #endif
     Farray const& Neighbor = neighbor_[0];
+    #ifndef CUDA
     // Sum values from other threads if necessary
     SumEVV();
+    #endif
     static const double DEBYE_EA = 0.20822678; // 1 Debye in eA
     double Eswtot = 0.0;
     double Ewwtot = 0.0;
@@ -1319,6 +1343,17 @@ void Action_GIST::Print() {
     infofile_->Printf("Total water-solute energy of the grid: Esw = %9.5f kcal/mol\n", Eswtot);
     infofile_->Printf("Total unreferenced water-water energy of the grid: Eww = %9.5f kcal/mol\n",
                       Ewwtot);
+  } else {
+    static const double DEBYE_EA = 0.20822678;
+    for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++)
+    {
+      dipolex[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
+      dipoley[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
+      dipolez[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
+      pol[gr_pt] = sqrt( dipolex[gr_pt]*dipolex[gr_pt] +
+                         dipoley[gr_pt]*dipoley[gr_pt] +
+                         dipolez[gr_pt]*dipolez[gr_pt] );
+    }
   }
 
   // Write the GIST output file.
@@ -1392,9 +1427,9 @@ void Action_GIST::Print() {
   gist_order_.WriteTiming(2,   "Order: ", gist_action_.Total());
   gist_print_.WriteTiming(1,   "Print:", total);
   mprintf("TIME:\tTotal: %.4f s\n", total);
-	#ifdef CUDA
-	this->freeGPUMemory();
-	#endif
+  #ifdef CUDA
+  this->freeGPUMemory();
+  #endif
 }
 
 #ifdef CUDA
diff --git a/src/Action_GIST.h b/src/Action_GIST.h
index 62a64f76d8..7397d4dcf8 100644
--- a/src/Action_GIST.h
+++ b/src/Action_GIST.h
@@ -161,5 +161,6 @@ class Action_GIST : public Action {
     bool doEij_;               ///< If true do the i-j energy calc
     bool skipE_;               ///< If true skip the nonbond energy calc
     bool includeIons_;         ///< If true include ions in solute region.
+		bool skipS_;               ///< If true does not calculate entropy
 };
 #endif

From c80f1c16fddb2b0679662c4de0b5c0a4845cae06 Mon Sep 17 00:00:00 2001
From: jokr91 <hannes.kraml@gmail.com>
Date: Tue, 17 Dec 2019 14:30:21 +0100
Subject: [PATCH 3/4] Updated Documentation, Change Version

Added a paragraph with a couple of tips for the GPU implementation and
also added a statement concerning the new skipS keyword.
Changed the version number of cpptraj in src/Version.h to 4.25.0.
---
 doc/cpptraj.lyx                   |  27 +++-
 src/Action_GIST.cpp               |   2 +-
 src/Action_GIST.h                 |   8 +-
 src/Version.h                     |   2 +-
 src/cuda_kernels/GistCudaCalc.cuh | 196 +++++++++++++++---------------
 test/Test_GIST/RunTest.sh         |   4 +-
 6 files changed, 132 insertions(+), 107 deletions(-)

diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx
index 7936bc00ff..0d1b1e1528 100644
--- a/doc/cpptraj.lyx
+++ b/doc/cpptraj.lyx
@@ -22490,7 +22490,7 @@ gist
 
 \end_inset
 
- [doorder] [doeij] [skipE] [refdens <rdval>] [temp <tval>]
+ [doorder] [doeij] [skipS] [skipE] [refdens <rdval>] [temp <tval>]
 \end_layout
 
 \begin_layout LyX-Code
@@ -22543,6 +22543,10 @@ literal "true"
 ).
 \end_layout
 
+\begin_layout Description
+[skipS] Skip all entropy calculations.
+\end_layout
+
 \begin_layout Description
 
 \series bold
@@ -22922,6 +22926,27 @@ refdens
  keyword, instead of allowing GIST to supply the default value.
 \end_layout
 
+\begin_layout Standard
+For GIST, a GPU accelerated version is available, in which the interaction
+ energy is calculated using CUDA.
+ When using the GPU accelerated version of GIST, the 
+\series bold
+doeij 
+\series default
+keyword is not available.
+ It is recommended to use a grid covering the entire box, when using the
+ GPU implementation.
+ You may also choose a smaller grid, but all interaction energies, i.e., each
+ atom with each atom, will always be calculated independent of the chosen
+ grid.
+ This ensures optimum performance when calculating the interaction energies.
+ Thus, the additional time required to calculate the order parameters (
+\series bold
+doorder
+\series default
+) is negligible.
+\end_layout
+
 \begin_layout Subsubsection*
 \paragraph_spacing other 3
 \noindent
diff --git a/src/Action_GIST.cpp b/src/Action_GIST.cpp
index 3b70fd2840..fbaf39c910 100644
--- a/src/Action_GIST.cpp
+++ b/src/Action_GIST.cpp
@@ -282,7 +282,7 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
 #   endif
 
     #ifdef CUDA
-    if (this->skipE_ && this->doorder) {
+    if (this->skipE_ && this->doOrder_) {
       mprintf("When the keyword \"skipE\" is supplied, \"doorder\" cannot be"
               " chosen, as both calculations are done on the GPU at the same"
               " time.\nIgnoring \"doorder!\"\n");
diff --git a/src/Action_GIST.h b/src/Action_GIST.h
index 7397d4dcf8..f3f61e25dd 100644
--- a/src/Action_GIST.h
+++ b/src/Action_GIST.h
@@ -15,9 +15,9 @@
 class Action_GIST : public Action {
   public:
     Action_GIST();
-		#ifdef CUDA
-		~Action_GIST() {delete[] this->solvent_;}
-		#endif
+    #ifdef CUDA
+    ~Action_GIST() {delete[] this->solvent_;}
+    #endif
     DispatchObject* Alloc() const { return (DispatchObject*)new Action_GIST(); }
     void Help() const;
   private:
@@ -161,6 +161,6 @@ class Action_GIST : public Action {
     bool doEij_;               ///< If true do the i-j energy calc
     bool skipE_;               ///< If true skip the nonbond energy calc
     bool includeIons_;         ///< If true include ions in solute region.
-		bool skipS_;               ///< If true does not calculate entropy
+    bool skipS_;               ///< If true does not calculate entropy
 };
 #endif
diff --git a/src/Version.h b/src/Version.h
index b254030f4a..288607a342 100644
--- a/src/Version.h
+++ b/src/Version.h
@@ -12,7 +12,7 @@
  * Whenever a number that precedes <revision> is incremented, all subsequent
  * numbers should be reset to 0.
  */
-#define CPPTRAJ_INTERNAL_VERSION "V4.24.0"
+#define CPPTRAJ_INTERNAL_VERSION "V4.25.0"
 /// PYTRAJ relies on this
 #define CPPTRAJ_VERSION_STRING CPPTRAJ_INTERNAL_VERSION
 #endif
diff --git a/src/cuda_kernels/GistCudaCalc.cuh b/src/cuda_kernels/GistCudaCalc.cuh
index ed881fa169..9f46d9b798 100644
--- a/src/cuda_kernels/GistCudaCalc.cuh
+++ b/src/cuda_kernels/GistCudaCalc.cuh
@@ -1,7 +1,7 @@
 #ifndef GIST_CUDA_CALC_CUH
 #define GIST_CUDA_CALC_CUH
 
-#define HUGE_C 1e200
+#define HUGE_C 1e30f
 #define BLOCKSIZE 16
 #define SLOW_BLOCKSIZE 512
 
@@ -20,14 +20,14 @@ public:
   float y;
   float z;
 
-	// Empty Constructor
+  // Empty Constructor
   __host__ __device__
   Coordinates(): x(0), y(0), z(0) {}
 
-	/**
-	 * Constructor using an array of three values.
-	 * @param array: The array that holds the x, y and z coordinates
-	 */
+  /**
+   * Constructor using an array of three values.
+   * @param array: The array that holds the x, y and z coordinates
+   */
   __host__ __device__
   Coordinates(const double *array) {
     this->x = array[0];
@@ -35,10 +35,10 @@ public:
     this->z = array[2];
   }
 
-	/**
-	 * Copy Constructor.
-	 * @param other: The other object.
-	 */
+  /**
+   * Copy Constructor.
+   * @param other: The other object.
+   */
   __host__ __device__
   Coordinates(const Coordinates &other) {
     this->x = other.x;
@@ -46,11 +46,11 @@ public:
     this->z = other.z;
   }
 
-	/**
-	 * The assignment operator for an object of this class.
-	 * @param other: The other Coordinate object.
-	 * @return This object.
-	 */
+  /**
+   * The assignment operator for an object of this class.
+   * @param other: The other Coordinate object.
+   * @return This object.
+   */
   __host__ __device__
   Coordinates &operator=(const Coordinates &other) {
     this->x = other.x;
@@ -71,46 +71,46 @@ public:
   float A;
   float B;
 
-	// Empty constructor
+  // Empty constructor
   __host__ __device__
   ParamsLJ(): A(0), B(0) {}
 
-	/**
-	 * Constructor from an array of values.
-	 * @param arr: The array that holds the A and B values.
-	 */
+  /**
+   * Constructor from an array of values.
+   * @param arr: The array that holds the A and B values.
+   */
   __host__ __device__
   ParamsLJ(float *arr) {
     this->A = arr[0];
     this->B = arr[1];
   }
 
-	/**
-	 * Constructor using two different floats.
-	 * @param A: The A value in the Lennard-Jones equation.
-	 * @param B: The B value in the Lennard-Jones equation.
-	 */
+  /**
+   * Constructor using two different floats.
+   * @param A: The A value in the Lennard-Jones equation.
+   * @param B: The B value in the Lennard-Jones equation.
+   */
   __host__ __device__
   ParamsLJ(float A, float B) {
     this->A = A;
     this->B = B;
   }
 
-	/**
-	 * Copy constructor for this class.
-	 * @param other: The other object of this class
-	 */
+  /**
+   * Copy constructor for this class.
+   * @param other: The other object of this class
+   */
   __host__ __device__
   ParamsLJ(const ParamsLJ &other) {
     this->A = other.A;
     this->B = other.B;
   }
 
-	/**
-	 * The assignment operator of this class.
-	 * @param other: The other object of this type.
-	 * @return this object.
-	 */
+  /**
+   * The assignment operator of this class.
+   * @param other: The other object of this type.
+   * @return this object.
+   */
   __host__ __device__
   ParamsLJ &operator=(const ParamsLJ &other) {
     this->A = other.A;
@@ -125,17 +125,17 @@ public:
  */
 class UnitCell {
 public:
-	float array[9];
-	
-	// The empty constuctor.
+  float array[9];
+  
+  // The empty constuctor.
   __host__ __device__
   UnitCell() {}
 
-	/**
-	 * Constructor using an array.
-	 * @param arr: The array from which the values should be
-	 *             taken.
-	 */
+  /**
+   * Constructor using an array.
+   * @param arr: The array from which the values should be
+   *             taken.
+   */
   __host__ __device__
   UnitCell(float *arr) {
     this->array[0] = arr[0];
@@ -149,10 +149,10 @@ public:
     this->array[8] = arr[8];
   }
 
-	/**
-	 * Copy constructor.
-	 * @param other: The other object.
-	 */
+  /**
+   * Copy constructor.
+   * @param other: The other object.
+   */
   __host__ __device__
   UnitCell(const UnitCell &other) {
     this->array[0] = other.array[0];
@@ -166,11 +166,11 @@ public:
     this->array[8] = other.array[8];
   }
 
-	/**
-	 * Assignment operator.
-	 * @param other: The other object.
-	 * @return This object.
-	 */
+  /**
+   * Assignment operator.
+   * @param other: The other object.
+   * @return This object.
+   */
   __host__ __device__
   UnitCell &operator=(const UnitCell &other){
     this->array[0] = other.array[0];
@@ -185,12 +185,12 @@ public:
     return *this;
   }
 
-	/**
-	 * Access operator.
-	 * @param idx: The index which should be accessed.
-	 *
-	 * @exceptsafe Not safe.
-	 */
+  /**
+   * Access operator.
+   * @param idx: The index which should be accessed.
+   *
+   * @exceptsafe Not safe.
+   */
   __host__ __device__
   float operator[](int idx) {
     if (idx >= 0 && idx < 9) {
@@ -208,17 +208,17 @@ public:
 class BoxInfo {
 public:
   float array[9]; ///< box dimensions
-	int boxinfo;
-	
-	// Empty constructor
+  int boxinfo;
+  
+  // Empty constructor
   __host__ __device__
   BoxInfo(): boxinfo(0) {}
 
-	/**
-	 * Constructor using an array and the boxinfo.
-	 * @param arr: The array holding the values for the box dimensions.
-	 * @param boxinfo: Which kind of box this is.
-	 */
+  /**
+   * Constructor using an array and the boxinfo.
+   * @param arr: The array holding the values for the box dimensions.
+   * @param boxinfo: Which kind of box this is.
+   */
   __host__ __device__
   BoxInfo(float *arr, int boxinfo) {
     this->array[0] = arr[0];
@@ -233,10 +233,10 @@ public:
     this->boxinfo = boxinfo;
   }
 
-	/**
-	 * Copy constuctor.
-	 * @param other: The other object.
-	 */
+  /**
+   * Copy constuctor.
+   * @param other: The other object.
+   */
   __host__ __device__
   BoxInfo(const BoxInfo &other) {
     this->array[0] = other.array[0];
@@ -251,11 +251,11 @@ public:
     this->boxinfo = other.boxinfo;
   }
 
-	/**
-	 * Assignement operator.
-	 * @param other: The other object.
-	 * @return This object.
-	 */
+  /**
+   * Assignement operator.
+   * @param other: The other object.
+   * @return This object.
+   */
   __host__ __device__
   BoxInfo &operator=(const BoxInfo &other){
     this->array[0] = other.array[0];
@@ -271,12 +271,12 @@ public:
     return *this;
   }
 
-	/**
-	 * Access operator.
-	 * @param idx: The index at which to access the array.
-	 * @return The value stored at that point in the array or 0 if none is found.
-	 * @exceptsafe Is not exception safe.
-	 */
+  /**
+   * Access operator.
+   * @param idx: The index at which to access the array.
+   * @return The value stored at that point in the array or 0 if none is found.
+   * @exceptsafe Is not exception safe.
+   */
   __host__ __device__
   float operator[](int idx) {
     if (idx < 9 && idx >= 0) {
@@ -299,17 +299,17 @@ public:
   bool solvent;
   int molecule;
 
-	// Empty constructor
+  // Empty constructor
   __host__ __device__
   AtomProperties() {}
 
-	/**
-	 * Constructor.
-	 * @param charge: The charge of the atom.
-	 * @param atomType: The atom type for access of the lennard-jones parameters.
-	 * @param solvent: Is this atom a solvent atom?
-	 * @param molecule: The molecule this atom belongs to.
-	 */
+  /**
+   * Constructor.
+   * @param charge: The charge of the atom.
+   * @param atomType: The atom type for access of the lennard-jones parameters.
+   * @param solvent: Is this atom a solvent atom?
+   * @param molecule: The molecule this atom belongs to.
+   */
   __host__ __device__
   AtomProperties(float charge, int atomType, bool solvent, int molecule) {
     this->charge = charge;
@@ -318,10 +318,10 @@ public:
     this->molecule = molecule;
   }
 
-	/**
-	 * Copy constructor.
-	 * @param other: The other object.
-	 */
+  /**
+   * Copy constructor.
+   * @param other: The other object.
+   */
   __host__ __device__
   AtomProperties(const AtomProperties &other) {
     this->charge = other.charge;
@@ -330,11 +330,11 @@ public:
     this->molecule = other.molecule;
   }
 
-	/**
-	 * Assignement operator.
-	 * @param other: The other object.
-	 * @return This object.
-	 */
+  /**
+   * Assignement operator.
+   * @param other: The other object.
+   * @return This object.
+   */
   __host__  __device__
   AtomProperties &operator=(const AtomProperties &other) {
     this->charge = other.charge;
@@ -364,4 +364,4 @@ __device__ float calcDist(float *, float *, BoxInfo, UnitCell);
 // Global functions
 __global__ void cudaCalcEnergy    (Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *, float *, float *, int, float, int *, int *);
 __global__ void cudaCalcEnergySlow(Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *,	float *, float *, int, float, int *, int *);
-#endif
\ No newline at end of file
+#endif
diff --git a/test/Test_GIST/RunTest.sh b/test/Test_GIST/RunTest.sh
index 9ee00ca573..28edac51bb 100755
--- a/test/Test_GIST/RunTest.sh
+++ b/test/Test_GIST/RunTest.sh
@@ -17,7 +17,7 @@ if [ $? -eq 0 ]; then
   trajin ../tz2.ortho.nc 1 10
   autoimage origin
   gist doorder doeij refdens 0.033422885325 gridcntr 1.44 0.67 0.29 \
-		griddim 10 12 10 gridspacn 2.0 prefix Gist1
+    griddim 10 12 10 gridspacn 2.0 prefix Gist1
   go
 EOF
   RunCpptraj "GIST water-water interaction test"
@@ -31,7 +31,7 @@ parm ../tz2.ortho.parm7
 trajin ../tz2.ortho.nc 1 10
 autoimage origin
 gist doorder refdens 0.033422885325 gridcntr 1.5 1.0 0.0 \
-		griddim 34 44 36 gridspacn 0.50 prefix Gist2 info Info.dat
+    griddim 34 44 36 gridspacn 0.50 prefix Gist2 info Info.dat
 go
 EOF
 RunCpptraj "GIST test"

From 12a6f53f79e0c76709ca6d4a90702be31df7601f Mon Sep 17 00:00:00 2001
From: jokr91 <hannes.kraml@gmail.com>
Date: Wed, 8 Jan 2020 16:58:30 +0100
Subject: [PATCH 4/4] Changes to comply with reviews by @drroe

- Removed ConstantsG.cuh
- Fixed printf error
- Added gist to the CUDA enabled commands
---
 doc/cpptraj.lyx                    |  4 ++
 src/cuda_kernels/ConstantsG.cuh    | 69 ------------------------------
 src/cuda_kernels/GistCudaCalc.cu   |  2 +-
 src/cuda_kernels/GistCudaSetup.cu  |  1 -
 src/cuda_kernels/GistCudaSetup.cuh |  3 +-
 5 files changed, 7 insertions(+), 72 deletions(-)
 delete mode 100644 src/cuda_kernels/ConstantsG.cuh

diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx
index 0d1b1e1528..ad71d98000 100644
--- a/doc/cpptraj.lyx
+++ b/doc/cpptraj.lyx
@@ -1174,6 +1174,10 @@ closest
 watershell
 \end_layout
 
+\begin_layout LyX-Code
+gist
+\end_layout
+
 \begin_layout Section
 General Concepts
 \end_layout
diff --git a/src/cuda_kernels/ConstantsG.cuh b/src/cuda_kernels/ConstantsG.cuh
deleted file mode 100644
index 79b9ccea7f..0000000000
--- a/src/cuda_kernels/ConstantsG.cuh
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * This is simply taken from the cpptraj implementation by Daniel R. Roe and Mark J. Williamson.
- * All constants stay double, as otherwise precision might be lost. However, on the GPU a nice
- * balance between calculations in double and single precision are important (weighing mostly on
- * single precision). Therfore, to achieve maximum parallelization, cast to float if needed.
- */
-
-#ifndef CONSTANTS_CUH
-#define CONSTANTS_CUH
-
-namespace ConstantsG {
-  const double PI           = 3.141592653589793;
-  const double TWOPI        = 6.283185307179586;  
-  const double FOURPI       = 12.566370614359172;
-  const double FOURTHIRDSPI = 4.1887902047863909; 
-  const double FOURFIFTHSPI = 2.5132741228718345;
-  const double PIOVER2      = 1.5707963267948966; 
-  /// Convert degrees to radians
-  const double DEGRAD       = 0.017453292519943295; 
-  /// Convert radians to degrees
-  const double RADDEG       =   57.29577951308232;
-  /// For checking floating point zero
-  const double SMALL        = 0.00000000000001;
-  const double HUGEG        = 1E30;
-  /// Gas constant in J/mol*K
-  const double GASK_J       = 8.3144621;
-  /// Gas constant in kcal/mol*K
-  const double GASK_KCAL    = 0.0019872041;
-  /// Avogadro constant
-  const double NA = 6.02214129e23;
-  /// Speed of light (m/s)
-  const double C0 = 299792458;
-  /// Euler-Mascheroni constant
-  const double EULER_MASC = 0.5772156649;
-  /// Convert atomic mass unit (amu) to kg
-  const double AMU_TO_KG = 1.660539e-27;
-  /// Convert Angstroms to nanometers
-  const double ANG_TO_NM = 0.1;
-  /// Convert nanometers to Angstroms
-  const double NM_TO_ANG = 10.0;
-  /// Convert calories to Joules
-  const double CAL_TO_J = 4.184;
-  /// Convert Joules to calories
-  const double J_TO_CAL = 1.0 / CAL_TO_J;
-  /// Convert electron charge to Amber units (w/ prefactor)
-  /** NOTE: This value is actually very low precision, but is used since this
-    *       is the value used by LEaP. The actual conversion of Coulomb's
-    *       constant for use with Amber units (i.e. e-, Ang, and kcal/mol)
-    *       is (with NA representing Avogadro's number):
-    *         (8.987552e9 N*m^2/C^2) * (1.602177e-19 C/e-)^2 * (10^10 Ang/m) *
-    *         (1/4184 kcal/J) * NA = 332.06279350
-    *       The square root of this value is 18.222590197
-    */
-  const double ELECTOAMBER  = 18.2223;
-  /// Convert Amber charge to electron charge
-  const double AMBERTOELEC  = 1.0 / ELECTOAMBER;
-  /// Convert from Amber internal units of time (1/20.455 ps) to ps.
-  /** Amber operates in kcal/mol units for energy, amu for masses,
-    * and Angstoms for distances. For convenience when calculating KE from
-    * velocity, the velocities have a conversion factor built in; as a result
-    * the Amber unit of time is (1/20.455) ps. So e.g. to convert Amber
-    * velocities from internal units to Ang/ps multiply by 20.455. The number
-    * itself is derived from sqrt(1 / ((AMU_TO_KG * NA) / (1000 * CAL_TO_J))).
-    */
-const double AMBERTIME_TO_PS = 20.455;
-}
-
-
-#endif
\ No newline at end of file
diff --git a/src/cuda_kernels/GistCudaCalc.cu b/src/cuda_kernels/GistCudaCalc.cu
index b0eff73857..ddd08875a5 100644
--- a/src/cuda_kernels/GistCudaCalc.cu
+++ b/src/cuda_kernels/GistCudaCalc.cu
@@ -1,6 +1,6 @@
 #include "GistCudaCalc.cuh"
 
-#include <stdio.h>
+#include <cstdio>
 
 #define ELECTOAMBER_2 332.05221729
 
diff --git a/src/cuda_kernels/GistCudaSetup.cu b/src/cuda_kernels/GistCudaSetup.cu
index facdbdde17..d257da38b5 100644
--- a/src/cuda_kernels/GistCudaSetup.cu
+++ b/src/cuda_kernels/GistCudaSetup.cu
@@ -1,6 +1,5 @@
 #include "GistCudaSetup.cuh"
 #include "GistCudaCalc.cuh"
-#include <iostream>
 
 
 /**
diff --git a/src/cuda_kernels/GistCudaSetup.cuh b/src/cuda_kernels/GistCudaSetup.cuh
index d35be41875..9e13948efd 100644
--- a/src/cuda_kernels/GistCudaSetup.cuh
+++ b/src/cuda_kernels/GistCudaSetup.cuh
@@ -3,7 +3,8 @@
 
 #include <exception>
 #include <vector>
-#include <iostream>
+#include <cstdio>
+
 
 
 // Exception classes