diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx
index 7936bc00ff..ad71d98000 100644
--- a/doc/cpptraj.lyx
+++ b/doc/cpptraj.lyx
@@ -1174,6 +1174,10 @@ closest
 watershell
 \end_layout
 
+\begin_layout LyX-Code
+gist
+\end_layout
+
 \begin_layout Section
 General Concepts
 \end_layout
@@ -22490,7 +22494,7 @@ gist
 
 \end_inset
 
- [doorder] [doeij] [skipE] [refdens <rdval>] [temp <tval>]
+ [doorder] [doeij] [skipS] [skipE] [refdens <rdval>] [temp <tval>]
 \end_layout
 
 \begin_layout LyX-Code
@@ -22543,6 +22547,10 @@ literal "true"
 ).
 \end_layout
 
+\begin_layout Description
+[skipS] Skip all entropy calculations.
+\end_layout
+
 \begin_layout Description
 
 \series bold
@@ -22922,6 +22930,27 @@ refdens
  keyword, instead of allowing GIST to supply the default value.
 \end_layout
 
+\begin_layout Standard
+For GIST, a GPU accelerated version is available, in which the interaction
+ energy is calculated using CUDA.
+ When using the GPU accelerated version of GIST, the 
+\series bold
+doeij 
+\series default
+keyword is not available.
+ It is recommended to use a grid covering the entire box, when using the
+ GPU implementation.
+ You may also choose a smaller grid, but all interaction energies, i.e., each
+ atom with each atom, will always be calculated independent of the chosen
+ grid.
+ This ensures optimum performance when calculating the interaction energies.
+ Thus, the additional time required to calculate the order parameters (
+\series bold
+doorder
+\series default
+) is negligible.
+\end_layout
+
 \begin_layout Subsubsection*
 \paragraph_spacing other 3
 \noindent
diff --git a/src/Action_GIST.cpp b/src/Action_GIST.cpp
index c928fa26e3..109b58cf6f 100644
--- a/src/Action_GIST.cpp
+++ b/src/Action_GIST.cpp
@@ -15,6 +15,18 @@
 const double Action_GIST::maxD_ = DBL_MAX;
 
 Action_GIST::Action_GIST() :
+#ifdef CUDA
+  solvent_(NULL),
+  NBindex_c_(NULL),
+  molecule_c_(NULL),
+  paramsLJ_c_(NULL),
+  max_c_(NULL),
+  min_c_(NULL),
+  result_w_c_(NULL),
+  result_s_c_(NULL),
+  result_O_c_(NULL),
+  result_N_c_(NULL),
+#endif
   gO_(0),
   gH_(0),
   Esw_(0),
@@ -49,12 +61,17 @@ Action_GIST::Action_GIST() :
 {}
 
 void Action_GIST::Help() const {
-  mprintf("\t[doorder] [doeij] [skipE] [refdens <rdval>] [temp <tval>]\n"
+  mprintf("\t[doorder] [doeij] [skipE] [skipS] [refdens <rdval>] [temp <tval>]\n"
           "\t[noimage] [gridcntr <xval> <yval> <zval>] [excludeions]\n"
           "\t[griddim <xval> <yval> <zval>] [gridspacn <spaceval>]\n"
           "\t[prefix <filename prefix>] [ext <grid extension>] [out <output>]\n"
           "\t[info <info>]\n"
-          "Perform Grid Inhomogenous Solvation Theory calculation.\n");
+          "Perform Grid Inhomogenous Solvation Theory calculation.\n"
+#ifdef CUDA
+          "The option doeij is not available, when using the CUDA accelerated version,\n"
+          "as this would need way too much memory."
+#endif
+          );
 }
 
 Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int debugIn)
@@ -99,6 +116,12 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
   image_.InitImaging( !(actionArgs.hasKey("noimage")) );
   doOrder_ = actionArgs.hasKey("doorder");
   doEij_ = actionArgs.hasKey("doeij");
+#ifdef CUDA
+  if (this->doEij_) {
+    mprintf("Warning: 'doeij' cannot be specified when using CUDA. Setting is ignored");
+    this->doEij_ = false;
+  }
+#endif
   skipE_ = actionArgs.hasKey("skipE");
   if (skipE_) {
     if (doEij_) {
@@ -106,6 +129,9 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
       return Action::ERR;
     }
   }
+
+  this->skipS_ = actionArgs.hasKey("skipS");
+
   if (doEij_) {
     eijfile_ = init.DFL().AddCpptrajFile(prefix_ + "-Eww_ij.dat", "GIST Eij matrix file");
     if (eijfile_ == 0) return Action::ERR;
@@ -255,6 +281,14 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
       EIJ_EN_.resize( numthreads );
     }
 #   endif
+
+    #ifdef CUDA
+    if (this->skipE_ && this->doOrder_) {
+      mprintf("When the keyword \"skipE\" is supplied, \"doorder\" cannot be"
+              " chosen, as both calculations are done on the GPU at the same"
+              " time.\nIgnoring \"doorder!\"\n");
+    }
+    #endif
   }
 
   //Box gbox;
@@ -304,7 +338,12 @@ Action::RetType Action_GIST::Init(ArgList& actionArgs, ActionInit& init, int deb
           "#    Crystal Nguyen, Michael K. Gilson, and Tom Young, arXiv:1108.4876v1 (2011)\n"
           "#    Crystal N. Nguyen, Tom Kurtzman Young, and Michael K. Gilson,\n"
           "#      J. Chem. Phys. 137, 044101 (2012)\n"
-          "#    Lazaridis, J. Phys. Chem. B 102, 3531–3541 (1998)\n");
+          "#    Lazaridis, J. Phys. Chem. B 102, 3531–3541 (1998)\n"
+#ifdef CUDA
+          "#When using the GPU parallelized version of GIST, please cite:\n"
+          "#    Johannes Kraml, Anna S. Kamenik, Franz Waibl, Michael Schauperl, Klaus R. Liedl, JCTC (2019)\n"
+#endif
+          );
   gist_init_.Stop();
   return Action::OK;
 }
@@ -321,6 +360,10 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
     return Action::ERR;
   }
   image_.SetupImaging( setup.CoordInfo().TrajBox().Type() );
+  #ifdef CUDA
+  this->numberAtoms_ = setup.Top().Natom();
+  this->solvent_ = new bool[this->numberAtoms_];
+  #endif
 
   // Get molecule number for each solvent molecule
   //mol_nums_.clear();
@@ -340,6 +383,9 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
   {
     if (mol->IsSolvent()) {
       int o_idx = mol->BeginAtom();
+      #ifdef CUDA
+      this->headAtomType_ = setup.Top()[o_idx].TypeIndex();
+      #endif
       // Check that molecule has correct # of atoms
       unsigned int molNumAtoms = (unsigned int)mol->NumAtoms();
       if (nMolAtoms_ == 0) {
@@ -372,6 +418,12 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
       for (unsigned int IDX = 0; IDX != nMolAtoms_; IDX++) {
         A_idxs_.push_back( o_idx + IDX );
         atom_voxel_.push_back( OFF_GRID_ );
+        #ifdef CUDA
+        this->molecule_.push_back( setup.Top()[o_idx + IDX ].MolNum() );
+        this->charges_.push_back( setup.Top()[o_idx + IDX ].Charge() );
+        this->atomTypes_.push_back( setup.Top()[o_idx + IDX ].TypeIndex() );
+        this->solvent_[ o_idx + IDX ] = true;
+        #endif
       }
       NsolventAtoms += nMolAtoms_;
       // If first solvent molecule, save charges. If not, check that charges match.
@@ -407,6 +459,12 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
           A_idxs_.push_back( u_idx );
           atom_voxel_.push_back( SOLUTE_ );
           NsoluteAtoms++;
+          #ifdef CUDA
+          this->molecule_.push_back( setup.Top()[ u_idx ].MolNum() );
+          this->charges_.push_back( setup.Top()[ u_idx ].Charge() );
+          this->atomTypes_.push_back( setup.Top()[ u_idx ].TypeIndex() );
+          this->solvent_[ u_idx ] = false;
+          #endif
         }
       }
     }
@@ -436,6 +494,35 @@ Action::RetType Action_GIST::Setup(ActionSetup& setup) {
       mprintf("\tNo imaging will be performed for energy distance calculations.\n");
   }
 
+#ifdef CUDA
+  NonbondParmType nb = setup.Top().Nonbond();
+  this->NBIndex_ = nb.NBindex();
+  this->numberAtomTypes_ = nb.Ntypes();
+  for (unsigned int i = 0; i < nb.NBarray().size(); ++i) {
+    this->lJParamsA_.push_back( (float) nb.NBarray().at(i).A() );
+    this->lJParamsB_.push_back( (float) nb.NBarray().at(i).B() );
+  }
+
+  try {
+    allocateCuda(((void**)&this->NBindex_c_), this->NBIndex_.size() * sizeof(int));
+    allocateCuda((void**)&this->max_c_, 3 * sizeof(float));
+    allocateCuda((void**)&this->min_c_, 3 * sizeof(float));
+    allocateCuda((void**)&this->result_w_c_, this->numberAtoms_ * sizeof(float));
+    allocateCuda((void**)&this->result_s_c_, this->numberAtoms_ * sizeof(float));
+    allocateCuda((void**)&this->result_O_c_, this->numberAtoms_ * 4 * sizeof(int));
+    allocateCuda((void**)&this->result_N_c_, this->numberAtoms_ * sizeof(int));
+  } catch (CudaException &e) {
+    mprinterr("Error: Could not allocate memory on GPU!\n");
+    this->freeGPUMemory();
+    return Action::ERR;
+  }
+  try {
+    this->copyToGPU();
+  } catch (CudaException &e) {
+    return Action::ERR;
+  }
+#endif
+
   gist_setup_.Stop();
   return Action::OK;
 }
@@ -723,6 +810,8 @@ Action::RetType Action_GIST::DoAction(int frameNum, ActionFrame& frm) {
   OnGrid_idxs_.clear();
   OnGrid_XYZ_.clear();
 
+  // CUDA necessary information
+
   size_t bin_i, bin_j, bin_k;
   Vec3 const& Origin = gO_->Bin().GridOrigin();
   // Loop over each solvent molecule
@@ -891,14 +980,20 @@ Action::RetType Action_GIST::DoAction(int frameNum, ActionFrame& frm) {
   } // END loop over each solvent molecule
 
   // Do energy calculation if requested
+  #ifndef CUDA
   gist_nonbond_.Start();
   if (!skipE_) NonbondEnergy(frm.Frm(), *CurrentParm_);
   gist_nonbond_.Stop();
+ 
 
   // Do order calculation if requested
   gist_order_.Start();
   if (doOrder_) Order(frm.Frm());
   gist_order_.Stop();
+  #else
+  if (! this->skipE_)
+    NonbondCuda(frm);
+  #endif
 
   gist_action_.Stop();
   return Action::OK;
@@ -961,62 +1056,65 @@ void Action_GIST::Print() {
   double Vvox = gO_->Bin().VoxelVolume();
 
   mprintf("    GIST OUTPUT:\n");
+  
+  // The variables are kept outside, so that they are declared for later use.
   // Calculate orientational entropy
   DataSet_GridFlt& dTSorient_dens = static_cast<DataSet_GridFlt&>( *dTSorient_ );
   Farray dTSorient_norm( MAX_GRID_PT_, 0.0 );
   double dTSorienttot = 0;
   int nwtt = 0;
   double dTSo = 0;
-  // LOOP over all voxels
-  mprintf("\tCalculating orientational entropy:\n");
-  ProgressBar oe_progress( MAX_GRID_PT_ );
-  for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) {
-    oe_progress.Update( gr_pt );
-    dTSorient_dens[gr_pt] = 0;
-    dTSorient_norm[gr_pt] = 0;
-    int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
-    nwtt += nw_total;
-    //mprintf("DEBUG1: %u nw_total %i\n", gr_pt, nw_total);
-    if (nw_total > 1) {
-      for (int n0 = 0; n0 < nw_total; n0++)
-      {
-        double NNr = 10000;
-        int q0 = n0 * 4; // Index into voxel_Q_ for n0
-        for (int n1 = 0; n1 < nw_total; n1++)
+  if (! this->skipS_) {
+    // LOOP over all voxels
+    mprintf("\tCalculating orientational entropy:\n");
+    ProgressBar oe_progress( MAX_GRID_PT_ );
+    for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) {
+      oe_progress.Update( gr_pt );
+      dTSorient_dens[gr_pt] = 0;
+      dTSorient_norm[gr_pt] = 0;
+      int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
+      nwtt += nw_total;
+      //mprintf("DEBUG1: %u nw_total %i\n", gr_pt, nw_total);
+      if (nw_total > 1) {
+        for (int n0 = 0; n0 < nw_total; n0++)
         {
-          if (n0 != n1) {
-            int q1 = n1 * 4; // Index into voxel_Q_ for n1
-            double rR = 2.0 * acos(  fabs(voxel_Q_[gr_pt][q1  ] * voxel_Q_[gr_pt][q0  ]
+          double NNr = 10000;
+          int q0 = n0 * 4; // Index into voxel_Q_ for n0
+          for (int n1 = 0; n1 < nw_total; n1++)
+          {
+            if (n0 != n1) {
+              int q1 = n1 * 4; // Index into voxel_Q_ for n1
+              double rR = 2.0 * acos(  fabs(voxel_Q_[gr_pt][q1  ] * voxel_Q_[gr_pt][q0  ]
                                    + voxel_Q_[gr_pt][q1+1] * voxel_Q_[gr_pt][q0+1]
                                    + voxel_Q_[gr_pt][q1+2] * voxel_Q_[gr_pt][q0+2]
                                    + voxel_Q_[gr_pt][q1+3] * voxel_Q_[gr_pt][q0+3] )); // add fabs for quaternion distance calculation
-            //mprintf("DEBUG1: %g\n", rR);
-            if (rR > 0 && rR < NNr) NNr = rR;
-          }
-        } // END inner loop over all waters for this voxel
-
-        if (NNr < 9999 && NNr > 0) {
-          double dbl = log(NNr*NNr*NNr*nw_total / (3.0*Constants::TWOPI));
-          //mprintf("DEBUG1: dbl %f\n", dbl);
-          dTSorient_norm[gr_pt] += dbl;
-          dTSo += dbl;
-        }
-      } // END outer loop over all waters for this voxel
-      //mprintf("DEBUG1: dTSorient_norm %f\n", dTSorient_norm[gr_pt]);
-      dTSorient_norm[gr_pt] = Constants::GASK_KCAL * temperature_ * 
-                               ((dTSorient_norm[gr_pt]/nw_total) + Constants::EULER_MASC);
-      double dtso_norm_nw = (double)dTSorient_norm[gr_pt] * (double)nw_total;
-      dTSorient_dens[gr_pt] = (dtso_norm_nw / (NFRAME_ * Vvox));
-      dTSorienttot += dTSorient_dens[gr_pt];
-      //mprintf("DEBUG1: %f\n", dTSorienttot);
-    }
-  } // END loop over all grid points (voxels)
-  dTSorienttot *= Vvox;
-  infofile_->Printf("Maximum number of waters found in one voxel for %d frames = %d\n",
-                    NFRAME_, max_nwat_);
-  infofile_->Printf("Total referenced orientational entropy of the grid:"
-                    " dTSorient = %9.5f kcal/mol, Nf=%d\n", dTSorienttot, NFRAME_);
+              //mprintf("DEBUG1: %g\n", rR);
+              if (rR > 0 && rR < NNr) NNr = rR;
+            }
+          } // END inner loop over all waters for this voxel
 
+          if (NNr < 9999 && NNr > 0) {
+            double dbl = log(NNr*NNr*NNr*nw_total / (3.0*Constants::TWOPI));
+            //mprintf("DEBUG1: dbl %f\n", dbl);
+            dTSorient_norm[gr_pt] += dbl;
+            dTSo += dbl;
+          }
+        } // END outer loop over all waters for this voxel
+        //mprintf("DEBUG1: dTSorient_norm %f\n", dTSorient_norm[gr_pt]);
+        dTSorient_norm[gr_pt] = Constants::GASK_KCAL * temperature_ * 
+                                ((dTSorient_norm[gr_pt]/nw_total) + Constants::EULER_MASC);
+        double dtso_norm_nw = (double)dTSorient_norm[gr_pt] * (double)nw_total;
+        dTSorient_dens[gr_pt] = (dtso_norm_nw / (NFRAME_ * Vvox));
+        dTSorienttot += dTSorient_dens[gr_pt];
+        //mprintf("DEBUG1: %f\n", dTSorienttot);
+      }
+    } // END loop over all grid points (voxels)
+    dTSorienttot *= Vvox;
+    infofile_->Printf("Maximum number of waters found in one voxel for %d frames = %d\n",
+                      NFRAME_, max_nwat_);
+    infofile_->Printf("Total referenced orientational entropy of the grid:"
+                      " dTSorient = %9.5f kcal/mol, Nf=%d\n", dTSorienttot, NFRAME_);
+  }
   // Compute translational entropy for each voxel
   double dTStranstot = 0.0;
   double dTSt = 0.0;
@@ -1034,8 +1132,12 @@ void Action_GIST::Print() {
   DataSet_GridFlt& dTSsix = static_cast<DataSet_GridFlt&>( *dTSsix_ );
   Farray dTStrans_norm( MAX_GRID_PT_, 0.0 );
   Farray dTSsix_norm( MAX_GRID_PT_, 0.0 );
+  
   // Loop over all grid points
-  mprintf("\tCalculating translational entropy:\n");
+  if (! this->skipS_)
+    mprintf("\tCalculating translational entropy:\n");
+  else
+    mprintf("Calculating Densities:\n");
   ProgressBar te_progress( MAX_GRID_PT_ );
   for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++) {
     te_progress.Update( gr_pt );
@@ -1043,126 +1145,126 @@ void Action_GIST::Print() {
     double W_dens = 1.0 * N_waters_[gr_pt] / (NFRAME_*Vvox);
     gO[gr_pt] = W_dens / BULK_DENS_;
     gH[gr_pt] = 1.0 * N_hydrogens_[gr_pt] / (NFRAME_*Vvox*2*BULK_DENS_);
-
-    int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
-    for (int n0 = 0; n0 < nw_total; n0++)
-    {
-      double NNd = 10000;
-      double NNs = 10000;
-      int i0 = n0 * 3; // index into voxel_xyz_ for n0
-      float VX = voxel_xyz_[gr_pt][i0  ];
-      float VY = voxel_xyz_[gr_pt][i0+1];
-      float VZ = voxel_xyz_[gr_pt][i0+2];
-      int q0 = n0 * 4;  // index into voxel_Q_ for n0
-      float W4 = voxel_Q_[gr_pt][q0  ];
-      float X4 = voxel_Q_[gr_pt][q0+1];
-      float Y4 = voxel_Q_[gr_pt][q0+2];
-      float Z4 = voxel_Q_[gr_pt][q0+3];
-      // First do own voxel
-      for (int n1 = 0; n1 < nw_total; n1++) {
-        if ( n1 != n0) {
-          int i1 = n1 * 3; // index into voxel_xyz_ for n1
-          double dx = (double)(VX - voxel_xyz_[gr_pt][i1  ]);
-          double dy = (double)(VY - voxel_xyz_[gr_pt][i1+1]);
-          double dz = (double)(VZ - voxel_xyz_[gr_pt][i1+2]);
-          double dd = dx*dx+dy*dy+dz*dz;
-          if (dd < NNd && dd > 0) { NNd = dd; }
-          int q1 = n1 * 4; // index into voxel_Q_ for n1
-          double rR = 2 * acos( fabs(W4*voxel_Q_[gr_pt][q1  ] +
-                                X4*voxel_Q_[gr_pt][q1+1] +
-                                Y4*voxel_Q_[gr_pt][q1+2] +
-                                Z4*voxel_Q_[gr_pt][q1+3] )); //add fabs for quaternion distance calculation
-          double ds = rR*rR + dd;
-          if (ds < NNs && ds > 0) { NNs = ds; }
-        }
-      } // END self loop over all waters for this voxel
-      //mprintf("DEBUG1: self NNd=%f NNs=%f\n", NNd, NNs);
-      // Determine which directions are possible.
-      bool cannotAddZ = (nz == 0 || ( gr_pt%nz == nz-1 ));
-      bool cannotAddY = ((nz == 0 || ny-1 == 0) || ( gr_pt%(nz*(ny-1)+(numplane*addx)) < nz));
-      bool cannotAddX = (gr_pt >= addx * (nx-1) && gr_pt < addx * nx );
-      bool cannotSubZ = (nz == 0 || gr_pt%nz == 0);
-      bool cannotSubY = ((nz == 0 || ny == 0) || (gr_pt%addx < nz));
-      bool cannotSubX = ((nz == 0 || ny == 0) || (gr_pt < addx));
-      bool boundary = ( cannotAddZ || cannotAddY || cannotAddX ||
-                        cannotSubZ || cannotSubY || cannotSubX );
-      if (!boundary) {
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addy, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy - addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy + addx, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy - addx, NNd, NNs);
-
-        // add the 8 more voxels for NNr searching
-
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy - addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy + addz, NNd, NNs);
-        TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy - addz, NNd, NNs);
-
-
-
-        NNd = sqrt(NNd);
-        NNs = sqrt(NNs);
-
-        if (NNd < 3 && NNd > 0/*NNd < 9999 && NNd > 0*/) {
-          double dbl = log((NNd*NNd*NNd*NFRAME_*4*Constants::PI*BULK_DENS_)/3);
-          dTStrans_norm[gr_pt] += dbl;
-          dTSt += dbl;
-          dbl = log((NNs*NNs*NNs*NNs*NNs*NNs*NFRAME_*Constants::PI*BULK_DENS_)/48);
-          dTSsix_norm[gr_pt] += dbl;
-          dTSs += dbl;
-          //mprintf("DEBUG1: dbl=%f NNs=%f\n", dbl, NNs);
+    if (! this->skipS_) {
+      int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
+      for (int n0 = 0; n0 < nw_total; n0++)
+      {
+        double NNd = 10000;
+        double NNs = 10000;
+        int i0 = n0 * 3; // index into voxel_xyz_ for n0
+        float VX = voxel_xyz_[gr_pt][i0  ];
+        float VY = voxel_xyz_[gr_pt][i0+1];
+        float VZ = voxel_xyz_[gr_pt][i0+2];
+        int q0 = n0 * 4;  // index into voxel_Q_ for n0
+        float W4 = voxel_Q_[gr_pt][q0  ];
+        float X4 = voxel_Q_[gr_pt][q0+1];
+        float Y4 = voxel_Q_[gr_pt][q0+2];
+        float Z4 = voxel_Q_[gr_pt][q0+3];
+        // First do own voxel
+        for (int n1 = 0; n1 < nw_total; n1++) {
+          if ( n1 != n0) {
+            int i1 = n1 * 3; // index into voxel_xyz_ for n1
+            double dx = (double)(VX - voxel_xyz_[gr_pt][i1  ]);
+            double dy = (double)(VY - voxel_xyz_[gr_pt][i1+1]);
+            double dz = (double)(VZ - voxel_xyz_[gr_pt][i1+2]);
+            double dd = dx*dx+dy*dy+dz*dz;
+            if (dd < NNd && dd > 0) { NNd = dd; }
+            int q1 = n1 * 4; // index into voxel_Q_ for n1
+            double rR = 2 * acos( fabs(W4*voxel_Q_[gr_pt][q1  ] +
+                                  X4*voxel_Q_[gr_pt][q1+1] +
+                                  Y4*voxel_Q_[gr_pt][q1+2] +
+                                  Z4*voxel_Q_[gr_pt][q1+3] )); //add fabs for quaternion distance calculation
+            double ds = rR*rR + dd;
+            if (ds < NNs && ds > 0) { NNs = ds; }
+          }
+        } // END self loop over all waters for this voxel
+        //mprintf("DEBUG1: self NNd=%f NNs=%f\n", NNd, NNs);
+        // Determine which directions are possible.
+        bool cannotAddZ = (nz == 0 || ( gr_pt%nz == nz-1 ));
+        bool cannotAddY = ((nz == 0 || ny-1 == 0) || ( gr_pt%(nz*(ny-1)+(numplane*addx)) < nz));
+        bool cannotAddX = (gr_pt >= addx * (nx-1) && gr_pt < addx * nx );
+        bool cannotSubZ = (nz == 0 || gr_pt%nz == 0);
+        bool cannotSubY = ((nz == 0 || ny == 0) || (gr_pt%addx < nz));
+        bool cannotSubX = ((nz == 0 || ny == 0) || (gr_pt < addx));
+        bool boundary = ( cannotAddZ || cannotAddY || cannotAddX ||
+                          cannotSubZ || cannotSubY || cannotSubX );
+        if (!boundary) {
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addy, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addz - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addz - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addy - addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy + addx, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addy - addx, NNd, NNs);
+
+          // add the 8 more voxels for NNr searching
+
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx + addy - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt + addx - addy - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx + addy - addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy + addz, NNd, NNs);
+          TransEntropy(VX, VY, VZ, W4, X4, Y4, Z4, gr_pt - addx - addy - addz, NNd, NNs);
+
+
+          NNd = sqrt(NNd);
+          NNs = sqrt(NNs);
+
+          if (NNd < 3 && NNd > 0/*NNd < 9999 && NNd > 0*/) {
+            double dbl = log((NNd*NNd*NNd*NFRAME_*4*Constants::PI*BULK_DENS_)/3);
+            dTStrans_norm[gr_pt] += dbl;
+            dTSt += dbl;
+            dbl = log((NNs*NNs*NNs*NNs*NNs*NNs*NFRAME_*Constants::PI*BULK_DENS_)/48);
+            dTSsix_norm[gr_pt] += dbl;
+            dTSs += dbl;
+            //mprintf("DEBUG1: dbl=%f NNs=%f\n", dbl, NNs);
+          }
         }
+      } // END loop over all waters for this voxel
+      if (dTStrans_norm[gr_pt] != 0) {
+        nwts += nw_total;
+        dTStrans_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTStrans_norm[gr_pt]/nw_total) +
+                                                                  Constants::EULER_MASC );
+        dTSsix_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTSsix_norm[gr_pt]/nw_total) +
+                                                                Constants::EULER_MASC );
       }
-    } // END loop over all waters for this voxel
-    if (dTStrans_norm[gr_pt] != 0) {
-      nwts += nw_total;
-      dTStrans_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTStrans_norm[gr_pt]/nw_total) +
-                                                                 Constants::EULER_MASC );
-      dTSsix_norm[gr_pt] = Constants::GASK_KCAL*temperature_*( (dTSsix_norm[gr_pt]/nw_total) +
-                                                               Constants::EULER_MASC );
+      double dtst_norm_nw = (double)dTStrans_norm[gr_pt] * (double)nw_total;
+      dTStrans[gr_pt] = (dtst_norm_nw / (NFRAME_*Vvox));
+      double dtss_norm_nw = (double)dTSsix_norm[gr_pt] * (double)nw_total;
+      dTSsix[gr_pt] = (dtss_norm_nw / (NFRAME_*Vvox));
+      dTStranstot += dTStrans[gr_pt];
+    } // END loop over all grid points (voxels)
+  }
+  if (!this->skipS_) {
+    dTStranstot *= Vvox;
+    double dTSst = 0.0;
+    double dTStt = 0.0;
+    if (nwts > 0) {
+      dTSst = Constants::GASK_KCAL*temperature_*((dTSs/nwts) + Constants::EULER_MASC);
+      dTStt = Constants::GASK_KCAL*temperature_*((dTSt/nwts) + Constants::EULER_MASC);
     }
-    double dtst_norm_nw = (double)dTStrans_norm[gr_pt] * (double)nw_total;
-    dTStrans[gr_pt] = (dtst_norm_nw / (NFRAME_*Vvox));
-    double dtss_norm_nw = (double)dTSsix_norm[gr_pt] * (double)nw_total;
-    dTSsix[gr_pt] = (dtss_norm_nw / (NFRAME_*Vvox));
-    dTStranstot += dTStrans[gr_pt];
-  } // END loop over all grid points (voxels)
-
-  dTStranstot *= Vvox;
-  double dTSst = 0.0;
-  double dTStt = 0.0;
-  if (nwts > 0) {
-    dTSst = Constants::GASK_KCAL*temperature_*((dTSs/nwts) + Constants::EULER_MASC);
-    dTStt = Constants::GASK_KCAL*temperature_*((dTSt/nwts) + Constants::EULER_MASC);
+    double dTSot = Constants::GASK_KCAL*temperature_*((dTSo/nwtt) + Constants::EULER_MASC);
+    infofile_->Printf("watcount in vol = %d\n", nwtt);
+    infofile_->Printf("watcount in subvol = %d\n", nwts);
+    infofile_->Printf("Total referenced translational entropy of the grid:"
+                      " dTStrans = %9.5f kcal/mol, Nf=%d\n", dTStranstot, NFRAME_);
+    infofile_->Printf("Total 6d if all one vox: %9.5f kcal/mol\n", dTSst);
+    infofile_->Printf("Total t if all one vox: %9.5f kcal/mol\n", dTStt);
+    infofile_->Printf("Total o if all one vox: %9.5f kcal/mol\n", dTSot);
   }
-  double dTSot = Constants::GASK_KCAL*temperature_*((dTSo/nwtt) + Constants::EULER_MASC);
-  infofile_->Printf("watcount in vol = %d\n", nwtt);
-  infofile_->Printf("watcount in subvol = %d\n", nwts);
-  infofile_->Printf("Total referenced translational entropy of the grid:"
-                    " dTStrans = %9.5f kcal/mol, Nf=%d\n", dTStranstot, NFRAME_);
-  infofile_->Printf("Total 6d if all one vox: %9.5f kcal/mol\n", dTSst);
-  infofile_->Printf("Total t if all one vox: %9.5f kcal/mol\n", dTStt);
-  infofile_->Printf("Total o if all one vox: %9.5f kcal/mol\n", dTSot);
-
   // Compute average voxel energy. Allocate these sets even if skipping energy
   // to be consistent with previous output.
   DataSet_GridFlt& Esw_dens = static_cast<DataSet_GridFlt&>( *Esw_ );
@@ -1177,13 +1279,17 @@ void Action_GIST::Print() {
   Farray Eww_norm( MAX_GRID_PT_, 0.0 );
   Farray neighbor_dens( MAX_GRID_PT_, 0.0 );
   if (!skipE_) {
+    #ifndef CUDA
     Darray const& E_UV_VDW = E_UV_VDW_[0];
     Darray const& E_UV_Elec = E_UV_Elec_[0];
     Darray const& E_VV_VDW = E_VV_VDW_[0];
     Darray const& E_VV_Elec = E_VV_Elec_[0];
+    #endif
     Farray const& Neighbor = neighbor_[0];
+    #ifndef CUDA
     // Sum values from other threads if necessary
     SumEVV();
+    #endif
     static const double DEBYE_EA = 0.20822678; // 1 Debye in eA
     double Eswtot = 0.0;
     double Ewwtot = 0.0;
@@ -1192,15 +1298,26 @@ void Action_GIST::Print() {
     for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++)
     {
       E_progress.Update( gr_pt );
+      
       //mprintf("DEBUG1: VV vdw=%f elec=%f\n", E_VV_VDW_[gr_pt], E_VV_Elec_[gr_pt]);
       int nw_total = N_waters_[gr_pt]; // Total number of waters that have been in this voxel.
       if (nw_total > 1) {
+        #ifndef CUDA
         Esw_dens[gr_pt] = (E_UV_VDW[gr_pt]  + E_UV_Elec[gr_pt]) / (NFRAME_ * Vvox);
         Esw_norm[gr_pt] = (E_UV_VDW[gr_pt]  + E_UV_Elec[gr_pt]) / nw_total;
         Eww_dens[gr_pt] = (E_VV_VDW[gr_pt]  + E_VV_Elec[gr_pt]) / (2 * NFRAME_ * Vvox);
         Eww_norm[gr_pt] = (E_VV_VDW[gr_pt]  + E_VV_Elec[gr_pt]) / (2 * nw_total);
+        #else
+        double esw = this->Esw_->operator[](gr_pt);
+        double eww = this->Eww_->operator[](gr_pt);
+        Esw_dens[gr_pt] = esw / (this->NFRAME_ * Vvox);
+        Esw_norm[gr_pt] = esw / nw_total;
+        Eww_dens[gr_pt] = eww / (this->NFRAME_ * Vvox);
+        Eww_norm[gr_pt] = eww / nw_total;
+        #endif
         Eswtot += Esw_dens[gr_pt];
         Ewwtot += Eww_dens[gr_pt];
+        
       } else {
         Esw_dens[gr_pt]=0;
         Esw_norm[gr_pt]=0;
@@ -1212,9 +1329,9 @@ void Action_GIST::Print() {
       if (nw_total > 0) {
         qtet[gr_pt] /= nw_total;
         //mprintf("DEBUG1: neighbor= %8.1f  nw_total= %8i\n", neighbor[gr_pt], nw_total);
-        neighbor_norm[gr_pt] = 1.0 * Neighbor[gr_pt] / nw_total;
+        neighbor_norm[gr_pt] = (double)Neighbor[gr_pt] / nw_total;
       }
-      neighbor_dens[gr_pt] = 1.0 * Neighbor[gr_pt] / (NFRAME_ * Vvox);
+      neighbor_dens[gr_pt] = (double)Neighbor[gr_pt] / (NFRAME_ * Vvox);
       dipolex[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
       dipoley[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
       dipolez[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
@@ -1227,6 +1344,17 @@ void Action_GIST::Print() {
     infofile_->Printf("Total water-solute energy of the grid: Esw = %9.5f kcal/mol\n", Eswtot);
     infofile_->Printf("Total unreferenced water-water energy of the grid: Eww = %9.5f kcal/mol\n",
                       Ewwtot);
+  } else {
+    static const double DEBYE_EA = 0.20822678;
+    for (unsigned int gr_pt = 0; gr_pt < MAX_GRID_PT_; gr_pt++)
+    {
+      dipolex[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
+      dipoley[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
+      dipolez[gr_pt] /= (DEBYE_EA * NFRAME_ * Vvox);
+      pol[gr_pt] = sqrt( dipolex[gr_pt]*dipolex[gr_pt] +
+                         dipoley[gr_pt]*dipoley[gr_pt] +
+                         dipolez[gr_pt]*dipolez[gr_pt] );
+    }
   }
 
   // Write the GIST output file.
@@ -1300,4 +1428,175 @@ void Action_GIST::Print() {
   gist_order_.WriteTiming(2,   "Order: ", gist_action_.Total());
   gist_print_.WriteTiming(1,   "Print:", total);
   mprintf("TIME:\tTotal: %.4f s\n", total);
+  #ifdef CUDA
+  this->freeGPUMemory();
+  #endif
 }
+
+#ifdef CUDA
+void Action_GIST::NonbondCuda(ActionFrame frm) {
+  // Simply to get the information for the energetic calculations
+  std::vector<float> eww_result(this->numberAtoms_);
+  std::vector<float> esw_result(this->numberAtoms_);
+  std::vector<std::vector<int> > order_indices;
+  this->gist_nonbond_.Start();
+
+  Matrix_3x3 ucell_m, recip_m;
+  float *recip = NULL;
+  float *ucell = NULL;
+  int boxinfo;
+
+  // Check Boxinfo and write the necessary data into recip, ucell and boxinfo.
+  switch(this->image_.ImageType()) {
+    case NONORTHO:
+      recip = new float[9];
+      ucell = new float[9];
+      frm.Frm().BoxCrd().ToRecip(ucell_m, recip_m);
+      for (int i = 0; i < 9; ++i) {
+        ucell[i] = (float) ucell_m.Dptr()[i];
+        recip[i] = (float) recip_m.Dptr()[i];
+      }
+      boxinfo = 2;
+      break;
+    case ORTHO:
+      recip = new float[9];
+      for (int i = 0; i < 3; ++i) {
+        recip[i] = (float) frm.Frm().BoxCrd()[i];
+      }
+      ucell = NULL;
+      boxinfo = 1;
+      break;
+    case NOIMAGE:
+      recip = NULL;
+      ucell = NULL;
+      boxinfo = 0;
+      break;
+    default:
+      mprinterr("Error: Unexpected box information found.");
+      return;
+  }
+
+  std::vector<int> result_o = std::vector<int>(4 * this->numberAtoms_);
+  std::vector<int> result_n = std::vector<int>(this->numberAtoms_);
+  // Call the GPU Wrapper, which subsequently calls the kernel, after setup operations.
+  // Must create arrays from the vectors, does that by getting the address of the first element of the vector.
+  std::vector<std::vector<float> > e_result = doActionCudaEnergy(frm.Frm().xAddress(), this->NBindex_c_, this->numberAtomTypes_, this->paramsLJ_c_, this->molecule_c_, boxinfo, recip, ucell, this->numberAtoms_, this->min_c_, 
+                                                    this->max_c_, this->headAtomType_,this->NeighborCut2_, &(result_o[0]), &(result_n[0]), this->result_w_c_, 
+                                                    this->result_s_c_, this->result_O_c_, this->result_N_c_, this->doOrder_);
+  eww_result = e_result.at(0);
+  esw_result = e_result.at(1);
+
+  if (this->doOrder_) {
+    int counter = 0;
+    for (unsigned int i = 0; i < (4 * this->numberAtoms_); i += 4) {
+      ++counter;
+      std::vector<int> temp;
+      for (unsigned int j = 0; j < 4; ++j) {
+        temp.push_back(result_o.at(i + j));
+      }
+      order_indices.push_back(temp);
+    }
+  }
+
+  delete[] recip; // Free memory
+  delete[] ucell; // Free memory
+
+  for (unsigned int sidx = 0; sidx < NSOLVENT_; sidx++) {
+    int headAtomIndex = O_idxs_[sidx];
+    size_t bin_i, bin_j, bin_k;
+    const double *vec = frm.Frm().XYZ(headAtomIndex);
+    int voxel = -1;
+    if (this->gO_->Bin().Calc(vec[0], vec[1], vec[2], bin_i, bin_j, bin_k)) {
+      voxel = this->gO_->CalcIndex(bin_i, bin_j, bin_k);
+      this->neighbor_.at(0).at(voxel) += result_n.at(headAtomIndex);
+      // This is not nice, as it assumes that O is set before the two Hydrogens
+      // might be the case, but is still not nice (in my opinion)
+      for (unsigned int IDX = 0; IDX != nMolAtoms_; IDX++) {
+        this->Esw_->UpdateVoxel(voxel, esw_result.at(headAtomIndex + IDX));
+        this->Eww_->UpdateVoxel(voxel, eww_result.at(headAtomIndex + IDX));
+      }
+      // Order calculation
+      if (this->doOrder_) {
+        double sum = 0;
+        Vec3 cent( frm.Frm().xAddress() + (headAtomIndex) * 3 );
+        std::vector<Vec3> vectors;
+        switch(this->image_.ImageType()) {
+          case NONORTHO:
+          case ORTHO:
+            {
+              Matrix_3x3 ucell, recip;
+              frm.Frm().BoxCrd().ToRecip(ucell, recip);
+              Vec3 vec(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(0) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+              vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(1) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+              vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(2) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+              vec = Vec3(frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(3) * 3));
+              vectors.push_back( MinImagedVec(vec, cent, ucell, recip));
+            }
+            break;
+          default:
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(0) * 3) ) - cent );
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(1) * 3) ) - cent );
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(2) * 3) ) - cent );
+            vectors.push_back( Vec3( frm.Frm().xAddress() + (order_indices.at(headAtomIndex).at(3) * 3) ) - cent );
+        }
+        
+        for (int i = 0; i < 3; ++i) {
+          for (int j = i + 1; j < 4; ++j) {
+            double cosThet = (vectors.at(i) * vectors.at(j)) / sqrt(vectors.at(i).Magnitude2() * vectors.at(j).Magnitude2());
+            sum += (cosThet + 1.0/3) * (cosThet + 1.0/3);
+          }
+        }
+        this->order_norm_->UpdateVoxel(voxel, 1.0 - (3.0/8.0) * sum);
+      }
+    }
+    
+  }
+  this->gist_nonbond_.Stop();
+}
+
+/**
+ * Frees all the Memory on the GPU.
+ */
+void Action_GIST::freeGPUMemory(void) {
+  freeCuda(this->NBindex_c_);
+  freeCuda(this->molecule_c_);
+  freeCuda(this->paramsLJ_c_);
+  freeCuda(this->max_c_);
+  freeCuda(this->min_c_);
+  freeCuda(this->result_w_c_);
+  freeCuda(this->result_s_c_);
+  freeCuda(this->result_O_c_);
+  freeCuda(this->result_N_c_);
+  this->NBindex_c_   = NULL;
+  this->molecule_c_  = NULL;
+  this->paramsLJ_c_  = NULL;
+  this->max_c_     = NULL;
+  this->min_c_     = NULL;
+  this->result_w_c_= NULL;
+  this->result_s_c_= NULL;
+  this->result_O_c_  = NULL;
+  this->result_N_c_  = NULL;
+}
+
+/**
+ * Copies data from the CPU to the GPU.
+ * @throws: CudaException
+ */
+void Action_GIST::copyToGPU(void) {
+  try {
+    copyMemoryToDevice(&(this->NBIndex_[0]), this->NBindex_c_, this->NBIndex_.size() * sizeof(int));
+    copyMemoryToDeviceStruct(&(this->charges_[0]), &(this->atomTypes_[0]), this->solvent_, &(this->molecule_[0]), this->numberAtoms_, &(this->molecule_c_),
+                              &(this->lJParamsA_[0]), &(this->lJParamsB_[0]), this->lJParamsA_.size(), &(this->paramsLJ_c_));
+  } catch (CudaException &ce) {
+    this->freeGPUMemory();
+    mprinterr("Error: Could not copy data to the device.\n");
+    throw ce;
+  } catch (std::exception &e) {
+    this->freeGPUMemory();
+    throw e;
+  }
+}
+#endif
diff --git a/src/Action_GIST.h b/src/Action_GIST.h
index 575850dc59..d30dba6ac5 100644
--- a/src/Action_GIST.h
+++ b/src/Action_GIST.h
@@ -3,14 +3,21 @@
 #include "Action.h"
 #include "ImagedAction.h"
 #include "Timer.h"
+#ifdef CUDA
+#include "cuda_kernels/GistCudaSetup.cuh"
+#endif
 class DataSet_3D;
 class DataSet_MatrixFlt;
+
 /// Class for applying Grid Inhomogenous Solvation Theory
 /** \author Daniel R. Roe
   */
 class Action_GIST : public Action {
   public:
     Action_GIST();
+    #ifdef CUDA
+    ~Action_GIST() {delete[] this->solvent_;}
+    #endif
     DispatchObject* Alloc() const { return (DispatchObject*)new Action_GIST(); }
     void Help() const;
   private:
@@ -27,6 +34,39 @@ class Action_GIST : public Action {
     void Order(Frame const&);
     void SumEVV();
 
+#ifdef CUDA
+    // Additional data for GPU calculation
+
+    std::vector<float> lJParamsA_;
+    std::vector<float> lJParamsB_;
+    std::vector<float> charges_;
+    std::vector<int> atomTypes_;
+    std::vector<int> NBIndex_;
+    std::vector<int> molecule_;
+
+    unsigned int numberAtoms_;
+    int numberAtomTypes_;
+    int headAtomType_;
+    bool *solvent_;
+
+    // Arrays on GPU
+    int *NBindex_c_;
+    void *molecule_c_;
+    void *paramsLJ_c_;
+    float *max_c_;
+    float *min_c_;
+    float *result_w_c_;
+    float *result_s_c_;
+    int *result_O_c_;
+    int *result_N_c_;
+
+    // CUDA only functions
+    void freeGPUMemory(void);
+    void copyToGPU(void);
+    void NonbondCuda(ActionFrame);
+
+#endif
+
     static const Vec3 x_lab_;
     static const Vec3 y_lab_;
     static const Vec3 z_lab_;
@@ -121,5 +161,6 @@ class Action_GIST : public Action {
     bool doEij_;               ///< If true do the i-j energy calc
     bool skipE_;               ///< If true skip the nonbond energy calc
     bool includeIons_;         ///< If true include ions in solute region.
+    bool skipS_;               ///< If true does not calculate entropy
 };
 #endif
diff --git a/src/Version.h b/src/Version.h
index b254030f4a..288607a342 100644
--- a/src/Version.h
+++ b/src/Version.h
@@ -12,7 +12,7 @@
  * Whenever a number that precedes <revision> is incremented, all subsequent
  * numbers should be reset to 0.
  */
-#define CPPTRAJ_INTERNAL_VERSION "V4.24.0"
+#define CPPTRAJ_INTERNAL_VERSION "V4.25.0"
 /// PYTRAJ relies on this
 #define CPPTRAJ_VERSION_STRING CPPTRAJ_INTERNAL_VERSION
 #endif
diff --git a/src/cuda_kernels/CMakeLists.txt b/src/cuda_kernels/CMakeLists.txt
index 8739581321..4961542e79 100644
--- a/src/cuda_kernels/CMakeLists.txt
+++ b/src/cuda_kernels/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu)
+set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu)
 
 cuda_add_library(cpptraj_cuda_routines ${CPPTRAJ_CUDA_SOURCES})
 
diff --git a/src/cuda_kernels/GistCudaCalc.cu b/src/cuda_kernels/GistCudaCalc.cu
new file mode 100644
index 0000000000..ddd08875a5
--- /dev/null
+++ b/src/cuda_kernels/GistCudaCalc.cu
@@ -0,0 +1,464 @@
+#include "GistCudaCalc.cuh"
+
+#include <cstdio>
+
+#define ELECTOAMBER_2 332.05221729
+
+
+ /**
+  * Calculate the squared distance in an orthorhombic box. See cpptraj implementation.
+  * @param vec1: The first point of the distance calculation.
+  * @param vec2: The seconf point of the distance calculation
+  * @param box: The boxinfo of the object.
+  * @return: The minimal distance in an orthorhombic box.
+  */
+__device__ 
+float dist2_imageOrtho(float *vec1, float *vec2, BoxInfo box) {
+  if (box[0] == 0 || box[1] == 0 || box[2] == 0) {
+    return -1;
+  }
+  float x = vec1[0] - vec2[0];
+  float y = vec1[1] - vec2[1];
+  float z = vec1[2] - vec2[2];
+
+  if (x < 0) {
+    x *= -1;
+  }
+  if (y < 0) {
+    y *= -1;
+  }
+  if (z < 0) {
+    z *= -1;
+  }
+
+  while ( x > box[0] ) {
+    x -= box[0];
+  }
+  while ( y > box[1] ) {
+    y -= box[1];
+  }
+  while ( z > box[2] ) {
+    z -= box[2];
+  }
+
+  float dist = box[0] - x;
+  if ( dist < x ) {
+    x = dist;
+  }
+  dist = box[1] - y;
+  if  ( dist < y ) {
+    y = dist;
+  }
+  dist = box[2] - z;
+  if ( dist < z ) {
+    z = dist;
+  }
+
+  return x * x + y * y + z * z;
+}
+
+/**
+ * Calculate M * v.
+ * @param vec: The vector v.
+ * @param mat3x3: The matrix M.
+ * @param ret: The values to be returned. If null, returns into vec.
+ */
+__device__
+void scalarProd(float* vec, BoxInfo mat3x3, float *ret) {
+  if (ret != NULL) {
+    ret[0] = vec[0] * mat3x3[0] + vec[1] * mat3x3[1] + vec[2] * mat3x3[2];
+    ret[1] = vec[0] * mat3x3[3] + vec[1] * mat3x3[4] + vec[2] * mat3x3[5];
+    ret[2] = vec[0] * mat3x3[6] + vec[1] * mat3x3[7] + vec[2] * mat3x3[8];
+  } else {
+    float x = vec[0];
+    float y = vec[1];
+    float z = vec[2];
+    vec[0] = x * mat3x3[0] + y * mat3x3[1] + z * mat3x3[2];
+    vec[1] = x * mat3x3[3] + y * mat3x3[4] + z * mat3x3[5];
+    vec[2] = x * mat3x3[6] + y * mat3x3[7] + z * mat3x3[8];
+  }
+}
+
+/**
+ * Find the result_O squared distance in a non-orthogonal box.
+ * @param vec1: Position vector of point 1.
+ * @param vec2: Position vector of point 2.
+ * @param recip: The inverse of the ucell.
+ * @param ucell: The box matrix.
+ * @return: The minimal squared distance between two atoms, also considering the images.
+ */
+__device__
+float dist2_imageNonOrtho(float *vec1, float *vec2, BoxInfo recip, UnitCell ucell) {
+  float vecRecip1[3];
+  float vecRecip2[3];
+  scalarProd(vec1, recip, vecRecip1);
+  scalarProd(vec2, recip, vecRecip2);
+  float r_2 = dist2_imageNonOrthoRecip(vecRecip1, vecRecip2, ucell);
+  
+  return r_2;
+}
+
+/**
+ * Calculate if the distance to an image is smaller than a given distance.
+ * @param f: The first vector in the reciprocal space.
+ * @param vec2Cartesian: The second vector in cartesian coordinates.
+ * @param nx: Which neighbour in x direction?
+ * @param ny: Which neighbour in y direction?
+ * @param nz: Which neighbour in z direction?
+ * @param ucell: The box matrix.
+ * @param finalMin: The already calculated minimum.
+ * @return: The new minimum, if it is smaller than finalMin, finalMin otherwise.
+ */
+__device__
+float calcIfDistIsSmaller(float *f, float *vec2Cartesian, int nx, int ny, int nz, UnitCell ucell, float finalMin) {
+  float fx = f[0] + nx;
+  float fy = f[1] + ny;
+  float fz = f[2] + nz;
+  // Bring f back in Cartesian coordinates
+  float x = fx * ucell[0] + fy * ucell[3] + fz * ucell[6];
+  float y = fx * ucell[1] + fy * ucell[4] + fz * ucell[7];
+  float z = fx * ucell[2] + fy * ucell[5] + fz * ucell[8];
+  x -= vec2Cartesian[0];
+  y -= vec2Cartesian[1];
+  z -= vec2Cartesian[2];
+  float min = x * x + y * y + z * z;
+  if ( min < finalMin || finalMin < 0) {
+    return min;
+  }
+  return finalMin;
+}
+
+/**
+ * Calculate the distance in a non-orthorhombic box, if the vectors are already
+ * multiplied by the inverse box matrix.
+ * @param vec1: The first position vector.
+ * @param vec2: The second position vector.
+ * @param ucell: The box cell.
+ * @return: The minimal distance between images.
+ */
+__device__
+float dist2_imageNonOrthoRecip(float * vec1, float * vec2, UnitCell ucell) {
+    
+  // Bring the points back into the main unit cell
+  float fx = vec1[0] - floor(vec1[0]);
+  float fy = vec1[1] - floor(vec1[1]);
+  float fz = vec1[2] - floor(vec1[2]);
+  float f2x = vec2[0] - floor(vec2[0]);
+  float f2y = vec2[1] - floor(vec2[1]);
+  float f2z = vec2[2] - floor(vec2[2]);
+
+  float vec[3] = {fx, fy, fz};
+
+  // Bring f2 back in cartesian coordinates
+  float xFactor = f2x * ucell[0] + f2y * ucell[3] + f2z * ucell[6];
+  float yFactor = f2x * ucell[1] + f2y * ucell[4] + f2z * ucell[7];
+  float zFactor = f2x * ucell[2] + f2y * ucell[5] + f2z * ucell[8];
+  float vec2Real[3] = {xFactor, yFactor, zFactor};
+
+  // Now the different cases, and always store the minimum
+  // Define the finalMinimum as a negative value, since it can never
+  // actually be negative this is fine.
+  float finalMinimum = -1.0;
+
+  // Run through all cells
+  for (int ix = -1; ix <= 1; ++ix) {
+    for (int iy = -1; iy <= 1; ++iy) {
+      for (int iz = -1; iz <= 1; ++iz) {
+        finalMinimum = calcIfDistIsSmaller(vec, vec2Real, ix, iy, iz, ucell, finalMinimum);
+      }
+    }
+  }
+
+  return finalMinimum;
+}
+
+/**
+ * Calculates the distance between two vectors, without imaging.
+ * @param vec1: The position vector of the first atom.
+ * @param vec2: The position vector of the second atom.
+ * @return: The squared distance between the two positions.
+ */
+__device__
+float dist2_noImage(float *vec1, float *vec2) {
+  float x = vec1[0] - vec2[0];
+  float y = vec1[1] - vec2[1];
+  float z = vec1[2] - vec2[2];
+
+  return x*x + y*y + z*z;
+}
+
+/**
+ * Caclulate the total energy between two atoms.
+ * @param vec1: The position vector of atom 1.
+ * @param vec2: The position vector of atom 2.
+ * @param q1: The charge of atom 1.
+ * @param q2: The charge of atom 2.
+ * @param LJA: The lennard jones A parameter.
+ * @param LJB: The lennard jones B parameter.
+ * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise.
+ * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2,
+ *                        or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0.
+ * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise.
+ * @return: The total interaction energy between the two atoms.
+ */
+__device__
+float calcTotalEnergy(float q1, float q2, 
+                            float LJA, float LJB, float r_2) {
+#ifdef DEBUG_GIST_CUDA
+  if (r_2 <= 0.000001 && r_2 >= -0.000001) {
+    printf("(%8.3f, %8.3f, %8.3f) (%8.3f, %8.3f, %8.3f) %d\n", vec1[0], vec1[1], vec1[2], vec2[0], vec2[1], vec2[2], boxinfo);
+  }
+#endif
+  return calcVdWEnergy(r_2, LJA, LJB) + calcElectrostaticEnergy(r_2, q1, q2);
+}
+
+/**
+ * Calculate the distance between two different points.
+ * @param vec1: The first vector to calculate the distance.
+ * @param vec2: The second vector to calculate the distance.
+ * @param recip_o_box: The boxinfo, either the box or the reciprocal.
+ * @param ucell: The unitcell of a box.
+ * @return: The squared distance between two points.
+ */
+__device__
+float calcDist(float *vec1, float *vec2, BoxInfo recip_o_box,
+                    UnitCell ucell) {
+  float r_2 = 0;
+  switch(recip_o_box.boxinfo) {
+    case 0:
+      r_2 = dist2_noImage(vec1, vec2);
+      break;
+    case 1:
+      // Uses recip for box info as well;
+      r_2 = dist2_imageOrtho(vec1, vec2, recip_o_box);
+      break;
+    case 2:
+      r_2 = dist2_imageNonOrtho(vec1, vec2, recip_o_box, ucell);
+      break;
+    default:
+      r_2 = 0;
+  }
+  return r_2;
+}
+
+/**
+ * Calculate the Van der Waals energy between two atoms.
+ * @param r_2: The squared distance between the two atoms.
+ * @param LJA: The A part of the lennard jones potential.
+ * @param LJB: The B part if the Lennard Jones potential.
+ * @return: The Van der Waals energy.
+ */
+__device__
+float calcVdWEnergy(float r_2, float LJA, float LJB) {
+  float r_6 = r_2 * r_2 * r_2;
+  float r_12 = r_6 * r_6;
+  float LJ =  LJA / r_12 - LJB / r_6;
+  return LJ;
+}
+
+/**
+ * Calculate the electrostatic energy between two different atoms.
+ * @param r_2: The square distance between the two atoms.
+ * @param q1: The charge of atom 1.
+ * @param q2: The charge of atom 2.
+ * @return: The electrostatic energy between the two atoms.
+ */
+__device__
+float calcElectrostaticEnergy(float r_2, float q1, float q2) {
+  double charge = q1 * q2 * ELECTOAMBER_2;
+  double r = sqrt(r_2);
+  float ELE = charge / r;
+  return ELE;
+}
+
+/**
+ * Get the index into the lennard jones parameter array.
+ * @param a1: The atom type index of atom 1.
+ * @param a2: The atom type index of atom 2.
+ * @param NBindex: The arrays holding the indices into the LJ array.
+ * @param ntypes: The number of atom types.
+ * @return: The index into the parameter arrays.
+ */
+__device__
+int getLJIndex(int a1, int a2, int *NBindex, int ntypes) {
+  return NBindex[a1 * ntypes + a2];
+}
+
+/**
+ * Get the LJ parameters from a parameter array.
+ * @param a1: The atom type index of atom 1.
+ * @param a2: The atom type index of atom 2.
+ * @param NBindex: The indices into the parameter array.
+ * @param ntypes: The number of atom types.
+ * @param paramsLJ: The parameter array.
+ * @return: The LJ parameter belonging to the atom type pair a1, a2.
+ */
+__device__
+ParamsLJ getLJParam(int a1, int a2, int *NBindex, int ntypes, ParamsLJ *paramsLJ) {
+  int idx = getLJIndex(a1, a2, NBindex, ntypes);
+  if (idx < 0) {
+    return ParamsLJ();
+  }
+  return paramsLJ[idx];
+}
+
+/**
+ * Checks if atom is on grid.
+ * @param vec: Position vector to the atom.
+ * @param min: The grid starting position.
+ * @param max: The end of the grid.
+ * @return: True if vector is on grid.
+ */
+__device__
+bool isOnGrid(float *vec, float *min, float *max) {
+  return ( ( (vec[0] >= min[0]) && (vec[0] <= max[0]) ) &&
+           ( (vec[1] >= min[1]) && (vec[1] <= max[1]) ) &&
+           ( (vec[2] >= min[2]) && (vec[2] <= max[2]) ) );
+}
+
+/**
+ * Calculate the energy on the GPU.
+ * @param coords: An array holding all the coordinates of all atoms.
+ * @param NBindex: An array holding indices into the LJ parameter arrays.
+ * @param ntypes: The number of atom types.
+ * @param paramsLJA: The A LJ parameters.
+ * @param paramsLJB: The B LJ parameters.
+ * @param charges: The charges of the atoms.
+ * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise.
+ * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2,
+ *                        or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0.
+ * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise.
+ * @param maxAtoms: The number of atoms in the system.
+ * @param a_types: The different atom types of the atoms.
+ * @param solvent: True if atom is a solvent atom, false otherwise.
+ * @param molecule: The number of the molecule this atom belong to.
+ * @param result_ww: The result of the water - water interactions.
+ * @param result_sw: The result of the solute - water interactions.
+ * @param min: The minimum values of the grid.
+ * @param max: The maximum values of the grid.
+ */
+__global__
+void cudaCalcEnergy(Coordinates *coords, int *NBindex, int ntypes, ParamsLJ *parameterLJ, AtomProperties *atomProps, 
+                          BoxInfo recip_o_box, UnitCell ucell, int maxAtoms, float *result_ww, float *result_sw, 
+                          float *min, float *max, int headAtomType, float neighbourCut2, int *result_O, int *result_N) {
+  
+
+  int a1 = blockIdx.x * blockDim.x + threadIdx.x;
+  int a2 = blockIdx.y * blockDim.y + threadIdx.y;
+  if ( (a1 >= maxAtoms) || (a2 >= maxAtoms) || (a1 == a2)) {
+    return;
+  }
+
+  AtomProperties atom1 = atomProps[a1];
+  AtomProperties atom2 = atomProps[a2];
+
+  // Do not calculate if the two values are the same or they belong to the same molecule.
+  if ( (atom1.molecule != atom2.molecule)) {
+
+    Coordinates t1 = coords[a1];
+    Coordinates t2 = coords[a2];
+    ParamsLJ lj = getLJParam(atom1.atomType, atom2.atomType, NBindex, ntypes, parameterLJ);
+
+    float vec1[3] = {t1.x, t1.y, t1.z};
+    float vec2[3] = {t2.x, t2.y, t2.z};
+    float r_2 = calcDist(vec1, vec2, recip_o_box, ucell);
+    float energy = calcTotalEnergy(atom1.charge, atom2.charge, lj.A, lj.B, r_2);
+    
+    if (atom2.solvent != 0) {
+      atomicAdd(&(result_ww[a1]), energy * 0.5);
+    } else {
+      atomicAdd(&(result_sw[a1]), energy);
+    }
+  
+  }
+}
+
+/**
+ * Calculate the energy on the GPU. This implementation is somewhat slower,
+ * but is able to calculate the order parameters as well as the rest.
+ * @param coords: An array holding all the coordinates of all atoms.
+ * @param NBindex: An array holding indices into the LJ parameter arrays.
+ * @param ntypes: The number of atom types.
+ * @param paramsLJA: The A LJ parameters.
+ * @param paramsLJB: The B LJ parameters.
+ * @param charges: The charges of the atoms.
+ * @param boxinfo: Which kind of box, 0 not periodic, 1 orthorhombic, 2 otherwise.
+ * @param recip_o_box: Holds either the inverse of the cell matrix, if boxinfo is 2,
+ *                        or the box dimensions, if boxinfo is 1, or is NULL, if boxinfo is 0.
+ * @param ucell: Holds the cell matrix, if boxinfo is 2, NULL otherwise.
+ * @param maxAtoms: The number of atoms in the system.
+ * @param a_types: The different atom types of the atoms.
+ * @param solvent: True if atom is a solvent atom, false otherwise.
+ * @param molecule: The number of the molecule this atom belong to.
+ * @param result_ww: The result of the water - water interactions.
+ * @param result_sw: The result of the solute - water interactions.
+ * @param min: The minimum values of the grid.
+ * @param max: The maximum values of the grid.
+ */
+__global__
+void cudaCalcEnergySlow(Coordinates *coords, int *NBindex, int ntypes, ParamsLJ *parameterLJ, AtomProperties *atomProps, 
+  BoxInfo recip_o_box, UnitCell ucell, int maxAtoms, float *result_ww, float *result_sw,
+  float *min, float *max, int headAtomType, float neighbourCut2, int *result_O, int *result_N) {
+  
+  int a1 = blockIdx.x * blockDim.x + threadIdx.x;
+  
+  if (a1 >= maxAtoms) {
+  return;
+  }
+  
+  AtomProperties atom1 = atomProps[a1];
+  float distances[4] = {HUGE_C, HUGE_C, HUGE_C, HUGE_C};
+  result_N[a1] = 0;
+  result_O[4 * a1 + 3] = 0;
+  result_O[4 * a1 + 2] = 0;
+  result_O[4 * a1 + 1] = 0;
+  result_O[4 * a1    ] = 0;
+  for (int a2 = 0; a2 < maxAtoms; ++a2) {
+     AtomProperties atom2 = atomProps[a2];
+    // Do not calculate if the two values are the same or they belong to the same molecule.
+    if ( (a1 != a2) && (atom1.molecule != atom2.molecule)) {
+      Coordinates t1 = coords[a1];
+      Coordinates t2 = coords[a2];
+      ParamsLJ lj = getLJParam(atom1.atomType, atom2.atomType, NBindex, ntypes, parameterLJ);
+      float vec1[3] = {t1.x, t1.y, t1.z};
+      float vec2[3] = {t2.x, t2.y, t2.z};
+      float r_2 = calcDist(vec1, vec2, recip_o_box, ucell);
+      float energy = calcTotalEnergy(atom1.charge, atom2.charge, lj.A, lj.B, r_2);
+      if ((atom2.atomType == headAtomType) && atom2.solvent && atom1.solvent) {
+        if (r_2 < distances[0]) {
+          distances[3] = distances[2];
+          distances[2] = distances[1];
+          distances[1] = distances[0];
+          distances[0] = r_2;
+          result_O[4 * a1 + 3] = result_O[4 * a1 + 2];
+          result_O[4 * a1 + 2] = result_O[4 * a1 + 1];
+          result_O[4 * a1 + 1] = result_O[4 * a1    ];
+          result_O[4 * a1    ] = a2;
+        } else if (r_2 < distances[1]) {
+          distances[3] = distances[2];
+          distances[2] = distances[1];
+          distances[1] = r_2;
+          result_O[4 * a1 + 3] = result_O[4 * a1 + 2];
+          result_O[4 * a1 + 2] = result_O[4 * a1 + 1];
+          result_O[4 * a1 + 1] = a2;
+        } else if (r_2 < distances[2]) {
+          distances[3] = distances[2];
+          distances[2] = r_2;
+          result_O[4 * a1 + 3] = result_O[4 * a1 + 2];
+          result_O[4 * a1 + 2] = a2;
+        } else if (r_2 < distances[3]) {
+          distances[3] = r_2;
+          result_O[4 * a1 + 3] = a2;
+        }
+        if (r_2 < neighbourCut2) {
+          result_N[a1] += 1;
+        }
+      }
+      if (atom2.solvent != 0) {
+        result_ww[a1] += energy / 2.0;
+      } else {
+        result_sw[a1] += energy;
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/cuda_kernels/GistCudaCalc.cuh b/src/cuda_kernels/GistCudaCalc.cuh
new file mode 100644
index 0000000000..9f46d9b798
--- /dev/null
+++ b/src/cuda_kernels/GistCudaCalc.cuh
@@ -0,0 +1,367 @@
+#ifndef GIST_CUDA_CALC_CUH
+#define GIST_CUDA_CALC_CUH
+
+#define HUGE_C 1e30f
+#define BLOCKSIZE 16
+#define SLOW_BLOCKSIZE 512
+
+/**
+ * Coordinates class.
+ *
+ * Holds three different values for the coordinates, to achieve
+ * a faster access on the GPU (only a single read from the memory
+ * instead of three different reads, also the memory is better
+ * aligned).
+ * @author Johannes Kraml
+ */
+class Coordinates {
+public:
+  float x;
+  float y;
+  float z;
+
+  // Empty Constructor
+  __host__ __device__
+  Coordinates(): x(0), y(0), z(0) {}
+
+  /**
+   * Constructor using an array of three values.
+   * @param array: The array that holds the x, y and z coordinates
+   */
+  __host__ __device__
+  Coordinates(const double *array) {
+    this->x = array[0];
+    this->y = array[1];
+    this->z = array[2];
+  }
+
+  /**
+   * Copy Constructor.
+   * @param other: The other object.
+   */
+  __host__ __device__
+  Coordinates(const Coordinates &other) {
+    this->x = other.x;
+    this->y = other.y;
+    this->z = other.z;
+  }
+
+  /**
+   * The assignment operator for an object of this class.
+   * @param other: The other Coordinate object.
+   * @return This object.
+   */
+  __host__ __device__
+  Coordinates &operator=(const Coordinates &other) {
+    this->x = other.x;
+    this->y = other.y;
+    this->z = other.z;
+    return *this;
+  }
+  
+};
+
+/**
+ * An implementation holding the A and B values for a
+ * Lennard-Jones van der Waals interaction energy calculation.
+ * @author Johannes Kraml
+ */
+class ParamsLJ{
+public:
+  float A;
+  float B;
+
+  // Empty constructor
+  __host__ __device__
+  ParamsLJ(): A(0), B(0) {}
+
+  /**
+   * Constructor from an array of values.
+   * @param arr: The array that holds the A and B values.
+   */
+  __host__ __device__
+  ParamsLJ(float *arr) {
+    this->A = arr[0];
+    this->B = arr[1];
+  }
+
+  /**
+   * Constructor using two different floats.
+   * @param A: The A value in the Lennard-Jones equation.
+   * @param B: The B value in the Lennard-Jones equation.
+   */
+  __host__ __device__
+  ParamsLJ(float A, float B) {
+    this->A = A;
+    this->B = B;
+  }
+
+  /**
+   * Copy constructor for this class.
+   * @param other: The other object of this class
+   */
+  __host__ __device__
+  ParamsLJ(const ParamsLJ &other) {
+    this->A = other.A;
+    this->B = other.B;
+  }
+
+  /**
+   * The assignment operator of this class.
+   * @param other: The other object of this type.
+   * @return this object.
+   */
+  __host__ __device__
+  ParamsLJ &operator=(const ParamsLJ &other) {
+    this->A = other.A;
+    this->B = other.B;
+    return *this;
+  }
+};
+
+/**
+ * The implementation of the Unit Cell on the GPU.
+ * @author Johannes Kraml
+ */
+class UnitCell {
+public:
+  float array[9];
+  
+  // The empty constuctor.
+  __host__ __device__
+  UnitCell() {}
+
+  /**
+   * Constructor using an array.
+   * @param arr: The array from which the values should be
+   *             taken.
+   */
+  __host__ __device__
+  UnitCell(float *arr) {
+    this->array[0] = arr[0];
+    this->array[1] = arr[1];
+    this->array[2] = arr[2];
+    this->array[3] = arr[3];
+    this->array[4] = arr[4];
+    this->array[5] = arr[5];
+    this->array[6] = arr[6];
+    this->array[7] = arr[7];
+    this->array[8] = arr[8];
+  }
+
+  /**
+   * Copy constructor.
+   * @param other: The other object.
+   */
+  __host__ __device__
+  UnitCell(const UnitCell &other) {
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+  }
+
+  /**
+   * Assignment operator.
+   * @param other: The other object.
+   * @return This object.
+   */
+  __host__ __device__
+  UnitCell &operator=(const UnitCell &other){
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+    return *this;
+  }
+
+  /**
+   * Access operator.
+   * @param idx: The index which should be accessed.
+   *
+   * @exceptsafe Not safe.
+   */
+  __host__ __device__
+  float operator[](int idx) {
+    if (idx >= 0 && idx < 9) {
+      return this->array[idx];
+    }
+    return 1;
+  }
+
+};
+
+/**
+ * Class holding the box information, i.e., which kind of box
+ * and the box dimensions.
+ */
+class BoxInfo {
+public:
+  float array[9]; ///< box dimensions
+  int boxinfo;
+  
+  // Empty constructor
+  __host__ __device__
+  BoxInfo(): boxinfo(0) {}
+
+  /**
+   * Constructor using an array and the boxinfo.
+   * @param arr: The array holding the values for the box dimensions.
+   * @param boxinfo: Which kind of box this is.
+   */
+  __host__ __device__
+  BoxInfo(float *arr, int boxinfo) {
+    this->array[0] = arr[0];
+    this->array[1] = arr[1];
+    this->array[2] = arr[2];
+    this->array[3] = arr[3];
+    this->array[4] = arr[4];
+    this->array[5] = arr[5];
+    this->array[6] = arr[6];
+    this->array[7] = arr[7];
+    this->array[8] = arr[8];
+    this->boxinfo = boxinfo;
+  }
+
+  /**
+   * Copy constuctor.
+   * @param other: The other object.
+   */
+  __host__ __device__
+  BoxInfo(const BoxInfo &other) {
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+    this->boxinfo = other.boxinfo;
+  }
+
+  /**
+   * Assignement operator.
+   * @param other: The other object.
+   * @return This object.
+   */
+  __host__ __device__
+  BoxInfo &operator=(const BoxInfo &other){
+    this->array[0] = other.array[0];
+    this->array[1] = other.array[1];
+    this->array[2] = other.array[2];
+    this->array[3] = other.array[3];
+    this->array[4] = other.array[4];
+    this->array[5] = other.array[5];
+    this->array[6] = other.array[6];
+    this->array[7] = other.array[7];
+    this->array[8] = other.array[8];
+    this->boxinfo = other.boxinfo;
+    return *this;
+  }
+
+  /**
+   * Access operator.
+   * @param idx: The index at which to access the array.
+   * @return The value stored at that point in the array or 0 if none is found.
+   * @exceptsafe Is not exception safe.
+   */
+  __host__ __device__
+  float operator[](int idx) {
+    if (idx < 9 && idx >= 0) {
+      return this->array[idx];
+    }
+    return 0;
+  }
+
+};
+
+/**
+ * Class to store different Atom properties, like charge, atomType, solvent and the
+ * molecule this atom belongs to.
+ * @author Johannes Kraml
+ */
+class AtomProperties {
+public:
+  float charge;
+  int atomType;
+  bool solvent;
+  int molecule;
+
+  // Empty constructor
+  __host__ __device__
+  AtomProperties() {}
+
+  /**
+   * Constructor.
+   * @param charge: The charge of the atom.
+   * @param atomType: The atom type for access of the lennard-jones parameters.
+   * @param solvent: Is this atom a solvent atom?
+   * @param molecule: The molecule this atom belongs to.
+   */
+  __host__ __device__
+  AtomProperties(float charge, int atomType, bool solvent, int molecule) {
+    this->charge = charge;
+    this->atomType = atomType;
+    this->solvent = solvent;
+    this->molecule = molecule;
+  }
+
+  /**
+   * Copy constructor.
+   * @param other: The other object.
+   */
+  __host__ __device__
+  AtomProperties(const AtomProperties &other) {
+    this->charge = other.charge;
+    this->atomType = other.atomType;
+    this->solvent = other.solvent;
+    this->molecule = other.molecule;
+  }
+
+  /**
+   * Assignement operator.
+   * @param other: The other object.
+   * @return This object.
+   */
+  __host__  __device__
+  AtomProperties &operator=(const AtomProperties &other) {
+    this->charge = other.charge;
+    this->atomType = other.atomType;
+    this->solvent = other.solvent;
+    this->molecule = other.molecule;
+    return *this;
+  }
+};
+
+
+// Device functions
+__device__ float dist2_imageOrtho(float *, float *, BoxInfo);
+__device__ void scalarProd(float* , BoxInfo , float *);
+__device__ float dist2_imageNonOrtho(float *, float *, BoxInfo, UnitCell);
+__device__ float calcIfDistIsSmaller(float *, float *, int , int , int , UnitCell, float );
+__device__ float dist2_imageNonOrthoRecip(float * , float * , UnitCell);
+__device__ float dist2_noImage(float *, float *);
+__device__ float calcTotalEnergy(float , float , float , float , float);
+__device__ float calcVdWEnergy(float , float , float );
+__device__ float calcElectrostaticEnergy(float, float, float);
+__device__ int getLJIndex(int , int , int *, int );
+__device__ ParamsLJ getLJParam(int , int , int *, int , ParamsLJ *);
+__device__ bool isOnGrid(float *, float *, float *);
+__device__ float calcDist(float *, float *, BoxInfo, UnitCell);
+
+// Global functions
+__global__ void cudaCalcEnergy    (Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *, float *, float *, int, float, int *, int *);
+__global__ void cudaCalcEnergySlow(Coordinates *, int *, int, ParamsLJ *, AtomProperties *, BoxInfo, UnitCell, int, float *, float *,	float *, float *, int, float, int *, int *);
+#endif
diff --git a/src/cuda_kernels/GistCudaSetup.cu b/src/cuda_kernels/GistCudaSetup.cu
new file mode 100644
index 0000000000..d257da38b5
--- /dev/null
+++ b/src/cuda_kernels/GistCudaSetup.cu
@@ -0,0 +1,238 @@
+#include "GistCudaSetup.cuh"
+#include "GistCudaCalc.cuh"
+
+
+/**
+ * Allocate memory on the GPU.
+ * @param array: The pointer to the array, which will be allocated on the GPU.
+ * @param size: An integer giving the size of the array, which will be allocated.
+ * @throws: CudaException if a problem occurs.
+ */
+__host__
+void allocateCuda(void **array, int size) {
+  // Check if the array is actually free, if not, it will be freed 
+  // (fun fact: checking is not necessary, one could also simply free the memory).
+  if ((*array) != NULL) {
+    cudaFree(*array);
+  }
+  // If something goes wrong, throw exception
+  if (cudaMalloc(array, size) != cudaSuccess) {
+    throw CudaException();
+  }
+}
+
+/**
+ * Copy memory from the CPU to the GPU.
+ * @param array: The array from which the values shall be copied.
+ * @param array_c: The array on the device, to which the values shall be copied.
+ * @param size: The size of the stuff which will be copied.
+ * @throws: CudaException if something goes wrong.
+ */
+__host__
+void copyMemoryToDevice(void *array, void *array_c, int size) {
+  // If something goes wrong, throw exception
+  // In this case only copying can go wrong.
+  if (cudaMemcpy(array_c, array, size, cudaMemcpyHostToDevice) != cudaSuccess) {
+    throw CudaException();
+  }
+}
+
+/**
+ * A simple helper function that copies a lot of stuff to the GPU (as structs).
+ * @param charge: An array holding the charges for the different atoms.
+ * @param atomtype: An array holding the integers for the atom types of the different atoms.
+ * @param solvent: An array of boolean values, holding the information whether a certain atom is solvent or solute.
+ * @param atomNumber: The total number of atoms.
+ * @param atomProps_c: A pointer to an array on the GPU, which will hold the atom properties.
+ * @param ljA: An array holding the lennard-jones parameter A for each atom type pair.
+ * @param ljB: An array holding the lennard-jones parameter B for each atom type pair.
+ * @param length: The length of the two aforementioned arrays (ljA & ljB).
+ * @param lJparams_c: A pointer to an array on the GPU, which will hold the lj parameters.
+ * @throws: CudaException if something bad happens.
+ */
+__host__
+void copyMemoryToDeviceStruct(float *charge, int *atomtype, bool *solvent, int *molecule, int atomNumber, void **atomProps_c,
+                              float *ljA, float *ljB, int length, void **lJparams_c) {
+  // Check if the two arrays are free. Again, this could be removed (but will stay!)
+  if ((*atomProps_c) != NULL) {
+    cudaFree(*atomProps_c);
+  }
+  if ((*lJparams_c) != NULL) {
+    cudaFree(*lJparams_c);
+  }
+  // Allocate the necessary memory on the GPU.
+  if (cudaMalloc(atomProps_c, atomNumber * sizeof(AtomProperties)) != cudaSuccess) {
+    throw CudaException();
+  }
+  if (cudaMalloc(lJparams_c, length * sizeof(ParamsLJ)) != cudaSuccess) {
+    throw CudaException();
+  }
+
+  // Create an array for the lennard-jones parameters.
+  ParamsLJ *ljp = (ParamsLJ *) malloc (length * sizeof(ParamsLJ));
+  // Add the lennard-jones parameters to the array.
+  for (int i = 0; i < length; ++i) {
+    ljp[i] = ParamsLJ(ljA[i], ljB[i]);
+  }
+
+  // Create an array for the atom properties.
+  AtomProperties *array = (AtomProperties *)malloc(atomNumber * sizeof(AtomProperties));
+  // Add the properties into the array.
+  for (int i = 0; i < atomNumber; ++i) {
+    array[i] = AtomProperties(charge[i], atomtype[i], solvent[i], molecule[i]);
+  }
+  // Copy the memory from the host to the device.
+  if (cudaMemcpy((*atomProps_c), array, atomNumber * sizeof(AtomProperties), cudaMemcpyHostToDevice) != cudaSuccess) {
+    throw CudaException();
+  }
+  if (cudaMemcpy((*lJparams_c), ljp, length * sizeof(ParamsLJ), cudaMemcpyHostToDevice) != cudaSuccess) {
+    throw CudaException();
+  }
+
+  // Free the two arrays (so that no memory leak occurs).
+  free(ljp);
+  free(array);
+}
+
+/**
+ * Free an array on the CUDA capable device.
+ * @param array: The array you want to free.
+ */
+__host__
+void freeCuda(void *array) {
+  cudaFree(array);
+}
+
+
+// This is coded C-like, but uses exceptions.
+/**
+ * This starts the cuda kernel, thus it is actually a quite long function.
+ */
+__host__
+std::vector<std::vector<float> > doActionCudaEnergy(const double *coords, int *NBindex_c, int ntypes, void *parameter, void *molecule_c,
+                            int boxinfo, float *recip_o_box, float *ucell, int maxAtoms, float *min_c, float *max_c, int headAtomType, 
+                            float neighbourCut2, int *result_o, int *result_n, float *result_w_c, float *result_s_c,
+                            int *result_O_c, int *result_N_c, bool doorder) {
+  Coordinates *coords_c   = NULL;
+  float *recip_b_c  = NULL;
+  float *ucell_c    = NULL;
+  
+  
+
+  float *result_A = (float *) calloc(maxAtoms, sizeof(float));
+  float *result_s = (float *) calloc(maxAtoms, sizeof(float));
+  // TODO: Fix this, test is actually a quite bad name here!
+  Coordinates *coord_array = (Coordinates *) calloc(maxAtoms, sizeof(Coordinates));
+  
+  // Casting
+  AtomProperties *sender = (AtomProperties *) molecule_c;
+  ParamsLJ *lennardJonesParams = (ParamsLJ *) parameter;
+  
+  // Create Boxinfo and Unit cell. This is actually very important for the speed (otherwise
+  // there would be LOTS of access to non-local variables).
+  BoxInfo boxinf;
+  if (boxinfo != 0) {
+    boxinf = BoxInfo(recip_o_box, boxinfo);
+  }
+  UnitCell ucellN;
+  if (boxinfo == 2) {
+    ucellN = UnitCell(ucell);
+  }
+  
+  // Add the coordinates to the array.
+  // TODO: Fix Test here also!
+  for (int i = 0; i < maxAtoms; ++i) {
+    coord_array[i] = Coordinates(&coords[i * 3]);
+  }
+
+  // vectors that will return the necessary information.
+  std::vector<std::vector<float> > result;
+  std::vector<float> result_esw;
+  std::vector<float> result_eww;
+
+  // Allocate space on the GPU
+  if (cudaMalloc(&coords_c, maxAtoms * sizeof(Coordinates)) != cudaSuccess) {
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+
+  // Copy the data to the GPU
+  if (cudaMemcpy(coords_c, coord_array, maxAtoms * sizeof(Coordinates), cudaMemcpyHostToDevice) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+  if (cudaMemcpy(result_w_c, result_A, maxAtoms * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+  if (cudaMemcpy(result_s_c, result_s, maxAtoms * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+  // If the doorder calculation is used, it needs to calculate everything differently, so the slow version is used
+  // (this is about 10% slower).
+  if (doorder) {
+    cudaCalcEnergySlow<<< (maxAtoms + SLOW_BLOCKSIZE) / SLOW_BLOCKSIZE, SLOW_BLOCKSIZE >>> (coords_c, NBindex_c, ntypes, lennardJonesParams, sender,
+                                                                                            boxinf, ucellN, maxAtoms, result_w_c, result_s_c, min_c, max_c,
+                                                                                            headAtomType, neighbourCut2, result_O_c, result_N_c);
+  } else {
+    // Uses a 2D array, which is nice for memory access.
+    dim3 threadsPerBlock(BLOCKSIZE, BLOCKSIZE);
+    dim3 numBlocks((maxAtoms + threadsPerBlock.x) / threadsPerBlock.x, (maxAtoms + threadsPerBlock.y) / threadsPerBlock.y);
+    // The actual call of the device function
+    cudaCalcEnergy<<<numBlocks, threadsPerBlock>>> (coords_c, NBindex_c, ntypes, lennardJonesParams, sender,
+                                                                      boxinf, ucellN, maxAtoms, result_w_c, result_s_c, min_c, max_c,
+                                                                      headAtomType, neighbourCut2, result_O_c, result_N_c);
+    // Check if there was an error.
+    cudaError_t cudaError = cudaGetLastError();
+    if (cudaError != cudaSuccess) {
+      printf("returned %s\n", cudaGetErrorString(cudaError));
+    }
+  }
+  // Return the results of the calculation to the main memory
+  if (cudaMemcpy(result_A, result_w_c, maxAtoms * sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }  
+  
+
+  if (cudaMemcpy(result_s, result_s_c, maxAtoms * sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+
+  
+  if (cudaMemcpy(result_o, result_O_c, maxAtoms * 4 * sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+  
+  if (cudaMemcpy(result_n, result_N_c, maxAtoms * sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) {
+    cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+    free(result_A); free(result_s); free(coord_array);
+    throw CudaException();
+  }
+
+  for (int i = 0; i < maxAtoms; ++i) {
+    result_eww.push_back(result_A[i]);
+    result_esw.push_back(result_s[i]);
+  }
+
+  result.push_back(result_eww);
+  result.push_back(result_esw);
+
+  // Free everything used in here.
+  cudaFree(coords_c); cudaFree(recip_b_c); cudaFree(ucell_c);
+  free(result_A); free(result_s); free(coord_array);
+  
+  return result;
+}
\ No newline at end of file
diff --git a/src/cuda_kernels/GistCudaSetup.cuh b/src/cuda_kernels/GistCudaSetup.cuh
new file mode 100644
index 0000000000..9e13948efd
--- /dev/null
+++ b/src/cuda_kernels/GistCudaSetup.cuh
@@ -0,0 +1,27 @@
+#ifndef GIST_CUDA_SETUP_CUH
+#define GIST_CUDA_SETUP_CUH
+
+#include <exception>
+#include <vector>
+#include <cstdio>
+
+
+
+// Exception classes
+class CudaException : public std::exception {
+public:
+  CudaException() {
+  }
+};
+
+// Function definitions
+void allocateCuda(void **, int);
+void copyMemoryToDevice(void *, void *, int);
+void copyMemoryToDeviceStruct(float *, int *, bool *, int *, int, void **, float *, float *, int, void **);
+void freeCuda(void *);
+std::vector<std::vector<float> > doActionCudaEnergy(const double *, int *, int , void *, void *,
+                            int , float *, float *, int , float *, 
+                            float *, int, float, int *, int *, float *, float *,
+                            int *, int *, bool);
+
+#endif
diff --git a/src/cuda_kernels/Makefile b/src/cuda_kernels/Makefile
index 46b44bdb93..cf663ef027 100644
--- a/src/cuda_kernels/Makefile
+++ b/src/cuda_kernels/Makefile
@@ -10,7 +10,7 @@ TARGET        = libcpptraj_cuda.a
 
 # Source files
 
-CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu
+CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu
 
 # Objects
 
diff --git a/test/MasterTest.sh b/test/MasterTest.sh
index a9fc070345..0566a8920e 100644
--- a/test/MasterTest.sh
+++ b/test/MasterTest.sh
@@ -963,6 +963,13 @@ CheckEnv() {
       'fftw'      ) TestLibrary "FFTW"               "$CPPTRAJ_FFTW_FFT" ;;
       'openmp'    ) TestLibrary "OpenMP"             "$CPPTRAJ_OPENMP" ;;
       'singleensemble' ) TestLibrary "Single ensemble support" "$CPPTRAJ_SINGLE_ENS" ;;
+      'cuda'      ) TestLibrary "CUDA"               "$CPPTRAJ_CUDA" ;;
+      'notcuda'   )
+	if [ ! -z "$CPPTRAJ_CUDA" ]; then
+          echo "  $DESCRIP cannot be run on CUDA."
+	  ((CHECKERR++))
+        fi
+	;;
       'pnetcdf'   )
         if [ ! -z "$DO_PARALLEL" ] ; then
           TestLibrary "Parallel NetCDF" "$CPPTRAJ_PNETCDFLIB"
diff --git a/test/Test_GIST/RunTest.sh b/test/Test_GIST/RunTest.sh
index 1feb9da292..28edac51bb 100755
--- a/test/Test_GIST/RunTest.sh
+++ b/test/Test_GIST/RunTest.sh
@@ -4,30 +4,34 @@
 
 CleanFiles gist.in gist.out gist-*.dx ww_Eij.dat Eww_ij.dat \
            Gist1-*.dx Gist1-*.dat Gist2-*.dx Gist2-*.dat
+INPUT="-i gist.in"
 TESTNAME='GIST tests'
 Requires netcdf notparallel
 
-INPUT="-i gist.in"
-
+UNITNAME='Eww Test'
+CheckFor notcuda
 # doeij test with much smaller grid to save memory
-cat > gist.in <<EOF
-parm ../tz2.ortho.parm7
-trajin ../tz2.ortho.nc 1 10
-autoimage origin
-gist doorder doeij refdens 0.033422885325 gridcntr 1.44 0.67 0.29 \
-     griddim 10 12 10 gridspacn 2.0 prefix Gist1
-go
+if [ $? -eq 0 ]; then
+  cat > gist.in <<EOF
+  parm ../tz2.ortho.parm7
+  trajin ../tz2.ortho.nc 1 10
+  autoimage origin
+  gist doorder doeij refdens 0.033422885325 gridcntr 1.44 0.67 0.29 \
+    griddim 10 12 10 gridspacn 2.0 prefix Gist1
+  go
 EOF
-RunCpptraj "GIST water-water interaction test"
-DoTest Gist1-Eww_ij.dat.save Gist1-Eww_ij.dat
+  RunCpptraj "GIST water-water interaction test"
+  DoTest Gist1-Eww_ij.dat.save Gist1-Eww_ij.dat
+fi
 
 # GIST test with finer grid for everything else
+UNITNAME='CPU and GPU tests'
 cat > gist.in <<EOF
 parm ../tz2.ortho.parm7
 trajin ../tz2.ortho.nc 1 10
 autoimage origin
 gist doorder refdens 0.033422885325 gridcntr 1.5 1.0 0.0 \
-     griddim 34 44 36 gridspacn 0.50 prefix Gist2 info Info.dat
+    griddim 34 44 36 gridspacn 0.50 prefix Gist2 info Info.dat
 go
 EOF
 RunCpptraj "GIST test"