From 88136e60b8bc6a6c53c507bfc4dda1a39163e864 Mon Sep 17 00:00:00 2001
From: Dave Rowenhorst <45666721+drowenhorst-nrl@users.noreply.github.com>
Date: Wed, 22 May 2024 17:41:59 -0400
Subject: [PATCH] NLPAR GPU (#56)

* GPU enabled version of NLPAR
* PyEBSDIndex will now default to using discrete graphics if both integrated and discrete GPUs are found.
---
 .github/dependabot.yml               |    5 +-
 .github/workflows/tests.yml          |    4 +-
 License                              |    2 +-
 pyebsdindex/__init__.py              |   12 +-
 pyebsdindex/_ebsd_index_single.py    |  112 ++-
 pyebsdindex/band_detect.py           |   73 +-
 pyebsdindex/band_vote.py             |  944 ---------------------
 pyebsdindex/crystal_sym.py           |    2 +-
 pyebsdindex/crystallometry.py        |   52 +-
 pyebsdindex/misorientation.py        |   70 +-
 pyebsdindex/nlpar.py                 |  693 +---------------
 pyebsdindex/nlpar_cpu.py             |  794 ++++++++++++++++++
 pyebsdindex/opencl/band_detect_cl.py |   55 +-
 pyebsdindex/opencl/clkernels.cl      |   85 +-
 pyebsdindex/opencl/clnlpar.cl        |  455 ++++++++++
 pyebsdindex/opencl/nlpar_cl.py       |  538 ++++++++++++
 pyebsdindex/opencl/nlpar_clray.py    |  799 ++++++++++++++++++
 pyebsdindex/opencl/openclparam.py    |   41 +-
 pyebsdindex/opencl/test.cl           |   67 ++
 pyebsdindex/radon_fast.py            |   11 +-
 pyebsdindex/rotlib.py                |   13 +-
 pyebsdindex/tests/test_package.py    |    3 +-
 pyebsdindex/tripletvote.py           | 1150 +++++++++++++++-----------
 setup.py                             |    2 +-
 24 files changed, 3723 insertions(+), 2259 deletions(-)
 delete mode 100644 pyebsdindex/band_vote.py
 create mode 100644 pyebsdindex/nlpar_cpu.py
 create mode 100644 pyebsdindex/opencl/clnlpar.cl
 create mode 100644 pyebsdindex/opencl/nlpar_cl.py
 create mode 100644 pyebsdindex/opencl/nlpar_clray.py
 create mode 100644 pyebsdindex/opencl/test.cl

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 5990d9c..be902ed 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,7 +5,10 @@
 
 version: 2
 updates:
-  - package-ecosystem: "" # See documentation for possible values
+  - package-ecosystem: "github-actions" # See documentation for possible values
     directory: "/" # Location of package manifests
+    target-branch: "develop"
     schedule:
       interval: "weekly"
+    open-pull-requests-limit: 2
+    
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b845d52..4f7d5a2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,10 +49,10 @@ jobs:
             DEPENDENCIES: matplotlib==3.3 numba==0.52 numpy==1.19 ray[default]==1.13
             LABEL: -oldest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
 
diff --git a/License b/License
index 1110b57..10cb697 100644
--- a/License
+++ b/License
@@ -18,5 +18,5 @@ works bear some notice that they are derived from it, and any modified versions
 some notice that they have been modified.
 
 Author: David Rowenhorst; 
-The US Naval Research Laboratory Date: 21 Aug 2020
+The US Naval Research Laboratory Date: 22 May 2024
 
diff --git a/pyebsdindex/__init__.py b/pyebsdindex/__init__.py
index 1a0318f..ac40fd0 100644
--- a/pyebsdindex/__init__.py
+++ b/pyebsdindex/__init__.py
@@ -10,10 +10,18 @@
 __version__ = "0.2.2dev"
 
 
-# Try to import only once
+# Try to import only once - also will perform check that at least one GPU is found.
 try:
+    _pyopencl_installed = False
     import pyopencl
-    _pyopencl_installed = True
+    from pyebsdindex.opencl import openclparam
+    testcl = openclparam.OpenClParam()
+    try:
+        gpu = testcl.get_gpu()
+        if len(gpu) > 0:
+            _pyopencl_installed = True
+    except:
+        raise ImportError('pyopencl could not find GPU')
 except ImportError:
     _pyopencl_installed = False
 
diff --git a/pyebsdindex/_ebsd_index_single.py b/pyebsdindex/_ebsd_index_single.py
index eb9989a..d11647f 100644
--- a/pyebsdindex/_ebsd_index_single.py
+++ b/pyebsdindex/_ebsd_index_single.py
@@ -18,7 +18,14 @@
 # some notice that they have been modified.
 #
 # Author: David Rowenhorst;
-# The US Naval Research Laboratory Date: 21 Aug 2020
+# The US Naval Research Laboratory Date: 22 May 2024
+
+# For further information see:
+# David J. Rowenhorst, Patrick G. Callahan, Håkon W. Ånes. Fast Radon transforms for
+# high-precision EBSD orientation determination using PyEBSDIndex.
+# Journal of Applied Crystallography, 57(1):3–19, 2024.
+# DOI: 10.1107/S1600576723010221
+
 
 """Setup and handling of Radon indexing runs of EBSD patterns on a
 single thread.
@@ -412,6 +419,12 @@ def update_file(self, filename=None, patDim=np.array([120, 120], dtype=np.int32)
         if filename is None:
             self.filein = None
             self.bandDetectPlan.band_detect_setup(patDim=patDim)
+        elif isinstance(filename, ebsd_pattern.EBSDPatternFile):
+            self.fID = filename  
+            self.bandDetectPlan.band_detect_setup(
+                patDim=[self.fID.patternH, self.fID.patternW]
+            )       
+            self.filein = filename.filepath   
         else:
             self.filein = filename
             self.fID = ebsd_pattern.get_pattern_file_obj(self.filein)
@@ -617,6 +630,102 @@ def _detectbands(self, pats, PC, xyloc=None, clparams=None, verbose=0, chunksize
 
     def _indexbandsphase(self, banddata, bandnorm, verbose=0):
 
+#
+        rhomax = self.bandDetectPlan.rhoMax * (1-self.bandDetectPlan.rhoMaskFrac)
+        shpBandDat = banddata.shape
+        npoints = int(banddata.size/(shpBandDat[-1])+0.1)
+        nPhases = len(self.phaseLib)
+        q = np.zeros((nPhases, npoints, 4))
+        indxData = np.zeros((nPhases + 1, npoints), dtype=self.dataTemplate)
+        bandmatchindex = np.zeros((nPhases, npoints,shpBandDat[-1],2), dtype=np.int32)-100
+        banddataout = banddata.copy()
+
+        indxData["phase"] = -1
+        indxData["fit"] = 180.0
+        indxData["totvotes"] = 0
+        indxData["nmatch"] = 0
+
+
+        if self.phaseLib[0] is None:
+            return indxData, banddata
+
+        if self.nband_earlyexit is None:
+            earlyexit = -1
+            for ph in self.phaselist:
+                if hasattr(ph, 'nband_earlyexit'):
+                    earlyexit = max(earlyexit, ph.nband_earlyexit)
+            if earlyexit < 0:
+                earlyexit = shpBandDat[1]  # default to all the poles.
+            self.nband_earlyexit = earlyexit
+        else:
+            earlyexit = self.nband_earlyexit
+
+
+        adj_intensity = (-1 * np.abs(banddata["rho"]) * 0.5 / rhomax + 1) * banddata["max"]
+        adj_intensity *= ((banddata["theta"] > (2 * np.pi / 180)).astype(np.float32) + 0.5) / 2
+        adj_intensity *= ((banddata["theta"] < (178.0 * np.pi / 180)).astype(np.float32) + 0.5) / 2
+
+        adj_intensity *= banddata["valid"]
+
+        #print(adj_intensity.shape)
+
+
+        for j in range(len(self.phaseLib)):
+
+            indxData['pq'][j, :] = np.sum(banddata['max'] * banddata['valid'], axis=1)
+            p2do = np.ravel(np.nonzero(np.max(indxData["nmatch"], axis=0) < earlyexit)[0])
+
+            if p2do.size ==0:
+                break
+            (
+                avequat,
+                fit,
+                cm,
+                bandmatch,
+                nMatch,
+                matchAttempts,
+                totvotes,
+            ) = self.phaseLib[j].bandindex(
+            bandnorm[p2do, ...], band_intensity=adj_intensity[p2do, ...], band_widths=banddata["width"][p2do, ...], verbose=verbose)
+            whgood = np.nonzero(nMatch >= 3 )[0]
+            if whgood.size > 0:
+                whgood2 = p2do[whgood]
+                q[j, whgood2, :] = avequat[whgood, ...]
+                indxData["fit"][j, whgood2] = fit[whgood]
+                indxData["cm"][j, whgood2] = cm[whgood]
+                indxData["phase"][j, whgood2] = j
+                indxData["nmatch"][j, whgood2] = nMatch[whgood]
+                indxData["matchattempts"][j, whgood2] = matchAttempts[whgood, ...]
+                indxData["totvotes"][j, whgood2] = totvotes[whgood]
+                bandmatchindex[j, whgood2, ..., 1] = bandmatch[whgood, ...]
+
+
+
+
+        qref2detect = self._detector2refframe()
+        q = q.reshape(nPhases * npoints, 4)
+        q = rotlib.quat_multiply(q, qref2detect)
+        q = rotlib.quatnorm(q)
+        q = q.reshape(nPhases, npoints, 4)
+        indxData["quat"][0:nPhases, :, :] = q
+        indxData[-1, :] = indxData[0, :]
+        banddataout['band_match_index'][:,:,:] = bandmatchindex[0,:,:,:].squeeze()
+        if nPhases > 1:
+            for j in range(1, nPhases):
+                # indxData[-1, :] = np.where(
+                #    (indxData[j, :]["cm"] * indxData[j, :]["nmatch"])
+                #    > (indxData[j + 1, :]["cm"] * indxData[j + 1, :]["nmatch"]),
+                #    indxData[j, :],
+                #    indxData[j + 1, :],
+                phasetest = ((3.0 - indxData[j, :]["fit"]) * indxData[j, :]["nmatch"]) \
+                            > ((3.0 - indxData[-1, :]["fit"]) * indxData[-1, :]["nmatch"])
+                whbetter = np.nonzero(phasetest)
+                indxData[-1, whbetter] = indxData[j, whbetter]
+                banddataout['band_match_index'][whbetter,:] =  bandmatchindex[j,whbetter,:,:].squeeze()
+        return indxData, banddataout
+
+    def _indexbandsphase_old(self, banddata, bandnorm, verbose=0):
+
 #
         rhomax = self.bandDetectPlan.rhoMax * (1-self.bandDetectPlan.rhoMaskFrac)
         shpBandDat = banddata.shape
@@ -706,7 +815,6 @@ def _indexbandsphase(self, banddata, bandnorm, verbose=0):
                 indxData[-1, whbetter] = indxData[j, whbetter]
                 banddataout['band_match_index'][whbetter,:] =  bandmatchindex[j,whbetter,:,:].squeeze()
         return indxData, banddataout
-
     def _detector2refframe(self):
         ven = str.upper(self.vendor)
         if ven in ["EDAX", "EMSOFT", "KIKUCHIPY"]:
diff --git a/pyebsdindex/band_detect.py b/pyebsdindex/band_detect.py
index c77b572..af5abb9 100644
--- a/pyebsdindex/band_detect.py
+++ b/pyebsdindex/band_detect.py
@@ -1,29 +1,41 @@
-"""This software was developed by employees of the US Naval Research Laboratory (NRL), an
-agency of the Federal Government. Pursuant to title 17 section 105 of the United States
-Code, works of NRL employees are not subject to copyright protection, and this software
-is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
-responsibility whatsoever for its use by other parties, and makes no guarantees,
-expressed or implied, about its quality, reliability, or any other characteristic. We
-would appreciate acknowledgment if the software is used. To the extent that NRL may hold
-copyright in countries other than the United States, you are hereby granted the
-non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
-works and distribute this software, in any medium, or authorize others to do so on your
-behalf, on a royalty-free basis throughout the world. You may improve, modify, and
-create derivative works of the software or any portion of the software, and you may copy
-and distribute such modifications or works. Modified works should carry a notice stating
-that you changed the software and should note the date and nature of any such change.
-Please explicitly acknowledge the US Naval Research Laboratory as the original source.
-This software can be redistributed and/or modified freely provided that any derivative
-works bear some notice that they are derived from it, and any modified versions bear
-some notice that they have been modified.
-
-Author: David Rowenhorst;
-The US Naval Research Laboratory Date: 21 Aug 2020"""
-
-from os import environ
-from pathlib import PurePath
+# This software was developed by employees of the US Naval Research Laboratory (NRL), an
+# agency of the Federal Government. Pursuant to title 17 section 105 of the United States
+# Code, works of NRL employees are not subject to copyright protection, and this software
+# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
+# responsibility whatsoever for its use by other parties, and makes no guarantees,
+# expressed or implied, about its quality, reliability, or any other characteristic. We
+# would appreciate acknowledgment if the software is used. To the extent that NRL may hold
+# copyright in countries other than the United States, you are hereby granted the
+# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
+# works and distribute this software, in any medium, or authorize others to do so on your
+# behalf, on a royalty-free basis throughout the world. You may improve, modify, and
+# create derivative works of the software or any portion of the software, and you may copy
+# and distribute such modifications or works. Modified works should carry a notice stating
+# that you changed the software and should note the date and nature of any such change.
+# Please explicitly acknowledge the US Naval Research Laboratory as the original source.
+# This software can be redistributed and/or modified freely provided that any derivative
+# works bear some notice that they are derived from it, and any modified versions bear
+# some notice that they have been modified.
+#
+#
+# Author: David Rowenhorst;
+# The US Naval Research Laboratory Date: 22 May 2024
+#
+# For further information see:
+# David J. Rowenhorst, Patrick G. Callahan, Håkon W. Ånes. Fast Radon transforms for
+# high-precision EBSD orientation determination using PyEBSDIndex.
+# Journal of Applied Crystallography, 57(1):3–19, 2024.
+# DOI: 10.1107/S1600576723010221
+#
+#
+
+"""Class that reads in EBSD patterns, and finds the bands within those patterns using a Radon Transform."""
+
+#from os import environ
+import os
+from pathlib import PurePath, Path
 import platform
-import tempfile
+# import tempfile
 from timeit import default_timer as timer
 
 import matplotlib.pyplot as plt
@@ -37,10 +49,12 @@
 
 from pyebsdindex import radon_fast
 
-
-tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
-tempdir = tempdir.joinpath('numba')
-environ["NUMBA_CACHE_DIR"] = str(tempdir)
+tempdir = PurePath(Path.home())
+#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
+#tempdir = tempdir.joinpath('numbacache')
+tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache')
+Path(tempdir).mkdir(parents=True, exist_ok=True)
+os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep)
 
 RADEG = 180.0/np.pi
 
@@ -80,6 +94,7 @@ def __init__(
     self.EDAXIQ = False
     self.backgroundsub = None
     self.patternmask = None
+    self.useCPU = True
 
     self.dataType = np.dtype([('id', np.int32), ('max', np.float32), \
                     ('maxloc', np.float32, (2)), ('avemax', np.float32), ('aveloc', np.float32, (2)),\
diff --git a/pyebsdindex/band_vote.py b/pyebsdindex/band_vote.py
deleted file mode 100644
index 63d2210..0000000
--- a/pyebsdindex/band_vote.py
+++ /dev/null
@@ -1,944 +0,0 @@
-"""This software was developed by employees of the US Naval Research Laboratory (NRL), an
-agency of the Federal Government. Pursuant to title 17 section 105 of the United States
-Code, works of NRL employees are not subject to copyright protection, and this software
-is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
-responsibility whatsoever for its use by other parties, and makes no guarantees,
-expressed or implied, about its quality, reliability, or any other characteristic. We
-would appreciate acknowledgment if the software is used. To the extent that NRL may hold
-copyright in countries other than the United States, you are hereby granted the
-non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
-works and distribute this software, in any medium, or authorize others to do so on your
-behalf, on a royalty-free basis throughout the world. You may improve, modify, and
-create derivative works of the software or any portion of the software, and you may copy
-and distribute such modifications or works. Modified works should carry a notice stating
-that you changed the software and should note the date and nature of any such change.
-Please explicitly acknowledge the US Naval Research Laboratory as the original source.
-This software can be redistributed and/or modified freely provided that any derivative
-works bear some notice that they are derived from it, and any modified versions bear
-some notice that they have been modified.
-
-Author: David Rowenhorst;
-The US Naval Research Laboratory Date: 21 Aug 2020"""
-
-from os import environ
-from pathlib import PurePath
-import platform
-import tempfile
-from timeit import default_timer as timer
-
-import numba
-import numpy as np
-
-from pyebsdindex import rotlib
-
-
-RADEG = 180.0/np.pi
-
-tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
-tempdir = tempdir.joinpath('numba')
-environ["NUMBA_CACHE_DIR"] = str(tempdir)
-
-
-class BandVote:
-  def __init__(self, tripLib, angTol=3.0, high_fidelity=True):
-    self.tripLib = tripLib
-    self.phase_name = self.tripLib.phaseName
-    self.phase_sym = self.tripLib.pointgroup
-    self.lattice_param = self.tripLib.latticeParameter
-    self.angTol = angTol
-    self.n_band_early_exit = 8
-    self.high_fidelity = high_fidelity
-    # these lookup tables are used to order the index for the pole-family when
-    # sorting triplet angles from low to high.
-    LUTA = np.array([[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]],dtype=np.int64)
-    LUTB = np.array([[0,1,2],[1,0,2],[0,2,1],[2,0,1],[1,2,0],[2,1,0]],dtype=np.int64)
-
-    LUT = np.zeros((3,3,3,3),dtype=np.int64)
-    for i in range(6):
-      LUT[:,LUTA[i,0],LUTA[i,1],LUTA[i,2]] = LUTB[i,:]
-    self.LUT = np.asarray(LUT).copy()
-
-  def tripvote(self, band_norms, band_intensity = None, goNumba = True, verbose=0):
-    tic0 = timer()
-    nfam = self.tripLib.polefamilies.shape[0]
-    bandnorms = np.squeeze(band_norms)
-    n_bands = np.int64(bandnorms.size/3)
-    if band_intensity is None:
-      band_intensity = np.ones((n_bands))
-    tic = timer()
-    bandangs = np.abs(bandnorms.dot(bandnorms.T))
-    bandangs = np.clip(bandangs, -1.0, 1.0)
-    bandangs  = np.arccos(bandangs)*RADEG
-
-
-    if goNumba == True:
-      accumulator, bandFam, bandRank, band_cm = self.tripvote_numba(bandangs, self.LUT, self.angTol, self.tripLib.tripAngles, self.tripLib.tripID, nfam, n_bands)
-    else:
-      count = 0
-      accumulator = np.zeros((nfam,n_bands),dtype=np.int32)
-      for i in range(n_bands):
-        for j in range(i+1,n_bands):
-          for k in range(j+1, n_bands):
-            angtri = np.array([bandangs[i,j], bandangs[i,k], bandangs[j,k]])
-            srt = np.argsort(angtri)
-            srt2 = np.array(self.LUT[:, srt[0], srt[1], srt[2]])
-            unsrtFID = np.argsort(srt2)
-            angtriSRT = np.array(angtri[srt])
-            angTest = (np.abs(self.tripLib.tripAngles - angtriSRT)) <= self.angTol
-            angTest = np.all(angTest, axis = 1)
-            wh = angTest.nonzero()[0]
-
-            for q in wh:
-              f = self.tripLib.tripID[q,:]
-              f = f[unsrtFID]
-              accumulator[f, [i,j,k]] += 1
-
-
-      mxvote = np.amax(accumulator, axis = 0)
-      tvotes = np.sum(accumulator, axis = 0)
-      band_cm = np.zeros(n_bands)
-
-
-
-      for i in range(n_bands):
-        if tvotes[i] < 1:
-          band_cm[i] = 0.0
-        else:
-          srt = np.argsort(accumulator[:,i])
-          band_cm[i] = (accumulator[srt[-1], i] - accumulator[srt[-2], i])/tvotes[i]
-
-      bandFam = np.argmax(accumulator, axis=0)
-      bandRank = (n_bands - np.arange(n_bands))/n_bands * band_cm * mxvote
-    #print(np.sum(accumulator))
-    #print(accumulator)
-    #print(bandRank)
-    #print(tvotes, band_cm, mxvote)
-    #print('vote loops: ', timer() - tic)
-    if verbose > 2:
-      print('band Vote time:',timer() - tic)
-    tic = timer()
-
-    # avequat,fit,bandmatch,nMatch = self.band_vote_refine(bandnorms,bandRank,bandFam, angTol = self.angTol)
-    # if nMatch == 0:
-    #   srt = np.argsort(bandRank)
-    #   for i in range(np.min([5, n_bands])):
-    #     bandRank2 = bandRank
-    #     bandRank2[srt[i]] = -1.0
-    #     #avequat, fit, bandmatch, nMatch = self.band_vote_refine(bandnorms,bandRank2,bandFam,self.tripLib.completelib,self.angTol)
-    #     avequat,fit,bandmatch,nMatch = self.band_vote_refine(bandnorms,bandRank2,bandFam)
-    #     if nMatch > 2:
-    #       break
-    #print('refinement: ', timer() - tic)
-    #print('tripvote: ',timer() - tic0)
-    sumaccum = np.sum(accumulator)
-    bandRank_arg = np.argsort(bandRank).astype(np.int64)
-    test  = 0
-    fit = 1000.0
-    nMatch = -1
-    avequat = np.zeros(4, dtype=np.float32)
-    polematch = np.array([-1])
-    whGood = -1
-
-    angTable = self.tripLib.completelib['angTable']
-    sztable = angTable.shape
-    famIndx = self.tripLib.completelib['famIndex']
-    nFam = self.tripLib.completelib['nFamily']
-    polesCart = self.tripLib.completelib['polesCart']
-    angTol = self.angTol
-    n_band_early = np.int64(self.n_band_early_exit)
-
-    # this will check the vote, and return the exact band matching to specific poles of the best fitting solution.
-    fit, polematch, nMatch, whGood, ij, R, fitb = \
-      self.band_index_nb(polesCart, bandRank_arg, bandFam, famIndx, nFam, angTable, bandnorms, angTol, n_band_early)
-
-    if verbose > 2:
-      print('band index: ',timer() - tic)
-    tic = timer()
-
-    cm2 = 0.0
-    if nMatch >=2:
-      if self.high_fidelity == True:
-
-        srt = np.argsort(fitb[whGood])
-        whgood6 = whGood[srt[0:np.min([8, whGood.shape[0]])]]
-
-        weights6 = band_intensity[whgood6]
-        pflt6 = (np.asarray(polesCart[polematch[whgood6], :], dtype=np.float64))
-        bndnorm6 = (np.asarray(bandnorms[whgood6, :], dtype=np.float64))
-
-        avequat, fit = self.refine_orientation_quest(bndnorm6, pflt6 , weights=weights6)
-        fit = np.arccos(np.clip(fit, -1.0, 1.0))*RADEG
-        #avequat, fit = self.refine_orientation(bandnorms,whGood,polematch)
-      else:
-        avequat = rotlib.om2qu(R)
-      whmatch = np.nonzero(polematch >= 0)[0]
-      cm = np.mean(band_cm[whmatch])
-      whfam = self.tripLib.completelib['poleFamID'][polematch[whmatch]]
-      cm2 = np.sum(accumulator[[whfam], [whmatch]]).astype(np.float32)
-      cm2 /= np.sum(accumulator.clip(1))
-
-    if verbose > 2:
-      print('refinement: ', timer() - tic)
-      print('all: ',timer() - tic0)
-    return avequat, fit, cm2, polematch, nMatch, ij, sumaccum
-
-
-
-  def band_index(self,bandnorms,bnd1,bnd2,familyLabel,angTol=3.0, verbose = 0):
-
-    #nBands = np.int32(bandnorms.size/3)
-    angTable = self.tripLib.completelib['angTable']
-    sztable = angTable.shape
-    #whGood = -1
-    famIndx = self.tripLib.completelib['famIndex']
-    nFam = self.tripLib.completelib['nFamily']
-    poles = self.tripLib.completelib['polesCart']
-    #ang01 = 0.0
-    # need to check that the two selected bands are not parallel.
-    #v0 = bandnorms[bnd1, :]
-    #f0 = familyLabel[bnd1]
-    #v1 = bandnorms[bnd2, :]
-    #f1 = familyLabel[bnd2]
-    #ang01 = np.clip(np.dot(v0, v1), -1.0, 1.0)
-    #ang01 = np.arccos(ang01)*RADEG
-    #if ang01 < angTol: # the two poles are parallel, send in another two poles if available.
-    #  return 360.0, 0, whGood, -1
-
-    #wh01 = np.nonzero(np.abs(angTable[famIndx[f0], famIndx[f1]:np.int(famIndx[f1]+nFam[f1])] - ang01) < angTol)[0]
-
-    #n01 = wh01.size
-    #if  n01 == 0:
-    #  return 360.0, 0, whGood, -1
-
-    #wh01 += famIndx[f1]
-    #p0 = poles[famIndx[f0], :]
-    #print('pre first loop: ',timer() - tic)
-    #tic = timer()
-    # place numba code here ...
-
-
-
-    #fit, polematch, R, nGood, whGood = self.band_vote_refine_loops1(poles,v0,v1, p0, wh01, bandnorms, angTol)
-    fit,polematch,R,nGood,whGood = self.band_vote_refine_loops1(poles, bnd1, bnd2, familyLabel,  famIndx, nFam, angTable, bandnorms, angTol)
-
-    #print('numba first loops',timer() - tic)
-    #whGood = np.nonzero(angFit < angTol)[0]
-    #nGood = np.int64(whGood.size)
-    #if nGood < 3:
-    #  return 360.0, -1, -1, -1
-
-    #fit = np.mean(angFit[whGood])
-    #print('all bindexed time', timer()-tic0)
-    return fit, nGood, whGood, polematch
-
-  def refine_orientation(self,bandnorms, whGood, polematch):
-    tic = timer()
-    poles = self.tripLib.completelib['polesCart']
-    nGood = whGood.size
-    n2Fit = np.int64(np.product(np.arange(2)+(nGood-2+1))/np.int64(2))
-    whGood = np.asarray(whGood,dtype=np.int64)
-    #AB, ABgood = self.orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit)
-    # tic = timer()
-    # quats = rotlib.om2quL(AB[ABgood.nonzero()[0], :, :])
-    # print("om2qu", timer() - tic)
-    # tic = timer()
-    # avequat = rotlib.quatave(quats)
-
-    AB, weights = self.orientation_refine_loops_am(nGood, whGood, poles, bandnorms, polematch, n2Fit)
-
-    wh_weight = np.nonzero(weights < 359.0)[0]
-    quats = rotlib.om2quL(AB[wh_weight, :, :])
-
-    expw  = weights[wh_weight]
-
-    rng = expw.max()-expw.min()
-    #print(rng, len(wh_weight))
-    if rng > 1e-6:
-      expw -= expw.min()
-      #print(np.arccos(1.0 - expw)*RADEG)
-      expw = np.exp(-expw/(0.5*(rng)))
-      expw /= np.sum(expw)
-      #print(quats)
-      #print(expw)
-      #print(expw*len(wh_weight))
-      avequat = rotlib.quatave(quats * np.expand_dims(expw, axis=-1))
-      #print(avequat)
-    else:
-      avequat = rotlib.quatave(quats)
-
-
-    test = rotlib.quat_vectorL(avequat,bandnorms[whGood,:])
-
-    tic = timer()
-    test = np.sum(test * poles[polematch[whGood], :], axis = 1)
-    test = np.arccos(np.clip(test, -1.0, 1.0))*RADEG
-
-
-    fit = np.mean(test)
-
-    #print('fitting: ',timer() - tic)
-    return avequat, fit
-
-  def refine_orientation_quest(self, bandnorms, polecartmatch, weights = None):
-    tic = timer()
-
-
-    if weights is None:
-      weights = np.ones((bandnorms.shape[0]), dtype=np.float64)
-    weightsn = np.asarray(weights, dtype=np.float64)
-    weightsn /= np.sum(weightsn)
-    #print(weightsn)
-    pflt = np.asarray(polecartmatch, dtype=np.float64)
-    bndnorm = np.asarray(bandnorms, dtype=np.float64)
-
-    avequat, fit = self.orientation_quest_nb(pflt, bndnorm, weightsn)
-
-    return avequat, fit
-
-
-  @staticmethod
-  @numba.jit(nopython=True, cache=True,fastmath=True,parallel=False)
-  def tripvote_numba( bandangs, LUT, angTol, tripAngles, tripID, nfam, n_bands):
-    LUTTemp = np.asarray(LUT).copy()
-    accumulator = np.zeros((nfam, n_bands), dtype=np.int32)
-    tshape = np.shape(tripAngles)
-    ntrip = int(tshape[0])
-    count  = 0.0
-    #angTest2 = np.zeros(ntrip, dtype=numba.boolean)
-    #angTest2 = np.empty(ntrip,dtype=numba.boolean)
-    for i in range(n_bands):
-      for j in range(i + 1,n_bands):
-        for k in range(j + 1,n_bands):
-          angtri = np.array([bandangs[i,j],bandangs[i,k],bandangs[j,k]], dtype=numba.float32)
-          srt = angtri.argsort(kind='quicksort') #np.array(np.argsort(angtri), dtype=numba.int64)
-          srt2 = np.asarray(LUTTemp[:,srt[0],srt[1],srt[2]], dtype=numba.int64).copy()
-          unsrtFID = np.argsort(srt2,kind='quicksort').astype(np.int64)
-          angtriSRT = np.asarray(angtri[srt])
-          angTest = (np.abs(tripAngles - angtriSRT)) <= angTol
-
-          for q in range(ntrip):
-            angTest2 = (angTest[q,0] + angTest[q,1] + angTest[q,2]) == 3
-            if angTest2:
-              f = tripID[q,:]
-              f = f[unsrtFID]
-              accumulator[f[0],i] += 1
-              accumulator[f[1],j] += 1
-              accumulator[f[2],k] += 1
-              t1 = False
-              t2 = False
-              t3 = False
-              if np.abs(angtriSRT[0] - angtriSRT[1]) < angTol:
-                accumulator[f[0],i] += 1
-                accumulator[f[1],k] += 1
-                accumulator[f[2],j] += 1
-                t1 = True
-              if np.abs(angtriSRT[1] - angtriSRT[2]) < angTol:
-                accumulator[f[0],j] += 1
-                accumulator[f[1],i] += 1
-                accumulator[f[2],k] += 1
-                t2 = True
-              if np.abs(angtriSRT[2] - angtriSRT[0]) < angTol:
-                accumulator[f[0],k] += 1
-                accumulator[f[1],j] += 1
-                accumulator[f[2],i] += 1
-                t3 = True
-              if (t1 and t2 and t3):
-                accumulator[f[0],k] += 1
-                accumulator[f[1],i] += 1
-                accumulator[f[2],j] += 1
-
-    mxvote = np.zeros(n_bands, dtype=np.int32)
-    tvotes = np.zeros(n_bands, dtype=np.int32)
-    band_cm = np.zeros(n_bands, dtype=np.float32)
-    for q in range(n_bands):
-      mxvote[q] = np.amax(accumulator[:,q])
-      tvotes[q] = np.sum(accumulator[:,q])
-
-
-
-    for i in range(n_bands):
-      if tvotes[i] < 1:
-        band_cm[i] = 0.0
-      else:
-        srt = np.argsort(accumulator[:,i])
-        band_cm[i] = (accumulator[srt[-1],i] - accumulator[srt[-2],i]) / tvotes[i]
-
-    bandRank = np.zeros(n_bands, dtype=np.float32)
-    bandFam = np.zeros(n_bands, dtype=np.int32)
-    for q in range(n_bands):
-      bandFam[q] = np.argmax(accumulator[:,q])
-    bandRank = (n_bands - np.arange(n_bands)) / n_bands * band_cm * mxvote
-
-    return accumulator, bandFam, bandRank, band_cm
-
-  @staticmethod
-  @numba.jit(nopython=True, cache=True, fastmath=True,parallel=False)
-  def band_index_nb(polesCart, bandRank_arg, familyLabel, famIndx, nFam, angTable, bandnorms, angTol, n_band_early):
-    eps = np.float32(1.0e-12)
-    nBnds = bandnorms.shape[0]
-
-    whGood_out = np.zeros(nBnds, dtype=np.int64)-1
-
-
-    nMatch = np.int64(-1)
-    Rout = np.zeros((1,3,3), dtype=np.float32)
-    #Rout[0,0,0] = 1.0; Rout[0,1,1] = 1.0; Rout[0,2,2] = 1.0
-    polematch_out = np.zeros((nBnds),dtype=np.int64) - 1
-    pflt = np.asarray(polesCart, dtype=np.float32)
-    bndnorm = np.transpose(np.asarray(bandnorms, dtype=np.float32))
-
-    fit = np.float32(360.0)
-    fitout = np.float32(360.0)
-    fitbout = np.zeros((nBnds))
-    R = np.zeros((1, 3, 3), dtype=np.float32)
-    #fit = np.float32(360.0)
-    #whGood = np.zeros(nBnds, dtype=np.int64) - 1
-    nGood = np.int64(-1)
-    ij = (-1,-1)
-    for ii in range(nBnds-1):
-      for jj in range(ii+1,nBnds):
-
-        polematch = np.zeros((nBnds),dtype=np.int64) - 1
-
-        bnd1 = bandRank_arg[-1 - ii]
-        bnd2 = bandRank_arg[-1 - jj]
-
-        v0 = bandnorms[bnd1,:]
-        f0 = familyLabel[bnd1]
-        v1 = bandnorms[bnd2,:]
-        f1 = familyLabel[bnd2]
-        ang01 = np.dot(v0,v1)
-        if ang01 > np.float32(1.0):
-          ang01 = np.float32(1.0-eps)
-        if ang01 < np.float32(-1.0):
-          ang01 = np.float32(-1.0+eps)
-
-        paralleltest = np.arccos(np.fabs(ang01)) * RADEG
-        if paralleltest < angTol:  # the two poles are parallel, send in another two poles if available.
-          continue
-
-        ang01 = np.arccos(ang01) * RADEG
-
-        wh01 = np.nonzero(np.abs(angTable[famIndx[f0],famIndx[f1]:np.int64(famIndx[f1] + nFam[f1])] - ang01) < angTol)[0]
-
-        n01 = wh01.size
-        if n01 == 0:
-          continue
-
-        wh01 += famIndx[f1]
-        p0 = polesCart[famIndx[f0], :]
-
-        n01 = wh01.size
-        v0v1c = np.cross(v0,v1)
-        v0v1c /= np.linalg.norm(v0v1c)
-        # attempt to see which solution gives the best match to all the poles
-        # best is measured as the number of poles that are within tolerance,
-        # divided by the angular deviation.
-        # Use the TRIAD method for finding the rotation matrix
-
-        Rtry = np.zeros((n01,3,3), dtype = np.float32)
-
-        #score = np.zeros((n01), dtype = np.float32)
-        A = np.zeros((3,3), dtype = np.float32)
-        B = np.zeros((3,3), dtype = np.float32)
-        #AB = np.zeros((3,3),dtype=np.float32)
-        b2 = np.cross(v0,v0v1c)
-        B[0,:] = v0
-        B[1,:] = v0v1c
-        B[2,:] = b2
-        A[:,0] = p0
-        score = -1.0
-
-        for i in range(n01):
-          p1 = polesCart[wh01[i], :]
-          ntemp = np.linalg.norm(p1) + 1.0e-35
-          p1 = p1 / ntemp
-          p0p1c = np.cross(p0,p1)
-          ntemp = np.linalg.norm(p0p1c) + 1.0e-35
-          p0p1c = p0p1c / ntemp
-          A[:,1] = p0p1c
-          A[:,2] = np.cross(p0,p0p1c)
-          AB = A.dot(B)
-          Rtry[i,:,:] = AB
-
-          testp = (AB.dot(bndnorm))
-          test = pflt.dot(testp)
-          #print(test.shape)
-          angfitTry = np.zeros((nBnds), dtype = np.float32)
-          #angfitTry = np.max(test,axis=0)
-          for j in range(nBnds):
-            angfitTry[j] = np.max(test[:,j])
-            angfitTry[j] = -1.0 if angfitTry[j] < -1.0 else angfitTry[j]
-            angfitTry[j] =  1.0 if angfitTry[j] >  1.0 else angfitTry[j]
-
-          #angfitTry = np.clip(np.amax(test,axis=0),-1.0,1.0)
-
-          angfitTry = np.arccos(angfitTry) * RADEG
-          whMatch = np.nonzero(angfitTry < angTol)[0]
-          nmatch = whMatch.size
-          #scoreTry = np.float32(nmatch) * np.mean(np.abs(angTol - angfitTry[whMatch]))
-          scoreTry = np.float32(nmatch) /( np.mean(angfitTry[whMatch]) + 1e-6)
-          if scoreTry > score:
-            score = scoreTry
-            angFit = angfitTry
-            for j in range(nBnds):
-              polematch[j] = np.argmax(test[:,j]) * ( 2*np.int32(angfitTry[j] < angTol)-1)
-            R[0, :,:] = Rtry[i,:,:]
-
-
-        whGood = (np.nonzero(angFit < angTol)[0]).astype(np.int64)
-        nGood = max(np.int64(whGood.size), np.int64(0))
-
-        if nGood < 3:
-          continue
-          #return 360.0,-1,-1,-1
-          #whGood = -1*np.ones((1), dtype=np.int64)
-          #fit = np.float32(360.0)
-          #polematch[:] = -1
-          #nGood = np.int64(-1)
-        else:
-          fitb = angFit
-          #fit = np.mean(fitb[whGood])
-          fit = np.float32(0.0)
-          for q in range(nGood):
-            fit += np.float32(fitb[whGood[q]])
-          fit /= np.float32(nGood)
-
-
-        if nGood >= (n_band_early):
-          fitout = fit
-          fitbout = fitb
-          nMatch = nGood
-          whGood_out = whGood
-          polematch_out = polematch
-          Rout = R
-          ij  = (bnd1,bnd2)
-          break
-        else:
-          if nMatch < nGood:
-            fitout = np.float32(fit)
-            fitbout = fitb
-            nMatch = nGood
-            whGood_out = whGood
-            polematch_out = polematch
-            Rout = R
-            ij = (bnd1, bnd2)
-          elif nMatch == nGood:
-            if fitout > fit:
-              fitout = np.float32(fit)
-              fitbout = fitb
-              nMatch = nGood
-              whGood_out = whGood
-              polematch_out = polematch
-              Rout = R
-              ij = (bnd1, bnd2)
-      if nMatch >= (n_band_early):
-        break
-    #quatout = rotlib.om2quL(Rout)
-    return fitout, polematch_out,nMatch, whGood_out, ij, Rout, fitbout
-
-  @staticmethod
-  @numba.jit(nopython=True, cache=True, fastmath=True,parallel=False)
-  def orientation_refine_loops_triad(nGood, whGood, poles, bandnorms, polematch, n2Fit):
-    #uses the TRIAD method for getting rotation matrix from imperfect poles.
-    quats = np.zeros((n2Fit, 4), dtype=np.float32)
-    counter = 0
-    A = np.zeros((3, 3), dtype=np.float32)
-    B = np.zeros((3, 3), dtype=np.float32)
-    AB = np.zeros((n2Fit, 3, 3), dtype=np.float32)
-    whgood2 = np.zeros(n2Fit, dtype=np.int32)
-    for i in range(nGood):
-      v0 = bandnorms[whGood[i],:]
-      p0 = poles[polematch[whGood[i]],:]
-      A[:,0] = p0
-      B[0,:] = v0
-      for j in range(i + 1,nGood):
-        v1 = bandnorms[whGood[j],:]
-        p1 = poles[polematch[whGood[j]],:]
-        v0v1c = np.cross(v0,v1)
-        # v0v1c /= np.linalg.norm(v0v1c)+1.0e-35
-        # v0v1c = vectnorm(v0v1c) # faster to inline these functions
-        norm = numba.float32(0.0)
-        for ii in range(3):
-          norm += v0v1c[ii] * v0v1c[ii]
-        norm = np.sqrt(norm) + 1.0e-35
-        #print(norm)
-        if norm > (0.087): # these vectors are not parallel (greater than 5 degrees)
-          for ii in range(3):
-            v0v1c[ii] = v0v1c[ii] / norm
-
-          p0p1c = np.cross(p0,p1)
-          # p0p1c /= (np.linalg.norm(p0p1c))+1.0e-35
-          #p0p1c = vectnorm(p0p1c) # faster to inline these functions
-          norm = numba.float32(0.0)
-          for ii in range(3):
-            norm += p0p1c[ii] * p0p1c[ii]
-          norm = np.sqrt(norm) + 1.0e-35
-          for ii in range(3):
-            p0p1c[ii] = p0p1c[ii] / norm
-
-          A[:,1] = p0p1c
-          B[1,:] = v0v1c
-          A[:,2] = np.cross(p0,p0p1c)
-          B[2,:] = np.cross(v0,v0v1c)
-          AB[counter, :,:] = A.dot(B)
-          whgood2[counter] = 1
-          #AB = np.reshape(AB, (1,3,3))
-          #quats[counter,:] = rotlib.om2quL(AB)
-          counter += 1
-        else: # the two are parallel - throwout the result.
-          whgood2[counter] = 0
-          counter +=1
-    return AB, whgood2
-
-  @staticmethod
-  @numba.jit(nopython=True,cache=True,fastmath=True,parallel=False)
-  def orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit):
-    # this uses the method laid out by A. Morawiec 2020 Eq.4 for getting rotation matrix
-    # from imperfect poles
-    counter = 0
-
-    pflt = np.transpose(np.asarray(poles[polematch[whGood], :], dtype=np.float32))
-    bndnorm = np.transpose(np.asarray(bandnorms[whGood,:], dtype=np.float32))
-
-    A = np.zeros((3, 3), dtype=np.float32)
-    B = np.zeros((3, 3), dtype=np.float32)
-    AB = np.zeros((n2Fit, 3, 3),dtype=np.float32)
-    #whgood2 = np.zeros(n2Fit, dtype=np.int32)
-    whgood2 = np.zeros(n2Fit, dtype=np.float32)
-
-    for i in range(nGood):
-      v0 = bandnorms[whGood[i],:]
-      p0 = poles[polematch[whGood[i]],:]
-
-      for j in range(i + 1,nGood):
-        v1 = bandnorms[whGood[j],:]
-        p1 = poles[polematch[whGood[j]],:]
-        v0v1c = np.cross(v0,v1)
-        p0p1c = np.cross(p0,p1)
-        p0p1add = p0 + p1
-        v0v1add = v0 + v1
-        p0p1sub = p0 - p1
-        v0v1sub = v0 - v1
-
-        normPCross = numba.float32(0.0)
-        normVCross = numba.float32(0.0)
-        normPAdd = numba.float32(0.0)
-        normVAdd = numba.float32(0.0)
-        normPSub = numba.float32(0.0)
-        normVSub = numba.float32(0.0)
-
-        for ii in range(3):
-          normPCross += p0p1c[ii] * p0p1c[ii]
-          normVCross += v0v1c[ii] * v0v1c[ii]
-          normPAdd += p0p1add[ii] * p0p1add[ii]
-          normVAdd += v0v1add[ii] * v0v1add[ii]
-          normPSub += p0p1sub[ii] * p0p1sub[ii]
-          normVSub += v0v1sub[ii] * v0v1sub[ii]
-
-        normPCross = np.sqrt(normPCross) + 1.0e-35
-        normVCross = np.sqrt(normVCross) + 1.0e-35
-        normPAdd = np.sqrt(normPAdd) + 1.0e-35
-        normVAdd = np.sqrt(normVAdd) + 1.0e-35
-        normPSub = np.sqrt(normPSub) + 1.0e-35
-        normVSub = np.sqrt(normVSub) + 1.0e-35
-
-        # print(norm)
-        if normVCross > (0.087):  # these vectors are not parallel (greater than 5 degrees)
-          for ii in range(3):
-            v0v1c[ii] /= normVCross
-            p0p1c[ii] /= normPCross
-            v0v1add[ii] /= normVAdd
-            p0p1add[ii] /= normPAdd
-            v0v1sub[ii] /= normVSub
-            p0p1sub[ii] /= normPSub
-
-          A[:,0] = p0p1c
-          B[0,:] = v0v1c
-
-          A[:,1] = p0p1add
-          B[1,:] = v0v1add
-
-          A[:,2] = p0p1sub
-          B[2,:] = v0v1sub
-          R = A.dot(B)
-          AB[counter,:,:] = A.dot(B)
-
-          # test the fit of each candidate
-          testp = (R.dot(bndnorm))
-          #test = pflt.dot(testp)
-          test = np.sum(pflt*testp, axis=0)
-          #print(test.shape)
-          #angfitTry = np.zeros((nGood), dtype=np.float32)
-          # angfitTry = np.max(test,axis=0)
-          #for qq in range(nGood):
-          #  angfitTry[qq] = np.max(test[:, qq])
-          #  angfitTry[qq] = -1.0 if angfitTry[qq] < -1.0 else angfitTry[qq]
-          #  angfitTry[qq] = 1.0 if angfitTry[qq] > 1.0 else angfitTry[qq]
-          #angfitTry = np.mean(np.arccos(angfitTry) * RADEG)
-          #print(test)
-
-
-          #whgood2[counter] = 1
-          whgood2[counter] = 1.0 - np.mean(test)
-          #print(1.0 - np.mean(test))
-          counter += 1
-        else:  # the two are parallel - throwout the result.
-          whgood2[counter] = np.float32(360.0)
-          counter += 1
-    return AB,whgood2
-
-  @staticmethod
-  @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False)
-  def orientation_quest_nb(polescart, bandnorms, weights):
-    # this uses the Quaternion Estimator AKA quest algorithm.
-
-    pflt = np.asarray(polescart, dtype=np.float64)
-    bndnorm = np.asarray(bandnorms, dtype=np.float64)
-    npoles = pflt.shape[0]
-    wn = (np.asarray(weights, dtype=np.float64)).reshape(npoles, 1)
-
-    #wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood)  #(weights[whGood]).reshape(nGood,1)
-    wn /= np.sum(wn)
-
-    I = np.zeros((3,3), dtype=np.float64)
-    I[0,0] = 1.0 ; I[1,1] = 1.0 ; I[2,2] = 1.0
-    q = np.zeros((4), dtype=np.float64)
-
-    B = (wn * bndnorm).T @ pflt
-    S = B + B.T
-    z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis = 0), dtype=np.float64)
-    S2 = S @ S
-    det = np.linalg.det(S)
-    k = (S[1,1]*S[2,2] - S[1,2]*S[2,1]) + (S[0,0]*S[2,2] - S[0,2]*S[2,0]) + (S[0,0]*S[1,1] - S[1,0]*S[0,1])
-    sig = 0.5 * (S[0,0] + S[1,1] + S[2,2])
-    sig2 = sig * sig
-    d = z.T @ S2 @ z
-    c = det + (z.T @ S @ z)
-    b = sig2 + (z.T @ z)
-    a = sig2 -k
-
-    lam = 1.0
-    tol = 1.0e-6
-    iter = 0
-    dlam = 1e6
-    #for i in range(10):
-    while (dlam > tol) and (iter < 10):
-      lam0 = lam
-      lam = lam - (lam**4 - (a + b) * lam**2 - c * lam + (a * b + c * sig - d))/ (4 * lam**3 - 2 * (a + b) * lam - c)
-      dlam = np.fabs(lam0-lam)
-      iter += 1
-
-    beta = lam - sig
-    alpha = lam ** 2 - sig2 + k
-    gamma = (lam + sig) * alpha - det
-    X = np.asarray( (alpha * I + beta * S + S2), dtype=np.float64)  @ z
-    qn = np.float64(0.0)
-    qn += gamma ** 2 + X[0] **2 + X[1] **2 + X[2] **2
-    qn = np.sqrt(qn)
-    q[0] = gamma
-    q[1:4] = X[0:3]
-    q /= qn
-    if (np.sign(gamma) < 0):
-      q *= -1.0
-
-    #polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1)
-    #pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64)
-    return q, lam#, pdot
-
-  def pairVoteOrientation(self,bandnormsIN,goNumba=True):
-    tic0 = timer()
-    nfam = self.tripLib.polefamilies.shape[0]
-    bandnorms = np.squeeze(bandnormsIN)
-    n_bands = np.int64(bandnorms.size / 3)
-
-    bandangs = np.abs(bandnorms.dot(bandnorms.T))
-    bandangs = np.clip(bandangs,-1.0,1.0)
-    bandangs = np.arccos(bandangs) * RADEG
-
-    angTable = self.tripLib.completelib['angTable']
-    sztable = angTable.shape
-    whGood = -1
-    famIndx = self.tripLib.completelib['famIndex']
-    nFam = self.tripLib.completelib['nFamily']
-    poles = self.tripLib.completelib['polesCart']
-    angTableReduce = angTable[famIndx,:]
-    polesReduce = poles[famIndx,:]
-    qsym = self.tripLib.qsymops
-    tic = timer()
-
-
-    if goNumba == True:
-      solutions, nsolutions, solutionVotes, solSrt = self.pairVoteOrientationNumba(bandnorms,bandangs,qsym,angTableReduce,poles,polesReduce, self.angTol)
-    else:
-      solutions = np.empty((500, 24, 4), dtype=np.float32)
-      solutions[0,:,:] = rotlib.quat_multiply(qsym, np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32))
-      solutionVotes = np.zeros(500, dtype=np.int32)
-      nsolutions = 1
-      soltol = np.cos(5.0/RADEG/2.0)
-
-      A = np.zeros((3,3), dtype=np.float32)
-      B = np.zeros((3,3), dtype=np.float32) # used for TRIAD calculations
-
-      for i in range(n_bands):
-        for j in range(i + 1,n_bands):
-
-            angPair = bandangs[i,j]
-            if angPair > 10.0:
-              angTest = (np.abs(angTableReduce - angPair)) <= self.angTol
-              wh = angTest.nonzero()
-              if len(wh[0] > 0):
-
-                v0 = bandnorms[i,:]
-                v1 = bandnorms[j,:]
-                v0v1c = np.cross(v0,v1)
-                v0v1c /= np.linalg.norm(v0v1c)
-                B[0,:] = v0
-                B[1,:] = v0v1c
-                B[2,:] = np.cross(v0,v0v1c)
-                for k in range(len(wh[0])):
-
-                  p0 = polesReduce[wh[0][k],:]
-                  p1 = poles[wh[1][k],:]
-                  p0p1c = np.cross(p0,p1)
-                  p0p1c /= np.linalg.norm(v0v1c)
-                  A[:,0] = p0
-                  A[:,1] = p0p1c
-                  A[:,2] = np.cross(p0,p0p1c)
-                  AB = A.dot(B)
-
-                  qAB = rotlib.om2qu(AB)
-                  qABinv = rotlib.quatconj(qAB)
-                  #qABsym = rotlib.quat_multiply(qsym, qAB)
-
-                  solutionFound = False
-                  for q in range(nsolutions):
-                    #rotlib.quat_multiplyLNN(q1,q2,n,intype,p=P)
-
-                    soltest = np.max( np.abs( (rotlib.quat_multiply((solutions[q,:,:]), qABinv))[:,0] ))
-
-                    if soltest >= soltol:
-                      solutionVotes[q] += 1
-                      solutionFound = True
-                  if solutionFound == False:
-                    solutions[nsolutions, :, :] = rotlib.quat_multiply(qsym, qAB)
-                    solutionVotes[nsolutions] += 1
-                    nsolutions += 1
-
-      solSrt = np.argsort(solutionVotes)
-    #print(nsolutions, solutionVotes[solSrt[-10:]])
-    mxvote = np.max(solutionVotes)
-    #print(timer()-tic)
-    tic = timer()
-    if mxvote > 0:
-      whmxvotes = np.nonzero(solutionVotes == mxvote)
-      nmxvote = len(whmxvotes[0])
-      fit = np.zeros(nmxvote, dtype=np.float32)
-      avequat = np.zeros((nmxvote,4),dtype=np.float32)
-      nMatch = np.zeros((nmxvote),dtype=np.float32)
-      poleMatch = np.zeros((nmxvote, n_bands), dtype=np.int32) - 1
-      #cm = (mxvote-solutionVotes[solSrt[-nmxvote-1]])/np.sum(solutionVotes)
-      cm = mxvote/np.sum(solutionVotes)
-      for q in range(nmxvote):
-        testbands = rotlib.quat_vector(solutions[whmxvotes[0][q], 0, : ], bandnorms)
-        fittest = (testbands.dot(poles.T)).clip(-1.0, 1.0)
-        poleMatch1 = np.argmax(fittest, axis = 1)
-        fittemp = np.arccos(np.max(fittest,axis=1)) * RADEG
-
-        whGood = np.nonzero(fittemp < self.angTol)
-        nMatch[q] = len(whGood[0])
-        poleMatch[q,whGood[0]] = poleMatch1[whGood[0]]
-
-        avequat1, fit1 = self.refine_orientation(bandnorms,whGood[0],poleMatch1)
-
-        fit[q] = fit1
-        avequat[q,:] = avequat1
-
-      keep = np.argmax(nMatch)
-      #print(timer() - tic)
-      return avequat[keep,:],fit[keep],cm,poleMatch[keep,:],nMatch[keep],(0,0)
-
-    else: # no solutions
-      fit = 1000.0
-      nMatch = -1
-      avequat = np.zeros(4,dtype=np.float32)
-      polematch = np.array([-1])
-      whGood = -1
-      print(timer() - tic)
-      return avequat,fit,-1,polematch,nMatch,(0,0)
-
-
-  @staticmethod
-  @numba.jit(nopython=True,cache=True,fastmath=True,parallel=False)
-  def pairVoteOrientationNumba(bandnorms,bandangs, qsym, angTableReduce, poles, polesReduce, angTol):
-    n_bands = bandnorms.shape[0]
-    nsym = qsym.shape[0]
-    solutions = np.empty((500,24,4),dtype=np.float32)
-    solutions[0,:,:] = rotlib.quat_multiplyL(qsym,np.array([1.0,0.0,0.0,0.0],dtype=np.float32))
-    solutionVotes = np.zeros(500, dtype=np.int32)
-    nsolutions = 1
-    soltol = np.cos(5.0 / RADEG / 2.0)
-
-    A = np.empty((3,3),dtype=np.float32)
-    B = np.empty((3,3),dtype=np.float32)  # used for TRIAD calculations
-    AB = np.empty((1,3,3), dtype=np.float32)
-    qAB = np.empty((1,4), dtype=np.float32)
-    qABinv = np.empty((1,4),dtype=np.float32)
-    soltemp = np.empty((nsym,4), dtype=np.float32)
-
-    for i in range(n_bands):
-      for j in range(i + 1,n_bands):
-
-        angPair = bandangs[i,j]
-        if angPair > 10.0:
-          angTest = (np.abs(angTableReduce - angPair)) <= angTol
-          wh = angTest.nonzero()
-          if len(wh[0] > 0):
-
-            v0 = bandnorms[i,:]
-            v1 = bandnorms[j,:]
-            v0v1c = np.cross(v0,v1)
-            v0v1c /= np.linalg.norm(v0v1c)
-            B[0,:] = v0
-            B[1,:] = v0v1c
-            B[2,:] = np.cross(v0,v0v1c)
-            for k in range(len(wh[0])):
-
-              p0 = polesReduce[wh[0][k],:]
-              p1 = poles[wh[1][k],:]
-              p0p1c = np.cross(p0,p1)
-              p0p1c /= np.linalg.norm(v0v1c)
-              A[:,0] = p0
-              A[:,1] = p0p1c
-              A[:,2] = np.cross(p0,p0p1c)
-              AB[0,:,:] = A.dot(B)
-
-              qAB = rotlib.om2quL(AB)
-              qABinv = rotlib.quatconjL(qAB)
-              #print(qABinv.shape)
-              # qABsym = rotlib.quat_multiply(qsym, qAB)
-
-              solutionFound = False
-              for q in range(nsolutions):
-                # rotlib.quat_multiplyLNN(q1,q2,n,intype,p=P)
-
-                soltemp = np.copy(solutions[q,:,:])
-                #print(soltemp.shape)
-                soltemp =  rotlib.quat_multiplyL(soltemp,qABinv)
-
-                soltest = -1.0
-                for qq in range(nsym):
-                  if soltemp[qq,0] > soltest:
-                    soltest = soltemp[qq,0]
-
-                if soltest >= soltol:
-                  solutionVotes[q] += 1
-                  solutionFound = True
-              if solutionFound == False:
-                solutions[nsolutions,:,:] = rotlib.quat_multiplyL(qsym,qAB)
-                solutionVotes[nsolutions] += 1
-                nsolutions += 1
-
-    solSrt = np.argsort(solutionVotes)
-
-    return solutions, nsolutions, solutionVotes, solSrt
diff --git a/pyebsdindex/crystal_sym.py b/pyebsdindex/crystal_sym.py
index 64709ef..fa0bd79 100644
--- a/pyebsdindex/crystal_sym.py
+++ b/pyebsdindex/crystal_sym.py
@@ -381,7 +381,7 @@ def spacegroup2lauenumber(spacegroupid):
 
        Parameters
        ----------
-       sapcegroupid : int
+       spacegroupid : int
            number between 1 and 230
 
        Returns
diff --git a/pyebsdindex/crystallometry.py b/pyebsdindex/crystallometry.py
index 81660f8..fd61d79 100644
--- a/pyebsdindex/crystallometry.py
+++ b/pyebsdindex/crystallometry.py
@@ -1,28 +1,28 @@
-'''This software was developed by employees of the US Naval Research Laboratory (NRL), an
-agency of the Federal Government. Pursuant to title 17 section 105 of the United States
-Code, works of NRL employees are not subject to copyright protection, and this software
-is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
-responsibility whatsoever for its use by other parties, and makes no guarantees,
-expressed or implied, about its quality, reliability, or any other characteristic. We
-would appreciate acknowledgment if the software is used. To the extent that NRL may hold
-copyright in countries other than the United States, you are hereby granted the
-non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
-works and distribute this software, in any medium, or authorize others to do so on your
-behalf, on a royalty-free basis throughout the world. You may improve, modify, and
-create derivative works of the software or any portion of the software, and you may copy
-and distribute such modifications or works. Modified works should carry a notice stating
-that you changed the software and should note the date and nature of any such change.
-Please explicitly acknowledge the US Naval Research Laboratory as the original source.
-This software can be redistributed and/or modified freely provided that any derivative
-works bear some notice that they are derived from it, and any modified versions bear
-some notice that they have been modified.
-
-This borrows heavily from Marc De Graef's EMsoft package. Many functions here
-are Python translations of functions from it.
-
-
-Author: David Rowenhorst, Patrick Callahan;
-The US Naval Research Laboratory Date: 21 Aug 2020'''
+# This software was developed by employees of the US Naval Research Laboratory (NRL), an
+# agency of the Federal Government. Pursuant to title 17 section 105 of the United States
+# Code, works of NRL employees are not subject to copyright protection, and this software
+# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
+# responsibility whatsoever for its use by other parties, and makes no guarantees,
+# expressed or implied, about its quality, reliability, or any other characteristic. We
+# would appreciate acknowledgment if the software is used. To the extent that NRL may hold
+# copyright in countries other than the United States, you are hereby granted the
+# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
+# works and distribute this software, in any medium, or authorize others to do so on your
+# behalf, on a royalty-free basis throughout the world. You may improve, modify, and
+# create derivative works of the software or any portion of the software, and you may copy
+# and distribute such modifications or works. Modified works should carry a notice stating
+# that you changed the software and should note the date and nature of any such change.
+# Please explicitly acknowledge the US Naval Research Laboratory as the original source.
+# This software can be redistributed and/or modified freely provided that any derivative
+# works bear some notice that they are derived from it, and any modified versions bear
+# some notice that they have been modified.
+#
+# This borrows heavily from Marc De Graef's EMsoft package. Many functions here
+# are Python translations of functions from it.
+#
+#
+# Author: David Rowenhorst, Patrick Callahan;
+# The US Naval Research Laboratory Date: 22 May 2024
 
 
 import numpy as np
@@ -35,7 +35,7 @@
 
 
 class CrystalPlane:
-    def __init__(self,hkl = [0,0,0],centering = 'F'):
+    def __init__(self,hkl = (0,0,0),centering = 'F'):
         self.hkl = np.array(hkl)
         self.allowed = True
         self.lattice_centering = centering
diff --git a/pyebsdindex/misorientation.py b/pyebsdindex/misorientation.py
index 41f6a8e..cf4df04 100644
--- a/pyebsdindex/misorientation.py
+++ b/pyebsdindex/misorientation.py
@@ -1,50 +1,46 @@
-'''
-2022, David Rowenhorst/The US Naval Research Laboratory, Washington DC
-# Pursuant to title 17 section 105 of the United States Code, works of US Governement employees
-# are not not subject to copyright protection.
+# This software was developed by employees of the US Naval Research Laboratory (NRL), an
+# agency of the Federal Government. Pursuant to title 17 section 105 of the United States
+# Code, works of NRL employees are not subject to copyright protection, and this software
+# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
+# responsibility whatsoever for its use by other parties, and makes no guarantees,
+# expressed or implied, about its quality, reliability, or any other characteristic. We
+# would appreciate acknowledgment if the software is used. To the extent that NRL may hold
+# copyright in countries other than the United States, you are hereby granted the
+# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
+# works and distribute this software, in any medium, or authorize others to do so on your
+# behalf, on a royalty-free basis throughout the world. You may improve, modify, and
+# create derivative works of the software or any portion of the software, and you may copy
+# and distribute such modifications or works. Modified works should carry a notice stating
+# that you changed the software and should note the date and nature of any such change.
+# Please explicitly acknowledge the US Naval Research Laboratory as the original source.
+# This software can be redistributed and/or modified freely provided that any derivative
+# works bear some notice that they are derived from it, and any modified versions bear
+# some notice that they have been modified.
 #
-# Copyright (c) 2013-2014, Marc De Graef/Carnegie Mellon University
-# All rights reserved.
 #
-# Redistribution and use in source and binary forms, with or without modification, are
-# permitted provided that the following conditions are met:
+# Author: David Rowenhorst;
+# The US Naval Research Laboratory Date: 22 May 2024
 #
-#     - Redistributions of source code must retain the above copyright notice, this list
-#        of conditions and the following disclaimer.
-#     - Redistributions in binary form must reproduce the above copyright notice, this
-#        list of conditions and the following disclaimer in the documentation and/or
-#        other materials provided with the distribution.
-#     - Neither the names of Marc De Graef, Carnegie Mellon University nor the names
-#        of its contributors may be used to endorse or promote products derived from
-#        this software without specific prior written permission.
+# For further information see:
+# David J. Rowenhorst, Patrick G. Callahan, Håkon W. Ånes. Fast Radon transforms for
+# high-precision EBSD orientation determination using PyEBSDIndex.
+# Journal of Applied Crystallography, 57(1):3–19, 2024.
+# DOI: 10.1107/S1600576723010221
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-####################################################################################################
-
-
-#Author David Rowenhorst/The US Naval Research Laboratory, Washington DC
 #
-#
-'''
 
 import numpy as np
 import numba
-from os import environ
+import os
 import tempfile
-from pathlib import PurePath
+from pathlib import PurePath, Path
 import platform
-tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
-tempdir = tempdir.joinpath('numba')
-environ["NUMBA_CACHE_DIR"] = str(tempdir)
+tempdir = PurePath(Path.home())
+#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
+#tempdir = tempdir.joinpath('numbacache')
+tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache')
+Path(tempdir).mkdir(parents=True, exist_ok=True)
+os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep)
 
 import rotlib
 
diff --git a/pyebsdindex/nlpar.py b/pyebsdindex/nlpar.py
index bb82881..e1fc4cd 100644
--- a/pyebsdindex/nlpar.py
+++ b/pyebsdindex/nlpar.py
@@ -1,3 +1,4 @@
+
 # This software was developed by employees of the US Naval Research Laboratory (NRL), an
 # agency of the Federal Government. Pursuant to title 17 section 105 of the United States
 # Code, works of NRL employees are not subject to copyright protection, and this software
@@ -18,694 +19,28 @@
 # some notice that they have been modified.
 #
 # Author: David Rowenhorst;
-# The US Naval Research Laboratory Date: 21 Aug 2020
+# The US Naval Research Laboratory Date: 22 May 2024
 
 # For more info see
 # Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019.
 
-"""Non-local pattern averaging and re-indexing (NLPAR)."""
-
-from pathlib import Path
-from timeit import default_timer as timer
-
-import numba
-import numpy as np
-import scipy.optimize as opt
-
-from pyebsdindex import ebsd_pattern
-
-#from os import environ
-#environ["NUMBA_CACHE_DIR"] = str(tempdir)
-
-
-__all__ = [
-    "NLPAR",
-]
-
-
-class NLPAR:
-  def __init__(self, filename=None,  lam=0.7, searchradius=3,dthresh=0.0, nrows = None, ncols = None):
-    self.lam = lam
-    self.searchradius = searchradius
-    self.dthresh = dthresh
-    self.filepath = None
-    self.hdfdatapath = None
-    self.filepathout = None
-    self.hdfdatapathout = None
-    #self.patternfile = None
-    #self.patternfileout = None
-    self.setfile(filename)
-    self.mask = None
-    self.sigma = None
-    self.nrows = None
-    self.ncols = None
-    if nrows is not None:
-      self.nrows = nrows
-    if ncols is not None:
-      self.ncols = ncols
-
-
-  def setfile(self,filepath=None):
-    self.filepath = None
-    self.hdfdatapath = None
-    pathtemp = np.atleast_1d(filepath)
-    fpath = pathtemp[0]
-    hdf5path = None
-    if pathtemp.size > 1:
-      hdf5path = pathtemp[1]
-    if fpath is not None:
-      self.filepath = Path(fpath)
-      self.hdfdatapath = hdf5path
-
-  def setoutfile(self, patternfile, filepath=None):
-    """Set the output file.
-
-    Parameters
-    ----------
-    patternfile
-      Input pattern file object from ebsd_pattern.
-    filepath
-      String.
-
-    Notes
-    -----
-    In the future I want to be able to specify the HDF5 data path to
-    store the output data, but that is proving to be a bit of a mess.
-    For now, a copy of the original HDF5 is made, and the NLPAR patterns
-    will be overwritten on top of the originals.
-    """
-    self.filepathout = None
-    self.hdfdatapathout = None
-    pathtemp = np.atleast_1d(filepath)
-    fpath = pathtemp[0]
-    hdf5path = None
-    #if pathtemp.size > 1:
-    #  hdf5path = pathtemp[1]
-    #print(fpath, hdf5path)
-    if fpath is not None: # the user has set an output file path.
-      self.filepathout = Path(fpath).expanduser().resolve()
-      self.hdfdatapathout =  hdf5path
-      if patternfile.filetype != 'HDF5': #check that the input and output are not the same.
-        pathok = self.filepathout.exists()
-        if pathok:
-          pathok = not self.filepathout.samefile(patternfile.filepath)
-          if not pathok:
-            raise ValueError('Error: File input and output are exactly the same.')
-            return
-
-        patternfile.copy_file([self.filepathout,self.hdfdatapathout], empty_data=True)
-        return  # fpath and (maybe) hdf5 path were set manually.
-      else: # this is a hdf5 file
-        if self.hdfdatapathout is None:
-          patternfile.copy_file(self.filepathout, empty_data=True)
-          self.hdfdatapathout = patternfile.h5patdatpth
-          return
-        else:
-          patternfile.copy_file([self.filepathout, self.hdfdatapathout], empty_data=True)
-          return
-
-    if patternfile is not None: # the user has set no path.
-      hdf5path = None
-      
-      if patternfile.filetype in ['UP', 'EBSP']:
-        p = Path(patternfile.filepath)
-        appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius)
-        newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix))
-        patternfile.copy_file(newfilepath,empty_data=True)
-
-      if patternfile.filetype == 'HDF5':
-        hdf5path_tmp = str(patternfile.h5patdatpth).split('/')
-        if hdf5path_tmp[0] == '':
-          hdf5path_org =  hdf5path_tmp[1]
-        else:
-          hdf5path_org = hdf5path_tmp[0]
-        p = Path(patternfile.filepath)
-        appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius)
-        hdf5path = hdf5path_org+appnd
-        newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix))
-        #patternfile.copy_file([newfilepath, hdf5path_org], newh5path=hdf5path)
-        patternfile.copy_file([newfilepath], empty_data=True)
-        hdf5path = patternfile.h5patdatpth
-
-      self.filepathout = newfilepath
-      self.hdfdatapathout = hdf5path
-      return
-
-  def getinfileobj(self):
-    if self.filepath is not None:
-      fID = ebsd_pattern.get_pattern_file_obj([self.filepath, self.hdfdatapath])
-      if (fID.nRows is not None):
-        if (self.nrows is None):
-          self.nrows = fID.nRows
-        else:
-          fID.nRows = self.nrows
-
-      if (fID.nCols is not None):
-        if (self.ncols is None):
-          self.ncols = fID.nCols
-        else:
-          fID.nCols = self.ncols
-
-      return fID
-
-    else:
-      return None
-
-  def getoutfileobj(self):
-    if self.filepathout is not None:
-      fID = ebsd_pattern.get_pattern_file_obj([self.filepathout, self.hdfdatapathout])
-      if self.nrows is not None:
-        fID.nRows = self.nrows
-      else:
-        self.nrows = fID.nRows
-
-      if self.ncols is not None:
-        fID.nCols = self.ncols
-      else:
-        self.ncols = fID.nCols
-      return fID
-    else:
-      return None
-
-  def opt_lambda(self,chunksize=0,saturation_protect=True,automask=True, backsub = False,
-                 target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True):
-
-    target_weights = np.asarray(target_weights)
-
-    def loptfunc(lam,d2,tw,dthresh):
-      temp = (d2 > dthresh).choose(dthresh, d2)
-      dw = np.exp(-(temp) / lam ** 2)
-      w = np.sum(dw, axis=2) + 1e-12
-
-      metric = np.mean(np.abs(tw - 1.0 / w))
-      return metric
-
-
-    @numba.njit(fastmath=True, cache=True,parallel=True)
-    def d2norm(d2, n2, dij, sigma):
-      shp = d2.shape
-      s2 = sigma**2
-      for j in numba.prange(shp[0]):
-        for i in range(shp[1]):
-          for q in range(shp[2]):
-            if n2[j,i,q] > 0:
-              ii = dij[j,i,q,1]
-              jj = dij[j,i,q,0]
-              s2_12 = (s2[j,i]+s2[jj,ii])
-              d2[j,i,q] -= n2[j,i,q] * s2_12
-              d2[j,i,q] /= s2_12*np.sqrt(2.0*n2[j,i,q])
-
-    patternfile = self.getinfileobj()
-    patternfile.read_header()
-    nrows = np.uint64(self.nrows) #np.uint64(patternfile.nRows)
-    ncols = np.uint64(self.ncols) #np.uint64(patternfile.nCols)
-
-    pwidth = np.uint64(patternfile.patternW)
-    pheight = np.uint64(patternfile.patternH)
-    phw = pheight * pwidth
-
-    nn = 1
-    if chunksize <= 0:
-      chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 4GB
-      chunksize = np.minimum(nrows,chunksize)
-      print("Chunk size set to nrows:", int(chunksize))
-    chunksize = np.int64(chunksize)
-
-    nn = np.uint64(nn)
-
-    if (automask is True) and (self.mask is None):
-      self.mask = (self.automask(pheight,pwidth))
-    if self.mask is None:
-      self.mask = np.ones((pheight,pwidth),dtype=np.uint8)
-
-    self.mask = (self.mask).astype(np.uint8)
-    indices = np.asarray((self.mask.flatten().nonzero())[0],np.uint64)
-
-    sigma = np.zeros((nrows,ncols),dtype=np.float32)+1e24
-    colstartcount = np.asarray([0,ncols],dtype=np.int64)
-
-
-    dthresh = np.float32(dthresh)
-    lamopt_values = []
-    
-    for j in range(0,nrows,chunksize):
-      print('Block',j)
-      #rowstartread = np.int64(max(0,j - nn))
-      rowstartread = np.int64(j)
-      rowend = min(j + chunksize + nn,nrows)
-      rowcountread = np.int64(rowend - rowstartread)
-      data, xyloc = patternfile.read_data(patStartCount=[[0,rowstartread],[ncols,rowcountread]],
-                                        convertToFloat=True,returnArrayOnly=True)
-
-      shp = data.shape
-
-      if backsub is True:
-        data = self.backsub(data)
-        #back = np.mean(data, axis=0)
-        #back -= np.mean(back)
-        #data -= back
-      data = data.reshape(shp[0], phw)
-
-      rowstartcount = np.asarray([0,rowcountread],dtype=np.int64)
-      sigchunk, (d2,n2, dij) = self.sigma_numba(data,nn,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect)
-      tmp = (sigma[j:j + rowstartcount[1],:] < sigchunk).choose( sigchunk, sigma[j:j + rowstartcount[1],:])
-      sigma[j:j + rowstartcount[1],:] = tmp
-
-
-      d2norm(d2, n2, dij, sigchunk)
-      lamopt_values_chnk = []
-      for tw in target_weights:
-        lam = 1.0
-        lambopt1 = opt.minimize(loptfunc,lam,args=(d2,tw,dthresh),method='Nelder-Mead',
-                                bounds = [[0.001, 10.0]],options={'fatol': 0.0001})
-        lamopt_values_chnk.append(lambopt1['x'])
-
-
-      lamopt_values.append(lamopt_values_chnk)
-    lamopt_values = np.asarray(lamopt_values)
-    print("Range of lambda values: ", np.mean(lamopt_values, axis = 0).flatten())
-    print("Optimal Choice: ", np.median(np.mean(lamopt_values, axis = 0)))
-    if autoupdate == True:
-      self.lam = np.median(np.mean(lamopt_values, axis = 0))
-    if self.sigma is None:
-      self.sigma = sigma
-    return np.mean(lamopt_values, axis = 0).flatten()
-
-  def calcnlpar(self, chunksize=0, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True,
-                filename=None, fileout=None, reset_sigma=True, backsub = False, rescale = False):
-
-    if lam is not None:
-      self.lam = lam
-
-    if dthresh is not None:
-      self.dthresh = dthresh
-
-    if searchradius is not None:
-      self.searchradius = searchradius
-
-    lam = np.float32(self.lam)
-    dthresh = np.float32(self.dthresh)
-    sr = np.int64(self.searchradius)
-
-    if filename is not None:
-      self.setfile(filepath=filename)
-
-    if reset_sigma:
-      self.sigma = None
-
-    patternfile = self.getinfileobj()
-
-    #if filepathout is not None:
-    self.setoutfile(patternfile, filepath=fileout)
-
-    patternfileout = self.getoutfileobj()
-
-
-    nrows = np.int64(self.nrows)#np.int64(patternfile.nRows)
-    ncols = np.int64(self.ncols)#np.int64(patternfile.nCols)
-    if patternfileout.nCols is None:
-      patternfileout.nCols = ncols
-    if patternfileout.nRows is None:
-      patternfileout.nRows = nrows
-
-
-    pwidth = np.int64(patternfile.patternW)
-    pheight = np.int64(patternfile.patternH)
-    phw = pheight*pwidth
-
-    if chunksize <= 0:
-      chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 8GB
-      chunksize = np.minimum(nrows, chunksize)
-      print("Chunk size set to nrows:", int(chunksize))
-    chunksize = np.int64(chunksize)
 
 
 
-    if (automask is True) and (self.mask is None):
-      self.mask = (self.automask(pheight,pwidth))
-    if self.mask is None:
-      self.mask = np.ones((pheight,pwidth), dtype=np.uint8)
-
-    indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64)
-    calcsigma = False
-    if self.sigma is None:
-      calcsigma = True
-      self.sigma = np.zeros((nrows, ncols), dtype=np.float32)+1e24
-
-
-    if np.asarray(self.sigma).size == 1:
-      tmp = np.asarray(self.sigma)[0]
-      self.sigma =  np.zeros((nrows, ncols), dtype=np.float32)+tmp
-      calcsigma = False
-
-    shpsigma = np.asarray(self.sigma).shape
-    if (shpsigma[0] != nrows) and (shpsigma[1] != ncols):
-      self.sigma = np.zeros((nrows,ncols),dtype=np.float32) + 1e24
-      calcsigma = True
-
-
-    sigma = np.asarray(self.sigma)
-    scalemethod = 'clip'
-    if rescale == True:
-      if np.issubdtype(patternfileout.filedatatype, np.integer):
-        mxval = np.iinfo(patternfileout.filedatatype).max
-        scalemethod = 'fullscale'
-      else: # not int, so no rescale.
-        rescale = False
-
-    nthreadpos = numba.get_num_threads()
-    #numba.set_num_threads(36)
-    colstartcount = np.asarray([0,ncols],dtype=np.int64)
-    print(lam, sr, dthresh)
-
-    for j in range(0,nrows,chunksize):
-      print('Row start', j)
-
-      rowstartread = np.int64(max(0, j-sr))
-      rowend = min(j + chunksize+sr,nrows)
-
-      if (rowend - rowstartread) < (2*sr+1):
-        rowstartread = np.int64(max(0, rowend - (2*sr+1)))
-      rowcountread = np.int64(rowend-rowstartread)
-      data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]],
-                                        convertToFloat=True,returnArrayOnly=True)
-
-      shpdata = data.shape
-
-      if backsub is True:
-        data = self.backsub(data)
-
-
-      data = data.reshape(shpdata[0], phw)
-
-      rowstartcount = np.asarray([0,rowcountread],dtype=np.int64)
-      if calcsigma is True:
-        sigchunk, tmp = self.sigma_numba(data,1,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect)
-        del tmp
-        tmp = (sigma[rowstartread:rowend,:] < sigchunk).choose(sigchunk,sigma[rowstartread:rowend,:])
-        sigma[rowstartread:rowend,:] = tmp
-      else:
-        sigchunk = sigma[rowstartread:rowend,:]
-
-      #dataout = data
-
-      dataout = self.nlpar_nb(data,lam, sr, dthresh, sigchunk,
-                              rowcountread,ncols,indices,saturation_protect)
-
-      dataout = dataout.reshape(rowcountread, ncols, phw)
-      dataout = dataout[j-rowstartread:, :, : ]
-      shpout = dataout.shape
-      dataout = dataout.reshape(shpout[0]*shpout[1], pheight, pwidth)
-      if rescale == True:
-        for i in range(dataout.shape[0]):
-          temp = dataout[i,:,:]
-          temp -= temp.min()
-          temp *= np.float32(mxval)/temp.max()
-          dataout[i,:,:] = temp
-
-      patternfileout.write_data(newpatterns=dataout,patStartCount = [[0,j], [ncols, shpout[0]]],
-                                     flt2int='clip',scalevalue=1.0 )
-      #self.patternfileout.write_data(newpatterns=dataout,patStartCount=[j*ncols,shpout[0]*shpout[1]],
-      #                               flt2int='clip',scalevalue=1.0 )
-      #return dataout
-      #sigma[j:j+rowstartcount[1],:] += \
-      #  sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:]
-
-    numba.set_num_threads(nthreadpos)
-    return str(patternfileout.filepath)
-
-  def calcsigma(self,chunksize=0,nn=1,saturation_protect=True,automask=True):
-
-    patternfile = self.getinfileobj()
-
-
-    nrows = np.int64(self.nrows)#np.uint64(patternfile.nRows)
-    ncols = np.int64(self.ncols)#np.uint64(patternfile.nCols)
-
-    pwidth = np.uint64(patternfile.patternW)
-    pheight = np.uint64(patternfile.patternH)
-    phw = pheight*pwidth
-
-    if chunksize <= 0:
-      chunksize = np.round(2e9/phw/ncols) # keeps the chunk at about 8GB
-      chunksize = np.minimum(nrows,chunksize)
-      print("Chunk size set to nrows:", int(chunksize))
-    chunksize = np.int64(chunksize)
-
-
-    nn = np.uint64(nn)
-
-    if (automask is True) and (self.mask is None):
-      self.mask = (self.automask(pheight,pwidth))
-    if self.mask is None:
-      self.mask = np.ones((pheight,pwidth), dytype=np.uint8)
-
-    indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64)
-
-    sigma = np.zeros((nrows, ncols), dtype=np.float32)
-    #d_nn = np.zeros((nrows, ncols, int((2*nn+1)**2)), dtype=np.float32)
-    colstartcount = np.asarray([0,ncols],dtype=np.int64)
-
-    for j in range(0,nrows,chunksize):
-      rowstartread = np.int64(max(0, j-nn))
-      rowend = min(j + chunksize+nn,nrows)
-      if (rowend - rowstartread) < (3):
-        rowstartread = np.int64(max(0, rowend - (3)))
-      rowcountread = np.int64(rowend-rowstartread)
-      data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]],
-                                        convertToFloat=True,returnArrayOnly=True)
-
-      shp = data.shape
-      data = data.reshape(data.shape[0], phw)
-
-      #data = None
-      if rowend == nrows:
-        rowstartcount = np.asarray([j-rowstartread,rowcountread - (j-rowstartread) ], dtype=np.int64)
-      else:
-        rowstartcount = np.asarray([j-rowstartread,chunksize ], dtype=np.int64)
-
-      sigchunk, temp = self.sigma_numba(data,nn, rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect)
-
-      sigma[j:j+rowstartcount[1],:] += \
-        sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:]
-
-    return sigma
-
-  def backsub(self, data):
-    # This function will fit a 2D gaussian on top of a plane to the averaged set of patterns (data) that is provided.
-    # It will automatically use whatever mask is defined for valid data.
-    # If the gaussian fit fails to converge, it will fall back to just using the mean set of patterns for the background
-    # with a warning.
-
-
-    def gaussian_surf(x, y, a, x0, y0, sigx, sigy, c, d, e):
-    # equation for 2D gaussian on top of a plane.
-      return a * np.exp(- ((x - x0) ** 2) / (2.0 * sigx ** 2) - ((y - y0) ** 2) / (2.0 * sigy ** 2)) + c + d * x + e * y
-
-    def fit_gauss(M, *args):
-    # helper function
-      x, y = M
-      #arr = np.zeros(x.shape)
-      return gaussian_surf(x, y, *args)
-
-    back = np.mean(data, axis=0) # start with the mean of all the data
-    # now fit a 2D gaussian sitting on a plane.  See fuction def above.
-    nx = data.shape[-1]
-    ny = data.shape[-2]
-    x = np.arange(nx, dtype=float)
-    x = (np.broadcast_to(x.reshape(1,nx), (ny, nx)))
-    y = np.arange(ny, dtype=float)
-    y = (np.broadcast_to(y, (nx, ny)).T)
-    x = x.ravel()
-    y = y.ravel()
-
-    # need to grab only the values that are in the mask.
-    wh = np.nonzero(self.mask.ravel())[0]
-    xwh = x[wh]
-    ywh = y[wh]
-    xywh = np.vstack((xwh, ywh))
-    zwh = (back.ravel())[wh]
-    whmx = np.unravel_index(back.argmax(), back.shape)
-    minz = zwh.min()
-    # initialize a guess for the parameters.
-    # [gauss amplitude, max loc x, max loc y, sigx, sigy, const offset, slope x, slope y]
-    p0 = [(zwh.max() - zwh.min()), whmx[1], whmx[0], nx/2.355, ny/2.355, minz, 0, 0]
-    try:
-      popt, pcov = opt.curve_fit(fit_gauss, xywh, zwh, p0)
-      backfit = (gaussian_surf(x, y, *popt)).reshape(ny, nx)
-      #print(p0, popt)
-    except RuntimeError:
-      print('Warning: no convergence on back subtract ... using mean of the patterns.')
-      print('This may not be ideal for scans with few grains across the width of the scan.')
-      backfit = back
-    backfit -= np.mean(backfit)
-    #f, axarr = plt.subplots(1, 3)
-    #f.set_size_inches(10, 4)
-    #axarr[0].imshow(data[0,:,:].squeeze(), cmap='gray')
-    #axarr[1].imshow(data[0,:,:].squeeze() - backfit, cmap='gray')
-    #axarr[2].imshow(backfit, cmap='gray')
-
-    data -= backfit
-    return data
-
-  @staticmethod
-  def automask( h,w ):
-    r = (min(h,w)*0.98*0.5)
-    x = np.arange(w, dtype=np.float32)
-    x = np.minimum(x , (w-x))
-    x = x.reshape(1,w)
-    y = np.arange(h, dtype=np.float32)
-    y = np.minimum(y , (h - y))
-    y = y.reshape(h,1)
-
-    mask =np.sqrt(y**2 + x**2)
-    mask = (mask < r).astype(np.uint8)
-    mask = np.roll(mask, (int(h/2), int(w/2) ), axis=(0,1))
-
-    return mask
-
-  @staticmethod
-  @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True)
-  def sigma_numba(data, nn, nrows, ncols, rowstartcount, colstartcount, indices, saturation_protect=True):
-    sigma = np.zeros((nrows,ncols), dtype = np.float32)
-    dout = np.zeros((nrows,ncols,(nn*2+1)**2), dtype=np.float32)
-    nout = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2),dtype=np.float32)
-    dij = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2, 2),dtype=np.uint64)
-    shpdata = data.shape
-
-    n0 = np.float32(shpdata[-1])
-    shpind = indices.shape
-    mxval = np.max(data)
-    if saturation_protect == False:
-      mxval += 1.0
-    else:
-      mxval *= 0.9961
-
-    for j in numba.prange(rowstartcount[0], rowstartcount[0]+rowstartcount[1]):
-      nn_r_start = j - nn if (j - nn) >= 0 else 0
-      nn_r_end = (j + nn if (j + nn) < nrows else nrows-1)+1
-      for i in numba.prange(colstartcount[0], colstartcount[0]+colstartcount[1]):
-        nn_c_start = i - nn if (i - nn) >= 0 else 0
-        nn_c_end = (i + nn if (i + nn) < ncols else ncols - 1) + 1
-
-        mind = np.float32(1e24)
-        indx_0 = i+ncols*j
-        count = 0
-        for j_nn in range(nn_r_start,nn_r_end ):
-          for i_nn in range(nn_c_start,nn_c_end):
-            dij[j,i,count,0] = np.uint64(j_nn)
-            dij[j,i,count,1] = np.uint64(i_nn) # want to save this for labmda optimization
-            indx_nn = i_nn+ncols*j_nn
-            d2 = np.float32(0.0)
-            n2 = np.float32(1.0e-12)
-            nout[j,i,count] = n0 # want to save this for labmda optimization
-            if not((i == i_nn) and (j == j_nn)):
-              for q in range(shpind[0]):
-                d0 = data[indx_0, indices[q]]
-                d1 = data[indx_nn, indices[q]]
-                if (d1 < mxval) and (d0 < mxval):
-                  n2 += 1.0
-                  d2 += (d0 - d1)**2
-              nout[j,i,count] = n2
-
-              if d2 >= 1.e-3: #sometimes EDAX collects the same pattern twice
-                s0 = d2 / np.float32(n2 * 2.0)
-                if s0 < mind:
-                  mind = s0
-            dout[j,i,count] = d2 # want to save this for labmda optimization
-
-            count += 1
-
-        sigma[j,i] = np.sqrt(mind)
-        #if sigma[j,i] > 1e12:
-        #  print(sigma[j,i], dout[j,i,:], nout[i,j,:])
-    return sigma,( dout, nout, dij)
-
-  @staticmethod
-  @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True)
-  def nlpar_nb(data,lam, sr, dthresh, sigma, nrows,ncols,indices,saturation_protect=True):
-    def getpairid(idx0, idx1):
-      idx0_t = int(idx0)
-      idx1_t = int(idx1)
-      if idx0 < idx1:
-        pairid = idx0_t + (idx1_t << 32)
-      else:
-        pairid = idx1_t + (idx0_t << 32)
-      return numba.uint64(pairid)
-
-    lam2 = 1.0 / lam**2
-    dataout = np.zeros_like(data, np.float32)
-    shpdata = data.shape
-    shpind = indices.shape
-    winsz = np.int32((2*sr+1)**2)
-
-
-    mxval = np.max(data)
-    if saturation_protect == False:
-      mxval += np.float32(1.0)
-    else:
-      mxval *= np.float32(0.999)
-    for i in numba.prange(ncols):
-      winstart_x = max((i - sr),0) - max((i + sr - (ncols - 1)),0)
-      winend_x = min((i + sr),(ncols - 1)) + max((sr - i),0) + 1
-      pairdict = numba.typed.Dict.empty(key_type=numba.core.types.uint64,value_type=numba.core.types.float32)
-      for j in range(nrows):
-        winstart_y = max((j - sr),0) - max((j + sr - (nrows - 1)),0)
-        winend_y = min((j + sr),(nrows - 1)) + max((sr - j),0) + 1
-
-        weights = np.zeros(winsz,dtype=np.float32)
-        pindx = np.zeros(winsz,dtype=np.uint64)
-
-        indx_0 = i + ncols * j
-        sigma0 = sigma[j,i]**2
-
-        counter = 0
-        for j_nn in range(winstart_y,winend_y):
-          for i_nn in range(winstart_x,winend_x):
-            indx_nn = i_nn + ncols * j_nn
-            pindx[counter] = indx_nn
+"""Non-local pattern averaging and re-indexing (NLPAR)."""
 
-            if indx_nn == indx_0:
-              weights[counter] = np.float32(-1.0e6)
-            else:
-              pairid = getpairid(indx_0, indx_nn)
-              if pairid in pairdict:
-                weights[counter] = pairdict[pairid]
-              else:
-                sigma1 = sigma[j_nn,i_nn]**2
-                d2 = np.float32(0.0)
-                n2 = np.float32(0.0)
-                for q in range(shpind[0]):
-                  d0 = data[indx_0,indices[q]]
-                  d1 = data[indx_nn,indices[q]]
-                  if (d1 < mxval) and (d0 < mxval):
-                    n2 += np.float32(1.0)
-                    d2 += (d0 - d1) ** np.int32(2)
-                d2 -= n2*(sigma0+sigma1)
-                dnorm = (sigma1 + sigma0)*np.sqrt(np.float32(2.0)*n2)
-                if dnorm > np.float32(1.e-8):
-                  d2 /= dnorm
-                else:
-                  d2 = np.float32(1e6)*n2
-                weights[counter] = d2
-                pairdict[pairid] = numba.float32(d2)
-            counter += 1
-        #print('________________')
-        # end of window scanning
-        sum = np.float32(0.0)
-        for i_nn in range(winsz):
 
-          weights[i_nn] = np.maximum(weights[i_nn]-dthresh, numba.float32(0.0))
-          weights[i_nn] = np.exp(-1.0 * weights[i_nn] * lam2)
-          sum += weights[i_nn]
+from pyebsdindex import _ray_installed
+from pyebsdindex import _pyopencl_installed
 
-        for i_nn in range(winsz):
-          indx_nn = pindx[i_nn]
-          weights[i_nn] /= sum
-          #print(weights[i_nn], ' \n')
-          for q in range(shpdata[1]):
-            dataout[indx_0, q] += data[indx_nn, q]*weights[i_nn]
-        #print('_______', '\n')
-    return dataout
 
+if _ray_installed and _pyopencl_installed:
+  from pyebsdindex.opencl.nlpar_clray import NLPAR
+elif _pyopencl_installed and not _ray_installed:
+  from pyebsdindex.opencl.nlpar_cl import NLPAR
+else:
+  from pyebsdindex.nlpar_cpu import NLPAR
 
+__all__ = [
+    "NLPAR",
+]
\ No newline at end of file
diff --git a/pyebsdindex/nlpar_cpu.py b/pyebsdindex/nlpar_cpu.py
new file mode 100644
index 0000000..129968a
--- /dev/null
+++ b/pyebsdindex/nlpar_cpu.py
@@ -0,0 +1,794 @@
+# This software was developed by employees of the US Naval Research Laboratory (NRL), an
+# agency of the Federal Government. Pursuant to title 17 section 105 of the United States
+# Code, works of NRL employees are not subject to copyright protection, and this software
+# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
+# responsibility whatsoever for its use by other parties, and makes no guarantees,
+# expressed or implied, about its quality, reliability, or any other characteristic. We
+# would appreciate acknowledgment if the software is used. To the extent that NRL may hold
+# copyright in countries other than the United States, you are hereby granted the
+# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
+# works and distribute this software, in any medium, or authorize others to do so on your
+# behalf, on a royalty-free basis throughout the world. You may improve, modify, and
+# create derivative works of the software or any portion of the software, and you may copy
+# and distribute such modifications or works. Modified works should carry a notice stating
+# that you changed the software and should note the date and nature of any such change.
+# Please explicitly acknowledge the US Naval Research Laboratory as the original source.
+# This software can be redistributed and/or modified freely provided that any derivative
+# works bear some notice that they are derived from it, and any modified versions bear
+# some notice that they have been modified.
+#
+# Author: David Rowenhorst;
+# The US Naval Research Laboratory Date: 22 May 2024
+
+# For more info see
+# Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019.
+
+
+
+from pathlib import Path
+from timeit import default_timer as timer
+
+import numba
+import numpy as np
+import scipy.optimize as opt
+
+from pyebsdindex import ebsd_pattern
+
+#from os import environ
+#environ["NUMBA_CACHE_DIR"] = str(tempdir)
+
+
+__all__ = [
+    "NLPAR",
+]
+
+
+class NLPAR:
+  def __init__(self, filename=None,  lam=0.7, searchradius=3,dthresh=0.0, nrows = None, ncols = None, **kwargs):
+    self.lam = lam
+    self.searchradius = searchradius
+    self.dthresh = dthresh
+    self.filepath = None
+    self.hdfdatapath = None
+    self.filepathout = None
+    self.hdfdatapathout = None
+    #self.patternfile = None
+    #self.patternfileout = None
+    self.setfile(filename)
+    self.mask = None
+    self.sigma = None
+    self.sigmann = 1
+    self.nrows = None
+    self.ncols = None
+    if nrows is not None:
+      self.nrows = nrows
+    if ncols is not None:
+      self.ncols = ncols
+
+
+  def setfile(self,filepath=None):
+    self.filepath = None
+    self.hdfdatapath = None
+    pathtemp = np.atleast_1d(filepath)
+    fpath = pathtemp[0]
+    hdf5path = None
+    if pathtemp.size > 1:
+      hdf5path = pathtemp[1]
+    if fpath is not None:
+      self.filepath = Path(fpath)
+      self.hdfdatapath = hdf5path
+
+  def setoutfile(self, patternfile, filepath=None):
+    """Set the output file.
+
+    Parameters
+    ----------
+    patternfile
+      Input pattern file object from ebsd_pattern.
+    filepath
+      String.
+
+    Notes
+    -----
+    In the future I want to be able to specify the HDF5 data path to
+    store the output data, but that is proving to be a bit of a mess.
+    For now, a copy of the original HDF5 is made, and the NLPAR patterns
+    will be overwritten on top of the originals.
+    """
+    self.filepathout = None
+    self.hdfdatapathout = None
+    pathtemp = np.atleast_1d(filepath)
+    fpath = pathtemp[0]
+    hdf5path = None
+    #if pathtemp.size > 1:
+    #  hdf5path = pathtemp[1]
+    #print(fpath, hdf5path)
+    if fpath is not None: # the user has set an output file path.
+      self.filepathout = Path(fpath).expanduser().resolve()
+      self.hdfdatapathout =  hdf5path
+      if patternfile.filetype != 'HDF5': #check that the input and output are not the same.
+        pathok = self.filepathout.exists()
+        if pathok:
+          pathok = not self.filepathout.samefile(patternfile.filepath)
+          if not pathok:
+            raise ValueError('Error: File input and output are exactly the same.')
+            return
+
+        patternfile.copy_file([self.filepathout,self.hdfdatapathout], empty_data=True)
+        return  # fpath and (maybe) hdf5 path were set manually.
+      else: # this is a hdf5 file
+        if self.hdfdatapathout is None:
+          patternfile.copy_file(self.filepathout, empty_data=True)
+          self.hdfdatapathout = patternfile.h5patdatpth
+          return
+        else:
+          patternfile.copy_file([self.filepathout, self.hdfdatapathout], empty_data=True)
+          return
+
+    if patternfile is not None: # the user has set no path.
+      hdf5path = None
+      
+      if patternfile.filetype in ['UP', 'EBSP']:
+        p = Path(patternfile.filepath)
+        appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius)
+        newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix))
+        patternfile.copy_file(newfilepath,empty_data=True)
+
+      if patternfile.filetype == 'HDF5':
+        hdf5path_tmp = str(patternfile.h5patdatpth).split('/')
+        if hdf5path_tmp[0] == '':
+          hdf5path_org =  hdf5path_tmp[1]
+        else:
+          hdf5path_org = hdf5path_tmp[0]
+        p = Path(patternfile.filepath)
+        appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius)
+        hdf5path = hdf5path_org+appnd
+        newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix))
+        #patternfile.copy_file([newfilepath, hdf5path_org], newh5path=hdf5path)
+        patternfile.copy_file([newfilepath], empty_data=True)
+        hdf5path = patternfile.h5patdatpth
+
+      self.filepathout = newfilepath
+      self.hdfdatapathout = hdf5path
+      return
+
+  def getinfileobj(self):
+    if self.filepath is not None:
+      fID = ebsd_pattern.get_pattern_file_obj([self.filepath, self.hdfdatapath])
+      if (fID.nRows is not None):
+        if (self.nrows is None):
+          self.nrows = fID.nRows
+        else:
+          fID.nRows = self.nrows
+
+      if (fID.nCols is not None):
+        if (self.ncols is None):
+          self.ncols = fID.nCols
+        else:
+          fID.nCols = self.ncols
+
+      return fID
+
+    else:
+      return None
+
+  def getoutfileobj(self):
+    if self.filepathout is not None:
+      fID = ebsd_pattern.get_pattern_file_obj([self.filepathout, self.hdfdatapathout])
+      if self.nrows is not None:
+        fID.nRows = self.nrows
+      else:
+        self.nrows = fID.nRows
+
+      if self.ncols is not None:
+        fID.nCols = self.ncols
+      else:
+        self.ncols = fID.nCols
+      return fID
+    else:
+      return None
+
+  def opt_lambda(self,chunksize=0,saturation_protect=True,automask=True, backsub = False,
+                 target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs):
+
+    target_weights = np.asarray(target_weights)
+
+    def loptfunc(lam,d2,tw,dthresh):
+      temp = (d2 > dthresh).choose(dthresh, d2)
+      dw = np.exp(-(temp) / lam ** 2)
+      w = np.sum(dw, axis=2) + 1e-12
+
+      metric = np.mean(np.abs(tw - 1.0 / w))
+      return metric
+
+
+    @numba.njit(fastmath=True, cache=True,parallel=True)
+    def d2norm(d2, n2, dij, sigma):
+      shp = d2.shape
+      s2 = sigma**2
+      for j in numba.prange(shp[0]):
+        for i in range(shp[1]):
+          for q in range(shp[2]):
+            if n2[j,i,q] > 0:
+              ii = dij[j,i,q,1]
+              jj = dij[j,i,q,0]
+              s2_12 = (s2[j,i]+s2[jj,ii])
+              d2[j,i,q] -= n2[j,i,q] * s2_12
+              d2[j,i,q] /= s2_12*np.sqrt(2.0*n2[j,i,q])
+
+    patternfile = self.getinfileobj()
+    patternfile.read_header()
+    nrows = np.uint64(self.nrows) #np.uint64(patternfile.nRows)
+    ncols = np.uint64(self.ncols) #np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    phw = pheight * pwidth
+
+    nn = 1
+    if chunksize <= 0:
+      chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 4GB
+      chunksize = np.minimum(nrows,chunksize)
+      print("Chunk size set to nrows:", int(chunksize))
+    chunksize = np.int64(chunksize)
+
+    nn = np.uint64(nn)
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight,pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight,pwidth),dtype=np.uint8)
+
+    self.mask = (self.mask).astype(np.uint8)
+    indices = np.asarray((self.mask.flatten().nonzero())[0],np.uint64)
+
+    sigma = np.zeros((nrows,ncols),dtype=np.float32)+1e24
+    colstartcount = np.asarray([0,ncols],dtype=np.int64)
+
+
+    dthresh = np.float32(dthresh)
+    lamopt_values = []
+    
+    for j in range(0,nrows,chunksize):
+      print('Block',j)
+      #rowstartread = np.int64(max(0,j - nn))
+      rowstartread = np.int64(j)
+      rowend = min(j + chunksize + nn,nrows)
+      rowcountread = np.int64(rowend - rowstartread)
+      data, xyloc = patternfile.read_data(patStartCount=[[0,rowstartread],[ncols,rowcountread]],
+                                        convertToFloat=True,returnArrayOnly=True)
+
+      shp = data.shape
+
+      if backsub is True:
+        data = self.backsub(data)
+        #back = np.mean(data, axis=0)
+        #back -= np.mean(back)
+        #data -= back
+      data = data.reshape(shp[0], phw)
+
+      rowstartcount = np.asarray([0,rowcountread],dtype=np.int64)
+      sigchunk, (d2,n2, dij) = self.sigma_numba(data,nn,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect)
+      tmp = (sigma[j:j + rowstartcount[1],:] < sigchunk).choose( sigchunk, sigma[j:j + rowstartcount[1],:])
+      sigma[j:j + rowstartcount[1],:] = tmp
+
+
+      d2norm(d2, n2, dij, sigchunk)
+
+
+      lamopt_values_chnk = []
+      for tw in target_weights:
+        lam = 1.0
+        lambopt1 = opt.minimize(loptfunc,lam,args=(d2,tw,dthresh),method='Nelder-Mead',
+                                bounds = [[0.001, 10.0]],options={'fatol': 0.0001})
+        lamopt_values_chnk.append(lambopt1['x'])
+
+
+      lamopt_values.append(lamopt_values_chnk)
+    lamopt_values = np.asarray(lamopt_values)
+    print("Range of lambda values: ", np.mean(lamopt_values, axis = 0).flatten())
+    print("Optimal Choice: ", np.median(np.mean(lamopt_values, axis = 0)))
+    if autoupdate == True:
+      self.lam = np.median(np.mean(lamopt_values, axis = 0))
+    if self.sigma is None:
+      self.sigma = sigma
+    return np.mean(lamopt_values, axis = 0).flatten()
+
+  def calcnlpar(self, chunksize=0, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True,
+                filename=None, fileout=None, reset_sigma=False, backsub = False, rescale = False, **kwargs):
+
+    if lam is not None:
+      self.lam = lam
+
+    if dthresh is not None:
+      self.dthresh = dthresh
+
+    if searchradius is not None:
+      self.searchradius = searchradius
+
+    lam = np.float32(self.lam)
+    dthresh = np.float32(self.dthresh)
+    sr = np.int64(self.searchradius)
+
+    if filename is not None:
+      self.setfile(filepath=filename)
+
+
+
+
+    patternfile = self.getinfileobj()
+
+    #if filepathout is not None:
+    self.setoutfile(patternfile, filepath=fileout)
+
+    patternfileout = self.getoutfileobj()
+
+
+    nrows = np.int64(self.nrows)#np.int64(patternfile.nRows)
+    ncols = np.int64(self.ncols)#np.int64(patternfile.nCols)
+    if patternfileout.nCols is None:
+      patternfileout.nCols = ncols
+    if patternfileout.nRows is None:
+      patternfileout.nRows = nrows
+
+
+    pwidth = np.int64(patternfile.patternW)
+    pheight = np.int64(patternfile.patternH)
+    phw = pheight*pwidth
+
+    if chunksize <= 0:
+      chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 8GB
+      chunksize = np.minimum(nrows, chunksize)
+      print("Chunk size set to nrows:", int(chunksize))
+    chunksize = np.int64(chunksize)
+
+
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight,pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight,pwidth), dtype=np.uint8)
+
+    indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64)
+    calcsigma = False
+    if self.sigma is None:
+      calcsigma = True
+      self.sigma = np.zeros((nrows, ncols), dtype=np.float32)+1e24
+
+    if reset_sigma:
+      self.sigma = None
+
+    if (np.asarray(self.sigma).size == 1) and (self.sigma is not None):
+      tmp = np.asarray(self.sigma)[0]
+      self.sigma =  np.zeros((nrows, ncols), dtype=np.float32)+tmp
+      calcsigma = False
+
+    shpsigma = np.asarray(self.sigma).shape
+    if (shpsigma[0] != nrows) and (shpsigma[1] != ncols):
+      self.sigma = np.zeros((nrows,ncols),dtype=np.float32) + 1e24
+      calcsigma = True
+
+    sigma = np.asarray(self.sigma)
+
+    scalemethod = 'clip'
+    if rescale == True:
+      if np.issubdtype(patternfileout.filedatatype, np.integer):
+        mxval = np.iinfo(patternfileout.filedatatype).max
+        scalemethod = 'fullscale'
+      else: # not int, so no rescale.
+        rescale = False
+
+    nthreadpos = numba.get_num_threads()
+    #numba.set_num_threads(36)
+    colstartcount = np.asarray([0,ncols],dtype=np.int64)
+    print(lam, sr, dthresh)
+
+    for j in range(0,nrows,chunksize):
+      print('Row start', j)
+
+      rowstartread = np.int64(max(0, j-sr))
+      rowend = min(j + chunksize+sr,nrows)
+
+      if (rowend - rowstartread) < (2*sr+1):
+        rowstartread = np.int64(max(0, rowend - (2*sr+1)))
+      rowcountread = np.int64(rowend-rowstartread)
+      data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]],
+                                        convertToFloat=True,returnArrayOnly=True)
+
+      shpdata = data.shape
+
+      if backsub is True:
+        data = self.backsub(data)
+
+
+      data = data.reshape(shpdata[0], phw)
+
+      rowstartcount = np.asarray([0,rowcountread],dtype=np.int64)
+      if calcsigma is True:
+        sigchunk, tmp = self.sigma_numba(data,1,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect)
+        del tmp
+        tmp = (sigma[rowstartread:rowend,:] < sigchunk).choose(sigchunk,sigma[rowstartread:rowend,:])
+        sigma[rowstartread:rowend,:] = tmp
+      else:
+        sigchunk = sigma[rowstartread:rowend,:]
+
+      #dataout = data
+
+      dataout = self.nlpar_nb(data,lam, sr, dthresh, sigchunk,
+                              rowcountread,ncols,indices,saturation_protect)
+
+      dataout = dataout.reshape(rowcountread, ncols, phw)
+      dataout = dataout[j-rowstartread:, :, : ]
+      shpout = dataout.shape
+      dataout = dataout.reshape(shpout[0]*shpout[1], pheight, pwidth)
+      if rescale == True:
+        for i in range(dataout.shape[0]):
+          temp = dataout[i,:,:]
+          temp -= temp.min()
+          temp *= np.float32(mxval)/temp.max()
+          dataout[i,:,:] = temp
+
+      patternfileout.write_data(newpatterns=dataout,patStartCount = [[0,j], [ncols, shpout[0]]],
+                                     flt2int='clip',scalevalue=1.0 )
+      #self.patternfileout.write_data(newpatterns=dataout,patStartCount=[j*ncols,shpout[0]*shpout[1]],
+      #                               flt2int='clip',scalevalue=1.0 )
+      #return dataout
+      #sigma[j:j+rowstartcount[1],:] += \
+      #  sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:]
+
+    numba.set_num_threads(nthreadpos)
+    return str(patternfileout.filepath)
+
+  def calcsigma(self,chunksize=0,nn=1,saturation_protect=True,automask=True):
+    self.sigmann = nn
+    patternfile = self.getinfileobj()
+
+
+    nrows = np.int64(self.nrows)#np.uint64(patternfile.nRows)
+    ncols = np.int64(self.ncols)#np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    phw = pheight*pwidth
+
+    if chunksize <= 0:
+      chunksize = np.round(2e9/phw/ncols) # keeps the chunk at about 8GB
+      chunksize = np.minimum(nrows,chunksize)
+      print("Chunk size set to nrows:", int(chunksize))
+    chunksize = np.int64(chunksize)
+
+
+    nn = np.uint64(nn)
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight,pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight,pwidth), dtype=np.uint8)
+
+    indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64)
+
+    sigma = np.zeros((nrows, ncols), dtype=np.float32)
+    #d_nn = np.zeros((nrows, ncols, int((2*nn+1)**2)), dtype=np.float32)
+    colstartcount = np.asarray([0,ncols],dtype=np.int64)
+
+    for j in range(0,nrows,chunksize):
+      rowstartread = np.int64(max(0, j-nn))
+      rowend = min(j + chunksize+nn,nrows)
+      if (rowend - rowstartread) < (3):
+        rowstartread = np.int64(max(0, rowend - (3)))
+      rowcountread = np.int64(rowend-rowstartread)
+      data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]],
+                                        convertToFloat=True,returnArrayOnly=True)
+
+      shp = data.shape
+      data = data.reshape(data.shape[0], phw)
+
+      #data = None
+      if rowend == nrows:
+        rowstartcount = np.asarray([j-rowstartread,rowcountread - (j-rowstartread) ], dtype=np.int64)
+      else:
+        rowstartcount = np.asarray([j-rowstartread,chunksize ], dtype=np.int64)
+
+      sigchunk, temp = self.sigma_numba(data,nn, rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect)
+
+      sigma[j:j+rowstartcount[1],:] += \
+        sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:]
+
+    return sigma
+
+  def backsub(self, data):
+    # This function will fit a 2D gaussian on top of a plane to the averaged set of patterns (data) that is provided.
+    # It will automatically use whatever mask is defined for valid data.
+    # If the gaussian fit fails to converge, it will fall back to just using the mean set of patterns for the background
+    # with a warning.
+
+
+    def gaussian_surf(x, y, a, x0, y0, sigx, sigy, c, d, e):
+    # equation for 2D gaussian on top of a plane.
+      return a * np.exp(- ((x - x0) ** 2) / (2.0 * sigx ** 2) - ((y - y0) ** 2) / (2.0 * sigy ** 2)) + c + d * x + e * y
+
+    def fit_gauss(M, *args):
+    # helper function
+      x, y = M
+      #arr = np.zeros(x.shape)
+      return gaussian_surf(x, y, *args)
+
+    back = np.mean(data, axis=0) # start with the mean of all the data
+    # now fit a 2D gaussian sitting on a plane.  See fuction def above.
+    nx = data.shape[-1]
+    ny = data.shape[-2]
+    x = np.arange(nx, dtype=float)
+    x = (np.broadcast_to(x.reshape(1,nx), (ny, nx)))
+    y = np.arange(ny, dtype=float)
+    y = (np.broadcast_to(y, (nx, ny)).T)
+    x = x.ravel()
+    y = y.ravel()
+
+    # need to grab only the values that are in the mask.
+    wh = np.nonzero(self.mask.ravel())[0]
+    xwh = x[wh]
+    ywh = y[wh]
+    xywh = np.vstack((xwh, ywh))
+    zwh = (back.ravel())[wh]
+    whmx = np.unravel_index(back.argmax(), back.shape)
+    minz = zwh.min()
+    # initialize a guess for the parameters.
+    # [gauss amplitude, max loc x, max loc y, sigx, sigy, const offset, slope x, slope y]
+    p0 = [(zwh.max() - zwh.min()), whmx[1], whmx[0], nx/2.355, ny/2.355, minz, 0, 0]
+    try:
+      popt, pcov = opt.curve_fit(fit_gauss, xywh, zwh, p0)
+      backfit = (gaussian_surf(x, y, *popt)).reshape(ny, nx)
+      #print(p0, popt)
+    except RuntimeError:
+      print('Warning: no convergence on back subtract ... using mean of the patterns.')
+      print('This may not be ideal for scans with few grains across the width of the scan.')
+      backfit = back
+    backfit -= np.mean(backfit)
+    #f, axarr = plt.subplots(1, 3)
+    #f.set_size_inches(10, 4)
+    #axarr[0].imshow(data[0,:,:].squeeze(), cmap='gray')
+    #axarr[1].imshow(data[0,:,:].squeeze() - backfit, cmap='gray')
+    #axarr[2].imshow(backfit, cmap='gray')
+
+    data -= backfit
+    return data
+
+  @staticmethod
+  def automask( h,w ):
+    r = (min(h,w)*0.98*0.5)
+    x = np.arange(w, dtype=np.float32)
+    x = np.minimum(x , (w-x))
+    x = x.reshape(1,w)
+    y = np.arange(h, dtype=np.float32)
+    y = np.minimum(y , (h - y))
+    y = y.reshape(h,1)
+
+    mask =np.sqrt(y**2 + x**2)
+    mask = (mask < r).astype(np.uint8)
+    mask = np.roll(mask, (int(h/2), int(w/2) ), axis=(0,1))
+
+    return mask
+
+  @staticmethod
+  @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True)
+  def sigma_numba(data, nn, nrows, ncols, rowstartcount, colstartcount, indices, saturation_protect=True):
+    sigma = np.zeros((nrows,ncols), dtype = np.float32)
+    dout = np.zeros((nrows,ncols,(nn*2+1)**2), dtype=np.float32)
+    nout = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2),dtype=np.float32)
+    dij = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2, 2),dtype=np.uint64)
+    shpdata = data.shape
+
+    n0 = np.float32(shpdata[-1])
+    shpind = indices.shape
+    mxval = np.max(data)
+    if saturation_protect == False:
+      mxval += 1.0
+    else:
+      mxval *= 0.9961
+
+    for j in numba.prange(rowstartcount[0], rowstartcount[0]+rowstartcount[1]):
+      nn_r_start = j - nn if (j - nn) >= 0 else 0
+      nn_r_end = (j + nn if (j + nn) < nrows else nrows-1)+1
+      for i in numba.prange(colstartcount[0], colstartcount[0]+colstartcount[1]):
+        nn_c_start = i - nn if (i - nn) >= 0 else 0
+        nn_c_end = (i + nn if (i + nn) < ncols else ncols - 1) + 1
+
+        mind = np.float32(1e24)
+        indx_0 = i+ncols*j
+        count = 0
+        for j_nn in range(nn_r_start,nn_r_end ):
+          for i_nn in range(nn_c_start,nn_c_end):
+            dij[j,i,count,0] = np.uint64(j_nn)
+            dij[j,i,count,1] = np.uint64(i_nn) # want to save this for labmda optimization
+            indx_nn = i_nn+ncols*j_nn
+            d2 = np.float32(0.0)
+            n2 = np.float32(1.0e-12)
+            nout[j,i,count] = n0 # want to save this for labmda optimization
+            if not((i == i_nn) and (j == j_nn)):
+              for q in range(shpind[0]):
+                d0 = data[indx_0, indices[q]]
+                d1 = data[indx_nn, indices[q]]
+                if (d1 < mxval) and (d0 < mxval):
+                  n2 += 1.0
+                  d2 += (d0 - d1)**2
+              nout[j,i,count] = n2
+
+              if d2 >= 1.e-3: #sometimes EDAX collects the same pattern twice
+                s0 = d2 / np.float32(n2 * 2.0)
+                if s0 < mind:
+                  mind = s0
+            dout[j,i,count] = d2 # want to save this for labmda optimization
+
+            count += 1
+
+        sigma[j,i] = np.sqrt(mind)
+        #if sigma[j,i] > 1e12:
+        #  print(sigma[j,i], dout[j,i,:], nout[i,j,:])
+    return sigma,( dout, nout, dij)
+
+  @staticmethod
+  @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True)
+  def nlpar_nb(data,lam, sr, dthresh, sigma, nrows,ncols,indices,saturation_protect=True):
+    def getpairid(idx0, idx1):
+      idx0_t = int(idx0)
+      idx1_t = int(idx1)
+      if idx0 < idx1:
+        pairid = idx0_t + (idx1_t << 32)
+      else:
+        pairid = idx1_t + (idx0_t << 32)
+      return numba.uint64(pairid)
+
+    lam2 = 1.0 / lam**2
+    dataout = np.zeros_like(data, np.float32)
+    shpdata = data.shape
+    shpind = indices.shape
+    winsz = np.int32((2*sr+1)**2)
+
+
+    mxval = np.max(data)
+    if saturation_protect == False:
+      mxval += np.float32(1.0)
+    else:
+      mxval *= np.float32(0.999)
+    for i in numba.prange(ncols):
+      winstart_x = max((i - sr),0) - max((i + sr - (ncols - 1)),0)
+      winend_x = min((i + sr),(ncols - 1)) + max((sr - i),0) + 1
+      pairdict = numba.typed.Dict.empty(key_type=numba.core.types.uint64,value_type=numba.core.types.float32)
+      for j in range(nrows):
+        winstart_y = max((j - sr),0) - max((j + sr - (nrows - 1)),0)
+        winend_y = min((j + sr),(nrows - 1)) + max((sr - j),0) + 1
+
+        weights = np.zeros(winsz,dtype=np.float32)
+        pindx = np.zeros(winsz,dtype=np.uint64)
+
+        indx_0 = i + ncols * j
+        sigma0 = sigma[j,i]**2
+
+        counter = 0
+        for j_nn in range(winstart_y,winend_y):
+          for i_nn in range(winstart_x,winend_x):
+            indx_nn = i_nn + ncols * j_nn
+            pindx[counter] = indx_nn
+
+            if indx_nn == indx_0:
+              weights[counter] = np.float32(-1.0e6)
+            else:
+              pairid = getpairid(indx_0, indx_nn)
+              if pairid in pairdict:
+                weights[counter] = pairdict[pairid]
+              else:
+                sigma1 = sigma[j_nn,i_nn]**2
+                d2 = np.float32(0.0)
+                n2 = np.float32(0.0)
+                for q in range(shpind[0]):
+                  d0 = data[indx_0,indices[q]]
+                  d1 = data[indx_nn,indices[q]]
+                  if (d1 < mxval) and (d0 < mxval):
+                    n2 += np.float32(1.0)
+                    d2 += (d0 - d1) ** np.int32(2)
+                d2 -= n2*(sigma0+sigma1)
+                dnorm = (sigma1 + sigma0)*np.sqrt(np.float32(2.0)*n2)
+                if dnorm > np.float32(1.e-8):
+                  d2 /= dnorm
+                else:
+                  d2 = np.float32(1e6)*n2
+                weights[counter] = d2
+                pairdict[pairid] = numba.float32(d2)
+            counter += 1
+        #print('________________')
+        # end of window scanning
+        sum = np.float32(0.0)
+        for i_nn in range(winsz):
+
+          weights[i_nn] = np.maximum(weights[i_nn]-dthresh, numba.float32(0.0))
+          weights[i_nn] = np.exp(-1.0 * weights[i_nn] * lam2)
+          sum += weights[i_nn]
+
+        for i_nn in range(winsz):
+          indx_nn = pindx[i_nn]
+          weights[i_nn] /= sum
+          #print(weights[i_nn], ' \n')
+          for q in range(shpdata[1]):
+            dataout[indx_0, q] += data[indx_nn, q]*weights[i_nn]
+        #print('_______', '\n')
+    return dataout
+
+  def _calcchunks(self, patdim, ncol, nrow, target_bytes=2e9, col_overlap=0, row_overlap=0, col_offset=0, row_offset=0):
+
+    col_overlap = min(col_overlap, ncol - 1)
+    row_overlap = min(row_overlap, nrow - 1)
+
+    byteperpat = patdim[-1] * patdim[-2] * 4 * 2  # assume a 4 byte float input and output array
+    byteperdataset = byteperpat * ncol * nrow
+    nchunks = int(np.ceil(byteperdataset / target_bytes))
+
+    ncolchunks = (max(np.round(np.sqrt(nchunks * float(ncol) / nrow)), 1))
+    colstep = max((ncol / ncolchunks), 1)
+    ncolchunks = max(ncol / colstep, 1)
+    colstepov = min(colstep + 2 * col_overlap, ncol)
+    ncolchunks = max(int(np.ceil(ncolchunks)), 1)
+    colstep = max(int(np.round(colstep)), 1)
+    colstepov = min(colstep + 2 * col_overlap, ncol)
+
+    nrowchunks = max(np.ceil(nchunks / ncolchunks), 1)
+    rowstep = max((nrow / nrowchunks), 1)
+    nrowchunks = max(nrow / rowstep, 1)
+    rowstepov = min(rowstep + 2 * row_overlap, nrow)
+    nrowchunks = max(int(np.ceil(nrowchunks)), 1)
+    rowstep = max(int(np.round(rowstep)), 1)
+    rowstepov = min(rowstep + 2 * row_overlap, nrow)
+
+    # colchunks = np.round(np.arange(ncolchunks+1)*ncol/ncolchunks).astype(int)
+    colchunks = np.zeros((ncolchunks, 2), dtype=int)
+    colchunks[:, 0] = (np.arange(ncolchunks) * colstep).astype(int)
+    colchunks[:, 1] = colchunks[:, 0] + colstepov - int(col_overlap)
+    colchunks[:, 0] -= col_overlap
+    colchunks[0, 0] = 0;
+
+    for i in range(ncolchunks - 1):
+      if colchunks[i + 1, 0] >= ncol:
+        colchunks = colchunks[0:i + 1, :]
+
+    ncolchunks = colchunks.shape[0]
+    colchunks[-1, 1] = ncol
+
+    colchunks += col_offset
+
+    # colproc = np.zeros((ncolchunks, 2), dtype=int)
+    # if ncolchunks > 1:
+    #   colproc[1:, 0] = col_overlap
+    # if ncolchunks > 1:
+    #   colproc[0:, 1] = colchunks[:, 1] - colchunks[:, 0] - col_overlap
+    # colproc[-1, 1] = colchunks[-1, 1] - colchunks[-1, 0]
+
+    # rowchunks = np.round(np.arange(nrowchunks + 1) * nrow / nrowchunks).astype(int)
+    rowchunks = np.zeros((nrowchunks, 2), dtype=int)
+    rowchunks[:, 0] = (np.arange(nrowchunks) * rowstep).astype(int)
+    rowchunks[:, 1] = rowchunks[:, 0] + rowstepov - int(row_overlap)
+    rowchunks[:, 0] -= row_overlap
+    rowchunks[0, 0] = 0;
+
+    for i in range(nrowchunks - 1):
+      if rowchunks[i + 1, 0] >= nrow:
+        rowchunks = rowchunks[0:i + 1, :]
+
+    nrowchunks = rowchunks.shape[0]
+    rowchunks[-1, 1] = nrow
+
+    rowchunks += row_offset
+
+    # rowproc = np.zeros((nrowchunks, 2), dtype=int)
+    # if nrowchunks > 1:
+    #   rowproc[1:, 0] = row_overlap
+    # if nrowchunks > 1:
+    #   rowproc[0:, 1] = rowchunks[:, 1] - rowchunks[:, 0] - row_overlap
+    # rowproc[-1, 1] = rowchunks[-1, 1] - rowchunks[-1, 0]
+
+    return ncolchunks, nrowchunks, colchunks, rowchunks
+
+  # def asciiupdate(self, nrow, ncol, completematrix):
+  #   cm = completematrix
+  #   ncdisplay = min(ncol, 80)
+  #   nrow
+
+
diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py
index ad288fd..f849224 100644
--- a/pyebsdindex/opencl/band_detect_cl.py
+++ b/pyebsdindex/opencl/band_detect_cl.py
@@ -36,9 +36,7 @@
 
 #from os import environ
 #environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
-tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
-tempdir = tempdir.joinpath('numba')
-environ["NUMBA_CACHE_DIR"] = str(tempdir)
+
 
 RADEG = 180.0/np.pi
 
@@ -257,42 +255,35 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b
     #  reform = False
 
     clvtypesize = 16 # this is the vector size to be used in the openCL implementation.
-    nImCL = np.int32(clvtypesize * (np.int64(np.ceil(nIm/clvtypesize))))
+    nImCL = np.uint64(clvtypesize * (np.int64(np.ceil(nIm/clvtypesize))))
     imtype = image.dtype
 
+    tict = timer()
+    #image_align = np.ones((shapeIm[1], shapeIm[2], nImCL), dtype = imtype)
+    #image_align[:,:,0:nIm] = np.transpose(image, [1,2,0]).copy()
+    toct = timer()
+
 
-    image_align = np.ones((shapeIm[1], shapeIm[2], nImCL), dtype = imtype)
-    image_align[:,:,0:nIm] = np.transpose(image, [1,2,0]).copy()
-    shpRdn = np.asarray( ((self.nRho+2*padding[0]), (self.nTheta+2*padding[1]), nImCL),dtype=np.uint64)
-    radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=int((self.nRho+2*padding[0])*(self.nTheta+2*padding[1])*nImCL*4))
 
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=radon.nbytes)
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE | mf.COPY_HOST_PTR,hostbuf=radon)
-    image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image_align)
-
-
+    image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image)
     imstep = np.uint64(np.product(shapeIm[-2:]))
-    indxstep = np.uint64(self.radonPlan.indexPlan.shape[-1])
-    rdnstep = np.uint64(self.nRho * self.nTheta)
-
-    padRho = np.uint64(padding[0])
-    padTheta = np.uint64(padding[1])
     tic = timer()
 
     nImChunk = np.uint64(nImCL/clvtypesize)
+    image_gpuflt = cl.Buffer(ctx, mf.READ_WRITE, size=int(int(shapeIm[1])*int(shapeIm[2])*int(nImCL) * int(4)))  # 32-bit float
 
-    if image.dtype.kind == 'f':
-      image_gpuflt = image_gpu
-    else:
-      image_gpuflt = cl.Buffer(ctx, mf.READ_WRITE, size=image_align.size * 4)  # 32-bit float
 
-      if image_align.dtype.type is np.ubyte:
-        prg.loadubyte8(queue, (imstep, 1, 1), None, image_gpu, image_gpuflt, nImChunk)
-      if image_align.dtype.type is np.uint16:
-        prg.loaduint16(queue, (imstep, 1, 1), None, image_gpu, image_gpuflt, nImChunk)
-      queue.flush()
-      image_gpu.release()
-      image_gpu = None
+    if image.dtype.type is np.float32:
+      prg.loadfloat32(queue, (shapeIm[2], shapeIm[1], nIm), None, image_gpu, image_gpuflt, nImCL)
+    if image.dtype.type is np.ubyte:
+      prg.loadubyte8(queue, (shapeIm[2], shapeIm[1], nIm), None, image_gpu, image_gpuflt, nImCL)
+    if image.dtype.type is np.uint16:
+      prg.loaduint16(queue, (shapeIm[2], shapeIm[1], nIm), None, image_gpu, image_gpuflt, nImCL)
+    queue.flush()
+    image_gpu.release()
+    image_gpu = None
 
 
 
@@ -302,6 +293,14 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b
       #imBack = np.zeros((shapeIm[1], shapeIm[2], nImCL),dtype=np.float32)
       #cl.enqueue_copy(queue,imBack,image_gpu,is_blocking=True)
 
+    indxstep = np.uint64(self.radonPlan.indexPlan.shape[-1])
+    rdnstep = np.uint64(self.nRho * self.nTheta)
+    padRho = np.uint64(padding[0])
+    padTheta = np.uint64(padding[1])
+    shpRdn = np.asarray(((self.nRho + 2 * padding[0]), (self.nTheta + 2 * padding[1]), nImCL), dtype=np.uint64)
+    radon_gpu = cl.Buffer(ctx, mf.READ_WRITE,
+                          size=int((self.nRho + 2 * padding[0]) * (self.nTheta + 2 * padding[1]) * nImCL * 4))
+
     rdnIndx_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.radonPlan.indexPlan)
     cl.enqueue_fill_buffer(queue, radon_gpu, np.float32(self.radonPlan.missingval), 0, radon_gpu.size)
     prg.radonSum(queue,(nImChunk,rdnstep),None,rdnIndx_gpu,image_gpuflt,radon_gpu,
@@ -629,7 +628,7 @@ def rdn_local_maxCL(self,radonIn, clparams=None,  returnBuff = True):
 
   def band_labelCL(self,rdnConvIn, lMaxRdnIn,clparams=None):
 
-    # an attempt to to run the band label on the GPU
+    # an attempt to run the band label on the GPU
 
     tic = timer()
 
diff --git a/pyebsdindex/opencl/clkernels.cl b/pyebsdindex/opencl/clkernels.cl
index ffbc20a..b712d2f 100644
--- a/pyebsdindex/opencl/clkernels.cl
+++ b/pyebsdindex/opencl/clkernels.cl
@@ -22,41 +22,70 @@ Author: David Rowenhorst;
 The US Naval Research Laboratory Date: 21 Aug 2020
 */
 
-// simple program to convert a 8-bit byte to float
-__kernel void loadubyte8( const __global uchar16 *im1, __global float16 *im1flt,
-                        const unsigned long int nImChunk)
+// simple program to convert a 8-bit byte to float and transpose array
+__kernel void loadubyte8( const __global uchar *im1, __global float *im1flt, const unsigned long int nImCL)
   {
-  const unsigned long int xy = get_global_id(0);
-  //const unsigned long int szim = get_global_size(0);
-  unsigned long i;
-  uchar16 imVal;
-  float16 imValflt;
+  const unsigned long int x = get_global_id(0);
+  const unsigned long int y = get_global_id(1);
+  const unsigned long int z = get_global_id(2);
+  const unsigned long int nx = get_global_size(0);
+  const unsigned long int ny = get_global_size(1);
+  //const unsigned long int nz = get_global_size(2);
+  
+  const unsigned long int indx1 = x + nx*y + nx*ny*z;
+  const unsigned long int indx2 = z + nImCL*(x + nx*y); // transpose z->x, x->y, y->z 
+  uchar imVal;
+  float imValflt;
+
+  imVal =  im1[indx1];
+  imValflt = convert_float(imVal);
+  im1flt[indx2] = imValflt;
+  
+  
+}
 
-  const unsigned long indx = nImChunk * xy;
-  for(i = 0; i< nImChunk; ++i){
-    imVal =  im1[indx+i];
-    imValflt = convert_float16(imVal);
-    im1flt[indx+i] = imValflt;
-  }
+
+// simple program to convert a 8-bit byte to float and transpose array
+__kernel void loaduint16( const __global ushort *im1, __global float *im1flt, const unsigned long int nImCL)
+  {
+  const unsigned long int x = get_global_id(0);
+  const unsigned long int y = get_global_id(1);
+  const unsigned long int z = get_global_id(2);
+  const unsigned long int nx = get_global_size(0);
+  const unsigned long int ny = get_global_size(1);
+  //const unsigned long int nz = get_global_size(2);
+  
+  const unsigned long int indx1 = x + nx*y + nx*ny*z;
+  const unsigned long int indx2 = z + nImCL*(x + nx*y); // transpose z->x, x->y, y->z 
+  ushort imVal;
+  float imValflt;
+
+  imVal =  im1[indx1];
+  imValflt = convert_float(imVal);
+  im1flt[indx2] = imValflt;
+  
   
 }
 
-// simple program to convert a 16-bit int to float
-__kernel void loaduint16( const __global ushort16 *im1, __global float16 *im1flt,
-                        const unsigned long int nImChunk)
+
+
+// simple program to convert a float to float and transpose array
+__kernel void loaduufloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL)
   {
-  const unsigned long int xy = get_global_id(0);
-  //const unsigned long int szim = get_global_size(0);
-  unsigned long i;
-  ushort16 imVal;
-  float16 imValflt;
+  const unsigned long int x = get_global_id(0);
+  const unsigned long int y = get_global_id(1);
+  const unsigned long int z = get_global_id(2);
+  const unsigned long int nx = get_global_size(0);
+  const unsigned long int ny = get_global_size(1);
+  //const unsigned long int nz = get_global_size(2);
+  
+  const unsigned long int indx1 = x + nx*y + nx*ny*z;
+  const unsigned long int indx2 = z + nImCL*(x + nx*y); // transpose z->x, x->y, y->z 
+  float imVal;
 
-  const unsigned long indx = nImChunk * xy;
-  for(i = 0; i< nImChunk; ++i){
-    imVal =  im1[indx+i];
-    imValflt = convert_float16(imVal);
-    im1flt[indx+i] = imValflt;
-  }
+  imVal =  im1[indx1];
+  im1flt[indx2] = imVal;
+  
   
 }
 
diff --git a/pyebsdindex/opencl/clnlpar.cl b/pyebsdindex/opencl/clnlpar.cl
new file mode 100644
index 0000000..6289dc7
--- /dev/null
+++ b/pyebsdindex/opencl/clnlpar.cl
@@ -0,0 +1,455 @@
+/*
+This software was developed by employees of the US Naval Research Laboratory (NRL), an
+agency of the Federal Government. Pursuant to title 17 section 105 of the United States
+Code, works of NRL employees are not subject to copyright protection, and this software
+is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
+responsibility whatsoever for its use by other parties, and makes no guarantees,
+expressed or implied, about its quality, reliability, or any other characteristic. We
+would appreciate acknowledgment if the software is used. To the extent that NRL may hold
+copyright in countries other than the United States, you are hereby granted the
+non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
+works and distribute this software, in any medium, or authorize others to do so on your
+behalf, on a royalty-free basis throughout the world. You may improve, modify, and
+create derivative works of the software or any portion of the software, and you may copy
+and distribute such modifications or works. Modified works should carry a notice stating
+that you changed the software and should note the date and nature of any such change.
+Please explicitly acknowledge the US Naval Research Laboratory as the original source.
+This software can be redistributed and/or modified freely provided that any derivative
+works bear some notice that they are derived from it, and any modified versions bear
+some notice that they have been modified.
+
+Author: David Rowenhorst;
+The US Naval Research Laboratory Date: 22 May 2024
+
+For more info see
+Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019.
+"""Non-local pattern averaging and re-indexing (NLPAR)."""
+*/
+
+
+float sum16(const float16 *v1);
+float sum16(const float16 *v1){
+     float sum = 0.0;
+     sum += v1[0].s0;
+     sum += v1[0].s1;
+     sum += v1[0].s2;
+     sum += v1[0].s3;
+     sum += v1[0].s4;
+     sum += v1[0].s5;
+     sum += v1[0].s6;
+     sum += v1[0].s7;
+     sum += v1[0].s8;
+     sum += v1[0].s9;
+     sum += v1[0].sa;
+     sum += v1[0].sb;
+     sum += v1[0].sc;
+     sum += v1[0].sd;
+     sum += v1[0].se;
+     sum += v1[0].sf;
+     return sum;
+
+}
+
+
+void print16(const float16 v1);
+void print16(const float16 v1){
+     
+     printf( "%f, ", v1.s0);
+     printf( "%f, ", v1.s1);
+     printf( "%f, ", v1.s2);
+     printf( "%f, ", v1.s3);
+     printf( "%f, ", v1.s4);
+     printf( "%f, ", v1.s5);
+     printf( "%f, ", v1.s6);
+     printf( "%f, ", v1.s7);
+     printf( "%f, ", v1.s8);
+     printf( "%f, ", v1.s9);
+     printf( "%f, ", v1.sa);
+     printf( "%f, ", v1.sb);
+     printf( "%f, ", v1.sc);
+     printf( "%f, ", v1.sd);
+     printf( "%f, ", v1.se);
+     printf( "%f", v1.sf);
+
+}
+
+
+__kernel void nlloadpat8bit(__global uchar *datain, __global float *dataout){
+  const unsigned long int x = get_global_id(0);
+  uchar imVal =  datain[x];
+  float imValflt = convert_float(imVal);
+  dataout[x] = imValflt;
+}
+
+__kernel void nlloadpat16bit(__global ushort *datain, __global float *dataout){
+  const unsigned long int x = get_global_id(0);
+  ushort imVal = datain[x];
+  float imValflt = convert_float(imVal);
+  dataout[x] = imValflt;
+}
+
+__kernel void nlloadpat32flt(__global float *datain, __global float *dataout){
+  const unsigned long int x = get_global_id(0);
+  dataout[x] = datain[x];
+}
+
+__kernel void calcsigma( __global float *data, __global float16 *mask, 
+      __global float *sig, 
+      __global float *dg, __global float *ng,
+      const long  nn, const long ndatchunk, const long npatpoint, const float maxlim){
+  // IDs of work-item represent x and y coordinates in image
+  const long x = get_global_id(0);
+  const long y = get_global_id(1);
+  const long ncol = get_global_size(0);
+  const long nrow = get_global_size(1);
+  const long indx_xy = x+y*ncol;
+  
+  const long indx0 = npatpoint * indx_xy;
+
+  long i, j, z;
+  long indx_j, indx_ij, count; 
+
+  long nnn = (2*nn+1) * (2*nn+1);
+
+  float16 d1, d0; 
+  float16 maskchunk; 
+  float16 mask0, mask1;
+  float dd; 
+
+
+  float d[256];
+  float n[256];
+ 
+  
+  for(j=0; j < nnn; ++j){
+      d[j] = 0.0;
+      n[j] = 1.0e-6; 
+  }
+
+  ;
+
+  for(z = 0; z<ndatchunk; ++z){
+      count = 0; 
+      
+      
+      d0 =  *(__global float16*) (data + indx0+16*z); 
+      //if (z == 0){print16(d0);}
+      //
+      maskchunk = mask[z];
+      //print16(maskchunk);
+      mask0 =  d0 < (float16) (maxlim) ? (float16) (1.0): (float16) (0.0);
+      mask0 =  maskchunk > (float16) 1.e-3 ? mask0 : (float16) 0.0; 
+      //mask0 = select((float16) (1.0), (float16) (0.0), isless( d0, (float16) maxlim));
+      
+      for(j=y-nn; j<=y+nn; ++j){
+          
+          indx_j =  (j >= 0) ? (j): abs(j);
+          indx_j =  (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+          indx_j = ncol * indx_j;
+          
+          for(i=x-nn; i<=x+nn; ++i){      
+            
+              indx_ij =  (i >= 0) ? (i): abs(i);
+              indx_ij =  (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+              indx_ij = npatpoint*(indx_ij + indx_j);
+              
+              mask1 = mask0;
+              d1 =  *(__global float16*) (data + indx_ij+16*z); 
+              
+
+              mask1 =  d1 < (float16) (maxlim) ? mask1 : (float16) (0.0);
+              //mask1 = select((float16) (1.0), (float16) (0.0), isgreater(mask0, (float16)(1e-6)) && isgreater(mask1,(float16)(1e-6)));
+              //printf("%*s\n", 'here');
+              
+              d1 = (d0-d1);
+              d1 *= d1 ; 
+              d1 *= mask1; 
+
+              dd = sum16(&d1);
+              dd = (indx_ij == indx0) ? -1*dd : dd; // mark the center point 
+              
+              d[count] += dd;
+              n[count] += sum16(&mask1);
+              //d[count+nnn*indx_xy] += dd;
+              //n[count+nnn*indx_xy] += sum16(&mask1);
+
+              count += 1; 
+          }
+      }
+      //printf("%d %f\n", count, n[count]);
+  }
+
+
+  float mind = 1e24; 
+  float s0;  
+  ; //nnn*(x+ncol*y);
+  for(j=0; j<nnn; ++j){
+    if (d[j] > 1e-5){ //sometimes EDAX collects the same pattern twice & catch the pixel of interest.
+      s0 = d[j]/(2.0*n[j]); 
+      if (s0<mind){
+        mind = s0; 
+      }
+    } 
+  }
+
+  sig[indx_xy] = sqrt(mind);
+  
+  for(j=0; j<nnn; ++j){
+    dg[j+nnn*indx_xy] = d[j];
+    ng[j+nnn*indx_xy] = n[j];
+  }
+
+}
+
+
+__kernel void normd(
+    const __global float *sigma, 
+    const __global float *n, 
+     __global float *d,
+     const long nn){
+
+    const long x = get_global_id(0);
+    const long y = get_global_id(1);
+    const long ncol = get_global_size(0);
+    const long nrow = get_global_size(1);
+    const long indx_xy = x+y*ncol;
+
+    long i, j, q;
+    long indx_j, indx_ij; 
+
+    long nnn = (2*nn+1) * (2*nn+1);
+    //printf("%d\n", nnn) ;
+    float sigma_xy = sigma[indx_xy];  
+    sigma_xy *= sigma_xy;
+    // printf("%f", sigma_xy);
+    float sigma_ij, n_ij, d_ij;  
+    q = 0; 
+    for(j=y-nn; j<=y+nn; ++j){
+          
+          indx_j =  (j >= 0) ? (j): abs(j);
+          indx_j =  (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+          indx_j = ncol * indx_j;
+          
+          for(i=x-nn; i<=x+nn; ++i){  
+              
+               indx_ij =  (i >= 0) ? (i): abs(i);
+               indx_ij =  (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+               indx_ij =  (indx_ij + indx_j);
+               sigma_ij = sigma[indx_ij];
+               sigma_ij *= sigma_ij;
+              
+               sigma_ij = sigma_ij + sigma_xy;
+               
+               d_ij = d[q+nnn*indx_xy];
+               n_ij = n[q+nnn*indx_xy];    
+               if (n_ij > 1.0e-3){           
+                  d_ij -= n_ij*sigma_ij;
+                  d_ij *= 1.0/( sigma_ij * sqrt(2.0*n_ij) ); 
+                  
+                  d[q+nnn*indx_xy] = d_ij;    
+                }
+               q += 1.0; 
+           }
+      } 
+      //printf("%d, %d, %d\n",nn, q, nnn);     
+}
+
+
+
+
+__kernel void calcnlpar( 
+      const __global float *data, 
+      const __global float16 *mask, 
+      const __global float *sigma, 
+      const __global long *crlimits, 
+      __global float *dataout, 
+      const long sr, 
+      const long ndatchunk, 
+      const long npatpoint, 
+      const float maxlim, 
+      const float lam2, 
+      const float dthresh){
+  //IDs of work-item represent x and y coordinates in image
+  //const long4 calclim =  crlimits[0];
+  const long x = get_global_id(0)+crlimits[0];
+  const long y = get_global_id(1)+crlimits[1];
+  const long ncol = crlimits[2]; //get_global_size(0);
+  const long nrow = crlimits[3];//get_global_size(1);
+  const long indx_xy = x+y*ncol;
+  //printf("%d\n", indx_xy);
+  const long indx0 = npatpoint * indx_xy;
+  //printf("%d, %d, %d, %d\n", x,y,ncol, nrow); 
+  long i, j, z;
+  long indx_j, indx_ij, count; 
+
+  long nnn = (2*sr+1) * (2*sr+1);
+
+  float16 d1, d0; 
+  float16 mask0, mask1;
+  float dd, nn, sigma_ij, norm; 
+  float sigma0 = sigma[indx_xy];
+  sigma0 *= sigma0; 
+  //float16* writeloc; 
+
+  float d[512]; // taking a risk here that noone will want a SR > 10
+  float n[512]; 
+
+  
+  for(j=0; j < nnn; ++j){
+      d[j] = 0.0;
+      n[j] = 1.0e-6; 
+  }
+
+
+  for(z = 0; z<ndatchunk; ++z){
+      count = 0; 
+      
+      
+      d0 =  *(__global float16*) (data + indx0+16*z); 
+      
+      mask0 = mask[z];
+      mask0 =  (mask0 > (float16) 1.e-3) ? (float16) 1.0 : (float16) 0.0; 
+      mask0 =  d0 < (float16) (maxlim) ? mask0 : (float16) (0.0);
+      
+      
+      for(j=y-sr; j<=y+sr; ++j){
+          
+          indx_j =  (j >= 0) ? (j): abs(j);
+          indx_j =  (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+          indx_j = ncol * indx_j;
+          
+          for(i=x-sr; i<=x+sr; ++i){
+            
+              indx_ij =  (i >= 0) ? (i): abs(i);
+              indx_ij =  (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+              
+              indx_ij = (indx_ij + indx_j);
+              
+              indx_ij *= npatpoint;
+              
+              mask1 = mask0;
+              d1 =  *(__global float16*) (data + indx_ij+16*z); 
+              
+
+              mask1 =  d1 < (float16) (maxlim) ? mask1 : (float16) (0.0);
+              //mask1 = select((float16) (1.0), (float16) (0.0), isgreater(mask0, (float16)(1e-6)) && isgreater(mask1,(float16)(1e-6)));
+              //printf("%*s\n", 'here');
+              
+              d1 = (d0-d1);
+              d1 *= d1 ; 
+              d1 *= mask1; 
+
+              dd = sum16(&d1);
+              dd = (indx_ij == indx0) ? -1.0 : dd; // mark the center point 
+              
+              d[count] += dd;
+              n[count] += sum16(&mask1);
+              //d[count+nnn*indx_xy] += dd;
+              //n[count+nnn*indx_xy] += sum16(&mask1);
+
+              count += 1; 
+          }
+      }
+      
+  }
+// calculate the weights
+  count = 0; 
+  float sum = 0.0; 
+  nn = 1.0;
+  for(j=y-sr; j<=y+sr; ++j){
+          
+          indx_j =  (j >= 0) ? (j): abs(j);
+          indx_j =  (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+          indx_j = ncol * indx_j;
+          
+          for(i=x-sr; i<=x+sr; ++i){
+              
+              dd = d[count]; 
+              if (dd > 1.e-3){
+                indx_ij =  (i >= 0) ? (i): abs(i);
+                indx_ij =  (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+                
+                indx_ij = (indx_ij + indx_j);
+                
+                sigma_ij = sigma[indx_xy];
+                sigma_ij *= sigma_ij; 
+                nn = n[count]; 
+                dd -= nn*(sigma_ij+sigma0);
+                
+                norm = (sigma0 + sigma_ij)*sqrt(2.0*nn);
+                if (norm > 1.0e-8){
+                  dd /= norm;  
+                } else {
+                  dd = 1e6*nn;
+                } 
+
+              } else {
+                dd = 0.0;
+                nn = 1.0;
+              }
+              
+              dd -= dthresh; 
+              dd = dd >= 0.0 ? dd : 0.0; 
+
+              dd = exp(-1.0*dd*lam2); 
+              sum += dd; 
+              d[count] = dd;
+
+              count += 1; 
+              
+          }
+  }
+  
+  for (j =0;j<nnn;j++){d[j] *= 1.0/sum; }
+
+// now one more time through the loops to average across the patterns with the weights  
+// and place in the output array
+  for(z = 0; z<ndatchunk; ++z){
+      count = 0; 
+      d0 =  *(__global float16*) (data + indx0+16*z); 
+      
+      mask0 = mask[z];
+      mask0 =  (mask0 > (float16) -1.e-3) ? (float16) 1.0 : (float16) 0.0; 
+      
+      for(j=y-sr; j<=y+sr; ++j){
+          
+          indx_j =  (j >= 0) ? (j): abs(j);
+          indx_j =  (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+          indx_j = ncol * indx_j;
+          
+          for(i=x-sr; i<=x+sr; ++i){
+            
+              indx_ij =  (i >= 0) ? (i): abs(i);
+              indx_ij =  (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+              
+              indx_ij = (indx_ij + indx_j);
+              
+              indx_ij *= npatpoint;
+              
+              
+              d1 =  *(__global float16*) (data + indx_ij+16*z); 
+              
+              
+              d1 *= (float16) d[count]; 
+              d1 *= mask0; 
+              
+              *(__global float16*) (dataout + indx0+16*z) += d1; 
+              
+              //writeadd16(&(dataout[indx0+16*z]), &d1 );
+
+              //d[count+nnn*indx_xy] += dd;
+              //n[count+nnn*indx_xy] += sum16(&mask1);
+
+              count += 1; 
+          }
+      }
+  }
+
+
+
+}
+
+
+
+
+
diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
new file mode 100644
index 0000000..d0d5211
--- /dev/null
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -0,0 +1,538 @@
+# This software was developed by employees of the US Naval Research Laboratory (NRL), an
+# agency of the Federal Government. Pursuant to title 17 section 105 of the United States
+# Code, works of NRL employees are not subject to copyright protection, and this software
+# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
+# responsibility whatsoever for its use by other parties, and makes no guarantees,
+# expressed or implied, about its quality, reliability, or any other characteristic. We
+# would appreciate acknowledgment if the software is used. To the extent that NRL may hold
+# copyright in countries other than the United States, you are hereby granted the
+# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
+# works and distribute this software, in any medium, or authorize others to do so on your
+# behalf, on a royalty-free basis throughout the world. You may improve, modify, and
+# create derivative works of the software or any portion of the software, and you may copy
+# and distribute such modifications or works. Modified works should carry a notice stating
+# that you changed the software and should note the date and nature of any such change.
+# Please explicitly acknowledge the US Naval Research Laboratory as the original source.
+# This software can be redistributed and/or modified freely provided that any derivative
+# works bear some notice that they are derived from it, and any modified versions bear
+# some notice that they have been modified.
+#
+# Author: David Rowenhorst;
+# The US Naval Research Laboratory Date: 22 May 2024
+
+# For more info see
+# Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019.
+
+
+
+import os
+from timeit import default_timer as timer
+import numpy as np
+import pyopencl as cl
+
+
+
+import scipy.optimize as sp_opt
+from pyebsdindex import nlpar_cpu
+from pyebsdindex.opencl import openclparam
+from time import time as timer
+
+class NLPAR(nlpar_cpu.NLPAR):
+  def __init__( self, filename=None, **kwargs):
+    nlpar_cpu.NLPAR.__init__(self, filename=filename, **kwargs)
+    self.useCPU = False
+
+
+  def opt_lambda(self,saturation_protect=True, automask=True, backsub=False,
+                 target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs):
+    return self.opt_lambda_cl(saturation_protect=saturation_protect,
+                              automask=automask,
+                              backsub=backsub,
+                              target_weights=target_weights,
+                              dthresh=dthresh,
+                              autoupdate=autoupdate, **kwargs)
+  def calcnlpar(self, **kwargs):
+    return self.calcnlpar_cl(**kwargs)
+
+
+  def calcsigma(self,nn=1, saturation_protect=True,automask=True, return_nndist=False, **kwargs):
+    self.sigmann = nn
+    if self.sigmann > 7:
+      print("Sigma optimization search limited to a search radius <= 7")
+      print("The search radius has been clipped to 7")
+      nn = 7
+      self.sigmann = nn
+
+    sig =  self.calcsigma_cl(nn=nn,
+                            saturation_protect=saturation_protect,
+                            automask=automask, **kwargs)
+    if return_nndist == True:
+      return sig
+    else:
+      return sig[0]
+  def opt_lambda_cpu(self, **kwargs):
+    return nlpar_cpu.NLPAR.opt_lambda(self, **kwargs)
+
+  def calcnlpar_cpu(self, **kwargs):
+    return nlpar_cpu.NLPAR.calcnlpar(self, **kwargs)
+
+  def calcsigma_cpu(self,nn=1, saturation_protect=True,automask=True, **kwargs):
+    return nlpar_cpu.NLPAR.calcsigma(self, nn=nn,
+                                     saturation_protect=saturation_protect, automask=automask, **kwargs)
+
+  def opt_lambda_cl(self, saturation_protect=True, automask=True, backsub=False,
+                 target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs):
+
+    target_weights = np.asarray(target_weights)
+
+    def loptfunc(lam, d2, tw, dthresh):
+      temp = np.maximum(d2, dthresh)#(d2 > dthresh).choose(dthresh, d2)
+      dw = np.exp(-(temp) / lam ** 2)
+      w = np.sum(dw, axis=2) + 1e-12
+
+      metric = np.mean(np.abs(tw - 1.0 / w))
+      return metric
+
+    patternfile = self.getinfileobj()
+    patternfile.read_header()
+    nrows = np.uint64(self.nrows)  # np.uint64(patternfile.nRows)
+    ncols = np.uint64(self.ncols)  # np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    phw = pheight * pwidth
+
+    dthresh = np.float32(dthresh)
+    lamopt_values = []
+
+    sigma, d2, n2 = self.calcsigma(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True,
+                                   return_nndist=True)
+
+    #sigmapad = np.pad(sigma, 1, mode='reflect')
+    #d2normcl(d2, n2, sigmapad)
+
+   #print(d2.min(), d2.max(), d2.mean())
+
+    lamopt_values_chnk = []
+    for tw in target_weights:
+      stride = 1 if sigma.size < 1e6 else 10
+
+      lam = 1.0
+      lambopt1 = sp_opt.minimize(loptfunc, lam, args=(d2[0::stride, :], tw, dthresh), method='Nelder-Mead',
+                              bounds=[[0.001, 10.0]], options={'fatol': 0.0001})
+      lamopt_values.append(lambopt1['x'])
+
+    #lamopt_values.append(lamopt_values_chnk)
+    lamopt_values = np.asarray(lamopt_values)
+    print("Range of lambda values: ", lamopt_values.flatten())
+    print("Optimal Choice: ", np.median(lamopt_values))
+    if autoupdate == True:
+      self.lam = np.median(np.mean(lamopt_values, axis=0))
+    if self.sigma is None:
+      self.sigma = sigma
+    return lamopt_values.flatten()
+
+
+  def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=False, gpu_id = None, **kwargs):
+    self.sigmann = nn
+    if self.sigmann > 7:
+      print("Sigma optimization search limited to a search radius <= 7")
+      print("The search radius has been clipped to 7")
+      nn = 7
+      self.sigmann = nn
+
+    if gpu_id is None:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      target_mem = 0
+      gpu_id = 0
+      count = 0
+
+      for gpu in clparams.gpu:
+        gmem = gpu.max_mem_alloc_size
+        if target_mem < gmem:
+          gpu_id = count
+          target_mem = gmem
+        count += 1
+    else:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      gpu_id = min(len(clparams.gpu)-1, gpu_id)
+
+
+    #print(gpu_id)
+    clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
+    clparams.get_queue()
+    target_mem = clparams.queue.device.max_mem_alloc_size//2
+    ctx = clparams.ctx
+    prg = clparams.prg
+    queue = clparams.queue
+    mf = clparams.memflags
+    clvectlen = 16
+
+    patternfile = self.getinfileobj()
+
+    nrows = np.int64(self.nrows)  # np.uint64(patternfile.nRows)
+    ncols = np.int64(self.ncols)  # np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    npat_point = int(pwidth * pheight)
+    #print(target_mem)
+    chunks = self._calcchunks( [pwidth, pheight], ncols, nrows, target_bytes=target_mem,
+                              col_overlap=1, row_overlap=1)
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight, pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight, pwidth), dtype=np.uint8)
+
+    #indices = np.asarray((self.mask.flatten().nonzero())[0], np.uint64)
+    #nindices = np.uint64(indices.size)
+    #nindicespad =  np.uint64(clvectlen * int(np.ceil(nindices/clvectlen)))
+
+    mask = self.mask.astype(np.float32)
+
+    npad = clvectlen * int(np.ceil(mask.size/clvectlen))
+    maskpad = np.zeros((npad) , np.float32)
+    maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32)
+    mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad)
+
+    npatsteps = int(maskpad.size/clvectlen)
+
+    chunksize = (chunks[2][:,1] - chunks[2][:,0]).reshape(1,-1) * \
+                     (chunks[3][:, 1] - chunks[3][:, 0]).reshape(-1, 1)
+
+    #return chunks, chunksize
+    mxchunk = chunksize.max()
+
+    npadmx = clvectlen * int(np.ceil(mxchunk*npat_point/ clvectlen))
+
+    datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+
+    nnn = int((2 * nn + 1) ** 2)
+
+    sigma = np.zeros((nrows, ncols), dtype=np.float32) + 1e18
+    dist = np.zeros((nrows, ncols, nnn), dtype=np.float32)
+    countnn = np.zeros((nrows, ncols, nnn), dtype=np.float32)
+
+    #dist_local = cl.LocalMemory(nnn*npadmx*4)
+    dist_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk*nnn*4))
+    distchunk = np.zeros((mxchunk, nnn), dtype=np.float32)
+    #count_local = cl.LocalMemory(nnn*npadmx*4)
+    count_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk * nnn * 4))
+    countchunk = np.zeros((mxchunk, nnn), dtype=np.float32)
+
+    for rowchunk in range(chunks[1]):
+      rstart = chunks[3][rowchunk, 0]
+      rend = chunks[3][rowchunk, 1]
+      nrowchunk = rend - rstart
+
+      for colchunk in range(chunks[0]):
+        cstart = chunks[2][colchunk, 0]
+        cend = chunks[2][colchunk, 1]
+        ncolchunk = cend - cstart
+
+        data, xyloc = patternfile.read_data(patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]],
+                                          convertToFloat=False, returnArrayOnly=True)
+
+        mxval = data.max()
+        if saturation_protect == False:
+          mxval += 1.0
+        else:
+          mxval *= 0.9961
+
+        evnt = cl.enqueue_fill_buffer(queue, datapad_gpu, np.float32(mxval+10), 0,int(4*npadmx))
+
+        szdata = data.size
+        npad = clvectlen * int(np.ceil(szdata / clvectlen))
+        tic = timer()
+        #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10)
+        #datapad[0:szdata] = data.reshape(-1)
+        data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data)
+
+        if data.dtype.type is np.float32:
+          prg.nlloadpat32flt(queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt])
+        if data.dtype.type is np.ubyte:
+          prg.nlloadpat8bit(queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt])
+        if data.dtype.type is np.uint16:
+          prg.nlloadpat16bit(queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt])
+        toc = timer()
+        #print(toc - tic)
+
+        sigmachunk = np.zeros((nrowchunk, ncolchunk ), dtype=np.float32)
+
+
+        sigmachunk_gpu =  cl.Buffer(ctx, mf.WRITE_ONLY, size=sigmachunk.nbytes)
+        cl.enqueue_barrier(queue)
+        prg.calcsigma(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
+                               datapad_gpu, mask_gpu,sigmachunk_gpu,
+                               dist_local, count_local,
+                               np.int64(nn), np.int64(npatsteps), np.int64(npat_point),
+                               np.float32(mxval) )
+        if normalize_d is True:
+          cl.enqueue_barrier(queue)
+          prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
+                          sigmachunk_gpu,
+                          count_local, dist_local,
+                          np.int64(nn))
+        queue.finish()
+
+        cl.enqueue_copy(queue, distchunk, dist_local, is_blocking=False)
+        cl.enqueue_copy(queue, countchunk, count_local,  is_blocking=False)
+        cl.enqueue_copy(queue, sigmachunk, sigmachunk_gpu,  is_blocking=True)
+
+        #sigmachunk_gpu.release()
+
+        queue.finish()
+        countnn[rstart:rend, cstart:cend] = countchunk[0:int(ncolchunk*nrowchunk), :].reshape(nrowchunk, ncolchunk, nnn)
+        dist[rstart:rend, cstart:cend] = distchunk[0:int(ncolchunk*nrowchunk), :].reshape(nrowchunk, ncolchunk, nnn)
+        sigma[rstart:rend, cstart:cend] = np.minimum(sigma[rstart:rend, cstart:cend], sigmachunk)
+
+    dist_local.release()
+    count_local.release()
+    datapad_gpu.release()
+    queue.flush()
+    queue = None
+    self.sigma = sigma
+    return sigma, dist, countnn
+
+  def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True,
+                   filename=None, fileout=None, reset_sigma=False, backsub = False, rescale = False,
+                   gpu_id = None, verbose=2, **kwargs):
+
+    if lam is not None:
+      self.lam = lam
+
+    if dthresh is not None:
+      self.dthresh = dthresh
+    if self.dthresh is None:
+      self.dthresh = 0.0
+
+    if searchradius is not None:
+      self.searchradius = searchradius
+
+    if self.searchradius > 10:
+      print("NLPAR on GPU is limited to a search radius <= 10")
+      print("The search radius has been clipped to 10")
+      searchradius = 10
+      self.searchradius = searchradius
+
+    lam = np.float32(self.lam)
+    dthresh = np.float32(self.dthresh)
+    sr = np.int64(self.searchradius)
+
+    if filename is not None:
+      self.setfile(filepath=filename)
+
+    patternfile = self.getinfileobj()
+    self.setoutfile(patternfile, filepath=fileout)
+    patternfileout = self.getoutfileobj()
+
+    nrows = np.int64(self.nrows)  # np.uint64(patternfile.nRows)
+    ncols = np.int64(self.ncols)  # np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    npat_point = int(pwidth * pheight)
+
+    if reset_sigma:
+      self.sigma = None
+
+    if np.asarray(self.sigma).size == 1 and (self.sigma is not None):
+      tmp = np.asarray(self.sigma)[0]
+      self.sigma = np.zeros((nrows, ncols), dtype=np.float32) + tmp
+      calcsigma = False
+
+    if self.sigma is not None:
+      shpsigma = np.asarray(self.sigma).shape
+      if (shpsigma[0] != nrows) and (shpsigma[1] != ncols):
+        self.sigma = None
+
+    if self.sigma is None:
+      self.sigma = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, gpu_id=gpu_id)[0]
+
+    sigma = np.asarray(self.sigma).astype(np.float32)
+
+
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight, pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight, pwidth), dtype=np.uint8)
+
+    scalemethod = 'clip'
+    self.rescale = False
+    if rescale == True:
+      self.rescale = True
+      if np.issubdtype(patternfileout.filedatatype, np.integer):
+        mxval = np.iinfo(patternfileout.filedatatype).max
+        scalemethod = 'fullscale'
+      else:  # not int, so no rescale.
+        self.rescale = False
+        rescale = False
+
+
+
+    if gpu_id is None:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      target_mem = 0
+      gpu_id = 0
+      count = 0
+
+      for gpu in clparams.gpu:
+        gmem = gpu.max_mem_alloc_size
+        if target_mem < gmem:
+          gpu_id = count
+          target_mem = gmem
+        count += 1
+    else:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      gpu_id = min(len(clparams.gpu) - 1, gpu_id)
+
+
+    #print(gpu_id)
+    clparams.get_context(gpu_id=gpu_id, kfile ='clnlpar.cl')
+    clparams.get_queue()
+    target_mem = clparams.queue.device.max_mem_alloc_size//2
+    ctx = clparams.ctx
+    prg = clparams.prg
+    queue = clparams.queue
+    mf = clparams.memflags
+    clvectlen = 16
+
+
+
+    chunks = self._calcchunks( [pwidth, pheight], ncols, nrows, target_bytes=target_mem,
+                              col_overlap=sr, row_overlap=sr)
+    #print(chunks[2], chunks[3])
+    if verbose >=1:
+      print("lambda:", lam, "search radius:", sr, "dthresh:", dthresh)
+
+    # precalculate some needed arrays for the GPU
+    mask = self.mask.astype(np.float32)
+
+    npad = clvectlen * int(np.ceil(mask.size/clvectlen))
+    maskpad = np.zeros((npad) , np.float32) -1 # negative numbers will indicate a clvector overflow.
+    maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32)
+    mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad)
+
+    npatsteps = int(maskpad.size/clvectlen) # how many clvector chunks to move through a pattern.
+
+    chunksize = (chunks[2][:,1] - chunks[2][:,0]).reshape(1,-1) * \
+                     (chunks[3][:, 1] - chunks[3][:, 0]).reshape(-1, 1)
+    nchunks = chunksize.size
+    #return chunks, chunksize
+    mxchunk = int(chunksize.max())
+    npadmx = clvectlen * int(np.ceil(float(mxchunk)*npat_point/ clvectlen))
+
+    datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+    datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+
+    nnn = int((2 * sr + 1) ** 2)
+
+
+    ndone = 0
+    # if verbose >= 2:
+    #   print('\n', end='')
+    for rowchunk in range(chunks[1]):
+      rstart = chunks[3][rowchunk, 0]
+      rend = chunks[3][rowchunk, 1]
+      nrowchunk = rend - rstart
+
+      rstartcalc = sr if (rowchunk > 0) else 0
+      rendcalc = nrowchunk - sr if (rowchunk < (chunks[1] - 1)) else nrowchunk
+      nrowcalc = np.int64(rendcalc - rstartcalc)
+
+      for colchunk in range(chunks[0]):
+        cstart = chunks[2][colchunk, 0]
+        cend = chunks[2][colchunk, 1]
+        ncolchunk = cend - cstart
+
+        cstartcalc = sr if (colchunk > 0) else 0
+        cendcalc = ncolchunk - sr if (colchunk < (chunks[0] - 1)) else ncolchunk
+        ncolcalc = np.int64(cendcalc - cstartcalc)
+
+        data, xyloc = patternfile.read_data(patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]],
+                                          convertToFloat=False, returnArrayOnly=True)
+
+        mxval = data.max()
+        if saturation_protect == False:
+          mxval += 1.0
+        else:
+          mxval *= 0.9961
+
+        filldatain = cl.enqueue_fill_buffer(queue, datapad_gpu, np.float32(mxval+10), 0,int(4*npadmx))
+        cl.enqueue_fill_buffer(queue, datapadout_gpu, np.float32(0.0), 0, int(4 * npadmx))
+
+        sigmachunk = np.ascontiguousarray(sigma[rstart:rend, cstart:cend].astype(np.float32))
+        sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk)
+        szdata = data.size
+        npad = clvectlen * int(np.ceil(szdata / clvectlen))
+
+        #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10)
+        #datapad[0:szdata] = data.reshape(-1)
+
+        data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data)
+
+        if data.dtype.type is np.float32:
+          prg.nlloadpat32flt(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+        if data.dtype.type is np.ubyte:
+          prg.nlloadpat8bit(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+        if data.dtype.type is np.uint16:
+          prg.nlloadpat16bit(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+
+
+
+        calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64)
+        crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim)
+        cl.enqueue_barrier(queue)
+        data_gpu.release()
+        prg.calcnlpar(queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None,
+        #prg.calcnlpar(queue, (1, 1), None,
+                               datapad_gpu,
+                               mask_gpu,
+                               sigmachunk_gpu,
+                               crlimits_gpu,
+                               datapadout_gpu,
+                               np.int64(sr),
+                               np.int64(npatsteps),
+                               np.int64(npat_point),
+                               np.float32(mxval),
+                               np.float32(1.0/lam**2),
+                               np.float32(dthresh) )
+
+        data = data.astype(np.float32) # prepare to receive data back from GPU
+        data.reshape(-1)[:] = 0.0
+        data = data.reshape(nrowchunk, ncolchunk, pheight, pwidth)
+        sigmachunk_gpu.release()
+        cl.enqueue_copy(queue, data, datapadout_gpu,  is_blocking=True)
+        queue.finish()
+        if rescale == True:
+          for i in range(data.shape[0]):
+            temp = data[i, :, :]
+            temp -= temp.min()
+            temp *= np.float32(mxval) / temp.max()
+            data[i, :, :] = temp
+        data = data[rstartcalc: rstartcalc+nrowcalc,cstartcalc: cstartcalc+ncolcalc, :,: ]
+        data = data.reshape(nrowcalc*ncolcalc, pheight, pwidth)
+        patternfileout.write_data(newpatterns=data, patStartCount=[[cstart+cstartcalc, rstart+rstartcalc],
+                                                                   [ncolcalc, nrowcalc]],
+                                  flt2int='clip', scalevalue=1.0)
+        ndone +=1
+        if verbose >= 2:
+          print("tiles complete: ", ndone, "/", nchunks, sep='', end='\r')
+
+    if verbose >= 2:
+      print('', end='')
+    queue.finish()
+    queue = None
+    return str(patternfileout.filepath)
+
+
+
+
+
+
diff --git a/pyebsdindex/opencl/nlpar_clray.py b/pyebsdindex/opencl/nlpar_clray.py
new file mode 100644
index 0000000..87a1fc8
--- /dev/null
+++ b/pyebsdindex/opencl/nlpar_clray.py
@@ -0,0 +1,799 @@
+# This software was developed by employees of the US Naval Research Laboratory (NRL), an
+# agency of the Federal Government. Pursuant to title 17 section 105 of the United States
+# Code, works of NRL employees are not subject to copyright protection, and this software
+# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no
+# responsibility whatsoever for its use by other parties, and makes no guarantees,
+# expressed or implied, about its quality, reliability, or any other characteristic. We
+# would appreciate acknowledgment if the software is used. To the extent that NRL may hold
+# copyright in countries other than the United States, you are hereby granted the
+# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative
+# works and distribute this software, in any medium, or authorize others to do so on your
+# behalf, on a royalty-free basis throughout the world. You may improve, modify, and
+# create derivative works of the software or any portion of the software, and you may copy
+# and distribute such modifications or works. Modified works should carry a notice stating
+# that you changed the software and should note the date and nature of any such change.
+# Please explicitly acknowledge the US Naval Research Laboratory as the original source.
+# This software can be redistributed and/or modified freely provided that any derivative
+# works bear some notice that they are derived from it, and any modified versions bear
+# some notice that they have been modified.
+#
+# Author: David Rowenhorst;
+# The US Naval Research Laboratory Date: 22 May 2024
+
+# For more info see
+# Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019.
+
+"""Non-local pattern averaging and re-indexing (NLPAR)."""
+
+
+
+import os, sys, platform
+import logging
+from timeit import default_timer as timer
+import numpy as np
+import pyopencl as cl
+
+import ray
+
+
+from pyebsdindex.opencl import openclparam, nlpar_cl
+from time import time as timer
+
+RAYIPADDRESS = '127.0.0.1'
+OSPLATFORM  = platform.system()
+#if OSPLATFORM  == 'Darwin':
+#    RAYIPADDRESS = '0.0.0.0'  # the localhost address does not work on macOS when on a VPN
+
+
+class NLPAR(nlpar_cl.NLPAR):
+  def __init__( self,filename=None, **kwargs):
+    nlpar_cl.NLPAR.__init__(self,filename=filename, **kwargs)
+    self.useCPU = False
+
+  def calcnlpar(self, **kwargs):
+    return self.calcnlpar_clray(**kwargs)
+
+  def calcsigma(self, nn=1, saturation_protect=True, automask=True, return_nndist=False, **kwargs):
+    if self.sigmann > 7:
+      print("Sigma optimization search limited to a search radius <= 7")
+      print("The search radius has been clipped to 7")
+      nn = 7
+      self.sigmann = nn
+
+    sig = self.calcsigma_clray(nn=nn,
+                            saturation_protect=saturation_protect,
+                            automask=automask, **kwargs)
+    if return_nndist == True:
+      return sig
+    else:
+      return sig[0]
+
+
+  def calcnlpar_clsq(self, **kwargs):
+    return nlpar_cl.NLPAR.calcnlpar_cl(self, **kwargs)
+
+  def calcsigma_clsq(self, **kwargs):
+    return nlpar_cl.NLPAR.calcsigma_cl(self, **kwargs)
+
+  def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normalize_d=False,
+                      gpu_id = None, verbose=2, **kwargs):
+    self.patternfile = self.getinfileobj()
+    self.sigmann = nn
+
+    if self.sigmann > 7:
+      print("Sigma optimization search limited to a search radius <= 7")
+      print("The search radius has been clipped to 7")
+      nn = 7
+      self.sigmann = nn
+
+    if gpu_id is None:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+
+      target_mem = 0
+      gpu_id = 0
+      count = 0
+
+      for gpu in clparams.gpu:
+        gmem = gpu.max_mem_alloc_size
+        if target_mem < gmem:
+          gpu_id = count
+          target_mem = gmem
+        count += 1
+    else:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      gpu_id = min(len(clparams.gpu)-1, gpu_id)
+
+    cudavis = ''
+    for cdgpu in range(len(clparams.gpu)):
+      cudavis += str(cdgpu) + ','
+
+    #print(gpu_id)
+    ngpuwrker = 6
+    clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
+    clparams.get_queue()
+    if clparams.gpu[gpu_id].host_unified_memory:
+      return nlpar_cl.NLPAR.calcsigma_cl(self, nn=nn, saturation_protect=saturation_protect,
+                                         automask=automask,
+                                         normalize_d=normalize_d,
+                                         gpu_id=gpu_id, **kwargs)
+
+    target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 3
+    max_mem = clparams.gpu[gpu_id].global_mem_size * 0.75
+    if target_mem * ngpuwrker > max_mem:
+      target_mem = max_mem / ngpuwrker
+
+    patternfile = self.getinfileobj()
+
+    nrows = np.int64(self.nrows)  # np.uint64(patternfile.nRows)
+    ncols = np.int64(self.ncols)  # np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    npat_point = int(pwidth * pheight)
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight, pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight, pwidth), dtype=np.uint8)
+
+    mask = self.mask.astype(np.float32)
+
+    chunks = self._calcchunks([pwidth, pheight], ncols, nrows, target_bytes=target_mem,
+                              col_overlap=nn, row_overlap=nn)
+
+
+    jobqueue = []
+
+    for rowchunk in range(chunks[1]):
+      rstart = chunks[3][rowchunk, 0]
+      rend = chunks[3][rowchunk, 1]
+      nrowchunk = rend - rstart
+
+      rstartcalc = nn if (rowchunk > 0) else 0
+      rendcalc = nrowchunk - nn if (rowchunk < (chunks[1] - 1)) else nrowchunk
+      nrowcalc = np.int64(rendcalc - rstartcalc)
+
+      for colchunk in range(chunks[0]):
+        cstart = chunks[2][colchunk, 0]
+        cend = chunks[2][colchunk, 1]
+        ncolchunk = cend - cstart
+
+        cstartcalc = nn if (colchunk > 0) else 0
+        cendcalc = ncolchunk - nn if (colchunk < (chunks[0] - 1)) else ncolchunk
+        ncolcalc = np.int64(cendcalc - cstartcalc)
+
+        jobqueue.append(NLPARGPUJob([colchunk, rowchunk], \
+                                    [cstart, cend, rstart, rend], \
+                                    [cstartcalc, cendcalc, rstartcalc, rendcalc]))
+
+
+    # wrker = NLPARGPUWorker(actorid=1, gpu_id=gpu_id, cudavis=cudavis)
+    # job = jobqueue[0]
+    #
+    # data = wrker.runsigma_chunk(job, nlparobj=self, saturation_protect=saturation_protect)
+    #
+    # return data
+
+    ngpu_per_wrker = float(1.0 / ngpuwrker)
+
+    # if verbose >=1:
+    #   print("lambda:", self.lam, "search radius:", self.searchradius, "dthresh:", self.dthresh)
+
+    ray.shutdown()
+    rayclust = ray.init(
+      num_cpus=int(ngpuwrker),
+      num_gpus=1,
+      # dashboard_host = RAYIPADDRESS,
+      _node_ip_address=RAYIPADDRESS,  # "0.0.0.0",
+      runtime_env={"env_vars":
+                     {"PYTHONPATH": os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                      }},
+      logging_level=logging.WARNING, )  # Supress INFO messages from ray.
+
+    nlpar_remote = ray.put(self)
+
+    nnn = int((2 * nn + 1) ** 2)
+    sigma = np.zeros((nrows, ncols), dtype=np.float32) + 1e18
+    dist = np.zeros((nrows, ncols, nnn), dtype=np.float32)
+    countnn = np.zeros((nrows, ncols, nnn), dtype=np.float32)
+
+    idlewrker = []
+    busywrker = []
+    tasks = []
+
+    for w in range(ngpuwrker):
+      idlewrker.append(NLPARGPUWorker.options(num_cpus=float(0.99), num_gpus=ngpu_per_wrker).remote(
+        actorid=w, gpu_id=gpu_id, cudavis=cudavis))
+
+    njobs = len(jobqueue)
+    ndone = 0
+
+    while ndone < njobs:
+      if len(jobqueue) > 0:
+        if len(idlewrker) > 0:
+          wrker = idlewrker.pop()
+          job = jobqueue.pop()
+
+          tasks.append(wrker.runsigma_chunk.remote(job, nlparobj=nlpar_remote, saturation_protect=saturation_protect))
+          busywrker.append(wrker)
+      if len(tasks) > 0:
+        donetasks, stillbusy = ray.wait(tasks, num_returns=len(busywrker), timeout=0.1)
+
+        for tsk in donetasks:
+          indx = tasks.index(tsk)
+          message, job, newdata = ray.get(tsk)
+          if message == 'Done':
+
+            sigmachunk = newdata[0]
+            distchunk = newdata[1]
+            countchunk = newdata[2]
+            cstart = job.cstart
+            cend = job.cend
+            ncolchunk = job.ncolchunk
+            rstart = job.rstart
+            rend = job.rend
+            nrowchunk = job.nrowchunk
+
+            rstartcalc = job.rstartcalc
+            cstartcalc = job.cstartcalc
+            nrowcalc = job.nrowcalc
+            ncolcalc = job.ncolcalc
+
+            countnn[rstart:rend, cstart:cend] = countchunk[0:int(ncolchunk * nrowchunk), :].reshape(nrowchunk,
+                                                                                        ncolchunk, nnn)
+            dist[rstart:rend, cstart:cend] = distchunk[0:int(ncolchunk * nrowchunk), :].reshape(nrowchunk, ncolchunk,
+                                                                                                nnn)
+            sigma[rstart:rend, cstart:cend] = np.minimum(sigma[rstart:rend, cstart:cend], sigmachunk)
+
+            idlewrker.append(busywrker.pop(indx))
+            tasks.remove(tsk)
+            ndone += 1
+          if verbose >= 2:
+            print("tiles complete: ", ndone,"/", njobs,sep='', end='\r'  )
+    if verbose >= 2:
+      print('\n', end='')
+    return sigma, dist, countnn
+
+  def _sigmachunkcalc_cl(self, data, calclim, clparams=None, saturation_protect=True):
+    nn = self.sigmann
+
+    data = np.ascontiguousarray(data)
+    ctx = clparams.ctx
+    prg = clparams.prg
+    clparams.get_queue()
+
+
+    mf = clparams.memflags
+    clvectlen = 16
+
+    cstart = calclim.cstart
+    cend = calclim.cend
+    ncolchunk = calclim.ncolchunk
+    rstart = calclim.rstart
+    rend = calclim.rend
+    nrowchunk = calclim.nrowchunk
+
+    rstartcalc = calclim.rstartcalc
+    cstartcalc = calclim.cstartcalc
+    nrowcalc = calclim.nrowcalc
+    ncolcalc = calclim.ncolcalc
+
+    pheight = data.shape[1]
+    pwidth = data.shape[2]
+    npat_point = pheight*pwidth
+
+    mask = self.mask
+
+    npad = clvectlen * int(np.ceil(mask.size/clvectlen))
+    maskpad = np.zeros((npad) , np.float32)
+    maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32)
+    mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad)
+
+    npatsteps = int(maskpad.size/clvectlen)
+
+    #return chunks, chunksize
+    mxchunk = data.shape[0]
+
+    npadmx = clvectlen * int(np.ceil(mxchunk*npat_point/ clvectlen))
+
+    datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+
+    nnn = int((2 * nn + 1) ** 2)
+
+    #dist_local = cl.LocalMemory(nnn*npadmx*4)
+    dist_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk*nnn*4))
+    distchunk = np.zeros((mxchunk, nnn), dtype=np.float32)
+    #count_local = cl.LocalMemory(nnn*npadmx*4)
+    count_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk * nnn * 4))
+    countchunk = np.zeros((mxchunk, nnn), dtype=np.float32)
+
+    mxval = data.max()
+    if saturation_protect == False:
+      mxval += 1.0
+    else:
+      mxval *= 0.9961
+
+    evnt = cl.enqueue_fill_buffer(clparams.queue, datapad_gpu, np.float32(mxval+10), 0,int(4*npadmx))
+
+    szdata = data.size
+    npad = clvectlen * int(np.ceil(szdata / clvectlen))
+    tic = timer()
+    #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10)
+    #datapad[0:szdata] = data.reshape(-1)
+    data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data)
+
+    if data.dtype.type is np.float32:
+      prg.nlloadpat32flt(clparams.queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt])
+    if data.dtype.type is np.ubyte:
+      prg.nlloadpat8bit(clparams.queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt])
+    if data.dtype.type is np.uint16:
+      prg.nlloadpat16bit(clparams.queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt])
+    toc = timer()
+    #print(toc - tic)
+
+    sigmachunk = np.zeros((nrowchunk, ncolchunk ), dtype=np.float32)
+
+
+    sigmachunk_gpu =  cl.Buffer(ctx, mf.WRITE_ONLY, size=sigmachunk.nbytes)
+    cl.enqueue_barrier(clparams.queue)
+    prg.calcsigma(clparams.queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
+                           datapad_gpu, mask_gpu,sigmachunk_gpu,
+                           dist_local, count_local,
+                           np.int64(nn), np.int64(npatsteps), np.int64(npat_point),
+                           np.float32(mxval) )
+
+    cl.enqueue_barrier(clparams.queue)
+    prg.normd(clparams.queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
+                    sigmachunk_gpu,
+                    count_local, dist_local,
+                    np.int64(nn))
+    clparams.queue.finish()
+
+    cl.enqueue_copy(clparams.queue, distchunk, dist_local, is_blocking=False)
+    cl.enqueue_copy(clparams.queue, countchunk, count_local,  is_blocking=False)
+    cl.enqueue_copy(clparams.queue, sigmachunk, sigmachunk_gpu,  is_blocking=True)
+
+    #sigmachunk_gpu.release()
+
+    clparams.queue.finish()
+
+    dist_local.release()
+    count_local.release()
+    datapad_gpu.release()
+    clparams.queue.flush()
+    clparams.queue = None
+    #self.sigma = sigma
+    return sigmachunk, distchunk, countchunk
+
+
+
+  def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True,
+                filename=None, fileout=None, reset_sigma=False, backsub = False, rescale = False,
+                verbose = 2, gpu_id = None, **kwargs):
+
+    if lam is not None:
+      self.lam = lam
+
+    self.saturation_protect = saturation_protect
+
+    if dthresh is not None:
+      self.dthresh = dthresh
+    if self.dthresh is None:
+      self.dthresh = 0.0
+
+    if searchradius is not None:
+      self.searchradius = searchradius
+
+    if self.searchradius > 10:
+      print("NLPAR on GPU is limited to a search radius <= 10")
+      print("The search radius has been clipped to 10")
+      searchradius = 10
+      self.searchradius = searchradius
+
+    lam = np.float32(self.lam)
+    dthresh = np.float32(self.dthresh)
+    sr = np.int64(self.searchradius)
+
+    if filename is not None:
+      self.setfile(filepath=filename)
+
+    self.patternfile = self.getinfileobj()
+    self.setoutfile(self.patternfile, filepath=fileout)
+    self.patternfileout = self.getoutfileobj()
+
+    nrows = np.int64(self.nrows)  # np.uint64(patternfile.nRows)
+    ncols = np.int64(self.ncols)  # np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(self.patternfile.patternW)
+    pheight = np.uint64(self.patternfile.patternH)
+    npat_point = int(pwidth * pheight)
+
+    if reset_sigma:
+      self.sigma = None
+
+    if np.asarray(self.sigma).size == 1 and (self.sigma is not None):
+      tmp = np.asarray(self.sigma)[0]
+      self.sigma = np.zeros((nrows, ncols), dtype=np.float32) + tmp
+      calcsigma = False
+
+    if self.sigma is not None:
+      shpsigma = np.asarray(self.sigma).shape
+      if (shpsigma[0] != nrows) and (shpsigma[1] != ncols):
+        self.sigma = None
+
+    if self.sigma is None:
+      self.sigma = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, gpu_id=gpu_id)[0]
+
+    sigma = np.asarray(self.sigma).astype(np.float32)
+
+
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight, pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight, pwidth), dtype=np.uint8)
+
+    scalemethod = 'clip'
+    self.rescale = False
+    if rescale == True:
+      self.rescale = True
+      if np.issubdtype(self.patternfileout.filedatatype, np.integer):
+        mxval = np.iinfo(self.patternfileout.filedatatype).max
+        scalemethod = 'fullscale'
+      else:  # not int, so no rescale.
+        self.rescale = False
+
+    ngpuwrker = 6
+    clparams = openclparam.OpenClParam()
+    clparams.get_gpu()
+    if gpu_id is None:
+      target_mem = 0
+      gpu_id = 0
+      count = 0
+
+      for gpu in clparams.gpu:
+        gmem = gpu.max_mem_alloc_size
+        if target_mem < gmem:
+          gpu_id = count
+          target_mem = gmem
+        count += 1
+    else:
+      gpu_id = min(len(clparams.gpu)-1, gpu_id)
+    cudavis = ''
+    for cdgpu in range(len(clparams.gpu)):
+        cudavis += str(cdgpu) + ','
+
+    # print(gpu_id)
+    clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
+    clparams.get_queue()
+    if clparams.gpu[gpu_id].host_unified_memory:
+      return nlpar_cl.NLPAR.calcnlpar_cl(self, saturation_protect=saturation_protect,
+                                         automask=automask,
+                                         filename=filename,
+                                         fileout=fileout,
+                                         reset_sigma=reset_sigma,
+                                         backsub = backsub,
+                                         rescale = rescale,
+                                         gpu_id= gpu_id)
+
+    target_mem = clparams.gpu[gpu_id].max_mem_alloc_size//3
+    max_mem = clparams.gpu[gpu_id].global_mem_size*0.75
+    if target_mem*ngpuwrker > max_mem:
+      target_mem = max_mem/ngpuwrker
+    #print(target_mem/1.0e9)
+
+    chunks = self._calcchunks([pwidth, pheight], ncols, nrows, target_bytes=target_mem,
+                              col_overlap=sr, row_overlap=sr)
+
+    nnn = int((2 * sr + 1) ** 2)
+    jobqueue = []
+
+
+    for rowchunk in range(chunks[1]):
+        rstart = chunks[3][rowchunk, 0]
+        rend = chunks[3][rowchunk, 1]
+        nrowchunk = rend - rstart
+
+        rstartcalc = sr if (rowchunk > 0) else 0
+        rendcalc = nrowchunk - sr if (rowchunk < (chunks[1] - 1)) else nrowchunk
+        nrowcalc = np.int64(rendcalc - rstartcalc)
+
+        for colchunk in range(chunks[0]):
+            cstart = chunks[2][colchunk, 0]
+            cend = chunks[2][colchunk, 1]
+            ncolchunk = cend - cstart
+
+            cstartcalc = sr if (colchunk > 0) else 0
+            cendcalc = ncolchunk - sr if (colchunk < (chunks[0] - 1)) else ncolchunk
+            ncolcalc = np.int64(cendcalc - cstartcalc)
+
+            jobqueue.append( NLPARGPUJob([colchunk, rowchunk],\
+                    [cstart,cend, rstart, rend],\
+                  [cstartcalc,cendcalc, rstartcalc, rendcalc ]))
+
+
+    if verbose >=1:
+      print("lambda:", lam, "search radius:", sr, "dthresh:", dthresh)
+    ngpu_per_wrker = float(1.0/ngpuwrker)
+    ray.shutdown()
+
+    rayclust = ray.init(
+      num_cpus=int(ngpuwrker),
+      num_gpus=1,
+      # dashboard_host = RAYIPADDRESS,
+      _node_ip_address=RAYIPADDRESS,  # "0.0.0.0",
+      runtime_env={"env_vars":
+                     {"PYTHONPATH": os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                      }},
+      logging_level=logging.WARNING,)  # Supress INFO messages from ray.
+
+    nlpar_remote = ray.put(self)
+
+    idlewrker = []
+    busywrker = []
+    tasks = []
+
+    for w in range(ngpuwrker):
+        idlewrker.append(NLPARGPUWorker.options(num_cpus=float(0.99), num_gpus=ngpu_per_wrker).remote(
+                actorid=w, gpu_id=gpu_id, cudavis=cudavis))
+
+    njobs = len(jobqueue)
+    ndone = 0
+    while ndone < njobs:
+        if len(jobqueue) > 0:
+            if len(idlewrker) > 0:
+                wrker = idlewrker.pop()
+                job = jobqueue.pop()
+
+                tasks.append(wrker.runnlpar_chunk.remote(job, nlparobj=nlpar_remote))
+                busywrker.append(wrker)
+        if len(tasks) > 0:
+            donetasks, stillbusy = ray.wait(tasks, num_returns=len(busywrker), timeout=0.1)
+
+            for tsk in donetasks:
+                indx = tasks.index(tsk)
+                message, job, newdata = ray.get(tsk)
+                if message == 'Done':
+                  idlewrker.append(busywrker.pop(indx))
+                  tasks.remove(tsk)
+                  ndone += 1
+                  if verbose >= 2:
+                    print("tiles complete: ", ndone, "/", njobs, sep='', end='\r')
+    if verbose >= 2:
+      print('\n', end='')
+    return str(self.patternfileout.filepath)
+
+  def _nlparchunkcalc_cl(self, data, calclim, clparams=None):
+    data = np.ascontiguousarray(data)
+    ctx = clparams.ctx
+    prg = clparams.prg
+    clparams.get_queue()
+
+
+    mf = clparams.memflags
+    clvectlen = 16
+
+    cstart = calclim.cstart
+    cend = calclim.cend
+    ncolchunk = calclim.ncolchunk
+    rstart = calclim.rstart
+    rend = calclim.rend
+    nrowchunk = calclim.nrowchunk
+
+    rstartcalc = calclim.rstartcalc
+    cstartcalc = calclim.cstartcalc
+    nrowcalc = calclim.nrowcalc
+    ncolcalc = calclim.ncolcalc
+
+    pheight = data.shape[1]
+    pwidth = data.shape[2]
+
+
+
+    lam = np.float32(self.lam)
+    sr = np.int64(self.searchradius)
+    nnn = int((2 * sr + 1) ** 2)
+    dthresh = np.float32(self.dthresh)
+    #print(chunks[2], chunks[3])
+    #print(lam, sr, dthresh)
+
+
+    # precalculate some needed arrays for the GPU
+    mxval = data.max()
+    if self.saturation_protect == False:
+      mxval += 1.0
+    else:
+      mxval *= 0.9961
+
+    shpdata = data.shape
+    npat_point = int(shpdata[-1]*shpdata[-2])
+    npat = shpdata[0]
+
+    npadmx = clvectlen * int(np.ceil(float(npat)*npat_point/ clvectlen))
+    data_gpu = cl.Buffer(ctx, mf.READ_ONLY| mf.COPY_HOST_PTR, hostbuf=data)
+    datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+    datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+
+    fill1 = cl.enqueue_fill_buffer(clparams.queue, datapad_gpu, np.float32(mxval + 10), 0, int(4 * npadmx))
+    fill2 = cl.enqueue_fill_buffer(clparams.queue, datapadout_gpu, np.float32(0.0), 0, int(4 * npadmx))
+
+
+    mask = self.mask.astype(np.float32)
+
+    npad = clvectlen * int(np.ceil(mask.size / clvectlen))
+    maskpad = np.zeros((npad), np.float32) - 1  # negative numbers will indicate a clvector overflow.
+    maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32)
+    mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad)
+    npatsteps = int(maskpad.size / clvectlen)
+
+    sigmachunk = np.ascontiguousarray(self.sigma[rstart:rend, cstart:cend].astype(np.float32))
+    sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk)
+
+    szdata = data.size
+    cl.enqueue_barrier(clparams.queue)
+    data_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
+    if data.dtype.type is np.float32:
+      prg.nlloadpat32flt(clparams.queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu)#, wait_for=[fill1,fill2])
+    if data.dtype.type is np.ubyte:
+      prg.nlloadpat8bit(clparams.queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu)#, wait_for=[fill1,fill2])
+    if data.dtype.type is np.uint16:
+      prg.nlloadpat16bit(clparams.queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu)#, wait_for=[fill1,fill2])
+
+    calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64)
+    crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim)
+    cl.enqueue_barrier(clparams.queue)
+    data_gpu.release()
+    prg.calcnlpar(clparams.queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None,
+                  # prg.calcnlpar(clparams.queue, (1, 1), None,
+                  datapad_gpu,
+                  mask_gpu,
+                  sigmachunk_gpu,
+                  crlimits_gpu,
+                  datapadout_gpu,
+                  np.int64(sr),
+                  np.int64(npatsteps),
+                  np.int64(npat_point),
+                  np.float32(mxval),
+                  np.float32(1.0 / lam ** 2),
+                  np.float32(dthresh))
+
+    data = data.astype(np.float32)  # prepare to receive data back from GPU
+    data.reshape(-1)[:] = 0.0
+    data = data.reshape(nrowchunk, ncolchunk, pheight, pwidth)
+    cl.enqueue_copy(clparams.queue, data, datapadout_gpu,  is_blocking=True)
+    sigmachunk_gpu.release()
+    clparams.queue.finish()
+    if self.rescale == True:
+      for i in range(data.shape[0]):
+        temp = data[i, :, :]
+        temp -= temp.min()
+        temp *= np.float32(mxval) / temp.max()
+        data[i, :, :] = temp
+    data = data[rstartcalc: rstartcalc + nrowcalc, cstartcalc: cstartcalc + ncolcalc, :, :]
+    data = data.reshape(nrowcalc * ncolcalc, pheight, pwidth)
+
+    clparams.queue = None
+
+    return data
+
+
+
+@ray.remote
+class NLPARGPUWorker:
+  def __init__(self, actorid=0, gpu_id=None, cudavis = '0'):
+      # sys.path.append(path.dirname(path.dirname(__file__)))  # do this to help Ray find the program files
+      # import openclparam # do this to help Ray find the program files
+      # device, context, queue, program, mf
+      # self.dataout = None
+      # self.indxstart = None
+      # self.indxend = None
+      # self.rate = None
+      os.environ["CUDA_VISIBLE_DEVICES"] = cudavis
+      self.actorID = actorid
+      self.openCLParams = openclparam.OpenClParam()
+
+      self.openCLParams.gpu_id = gpu_id
+      self.openCLParams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
+
+
+          #elf.openCLParams = None
+
+  def runsigma_chunk(self,gpujob, nlparobj=None, **kwargs):
+    if gpujob is None:
+        #time.sleep(0.001)
+        return 'Bored', (None, None, None)
+    try:
+        # print(type(self.openCLParams.ctx))
+        gpujob._starttime()
+        #time.sleep(random.uniform(0, 1.0))
+
+        #if self.openCLParams is not None:
+        #    self.openCLParams.get_queue()
+        data, xyloc = nlparobj.patternfile.read_data(patStartCount=[[gpujob.cstart, gpujob.rstart],
+                                                                    [gpujob.ncolchunk, gpujob.nrowchunk]],
+                                                                    convertToFloat=False, returnArrayOnly=True)
+
+        newdata = nlparobj._sigmachunkcalc_cl(data, gpujob, clparams=self.openCLParams, **kwargs)
+
+        if self.openCLParams.queue is not None:
+            print("queue still here")
+            self.openCLParams.queue.finish()
+            self.openCLParams.queue = None
+
+        gpujob._endtime()
+        return 'Done', gpujob, newdata
+    except Exception as e:
+        print(e)
+        gpujob.rate = None
+        return "Error", gpujob, e
+
+
+  def runnlpar_chunk(self, gpujob, nlparobj=None):
+
+    if gpujob is None:
+        #time.sleep(0.001)
+        return 'Bored', (None, None, None)
+    try:
+        # print(type(self.openCLParams.ctx))
+        gpujob._starttime()
+        #time.sleep(random.uniform(0, 1.0))
+
+        #if self.openCLParams is not None:
+        #    self.openCLParams.get_queue()
+        data, xyloc = nlparobj.patternfile.read_data(patStartCount=[[gpujob.cstart, gpujob.rstart],
+                                                                    [gpujob.ncolchunk, gpujob.nrowchunk]],
+                                                                    convertToFloat=False, returnArrayOnly=True)
+
+        newdata = nlparobj._nlparchunkcalc_cl(data, gpujob, clparams=self.openCLParams)
+
+        if self.openCLParams.queue is not None:
+            print("queue still here")
+            self.openCLParams.queue.finish()
+            self.openCLParams.queue = None
+
+        nlparobj.patternfileout.write_data(newpatterns=newdata,
+                                       patStartCount=[[gpujob.cstart + gpujob.cstartcalc,
+                                                       gpujob.rstart + gpujob.rstartcalc],
+                                                      [gpujob.ncolcalc, gpujob.nrowcalc]],
+                                       flt2int='clip', scalevalue=1.0)
+
+        gpujob._endtime()
+        return 'Done', gpujob, None
+    except Exception as e:
+        print(e)
+        gpujob.rate = None
+        return "Error", gpujob, e
+
+
+class NLPARGPUJob:
+  def __init__(self, jobid, chunk, calclim):
+    self.jobid = jobid
+    if self.jobid is not None:
+      self.cstart = chunk[0]
+      self.cend = chunk[1]
+      self.ncolchunk = self.cend -  self.cstart
+      self.rstart = chunk[2]
+      self.rend = chunk[3]
+      self.nrowchunk = self.rend - self.rstart
+      self.cstartcalc = calclim[0]
+      self.cendcalc = calclim[1]
+      self.ncolcalc = self.cendcalc - self.cstartcalc
+      self.rstartcalc = calclim[2]
+      self.rendcalc = calclim[3]
+      self.nrowcalc = self.rendcalc - self.rstartcalc
+      self.starttime = 0.0
+      self.endtime = 0.0
+      self.extime = 0.0
+
+  def _starttime(self):
+    self.starttime = timer()
+
+  def _endtime(self):
+    self.endtime = timer()
+    self.extime += self.endtime - self.starttime
+    #self.rate = self.npat / (self.extime + 1e-12)
+
+
+
diff --git a/pyebsdindex/opencl/openclparam.py b/pyebsdindex/opencl/openclparam.py
index b1f8e70..fcef230 100644
--- a/pyebsdindex/opencl/openclparam.py
+++ b/pyebsdindex/opencl/openclparam.py
@@ -55,17 +55,40 @@ def __init__(self, gpu_id=0):
 
   def get_platform(self):
     self.platform = cl.get_platforms()[0]
-  def get_gpu(self):
+    return self.platform
+  def get_gpu(self, get_integrated_and_discrete=False):
 
     if self.platform is None:
       self.get_platform()
 
-    self.gpu = self.platform.get_devices(device_type=cl.device_type.GPU)
-    self.ngpu = len(self.gpu)
-    if len(self.gpu)-1 < self.gpu_id:
-      self.gpu_id = len(self.gpu)-1
-
-  def get_context(self, gpu_id=None):
+    gpu = self.platform.get_devices(device_type=cl.device_type.GPU)
+    if get_integrated_and_discrete == True: # get all GPU, regardless of integrated or not
+      self.gpu = gpu
+      self.ngpu = len(self.gpu)
+
+    else:
+      if len(gpu) == 1: # only one GPU -- keep it even if integrated.
+        self.gpu = gpu
+        self.ngpu = len(self.gpu)
+      elif len(gpu) > 1: # More than one gpu
+        gpukeep = []
+        gpudrop = []
+        for g in gpu:
+          if (g.host_unified_memory == 1): # these are integrated GPU
+            gpudrop.append(g)
+          else:
+            gpukeep.append(g) # these are discrete GPU
+        if len(gpukeep) > 0: # prefer to keep discrete
+          self.gpu = gpukeep
+        else:
+          self.gpu = gpudrop #but will take integrated if needed.
+      self.ngpu = len(self.gpu)
+    if len(self.gpu) - 1 < self.gpu_id:
+      self.gpu_id = len(self.gpu) - 1
+    return self.gpu
+
+
+  def get_context(self, gpu_id=None, kfile = 'clkernels.cl' ):
     if self.gpu is None:
       self.get_gpu()
 
@@ -77,12 +100,14 @@ def get_context(self, gpu_id=None):
     self.ctx = cl.Context(devices = [self.gpu[self.gpu_id]])
 
     kernel_location = path.dirname(__file__)
-    self.prg = cl.Program(self.ctx,open(path.join(kernel_location,'clkernels.cl')).read()).build()
+    self.prg = cl.Program(self.ctx,open(path.join(kernel_location,kfile)).read()).build()
     #print('ctx', self.gpu_id)
+    return self.ctx
   def get_queue(self, gpu_id=None):
     if self.ctx is None:
       self.get_context(gpu_id=None)
     if self.queue is None:
       self.queue = cl.CommandQueue(self.ctx)
+    return self.queue
 
 
diff --git a/pyebsdindex/opencl/test.cl b/pyebsdindex/opencl/test.cl
new file mode 100644
index 0000000..c9e8945
--- /dev/null
+++ b/pyebsdindex/opencl/test.cl
@@ -0,0 +1,67 @@
+
+
+float sum16(const float16 v1);
+
+// float sum16(const float16 *v1){
+//  float sum = 0.0;
+//  sum += v1[0].s0;
+//         sum += v1[0].s1;
+//         sum += v1[0].s2;
+//         sum += v1[0].s3;
+//         sum += v1[0].s4;
+//         sum += v1[0].s5;
+//         sum += v1[0].s6;
+//         sum += v1[0].s7;
+//         sum += v1[0].s8;
+//         sum += v1[0].s9;
+//         sum += v1[0].sa;
+//         sum += v1[0].sb;
+//         sum += v1[0].sc;
+//         sum += v1[0].sd;
+//         sum += v1[0].se;
+//         sum += v1[0].sf;
+//      return sum;
+
+// }
+
+
+float sum16(const float16 v1){
+ float sum = 0.0;
+ sum += v1.s0;
+        sum += v1.s1;
+        sum += v1.s2;
+        sum += v1.s3;
+        sum += v1.s4;
+        sum += v1.s5;
+        sum += v1.s6;
+        sum += v1.s7;
+        sum += v1.s8;
+        sum += v1.s9;
+        sum += v1.sa;
+        sum += v1.sb;
+        sum += v1.sc;
+        sum += v1.sd;
+        sum += v1.se;
+        sum += v1.sf;
+     return sum;
+
+}
+
+
+
+__kernel void testdot(float t){
+printf("%f\n",t ); 
+float16 d1 = (float16)(1.0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
+//float16 d2 = (float16)(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32);
+//float8 d11 = d1.lo;
+//float8 d22 = d2.lo;
+//float t;
+
+
+//float8 d1 = (float8) (1,2,3,4,4,3,2,1);
+//float8 d2 = (float8) (5,6,7,8,5,6,7,8);
+//t = d11.s0;
+t = sum16(d1);
+printf("%f",t ); 
+
+}
diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py
index 34eca45..60e19ec 100644
--- a/pyebsdindex/radon_fast.py
+++ b/pyebsdindex/radon_fast.py
@@ -20,12 +20,21 @@
 Author: David Rowenhorst;
 The US Naval Research Laboratory Date: 21 Aug 2020"""
 
-from os import environ
+#from os import environ
+import os
+from pathlib import PurePath, Path
 from timeit import default_timer as timer
 
 from numba import jit, prange
 import numpy as np
 
+tempdir = PurePath(Path.home())
+#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
+tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache')
+#tempdir = tempdir.joinpath('numbacache')
+Path(tempdir).mkdir(parents=True, exist_ok=True)
+os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep)
+
 RADEG = 180.0/np.pi
 DEGRAD = np.pi/180.0
 
diff --git a/pyebsdindex/rotlib.py b/pyebsdindex/rotlib.py
index 8133186..e6dfd2c 100644
--- a/pyebsdindex/rotlib.py
+++ b/pyebsdindex/rotlib.py
@@ -51,13 +51,16 @@
 
 import numpy as np
 import numba
-from os import environ
+import os
 import tempfile
-from pathlib import PurePath
+from pathlib import PurePath, Path
 import platform
-tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
-tempdir = tempdir.joinpath('numba')
-environ["NUMBA_CACHE_DIR"] = str(tempdir)
+tempdir = PurePath(Path.home())
+#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
+#tempdir = tempdir.joinpath('numbacache')
+tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache')
+Path(tempdir).mkdir(parents=True, exist_ok=True)
+os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep)
 
 P = 1
 eps = 1.0e-12 # used for "closeto" approximations in Numba code
diff --git a/pyebsdindex/tests/test_package.py b/pyebsdindex/tests/test_package.py
index 85244e1..2d2706b 100644
--- a/pyebsdindex/tests/test_package.py
+++ b/pyebsdindex/tests/test_package.py
@@ -39,7 +39,8 @@ def test_available_functionality_without_pyopencl():
 @pytest.mark.skipif(_pyopencl_installed, reason="pyopencl is installed")
 def test_unavailable_functionality_without_pyopencl():
     with pytest.raises(ImportError):
-        from pyebsdindex.opencl.band_detect_cl import BandDetect
+        raise ImportError()
+        #from pyebsdindex.opencl.band_detect_cl import BandDetect
 
 
 @pytest.mark.skipif(not _ray_installed, reason="ray is not installed")
diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py
index addfe9b..67fcbd1 100644
--- a/pyebsdindex/tripletvote.py
+++ b/pyebsdindex/tripletvote.py
@@ -24,16 +24,35 @@
 indexing.
 """
 
-from os import environ
-from pathlib import PurePath
+import os
+from pathlib import PurePath, Path
 import platform
-import tempfile
+#import tempfile
 from timeit import default_timer as timer
 import math
 
 import numpy as np
 import numba
 
+#keep this around for profiling numba functions
+#import ctypes
+#import time
+
+# # Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi
+# get_system_clock = ctypes.pythonapi._PyTime_GetSystemClock
+# as_seconds_double = ctypes.pythonapi._PyTime_AsSecondsDouble
+#
+# # Set the argument types and return types of the functions
+# get_system_clock.argtypes = []
+# get_system_clock.restype = ctypes.c_int64
+#
+# as_seconds_double.argtypes = [ctypes.c_int64]
+# as_seconds_double.restype = ctypes.c_double
+# @numba.jit(nopython=True, cache=True,fastmath=True,parallel=False)
+# def ntime()-> np.float64:
+#   return np.float64(as_seconds_double(get_system_clock()))
+### END of numba timer ####
+
 from pyebsdindex import crystal_sym, rotlib, crystallometry
 
 
@@ -41,9 +60,12 @@
 
 RADEG = 180.0/np.pi
 
-tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
-tempdir = tempdir.joinpath('numba')
-environ["NUMBA_CACHE_DIR"] = str(tempdir)
+tempdir = PurePath(Path.home())
+#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir())
+tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache')
+#tempdir = tempdir.joinpath('numbacache')
+Path(tempdir).mkdir(parents=True, exist_ok=True)
+os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep)
 
 def addphase(libtype=None, phasename=None,
              spacegroup=None,
@@ -462,14 +484,27 @@ def build_trip_lib(self):
   def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose=0):
     tic0 = timer()
     nfam = self.polefamilies.shape[0]
-    bandnorms = np.squeeze(band_norms)
-    n_bands = np.reshape(bandnorms, (-1,3)).shape[0] #np.int64(bandnorms.size/3)
+    #bandnorms = np.squeeze(band_norms)
+    if band_norms.ndim == 2:
+      bandnorms = band_norms[np.newaxis, ...]
+    else:
+      bandnorms = band_norms
+
+
+
+
+    n_bands = bandnorms.shape[1]
+    npats = bandnorms.shape[0]
     if band_intensity is None:
-      band_intensity = np.ones((n_bands))
+      band_intensity = np.ones((npats, n_bands))
+
+    if band_intensity.ndim == 1:
+      band_intensity = band_intensity[np.newaxis, ...]
+
     tic = timer()
-    bandangs = np.abs(bandnorms.dot(bandnorms.T))
-    bandangs = np.clip(bandangs, -1.0, 1.0)
-    bandangs  = np.arccos(bandangs)*RADEG
+    #bandangs = np.abs(bandnorms.dot(bandnorms.T))
+    #bandangs = np.clip(bandangs, -1.0, 1.0)
+    #bandangs  = np.arccos(bandangs)*RADEG
 
 
     tripangs = self.angtriplets['angles']
@@ -477,9 +512,13 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose
     pairangs = self.angpairs['angles']
     pairfam = self.angpairs['familyid']
 
-    accumulator, bandFam, bandRank, band_cm, accumulator_nw = self._tripvote_numba(bandangs, self.lut, self.angTol, tripangs, tripid, nfam, n_bands)
-    #accumulator, bandFam, bandRank, band_cm = self._pairvote_numba(bandangs, self.angTol, pairangs, pairfam,
-    #                                                               nfam, n_bands)
+    accumulator, bandFam, bandRank, band_cm, accumulator_nw \
+      = self._tripvote_numba(bandnorms, band_intensity, self.lut, self.angTol, tripangs, tripid, nfam)
+    #accumulator, bandFam, bandRank, band_cm, accumulator_nw \
+    #  = self._pairvote_numba(bandnorms, band_intensity, self.angTol, pairangs, pairfam, nfam)
+
+    bandRank_arg = np.argsort(bandRank, axis=1).astype(np.int64)
+
     if verbose > 2:
       print('band Vote time:',timer() - tic)
       if verbose > 3:
@@ -494,26 +533,49 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose
 
 
     sumaccum = np.sum(accumulator)
-    bandRank_arg = np.argsort(bandRank).astype(np.int64) # n_bands - np.arange(n_bands, dtype=np.int64) #
+    #bandRank_arg = np.argsort(bandRank).astype(np.int64) # n_bands - np.arange(n_bands, dtype=np.int64) #
     test  = 0
     fit = 1000.0
     nMatch = -1
     avequat = np.zeros(4, dtype=np.float32)
     avequat[0] = 1.0
-    polematch = np.zeros([n_bands], dtype = int)-1
+    polematch = -1*np.ones([n_bands], dtype = int)
     whGood = -1
 
-    angTable = self.completelib['angTable']
-    sztable = angTable.shape
-    famIndx = self.completelib['famIndex']
+    libAngTable = self.completelib['angTable']
+    sztable = libAngTable.shape
+    libFamIndx = self.completelib['famIndex']
+    libFamID = self.completelib['familyid']
     nFam = self.completelib['nFamily']
-    polesCart = self.completelib['polesCart']
+    libPolesCart = self.completelib['polesCart']
     angTol = self.angTol
     n_band_early = np.int64(self.nband_earlyexit)
 
     # this will check the vote, and return the exact band matching to specific poles of the best fitting solution.
     fit, polematch, nMatch, whGood, ij, R, fitb = \
-      self._assign_bands_nb(polesCart, bandRank_arg, bandFam, famIndx, nFam, angTable, bandnorms, angTol, n_band_early)
+      self._assign_bands_nb(libPolesCart, libAngTable, libFamIndx, nFam, angTol, n_band_early, bandnorms, bandRank_arg, bandFam)
+
+    # check how often the indexed band matched the top voting band family.
+    acc_correct =  np.sum(np.array((polematch >= 0) & (self.completelib['familyid'][polematch] == bandFam), dtype=int),axis=1).astype(np.int32)
+
+
+    # accumulator = accumulator[0, ...]
+    # bandFam = bandFam[0, ...]
+    # bandRank = bandRank[0, ...]
+    # band_cm = band_cm[0, ...]
+    # accumulator_nw = accumulator_nw[0, ...]
+    # bandnorms = bandnorms[0, ...]
+    # bandRank_arg = bandRank_arg[0, ...]
+    # fit = fit[0]
+    # polematch = polematch[0, ...]
+    # nMatch = nMatch[0]
+    # whGood = whGood[0, ...]
+    # whGood = whGood[0:nMatch]
+    # ij = ij[0, ...]
+    #
+    # fitb = fitb[0, ...]
+    # acc_correct = acc_correct[0,...]
+
 
     if verbose > 3:
       #print(rotlib.om2qu(R))
@@ -522,7 +584,8 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose
       #print(fitb)
       print('___Assigned Band___')
       print(self.completelib['familyid'][polematch])
-    acc_correct = np.sum( np.array(self.completelib['familyid'][polematch] == bandFam).astype(int)).astype(int)
+
+    #acc_correct = np.sum( np.array(self.completelib['familyid'][polematch] == bandFam).astype(int), axis = (1)).astype(int)
     if verbose > 2:
       #print(polematch)
       #print(fit, fitb, fitb[whGood])
@@ -530,53 +593,28 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose
 
     tic = timer()
 
-    cm2 = 0.0
-    if nMatch >=2:
-      if self.high_fidelity == True:
-
-        score = accumulator[[self.completelib['familyid'][polematch[whGood]]], [whGood]]
-        score /= accumulator_nw[[self.completelib['familyid'][polematch[whGood]]], [whGood]] + 1.0e-6
-        score = np.squeeze(score)
-
-        srt = np.flip(np.argsort(score))
-        #print(srt+1)
-        #print(score[srt])
-        #print(band_intensity[whGood[srt]])
-
-        #srt = np.flip(np.argsort(band_intensity[whGood]))
-        whgood6 = whGood[srt[0:np.min([6, whGood.shape[0]])]]
-        #if verbose > 2:
-        #  print("Good bands:", whGood+1)
-        #  print("Fit Bands: ", whgood6+1)
-        #weights6 = score[srt[0:np.min([6, whGood.shape[0]])]]
-        weights6 = band_intensity[whgood6]
-        weights6 -= weights6.min()
-        weights6 *= 1/weights6.max()
-        #weights6 += 1
-        weights6 = np.exp(weights6**2)
-        #weights6 = np.exp(weights6)
-
-        pflt6 = (np.asarray(polesCart[polematch[whgood6], :], dtype=np.float64))
-        bndnorm6 = (np.asarray(bandnorms[whgood6, :], dtype=np.float64))
-
-        avequat, fit = self._refine_orientation_quest(bndnorm6, pflt6, weights=weights6)
-        #fitfull = self._fitcheck(avequat,
-        #                         np.asarray(bandnorms[whGood, :]),  np.asarray(polesCart[polematch[whGood], :] ))
 
+    if self.high_fidelity == True:
 
-        fit = np.arccos(np.clip(fit, -1.0, 1.0))*RADEG
+      weights = self._calc_quest_weights(libFamID, accumulator, accumulator_nw, polematch, band_intensity, nfit=6)
+      avequat, fit = self._refine_orientation_quest(libPolesCart, bandnorms, polematch, weights = weights)
+      fit = np.arccos(np.clip(fit, -1.0, 1.0))*RADEG
+    else:
+      avequat = rotlib.om2qu(R)
 
-      else:
-        avequat = rotlib.om2qu(R)
-      whmatch = np.nonzero(polematch >= 0)[0]
-      cm = np.mean(band_cm[whmatch])
-      whfam = self.completelib['familyid'][polematch[whmatch]]
-      cm2 = np.sum(accumulator[[whfam], [whmatch]]).astype(np.float32)
-      cm2 /= np.sum(accumulator.clip(1))
+    cm2 = self._calc_cm(accumulator, polematch, libFamID)
 
     if verbose > 2:
       print('refinement: ', timer() - tic)
       print('all: ',timer() - tic0)
+
+    # avequat = avequat[0,...]
+    # fit = fit[0]
+    # cm2 = cm2[0]
+    # polematch = polematch[0,...]
+    # nMatch = nMatch[0]
+    # ij = ij[0,...]
+    # acc_correct = acc_correct[0,...]
     return avequat, fit, cm2, polematch, nMatch, ij, acc_correct #sumaccum
 
   def _symrotpoles(self, pole, crystalmats):
@@ -798,18 +836,52 @@ def _refine_orientation(self, bandnorms, whGood, polematch):
     #print('fitting: ',timer() - tic)
     return avequat, fit
 
-  def _refine_orientation_quest(self, bandnorms, polecartmatch, weights = None):
+  @staticmethod
+  @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False)
+  def _calc_quest_weights( libComFamID, accumulator, accumulator_nw, polematch, band_intensity, nfit=6):
+    npats = accumulator.shape[0]
+    nbands = polematch.shape[-1]
+    weights = np.zeros((npats, nbands), dtype=np.float32)
+
+    for p in range(npats):
+      score = np.full((nbands), -1.0, np.float32)
+      pmatch = np.ravel(polematch[p, :]).astype(np.int64)
+      whGood = (np.nonzero(pmatch >= 0)[0]).astype(np.int64)
+
+      if whGood.size < 2:
+        continue
+
+      acc = accumulator[p, ...]
+      acc_nw = accumulator_nw[p,...]
+      for q in range(whGood.size):
+        whg = np.uint64(whGood[q])
+        a1indx = np.uint64(libComFamID[pmatch[whg]])
+        score[whg] = acc[a1indx, whg]
+        score[whg] /= max(acc_nw[a1indx, whg], 1.0e-12)
+
+      srt = np.flip(np.argsort(score))
+
+      srt6 = srt[0:min(nfit, whGood.size)]
+      for s in srt6:
+        weights[p, s] = band_intensity[p, s]
+
+    return weights
+
+  def _refine_orientation_quest(self, libpolecart, bandnorms, polesmatch, weights = None):
     tic = timer()
+    npats = bandnorms.shape[0]
+    nbands = bandnorms.shape[-1]
 
 
     if weights is None:
-      weights = np.ones((bandnorms.shape[0]), dtype=np.float64)
+      weights = np.ones((npats, nbands), dtype=np.float64)
+      weights *= (polesmatch > 0).astype(np.float32)
+
     weightsn = np.asarray(weights, dtype=np.float64)
-    weightsn /= np.sum(weightsn)
+    weightsn /= np.maximum(np.sum(weightsn, axis=1), 1e-12).reshape(-1, 1)
     #print(weightsn)
-    pflt = np.asarray(polecartmatch, dtype=np.float64)
+    pflt = np.asarray(libpolecart[polesmatch, :], dtype=np.float64)
     bndnorm = np.asarray(bandnorms, dtype=np.float64)
-
     avequat, fit = self._orientation_quest_nb(pflt, bndnorm, weightsn)
 
     return avequat, fit
@@ -818,431 +890,502 @@ def _refine_orientation_quest(self, bandnorms, polecartmatch, weights = None):
   @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False)
   def _orientation_quest_nb(polescart, bandnorms, weights):
     # this uses the Quaternion Estimator AKA quest algorithm.
+    # this has been adjusted to work with a batch of matching vectors.
+    eps = 1.0e-7
+    npats = bandnorms.shape[0]
+    nbands = bandnorms.shape[-1]
 
-    pflt = np.asarray(polescart, dtype=np.float64)
-    bndnorm = np.asarray(bandnorms, dtype=np.float64)
-    npoles = pflt.shape[0]
-    wn = (np.asarray(weights, dtype=np.float64)).reshape(npoles, 1)
-
-    # wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood)  #(weights[whGood]).reshape(nGood,1)
-    wn /= np.sum(wn)
-
-    I = np.zeros((3, 3), dtype=np.float64)
-    I[0, 0] = 1.0;
-    I[1, 1] = 1.0;
-    I[2, 2] = 1.0
-    q = np.zeros((4), dtype=np.float64)
-
-    B = (wn * bndnorm).T @ pflt
-    S = B + B.T
-    z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis=0), dtype=np.float64)
-    S2 = S @ S
-    det = np.linalg.det(S)
-    k = (S[1, 1] * S[2, 2] - S[1, 2] * S[2, 1]) + (S[0, 0] * S[2, 2] - S[0, 2] * S[2, 0]) + (
-          S[0, 0] * S[1, 1] - S[1, 0] * S[0, 1])
-    sig = 0.5 * (S[0, 0] + S[1, 1] + S[2, 2])
-    sig2 = sig * sig
-    d = z.T @ S2 @ z
-    c = det + (z.T @ S @ z)
-    b = sig2 + (z.T @ z)
-    a = sig2 - k
-
-    lam = 1.0
-    tol = 1.0e-12
-    iter = 0
-    dlam = 1e6
-    # for i in range(10):
-    while (dlam > tol) and (iter < 10):
-      lam0 = lam
-      lam = lam - (lam ** 4 - (a + b) * lam ** 2 - c * lam + (a * b + c * sig - d)) / (
-            4 * lam ** 3 - 2 * (a + b) * lam - c)
-      dlam = np.fabs(lam0 - lam)
-      iter += 1
-
-    beta = lam - sig
-    alpha = lam ** 2 - sig2 + k
-    gamma = (lam + sig) * alpha - det
-    X = np.asarray((alpha * I + beta * S + S2), dtype=np.float64) @ z
-    qn = np.float64(0.0)
-    qn += gamma ** 2 + X[0] ** 2 + X[1] ** 2 + X[2] ** 2
-    qn = np.sqrt(qn)
-    q[0] = gamma
-    q[1:4] = X[0:3]
-    q /= qn
-    if (np.sign(gamma) < 0):
-      q *= -1.0
-
-    # polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1)
-    # pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64)
-    return q, lam  # , pdot
+    qout = np.zeros((npats, 4), dtype=np.float64)
+    qout[:, 0] = 1.0
+    fitout = np.full((npats), np.pi, dtype=np.float64)
+
+    for p in range(npats):
+
+      whgood = (np.nonzero(weights[p, :] > eps)[0]).astype(np.int64)
+      if whgood.size < 2:
+        continue
+
+      wn = np.zeros((whgood.size, 1), dtype=np.float64)
+      bndnorm = np.zeros((whgood.size, 3), dtype=np.float64)
+      pflt = np.zeros((whgood.size, 3), dtype=np.float64)
+      for j in range(whgood.size):
+        whg = np.uint64(whgood[j])
+        wn[j, 0] = weights[p, whg]
+        pflt[j,:] = np.asarray(polescart[p, whg, :], dtype=np.float64)
+        bndnorm[j,:] = np.asarray(bandnorms[p, whg, :], dtype=np.float64)
+      wn /= np.sum(wn)
+
+
+      npoles = pflt.shape[0]
+
+      # wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood)  #(weights[whGood]).reshape(nGood,1)
+
+      I = np.zeros((3, 3), dtype=np.float64)
+      I[0, 0] = 1.0;
+      I[1, 1] = 1.0;
+      I[2, 2] = 1.0
+      q = np.zeros((4), dtype=np.float64)
+
+      B = (wn * bndnorm).T @ pflt
+      S = B + B.T
+      z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis=0), dtype=np.float64)
+      S2 = S @ S
+      det = np.linalg.det(S)
+      k = (S[1, 1] * S[2, 2] - S[1, 2] * S[2, 1]) + (S[0, 0] * S[2, 2] - S[0, 2] * S[2, 0]) + (
+              S[0, 0] * S[1, 1] - S[1, 0] * S[0, 1])
+      sig = 0.5 * (S[0, 0] + S[1, 1] + S[2, 2])
+      sig2 = sig * sig
+      d = z.T @ S2 @ z
+      c = det + (z.T @ S @ z)
+      b = sig2 + (z.T @ z)
+      a = sig2 - k
+
+      lam = 1.0
+      tol = 1.0e-12
+      iter = 0
+      dlam = 1e6
+      # for i in range(10):
+      while (dlam > tol) and (iter < 10):
+        lam0 = lam
+        lam = lam - (lam ** 4 - (a + b) * lam ** 2 - c * lam + (a * b + c * sig - d)) / (
+                4 * lam ** 3 - 2 * (a + b) * lam - c)
+        dlam = np.fabs(lam0 - lam)
+        iter += 1
+
+      beta = lam - sig
+      alpha = lam ** 2 - sig2 + k
+      gamma = (lam + sig) * alpha - det
+      X = np.asarray((alpha * I + beta * S + S2), dtype=np.float64) @ z
+      qn = np.float64(0.0)
+      qn += gamma ** 2 + X[0] ** 2 + X[1] ** 2 + X[2] ** 2
+      qn = np.sqrt(qn)
+      q[0] = gamma
+      q[1:4] = X[0:3]
+      q /= qn
+      if (np.sign(gamma) < 0):
+        q *= -1.0
+
+      qout[p, :] = q
+      fitout[p] = lam
+      # polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1)
+      # pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64)
+    return qout, fitout 
 
 
   @staticmethod
   @numba.jit(nopython=True, cache=True,fastmath=True,parallel=False)
-  def _tripvote_numba(bandangs, LUT, angTol, tripAngles, tripID, nfam, n_bands):
+  def _tripvote_numba(bandnorms, band_intensity, LUT, angTol, tripAngles, tripID, nfam):
+    timing1 = 0.0
+    timing2 = 0.0
+    npats = bandnorms.shape[0]
+    n_bands = bandnorms.shape[1]
     LUTTemp = np.asarray(LUT).copy()
-    accumulator = np.zeros((nfam, n_bands), dtype=np.float32)
-    accumulatorW = np.zeros((nfam, n_bands), dtype=np.float32)
+
     tshape = np.shape(tripAngles)
     ntrip = int(tshape[0])
+
+    accumulator = np.zeros((npats, nfam, n_bands), dtype=np.float32)
+    accumulatorW = np.zeros((npats, nfam, n_bands), dtype=np.float32)
+    mxvote = np.zeros((npats,n_bands), dtype=np.int32)
+    tvotes = np.zeros((npats,n_bands), dtype=np.int32)
+    band_cm = np.zeros((npats,n_bands), dtype=np.float32)
+    bandRank = np.zeros((npats,n_bands), dtype=np.float32)
+    bandFam = np.zeros((npats,n_bands), dtype=np.int32)
+
     #count  = 0.0
     #angTest2 = np.zeros(ntrip, dtype=numba.boolean)
     #angTest2 = np.empty(ntrip,dtype=numba.boolean)
-    angTest0 = np.zeros((3), dtype=np.float32)
-    for i in range(n_bands):
-      for j in range(i + 1,n_bands):
-        for k in range(j + 1,n_bands):
-          angtri = np.array([bandangs[i,j],bandangs[i,k],bandangs[j,k]], dtype=np.float32)
-          srt = angtri.argsort(kind='quicksort') #np.array(np.argsort(angtri), dtype=numba.int64)
-          srt2 = np.asarray(LUTTemp[:,srt[0],srt[1],srt[2]], dtype=np.int64).copy()
-          unsrtFID = np.argsort(srt2,kind='quicksort').astype(np.int64)
-          angtriSRT = np.asarray(angtri[srt])
-
-          #angTest0 = (np.abs(tripAngles - angtriSRT)).astype(np.float32)
-          #print(angTest0.shape)
-          #angTest = (angTest0 <= angTol)#.astype(np.int)
-
-          for q in range(ntrip):
-            #print('____')
-            #print(tripAngles[q,:], angtriSRT)
-
-            test1 = np.abs(tripAngles[q,0] - angtriSRT[0])
-            if test1 > angTol:
-              continue
-            else:
-              angTest0[0] = test1
-
-            test2 = np.abs(tripAngles[q, 1] - angtriSRT[1])
-            if test2 > angTol:
-              continue
-            else:
-              angTest0[1] = test2
-
-            test3 = np.abs(tripAngles[q, 2] - angtriSRT[2])
-            if test3 > angTol:
-              continue
-            else:
-              angTest0[2] = test3
-
-            #print('here')
-            #angTest2 = (angTest[q,0] + angTest[q,1] + angTest[q,2]) == 3
-            #if angTest2:
-            f = tripID[q,:]
-            f = f[unsrtFID]
-            #print(angTest0[q,:])
-            w1 = ( angTol - 0.5*(angTest0[0] + angTest0[1]) )
-            w2 = ( angTol - 0.5*(angTest0[0] + angTest0[2]) )
-            w3 = ( angTol - 0.5*(angTest0[1] + angTest0[2]) )
-            #print(w1, w2, w3)
-            accumulatorW[f[0],i] += w1
-            accumulatorW[f[1],j] += w2
-            accumulatorW[f[2],k] += w3
-            accumulator[f[0], i] += 1
-            accumulator[f[1], j] += 1
-            accumulator[f[2], k] += 1
-            t1 = False
-            t2 = False
-            t3 = False
-            if np.abs(angtriSRT[0] - angtriSRT[1]) < angTol:
-              accumulatorW[f[0],j] += w1
-              accumulatorW[f[1],i] += w2
-              accumulatorW[f[2],k] += w3
-              accumulator[f[0], j] += 1
-              accumulator[f[1], i] += 1
-              accumulator[f[2], k] += 1
-              t1 = True
-            if np.abs(angtriSRT[1] - angtriSRT[2]) < angTol:
-              accumulatorW[f[0],i] += w1
-              accumulatorW[f[1],k] += w2
-              accumulatorW[f[2],j] += w3
-              accumulator[f[0], i] += 1
-              accumulator[f[1], k] += 1
-              accumulator[f[2], j] += 1
-              t2 = True
-            if np.abs(angtriSRT[2] - angtriSRT[0]) < angTol:
-              accumulatorW[f[0],k] += w1
-              accumulatorW[f[1],j] += w2
-              accumulatorW[f[2],i] += w3
-              accumulator[f[0], k] += 1
-              accumulator[f[1], j] += 1
-              accumulator[f[2], i] += 1
-              t3 = True
-            if (t1 and t2 and t3):
-              accumulatorW[f[0],k] += w1
-              accumulatorW[f[1],i] += w2
-              accumulatorW[f[2],j] += w3
-
-              accumulatorW[f[0], j] += w1
-              accumulatorW[f[1], k] += w2
-              accumulatorW[f[2], i] += w3
-
-              accumulator[f[0], k] += 1
-              accumulator[f[1], i] += 1
-              accumulator[f[2], j] += 1
-
-              accumulator[f[0], j] += 1
-              accumulator[f[1], k] += 1
-              accumulator[f[2], i] += 1
-
-
-    mxvote = np.zeros(n_bands, dtype=np.int32)
-    tvotes = np.zeros(n_bands, dtype=np.int32)
-    band_cm = np.zeros(n_bands, dtype=np.float32)
-    for q in range(n_bands):
-      mxvote[q] = np.amax(accumulatorW[:,q])
-      tvotes[q] = np.sum(accumulatorW[:,q])
-
-    for i in range(n_bands):
-      if tvotes[i] < 1:
-        band_cm[i] = 0.0
-      else:
-        srt = np.argsort(accumulatorW[:,i])
-        band_cm[i] = (accumulatorW[srt[-1],i] - accumulatorW[srt[-2],i]) / tvotes[i]
-
-    bandRank = np.zeros(n_bands, dtype=np.float32)
-    bandFam = np.zeros(n_bands, dtype=np.int32)
-    for q in range(n_bands):
-      bandFam[q] = np.argmax(accumulatorW[:,q])
-    bandRank = (n_bands - np.arange(n_bands)) / n_bands * band_cm * mxvote
+    for p in range(npats):
+      bandangs = np.abs(bandnorms[p,...].dot(bandnorms[p,...].T))
+      bandangs = np.clip(bandangs, -1.0, 1.0)
+      bandangs = np.arccos(bandangs) * RADEG
+      for i in range(n_bands):
+        if band_intensity[p,i] < 1e-6: # invalid band
+          bandangs[i,:] = 10000.0
+          bandangs[:, i] = 10000.0
 
+      angTest0 = np.zeros((3), dtype=np.float32)
+      for i in range(n_bands):
+        for j in range(i + 1,n_bands):
+          for k in range(j + 1,n_bands):
+            # tic = ntime()
+            angtri = np.array([bandangs[i,j],bandangs[i,k],bandangs[j,k]], dtype=np.float32)
+            #srt = np.array(np.argsort(angtri), dtype=numba.int64)
+            # I am doing the above, but is MUCH faster for just the three numbers to hard code
+            srt = np.array([0,1,2], dtype=np.uint64)
+            if angtri[srt[0]] > angtri[srt[2]]:
+              srt[2], srt[0] = srt[0], srt[2]
+            if angtri[srt[0]] > angtri[srt[1]]:
+              srt[1], srt[0] = srt[0], srt[1]
+            if angtri[srt[1]] > angtri[srt[2]]:
+              srt[2], srt[1] = srt[1], srt[2]
+            ##### end hard code argsrt ######
+
+            srt2 = np.asarray(LUTTemp[:,srt[0],srt[1],srt[2]], dtype=np.int64).copy()
+            #unsrtFID = np.argsort(srt2,kind='quicksort').astype(np.int64)
+            #again, hard coding in the above for speed.
+            unsrtFID = np.array([0,1,2], dtype=np.uint64)
+            if srt2[unsrtFID[0]] > srt2[unsrtFID[2]]:
+              unsrtFID[2], unsrtFID[0] = unsrtFID[0], unsrtFID[2]
+            if srt2[unsrtFID[0]] > srt2[unsrtFID[1]]:
+              unsrtFID[1], unsrtFID[0] = unsrtFID[0], unsrtFID[1]
+            if srt2[unsrtFID[1]] > srt2[unsrtFID[2]]:
+              unsrtFID[2], unsrtFID[1] = unsrtFID[1], unsrtFID[2]
+            ##### end hard code argsrt ######
+            angtriSRT = np.asarray(angtri[srt], dtype = np.float32)
+
+            #angTest0 = (np.abs(tripAngles - angtriSRT)).astype(np.float32)
+            #print(angTest0.shape)
+            #angTest = (angTest0 <= angTol)#.astype(np.int)
+            # toc = ntime()
+            # timing1 += toc - tic
+            # toc = ntime()
+            for q in range(ntrip):
+              #print('____')
+              #print(tripAngles[q,:], angtriSRT)
+
+              test1 = np.abs(tripAngles[q,0] - angtriSRT[0])
+              if test1 > angTol:
+                continue
+              else:
+                angTest0[0] = test1
+
+              test2 = np.abs(tripAngles[q, 1] - angtriSRT[1])
+              if test2 > angTol:
+                continue
+              else:
+                angTest0[1] = test2
+
+              test3 = np.abs(tripAngles[q, 2] - angtriSRT[2])
+              if test3 > angTol:
+                continue
+              else:
+                angTest0[2] = test3
+
+              #print('here')
+              #angTest2 = (angTest[q,0] + angTest[q,1] + angTest[q,2]) == 3
+              #if angTest2:
+              f = tripID[q,:]
+              f = f[unsrtFID]
+              #print(angTest0[q,:])
+              w1 = ( angTol - 0.5*(angTest0[0] + angTest0[1]) )
+              w2 = ( angTol - 0.5*(angTest0[0] + angTest0[2]) )
+              w3 = ( angTol - 0.5*(angTest0[1] + angTest0[2]) )
+              #print(w1, w2, w3)
+              accumulatorW[p,f[0],i] += w1
+              accumulatorW[p,f[1],j] += w2
+              accumulatorW[p,f[2],k] += w3
+              accumulator[p,f[0], i] += 1
+              accumulator[p,f[1], j] += 1
+              accumulator[p,f[2], k] += 1
+              t1 = False
+              t2 = False
+              t3 = False
+              if np.abs(angtriSRT[0] - angtriSRT[1]) < angTol:
+                accumulatorW[p,f[0],j] += w1
+                accumulatorW[p,f[1],i] += w2
+                accumulatorW[p,f[2],k] += w3
+                accumulator[p,f[0], j] += 1
+                accumulator[p,f[1], i] += 1
+                accumulator[p,f[2], k] += 1
+                t1 = True
+              if np.abs(angtriSRT[1] - angtriSRT[2]) < angTol:
+                accumulatorW[p,f[0],i] += w1
+                accumulatorW[p,f[1],k] += w2
+                accumulatorW[p,f[2],j] += w3
+                accumulator[p,f[0], i] += 1
+                accumulator[p,f[1], k] += 1
+                accumulator[p,f[2], j] += 1
+                t2 = True
+              if np.abs(angtriSRT[2] - angtriSRT[0]) < angTol:
+                accumulatorW[p,f[0],k] += w1
+                accumulatorW[p,f[1],j] += w2
+                accumulatorW[p,f[2],i] += w3
+                accumulator[p,f[0], k] += 1
+                accumulator[p,f[1], j] += 1
+                accumulator[p,f[2], i] += 1
+                t3 = True
+              if (t1 and t2 and t3):
+                accumulatorW[p,f[0],k] += w1
+                accumulatorW[p,f[1],i] += w2
+                accumulatorW[p,f[2],j] += w3
+
+                accumulatorW[p,f[0], j] += w1
+                accumulatorW[p,f[1], k] += w2
+                accumulatorW[p,f[2], i] += w3
+
+                accumulator[p,f[0], k] += 1
+                accumulator[p,f[1], i] += 1
+                accumulator[p,f[2], j] += 1
+
+                accumulator[p,f[0], j] += 1
+                accumulator[p,f[1], k] += 1
+                accumulator[p,f[2], i] += 1
+
+            # timing2 += ntime() - toc
+
+      for q in range(n_bands):
+        mxvote[p,q] = np.amax(accumulatorW[p,:,q])
+        tvotes[p,q] = np.sum(accumulatorW[p,:,q])
+      #for i in range(n_bands):
+        if tvotes[p,q] < 1:
+          band_cm[p,q] = 0.0
+        else:
+          srt = np.argsort(accumulatorW[p,:,q])
+          band_cm[p,q] = (accumulatorW[p,srt[-1],q] - accumulatorW[p,srt[-2],q]) / (tvotes[p,q])
+      #for q in range(n_bands):
+        bandFam[p,q] = np.argmax(accumulatorW[p,:,q])
+      bandRank[p,:] = (n_bands - np.arange(n_bands)) / n_bands * band_cm[p,:] * mxvote[p,:]
+    # print(timing1, timing2)
     return accumulatorW, bandFam, bandRank, band_cm, accumulator
 
   @staticmethod
   @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False)
-  def _pairvote_numba(bandangs, angTol, pairAngs, pairID, nfam, n_bands):
+  def _pairvote_numba(bandnorms,band_intensity, angTol, pairAngs, pairID, nfam):
+
+    npats = bandnorms.shape[0]
+    n_bands = bandnorms.shape[1]
+
+    accumulator = np.zeros((npats, nfam, n_bands), dtype=np.float32)
+    accumulatorW = np.zeros((npats, nfam, n_bands), dtype=np.float32)
+    mxvote = np.zeros((npats, n_bands), dtype=np.int32)
+    tvotes = np.zeros((npats, n_bands), dtype=np.int32)
+    band_cm = np.zeros((npats, n_bands), dtype=np.float32)
+    bandRank = np.zeros((npats, n_bands), dtype=np.float32)
+    bandFam = np.zeros((npats, n_bands), dtype=np.int32)
 
-    accumulator = np.zeros((nfam, n_bands), dtype=np.float32)
     pairshape = np.shape(pairAngs)
     npair = int(pairshape[0])
-    count = 0.0
+    #count = 0.0
     # angTest2 = np.zeros(ntrip, dtype=numba.boolean)
     # angTest2 = np.empty(ntrip,dtype=numba.boolean)
-    for i in range(n_bands):
-      for j in range(i + 1, n_bands):
-          bandangpair = bandangs[i, j]
-          angTest = (np.abs(pairAngs - bandangpair)).astype(np.float32)
-          # print(angTest0.shape)
-
-
-          for q in range(npair):
-            if angTest[q] <= angTol:
-              w1 = (angTol - angTest[q])
-
-              # print(w1, w2, w3)
-              accumulator[pairID[q,0], i] += w1
-              accumulator[pairID[q,1], i] += w1
-              accumulator[pairID[q,0], j] += w1
-              accumulator[pairID[q,1], j] += w1
-
-
-    mxvote = np.zeros(n_bands, dtype=np.int32)
-    tvotes = np.zeros(n_bands, dtype=np.int32)
-    band_cm = np.zeros(n_bands, dtype=np.float32)
-    for q in range(n_bands):
-      mxvote[q] = np.amax(accumulator[:, q])
-      tvotes[q] = np.sum(accumulator[:, q])
-
-    for i in range(n_bands):
-      if tvotes[i] < 1:
-        band_cm[i] = 0.0
-      else:
-        srt = np.argsort(accumulator[:, i])
-        band_cm[i] = (accumulator[srt[-1], i] - accumulator[srt[-2], i]) / tvotes[i]
+    for p in range(npats):
+      bandangs = np.abs(bandnorms[p, ...].dot(bandnorms[p, ...].T))
+      bandangs = np.clip(bandangs, -1.0, 1.0)
+      bandangs = np.arccos(bandangs) * RADEG
+      for i in range(n_bands):
+        if band_intensity[p,i] < 1e-6: # invalid band
+          bandangs[i,:] = 10000.0
+          bandangs[:, i] = 10000.0
+      for i in range(n_bands):
+        for j in range(i + 1, n_bands):
+            bandangpair = bandangs[i, j]
+            angTest = (np.abs(pairAngs - bandangpair)).astype(np.float32)
+            # print(angTest0.shape)
+            for q in range(npair):
+              if angTest[q] <= angTol:
+                w1 = (angTol - angTest[q])
+
+                # print(w1, w2, w3)
+                accumulator[p, pairID[q, 0], i] += 1
+                accumulator[p, pairID[q, 1], i] += 1
+                accumulator[p, pairID[q, 0], j] += 1
+                accumulator[p, pairID[q, 1], j] += 1
+
+                accumulatorW[p, pairID[q,0], i] += w1
+                accumulatorW[p, pairID[q,1], i] += w1
+                accumulatorW[p, pairID[q,0], j] += w1
+                accumulatorW[p, pairID[q,1], j] += w1
+
+      for q in range(n_bands):
+        mxvote[p, q] = np.amax(accumulatorW[p, :, q])
+        tvotes[p, q] = np.sum(accumulatorW[p, :, q])
+        # for i in range(n_bands):
+        if tvotes[p, q] < 1:
+          band_cm[p, q] = 0.0
+        else:
+          srt = np.argsort(accumulatorW[p, :, q])
+          band_cm[p, q] = (accumulatorW[p, srt[-1], q] - accumulatorW[p, srt[-2], q]) / (tvotes[p, q])
 
-    bandRank = np.zeros(n_bands, dtype=np.float32)
-    bandFam = np.zeros(n_bands, dtype=np.int32)
-    for q in range(n_bands):
-      bandFam[q] = np.argmax(accumulator[:, q])
-    bandRank = (n_bands - np.arange(n_bands)) / n_bands * band_cm * mxvote
+        bandFam[p, q] = np.argmax(accumulatorW[p, :, q])
+      bandRank[p, :] = (n_bands - np.arange(n_bands)) / n_bands * band_cm[p, :] * mxvote[p, :]
 
-    return accumulator, bandFam, bandRank, band_cm
+    return accumulatorW, bandFam, bandRank, band_cm, accumulator
 
   @staticmethod
   @numba.jit(nopython=True, cache=True, fastmath=True,parallel=False)
-  def _assign_bands_nb(polesCart, bandRank_arg, familyLabel, famIndx, nFam, angTable, bandnorms, angTol, n_band_early):
+  def _assign_bands_nb(libPolesCart, libAngTable, libFamIndx, nFam, angTol, n_band_early, bandnorms, bandRank_arg, bandFam ):
+
     eps = np.float32(1.0e-12)
-    nBnds = bandnorms.shape[0]
-
-    whGood_out = np.zeros(nBnds, dtype=np.int64)-1
-
-
-    nMatch = np.int64(-1)
-    Rout = np.zeros((1,3,3), dtype=np.float32)
-    #Rout[0,0,0] = 1.0; Rout[0,1,1] = 1.0; Rout[0,2,2] = 1.0
-    polematch_out = np.zeros((nBnds),dtype=np.int64) - 1
-    pflt = np.asarray(polesCart, dtype=np.float32)
-    bndnorm = np.transpose(np.asarray(bandnorms, dtype=np.float32))
-
-    fit = np.float32(360.0)
-    fitout = np.float32(360.0)
-    fitbout = np.zeros((nBnds))
-    R = np.zeros((1, 3, 3), dtype=np.float32)
-
-    #fit = np.float32(360.0)
-    #whGood = np.zeros(nBnds, dtype=np.int64) - 1
-    nMatch = np.int64(0)
-
-    ij = (-1,-1,-1,-1)
-
-    for ii in range(nBnds-1):
-      for jj in range(ii+1,nBnds):
-        #print(ii,jj)
-        polematch = np.zeros((nBnds),dtype=np.int64) - 1
-
-        bnd1 = bandRank_arg[-1 - ii]
-        bnd2 = bandRank_arg[-1 - jj]
-
-        v1 = bandnorms[bnd1,:]
-        f1 = familyLabel[bnd1]
-        v2 = bandnorms[bnd2,:]
-        f2 = familyLabel[bnd2]
-        ang01 = (np.dot(v1,v2))
-        #if ang01 < 0:
-        #  v2 *= -1
-        #  ang01 *= -1
-
-        if ang01 > np.float32(1.0):
-          ang01 = np.float32(1.0-eps)
-        if ang01 < np.float32(-1.0):
-          ang01 = np.float32(-1.0+eps)
-
-        paralleltest = np.arccos(np.fabs(ang01)) * RADEG
-
-        if paralleltest < angTol:  # the two poles are parallel, send in another two poles if available.
-          continue
-        ang01 = np.arccos(ang01) * RADEG
-        wh12 = np.nonzero(np.abs(angTable[famIndx[f1],famIndx[f2]:np.int64(famIndx[f2] + nFam[f2])] - ang01) < angTol)[0]
-
-        n12 = wh12.size
-        if n12 == 0:
-          continue
-
-        wh12 += famIndx[f2]
-        p1 = polesCart[famIndx[f1], :]
-
-        n12 = wh12.size
-        v1v2c = np.cross(v1,v2)
-        v1v2c /= np.linalg.norm(v1v2c)
-        # attempt to see which solution gives the best match to all the poles
-        # best is measured as the number of poles that are within tolerance,
-        # divided by the angular deviation.
-        # Use the TRIAD method for finding the rotation matrix
-
-        Rtry = np.zeros((n12,3,3), dtype = np.float32)
-
-        #score = np.zeros((n01), dtype = np.float32)
-        A = np.zeros((3,3), dtype = np.float32)
-        B = np.zeros((3,3), dtype = np.float32)
-        #AB = np.zeros((3,3),dtype=np.float32)
-        b2 = np.cross(v1,v1v2c)
-        B[0,:] = v1
-        B[1,:] = v1v2c
-        B[2,:] = b2
-        A[:,0] = p1
-        score = -1.0
-
-        for i in range(n12):
-          p2 = polesCart[wh12[i], :]
-          ntemp = np.linalg.norm(p2) + 1.0e-35
-          p2 = p2 / ntemp
-          p1p2c = np.cross(p1,p2)
-          ntemp = np.linalg.norm(p1p2c) + 1.0e-35
-          p1p2c = p1p2c / ntemp
-          A[:,1] = p1p2c
-          A[:,2] = np.cross(p1,p1p2c)
-          AB = (A.dot(B))
-          Rtry[i,:,:] = AB
-
-          testp = (AB.dot(bndnorm))
-          test =  (pflt.dot(testp))
-          #print(test.shape)
-          angfitTry = np.zeros((nBnds), dtype = np.float32)
-          #angfitTry = np.max(test,axis=0)
-          #print(test.shape)
-          for j in range(nBnds):
-            angfitTry[j] = np.max(test[:,j])
-            angfitTry[j] = -1.0 if angfitTry[j] < -1.0 else angfitTry[j]
-            angfitTry[j] =  1.0 if angfitTry[j] >  1.0 else angfitTry[j]
-
-          #angfitTry = np.clip(np.amax(test,axis=0),-1.0,1.0)
-
-          angfitTry = np.arccos(angfitTry) * RADEG
-          whMatch = np.nonzero(angfitTry < angTol)[0]
-          nmatch = whMatch.size
-          #scoreTry = np.float32(nmatch) * np.mean(np.abs(angTol - angfitTry[whMatch]))
-          scoreTry = np.float32(nmatch) /( np.mean(angfitTry[whMatch]) + 1e-6)
-          if scoreTry > score:
-            score = scoreTry
-            angFit = angfitTry
+    pflt = np.asarray(libPolesCart, dtype=np.float32)
+
+
+    npats = bandnorms.shape[0]
+    nBnds = bandnorms.shape[1]
+
+    whGood_out = np.zeros((npats, nBnds), dtype=np.int64)-1
+    Rout = np.zeros((npats,3,3), dtype=np.float32)
+    Rout[:,0,0] = 1.0 ; Rout[:,1,1] = 1.0 ; Rout[:,2,2] = 1.0 ;
+    polematch_out = np.full((npats, nBnds),-1, dtype=np.int64) - 1
+
+    fitout = np.full(npats, 360.0, dtype=np.float32)
+    fitbout = np.full((npats, nBnds),360.0, dtype=np.float32)
+    nMatch = np.zeros(npats, dtype=np.int64) #np.int64(0)
+    ij = np.full((npats, 4), -1, np.int64)
+
+    for p in range(npats):
+      bndnorm = np.transpose(np.asarray(bandnorms[p,...], dtype=np.float32))
+      R = np.zeros((1, 3, 3), dtype=np.float32)
+      fit = np.float32(360.0)
+      #fit = np.float32(360.0)
+      #whGood = np.zeros(nBnds, dtype=np.int64) - 1
+
+      for ii in range(nBnds-1):
+        for jj in range(ii+1,nBnds):
+          #print(ii,jj)
+          polematch = np.zeros((nBnds),dtype=np.int64) - 1
+
+          bnd1 = bandRank_arg[p, -1 - ii]
+          bnd2 = bandRank_arg[p, -1 - jj]
+
+          v1 = bandnorms[p, bnd1,:]
+          f1 = bandFam[p, bnd1]
+          v2 = bandnorms[p, bnd2,:]
+          f2 = bandFam[p, bnd2]
+          ang01 = (np.dot(v1,v2))
+          #if ang01 < 0:
+          #  v2 *= -1
+          #  ang01 *= -1
+
+          if ang01 > np.float32(1.0):
+            ang01 = np.float32(1.0-eps)
+          if ang01 < np.float32(-1.0):
+            ang01 = np.float32(-1.0+eps)
+
+          paralleltest = np.arccos(np.fabs(ang01)) * RADEG
+
+          if paralleltest < angTol:  # the two poles are parallel, send in another two poles if available.
+            continue
+          ang01 = np.arccos(ang01) * RADEG
+          wh12 = np.nonzero(np.abs(libAngTable[libFamIndx[f1],libFamIndx[f2]:np.int64(libFamIndx[f2] + nFam[f2])] - ang01) < angTol)[0]
+
+          n12 = wh12.size
+          if n12 == 0:
+            continue
+
+          wh12 += libFamIndx[f2]
+          p1 = pflt[libFamIndx[f1], :]
+
+          n12 = wh12.size
+          v1v2c = np.cross(v1,v2)
+          v1v2c /= np.linalg.norm(v1v2c)
+          # attempt to see which solution gives the best match to all the poles
+          # best is measured as the number of poles that are within tolerance,
+          # divided by the angular deviation.
+          # Use the TRIAD method for finding the rotation matrix
+
+          Rtry = np.zeros((n12,3,3), dtype = np.float32)
+
+          #score = np.zeros((n01), dtype = np.float32)
+          A = np.zeros((3,3), dtype = np.float32)
+          B = np.zeros((3,3), dtype = np.float32)
+          #AB = np.zeros((3,3),dtype=np.float32)
+          b2 = np.cross(v1,v1v2c)
+          B[0,:] = v1
+          B[1,:] = v1v2c
+          B[2,:] = b2
+          A[:,0] = p1
+          score = -1.0
+
+          for i in range(n12):
+            p2 = pflt[wh12[i], :]
+            ntemp = np.linalg.norm(p2) + 1.0e-35
+            p2 = p2 / ntemp
+            p1p2c = np.cross(p1,p2)
+            ntemp = np.linalg.norm(p1p2c) + 1.0e-35
+            p1p2c = p1p2c / ntemp
+            A[:,1] = p1p2c
+            A[:,2] = np.cross(p1,p1p2c)
+            AB = (A.dot(B))
+            Rtry[i,:,:] = AB
+
+            testp = (AB.dot(bndnorm))
+            test =  (pflt.dot(testp))
+            #print(test.shape)
+            angfitTry = np.zeros((nBnds), dtype = np.float32)
+            #angfitTry = np.max(test,axis=0)
+            #print(test.shape)
             for j in range(nBnds):
-              polematch[j] = np.argmax(test[:,j]) * ( 2*np.int32(angfitTry[j] < angTol)-1)
-            R[0, :,:] = Rtry[i,:,:]
-
-
-        whGood = (np.nonzero(angFit < angTol)[0]).astype(np.int64)
-        nGood = max(np.int64(whGood.size), np.int64(0))
-
-        if nGood < 3:
-          continue
-          #return 360.0,-1,-1,-1
-          #whGood = -1*np.ones((1), dtype=np.int64)
-          #fit = np.float32(360.0)
-          #polematch[:] = -1
-          #nGood = np.int64(-1)
-        else:
-          fitb = angFit
-          #fit = np.mean(fitb[whGood])
-          fit = np.float32(0.0)
-          for q in range(nGood):
-            fit += np.float32(fitb[whGood[q]])
-          fit /= np.float32(nGood)
-
-
-        if nGood >= (n_band_early):
-          testout = testp
-
-          fitout = fit
-          fitbout = fitb
-          nMatch = nGood
-          whGood_out = whGood[:]
-          polematch_out = polematch[:]
-          Rout[0,:,:] = R[0,:,:]
-          ij  = (ii,jj,bnd1,bnd2)
-          break
-        else:
-          if nMatch < nGood:
-          #print((nMatch*(3.0-fitout)) , (nGood*(3.0-fit)))
-          #if (nMatch*(2.0-fitout)) < (nGood*(2.0-fit)):
+              angfitTry[j] = np.max(test[:,j])
+              angfitTry[j] = -1.0 if angfitTry[j] < -1.0 else angfitTry[j]
+              angfitTry[j] =  1.0 if angfitTry[j] >  1.0 else angfitTry[j]
+
+            #angfitTry = np.clip(np.amax(test,axis=0),-1.0,1.0)
+
+            angfitTry = np.arccos(angfitTry) * RADEG
+            whMatch = np.nonzero(angfitTry < angTol)[0]
+            nmatch = whMatch.size
+            #scoreTry = np.float32(nmatch) * np.mean(np.abs(angTol - angfitTry[whMatch]))
+            scoreTry = np.float32(nmatch) /( np.mean(angfitTry[whMatch]) + 1e-6)
+            if scoreTry > score:
+              score = scoreTry
+              angFit = angfitTry
+              for j in range(nBnds):
+                polematch[j] = np.argmax(test[:,j]) * ( 2*np.int32(angfitTry[j] < angTol)-1)
+              R[0, :,:] = Rtry[i,:,:]
+
+
+          whGood = (np.nonzero(angFit < angTol)[0]).astype(np.int64)
+          nGood = max(np.int64(whGood.size), np.int64(0))
+
+          if nGood < 3:
+            continue
+            #return 360.0,-1,-1,-1
+            #whGood = -1*np.ones((1), dtype=np.int64)
+            #fit = np.float32(360.0)
+            #polematch[:] = -1
+            #nGood = np.int64(-1)
+          else:
+            fitb = angFit
+            #fit = np.mean(fitb[whGood])
+            fit = np.float32(0.0)
+            for q in range(nGood):
+              fit += np.float32(fitb[whGood[q]])
+            fit /= np.float32(nGood)
+
+
+          if nGood >= (n_band_early):
             testout = testp
-            fitout = np.float32(fit)
-            fitbout = fitb
-            nMatch = nGood
-            whGood_out = whGood[:]
-            polematch_out = polematch[:]
-            Rout[0,:,:] = R[0,:,:]
-
-            ij = (ii,jj,bnd1,bnd2)
-
-          elif nMatch == nGood:
-          #elif (nMatch*(2.0-fitout)) == (nGood*(2.0-fit)):
-            if fitout > fit:
+            fitout[p] = np.float32(fit)
+            fitbout[p,...] = fitb
+            nMatch[p] = nGood
+            whGood_out[p,0:nGood] = whGood[:]
+            polematch_out[p,...] = polematch[:]
+            Rout[p,:,:] = R[0,:,:]
+            ij[p,:]  = np.asarray((ii,jj,bnd1,bnd2), dtype=np.int64)
+            break
+          else:
+            if nMatch[p] < nGood:
+            #print((nMatch*(3.0-fitout)) , (nGood*(3.0-fit)))
+            #if (nMatch*(2.0-fitout)) < (nGood*(2.0-fit)):
               testout = testp
-              fitout = np.float32(fit)
-              fitbout = fitb
-              nMatch = nGood
-              whGood_out = whGood[:]
-              polematch_out = polematch[:]
-              Rout[0,:,:] = R[0,:,:]
-
-              ij = (ii, jj, bnd1,bnd2)
-
-      #print('----')
-      #print(ij)
-
-      #print(testout.T)
-      #print(pflt[polematch_out, :])
-      if nMatch >= (n_band_early):
-        break
+              fitout[p] = np.float32(fit)
+              fitbout[p, ...] = fitb
+              nMatch[p] = nGood
+              whGood_out[p, 0:nGood] = whGood[:]
+              polematch_out[p, ...] = polematch[:]
+              Rout[p, :, :] = R[0, :, :]
+              ij[p, :] = np.asarray((ii, jj, bnd1, bnd2), dtype=np.int64)
+
+            elif nMatch[p] == nGood:
+            #elif (nMatch*(2.0-fitout)) == (nGood*(2.0-fit)):
+              if fitout[p] > fit:
+                testout = testp
+                fitout[p] = np.float32(fit)
+                fitbout[p, ...] = fitb
+                nMatch[p] = nGood
+                whGood_out[p, 0:nGood] = whGood[:]
+                polematch_out[p, ...] = polematch[:]
+                Rout[p, :, :] = R[0, :, :]
+                ij[p, :] = np.asarray((ii, jj, bnd1, bnd2), dtype=np.int64)
+
+        #print('----')
+        #print(ij)
+
+        #print(testout.T)
+        #print(pflt[polematch_out, :])
+        if nMatch[p] >= (n_band_early):
+          break
 
     #print(testout.T)
     #print(pflt[polematch_out,:])
@@ -1606,3 +1749,84 @@ def _fitcheck(self, q, bandnorms, cartxstalpoles):
     # mean_ang = np.degrees(np.arccos(mean_dot))
     return mean_dot
 
+  @staticmethod
+  @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False)
+  def _calc_cm(accumulator, polematch, libFamIndx):
+
+    npats = accumulator.shape[0]
+    cm2 = -1 * np.ones(npats, dtype=np.float32)
+
+    for p in range(npats):
+
+      whmatch = (np.nonzero(polematch[p, :] >= 0)[0]).astype(np.int64)
+      if whmatch.size < 2:
+        continue
+      # cm = np.mean(band_cm[whmatch])
+
+      cm2[p] = 0.0
+      for whm in whmatch:
+        whfam = np.int64(libFamIndx[polematch[p, whm]])
+        cm2[p] += np.float32(accumulator[p, whfam, whm])
+      cm2[p] /= np.sum(accumulator[p, ...].clip(1))
+    return cm2
+
+@numba.jit(nopython=True, cache=True, fastmath=True, parallel=False)
+def _orientation_quest_nb(polescart, bandnorms, weights):
+  # this uses the Quaternion Estimator AKA quest algorithm.
+
+  pflt = np.asarray(polescart, dtype=np.float64)
+  bndnorm = np.asarray(bandnorms, dtype=np.float64)
+  npoles = pflt.shape[0]
+  wn = (np.asarray(weights, dtype=np.float64)).reshape(npoles, 1)
+
+  # wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood)  #(weights[whGood]).reshape(nGood,1)
+  wn /= np.sum(wn)
+
+  I = np.zeros((3, 3), dtype=np.float64)
+  I[0, 0] = 1.0;
+  I[1, 1] = 1.0;
+  I[2, 2] = 1.0
+  q = np.zeros((4), dtype=np.float64)
+
+  B = (wn * bndnorm).T @ pflt
+  S = B + B.T
+  z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis=0), dtype=np.float64)
+  S2 = S @ S
+  det = np.linalg.det(S)
+  k = (S[1, 1] * S[2, 2] - S[1, 2] * S[2, 1]) + (S[0, 0] * S[2, 2] - S[0, 2] * S[2, 0]) + (
+        S[0, 0] * S[1, 1] - S[1, 0] * S[0, 1])
+  sig = 0.5 * (S[0, 0] + S[1, 1] + S[2, 2])
+  sig2 = sig * sig
+  d = z.T @ S2 @ z
+  c = det + (z.T @ S @ z)
+  b = sig2 + (z.T @ z)
+  a = sig2 - k
+
+  lam = 1.0
+  tol = 1.0e-12
+  iter = 0
+  dlam = 1e6
+  # for i in range(10):
+  while (dlam > tol) and (iter < 10):
+    lam0 = lam
+    lam = lam - (lam ** 4 - (a + b) * lam ** 2 - c * lam + (a * b + c * sig - d)) / (
+          4 * lam ** 3 - 2 * (a + b) * lam - c)
+    dlam = np.fabs(lam0 - lam)
+    iter += 1
+
+  beta = lam - sig
+  alpha = lam ** 2 - sig2 + k
+  gamma = (lam + sig) * alpha - det
+  X = np.asarray((alpha * I + beta * S + S2), dtype=np.float64) @ z
+  qn = np.float64(0.0)
+  qn += gamma ** 2 + X[0] ** 2 + X[1] ** 2 + X[2] ** 2
+  qn = np.sqrt(qn)
+  q[0] = gamma
+  q[1:4] = X[0:3]
+  q /= qn
+  if (np.sign(gamma) < 0):
+    q *= -1.0
+
+  # polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1)
+  # pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64)
+  return q, lam  # , pdot
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 7e42fdc..7e34884 100644
--- a/setup.py
+++ b/setup.py
@@ -95,7 +95,7 @@
         "h5py",
         "matplotlib",
         "numpy",
-        "numba",
+        "numba>=0.53",
         "scipy",
     ],
     # Files to include when distributing package (see also MANIFEST.in)