From 88136e60b8bc6a6c53c507bfc4dda1a39163e864 Mon Sep 17 00:00:00 2001 From: Dave Rowenhorst <45666721+drowenhorst-nrl@users.noreply.github.com> Date: Wed, 22 May 2024 17:41:59 -0400 Subject: [PATCH] NLPAR GPU (#56) * GPU enabled version of NLPAR * PyEBSDIndex will now default to using discrete graphics if both integrated and discrete GPUs are found. --- .github/dependabot.yml | 5 +- .github/workflows/tests.yml | 4 +- License | 2 +- pyebsdindex/__init__.py | 12 +- pyebsdindex/_ebsd_index_single.py | 112 ++- pyebsdindex/band_detect.py | 73 +- pyebsdindex/band_vote.py | 944 --------------------- pyebsdindex/crystal_sym.py | 2 +- pyebsdindex/crystallometry.py | 52 +- pyebsdindex/misorientation.py | 70 +- pyebsdindex/nlpar.py | 693 +--------------- pyebsdindex/nlpar_cpu.py | 794 ++++++++++++++++++ pyebsdindex/opencl/band_detect_cl.py | 55 +- pyebsdindex/opencl/clkernels.cl | 85 +- pyebsdindex/opencl/clnlpar.cl | 455 ++++++++++ pyebsdindex/opencl/nlpar_cl.py | 538 ++++++++++++ pyebsdindex/opencl/nlpar_clray.py | 799 ++++++++++++++++++ pyebsdindex/opencl/openclparam.py | 41 +- pyebsdindex/opencl/test.cl | 67 ++ pyebsdindex/radon_fast.py | 11 +- pyebsdindex/rotlib.py | 13 +- pyebsdindex/tests/test_package.py | 3 +- pyebsdindex/tripletvote.py | 1150 +++++++++++++++----------- setup.py | 2 +- 24 files changed, 3723 insertions(+), 2259 deletions(-) delete mode 100644 pyebsdindex/band_vote.py create mode 100644 pyebsdindex/nlpar_cpu.py create mode 100644 pyebsdindex/opencl/clnlpar.cl create mode 100644 pyebsdindex/opencl/nlpar_cl.py create mode 100644 pyebsdindex/opencl/nlpar_clray.py create mode 100644 pyebsdindex/opencl/test.cl diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 5990d9c..be902ed 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,7 +5,10 @@ version: 2 updates: - - package-ecosystem: "" # See documentation for possible values + - package-ecosystem: "github-actions" # See documentation for possible values directory: "/" # Location of package manifests + target-branch: "develop" schedule: interval: "weekly" + open-pull-requests-limit: 2 + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b845d52..4f7d5a2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -49,10 +49,10 @@ jobs: DEPENDENCIES: matplotlib==3.3 numba==0.52 numpy==1.19 ray[default]==1.13 LABEL: -oldest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/License b/License index 1110b57..10cb697 100644 --- a/License +++ b/License @@ -18,5 +18,5 @@ works bear some notice that they are derived from it, and any modified versions some notice that they have been modified. Author: David Rowenhorst; -The US Naval Research Laboratory Date: 21 Aug 2020 +The US Naval Research Laboratory Date: 22 May 2024 diff --git a/pyebsdindex/__init__.py b/pyebsdindex/__init__.py index 1a0318f..ac40fd0 100644 --- a/pyebsdindex/__init__.py +++ b/pyebsdindex/__init__.py @@ -10,10 +10,18 @@ __version__ = "0.2.2dev" -# Try to import only once +# Try to import only once - also will perform check that at least one GPU is found. try: + _pyopencl_installed = False import pyopencl - _pyopencl_installed = True + from pyebsdindex.opencl import openclparam + testcl = openclparam.OpenClParam() + try: + gpu = testcl.get_gpu() + if len(gpu) > 0: + _pyopencl_installed = True + except: + raise ImportError('pyopencl could not find GPU') except ImportError: _pyopencl_installed = False diff --git a/pyebsdindex/_ebsd_index_single.py b/pyebsdindex/_ebsd_index_single.py index eb9989a..d11647f 100644 --- a/pyebsdindex/_ebsd_index_single.py +++ b/pyebsdindex/_ebsd_index_single.py @@ -18,7 +18,14 @@ # some notice that they have been modified. # # Author: David Rowenhorst; -# The US Naval Research Laboratory Date: 21 Aug 2020 +# The US Naval Research Laboratory Date: 22 May 2024 + +# For further information see: +# David J. Rowenhorst, Patrick G. Callahan, Håkon W. Ånes. Fast Radon transforms for +# high-precision EBSD orientation determination using PyEBSDIndex. +# Journal of Applied Crystallography, 57(1):3–19, 2024. +# DOI: 10.1107/S1600576723010221 + """Setup and handling of Radon indexing runs of EBSD patterns on a single thread. @@ -412,6 +419,12 @@ def update_file(self, filename=None, patDim=np.array([120, 120], dtype=np.int32) if filename is None: self.filein = None self.bandDetectPlan.band_detect_setup(patDim=patDim) + elif isinstance(filename, ebsd_pattern.EBSDPatternFile): + self.fID = filename + self.bandDetectPlan.band_detect_setup( + patDim=[self.fID.patternH, self.fID.patternW] + ) + self.filein = filename.filepath else: self.filein = filename self.fID = ebsd_pattern.get_pattern_file_obj(self.filein) @@ -617,6 +630,102 @@ def _detectbands(self, pats, PC, xyloc=None, clparams=None, verbose=0, chunksize def _indexbandsphase(self, banddata, bandnorm, verbose=0): +# + rhomax = self.bandDetectPlan.rhoMax * (1-self.bandDetectPlan.rhoMaskFrac) + shpBandDat = banddata.shape + npoints = int(banddata.size/(shpBandDat[-1])+0.1) + nPhases = len(self.phaseLib) + q = np.zeros((nPhases, npoints, 4)) + indxData = np.zeros((nPhases + 1, npoints), dtype=self.dataTemplate) + bandmatchindex = np.zeros((nPhases, npoints,shpBandDat[-1],2), dtype=np.int32)-100 + banddataout = banddata.copy() + + indxData["phase"] = -1 + indxData["fit"] = 180.0 + indxData["totvotes"] = 0 + indxData["nmatch"] = 0 + + + if self.phaseLib[0] is None: + return indxData, banddata + + if self.nband_earlyexit is None: + earlyexit = -1 + for ph in self.phaselist: + if hasattr(ph, 'nband_earlyexit'): + earlyexit = max(earlyexit, ph.nband_earlyexit) + if earlyexit < 0: + earlyexit = shpBandDat[1] # default to all the poles. + self.nband_earlyexit = earlyexit + else: + earlyexit = self.nband_earlyexit + + + adj_intensity = (-1 * np.abs(banddata["rho"]) * 0.5 / rhomax + 1) * banddata["max"] + adj_intensity *= ((banddata["theta"] > (2 * np.pi / 180)).astype(np.float32) + 0.5) / 2 + adj_intensity *= ((banddata["theta"] < (178.0 * np.pi / 180)).astype(np.float32) + 0.5) / 2 + + adj_intensity *= banddata["valid"] + + #print(adj_intensity.shape) + + + for j in range(len(self.phaseLib)): + + indxData['pq'][j, :] = np.sum(banddata['max'] * banddata['valid'], axis=1) + p2do = np.ravel(np.nonzero(np.max(indxData["nmatch"], axis=0) < earlyexit)[0]) + + if p2do.size ==0: + break + ( + avequat, + fit, + cm, + bandmatch, + nMatch, + matchAttempts, + totvotes, + ) = self.phaseLib[j].bandindex( + bandnorm[p2do, ...], band_intensity=adj_intensity[p2do, ...], band_widths=banddata["width"][p2do, ...], verbose=verbose) + whgood = np.nonzero(nMatch >= 3 )[0] + if whgood.size > 0: + whgood2 = p2do[whgood] + q[j, whgood2, :] = avequat[whgood, ...] + indxData["fit"][j, whgood2] = fit[whgood] + indxData["cm"][j, whgood2] = cm[whgood] + indxData["phase"][j, whgood2] = j + indxData["nmatch"][j, whgood2] = nMatch[whgood] + indxData["matchattempts"][j, whgood2] = matchAttempts[whgood, ...] + indxData["totvotes"][j, whgood2] = totvotes[whgood] + bandmatchindex[j, whgood2, ..., 1] = bandmatch[whgood, ...] + + + + + qref2detect = self._detector2refframe() + q = q.reshape(nPhases * npoints, 4) + q = rotlib.quat_multiply(q, qref2detect) + q = rotlib.quatnorm(q) + q = q.reshape(nPhases, npoints, 4) + indxData["quat"][0:nPhases, :, :] = q + indxData[-1, :] = indxData[0, :] + banddataout['band_match_index'][:,:,:] = bandmatchindex[0,:,:,:].squeeze() + if nPhases > 1: + for j in range(1, nPhases): + # indxData[-1, :] = np.where( + # (indxData[j, :]["cm"] * indxData[j, :]["nmatch"]) + # > (indxData[j + 1, :]["cm"] * indxData[j + 1, :]["nmatch"]), + # indxData[j, :], + # indxData[j + 1, :], + phasetest = ((3.0 - indxData[j, :]["fit"]) * indxData[j, :]["nmatch"]) \ + > ((3.0 - indxData[-1, :]["fit"]) * indxData[-1, :]["nmatch"]) + whbetter = np.nonzero(phasetest) + indxData[-1, whbetter] = indxData[j, whbetter] + banddataout['band_match_index'][whbetter,:] = bandmatchindex[j,whbetter,:,:].squeeze() + return indxData, banddataout + + def _indexbandsphase_old(self, banddata, bandnorm, verbose=0): + # rhomax = self.bandDetectPlan.rhoMax * (1-self.bandDetectPlan.rhoMaskFrac) shpBandDat = banddata.shape @@ -706,7 +815,6 @@ def _indexbandsphase(self, banddata, bandnorm, verbose=0): indxData[-1, whbetter] = indxData[j, whbetter] banddataout['band_match_index'][whbetter,:] = bandmatchindex[j,whbetter,:,:].squeeze() return indxData, banddataout - def _detector2refframe(self): ven = str.upper(self.vendor) if ven in ["EDAX", "EMSOFT", "KIKUCHIPY"]: diff --git a/pyebsdindex/band_detect.py b/pyebsdindex/band_detect.py index c77b572..af5abb9 100644 --- a/pyebsdindex/band_detect.py +++ b/pyebsdindex/band_detect.py @@ -1,29 +1,41 @@ -"""This software was developed by employees of the US Naval Research Laboratory (NRL), an -agency of the Federal Government. Pursuant to title 17 section 105 of the United States -Code, works of NRL employees are not subject to copyright protection, and this software -is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no -responsibility whatsoever for its use by other parties, and makes no guarantees, -expressed or implied, about its quality, reliability, or any other characteristic. We -would appreciate acknowledgment if the software is used. To the extent that NRL may hold -copyright in countries other than the United States, you are hereby granted the -non-exclusive irrevocable and unconditional right to print, publish, prepare derivative -works and distribute this software, in any medium, or authorize others to do so on your -behalf, on a royalty-free basis throughout the world. You may improve, modify, and -create derivative works of the software or any portion of the software, and you may copy -and distribute such modifications or works. Modified works should carry a notice stating -that you changed the software and should note the date and nature of any such change. -Please explicitly acknowledge the US Naval Research Laboratory as the original source. -This software can be redistributed and/or modified freely provided that any derivative -works bear some notice that they are derived from it, and any modified versions bear -some notice that they have been modified. - -Author: David Rowenhorst; -The US Naval Research Laboratory Date: 21 Aug 2020""" - -from os import environ -from pathlib import PurePath +# This software was developed by employees of the US Naval Research Laboratory (NRL), an +# agency of the Federal Government. Pursuant to title 17 section 105 of the United States +# Code, works of NRL employees are not subject to copyright protection, and this software +# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no +# responsibility whatsoever for its use by other parties, and makes no guarantees, +# expressed or implied, about its quality, reliability, or any other characteristic. We +# would appreciate acknowledgment if the software is used. To the extent that NRL may hold +# copyright in countries other than the United States, you are hereby granted the +# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative +# works and distribute this software, in any medium, or authorize others to do so on your +# behalf, on a royalty-free basis throughout the world. You may improve, modify, and +# create derivative works of the software or any portion of the software, and you may copy +# and distribute such modifications or works. Modified works should carry a notice stating +# that you changed the software and should note the date and nature of any such change. +# Please explicitly acknowledge the US Naval Research Laboratory as the original source. +# This software can be redistributed and/or modified freely provided that any derivative +# works bear some notice that they are derived from it, and any modified versions bear +# some notice that they have been modified. +# +# +# Author: David Rowenhorst; +# The US Naval Research Laboratory Date: 22 May 2024 +# +# For further information see: +# David J. Rowenhorst, Patrick G. Callahan, Håkon W. Ånes. Fast Radon transforms for +# high-precision EBSD orientation determination using PyEBSDIndex. +# Journal of Applied Crystallography, 57(1):3–19, 2024. +# DOI: 10.1107/S1600576723010221 +# +# + +"""Class that reads in EBSD patterns, and finds the bands within those patterns using a Radon Transform.""" + +#from os import environ +import os +from pathlib import PurePath, Path import platform -import tempfile +# import tempfile from timeit import default_timer as timer import matplotlib.pyplot as plt @@ -37,10 +49,12 @@ from pyebsdindex import radon_fast - -tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) -tempdir = tempdir.joinpath('numba') -environ["NUMBA_CACHE_DIR"] = str(tempdir) +tempdir = PurePath(Path.home()) +#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) +#tempdir = tempdir.joinpath('numbacache') +tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache') +Path(tempdir).mkdir(parents=True, exist_ok=True) +os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep) RADEG = 180.0/np.pi @@ -80,6 +94,7 @@ def __init__( self.EDAXIQ = False self.backgroundsub = None self.patternmask = None + self.useCPU = True self.dataType = np.dtype([('id', np.int32), ('max', np.float32), \ ('maxloc', np.float32, (2)), ('avemax', np.float32), ('aveloc', np.float32, (2)),\ diff --git a/pyebsdindex/band_vote.py b/pyebsdindex/band_vote.py deleted file mode 100644 index 63d2210..0000000 --- a/pyebsdindex/band_vote.py +++ /dev/null @@ -1,944 +0,0 @@ -"""This software was developed by employees of the US Naval Research Laboratory (NRL), an -agency of the Federal Government. Pursuant to title 17 section 105 of the United States -Code, works of NRL employees are not subject to copyright protection, and this software -is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no -responsibility whatsoever for its use by other parties, and makes no guarantees, -expressed or implied, about its quality, reliability, or any other characteristic. We -would appreciate acknowledgment if the software is used. To the extent that NRL may hold -copyright in countries other than the United States, you are hereby granted the -non-exclusive irrevocable and unconditional right to print, publish, prepare derivative -works and distribute this software, in any medium, or authorize others to do so on your -behalf, on a royalty-free basis throughout the world. You may improve, modify, and -create derivative works of the software or any portion of the software, and you may copy -and distribute such modifications or works. Modified works should carry a notice stating -that you changed the software and should note the date and nature of any such change. -Please explicitly acknowledge the US Naval Research Laboratory as the original source. -This software can be redistributed and/or modified freely provided that any derivative -works bear some notice that they are derived from it, and any modified versions bear -some notice that they have been modified. - -Author: David Rowenhorst; -The US Naval Research Laboratory Date: 21 Aug 2020""" - -from os import environ -from pathlib import PurePath -import platform -import tempfile -from timeit import default_timer as timer - -import numba -import numpy as np - -from pyebsdindex import rotlib - - -RADEG = 180.0/np.pi - -tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) -tempdir = tempdir.joinpath('numba') -environ["NUMBA_CACHE_DIR"] = str(tempdir) - - -class BandVote: - def __init__(self, tripLib, angTol=3.0, high_fidelity=True): - self.tripLib = tripLib - self.phase_name = self.tripLib.phaseName - self.phase_sym = self.tripLib.pointgroup - self.lattice_param = self.tripLib.latticeParameter - self.angTol = angTol - self.n_band_early_exit = 8 - self.high_fidelity = high_fidelity - # these lookup tables are used to order the index for the pole-family when - # sorting triplet angles from low to high. - LUTA = np.array([[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]],dtype=np.int64) - LUTB = np.array([[0,1,2],[1,0,2],[0,2,1],[2,0,1],[1,2,0],[2,1,0]],dtype=np.int64) - - LUT = np.zeros((3,3,3,3),dtype=np.int64) - for i in range(6): - LUT[:,LUTA[i,0],LUTA[i,1],LUTA[i,2]] = LUTB[i,:] - self.LUT = np.asarray(LUT).copy() - - def tripvote(self, band_norms, band_intensity = None, goNumba = True, verbose=0): - tic0 = timer() - nfam = self.tripLib.polefamilies.shape[0] - bandnorms = np.squeeze(band_norms) - n_bands = np.int64(bandnorms.size/3) - if band_intensity is None: - band_intensity = np.ones((n_bands)) - tic = timer() - bandangs = np.abs(bandnorms.dot(bandnorms.T)) - bandangs = np.clip(bandangs, -1.0, 1.0) - bandangs = np.arccos(bandangs)*RADEG - - - if goNumba == True: - accumulator, bandFam, bandRank, band_cm = self.tripvote_numba(bandangs, self.LUT, self.angTol, self.tripLib.tripAngles, self.tripLib.tripID, nfam, n_bands) - else: - count = 0 - accumulator = np.zeros((nfam,n_bands),dtype=np.int32) - for i in range(n_bands): - for j in range(i+1,n_bands): - for k in range(j+1, n_bands): - angtri = np.array([bandangs[i,j], bandangs[i,k], bandangs[j,k]]) - srt = np.argsort(angtri) - srt2 = np.array(self.LUT[:, srt[0], srt[1], srt[2]]) - unsrtFID = np.argsort(srt2) - angtriSRT = np.array(angtri[srt]) - angTest = (np.abs(self.tripLib.tripAngles - angtriSRT)) <= self.angTol - angTest = np.all(angTest, axis = 1) - wh = angTest.nonzero()[0] - - for q in wh: - f = self.tripLib.tripID[q,:] - f = f[unsrtFID] - accumulator[f, [i,j,k]] += 1 - - - mxvote = np.amax(accumulator, axis = 0) - tvotes = np.sum(accumulator, axis = 0) - band_cm = np.zeros(n_bands) - - - - for i in range(n_bands): - if tvotes[i] < 1: - band_cm[i] = 0.0 - else: - srt = np.argsort(accumulator[:,i]) - band_cm[i] = (accumulator[srt[-1], i] - accumulator[srt[-2], i])/tvotes[i] - - bandFam = np.argmax(accumulator, axis=0) - bandRank = (n_bands - np.arange(n_bands))/n_bands * band_cm * mxvote - #print(np.sum(accumulator)) - #print(accumulator) - #print(bandRank) - #print(tvotes, band_cm, mxvote) - #print('vote loops: ', timer() - tic) - if verbose > 2: - print('band Vote time:',timer() - tic) - tic = timer() - - # avequat,fit,bandmatch,nMatch = self.band_vote_refine(bandnorms,bandRank,bandFam, angTol = self.angTol) - # if nMatch == 0: - # srt = np.argsort(bandRank) - # for i in range(np.min([5, n_bands])): - # bandRank2 = bandRank - # bandRank2[srt[i]] = -1.0 - # #avequat, fit, bandmatch, nMatch = self.band_vote_refine(bandnorms,bandRank2,bandFam,self.tripLib.completelib,self.angTol) - # avequat,fit,bandmatch,nMatch = self.band_vote_refine(bandnorms,bandRank2,bandFam) - # if nMatch > 2: - # break - #print('refinement: ', timer() - tic) - #print('tripvote: ',timer() - tic0) - sumaccum = np.sum(accumulator) - bandRank_arg = np.argsort(bandRank).astype(np.int64) - test = 0 - fit = 1000.0 - nMatch = -1 - avequat = np.zeros(4, dtype=np.float32) - polematch = np.array([-1]) - whGood = -1 - - angTable = self.tripLib.completelib['angTable'] - sztable = angTable.shape - famIndx = self.tripLib.completelib['famIndex'] - nFam = self.tripLib.completelib['nFamily'] - polesCart = self.tripLib.completelib['polesCart'] - angTol = self.angTol - n_band_early = np.int64(self.n_band_early_exit) - - # this will check the vote, and return the exact band matching to specific poles of the best fitting solution. - fit, polematch, nMatch, whGood, ij, R, fitb = \ - self.band_index_nb(polesCart, bandRank_arg, bandFam, famIndx, nFam, angTable, bandnorms, angTol, n_band_early) - - if verbose > 2: - print('band index: ',timer() - tic) - tic = timer() - - cm2 = 0.0 - if nMatch >=2: - if self.high_fidelity == True: - - srt = np.argsort(fitb[whGood]) - whgood6 = whGood[srt[0:np.min([8, whGood.shape[0]])]] - - weights6 = band_intensity[whgood6] - pflt6 = (np.asarray(polesCart[polematch[whgood6], :], dtype=np.float64)) - bndnorm6 = (np.asarray(bandnorms[whgood6, :], dtype=np.float64)) - - avequat, fit = self.refine_orientation_quest(bndnorm6, pflt6 , weights=weights6) - fit = np.arccos(np.clip(fit, -1.0, 1.0))*RADEG - #avequat, fit = self.refine_orientation(bandnorms,whGood,polematch) - else: - avequat = rotlib.om2qu(R) - whmatch = np.nonzero(polematch >= 0)[0] - cm = np.mean(band_cm[whmatch]) - whfam = self.tripLib.completelib['poleFamID'][polematch[whmatch]] - cm2 = np.sum(accumulator[[whfam], [whmatch]]).astype(np.float32) - cm2 /= np.sum(accumulator.clip(1)) - - if verbose > 2: - print('refinement: ', timer() - tic) - print('all: ',timer() - tic0) - return avequat, fit, cm2, polematch, nMatch, ij, sumaccum - - - - def band_index(self,bandnorms,bnd1,bnd2,familyLabel,angTol=3.0, verbose = 0): - - #nBands = np.int32(bandnorms.size/3) - angTable = self.tripLib.completelib['angTable'] - sztable = angTable.shape - #whGood = -1 - famIndx = self.tripLib.completelib['famIndex'] - nFam = self.tripLib.completelib['nFamily'] - poles = self.tripLib.completelib['polesCart'] - #ang01 = 0.0 - # need to check that the two selected bands are not parallel. - #v0 = bandnorms[bnd1, :] - #f0 = familyLabel[bnd1] - #v1 = bandnorms[bnd2, :] - #f1 = familyLabel[bnd2] - #ang01 = np.clip(np.dot(v0, v1), -1.0, 1.0) - #ang01 = np.arccos(ang01)*RADEG - #if ang01 < angTol: # the two poles are parallel, send in another two poles if available. - # return 360.0, 0, whGood, -1 - - #wh01 = np.nonzero(np.abs(angTable[famIndx[f0], famIndx[f1]:np.int(famIndx[f1]+nFam[f1])] - ang01) < angTol)[0] - - #n01 = wh01.size - #if n01 == 0: - # return 360.0, 0, whGood, -1 - - #wh01 += famIndx[f1] - #p0 = poles[famIndx[f0], :] - #print('pre first loop: ',timer() - tic) - #tic = timer() - # place numba code here ... - - - - #fit, polematch, R, nGood, whGood = self.band_vote_refine_loops1(poles,v0,v1, p0, wh01, bandnorms, angTol) - fit,polematch,R,nGood,whGood = self.band_vote_refine_loops1(poles, bnd1, bnd2, familyLabel, famIndx, nFam, angTable, bandnorms, angTol) - - #print('numba first loops',timer() - tic) - #whGood = np.nonzero(angFit < angTol)[0] - #nGood = np.int64(whGood.size) - #if nGood < 3: - # return 360.0, -1, -1, -1 - - #fit = np.mean(angFit[whGood]) - #print('all bindexed time', timer()-tic0) - return fit, nGood, whGood, polematch - - def refine_orientation(self,bandnorms, whGood, polematch): - tic = timer() - poles = self.tripLib.completelib['polesCart'] - nGood = whGood.size - n2Fit = np.int64(np.product(np.arange(2)+(nGood-2+1))/np.int64(2)) - whGood = np.asarray(whGood,dtype=np.int64) - #AB, ABgood = self.orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit) - # tic = timer() - # quats = rotlib.om2quL(AB[ABgood.nonzero()[0], :, :]) - # print("om2qu", timer() - tic) - # tic = timer() - # avequat = rotlib.quatave(quats) - - AB, weights = self.orientation_refine_loops_am(nGood, whGood, poles, bandnorms, polematch, n2Fit) - - wh_weight = np.nonzero(weights < 359.0)[0] - quats = rotlib.om2quL(AB[wh_weight, :, :]) - - expw = weights[wh_weight] - - rng = expw.max()-expw.min() - #print(rng, len(wh_weight)) - if rng > 1e-6: - expw -= expw.min() - #print(np.arccos(1.0 - expw)*RADEG) - expw = np.exp(-expw/(0.5*(rng))) - expw /= np.sum(expw) - #print(quats) - #print(expw) - #print(expw*len(wh_weight)) - avequat = rotlib.quatave(quats * np.expand_dims(expw, axis=-1)) - #print(avequat) - else: - avequat = rotlib.quatave(quats) - - - test = rotlib.quat_vectorL(avequat,bandnorms[whGood,:]) - - tic = timer() - test = np.sum(test * poles[polematch[whGood], :], axis = 1) - test = np.arccos(np.clip(test, -1.0, 1.0))*RADEG - - - fit = np.mean(test) - - #print('fitting: ',timer() - tic) - return avequat, fit - - def refine_orientation_quest(self, bandnorms, polecartmatch, weights = None): - tic = timer() - - - if weights is None: - weights = np.ones((bandnorms.shape[0]), dtype=np.float64) - weightsn = np.asarray(weights, dtype=np.float64) - weightsn /= np.sum(weightsn) - #print(weightsn) - pflt = np.asarray(polecartmatch, dtype=np.float64) - bndnorm = np.asarray(bandnorms, dtype=np.float64) - - avequat, fit = self.orientation_quest_nb(pflt, bndnorm, weightsn) - - return avequat, fit - - - @staticmethod - @numba.jit(nopython=True, cache=True,fastmath=True,parallel=False) - def tripvote_numba( bandangs, LUT, angTol, tripAngles, tripID, nfam, n_bands): - LUTTemp = np.asarray(LUT).copy() - accumulator = np.zeros((nfam, n_bands), dtype=np.int32) - tshape = np.shape(tripAngles) - ntrip = int(tshape[0]) - count = 0.0 - #angTest2 = np.zeros(ntrip, dtype=numba.boolean) - #angTest2 = np.empty(ntrip,dtype=numba.boolean) - for i in range(n_bands): - for j in range(i + 1,n_bands): - for k in range(j + 1,n_bands): - angtri = np.array([bandangs[i,j],bandangs[i,k],bandangs[j,k]], dtype=numba.float32) - srt = angtri.argsort(kind='quicksort') #np.array(np.argsort(angtri), dtype=numba.int64) - srt2 = np.asarray(LUTTemp[:,srt[0],srt[1],srt[2]], dtype=numba.int64).copy() - unsrtFID = np.argsort(srt2,kind='quicksort').astype(np.int64) - angtriSRT = np.asarray(angtri[srt]) - angTest = (np.abs(tripAngles - angtriSRT)) <= angTol - - for q in range(ntrip): - angTest2 = (angTest[q,0] + angTest[q,1] + angTest[q,2]) == 3 - if angTest2: - f = tripID[q,:] - f = f[unsrtFID] - accumulator[f[0],i] += 1 - accumulator[f[1],j] += 1 - accumulator[f[2],k] += 1 - t1 = False - t2 = False - t3 = False - if np.abs(angtriSRT[0] - angtriSRT[1]) < angTol: - accumulator[f[0],i] += 1 - accumulator[f[1],k] += 1 - accumulator[f[2],j] += 1 - t1 = True - if np.abs(angtriSRT[1] - angtriSRT[2]) < angTol: - accumulator[f[0],j] += 1 - accumulator[f[1],i] += 1 - accumulator[f[2],k] += 1 - t2 = True - if np.abs(angtriSRT[2] - angtriSRT[0]) < angTol: - accumulator[f[0],k] += 1 - accumulator[f[1],j] += 1 - accumulator[f[2],i] += 1 - t3 = True - if (t1 and t2 and t3): - accumulator[f[0],k] += 1 - accumulator[f[1],i] += 1 - accumulator[f[2],j] += 1 - - mxvote = np.zeros(n_bands, dtype=np.int32) - tvotes = np.zeros(n_bands, dtype=np.int32) - band_cm = np.zeros(n_bands, dtype=np.float32) - for q in range(n_bands): - mxvote[q] = np.amax(accumulator[:,q]) - tvotes[q] = np.sum(accumulator[:,q]) - - - - for i in range(n_bands): - if tvotes[i] < 1: - band_cm[i] = 0.0 - else: - srt = np.argsort(accumulator[:,i]) - band_cm[i] = (accumulator[srt[-1],i] - accumulator[srt[-2],i]) / tvotes[i] - - bandRank = np.zeros(n_bands, dtype=np.float32) - bandFam = np.zeros(n_bands, dtype=np.int32) - for q in range(n_bands): - bandFam[q] = np.argmax(accumulator[:,q]) - bandRank = (n_bands - np.arange(n_bands)) / n_bands * band_cm * mxvote - - return accumulator, bandFam, bandRank, band_cm - - @staticmethod - @numba.jit(nopython=True, cache=True, fastmath=True,parallel=False) - def band_index_nb(polesCart, bandRank_arg, familyLabel, famIndx, nFam, angTable, bandnorms, angTol, n_band_early): - eps = np.float32(1.0e-12) - nBnds = bandnorms.shape[0] - - whGood_out = np.zeros(nBnds, dtype=np.int64)-1 - - - nMatch = np.int64(-1) - Rout = np.zeros((1,3,3), dtype=np.float32) - #Rout[0,0,0] = 1.0; Rout[0,1,1] = 1.0; Rout[0,2,2] = 1.0 - polematch_out = np.zeros((nBnds),dtype=np.int64) - 1 - pflt = np.asarray(polesCart, dtype=np.float32) - bndnorm = np.transpose(np.asarray(bandnorms, dtype=np.float32)) - - fit = np.float32(360.0) - fitout = np.float32(360.0) - fitbout = np.zeros((nBnds)) - R = np.zeros((1, 3, 3), dtype=np.float32) - #fit = np.float32(360.0) - #whGood = np.zeros(nBnds, dtype=np.int64) - 1 - nGood = np.int64(-1) - ij = (-1,-1) - for ii in range(nBnds-1): - for jj in range(ii+1,nBnds): - - polematch = np.zeros((nBnds),dtype=np.int64) - 1 - - bnd1 = bandRank_arg[-1 - ii] - bnd2 = bandRank_arg[-1 - jj] - - v0 = bandnorms[bnd1,:] - f0 = familyLabel[bnd1] - v1 = bandnorms[bnd2,:] - f1 = familyLabel[bnd2] - ang01 = np.dot(v0,v1) - if ang01 > np.float32(1.0): - ang01 = np.float32(1.0-eps) - if ang01 < np.float32(-1.0): - ang01 = np.float32(-1.0+eps) - - paralleltest = np.arccos(np.fabs(ang01)) * RADEG - if paralleltest < angTol: # the two poles are parallel, send in another two poles if available. - continue - - ang01 = np.arccos(ang01) * RADEG - - wh01 = np.nonzero(np.abs(angTable[famIndx[f0],famIndx[f1]:np.int64(famIndx[f1] + nFam[f1])] - ang01) < angTol)[0] - - n01 = wh01.size - if n01 == 0: - continue - - wh01 += famIndx[f1] - p0 = polesCart[famIndx[f0], :] - - n01 = wh01.size - v0v1c = np.cross(v0,v1) - v0v1c /= np.linalg.norm(v0v1c) - # attempt to see which solution gives the best match to all the poles - # best is measured as the number of poles that are within tolerance, - # divided by the angular deviation. - # Use the TRIAD method for finding the rotation matrix - - Rtry = np.zeros((n01,3,3), dtype = np.float32) - - #score = np.zeros((n01), dtype = np.float32) - A = np.zeros((3,3), dtype = np.float32) - B = np.zeros((3,3), dtype = np.float32) - #AB = np.zeros((3,3),dtype=np.float32) - b2 = np.cross(v0,v0v1c) - B[0,:] = v0 - B[1,:] = v0v1c - B[2,:] = b2 - A[:,0] = p0 - score = -1.0 - - for i in range(n01): - p1 = polesCart[wh01[i], :] - ntemp = np.linalg.norm(p1) + 1.0e-35 - p1 = p1 / ntemp - p0p1c = np.cross(p0,p1) - ntemp = np.linalg.norm(p0p1c) + 1.0e-35 - p0p1c = p0p1c / ntemp - A[:,1] = p0p1c - A[:,2] = np.cross(p0,p0p1c) - AB = A.dot(B) - Rtry[i,:,:] = AB - - testp = (AB.dot(bndnorm)) - test = pflt.dot(testp) - #print(test.shape) - angfitTry = np.zeros((nBnds), dtype = np.float32) - #angfitTry = np.max(test,axis=0) - for j in range(nBnds): - angfitTry[j] = np.max(test[:,j]) - angfitTry[j] = -1.0 if angfitTry[j] < -1.0 else angfitTry[j] - angfitTry[j] = 1.0 if angfitTry[j] > 1.0 else angfitTry[j] - - #angfitTry = np.clip(np.amax(test,axis=0),-1.0,1.0) - - angfitTry = np.arccos(angfitTry) * RADEG - whMatch = np.nonzero(angfitTry < angTol)[0] - nmatch = whMatch.size - #scoreTry = np.float32(nmatch) * np.mean(np.abs(angTol - angfitTry[whMatch])) - scoreTry = np.float32(nmatch) /( np.mean(angfitTry[whMatch]) + 1e-6) - if scoreTry > score: - score = scoreTry - angFit = angfitTry - for j in range(nBnds): - polematch[j] = np.argmax(test[:,j]) * ( 2*np.int32(angfitTry[j] < angTol)-1) - R[0, :,:] = Rtry[i,:,:] - - - whGood = (np.nonzero(angFit < angTol)[0]).astype(np.int64) - nGood = max(np.int64(whGood.size), np.int64(0)) - - if nGood < 3: - continue - #return 360.0,-1,-1,-1 - #whGood = -1*np.ones((1), dtype=np.int64) - #fit = np.float32(360.0) - #polematch[:] = -1 - #nGood = np.int64(-1) - else: - fitb = angFit - #fit = np.mean(fitb[whGood]) - fit = np.float32(0.0) - for q in range(nGood): - fit += np.float32(fitb[whGood[q]]) - fit /= np.float32(nGood) - - - if nGood >= (n_band_early): - fitout = fit - fitbout = fitb - nMatch = nGood - whGood_out = whGood - polematch_out = polematch - Rout = R - ij = (bnd1,bnd2) - break - else: - if nMatch < nGood: - fitout = np.float32(fit) - fitbout = fitb - nMatch = nGood - whGood_out = whGood - polematch_out = polematch - Rout = R - ij = (bnd1, bnd2) - elif nMatch == nGood: - if fitout > fit: - fitout = np.float32(fit) - fitbout = fitb - nMatch = nGood - whGood_out = whGood - polematch_out = polematch - Rout = R - ij = (bnd1, bnd2) - if nMatch >= (n_band_early): - break - #quatout = rotlib.om2quL(Rout) - return fitout, polematch_out,nMatch, whGood_out, ij, Rout, fitbout - - @staticmethod - @numba.jit(nopython=True, cache=True, fastmath=True,parallel=False) - def orientation_refine_loops_triad(nGood, whGood, poles, bandnorms, polematch, n2Fit): - #uses the TRIAD method for getting rotation matrix from imperfect poles. - quats = np.zeros((n2Fit, 4), dtype=np.float32) - counter = 0 - A = np.zeros((3, 3), dtype=np.float32) - B = np.zeros((3, 3), dtype=np.float32) - AB = np.zeros((n2Fit, 3, 3), dtype=np.float32) - whgood2 = np.zeros(n2Fit, dtype=np.int32) - for i in range(nGood): - v0 = bandnorms[whGood[i],:] - p0 = poles[polematch[whGood[i]],:] - A[:,0] = p0 - B[0,:] = v0 - for j in range(i + 1,nGood): - v1 = bandnorms[whGood[j],:] - p1 = poles[polematch[whGood[j]],:] - v0v1c = np.cross(v0,v1) - # v0v1c /= np.linalg.norm(v0v1c)+1.0e-35 - # v0v1c = vectnorm(v0v1c) # faster to inline these functions - norm = numba.float32(0.0) - for ii in range(3): - norm += v0v1c[ii] * v0v1c[ii] - norm = np.sqrt(norm) + 1.0e-35 - #print(norm) - if norm > (0.087): # these vectors are not parallel (greater than 5 degrees) - for ii in range(3): - v0v1c[ii] = v0v1c[ii] / norm - - p0p1c = np.cross(p0,p1) - # p0p1c /= (np.linalg.norm(p0p1c))+1.0e-35 - #p0p1c = vectnorm(p0p1c) # faster to inline these functions - norm = numba.float32(0.0) - for ii in range(3): - norm += p0p1c[ii] * p0p1c[ii] - norm = np.sqrt(norm) + 1.0e-35 - for ii in range(3): - p0p1c[ii] = p0p1c[ii] / norm - - A[:,1] = p0p1c - B[1,:] = v0v1c - A[:,2] = np.cross(p0,p0p1c) - B[2,:] = np.cross(v0,v0v1c) - AB[counter, :,:] = A.dot(B) - whgood2[counter] = 1 - #AB = np.reshape(AB, (1,3,3)) - #quats[counter,:] = rotlib.om2quL(AB) - counter += 1 - else: # the two are parallel - throwout the result. - whgood2[counter] = 0 - counter +=1 - return AB, whgood2 - - @staticmethod - @numba.jit(nopython=True,cache=True,fastmath=True,parallel=False) - def orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit): - # this uses the method laid out by A. Morawiec 2020 Eq.4 for getting rotation matrix - # from imperfect poles - counter = 0 - - pflt = np.transpose(np.asarray(poles[polematch[whGood], :], dtype=np.float32)) - bndnorm = np.transpose(np.asarray(bandnorms[whGood,:], dtype=np.float32)) - - A = np.zeros((3, 3), dtype=np.float32) - B = np.zeros((3, 3), dtype=np.float32) - AB = np.zeros((n2Fit, 3, 3),dtype=np.float32) - #whgood2 = np.zeros(n2Fit, dtype=np.int32) - whgood2 = np.zeros(n2Fit, dtype=np.float32) - - for i in range(nGood): - v0 = bandnorms[whGood[i],:] - p0 = poles[polematch[whGood[i]],:] - - for j in range(i + 1,nGood): - v1 = bandnorms[whGood[j],:] - p1 = poles[polematch[whGood[j]],:] - v0v1c = np.cross(v0,v1) - p0p1c = np.cross(p0,p1) - p0p1add = p0 + p1 - v0v1add = v0 + v1 - p0p1sub = p0 - p1 - v0v1sub = v0 - v1 - - normPCross = numba.float32(0.0) - normVCross = numba.float32(0.0) - normPAdd = numba.float32(0.0) - normVAdd = numba.float32(0.0) - normPSub = numba.float32(0.0) - normVSub = numba.float32(0.0) - - for ii in range(3): - normPCross += p0p1c[ii] * p0p1c[ii] - normVCross += v0v1c[ii] * v0v1c[ii] - normPAdd += p0p1add[ii] * p0p1add[ii] - normVAdd += v0v1add[ii] * v0v1add[ii] - normPSub += p0p1sub[ii] * p0p1sub[ii] - normVSub += v0v1sub[ii] * v0v1sub[ii] - - normPCross = np.sqrt(normPCross) + 1.0e-35 - normVCross = np.sqrt(normVCross) + 1.0e-35 - normPAdd = np.sqrt(normPAdd) + 1.0e-35 - normVAdd = np.sqrt(normVAdd) + 1.0e-35 - normPSub = np.sqrt(normPSub) + 1.0e-35 - normVSub = np.sqrt(normVSub) + 1.0e-35 - - # print(norm) - if normVCross > (0.087): # these vectors are not parallel (greater than 5 degrees) - for ii in range(3): - v0v1c[ii] /= normVCross - p0p1c[ii] /= normPCross - v0v1add[ii] /= normVAdd - p0p1add[ii] /= normPAdd - v0v1sub[ii] /= normVSub - p0p1sub[ii] /= normPSub - - A[:,0] = p0p1c - B[0,:] = v0v1c - - A[:,1] = p0p1add - B[1,:] = v0v1add - - A[:,2] = p0p1sub - B[2,:] = v0v1sub - R = A.dot(B) - AB[counter,:,:] = A.dot(B) - - # test the fit of each candidate - testp = (R.dot(bndnorm)) - #test = pflt.dot(testp) - test = np.sum(pflt*testp, axis=0) - #print(test.shape) - #angfitTry = np.zeros((nGood), dtype=np.float32) - # angfitTry = np.max(test,axis=0) - #for qq in range(nGood): - # angfitTry[qq] = np.max(test[:, qq]) - # angfitTry[qq] = -1.0 if angfitTry[qq] < -1.0 else angfitTry[qq] - # angfitTry[qq] = 1.0 if angfitTry[qq] > 1.0 else angfitTry[qq] - #angfitTry = np.mean(np.arccos(angfitTry) * RADEG) - #print(test) - - - #whgood2[counter] = 1 - whgood2[counter] = 1.0 - np.mean(test) - #print(1.0 - np.mean(test)) - counter += 1 - else: # the two are parallel - throwout the result. - whgood2[counter] = np.float32(360.0) - counter += 1 - return AB,whgood2 - - @staticmethod - @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False) - def orientation_quest_nb(polescart, bandnorms, weights): - # this uses the Quaternion Estimator AKA quest algorithm. - - pflt = np.asarray(polescart, dtype=np.float64) - bndnorm = np.asarray(bandnorms, dtype=np.float64) - npoles = pflt.shape[0] - wn = (np.asarray(weights, dtype=np.float64)).reshape(npoles, 1) - - #wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood) #(weights[whGood]).reshape(nGood,1) - wn /= np.sum(wn) - - I = np.zeros((3,3), dtype=np.float64) - I[0,0] = 1.0 ; I[1,1] = 1.0 ; I[2,2] = 1.0 - q = np.zeros((4), dtype=np.float64) - - B = (wn * bndnorm).T @ pflt - S = B + B.T - z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis = 0), dtype=np.float64) - S2 = S @ S - det = np.linalg.det(S) - k = (S[1,1]*S[2,2] - S[1,2]*S[2,1]) + (S[0,0]*S[2,2] - S[0,2]*S[2,0]) + (S[0,0]*S[1,1] - S[1,0]*S[0,1]) - sig = 0.5 * (S[0,0] + S[1,1] + S[2,2]) - sig2 = sig * sig - d = z.T @ S2 @ z - c = det + (z.T @ S @ z) - b = sig2 + (z.T @ z) - a = sig2 -k - - lam = 1.0 - tol = 1.0e-6 - iter = 0 - dlam = 1e6 - #for i in range(10): - while (dlam > tol) and (iter < 10): - lam0 = lam - lam = lam - (lam**4 - (a + b) * lam**2 - c * lam + (a * b + c * sig - d))/ (4 * lam**3 - 2 * (a + b) * lam - c) - dlam = np.fabs(lam0-lam) - iter += 1 - - beta = lam - sig - alpha = lam ** 2 - sig2 + k - gamma = (lam + sig) * alpha - det - X = np.asarray( (alpha * I + beta * S + S2), dtype=np.float64) @ z - qn = np.float64(0.0) - qn += gamma ** 2 + X[0] **2 + X[1] **2 + X[2] **2 - qn = np.sqrt(qn) - q[0] = gamma - q[1:4] = X[0:3] - q /= qn - if (np.sign(gamma) < 0): - q *= -1.0 - - #polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1) - #pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64) - return q, lam#, pdot - - def pairVoteOrientation(self,bandnormsIN,goNumba=True): - tic0 = timer() - nfam = self.tripLib.polefamilies.shape[0] - bandnorms = np.squeeze(bandnormsIN) - n_bands = np.int64(bandnorms.size / 3) - - bandangs = np.abs(bandnorms.dot(bandnorms.T)) - bandangs = np.clip(bandangs,-1.0,1.0) - bandangs = np.arccos(bandangs) * RADEG - - angTable = self.tripLib.completelib['angTable'] - sztable = angTable.shape - whGood = -1 - famIndx = self.tripLib.completelib['famIndex'] - nFam = self.tripLib.completelib['nFamily'] - poles = self.tripLib.completelib['polesCart'] - angTableReduce = angTable[famIndx,:] - polesReduce = poles[famIndx,:] - qsym = self.tripLib.qsymops - tic = timer() - - - if goNumba == True: - solutions, nsolutions, solutionVotes, solSrt = self.pairVoteOrientationNumba(bandnorms,bandangs,qsym,angTableReduce,poles,polesReduce, self.angTol) - else: - solutions = np.empty((500, 24, 4), dtype=np.float32) - solutions[0,:,:] = rotlib.quat_multiply(qsym, np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)) - solutionVotes = np.zeros(500, dtype=np.int32) - nsolutions = 1 - soltol = np.cos(5.0/RADEG/2.0) - - A = np.zeros((3,3), dtype=np.float32) - B = np.zeros((3,3), dtype=np.float32) # used for TRIAD calculations - - for i in range(n_bands): - for j in range(i + 1,n_bands): - - angPair = bandangs[i,j] - if angPair > 10.0: - angTest = (np.abs(angTableReduce - angPair)) <= self.angTol - wh = angTest.nonzero() - if len(wh[0] > 0): - - v0 = bandnorms[i,:] - v1 = bandnorms[j,:] - v0v1c = np.cross(v0,v1) - v0v1c /= np.linalg.norm(v0v1c) - B[0,:] = v0 - B[1,:] = v0v1c - B[2,:] = np.cross(v0,v0v1c) - for k in range(len(wh[0])): - - p0 = polesReduce[wh[0][k],:] - p1 = poles[wh[1][k],:] - p0p1c = np.cross(p0,p1) - p0p1c /= np.linalg.norm(v0v1c) - A[:,0] = p0 - A[:,1] = p0p1c - A[:,2] = np.cross(p0,p0p1c) - AB = A.dot(B) - - qAB = rotlib.om2qu(AB) - qABinv = rotlib.quatconj(qAB) - #qABsym = rotlib.quat_multiply(qsym, qAB) - - solutionFound = False - for q in range(nsolutions): - #rotlib.quat_multiplyLNN(q1,q2,n,intype,p=P) - - soltest = np.max( np.abs( (rotlib.quat_multiply((solutions[q,:,:]), qABinv))[:,0] )) - - if soltest >= soltol: - solutionVotes[q] += 1 - solutionFound = True - if solutionFound == False: - solutions[nsolutions, :, :] = rotlib.quat_multiply(qsym, qAB) - solutionVotes[nsolutions] += 1 - nsolutions += 1 - - solSrt = np.argsort(solutionVotes) - #print(nsolutions, solutionVotes[solSrt[-10:]]) - mxvote = np.max(solutionVotes) - #print(timer()-tic) - tic = timer() - if mxvote > 0: - whmxvotes = np.nonzero(solutionVotes == mxvote) - nmxvote = len(whmxvotes[0]) - fit = np.zeros(nmxvote, dtype=np.float32) - avequat = np.zeros((nmxvote,4),dtype=np.float32) - nMatch = np.zeros((nmxvote),dtype=np.float32) - poleMatch = np.zeros((nmxvote, n_bands), dtype=np.int32) - 1 - #cm = (mxvote-solutionVotes[solSrt[-nmxvote-1]])/np.sum(solutionVotes) - cm = mxvote/np.sum(solutionVotes) - for q in range(nmxvote): - testbands = rotlib.quat_vector(solutions[whmxvotes[0][q], 0, : ], bandnorms) - fittest = (testbands.dot(poles.T)).clip(-1.0, 1.0) - poleMatch1 = np.argmax(fittest, axis = 1) - fittemp = np.arccos(np.max(fittest,axis=1)) * RADEG - - whGood = np.nonzero(fittemp < self.angTol) - nMatch[q] = len(whGood[0]) - poleMatch[q,whGood[0]] = poleMatch1[whGood[0]] - - avequat1, fit1 = self.refine_orientation(bandnorms,whGood[0],poleMatch1) - - fit[q] = fit1 - avequat[q,:] = avequat1 - - keep = np.argmax(nMatch) - #print(timer() - tic) - return avequat[keep,:],fit[keep],cm,poleMatch[keep,:],nMatch[keep],(0,0) - - else: # no solutions - fit = 1000.0 - nMatch = -1 - avequat = np.zeros(4,dtype=np.float32) - polematch = np.array([-1]) - whGood = -1 - print(timer() - tic) - return avequat,fit,-1,polematch,nMatch,(0,0) - - - @staticmethod - @numba.jit(nopython=True,cache=True,fastmath=True,parallel=False) - def pairVoteOrientationNumba(bandnorms,bandangs, qsym, angTableReduce, poles, polesReduce, angTol): - n_bands = bandnorms.shape[0] - nsym = qsym.shape[0] - solutions = np.empty((500,24,4),dtype=np.float32) - solutions[0,:,:] = rotlib.quat_multiplyL(qsym,np.array([1.0,0.0,0.0,0.0],dtype=np.float32)) - solutionVotes = np.zeros(500, dtype=np.int32) - nsolutions = 1 - soltol = np.cos(5.0 / RADEG / 2.0) - - A = np.empty((3,3),dtype=np.float32) - B = np.empty((3,3),dtype=np.float32) # used for TRIAD calculations - AB = np.empty((1,3,3), dtype=np.float32) - qAB = np.empty((1,4), dtype=np.float32) - qABinv = np.empty((1,4),dtype=np.float32) - soltemp = np.empty((nsym,4), dtype=np.float32) - - for i in range(n_bands): - for j in range(i + 1,n_bands): - - angPair = bandangs[i,j] - if angPair > 10.0: - angTest = (np.abs(angTableReduce - angPair)) <= angTol - wh = angTest.nonzero() - if len(wh[0] > 0): - - v0 = bandnorms[i,:] - v1 = bandnorms[j,:] - v0v1c = np.cross(v0,v1) - v0v1c /= np.linalg.norm(v0v1c) - B[0,:] = v0 - B[1,:] = v0v1c - B[2,:] = np.cross(v0,v0v1c) - for k in range(len(wh[0])): - - p0 = polesReduce[wh[0][k],:] - p1 = poles[wh[1][k],:] - p0p1c = np.cross(p0,p1) - p0p1c /= np.linalg.norm(v0v1c) - A[:,0] = p0 - A[:,1] = p0p1c - A[:,2] = np.cross(p0,p0p1c) - AB[0,:,:] = A.dot(B) - - qAB = rotlib.om2quL(AB) - qABinv = rotlib.quatconjL(qAB) - #print(qABinv.shape) - # qABsym = rotlib.quat_multiply(qsym, qAB) - - solutionFound = False - for q in range(nsolutions): - # rotlib.quat_multiplyLNN(q1,q2,n,intype,p=P) - - soltemp = np.copy(solutions[q,:,:]) - #print(soltemp.shape) - soltemp = rotlib.quat_multiplyL(soltemp,qABinv) - - soltest = -1.0 - for qq in range(nsym): - if soltemp[qq,0] > soltest: - soltest = soltemp[qq,0] - - if soltest >= soltol: - solutionVotes[q] += 1 - solutionFound = True - if solutionFound == False: - solutions[nsolutions,:,:] = rotlib.quat_multiplyL(qsym,qAB) - solutionVotes[nsolutions] += 1 - nsolutions += 1 - - solSrt = np.argsort(solutionVotes) - - return solutions, nsolutions, solutionVotes, solSrt diff --git a/pyebsdindex/crystal_sym.py b/pyebsdindex/crystal_sym.py index 64709ef..fa0bd79 100644 --- a/pyebsdindex/crystal_sym.py +++ b/pyebsdindex/crystal_sym.py @@ -381,7 +381,7 @@ def spacegroup2lauenumber(spacegroupid): Parameters ---------- - sapcegroupid : int + spacegroupid : int number between 1 and 230 Returns diff --git a/pyebsdindex/crystallometry.py b/pyebsdindex/crystallometry.py index 81660f8..fd61d79 100644 --- a/pyebsdindex/crystallometry.py +++ b/pyebsdindex/crystallometry.py @@ -1,28 +1,28 @@ -'''This software was developed by employees of the US Naval Research Laboratory (NRL), an -agency of the Federal Government. Pursuant to title 17 section 105 of the United States -Code, works of NRL employees are not subject to copyright protection, and this software -is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no -responsibility whatsoever for its use by other parties, and makes no guarantees, -expressed or implied, about its quality, reliability, or any other characteristic. We -would appreciate acknowledgment if the software is used. To the extent that NRL may hold -copyright in countries other than the United States, you are hereby granted the -non-exclusive irrevocable and unconditional right to print, publish, prepare derivative -works and distribute this software, in any medium, or authorize others to do so on your -behalf, on a royalty-free basis throughout the world. You may improve, modify, and -create derivative works of the software or any portion of the software, and you may copy -and distribute such modifications or works. Modified works should carry a notice stating -that you changed the software and should note the date and nature of any such change. -Please explicitly acknowledge the US Naval Research Laboratory as the original source. -This software can be redistributed and/or modified freely provided that any derivative -works bear some notice that they are derived from it, and any modified versions bear -some notice that they have been modified. - -This borrows heavily from Marc De Graef's EMsoft package. Many functions here -are Python translations of functions from it. - - -Author: David Rowenhorst, Patrick Callahan; -The US Naval Research Laboratory Date: 21 Aug 2020''' +# This software was developed by employees of the US Naval Research Laboratory (NRL), an +# agency of the Federal Government. Pursuant to title 17 section 105 of the United States +# Code, works of NRL employees are not subject to copyright protection, and this software +# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no +# responsibility whatsoever for its use by other parties, and makes no guarantees, +# expressed or implied, about its quality, reliability, or any other characteristic. We +# would appreciate acknowledgment if the software is used. To the extent that NRL may hold +# copyright in countries other than the United States, you are hereby granted the +# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative +# works and distribute this software, in any medium, or authorize others to do so on your +# behalf, on a royalty-free basis throughout the world. You may improve, modify, and +# create derivative works of the software or any portion of the software, and you may copy +# and distribute such modifications or works. Modified works should carry a notice stating +# that you changed the software and should note the date and nature of any such change. +# Please explicitly acknowledge the US Naval Research Laboratory as the original source. +# This software can be redistributed and/or modified freely provided that any derivative +# works bear some notice that they are derived from it, and any modified versions bear +# some notice that they have been modified. +# +# This borrows heavily from Marc De Graef's EMsoft package. Many functions here +# are Python translations of functions from it. +# +# +# Author: David Rowenhorst, Patrick Callahan; +# The US Naval Research Laboratory Date: 22 May 2024 import numpy as np @@ -35,7 +35,7 @@ class CrystalPlane: - def __init__(self,hkl = [0,0,0],centering = 'F'): + def __init__(self,hkl = (0,0,0),centering = 'F'): self.hkl = np.array(hkl) self.allowed = True self.lattice_centering = centering diff --git a/pyebsdindex/misorientation.py b/pyebsdindex/misorientation.py index 41f6a8e..cf4df04 100644 --- a/pyebsdindex/misorientation.py +++ b/pyebsdindex/misorientation.py @@ -1,50 +1,46 @@ -''' -2022, David Rowenhorst/The US Naval Research Laboratory, Washington DC -# Pursuant to title 17 section 105 of the United States Code, works of US Governement employees -# are not not subject to copyright protection. +# This software was developed by employees of the US Naval Research Laboratory (NRL), an +# agency of the Federal Government. Pursuant to title 17 section 105 of the United States +# Code, works of NRL employees are not subject to copyright protection, and this software +# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no +# responsibility whatsoever for its use by other parties, and makes no guarantees, +# expressed or implied, about its quality, reliability, or any other characteristic. We +# would appreciate acknowledgment if the software is used. To the extent that NRL may hold +# copyright in countries other than the United States, you are hereby granted the +# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative +# works and distribute this software, in any medium, or authorize others to do so on your +# behalf, on a royalty-free basis throughout the world. You may improve, modify, and +# create derivative works of the software or any portion of the software, and you may copy +# and distribute such modifications or works. Modified works should carry a notice stating +# that you changed the software and should note the date and nature of any such change. +# Please explicitly acknowledge the US Naval Research Laboratory as the original source. +# This software can be redistributed and/or modified freely provided that any derivative +# works bear some notice that they are derived from it, and any modified versions bear +# some notice that they have been modified. # -# Copyright (c) 2013-2014, Marc De Graef/Carnegie Mellon University -# All rights reserved. # -# Redistribution and use in source and binary forms, with or without modification, are -# permitted provided that the following conditions are met: +# Author: David Rowenhorst; +# The US Naval Research Laboratory Date: 22 May 2024 # -# - Redistributions of source code must retain the above copyright notice, this list -# of conditions and the following disclaimer. -# - Redistributions in binary form must reproduce the above copyright notice, this -# list of conditions and the following disclaimer in the documentation and/or -# other materials provided with the distribution. -# - Neither the names of Marc De Graef, Carnegie Mellon University nor the names -# of its contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. +# For further information see: +# David J. Rowenhorst, Patrick G. Callahan, Håkon W. Ånes. Fast Radon transforms for +# high-precision EBSD orientation determination using PyEBSDIndex. +# Journal of Applied Crystallography, 57(1):3–19, 2024. +# DOI: 10.1107/S1600576723010221 # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#################################################################################################### - - -#Author David Rowenhorst/The US Naval Research Laboratory, Washington DC # -# -''' import numpy as np import numba -from os import environ +import os import tempfile -from pathlib import PurePath +from pathlib import PurePath, Path import platform -tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) -tempdir = tempdir.joinpath('numba') -environ["NUMBA_CACHE_DIR"] = str(tempdir) +tempdir = PurePath(Path.home()) +#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) +#tempdir = tempdir.joinpath('numbacache') +tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache') +Path(tempdir).mkdir(parents=True, exist_ok=True) +os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep) import rotlib diff --git a/pyebsdindex/nlpar.py b/pyebsdindex/nlpar.py index bb82881..e1fc4cd 100644 --- a/pyebsdindex/nlpar.py +++ b/pyebsdindex/nlpar.py @@ -1,3 +1,4 @@ + # This software was developed by employees of the US Naval Research Laboratory (NRL), an # agency of the Federal Government. Pursuant to title 17 section 105 of the United States # Code, works of NRL employees are not subject to copyright protection, and this software @@ -18,694 +19,28 @@ # some notice that they have been modified. # # Author: David Rowenhorst; -# The US Naval Research Laboratory Date: 21 Aug 2020 +# The US Naval Research Laboratory Date: 22 May 2024 # For more info see # Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019. -"""Non-local pattern averaging and re-indexing (NLPAR).""" - -from pathlib import Path -from timeit import default_timer as timer - -import numba -import numpy as np -import scipy.optimize as opt - -from pyebsdindex import ebsd_pattern - -#from os import environ -#environ["NUMBA_CACHE_DIR"] = str(tempdir) - - -__all__ = [ - "NLPAR", -] - - -class NLPAR: - def __init__(self, filename=None, lam=0.7, searchradius=3,dthresh=0.0, nrows = None, ncols = None): - self.lam = lam - self.searchradius = searchradius - self.dthresh = dthresh - self.filepath = None - self.hdfdatapath = None - self.filepathout = None - self.hdfdatapathout = None - #self.patternfile = None - #self.patternfileout = None - self.setfile(filename) - self.mask = None - self.sigma = None - self.nrows = None - self.ncols = None - if nrows is not None: - self.nrows = nrows - if ncols is not None: - self.ncols = ncols - - - def setfile(self,filepath=None): - self.filepath = None - self.hdfdatapath = None - pathtemp = np.atleast_1d(filepath) - fpath = pathtemp[0] - hdf5path = None - if pathtemp.size > 1: - hdf5path = pathtemp[1] - if fpath is not None: - self.filepath = Path(fpath) - self.hdfdatapath = hdf5path - - def setoutfile(self, patternfile, filepath=None): - """Set the output file. - - Parameters - ---------- - patternfile - Input pattern file object from ebsd_pattern. - filepath - String. - - Notes - ----- - In the future I want to be able to specify the HDF5 data path to - store the output data, but that is proving to be a bit of a mess. - For now, a copy of the original HDF5 is made, and the NLPAR patterns - will be overwritten on top of the originals. - """ - self.filepathout = None - self.hdfdatapathout = None - pathtemp = np.atleast_1d(filepath) - fpath = pathtemp[0] - hdf5path = None - #if pathtemp.size > 1: - # hdf5path = pathtemp[1] - #print(fpath, hdf5path) - if fpath is not None: # the user has set an output file path. - self.filepathout = Path(fpath).expanduser().resolve() - self.hdfdatapathout = hdf5path - if patternfile.filetype != 'HDF5': #check that the input and output are not the same. - pathok = self.filepathout.exists() - if pathok: - pathok = not self.filepathout.samefile(patternfile.filepath) - if not pathok: - raise ValueError('Error: File input and output are exactly the same.') - return - - patternfile.copy_file([self.filepathout,self.hdfdatapathout], empty_data=True) - return # fpath and (maybe) hdf5 path were set manually. - else: # this is a hdf5 file - if self.hdfdatapathout is None: - patternfile.copy_file(self.filepathout, empty_data=True) - self.hdfdatapathout = patternfile.h5patdatpth - return - else: - patternfile.copy_file([self.filepathout, self.hdfdatapathout], empty_data=True) - return - - if patternfile is not None: # the user has set no path. - hdf5path = None - - if patternfile.filetype in ['UP', 'EBSP']: - p = Path(patternfile.filepath) - appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius) - newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix)) - patternfile.copy_file(newfilepath,empty_data=True) - - if patternfile.filetype == 'HDF5': - hdf5path_tmp = str(patternfile.h5patdatpth).split('/') - if hdf5path_tmp[0] == '': - hdf5path_org = hdf5path_tmp[1] - else: - hdf5path_org = hdf5path_tmp[0] - p = Path(patternfile.filepath) - appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius) - hdf5path = hdf5path_org+appnd - newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix)) - #patternfile.copy_file([newfilepath, hdf5path_org], newh5path=hdf5path) - patternfile.copy_file([newfilepath], empty_data=True) - hdf5path = patternfile.h5patdatpth - - self.filepathout = newfilepath - self.hdfdatapathout = hdf5path - return - - def getinfileobj(self): - if self.filepath is not None: - fID = ebsd_pattern.get_pattern_file_obj([self.filepath, self.hdfdatapath]) - if (fID.nRows is not None): - if (self.nrows is None): - self.nrows = fID.nRows - else: - fID.nRows = self.nrows - - if (fID.nCols is not None): - if (self.ncols is None): - self.ncols = fID.nCols - else: - fID.nCols = self.ncols - - return fID - - else: - return None - - def getoutfileobj(self): - if self.filepathout is not None: - fID = ebsd_pattern.get_pattern_file_obj([self.filepathout, self.hdfdatapathout]) - if self.nrows is not None: - fID.nRows = self.nrows - else: - self.nrows = fID.nRows - - if self.ncols is not None: - fID.nCols = self.ncols - else: - self.ncols = fID.nCols - return fID - else: - return None - - def opt_lambda(self,chunksize=0,saturation_protect=True,automask=True, backsub = False, - target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True): - - target_weights = np.asarray(target_weights) - - def loptfunc(lam,d2,tw,dthresh): - temp = (d2 > dthresh).choose(dthresh, d2) - dw = np.exp(-(temp) / lam ** 2) - w = np.sum(dw, axis=2) + 1e-12 - - metric = np.mean(np.abs(tw - 1.0 / w)) - return metric - - - @numba.njit(fastmath=True, cache=True,parallel=True) - def d2norm(d2, n2, dij, sigma): - shp = d2.shape - s2 = sigma**2 - for j in numba.prange(shp[0]): - for i in range(shp[1]): - for q in range(shp[2]): - if n2[j,i,q] > 0: - ii = dij[j,i,q,1] - jj = dij[j,i,q,0] - s2_12 = (s2[j,i]+s2[jj,ii]) - d2[j,i,q] -= n2[j,i,q] * s2_12 - d2[j,i,q] /= s2_12*np.sqrt(2.0*n2[j,i,q]) - - patternfile = self.getinfileobj() - patternfile.read_header() - nrows = np.uint64(self.nrows) #np.uint64(patternfile.nRows) - ncols = np.uint64(self.ncols) #np.uint64(patternfile.nCols) - - pwidth = np.uint64(patternfile.patternW) - pheight = np.uint64(patternfile.patternH) - phw = pheight * pwidth - - nn = 1 - if chunksize <= 0: - chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 4GB - chunksize = np.minimum(nrows,chunksize) - print("Chunk size set to nrows:", int(chunksize)) - chunksize = np.int64(chunksize) - - nn = np.uint64(nn) - - if (automask is True) and (self.mask is None): - self.mask = (self.automask(pheight,pwidth)) - if self.mask is None: - self.mask = np.ones((pheight,pwidth),dtype=np.uint8) - - self.mask = (self.mask).astype(np.uint8) - indices = np.asarray((self.mask.flatten().nonzero())[0],np.uint64) - - sigma = np.zeros((nrows,ncols),dtype=np.float32)+1e24 - colstartcount = np.asarray([0,ncols],dtype=np.int64) - - - dthresh = np.float32(dthresh) - lamopt_values = [] - - for j in range(0,nrows,chunksize): - print('Block',j) - #rowstartread = np.int64(max(0,j - nn)) - rowstartread = np.int64(j) - rowend = min(j + chunksize + nn,nrows) - rowcountread = np.int64(rowend - rowstartread) - data, xyloc = patternfile.read_data(patStartCount=[[0,rowstartread],[ncols,rowcountread]], - convertToFloat=True,returnArrayOnly=True) - - shp = data.shape - - if backsub is True: - data = self.backsub(data) - #back = np.mean(data, axis=0) - #back -= np.mean(back) - #data -= back - data = data.reshape(shp[0], phw) - - rowstartcount = np.asarray([0,rowcountread],dtype=np.int64) - sigchunk, (d2,n2, dij) = self.sigma_numba(data,nn,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect) - tmp = (sigma[j:j + rowstartcount[1],:] < sigchunk).choose( sigchunk, sigma[j:j + rowstartcount[1],:]) - sigma[j:j + rowstartcount[1],:] = tmp - - - d2norm(d2, n2, dij, sigchunk) - lamopt_values_chnk = [] - for tw in target_weights: - lam = 1.0 - lambopt1 = opt.minimize(loptfunc,lam,args=(d2,tw,dthresh),method='Nelder-Mead', - bounds = [[0.001, 10.0]],options={'fatol': 0.0001}) - lamopt_values_chnk.append(lambopt1['x']) - - - lamopt_values.append(lamopt_values_chnk) - lamopt_values = np.asarray(lamopt_values) - print("Range of lambda values: ", np.mean(lamopt_values, axis = 0).flatten()) - print("Optimal Choice: ", np.median(np.mean(lamopt_values, axis = 0))) - if autoupdate == True: - self.lam = np.median(np.mean(lamopt_values, axis = 0)) - if self.sigma is None: - self.sigma = sigma - return np.mean(lamopt_values, axis = 0).flatten() - - def calcnlpar(self, chunksize=0, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True, - filename=None, fileout=None, reset_sigma=True, backsub = False, rescale = False): - - if lam is not None: - self.lam = lam - - if dthresh is not None: - self.dthresh = dthresh - - if searchradius is not None: - self.searchradius = searchradius - - lam = np.float32(self.lam) - dthresh = np.float32(self.dthresh) - sr = np.int64(self.searchradius) - - if filename is not None: - self.setfile(filepath=filename) - - if reset_sigma: - self.sigma = None - - patternfile = self.getinfileobj() - - #if filepathout is not None: - self.setoutfile(patternfile, filepath=fileout) - - patternfileout = self.getoutfileobj() - - - nrows = np.int64(self.nrows)#np.int64(patternfile.nRows) - ncols = np.int64(self.ncols)#np.int64(patternfile.nCols) - if patternfileout.nCols is None: - patternfileout.nCols = ncols - if patternfileout.nRows is None: - patternfileout.nRows = nrows - - - pwidth = np.int64(patternfile.patternW) - pheight = np.int64(patternfile.patternH) - phw = pheight*pwidth - - if chunksize <= 0: - chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 8GB - chunksize = np.minimum(nrows, chunksize) - print("Chunk size set to nrows:", int(chunksize)) - chunksize = np.int64(chunksize) - if (automask is True) and (self.mask is None): - self.mask = (self.automask(pheight,pwidth)) - if self.mask is None: - self.mask = np.ones((pheight,pwidth), dtype=np.uint8) - - indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64) - calcsigma = False - if self.sigma is None: - calcsigma = True - self.sigma = np.zeros((nrows, ncols), dtype=np.float32)+1e24 - - - if np.asarray(self.sigma).size == 1: - tmp = np.asarray(self.sigma)[0] - self.sigma = np.zeros((nrows, ncols), dtype=np.float32)+tmp - calcsigma = False - - shpsigma = np.asarray(self.sigma).shape - if (shpsigma[0] != nrows) and (shpsigma[1] != ncols): - self.sigma = np.zeros((nrows,ncols),dtype=np.float32) + 1e24 - calcsigma = True - - - sigma = np.asarray(self.sigma) - scalemethod = 'clip' - if rescale == True: - if np.issubdtype(patternfileout.filedatatype, np.integer): - mxval = np.iinfo(patternfileout.filedatatype).max - scalemethod = 'fullscale' - else: # not int, so no rescale. - rescale = False - - nthreadpos = numba.get_num_threads() - #numba.set_num_threads(36) - colstartcount = np.asarray([0,ncols],dtype=np.int64) - print(lam, sr, dthresh) - - for j in range(0,nrows,chunksize): - print('Row start', j) - - rowstartread = np.int64(max(0, j-sr)) - rowend = min(j + chunksize+sr,nrows) - - if (rowend - rowstartread) < (2*sr+1): - rowstartread = np.int64(max(0, rowend - (2*sr+1))) - rowcountread = np.int64(rowend-rowstartread) - data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]], - convertToFloat=True,returnArrayOnly=True) - - shpdata = data.shape - - if backsub is True: - data = self.backsub(data) - - - data = data.reshape(shpdata[0], phw) - - rowstartcount = np.asarray([0,rowcountread],dtype=np.int64) - if calcsigma is True: - sigchunk, tmp = self.sigma_numba(data,1,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect) - del tmp - tmp = (sigma[rowstartread:rowend,:] < sigchunk).choose(sigchunk,sigma[rowstartread:rowend,:]) - sigma[rowstartread:rowend,:] = tmp - else: - sigchunk = sigma[rowstartread:rowend,:] - - #dataout = data - - dataout = self.nlpar_nb(data,lam, sr, dthresh, sigchunk, - rowcountread,ncols,indices,saturation_protect) - - dataout = dataout.reshape(rowcountread, ncols, phw) - dataout = dataout[j-rowstartread:, :, : ] - shpout = dataout.shape - dataout = dataout.reshape(shpout[0]*shpout[1], pheight, pwidth) - if rescale == True: - for i in range(dataout.shape[0]): - temp = dataout[i,:,:] - temp -= temp.min() - temp *= np.float32(mxval)/temp.max() - dataout[i,:,:] = temp - - patternfileout.write_data(newpatterns=dataout,patStartCount = [[0,j], [ncols, shpout[0]]], - flt2int='clip',scalevalue=1.0 ) - #self.patternfileout.write_data(newpatterns=dataout,patStartCount=[j*ncols,shpout[0]*shpout[1]], - # flt2int='clip',scalevalue=1.0 ) - #return dataout - #sigma[j:j+rowstartcount[1],:] += \ - # sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:] - - numba.set_num_threads(nthreadpos) - return str(patternfileout.filepath) - - def calcsigma(self,chunksize=0,nn=1,saturation_protect=True,automask=True): - - patternfile = self.getinfileobj() - - - nrows = np.int64(self.nrows)#np.uint64(patternfile.nRows) - ncols = np.int64(self.ncols)#np.uint64(patternfile.nCols) - - pwidth = np.uint64(patternfile.patternW) - pheight = np.uint64(patternfile.patternH) - phw = pheight*pwidth - - if chunksize <= 0: - chunksize = np.round(2e9/phw/ncols) # keeps the chunk at about 8GB - chunksize = np.minimum(nrows,chunksize) - print("Chunk size set to nrows:", int(chunksize)) - chunksize = np.int64(chunksize) - - - nn = np.uint64(nn) - - if (automask is True) and (self.mask is None): - self.mask = (self.automask(pheight,pwidth)) - if self.mask is None: - self.mask = np.ones((pheight,pwidth), dytype=np.uint8) - - indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64) - - sigma = np.zeros((nrows, ncols), dtype=np.float32) - #d_nn = np.zeros((nrows, ncols, int((2*nn+1)**2)), dtype=np.float32) - colstartcount = np.asarray([0,ncols],dtype=np.int64) - - for j in range(0,nrows,chunksize): - rowstartread = np.int64(max(0, j-nn)) - rowend = min(j + chunksize+nn,nrows) - if (rowend - rowstartread) < (3): - rowstartread = np.int64(max(0, rowend - (3))) - rowcountread = np.int64(rowend-rowstartread) - data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]], - convertToFloat=True,returnArrayOnly=True) - - shp = data.shape - data = data.reshape(data.shape[0], phw) - - #data = None - if rowend == nrows: - rowstartcount = np.asarray([j-rowstartread,rowcountread - (j-rowstartread) ], dtype=np.int64) - else: - rowstartcount = np.asarray([j-rowstartread,chunksize ], dtype=np.int64) - - sigchunk, temp = self.sigma_numba(data,nn, rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect) - - sigma[j:j+rowstartcount[1],:] += \ - sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:] - - return sigma - - def backsub(self, data): - # This function will fit a 2D gaussian on top of a plane to the averaged set of patterns (data) that is provided. - # It will automatically use whatever mask is defined for valid data. - # If the gaussian fit fails to converge, it will fall back to just using the mean set of patterns for the background - # with a warning. - - - def gaussian_surf(x, y, a, x0, y0, sigx, sigy, c, d, e): - # equation for 2D gaussian on top of a plane. - return a * np.exp(- ((x - x0) ** 2) / (2.0 * sigx ** 2) - ((y - y0) ** 2) / (2.0 * sigy ** 2)) + c + d * x + e * y - - def fit_gauss(M, *args): - # helper function - x, y = M - #arr = np.zeros(x.shape) - return gaussian_surf(x, y, *args) - - back = np.mean(data, axis=0) # start with the mean of all the data - # now fit a 2D gaussian sitting on a plane. See fuction def above. - nx = data.shape[-1] - ny = data.shape[-2] - x = np.arange(nx, dtype=float) - x = (np.broadcast_to(x.reshape(1,nx), (ny, nx))) - y = np.arange(ny, dtype=float) - y = (np.broadcast_to(y, (nx, ny)).T) - x = x.ravel() - y = y.ravel() - - # need to grab only the values that are in the mask. - wh = np.nonzero(self.mask.ravel())[0] - xwh = x[wh] - ywh = y[wh] - xywh = np.vstack((xwh, ywh)) - zwh = (back.ravel())[wh] - whmx = np.unravel_index(back.argmax(), back.shape) - minz = zwh.min() - # initialize a guess for the parameters. - # [gauss amplitude, max loc x, max loc y, sigx, sigy, const offset, slope x, slope y] - p0 = [(zwh.max() - zwh.min()), whmx[1], whmx[0], nx/2.355, ny/2.355, minz, 0, 0] - try: - popt, pcov = opt.curve_fit(fit_gauss, xywh, zwh, p0) - backfit = (gaussian_surf(x, y, *popt)).reshape(ny, nx) - #print(p0, popt) - except RuntimeError: - print('Warning: no convergence on back subtract ... using mean of the patterns.') - print('This may not be ideal for scans with few grains across the width of the scan.') - backfit = back - backfit -= np.mean(backfit) - #f, axarr = plt.subplots(1, 3) - #f.set_size_inches(10, 4) - #axarr[0].imshow(data[0,:,:].squeeze(), cmap='gray') - #axarr[1].imshow(data[0,:,:].squeeze() - backfit, cmap='gray') - #axarr[2].imshow(backfit, cmap='gray') - - data -= backfit - return data - - @staticmethod - def automask( h,w ): - r = (min(h,w)*0.98*0.5) - x = np.arange(w, dtype=np.float32) - x = np.minimum(x , (w-x)) - x = x.reshape(1,w) - y = np.arange(h, dtype=np.float32) - y = np.minimum(y , (h - y)) - y = y.reshape(h,1) - - mask =np.sqrt(y**2 + x**2) - mask = (mask < r).astype(np.uint8) - mask = np.roll(mask, (int(h/2), int(w/2) ), axis=(0,1)) - - return mask - - @staticmethod - @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True) - def sigma_numba(data, nn, nrows, ncols, rowstartcount, colstartcount, indices, saturation_protect=True): - sigma = np.zeros((nrows,ncols), dtype = np.float32) - dout = np.zeros((nrows,ncols,(nn*2+1)**2), dtype=np.float32) - nout = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2),dtype=np.float32) - dij = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2, 2),dtype=np.uint64) - shpdata = data.shape - - n0 = np.float32(shpdata[-1]) - shpind = indices.shape - mxval = np.max(data) - if saturation_protect == False: - mxval += 1.0 - else: - mxval *= 0.9961 - - for j in numba.prange(rowstartcount[0], rowstartcount[0]+rowstartcount[1]): - nn_r_start = j - nn if (j - nn) >= 0 else 0 - nn_r_end = (j + nn if (j + nn) < nrows else nrows-1)+1 - for i in numba.prange(colstartcount[0], colstartcount[0]+colstartcount[1]): - nn_c_start = i - nn if (i - nn) >= 0 else 0 - nn_c_end = (i + nn if (i + nn) < ncols else ncols - 1) + 1 - - mind = np.float32(1e24) - indx_0 = i+ncols*j - count = 0 - for j_nn in range(nn_r_start,nn_r_end ): - for i_nn in range(nn_c_start,nn_c_end): - dij[j,i,count,0] = np.uint64(j_nn) - dij[j,i,count,1] = np.uint64(i_nn) # want to save this for labmda optimization - indx_nn = i_nn+ncols*j_nn - d2 = np.float32(0.0) - n2 = np.float32(1.0e-12) - nout[j,i,count] = n0 # want to save this for labmda optimization - if not((i == i_nn) and (j == j_nn)): - for q in range(shpind[0]): - d0 = data[indx_0, indices[q]] - d1 = data[indx_nn, indices[q]] - if (d1 < mxval) and (d0 < mxval): - n2 += 1.0 - d2 += (d0 - d1)**2 - nout[j,i,count] = n2 - - if d2 >= 1.e-3: #sometimes EDAX collects the same pattern twice - s0 = d2 / np.float32(n2 * 2.0) - if s0 < mind: - mind = s0 - dout[j,i,count] = d2 # want to save this for labmda optimization - - count += 1 - - sigma[j,i] = np.sqrt(mind) - #if sigma[j,i] > 1e12: - # print(sigma[j,i], dout[j,i,:], nout[i,j,:]) - return sigma,( dout, nout, dij) - - @staticmethod - @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True) - def nlpar_nb(data,lam, sr, dthresh, sigma, nrows,ncols,indices,saturation_protect=True): - def getpairid(idx0, idx1): - idx0_t = int(idx0) - idx1_t = int(idx1) - if idx0 < idx1: - pairid = idx0_t + (idx1_t << 32) - else: - pairid = idx1_t + (idx0_t << 32) - return numba.uint64(pairid) - - lam2 = 1.0 / lam**2 - dataout = np.zeros_like(data, np.float32) - shpdata = data.shape - shpind = indices.shape - winsz = np.int32((2*sr+1)**2) - - - mxval = np.max(data) - if saturation_protect == False: - mxval += np.float32(1.0) - else: - mxval *= np.float32(0.999) - for i in numba.prange(ncols): - winstart_x = max((i - sr),0) - max((i + sr - (ncols - 1)),0) - winend_x = min((i + sr),(ncols - 1)) + max((sr - i),0) + 1 - pairdict = numba.typed.Dict.empty(key_type=numba.core.types.uint64,value_type=numba.core.types.float32) - for j in range(nrows): - winstart_y = max((j - sr),0) - max((j + sr - (nrows - 1)),0) - winend_y = min((j + sr),(nrows - 1)) + max((sr - j),0) + 1 - - weights = np.zeros(winsz,dtype=np.float32) - pindx = np.zeros(winsz,dtype=np.uint64) - - indx_0 = i + ncols * j - sigma0 = sigma[j,i]**2 - - counter = 0 - for j_nn in range(winstart_y,winend_y): - for i_nn in range(winstart_x,winend_x): - indx_nn = i_nn + ncols * j_nn - pindx[counter] = indx_nn +"""Non-local pattern averaging and re-indexing (NLPAR).""" - if indx_nn == indx_0: - weights[counter] = np.float32(-1.0e6) - else: - pairid = getpairid(indx_0, indx_nn) - if pairid in pairdict: - weights[counter] = pairdict[pairid] - else: - sigma1 = sigma[j_nn,i_nn]**2 - d2 = np.float32(0.0) - n2 = np.float32(0.0) - for q in range(shpind[0]): - d0 = data[indx_0,indices[q]] - d1 = data[indx_nn,indices[q]] - if (d1 < mxval) and (d0 < mxval): - n2 += np.float32(1.0) - d2 += (d0 - d1) ** np.int32(2) - d2 -= n2*(sigma0+sigma1) - dnorm = (sigma1 + sigma0)*np.sqrt(np.float32(2.0)*n2) - if dnorm > np.float32(1.e-8): - d2 /= dnorm - else: - d2 = np.float32(1e6)*n2 - weights[counter] = d2 - pairdict[pairid] = numba.float32(d2) - counter += 1 - #print('________________') - # end of window scanning - sum = np.float32(0.0) - for i_nn in range(winsz): - weights[i_nn] = np.maximum(weights[i_nn]-dthresh, numba.float32(0.0)) - weights[i_nn] = np.exp(-1.0 * weights[i_nn] * lam2) - sum += weights[i_nn] +from pyebsdindex import _ray_installed +from pyebsdindex import _pyopencl_installed - for i_nn in range(winsz): - indx_nn = pindx[i_nn] - weights[i_nn] /= sum - #print(weights[i_nn], ' \n') - for q in range(shpdata[1]): - dataout[indx_0, q] += data[indx_nn, q]*weights[i_nn] - #print('_______', '\n') - return dataout +if _ray_installed and _pyopencl_installed: + from pyebsdindex.opencl.nlpar_clray import NLPAR +elif _pyopencl_installed and not _ray_installed: + from pyebsdindex.opencl.nlpar_cl import NLPAR +else: + from pyebsdindex.nlpar_cpu import NLPAR +__all__ = [ + "NLPAR", +] \ No newline at end of file diff --git a/pyebsdindex/nlpar_cpu.py b/pyebsdindex/nlpar_cpu.py new file mode 100644 index 0000000..129968a --- /dev/null +++ b/pyebsdindex/nlpar_cpu.py @@ -0,0 +1,794 @@ +# This software was developed by employees of the US Naval Research Laboratory (NRL), an +# agency of the Federal Government. Pursuant to title 17 section 105 of the United States +# Code, works of NRL employees are not subject to copyright protection, and this software +# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no +# responsibility whatsoever for its use by other parties, and makes no guarantees, +# expressed or implied, about its quality, reliability, or any other characteristic. We +# would appreciate acknowledgment if the software is used. To the extent that NRL may hold +# copyright in countries other than the United States, you are hereby granted the +# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative +# works and distribute this software, in any medium, or authorize others to do so on your +# behalf, on a royalty-free basis throughout the world. You may improve, modify, and +# create derivative works of the software or any portion of the software, and you may copy +# and distribute such modifications or works. Modified works should carry a notice stating +# that you changed the software and should note the date and nature of any such change. +# Please explicitly acknowledge the US Naval Research Laboratory as the original source. +# This software can be redistributed and/or modified freely provided that any derivative +# works bear some notice that they are derived from it, and any modified versions bear +# some notice that they have been modified. +# +# Author: David Rowenhorst; +# The US Naval Research Laboratory Date: 22 May 2024 + +# For more info see +# Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019. + + + +from pathlib import Path +from timeit import default_timer as timer + +import numba +import numpy as np +import scipy.optimize as opt + +from pyebsdindex import ebsd_pattern + +#from os import environ +#environ["NUMBA_CACHE_DIR"] = str(tempdir) + + +__all__ = [ + "NLPAR", +] + + +class NLPAR: + def __init__(self, filename=None, lam=0.7, searchradius=3,dthresh=0.0, nrows = None, ncols = None, **kwargs): + self.lam = lam + self.searchradius = searchradius + self.dthresh = dthresh + self.filepath = None + self.hdfdatapath = None + self.filepathout = None + self.hdfdatapathout = None + #self.patternfile = None + #self.patternfileout = None + self.setfile(filename) + self.mask = None + self.sigma = None + self.sigmann = 1 + self.nrows = None + self.ncols = None + if nrows is not None: + self.nrows = nrows + if ncols is not None: + self.ncols = ncols + + + def setfile(self,filepath=None): + self.filepath = None + self.hdfdatapath = None + pathtemp = np.atleast_1d(filepath) + fpath = pathtemp[0] + hdf5path = None + if pathtemp.size > 1: + hdf5path = pathtemp[1] + if fpath is not None: + self.filepath = Path(fpath) + self.hdfdatapath = hdf5path + + def setoutfile(self, patternfile, filepath=None): + """Set the output file. + + Parameters + ---------- + patternfile + Input pattern file object from ebsd_pattern. + filepath + String. + + Notes + ----- + In the future I want to be able to specify the HDF5 data path to + store the output data, but that is proving to be a bit of a mess. + For now, a copy of the original HDF5 is made, and the NLPAR patterns + will be overwritten on top of the originals. + """ + self.filepathout = None + self.hdfdatapathout = None + pathtemp = np.atleast_1d(filepath) + fpath = pathtemp[0] + hdf5path = None + #if pathtemp.size > 1: + # hdf5path = pathtemp[1] + #print(fpath, hdf5path) + if fpath is not None: # the user has set an output file path. + self.filepathout = Path(fpath).expanduser().resolve() + self.hdfdatapathout = hdf5path + if patternfile.filetype != 'HDF5': #check that the input and output are not the same. + pathok = self.filepathout.exists() + if pathok: + pathok = not self.filepathout.samefile(patternfile.filepath) + if not pathok: + raise ValueError('Error: File input and output are exactly the same.') + return + + patternfile.copy_file([self.filepathout,self.hdfdatapathout], empty_data=True) + return # fpath and (maybe) hdf5 path were set manually. + else: # this is a hdf5 file + if self.hdfdatapathout is None: + patternfile.copy_file(self.filepathout, empty_data=True) + self.hdfdatapathout = patternfile.h5patdatpth + return + else: + patternfile.copy_file([self.filepathout, self.hdfdatapathout], empty_data=True) + return + + if patternfile is not None: # the user has set no path. + hdf5path = None + + if patternfile.filetype in ['UP', 'EBSP']: + p = Path(patternfile.filepath) + appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius) + newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix)) + patternfile.copy_file(newfilepath,empty_data=True) + + if patternfile.filetype == 'HDF5': + hdf5path_tmp = str(patternfile.h5patdatpth).split('/') + if hdf5path_tmp[0] == '': + hdf5path_org = hdf5path_tmp[1] + else: + hdf5path_org = hdf5path_tmp[0] + p = Path(patternfile.filepath) + appnd = "_NLPAR_l{:1.2f}".format(self.lam) + "sr{:d}".format(self.searchradius) + hdf5path = hdf5path_org+appnd + newfilepath = str(p.parent / Path(p.stem + appnd + p.suffix)) + #patternfile.copy_file([newfilepath, hdf5path_org], newh5path=hdf5path) + patternfile.copy_file([newfilepath], empty_data=True) + hdf5path = patternfile.h5patdatpth + + self.filepathout = newfilepath + self.hdfdatapathout = hdf5path + return + + def getinfileobj(self): + if self.filepath is not None: + fID = ebsd_pattern.get_pattern_file_obj([self.filepath, self.hdfdatapath]) + if (fID.nRows is not None): + if (self.nrows is None): + self.nrows = fID.nRows + else: + fID.nRows = self.nrows + + if (fID.nCols is not None): + if (self.ncols is None): + self.ncols = fID.nCols + else: + fID.nCols = self.ncols + + return fID + + else: + return None + + def getoutfileobj(self): + if self.filepathout is not None: + fID = ebsd_pattern.get_pattern_file_obj([self.filepathout, self.hdfdatapathout]) + if self.nrows is not None: + fID.nRows = self.nrows + else: + self.nrows = fID.nRows + + if self.ncols is not None: + fID.nCols = self.ncols + else: + self.ncols = fID.nCols + return fID + else: + return None + + def opt_lambda(self,chunksize=0,saturation_protect=True,automask=True, backsub = False, + target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs): + + target_weights = np.asarray(target_weights) + + def loptfunc(lam,d2,tw,dthresh): + temp = (d2 > dthresh).choose(dthresh, d2) + dw = np.exp(-(temp) / lam ** 2) + w = np.sum(dw, axis=2) + 1e-12 + + metric = np.mean(np.abs(tw - 1.0 / w)) + return metric + + + @numba.njit(fastmath=True, cache=True,parallel=True) + def d2norm(d2, n2, dij, sigma): + shp = d2.shape + s2 = sigma**2 + for j in numba.prange(shp[0]): + for i in range(shp[1]): + for q in range(shp[2]): + if n2[j,i,q] > 0: + ii = dij[j,i,q,1] + jj = dij[j,i,q,0] + s2_12 = (s2[j,i]+s2[jj,ii]) + d2[j,i,q] -= n2[j,i,q] * s2_12 + d2[j,i,q] /= s2_12*np.sqrt(2.0*n2[j,i,q]) + + patternfile = self.getinfileobj() + patternfile.read_header() + nrows = np.uint64(self.nrows) #np.uint64(patternfile.nRows) + ncols = np.uint64(self.ncols) #np.uint64(patternfile.nCols) + + pwidth = np.uint64(patternfile.patternW) + pheight = np.uint64(patternfile.patternH) + phw = pheight * pwidth + + nn = 1 + if chunksize <= 0: + chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 4GB + chunksize = np.minimum(nrows,chunksize) + print("Chunk size set to nrows:", int(chunksize)) + chunksize = np.int64(chunksize) + + nn = np.uint64(nn) + + if (automask is True) and (self.mask is None): + self.mask = (self.automask(pheight,pwidth)) + if self.mask is None: + self.mask = np.ones((pheight,pwidth),dtype=np.uint8) + + self.mask = (self.mask).astype(np.uint8) + indices = np.asarray((self.mask.flatten().nonzero())[0],np.uint64) + + sigma = np.zeros((nrows,ncols),dtype=np.float32)+1e24 + colstartcount = np.asarray([0,ncols],dtype=np.int64) + + + dthresh = np.float32(dthresh) + lamopt_values = [] + + for j in range(0,nrows,chunksize): + print('Block',j) + #rowstartread = np.int64(max(0,j - nn)) + rowstartread = np.int64(j) + rowend = min(j + chunksize + nn,nrows) + rowcountread = np.int64(rowend - rowstartread) + data, xyloc = patternfile.read_data(patStartCount=[[0,rowstartread],[ncols,rowcountread]], + convertToFloat=True,returnArrayOnly=True) + + shp = data.shape + + if backsub is True: + data = self.backsub(data) + #back = np.mean(data, axis=0) + #back -= np.mean(back) + #data -= back + data = data.reshape(shp[0], phw) + + rowstartcount = np.asarray([0,rowcountread],dtype=np.int64) + sigchunk, (d2,n2, dij) = self.sigma_numba(data,nn,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect) + tmp = (sigma[j:j + rowstartcount[1],:] < sigchunk).choose( sigchunk, sigma[j:j + rowstartcount[1],:]) + sigma[j:j + rowstartcount[1],:] = tmp + + + d2norm(d2, n2, dij, sigchunk) + + + lamopt_values_chnk = [] + for tw in target_weights: + lam = 1.0 + lambopt1 = opt.minimize(loptfunc,lam,args=(d2,tw,dthresh),method='Nelder-Mead', + bounds = [[0.001, 10.0]],options={'fatol': 0.0001}) + lamopt_values_chnk.append(lambopt1['x']) + + + lamopt_values.append(lamopt_values_chnk) + lamopt_values = np.asarray(lamopt_values) + print("Range of lambda values: ", np.mean(lamopt_values, axis = 0).flatten()) + print("Optimal Choice: ", np.median(np.mean(lamopt_values, axis = 0))) + if autoupdate == True: + self.lam = np.median(np.mean(lamopt_values, axis = 0)) + if self.sigma is None: + self.sigma = sigma + return np.mean(lamopt_values, axis = 0).flatten() + + def calcnlpar(self, chunksize=0, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True, + filename=None, fileout=None, reset_sigma=False, backsub = False, rescale = False, **kwargs): + + if lam is not None: + self.lam = lam + + if dthresh is not None: + self.dthresh = dthresh + + if searchradius is not None: + self.searchradius = searchradius + + lam = np.float32(self.lam) + dthresh = np.float32(self.dthresh) + sr = np.int64(self.searchradius) + + if filename is not None: + self.setfile(filepath=filename) + + + + + patternfile = self.getinfileobj() + + #if filepathout is not None: + self.setoutfile(patternfile, filepath=fileout) + + patternfileout = self.getoutfileobj() + + + nrows = np.int64(self.nrows)#np.int64(patternfile.nRows) + ncols = np.int64(self.ncols)#np.int64(patternfile.nCols) + if patternfileout.nCols is None: + patternfileout.nCols = ncols + if patternfileout.nRows is None: + patternfileout.nRows = nrows + + + pwidth = np.int64(patternfile.patternW) + pheight = np.int64(patternfile.patternH) + phw = pheight*pwidth + + if chunksize <= 0: + chunksize = np.maximum(1, np.round(1e9 / phw / ncols) ) # keeps the chunk at about 8GB + chunksize = np.minimum(nrows, chunksize) + print("Chunk size set to nrows:", int(chunksize)) + chunksize = np.int64(chunksize) + + + + if (automask is True) and (self.mask is None): + self.mask = (self.automask(pheight,pwidth)) + if self.mask is None: + self.mask = np.ones((pheight,pwidth), dtype=np.uint8) + + indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64) + calcsigma = False + if self.sigma is None: + calcsigma = True + self.sigma = np.zeros((nrows, ncols), dtype=np.float32)+1e24 + + if reset_sigma: + self.sigma = None + + if (np.asarray(self.sigma).size == 1) and (self.sigma is not None): + tmp = np.asarray(self.sigma)[0] + self.sigma = np.zeros((nrows, ncols), dtype=np.float32)+tmp + calcsigma = False + + shpsigma = np.asarray(self.sigma).shape + if (shpsigma[0] != nrows) and (shpsigma[1] != ncols): + self.sigma = np.zeros((nrows,ncols),dtype=np.float32) + 1e24 + calcsigma = True + + sigma = np.asarray(self.sigma) + + scalemethod = 'clip' + if rescale == True: + if np.issubdtype(patternfileout.filedatatype, np.integer): + mxval = np.iinfo(patternfileout.filedatatype).max + scalemethod = 'fullscale' + else: # not int, so no rescale. + rescale = False + + nthreadpos = numba.get_num_threads() + #numba.set_num_threads(36) + colstartcount = np.asarray([0,ncols],dtype=np.int64) + print(lam, sr, dthresh) + + for j in range(0,nrows,chunksize): + print('Row start', j) + + rowstartread = np.int64(max(0, j-sr)) + rowend = min(j + chunksize+sr,nrows) + + if (rowend - rowstartread) < (2*sr+1): + rowstartread = np.int64(max(0, rowend - (2*sr+1))) + rowcountread = np.int64(rowend-rowstartread) + data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]], + convertToFloat=True,returnArrayOnly=True) + + shpdata = data.shape + + if backsub is True: + data = self.backsub(data) + + + data = data.reshape(shpdata[0], phw) + + rowstartcount = np.asarray([0,rowcountread],dtype=np.int64) + if calcsigma is True: + sigchunk, tmp = self.sigma_numba(data,1,rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect) + del tmp + tmp = (sigma[rowstartread:rowend,:] < sigchunk).choose(sigchunk,sigma[rowstartread:rowend,:]) + sigma[rowstartread:rowend,:] = tmp + else: + sigchunk = sigma[rowstartread:rowend,:] + + #dataout = data + + dataout = self.nlpar_nb(data,lam, sr, dthresh, sigchunk, + rowcountread,ncols,indices,saturation_protect) + + dataout = dataout.reshape(rowcountread, ncols, phw) + dataout = dataout[j-rowstartread:, :, : ] + shpout = dataout.shape + dataout = dataout.reshape(shpout[0]*shpout[1], pheight, pwidth) + if rescale == True: + for i in range(dataout.shape[0]): + temp = dataout[i,:,:] + temp -= temp.min() + temp *= np.float32(mxval)/temp.max() + dataout[i,:,:] = temp + + patternfileout.write_data(newpatterns=dataout,patStartCount = [[0,j], [ncols, shpout[0]]], + flt2int='clip',scalevalue=1.0 ) + #self.patternfileout.write_data(newpatterns=dataout,patStartCount=[j*ncols,shpout[0]*shpout[1]], + # flt2int='clip',scalevalue=1.0 ) + #return dataout + #sigma[j:j+rowstartcount[1],:] += \ + # sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:] + + numba.set_num_threads(nthreadpos) + return str(patternfileout.filepath) + + def calcsigma(self,chunksize=0,nn=1,saturation_protect=True,automask=True): + self.sigmann = nn + patternfile = self.getinfileobj() + + + nrows = np.int64(self.nrows)#np.uint64(patternfile.nRows) + ncols = np.int64(self.ncols)#np.uint64(patternfile.nCols) + + pwidth = np.uint64(patternfile.patternW) + pheight = np.uint64(patternfile.patternH) + phw = pheight*pwidth + + if chunksize <= 0: + chunksize = np.round(2e9/phw/ncols) # keeps the chunk at about 8GB + chunksize = np.minimum(nrows,chunksize) + print("Chunk size set to nrows:", int(chunksize)) + chunksize = np.int64(chunksize) + + + nn = np.uint64(nn) + + if (automask is True) and (self.mask is None): + self.mask = (self.automask(pheight,pwidth)) + if self.mask is None: + self.mask = np.ones((pheight,pwidth), dtype=np.uint8) + + indices = np.asarray( (self.mask.flatten().nonzero())[0], np.uint64) + + sigma = np.zeros((nrows, ncols), dtype=np.float32) + #d_nn = np.zeros((nrows, ncols, int((2*nn+1)**2)), dtype=np.float32) + colstartcount = np.asarray([0,ncols],dtype=np.int64) + + for j in range(0,nrows,chunksize): + rowstartread = np.int64(max(0, j-nn)) + rowend = min(j + chunksize+nn,nrows) + if (rowend - rowstartread) < (3): + rowstartread = np.int64(max(0, rowend - (3))) + rowcountread = np.int64(rowend-rowstartread) + data, xyloc = patternfile.read_data(patStartCount = [[0,rowstartread], [ncols,rowcountread]], + convertToFloat=True,returnArrayOnly=True) + + shp = data.shape + data = data.reshape(data.shape[0], phw) + + #data = None + if rowend == nrows: + rowstartcount = np.asarray([j-rowstartread,rowcountread - (j-rowstartread) ], dtype=np.int64) + else: + rowstartcount = np.asarray([j-rowstartread,chunksize ], dtype=np.int64) + + sigchunk, temp = self.sigma_numba(data,nn, rowcountread,ncols,rowstartcount,colstartcount,indices,saturation_protect) + + sigma[j:j+rowstartcount[1],:] += \ + sigchunk[rowstartcount[0]:rowstartcount[0]+rowstartcount[1],:] + + return sigma + + def backsub(self, data): + # This function will fit a 2D gaussian on top of a plane to the averaged set of patterns (data) that is provided. + # It will automatically use whatever mask is defined for valid data. + # If the gaussian fit fails to converge, it will fall back to just using the mean set of patterns for the background + # with a warning. + + + def gaussian_surf(x, y, a, x0, y0, sigx, sigy, c, d, e): + # equation for 2D gaussian on top of a plane. + return a * np.exp(- ((x - x0) ** 2) / (2.0 * sigx ** 2) - ((y - y0) ** 2) / (2.0 * sigy ** 2)) + c + d * x + e * y + + def fit_gauss(M, *args): + # helper function + x, y = M + #arr = np.zeros(x.shape) + return gaussian_surf(x, y, *args) + + back = np.mean(data, axis=0) # start with the mean of all the data + # now fit a 2D gaussian sitting on a plane. See fuction def above. + nx = data.shape[-1] + ny = data.shape[-2] + x = np.arange(nx, dtype=float) + x = (np.broadcast_to(x.reshape(1,nx), (ny, nx))) + y = np.arange(ny, dtype=float) + y = (np.broadcast_to(y, (nx, ny)).T) + x = x.ravel() + y = y.ravel() + + # need to grab only the values that are in the mask. + wh = np.nonzero(self.mask.ravel())[0] + xwh = x[wh] + ywh = y[wh] + xywh = np.vstack((xwh, ywh)) + zwh = (back.ravel())[wh] + whmx = np.unravel_index(back.argmax(), back.shape) + minz = zwh.min() + # initialize a guess for the parameters. + # [gauss amplitude, max loc x, max loc y, sigx, sigy, const offset, slope x, slope y] + p0 = [(zwh.max() - zwh.min()), whmx[1], whmx[0], nx/2.355, ny/2.355, minz, 0, 0] + try: + popt, pcov = opt.curve_fit(fit_gauss, xywh, zwh, p0) + backfit = (gaussian_surf(x, y, *popt)).reshape(ny, nx) + #print(p0, popt) + except RuntimeError: + print('Warning: no convergence on back subtract ... using mean of the patterns.') + print('This may not be ideal for scans with few grains across the width of the scan.') + backfit = back + backfit -= np.mean(backfit) + #f, axarr = plt.subplots(1, 3) + #f.set_size_inches(10, 4) + #axarr[0].imshow(data[0,:,:].squeeze(), cmap='gray') + #axarr[1].imshow(data[0,:,:].squeeze() - backfit, cmap='gray') + #axarr[2].imshow(backfit, cmap='gray') + + data -= backfit + return data + + @staticmethod + def automask( h,w ): + r = (min(h,w)*0.98*0.5) + x = np.arange(w, dtype=np.float32) + x = np.minimum(x , (w-x)) + x = x.reshape(1,w) + y = np.arange(h, dtype=np.float32) + y = np.minimum(y , (h - y)) + y = y.reshape(h,1) + + mask =np.sqrt(y**2 + x**2) + mask = (mask < r).astype(np.uint8) + mask = np.roll(mask, (int(h/2), int(w/2) ), axis=(0,1)) + + return mask + + @staticmethod + @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True) + def sigma_numba(data, nn, nrows, ncols, rowstartcount, colstartcount, indices, saturation_protect=True): + sigma = np.zeros((nrows,ncols), dtype = np.float32) + dout = np.zeros((nrows,ncols,(nn*2+1)**2), dtype=np.float32) + nout = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2),dtype=np.float32) + dij = np.zeros((nrows,ncols,(nn * 2 + 1) ** 2, 2),dtype=np.uint64) + shpdata = data.shape + + n0 = np.float32(shpdata[-1]) + shpind = indices.shape + mxval = np.max(data) + if saturation_protect == False: + mxval += 1.0 + else: + mxval *= 0.9961 + + for j in numba.prange(rowstartcount[0], rowstartcount[0]+rowstartcount[1]): + nn_r_start = j - nn if (j - nn) >= 0 else 0 + nn_r_end = (j + nn if (j + nn) < nrows else nrows-1)+1 + for i in numba.prange(colstartcount[0], colstartcount[0]+colstartcount[1]): + nn_c_start = i - nn if (i - nn) >= 0 else 0 + nn_c_end = (i + nn if (i + nn) < ncols else ncols - 1) + 1 + + mind = np.float32(1e24) + indx_0 = i+ncols*j + count = 0 + for j_nn in range(nn_r_start,nn_r_end ): + for i_nn in range(nn_c_start,nn_c_end): + dij[j,i,count,0] = np.uint64(j_nn) + dij[j,i,count,1] = np.uint64(i_nn) # want to save this for labmda optimization + indx_nn = i_nn+ncols*j_nn + d2 = np.float32(0.0) + n2 = np.float32(1.0e-12) + nout[j,i,count] = n0 # want to save this for labmda optimization + if not((i == i_nn) and (j == j_nn)): + for q in range(shpind[0]): + d0 = data[indx_0, indices[q]] + d1 = data[indx_nn, indices[q]] + if (d1 < mxval) and (d0 < mxval): + n2 += 1.0 + d2 += (d0 - d1)**2 + nout[j,i,count] = n2 + + if d2 >= 1.e-3: #sometimes EDAX collects the same pattern twice + s0 = d2 / np.float32(n2 * 2.0) + if s0 < mind: + mind = s0 + dout[j,i,count] = d2 # want to save this for labmda optimization + + count += 1 + + sigma[j,i] = np.sqrt(mind) + #if sigma[j,i] > 1e12: + # print(sigma[j,i], dout[j,i,:], nout[i,j,:]) + return sigma,( dout, nout, dij) + + @staticmethod + @numba.jit(nopython=True,cache=True,fastmath=False,parallel=True) + def nlpar_nb(data,lam, sr, dthresh, sigma, nrows,ncols,indices,saturation_protect=True): + def getpairid(idx0, idx1): + idx0_t = int(idx0) + idx1_t = int(idx1) + if idx0 < idx1: + pairid = idx0_t + (idx1_t << 32) + else: + pairid = idx1_t + (idx0_t << 32) + return numba.uint64(pairid) + + lam2 = 1.0 / lam**2 + dataout = np.zeros_like(data, np.float32) + shpdata = data.shape + shpind = indices.shape + winsz = np.int32((2*sr+1)**2) + + + mxval = np.max(data) + if saturation_protect == False: + mxval += np.float32(1.0) + else: + mxval *= np.float32(0.999) + for i in numba.prange(ncols): + winstart_x = max((i - sr),0) - max((i + sr - (ncols - 1)),0) + winend_x = min((i + sr),(ncols - 1)) + max((sr - i),0) + 1 + pairdict = numba.typed.Dict.empty(key_type=numba.core.types.uint64,value_type=numba.core.types.float32) + for j in range(nrows): + winstart_y = max((j - sr),0) - max((j + sr - (nrows - 1)),0) + winend_y = min((j + sr),(nrows - 1)) + max((sr - j),0) + 1 + + weights = np.zeros(winsz,dtype=np.float32) + pindx = np.zeros(winsz,dtype=np.uint64) + + indx_0 = i + ncols * j + sigma0 = sigma[j,i]**2 + + counter = 0 + for j_nn in range(winstart_y,winend_y): + for i_nn in range(winstart_x,winend_x): + indx_nn = i_nn + ncols * j_nn + pindx[counter] = indx_nn + + if indx_nn == indx_0: + weights[counter] = np.float32(-1.0e6) + else: + pairid = getpairid(indx_0, indx_nn) + if pairid in pairdict: + weights[counter] = pairdict[pairid] + else: + sigma1 = sigma[j_nn,i_nn]**2 + d2 = np.float32(0.0) + n2 = np.float32(0.0) + for q in range(shpind[0]): + d0 = data[indx_0,indices[q]] + d1 = data[indx_nn,indices[q]] + if (d1 < mxval) and (d0 < mxval): + n2 += np.float32(1.0) + d2 += (d0 - d1) ** np.int32(2) + d2 -= n2*(sigma0+sigma1) + dnorm = (sigma1 + sigma0)*np.sqrt(np.float32(2.0)*n2) + if dnorm > np.float32(1.e-8): + d2 /= dnorm + else: + d2 = np.float32(1e6)*n2 + weights[counter] = d2 + pairdict[pairid] = numba.float32(d2) + counter += 1 + #print('________________') + # end of window scanning + sum = np.float32(0.0) + for i_nn in range(winsz): + + weights[i_nn] = np.maximum(weights[i_nn]-dthresh, numba.float32(0.0)) + weights[i_nn] = np.exp(-1.0 * weights[i_nn] * lam2) + sum += weights[i_nn] + + for i_nn in range(winsz): + indx_nn = pindx[i_nn] + weights[i_nn] /= sum + #print(weights[i_nn], ' \n') + for q in range(shpdata[1]): + dataout[indx_0, q] += data[indx_nn, q]*weights[i_nn] + #print('_______', '\n') + return dataout + + def _calcchunks(self, patdim, ncol, nrow, target_bytes=2e9, col_overlap=0, row_overlap=0, col_offset=0, row_offset=0): + + col_overlap = min(col_overlap, ncol - 1) + row_overlap = min(row_overlap, nrow - 1) + + byteperpat = patdim[-1] * patdim[-2] * 4 * 2 # assume a 4 byte float input and output array + byteperdataset = byteperpat * ncol * nrow + nchunks = int(np.ceil(byteperdataset / target_bytes)) + + ncolchunks = (max(np.round(np.sqrt(nchunks * float(ncol) / nrow)), 1)) + colstep = max((ncol / ncolchunks), 1) + ncolchunks = max(ncol / colstep, 1) + colstepov = min(colstep + 2 * col_overlap, ncol) + ncolchunks = max(int(np.ceil(ncolchunks)), 1) + colstep = max(int(np.round(colstep)), 1) + colstepov = min(colstep + 2 * col_overlap, ncol) + + nrowchunks = max(np.ceil(nchunks / ncolchunks), 1) + rowstep = max((nrow / nrowchunks), 1) + nrowchunks = max(nrow / rowstep, 1) + rowstepov = min(rowstep + 2 * row_overlap, nrow) + nrowchunks = max(int(np.ceil(nrowchunks)), 1) + rowstep = max(int(np.round(rowstep)), 1) + rowstepov = min(rowstep + 2 * row_overlap, nrow) + + # colchunks = np.round(np.arange(ncolchunks+1)*ncol/ncolchunks).astype(int) + colchunks = np.zeros((ncolchunks, 2), dtype=int) + colchunks[:, 0] = (np.arange(ncolchunks) * colstep).astype(int) + colchunks[:, 1] = colchunks[:, 0] + colstepov - int(col_overlap) + colchunks[:, 0] -= col_overlap + colchunks[0, 0] = 0; + + for i in range(ncolchunks - 1): + if colchunks[i + 1, 0] >= ncol: + colchunks = colchunks[0:i + 1, :] + + ncolchunks = colchunks.shape[0] + colchunks[-1, 1] = ncol + + colchunks += col_offset + + # colproc = np.zeros((ncolchunks, 2), dtype=int) + # if ncolchunks > 1: + # colproc[1:, 0] = col_overlap + # if ncolchunks > 1: + # colproc[0:, 1] = colchunks[:, 1] - colchunks[:, 0] - col_overlap + # colproc[-1, 1] = colchunks[-1, 1] - colchunks[-1, 0] + + # rowchunks = np.round(np.arange(nrowchunks + 1) * nrow / nrowchunks).astype(int) + rowchunks = np.zeros((nrowchunks, 2), dtype=int) + rowchunks[:, 0] = (np.arange(nrowchunks) * rowstep).astype(int) + rowchunks[:, 1] = rowchunks[:, 0] + rowstepov - int(row_overlap) + rowchunks[:, 0] -= row_overlap + rowchunks[0, 0] = 0; + + for i in range(nrowchunks - 1): + if rowchunks[i + 1, 0] >= nrow: + rowchunks = rowchunks[0:i + 1, :] + + nrowchunks = rowchunks.shape[0] + rowchunks[-1, 1] = nrow + + rowchunks += row_offset + + # rowproc = np.zeros((nrowchunks, 2), dtype=int) + # if nrowchunks > 1: + # rowproc[1:, 0] = row_overlap + # if nrowchunks > 1: + # rowproc[0:, 1] = rowchunks[:, 1] - rowchunks[:, 0] - row_overlap + # rowproc[-1, 1] = rowchunks[-1, 1] - rowchunks[-1, 0] + + return ncolchunks, nrowchunks, colchunks, rowchunks + + # def asciiupdate(self, nrow, ncol, completematrix): + # cm = completematrix + # ncdisplay = min(ncol, 80) + # nrow + + diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py index ad288fd..f849224 100644 --- a/pyebsdindex/opencl/band_detect_cl.py +++ b/pyebsdindex/opencl/band_detect_cl.py @@ -36,9 +36,7 @@ #from os import environ #environ['PYOPENCL_COMPILER_OUTPUT'] = '1' -tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) -tempdir = tempdir.joinpath('numba') -environ["NUMBA_CACHE_DIR"] = str(tempdir) + RADEG = 180.0/np.pi @@ -257,42 +255,35 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b # reform = False clvtypesize = 16 # this is the vector size to be used in the openCL implementation. - nImCL = np.int32(clvtypesize * (np.int64(np.ceil(nIm/clvtypesize)))) + nImCL = np.uint64(clvtypesize * (np.int64(np.ceil(nIm/clvtypesize)))) imtype = image.dtype + tict = timer() + #image_align = np.ones((shapeIm[1], shapeIm[2], nImCL), dtype = imtype) + #image_align[:,:,0:nIm] = np.transpose(image, [1,2,0]).copy() + toct = timer() + - image_align = np.ones((shapeIm[1], shapeIm[2], nImCL), dtype = imtype) - image_align[:,:,0:nIm] = np.transpose(image, [1,2,0]).copy() - shpRdn = np.asarray( ((self.nRho+2*padding[0]), (self.nTheta+2*padding[1]), nImCL),dtype=np.uint64) - radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=int((self.nRho+2*padding[0])*(self.nTheta+2*padding[1])*nImCL*4)) #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=radon.nbytes) #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE | mf.COPY_HOST_PTR,hostbuf=radon) - image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image_align) - - + image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image) imstep = np.uint64(np.product(shapeIm[-2:])) - indxstep = np.uint64(self.radonPlan.indexPlan.shape[-1]) - rdnstep = np.uint64(self.nRho * self.nTheta) - - padRho = np.uint64(padding[0]) - padTheta = np.uint64(padding[1]) tic = timer() nImChunk = np.uint64(nImCL/clvtypesize) + image_gpuflt = cl.Buffer(ctx, mf.READ_WRITE, size=int(int(shapeIm[1])*int(shapeIm[2])*int(nImCL) * int(4))) # 32-bit float - if image.dtype.kind == 'f': - image_gpuflt = image_gpu - else: - image_gpuflt = cl.Buffer(ctx, mf.READ_WRITE, size=image_align.size * 4) # 32-bit float - if image_align.dtype.type is np.ubyte: - prg.loadubyte8(queue, (imstep, 1, 1), None, image_gpu, image_gpuflt, nImChunk) - if image_align.dtype.type is np.uint16: - prg.loaduint16(queue, (imstep, 1, 1), None, image_gpu, image_gpuflt, nImChunk) - queue.flush() - image_gpu.release() - image_gpu = None + if image.dtype.type is np.float32: + prg.loadfloat32(queue, (shapeIm[2], shapeIm[1], nIm), None, image_gpu, image_gpuflt, nImCL) + if image.dtype.type is np.ubyte: + prg.loadubyte8(queue, (shapeIm[2], shapeIm[1], nIm), None, image_gpu, image_gpuflt, nImCL) + if image.dtype.type is np.uint16: + prg.loaduint16(queue, (shapeIm[2], shapeIm[1], nIm), None, image_gpu, image_gpuflt, nImCL) + queue.flush() + image_gpu.release() + image_gpu = None @@ -302,6 +293,14 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b #imBack = np.zeros((shapeIm[1], shapeIm[2], nImCL),dtype=np.float32) #cl.enqueue_copy(queue,imBack,image_gpu,is_blocking=True) + indxstep = np.uint64(self.radonPlan.indexPlan.shape[-1]) + rdnstep = np.uint64(self.nRho * self.nTheta) + padRho = np.uint64(padding[0]) + padTheta = np.uint64(padding[1]) + shpRdn = np.asarray(((self.nRho + 2 * padding[0]), (self.nTheta + 2 * padding[1]), nImCL), dtype=np.uint64) + radon_gpu = cl.Buffer(ctx, mf.READ_WRITE, + size=int((self.nRho + 2 * padding[0]) * (self.nTheta + 2 * padding[1]) * nImCL * 4)) + rdnIndx_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.radonPlan.indexPlan) cl.enqueue_fill_buffer(queue, radon_gpu, np.float32(self.radonPlan.missingval), 0, radon_gpu.size) prg.radonSum(queue,(nImChunk,rdnstep),None,rdnIndx_gpu,image_gpuflt,radon_gpu, @@ -629,7 +628,7 @@ def rdn_local_maxCL(self,radonIn, clparams=None, returnBuff = True): def band_labelCL(self,rdnConvIn, lMaxRdnIn,clparams=None): - # an attempt to to run the band label on the GPU + # an attempt to run the band label on the GPU tic = timer() diff --git a/pyebsdindex/opencl/clkernels.cl b/pyebsdindex/opencl/clkernels.cl index ffbc20a..b712d2f 100644 --- a/pyebsdindex/opencl/clkernels.cl +++ b/pyebsdindex/opencl/clkernels.cl @@ -22,41 +22,70 @@ Author: David Rowenhorst; The US Naval Research Laboratory Date: 21 Aug 2020 */ -// simple program to convert a 8-bit byte to float -__kernel void loadubyte8( const __global uchar16 *im1, __global float16 *im1flt, - const unsigned long int nImChunk) +// simple program to convert a 8-bit byte to float and transpose array +__kernel void loadubyte8( const __global uchar *im1, __global float *im1flt, const unsigned long int nImCL) { - const unsigned long int xy = get_global_id(0); - //const unsigned long int szim = get_global_size(0); - unsigned long i; - uchar16 imVal; - float16 imValflt; + const unsigned long int x = get_global_id(0); + const unsigned long int y = get_global_id(1); + const unsigned long int z = get_global_id(2); + const unsigned long int nx = get_global_size(0); + const unsigned long int ny = get_global_size(1); + //const unsigned long int nz = get_global_size(2); + + const unsigned long int indx1 = x + nx*y + nx*ny*z; + const unsigned long int indx2 = z + nImCL*(x + nx*y); // transpose z->x, x->y, y->z + uchar imVal; + float imValflt; + + imVal = im1[indx1]; + imValflt = convert_float(imVal); + im1flt[indx2] = imValflt; + + +} - const unsigned long indx = nImChunk * xy; - for(i = 0; i< nImChunk; ++i){ - imVal = im1[indx+i]; - imValflt = convert_float16(imVal); - im1flt[indx+i] = imValflt; - } + +// simple program to convert a 8-bit byte to float and transpose array +__kernel void loaduint16( const __global ushort *im1, __global float *im1flt, const unsigned long int nImCL) + { + const unsigned long int x = get_global_id(0); + const unsigned long int y = get_global_id(1); + const unsigned long int z = get_global_id(2); + const unsigned long int nx = get_global_size(0); + const unsigned long int ny = get_global_size(1); + //const unsigned long int nz = get_global_size(2); + + const unsigned long int indx1 = x + nx*y + nx*ny*z; + const unsigned long int indx2 = z + nImCL*(x + nx*y); // transpose z->x, x->y, y->z + ushort imVal; + float imValflt; + + imVal = im1[indx1]; + imValflt = convert_float(imVal); + im1flt[indx2] = imValflt; + } -// simple program to convert a 16-bit int to float -__kernel void loaduint16( const __global ushort16 *im1, __global float16 *im1flt, - const unsigned long int nImChunk) + + +// simple program to convert a float to float and transpose array +__kernel void loaduufloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL) { - const unsigned long int xy = get_global_id(0); - //const unsigned long int szim = get_global_size(0); - unsigned long i; - ushort16 imVal; - float16 imValflt; + const unsigned long int x = get_global_id(0); + const unsigned long int y = get_global_id(1); + const unsigned long int z = get_global_id(2); + const unsigned long int nx = get_global_size(0); + const unsigned long int ny = get_global_size(1); + //const unsigned long int nz = get_global_size(2); + + const unsigned long int indx1 = x + nx*y + nx*ny*z; + const unsigned long int indx2 = z + nImCL*(x + nx*y); // transpose z->x, x->y, y->z + float imVal; - const unsigned long indx = nImChunk * xy; - for(i = 0; i< nImChunk; ++i){ - imVal = im1[indx+i]; - imValflt = convert_float16(imVal); - im1flt[indx+i] = imValflt; - } + imVal = im1[indx1]; + im1flt[indx2] = imVal; + } diff --git a/pyebsdindex/opencl/clnlpar.cl b/pyebsdindex/opencl/clnlpar.cl new file mode 100644 index 0000000..6289dc7 --- /dev/null +++ b/pyebsdindex/opencl/clnlpar.cl @@ -0,0 +1,455 @@ +/* +This software was developed by employees of the US Naval Research Laboratory (NRL), an +agency of the Federal Government. Pursuant to title 17 section 105 of the United States +Code, works of NRL employees are not subject to copyright protection, and this software +is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no +responsibility whatsoever for its use by other parties, and makes no guarantees, +expressed or implied, about its quality, reliability, or any other characteristic. We +would appreciate acknowledgment if the software is used. To the extent that NRL may hold +copyright in countries other than the United States, you are hereby granted the +non-exclusive irrevocable and unconditional right to print, publish, prepare derivative +works and distribute this software, in any medium, or authorize others to do so on your +behalf, on a royalty-free basis throughout the world. You may improve, modify, and +create derivative works of the software or any portion of the software, and you may copy +and distribute such modifications or works. Modified works should carry a notice stating +that you changed the software and should note the date and nature of any such change. +Please explicitly acknowledge the US Naval Research Laboratory as the original source. +This software can be redistributed and/or modified freely provided that any derivative +works bear some notice that they are derived from it, and any modified versions bear +some notice that they have been modified. + +Author: David Rowenhorst; +The US Naval Research Laboratory Date: 22 May 2024 + +For more info see +Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019. +"""Non-local pattern averaging and re-indexing (NLPAR).""" +*/ + + +float sum16(const float16 *v1); +float sum16(const float16 *v1){ + float sum = 0.0; + sum += v1[0].s0; + sum += v1[0].s1; + sum += v1[0].s2; + sum += v1[0].s3; + sum += v1[0].s4; + sum += v1[0].s5; + sum += v1[0].s6; + sum += v1[0].s7; + sum += v1[0].s8; + sum += v1[0].s9; + sum += v1[0].sa; + sum += v1[0].sb; + sum += v1[0].sc; + sum += v1[0].sd; + sum += v1[0].se; + sum += v1[0].sf; + return sum; + +} + + +void print16(const float16 v1); +void print16(const float16 v1){ + + printf( "%f, ", v1.s0); + printf( "%f, ", v1.s1); + printf( "%f, ", v1.s2); + printf( "%f, ", v1.s3); + printf( "%f, ", v1.s4); + printf( "%f, ", v1.s5); + printf( "%f, ", v1.s6); + printf( "%f, ", v1.s7); + printf( "%f, ", v1.s8); + printf( "%f, ", v1.s9); + printf( "%f, ", v1.sa); + printf( "%f, ", v1.sb); + printf( "%f, ", v1.sc); + printf( "%f, ", v1.sd); + printf( "%f, ", v1.se); + printf( "%f", v1.sf); + +} + + +__kernel void nlloadpat8bit(__global uchar *datain, __global float *dataout){ + const unsigned long int x = get_global_id(0); + uchar imVal = datain[x]; + float imValflt = convert_float(imVal); + dataout[x] = imValflt; +} + +__kernel void nlloadpat16bit(__global ushort *datain, __global float *dataout){ + const unsigned long int x = get_global_id(0); + ushort imVal = datain[x]; + float imValflt = convert_float(imVal); + dataout[x] = imValflt; +} + +__kernel void nlloadpat32flt(__global float *datain, __global float *dataout){ + const unsigned long int x = get_global_id(0); + dataout[x] = datain[x]; +} + +__kernel void calcsigma( __global float *data, __global float16 *mask, + __global float *sig, + __global float *dg, __global float *ng, + const long nn, const long ndatchunk, const long npatpoint, const float maxlim){ + // IDs of work-item represent x and y coordinates in image + const long x = get_global_id(0); + const long y = get_global_id(1); + const long ncol = get_global_size(0); + const long nrow = get_global_size(1); + const long indx_xy = x+y*ncol; + + const long indx0 = npatpoint * indx_xy; + + long i, j, z; + long indx_j, indx_ij, count; + + long nnn = (2*nn+1) * (2*nn+1); + + float16 d1, d0; + float16 maskchunk; + float16 mask0, mask1; + float dd; + + + float d[256]; + float n[256]; + + + for(j=0; j < nnn; ++j){ + d[j] = 0.0; + n[j] = 1.0e-6; + } + + ; + + for(z = 0; z (float16) 1.e-3 ? mask0 : (float16) 0.0; + //mask0 = select((float16) (1.0), (float16) (0.0), isless( d0, (float16) maxlim)); + + for(j=y-nn; j<=y+nn; ++j){ + + indx_j = (j >= 0) ? (j): abs(j); + indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1); + indx_j = ncol * indx_j; + + for(i=x-nn; i<=x+nn; ++i){ + + indx_ij = (i >= 0) ? (i): abs(i); + indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1); + indx_ij = npatpoint*(indx_ij + indx_j); + + mask1 = mask0; + d1 = *(__global float16*) (data + indx_ij+16*z); + + + mask1 = d1 < (float16) (maxlim) ? mask1 : (float16) (0.0); + //mask1 = select((float16) (1.0), (float16) (0.0), isgreater(mask0, (float16)(1e-6)) && isgreater(mask1,(float16)(1e-6))); + //printf("%*s\n", 'here'); + + d1 = (d0-d1); + d1 *= d1 ; + d1 *= mask1; + + dd = sum16(&d1); + dd = (indx_ij == indx0) ? -1*dd : dd; // mark the center point + + d[count] += dd; + n[count] += sum16(&mask1); + //d[count+nnn*indx_xy] += dd; + //n[count+nnn*indx_xy] += sum16(&mask1); + + count += 1; + } + } + //printf("%d %f\n", count, n[count]); + } + + + float mind = 1e24; + float s0; + ; //nnn*(x+ncol*y); + for(j=0; j 1e-5){ //sometimes EDAX collects the same pattern twice & catch the pixel of interest. + s0 = d[j]/(2.0*n[j]); + if (s0= 0) ? (j): abs(j); + indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1); + indx_j = ncol * indx_j; + + for(i=x-nn; i<=x+nn; ++i){ + + indx_ij = (i >= 0) ? (i): abs(i); + indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1); + indx_ij = (indx_ij + indx_j); + sigma_ij = sigma[indx_ij]; + sigma_ij *= sigma_ij; + + sigma_ij = sigma_ij + sigma_xy; + + d_ij = d[q+nnn*indx_xy]; + n_ij = n[q+nnn*indx_xy]; + if (n_ij > 1.0e-3){ + d_ij -= n_ij*sigma_ij; + d_ij *= 1.0/( sigma_ij * sqrt(2.0*n_ij) ); + + d[q+nnn*indx_xy] = d_ij; + } + q += 1.0; + } + } + //printf("%d, %d, %d\n",nn, q, nnn); +} + + + + +__kernel void calcnlpar( + const __global float *data, + const __global float16 *mask, + const __global float *sigma, + const __global long *crlimits, + __global float *dataout, + const long sr, + const long ndatchunk, + const long npatpoint, + const float maxlim, + const float lam2, + const float dthresh){ + //IDs of work-item represent x and y coordinates in image + //const long4 calclim = crlimits[0]; + const long x = get_global_id(0)+crlimits[0]; + const long y = get_global_id(1)+crlimits[1]; + const long ncol = crlimits[2]; //get_global_size(0); + const long nrow = crlimits[3];//get_global_size(1); + const long indx_xy = x+y*ncol; + //printf("%d\n", indx_xy); + const long indx0 = npatpoint * indx_xy; + //printf("%d, %d, %d, %d\n", x,y,ncol, nrow); + long i, j, z; + long indx_j, indx_ij, count; + + long nnn = (2*sr+1) * (2*sr+1); + + float16 d1, d0; + float16 mask0, mask1; + float dd, nn, sigma_ij, norm; + float sigma0 = sigma[indx_xy]; + sigma0 *= sigma0; + //float16* writeloc; + + float d[512]; // taking a risk here that noone will want a SR > 10 + float n[512]; + + + for(j=0; j < nnn; ++j){ + d[j] = 0.0; + n[j] = 1.0e-6; + } + + + for(z = 0; z (float16) 1.e-3) ? (float16) 1.0 : (float16) 0.0; + mask0 = d0 < (float16) (maxlim) ? mask0 : (float16) (0.0); + + + for(j=y-sr; j<=y+sr; ++j){ + + indx_j = (j >= 0) ? (j): abs(j); + indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1); + indx_j = ncol * indx_j; + + for(i=x-sr; i<=x+sr; ++i){ + + indx_ij = (i >= 0) ? (i): abs(i); + indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1); + + indx_ij = (indx_ij + indx_j); + + indx_ij *= npatpoint; + + mask1 = mask0; + d1 = *(__global float16*) (data + indx_ij+16*z); + + + mask1 = d1 < (float16) (maxlim) ? mask1 : (float16) (0.0); + //mask1 = select((float16) (1.0), (float16) (0.0), isgreater(mask0, (float16)(1e-6)) && isgreater(mask1,(float16)(1e-6))); + //printf("%*s\n", 'here'); + + d1 = (d0-d1); + d1 *= d1 ; + d1 *= mask1; + + dd = sum16(&d1); + dd = (indx_ij == indx0) ? -1.0 : dd; // mark the center point + + d[count] += dd; + n[count] += sum16(&mask1); + //d[count+nnn*indx_xy] += dd; + //n[count+nnn*indx_xy] += sum16(&mask1); + + count += 1; + } + } + + } +// calculate the weights + count = 0; + float sum = 0.0; + nn = 1.0; + for(j=y-sr; j<=y+sr; ++j){ + + indx_j = (j >= 0) ? (j): abs(j); + indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1); + indx_j = ncol * indx_j; + + for(i=x-sr; i<=x+sr; ++i){ + + dd = d[count]; + if (dd > 1.e-3){ + indx_ij = (i >= 0) ? (i): abs(i); + indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1); + + indx_ij = (indx_ij + indx_j); + + sigma_ij = sigma[indx_xy]; + sigma_ij *= sigma_ij; + nn = n[count]; + dd -= nn*(sigma_ij+sigma0); + + norm = (sigma0 + sigma_ij)*sqrt(2.0*nn); + if (norm > 1.0e-8){ + dd /= norm; + } else { + dd = 1e6*nn; + } + + } else { + dd = 0.0; + nn = 1.0; + } + + dd -= dthresh; + dd = dd >= 0.0 ? dd : 0.0; + + dd = exp(-1.0*dd*lam2); + sum += dd; + d[count] = dd; + + count += 1; + + } + } + + for (j =0;j (float16) -1.e-3) ? (float16) 1.0 : (float16) 0.0; + + for(j=y-sr; j<=y+sr; ++j){ + + indx_j = (j >= 0) ? (j): abs(j); + indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1); + indx_j = ncol * indx_j; + + for(i=x-sr; i<=x+sr; ++i){ + + indx_ij = (i >= 0) ? (i): abs(i); + indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1); + + indx_ij = (indx_ij + indx_j); + + indx_ij *= npatpoint; + + + d1 = *(__global float16*) (data + indx_ij+16*z); + + + d1 *= (float16) d[count]; + d1 *= mask0; + + *(__global float16*) (dataout + indx0+16*z) += d1; + + //writeadd16(&(dataout[indx0+16*z]), &d1 ); + + //d[count+nnn*indx_xy] += dd; + //n[count+nnn*indx_xy] += sum16(&mask1); + + count += 1; + } + } + } + + + +} + + + + + diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py new file mode 100644 index 0000000..d0d5211 --- /dev/null +++ b/pyebsdindex/opencl/nlpar_cl.py @@ -0,0 +1,538 @@ +# This software was developed by employees of the US Naval Research Laboratory (NRL), an +# agency of the Federal Government. Pursuant to title 17 section 105 of the United States +# Code, works of NRL employees are not subject to copyright protection, and this software +# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no +# responsibility whatsoever for its use by other parties, and makes no guarantees, +# expressed or implied, about its quality, reliability, or any other characteristic. We +# would appreciate acknowledgment if the software is used. To the extent that NRL may hold +# copyright in countries other than the United States, you are hereby granted the +# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative +# works and distribute this software, in any medium, or authorize others to do so on your +# behalf, on a royalty-free basis throughout the world. You may improve, modify, and +# create derivative works of the software or any portion of the software, and you may copy +# and distribute such modifications or works. Modified works should carry a notice stating +# that you changed the software and should note the date and nature of any such change. +# Please explicitly acknowledge the US Naval Research Laboratory as the original source. +# This software can be redistributed and/or modified freely provided that any derivative +# works bear some notice that they are derived from it, and any modified versions bear +# some notice that they have been modified. +# +# Author: David Rowenhorst; +# The US Naval Research Laboratory Date: 22 May 2024 + +# For more info see +# Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019. + + + +import os +from timeit import default_timer as timer +import numpy as np +import pyopencl as cl + + + +import scipy.optimize as sp_opt +from pyebsdindex import nlpar_cpu +from pyebsdindex.opencl import openclparam +from time import time as timer + +class NLPAR(nlpar_cpu.NLPAR): + def __init__( self, filename=None, **kwargs): + nlpar_cpu.NLPAR.__init__(self, filename=filename, **kwargs) + self.useCPU = False + + + def opt_lambda(self,saturation_protect=True, automask=True, backsub=False, + target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs): + return self.opt_lambda_cl(saturation_protect=saturation_protect, + automask=automask, + backsub=backsub, + target_weights=target_weights, + dthresh=dthresh, + autoupdate=autoupdate, **kwargs) + def calcnlpar(self, **kwargs): + return self.calcnlpar_cl(**kwargs) + + + def calcsigma(self,nn=1, saturation_protect=True,automask=True, return_nndist=False, **kwargs): + self.sigmann = nn + if self.sigmann > 7: + print("Sigma optimization search limited to a search radius <= 7") + print("The search radius has been clipped to 7") + nn = 7 + self.sigmann = nn + + sig = self.calcsigma_cl(nn=nn, + saturation_protect=saturation_protect, + automask=automask, **kwargs) + if return_nndist == True: + return sig + else: + return sig[0] + def opt_lambda_cpu(self, **kwargs): + return nlpar_cpu.NLPAR.opt_lambda(self, **kwargs) + + def calcnlpar_cpu(self, **kwargs): + return nlpar_cpu.NLPAR.calcnlpar(self, **kwargs) + + def calcsigma_cpu(self,nn=1, saturation_protect=True,automask=True, **kwargs): + return nlpar_cpu.NLPAR.calcsigma(self, nn=nn, + saturation_protect=saturation_protect, automask=automask, **kwargs) + + def opt_lambda_cl(self, saturation_protect=True, automask=True, backsub=False, + target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs): + + target_weights = np.asarray(target_weights) + + def loptfunc(lam, d2, tw, dthresh): + temp = np.maximum(d2, dthresh)#(d2 > dthresh).choose(dthresh, d2) + dw = np.exp(-(temp) / lam ** 2) + w = np.sum(dw, axis=2) + 1e-12 + + metric = np.mean(np.abs(tw - 1.0 / w)) + return metric + + patternfile = self.getinfileobj() + patternfile.read_header() + nrows = np.uint64(self.nrows) # np.uint64(patternfile.nRows) + ncols = np.uint64(self.ncols) # np.uint64(patternfile.nCols) + + pwidth = np.uint64(patternfile.patternW) + pheight = np.uint64(patternfile.patternH) + phw = pheight * pwidth + + dthresh = np.float32(dthresh) + lamopt_values = [] + + sigma, d2, n2 = self.calcsigma(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True, + return_nndist=True) + + #sigmapad = np.pad(sigma, 1, mode='reflect') + #d2normcl(d2, n2, sigmapad) + + #print(d2.min(), d2.max(), d2.mean()) + + lamopt_values_chnk = [] + for tw in target_weights: + stride = 1 if sigma.size < 1e6 else 10 + + lam = 1.0 + lambopt1 = sp_opt.minimize(loptfunc, lam, args=(d2[0::stride, :], tw, dthresh), method='Nelder-Mead', + bounds=[[0.001, 10.0]], options={'fatol': 0.0001}) + lamopt_values.append(lambopt1['x']) + + #lamopt_values.append(lamopt_values_chnk) + lamopt_values = np.asarray(lamopt_values) + print("Range of lambda values: ", lamopt_values.flatten()) + print("Optimal Choice: ", np.median(lamopt_values)) + if autoupdate == True: + self.lam = np.median(np.mean(lamopt_values, axis=0)) + if self.sigma is None: + self.sigma = sigma + return lamopt_values.flatten() + + + def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=False, gpu_id = None, **kwargs): + self.sigmann = nn + if self.sigmann > 7: + print("Sigma optimization search limited to a search radius <= 7") + print("The search radius has been clipped to 7") + nn = 7 + self.sigmann = nn + + if gpu_id is None: + clparams = openclparam.OpenClParam() + clparams.get_gpu() + target_mem = 0 + gpu_id = 0 + count = 0 + + for gpu in clparams.gpu: + gmem = gpu.max_mem_alloc_size + if target_mem < gmem: + gpu_id = count + target_mem = gmem + count += 1 + else: + clparams = openclparam.OpenClParam() + clparams.get_gpu() + gpu_id = min(len(clparams.gpu)-1, gpu_id) + + + #print(gpu_id) + clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl') + clparams.get_queue() + target_mem = clparams.queue.device.max_mem_alloc_size//2 + ctx = clparams.ctx + prg = clparams.prg + queue = clparams.queue + mf = clparams.memflags + clvectlen = 16 + + patternfile = self.getinfileobj() + + nrows = np.int64(self.nrows) # np.uint64(patternfile.nRows) + ncols = np.int64(self.ncols) # np.uint64(patternfile.nCols) + + pwidth = np.uint64(patternfile.patternW) + pheight = np.uint64(patternfile.patternH) + npat_point = int(pwidth * pheight) + #print(target_mem) + chunks = self._calcchunks( [pwidth, pheight], ncols, nrows, target_bytes=target_mem, + col_overlap=1, row_overlap=1) + + if (automask is True) and (self.mask is None): + self.mask = (self.automask(pheight, pwidth)) + if self.mask is None: + self.mask = np.ones((pheight, pwidth), dtype=np.uint8) + + #indices = np.asarray((self.mask.flatten().nonzero())[0], np.uint64) + #nindices = np.uint64(indices.size) + #nindicespad = np.uint64(clvectlen * int(np.ceil(nindices/clvectlen))) + + mask = self.mask.astype(np.float32) + + npad = clvectlen * int(np.ceil(mask.size/clvectlen)) + maskpad = np.zeros((npad) , np.float32) + maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32) + mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad) + + npatsteps = int(maskpad.size/clvectlen) + + chunksize = (chunks[2][:,1] - chunks[2][:,0]).reshape(1,-1) * \ + (chunks[3][:, 1] - chunks[3][:, 0]).reshape(-1, 1) + + #return chunks, chunksize + mxchunk = chunksize.max() + + npadmx = clvectlen * int(np.ceil(mxchunk*npat_point/ clvectlen)) + + datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) + + nnn = int((2 * nn + 1) ** 2) + + sigma = np.zeros((nrows, ncols), dtype=np.float32) + 1e18 + dist = np.zeros((nrows, ncols, nnn), dtype=np.float32) + countnn = np.zeros((nrows, ncols, nnn), dtype=np.float32) + + #dist_local = cl.LocalMemory(nnn*npadmx*4) + dist_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk*nnn*4)) + distchunk = np.zeros((mxchunk, nnn), dtype=np.float32) + #count_local = cl.LocalMemory(nnn*npadmx*4) + count_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk * nnn * 4)) + countchunk = np.zeros((mxchunk, nnn), dtype=np.float32) + + for rowchunk in range(chunks[1]): + rstart = chunks[3][rowchunk, 0] + rend = chunks[3][rowchunk, 1] + nrowchunk = rend - rstart + + for colchunk in range(chunks[0]): + cstart = chunks[2][colchunk, 0] + cend = chunks[2][colchunk, 1] + ncolchunk = cend - cstart + + data, xyloc = patternfile.read_data(patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]], + convertToFloat=False, returnArrayOnly=True) + + mxval = data.max() + if saturation_protect == False: + mxval += 1.0 + else: + mxval *= 0.9961 + + evnt = cl.enqueue_fill_buffer(queue, datapad_gpu, np.float32(mxval+10), 0,int(4*npadmx)) + + szdata = data.size + npad = clvectlen * int(np.ceil(szdata / clvectlen)) + tic = timer() + #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10) + #datapad[0:szdata] = data.reshape(-1) + data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data) + + if data.dtype.type is np.float32: + prg.nlloadpat32flt(queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt]) + if data.dtype.type is np.ubyte: + prg.nlloadpat8bit(queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt]) + if data.dtype.type is np.uint16: + prg.nlloadpat16bit(queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt]) + toc = timer() + #print(toc - tic) + + sigmachunk = np.zeros((nrowchunk, ncolchunk ), dtype=np.float32) + + + sigmachunk_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, size=sigmachunk.nbytes) + cl.enqueue_barrier(queue) + prg.calcsigma(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None, + datapad_gpu, mask_gpu,sigmachunk_gpu, + dist_local, count_local, + np.int64(nn), np.int64(npatsteps), np.int64(npat_point), + np.float32(mxval) ) + if normalize_d is True: + cl.enqueue_barrier(queue) + prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None, + sigmachunk_gpu, + count_local, dist_local, + np.int64(nn)) + queue.finish() + + cl.enqueue_copy(queue, distchunk, dist_local, is_blocking=False) + cl.enqueue_copy(queue, countchunk, count_local, is_blocking=False) + cl.enqueue_copy(queue, sigmachunk, sigmachunk_gpu, is_blocking=True) + + #sigmachunk_gpu.release() + + queue.finish() + countnn[rstart:rend, cstart:cend] = countchunk[0:int(ncolchunk*nrowchunk), :].reshape(nrowchunk, ncolchunk, nnn) + dist[rstart:rend, cstart:cend] = distchunk[0:int(ncolchunk*nrowchunk), :].reshape(nrowchunk, ncolchunk, nnn) + sigma[rstart:rend, cstart:cend] = np.minimum(sigma[rstart:rend, cstart:cend], sigmachunk) + + dist_local.release() + count_local.release() + datapad_gpu.release() + queue.flush() + queue = None + self.sigma = sigma + return sigma, dist, countnn + + def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True, + filename=None, fileout=None, reset_sigma=False, backsub = False, rescale = False, + gpu_id = None, verbose=2, **kwargs): + + if lam is not None: + self.lam = lam + + if dthresh is not None: + self.dthresh = dthresh + if self.dthresh is None: + self.dthresh = 0.0 + + if searchradius is not None: + self.searchradius = searchradius + + if self.searchradius > 10: + print("NLPAR on GPU is limited to a search radius <= 10") + print("The search radius has been clipped to 10") + searchradius = 10 + self.searchradius = searchradius + + lam = np.float32(self.lam) + dthresh = np.float32(self.dthresh) + sr = np.int64(self.searchradius) + + if filename is not None: + self.setfile(filepath=filename) + + patternfile = self.getinfileobj() + self.setoutfile(patternfile, filepath=fileout) + patternfileout = self.getoutfileobj() + + nrows = np.int64(self.nrows) # np.uint64(patternfile.nRows) + ncols = np.int64(self.ncols) # np.uint64(patternfile.nCols) + + pwidth = np.uint64(patternfile.patternW) + pheight = np.uint64(patternfile.patternH) + npat_point = int(pwidth * pheight) + + if reset_sigma: + self.sigma = None + + if np.asarray(self.sigma).size == 1 and (self.sigma is not None): + tmp = np.asarray(self.sigma)[0] + self.sigma = np.zeros((nrows, ncols), dtype=np.float32) + tmp + calcsigma = False + + if self.sigma is not None: + shpsigma = np.asarray(self.sigma).shape + if (shpsigma[0] != nrows) and (shpsigma[1] != ncols): + self.sigma = None + + if self.sigma is None: + self.sigma = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, gpu_id=gpu_id)[0] + + sigma = np.asarray(self.sigma).astype(np.float32) + + + + if (automask is True) and (self.mask is None): + self.mask = (self.automask(pheight, pwidth)) + if self.mask is None: + self.mask = np.ones((pheight, pwidth), dtype=np.uint8) + + scalemethod = 'clip' + self.rescale = False + if rescale == True: + self.rescale = True + if np.issubdtype(patternfileout.filedatatype, np.integer): + mxval = np.iinfo(patternfileout.filedatatype).max + scalemethod = 'fullscale' + else: # not int, so no rescale. + self.rescale = False + rescale = False + + + + if gpu_id is None: + clparams = openclparam.OpenClParam() + clparams.get_gpu() + target_mem = 0 + gpu_id = 0 + count = 0 + + for gpu in clparams.gpu: + gmem = gpu.max_mem_alloc_size + if target_mem < gmem: + gpu_id = count + target_mem = gmem + count += 1 + else: + clparams = openclparam.OpenClParam() + clparams.get_gpu() + gpu_id = min(len(clparams.gpu) - 1, gpu_id) + + + #print(gpu_id) + clparams.get_context(gpu_id=gpu_id, kfile ='clnlpar.cl') + clparams.get_queue() + target_mem = clparams.queue.device.max_mem_alloc_size//2 + ctx = clparams.ctx + prg = clparams.prg + queue = clparams.queue + mf = clparams.memflags + clvectlen = 16 + + + + chunks = self._calcchunks( [pwidth, pheight], ncols, nrows, target_bytes=target_mem, + col_overlap=sr, row_overlap=sr) + #print(chunks[2], chunks[3]) + if verbose >=1: + print("lambda:", lam, "search radius:", sr, "dthresh:", dthresh) + + # precalculate some needed arrays for the GPU + mask = self.mask.astype(np.float32) + + npad = clvectlen * int(np.ceil(mask.size/clvectlen)) + maskpad = np.zeros((npad) , np.float32) -1 # negative numbers will indicate a clvector overflow. + maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32) + mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad) + + npatsteps = int(maskpad.size/clvectlen) # how many clvector chunks to move through a pattern. + + chunksize = (chunks[2][:,1] - chunks[2][:,0]).reshape(1,-1) * \ + (chunks[3][:, 1] - chunks[3][:, 0]).reshape(-1, 1) + nchunks = chunksize.size + #return chunks, chunksize + mxchunk = int(chunksize.max()) + npadmx = clvectlen * int(np.ceil(float(mxchunk)*npat_point/ clvectlen)) + + datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) + datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) + + nnn = int((2 * sr + 1) ** 2) + + + ndone = 0 + # if verbose >= 2: + # print('\n', end='') + for rowchunk in range(chunks[1]): + rstart = chunks[3][rowchunk, 0] + rend = chunks[3][rowchunk, 1] + nrowchunk = rend - rstart + + rstartcalc = sr if (rowchunk > 0) else 0 + rendcalc = nrowchunk - sr if (rowchunk < (chunks[1] - 1)) else nrowchunk + nrowcalc = np.int64(rendcalc - rstartcalc) + + for colchunk in range(chunks[0]): + cstart = chunks[2][colchunk, 0] + cend = chunks[2][colchunk, 1] + ncolchunk = cend - cstart + + cstartcalc = sr if (colchunk > 0) else 0 + cendcalc = ncolchunk - sr if (colchunk < (chunks[0] - 1)) else ncolchunk + ncolcalc = np.int64(cendcalc - cstartcalc) + + data, xyloc = patternfile.read_data(patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]], + convertToFloat=False, returnArrayOnly=True) + + mxval = data.max() + if saturation_protect == False: + mxval += 1.0 + else: + mxval *= 0.9961 + + filldatain = cl.enqueue_fill_buffer(queue, datapad_gpu, np.float32(mxval+10), 0,int(4*npadmx)) + cl.enqueue_fill_buffer(queue, datapadout_gpu, np.float32(0.0), 0, int(4 * npadmx)) + + sigmachunk = np.ascontiguousarray(sigma[rstart:rend, cstart:cend].astype(np.float32)) + sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk) + szdata = data.size + npad = clvectlen * int(np.ceil(szdata / clvectlen)) + + #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10) + #datapad[0:szdata] = data.reshape(-1) + + data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data) + + if data.dtype.type is np.float32: + prg.nlloadpat32flt(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain]) + if data.dtype.type is np.ubyte: + prg.nlloadpat8bit(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain]) + if data.dtype.type is np.uint16: + prg.nlloadpat16bit(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain]) + + + + calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64) + crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim) + cl.enqueue_barrier(queue) + data_gpu.release() + prg.calcnlpar(queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None, + #prg.calcnlpar(queue, (1, 1), None, + datapad_gpu, + mask_gpu, + sigmachunk_gpu, + crlimits_gpu, + datapadout_gpu, + np.int64(sr), + np.int64(npatsteps), + np.int64(npat_point), + np.float32(mxval), + np.float32(1.0/lam**2), + np.float32(dthresh) ) + + data = data.astype(np.float32) # prepare to receive data back from GPU + data.reshape(-1)[:] = 0.0 + data = data.reshape(nrowchunk, ncolchunk, pheight, pwidth) + sigmachunk_gpu.release() + cl.enqueue_copy(queue, data, datapadout_gpu, is_blocking=True) + queue.finish() + if rescale == True: + for i in range(data.shape[0]): + temp = data[i, :, :] + temp -= temp.min() + temp *= np.float32(mxval) / temp.max() + data[i, :, :] = temp + data = data[rstartcalc: rstartcalc+nrowcalc,cstartcalc: cstartcalc+ncolcalc, :,: ] + data = data.reshape(nrowcalc*ncolcalc, pheight, pwidth) + patternfileout.write_data(newpatterns=data, patStartCount=[[cstart+cstartcalc, rstart+rstartcalc], + [ncolcalc, nrowcalc]], + flt2int='clip', scalevalue=1.0) + ndone +=1 + if verbose >= 2: + print("tiles complete: ", ndone, "/", nchunks, sep='', end='\r') + + if verbose >= 2: + print('', end='') + queue.finish() + queue = None + return str(patternfileout.filepath) + + + + + + diff --git a/pyebsdindex/opencl/nlpar_clray.py b/pyebsdindex/opencl/nlpar_clray.py new file mode 100644 index 0000000..87a1fc8 --- /dev/null +++ b/pyebsdindex/opencl/nlpar_clray.py @@ -0,0 +1,799 @@ +# This software was developed by employees of the US Naval Research Laboratory (NRL), an +# agency of the Federal Government. Pursuant to title 17 section 105 of the United States +# Code, works of NRL employees are not subject to copyright protection, and this software +# is in the public domain. PyEBSDIndex is an experimental system. NRL assumes no +# responsibility whatsoever for its use by other parties, and makes no guarantees, +# expressed or implied, about its quality, reliability, or any other characteristic. We +# would appreciate acknowledgment if the software is used. To the extent that NRL may hold +# copyright in countries other than the United States, you are hereby granted the +# non-exclusive irrevocable and unconditional right to print, publish, prepare derivative +# works and distribute this software, in any medium, or authorize others to do so on your +# behalf, on a royalty-free basis throughout the world. You may improve, modify, and +# create derivative works of the software or any portion of the software, and you may copy +# and distribute such modifications or works. Modified works should carry a notice stating +# that you changed the software and should note the date and nature of any such change. +# Please explicitly acknowledge the US Naval Research Laboratory as the original source. +# This software can be redistributed and/or modified freely provided that any derivative +# works bear some notice that they are derived from it, and any modified versions bear +# some notice that they have been modified. +# +# Author: David Rowenhorst; +# The US Naval Research Laboratory Date: 22 May 2024 + +# For more info see +# Patrick T. Brewick, Stuart I. Wright, David J. Rowenhorst. Ultramicroscopy, 200:50–61, May 2019. + +"""Non-local pattern averaging and re-indexing (NLPAR).""" + + + +import os, sys, platform +import logging +from timeit import default_timer as timer +import numpy as np +import pyopencl as cl + +import ray + + +from pyebsdindex.opencl import openclparam, nlpar_cl +from time import time as timer + +RAYIPADDRESS = '127.0.0.1' +OSPLATFORM = platform.system() +#if OSPLATFORM == 'Darwin': +# RAYIPADDRESS = '0.0.0.0' # the localhost address does not work on macOS when on a VPN + + +class NLPAR(nlpar_cl.NLPAR): + def __init__( self,filename=None, **kwargs): + nlpar_cl.NLPAR.__init__(self,filename=filename, **kwargs) + self.useCPU = False + + def calcnlpar(self, **kwargs): + return self.calcnlpar_clray(**kwargs) + + def calcsigma(self, nn=1, saturation_protect=True, automask=True, return_nndist=False, **kwargs): + if self.sigmann > 7: + print("Sigma optimization search limited to a search radius <= 7") + print("The search radius has been clipped to 7") + nn = 7 + self.sigmann = nn + + sig = self.calcsigma_clray(nn=nn, + saturation_protect=saturation_protect, + automask=automask, **kwargs) + if return_nndist == True: + return sig + else: + return sig[0] + + + def calcnlpar_clsq(self, **kwargs): + return nlpar_cl.NLPAR.calcnlpar_cl(self, **kwargs) + + def calcsigma_clsq(self, **kwargs): + return nlpar_cl.NLPAR.calcsigma_cl(self, **kwargs) + + def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normalize_d=False, + gpu_id = None, verbose=2, **kwargs): + self.patternfile = self.getinfileobj() + self.sigmann = nn + + if self.sigmann > 7: + print("Sigma optimization search limited to a search radius <= 7") + print("The search radius has been clipped to 7") + nn = 7 + self.sigmann = nn + + if gpu_id is None: + clparams = openclparam.OpenClParam() + clparams.get_gpu() + + target_mem = 0 + gpu_id = 0 + count = 0 + + for gpu in clparams.gpu: + gmem = gpu.max_mem_alloc_size + if target_mem < gmem: + gpu_id = count + target_mem = gmem + count += 1 + else: + clparams = openclparam.OpenClParam() + clparams.get_gpu() + gpu_id = min(len(clparams.gpu)-1, gpu_id) + + cudavis = '' + for cdgpu in range(len(clparams.gpu)): + cudavis += str(cdgpu) + ',' + + #print(gpu_id) + ngpuwrker = 6 + clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl') + clparams.get_queue() + if clparams.gpu[gpu_id].host_unified_memory: + return nlpar_cl.NLPAR.calcsigma_cl(self, nn=nn, saturation_protect=saturation_protect, + automask=automask, + normalize_d=normalize_d, + gpu_id=gpu_id, **kwargs) + + target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 3 + max_mem = clparams.gpu[gpu_id].global_mem_size * 0.75 + if target_mem * ngpuwrker > max_mem: + target_mem = max_mem / ngpuwrker + + patternfile = self.getinfileobj() + + nrows = np.int64(self.nrows) # np.uint64(patternfile.nRows) + ncols = np.int64(self.ncols) # np.uint64(patternfile.nCols) + + pwidth = np.uint64(patternfile.patternW) + pheight = np.uint64(patternfile.patternH) + npat_point = int(pwidth * pheight) + + if (automask is True) and (self.mask is None): + self.mask = (self.automask(pheight, pwidth)) + if self.mask is None: + self.mask = np.ones((pheight, pwidth), dtype=np.uint8) + + mask = self.mask.astype(np.float32) + + chunks = self._calcchunks([pwidth, pheight], ncols, nrows, target_bytes=target_mem, + col_overlap=nn, row_overlap=nn) + + + jobqueue = [] + + for rowchunk in range(chunks[1]): + rstart = chunks[3][rowchunk, 0] + rend = chunks[3][rowchunk, 1] + nrowchunk = rend - rstart + + rstartcalc = nn if (rowchunk > 0) else 0 + rendcalc = nrowchunk - nn if (rowchunk < (chunks[1] - 1)) else nrowchunk + nrowcalc = np.int64(rendcalc - rstartcalc) + + for colchunk in range(chunks[0]): + cstart = chunks[2][colchunk, 0] + cend = chunks[2][colchunk, 1] + ncolchunk = cend - cstart + + cstartcalc = nn if (colchunk > 0) else 0 + cendcalc = ncolchunk - nn if (colchunk < (chunks[0] - 1)) else ncolchunk + ncolcalc = np.int64(cendcalc - cstartcalc) + + jobqueue.append(NLPARGPUJob([colchunk, rowchunk], \ + [cstart, cend, rstart, rend], \ + [cstartcalc, cendcalc, rstartcalc, rendcalc])) + + + # wrker = NLPARGPUWorker(actorid=1, gpu_id=gpu_id, cudavis=cudavis) + # job = jobqueue[0] + # + # data = wrker.runsigma_chunk(job, nlparobj=self, saturation_protect=saturation_protect) + # + # return data + + ngpu_per_wrker = float(1.0 / ngpuwrker) + + # if verbose >=1: + # print("lambda:", self.lam, "search radius:", self.searchradius, "dthresh:", self.dthresh) + + ray.shutdown() + rayclust = ray.init( + num_cpus=int(ngpuwrker), + num_gpus=1, + # dashboard_host = RAYIPADDRESS, + _node_ip_address=RAYIPADDRESS, # "0.0.0.0", + runtime_env={"env_vars": + {"PYTHONPATH": os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + }}, + logging_level=logging.WARNING, ) # Supress INFO messages from ray. + + nlpar_remote = ray.put(self) + + nnn = int((2 * nn + 1) ** 2) + sigma = np.zeros((nrows, ncols), dtype=np.float32) + 1e18 + dist = np.zeros((nrows, ncols, nnn), dtype=np.float32) + countnn = np.zeros((nrows, ncols, nnn), dtype=np.float32) + + idlewrker = [] + busywrker = [] + tasks = [] + + for w in range(ngpuwrker): + idlewrker.append(NLPARGPUWorker.options(num_cpus=float(0.99), num_gpus=ngpu_per_wrker).remote( + actorid=w, gpu_id=gpu_id, cudavis=cudavis)) + + njobs = len(jobqueue) + ndone = 0 + + while ndone < njobs: + if len(jobqueue) > 0: + if len(idlewrker) > 0: + wrker = idlewrker.pop() + job = jobqueue.pop() + + tasks.append(wrker.runsigma_chunk.remote(job, nlparobj=nlpar_remote, saturation_protect=saturation_protect)) + busywrker.append(wrker) + if len(tasks) > 0: + donetasks, stillbusy = ray.wait(tasks, num_returns=len(busywrker), timeout=0.1) + + for tsk in donetasks: + indx = tasks.index(tsk) + message, job, newdata = ray.get(tsk) + if message == 'Done': + + sigmachunk = newdata[0] + distchunk = newdata[1] + countchunk = newdata[2] + cstart = job.cstart + cend = job.cend + ncolchunk = job.ncolchunk + rstart = job.rstart + rend = job.rend + nrowchunk = job.nrowchunk + + rstartcalc = job.rstartcalc + cstartcalc = job.cstartcalc + nrowcalc = job.nrowcalc + ncolcalc = job.ncolcalc + + countnn[rstart:rend, cstart:cend] = countchunk[0:int(ncolchunk * nrowchunk), :].reshape(nrowchunk, + ncolchunk, nnn) + dist[rstart:rend, cstart:cend] = distchunk[0:int(ncolchunk * nrowchunk), :].reshape(nrowchunk, ncolchunk, + nnn) + sigma[rstart:rend, cstart:cend] = np.minimum(sigma[rstart:rend, cstart:cend], sigmachunk) + + idlewrker.append(busywrker.pop(indx)) + tasks.remove(tsk) + ndone += 1 + if verbose >= 2: + print("tiles complete: ", ndone,"/", njobs,sep='', end='\r' ) + if verbose >= 2: + print('\n', end='') + return sigma, dist, countnn + + def _sigmachunkcalc_cl(self, data, calclim, clparams=None, saturation_protect=True): + nn = self.sigmann + + data = np.ascontiguousarray(data) + ctx = clparams.ctx + prg = clparams.prg + clparams.get_queue() + + + mf = clparams.memflags + clvectlen = 16 + + cstart = calclim.cstart + cend = calclim.cend + ncolchunk = calclim.ncolchunk + rstart = calclim.rstart + rend = calclim.rend + nrowchunk = calclim.nrowchunk + + rstartcalc = calclim.rstartcalc + cstartcalc = calclim.cstartcalc + nrowcalc = calclim.nrowcalc + ncolcalc = calclim.ncolcalc + + pheight = data.shape[1] + pwidth = data.shape[2] + npat_point = pheight*pwidth + + mask = self.mask + + npad = clvectlen * int(np.ceil(mask.size/clvectlen)) + maskpad = np.zeros((npad) , np.float32) + maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32) + mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad) + + npatsteps = int(maskpad.size/clvectlen) + + #return chunks, chunksize + mxchunk = data.shape[0] + + npadmx = clvectlen * int(np.ceil(mxchunk*npat_point/ clvectlen)) + + datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) + + nnn = int((2 * nn + 1) ** 2) + + #dist_local = cl.LocalMemory(nnn*npadmx*4) + dist_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk*nnn*4)) + distchunk = np.zeros((mxchunk, nnn), dtype=np.float32) + #count_local = cl.LocalMemory(nnn*npadmx*4) + count_local = cl.Buffer(ctx, mf.READ_WRITE, size=int(mxchunk * nnn * 4)) + countchunk = np.zeros((mxchunk, nnn), dtype=np.float32) + + mxval = data.max() + if saturation_protect == False: + mxval += 1.0 + else: + mxval *= 0.9961 + + evnt = cl.enqueue_fill_buffer(clparams.queue, datapad_gpu, np.float32(mxval+10), 0,int(4*npadmx)) + + szdata = data.size + npad = clvectlen * int(np.ceil(szdata / clvectlen)) + tic = timer() + #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10) + #datapad[0:szdata] = data.reshape(-1) + data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data) + + if data.dtype.type is np.float32: + prg.nlloadpat32flt(clparams.queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt]) + if data.dtype.type is np.ubyte: + prg.nlloadpat8bit(clparams.queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt]) + if data.dtype.type is np.uint16: + prg.nlloadpat16bit(clparams.queue, (np.uint64(data.size),), None, data_gpu, datapad_gpu, wait_for=[evnt]) + toc = timer() + #print(toc - tic) + + sigmachunk = np.zeros((nrowchunk, ncolchunk ), dtype=np.float32) + + + sigmachunk_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, size=sigmachunk.nbytes) + cl.enqueue_barrier(clparams.queue) + prg.calcsigma(clparams.queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None, + datapad_gpu, mask_gpu,sigmachunk_gpu, + dist_local, count_local, + np.int64(nn), np.int64(npatsteps), np.int64(npat_point), + np.float32(mxval) ) + + cl.enqueue_barrier(clparams.queue) + prg.normd(clparams.queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None, + sigmachunk_gpu, + count_local, dist_local, + np.int64(nn)) + clparams.queue.finish() + + cl.enqueue_copy(clparams.queue, distchunk, dist_local, is_blocking=False) + cl.enqueue_copy(clparams.queue, countchunk, count_local, is_blocking=False) + cl.enqueue_copy(clparams.queue, sigmachunk, sigmachunk_gpu, is_blocking=True) + + #sigmachunk_gpu.release() + + clparams.queue.finish() + + dist_local.release() + count_local.release() + datapad_gpu.release() + clparams.queue.flush() + clparams.queue = None + #self.sigma = sigma + return sigmachunk, distchunk, countchunk + + + + def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturation_protect=True, automask=True, + filename=None, fileout=None, reset_sigma=False, backsub = False, rescale = False, + verbose = 2, gpu_id = None, **kwargs): + + if lam is not None: + self.lam = lam + + self.saturation_protect = saturation_protect + + if dthresh is not None: + self.dthresh = dthresh + if self.dthresh is None: + self.dthresh = 0.0 + + if searchradius is not None: + self.searchradius = searchradius + + if self.searchradius > 10: + print("NLPAR on GPU is limited to a search radius <= 10") + print("The search radius has been clipped to 10") + searchradius = 10 + self.searchradius = searchradius + + lam = np.float32(self.lam) + dthresh = np.float32(self.dthresh) + sr = np.int64(self.searchradius) + + if filename is not None: + self.setfile(filepath=filename) + + self.patternfile = self.getinfileobj() + self.setoutfile(self.patternfile, filepath=fileout) + self.patternfileout = self.getoutfileobj() + + nrows = np.int64(self.nrows) # np.uint64(patternfile.nRows) + ncols = np.int64(self.ncols) # np.uint64(patternfile.nCols) + + pwidth = np.uint64(self.patternfile.patternW) + pheight = np.uint64(self.patternfile.patternH) + npat_point = int(pwidth * pheight) + + if reset_sigma: + self.sigma = None + + if np.asarray(self.sigma).size == 1 and (self.sigma is not None): + tmp = np.asarray(self.sigma)[0] + self.sigma = np.zeros((nrows, ncols), dtype=np.float32) + tmp + calcsigma = False + + if self.sigma is not None: + shpsigma = np.asarray(self.sigma).shape + if (shpsigma[0] != nrows) and (shpsigma[1] != ncols): + self.sigma = None + + if self.sigma is None: + self.sigma = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, gpu_id=gpu_id)[0] + + sigma = np.asarray(self.sigma).astype(np.float32) + + + + if (automask is True) and (self.mask is None): + self.mask = (self.automask(pheight, pwidth)) + if self.mask is None: + self.mask = np.ones((pheight, pwidth), dtype=np.uint8) + + scalemethod = 'clip' + self.rescale = False + if rescale == True: + self.rescale = True + if np.issubdtype(self.patternfileout.filedatatype, np.integer): + mxval = np.iinfo(self.patternfileout.filedatatype).max + scalemethod = 'fullscale' + else: # not int, so no rescale. + self.rescale = False + + ngpuwrker = 6 + clparams = openclparam.OpenClParam() + clparams.get_gpu() + if gpu_id is None: + target_mem = 0 + gpu_id = 0 + count = 0 + + for gpu in clparams.gpu: + gmem = gpu.max_mem_alloc_size + if target_mem < gmem: + gpu_id = count + target_mem = gmem + count += 1 + else: + gpu_id = min(len(clparams.gpu)-1, gpu_id) + cudavis = '' + for cdgpu in range(len(clparams.gpu)): + cudavis += str(cdgpu) + ',' + + # print(gpu_id) + clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl') + clparams.get_queue() + if clparams.gpu[gpu_id].host_unified_memory: + return nlpar_cl.NLPAR.calcnlpar_cl(self, saturation_protect=saturation_protect, + automask=automask, + filename=filename, + fileout=fileout, + reset_sigma=reset_sigma, + backsub = backsub, + rescale = rescale, + gpu_id= gpu_id) + + target_mem = clparams.gpu[gpu_id].max_mem_alloc_size//3 + max_mem = clparams.gpu[gpu_id].global_mem_size*0.75 + if target_mem*ngpuwrker > max_mem: + target_mem = max_mem/ngpuwrker + #print(target_mem/1.0e9) + + chunks = self._calcchunks([pwidth, pheight], ncols, nrows, target_bytes=target_mem, + col_overlap=sr, row_overlap=sr) + + nnn = int((2 * sr + 1) ** 2) + jobqueue = [] + + + for rowchunk in range(chunks[1]): + rstart = chunks[3][rowchunk, 0] + rend = chunks[3][rowchunk, 1] + nrowchunk = rend - rstart + + rstartcalc = sr if (rowchunk > 0) else 0 + rendcalc = nrowchunk - sr if (rowchunk < (chunks[1] - 1)) else nrowchunk + nrowcalc = np.int64(rendcalc - rstartcalc) + + for colchunk in range(chunks[0]): + cstart = chunks[2][colchunk, 0] + cend = chunks[2][colchunk, 1] + ncolchunk = cend - cstart + + cstartcalc = sr if (colchunk > 0) else 0 + cendcalc = ncolchunk - sr if (colchunk < (chunks[0] - 1)) else ncolchunk + ncolcalc = np.int64(cendcalc - cstartcalc) + + jobqueue.append( NLPARGPUJob([colchunk, rowchunk],\ + [cstart,cend, rstart, rend],\ + [cstartcalc,cendcalc, rstartcalc, rendcalc ])) + + + if verbose >=1: + print("lambda:", lam, "search radius:", sr, "dthresh:", dthresh) + ngpu_per_wrker = float(1.0/ngpuwrker) + ray.shutdown() + + rayclust = ray.init( + num_cpus=int(ngpuwrker), + num_gpus=1, + # dashboard_host = RAYIPADDRESS, + _node_ip_address=RAYIPADDRESS, # "0.0.0.0", + runtime_env={"env_vars": + {"PYTHONPATH": os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + }}, + logging_level=logging.WARNING,) # Supress INFO messages from ray. + + nlpar_remote = ray.put(self) + + idlewrker = [] + busywrker = [] + tasks = [] + + for w in range(ngpuwrker): + idlewrker.append(NLPARGPUWorker.options(num_cpus=float(0.99), num_gpus=ngpu_per_wrker).remote( + actorid=w, gpu_id=gpu_id, cudavis=cudavis)) + + njobs = len(jobqueue) + ndone = 0 + while ndone < njobs: + if len(jobqueue) > 0: + if len(idlewrker) > 0: + wrker = idlewrker.pop() + job = jobqueue.pop() + + tasks.append(wrker.runnlpar_chunk.remote(job, nlparobj=nlpar_remote)) + busywrker.append(wrker) + if len(tasks) > 0: + donetasks, stillbusy = ray.wait(tasks, num_returns=len(busywrker), timeout=0.1) + + for tsk in donetasks: + indx = tasks.index(tsk) + message, job, newdata = ray.get(tsk) + if message == 'Done': + idlewrker.append(busywrker.pop(indx)) + tasks.remove(tsk) + ndone += 1 + if verbose >= 2: + print("tiles complete: ", ndone, "/", njobs, sep='', end='\r') + if verbose >= 2: + print('\n', end='') + return str(self.patternfileout.filepath) + + def _nlparchunkcalc_cl(self, data, calclim, clparams=None): + data = np.ascontiguousarray(data) + ctx = clparams.ctx + prg = clparams.prg + clparams.get_queue() + + + mf = clparams.memflags + clvectlen = 16 + + cstart = calclim.cstart + cend = calclim.cend + ncolchunk = calclim.ncolchunk + rstart = calclim.rstart + rend = calclim.rend + nrowchunk = calclim.nrowchunk + + rstartcalc = calclim.rstartcalc + cstartcalc = calclim.cstartcalc + nrowcalc = calclim.nrowcalc + ncolcalc = calclim.ncolcalc + + pheight = data.shape[1] + pwidth = data.shape[2] + + + + lam = np.float32(self.lam) + sr = np.int64(self.searchradius) + nnn = int((2 * sr + 1) ** 2) + dthresh = np.float32(self.dthresh) + #print(chunks[2], chunks[3]) + #print(lam, sr, dthresh) + + + # precalculate some needed arrays for the GPU + mxval = data.max() + if self.saturation_protect == False: + mxval += 1.0 + else: + mxval *= 0.9961 + + shpdata = data.shape + npat_point = int(shpdata[-1]*shpdata[-2]) + npat = shpdata[0] + + npadmx = clvectlen * int(np.ceil(float(npat)*npat_point/ clvectlen)) + data_gpu = cl.Buffer(ctx, mf.READ_ONLY| mf.COPY_HOST_PTR, hostbuf=data) + datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) + datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) + + fill1 = cl.enqueue_fill_buffer(clparams.queue, datapad_gpu, np.float32(mxval + 10), 0, int(4 * npadmx)) + fill2 = cl.enqueue_fill_buffer(clparams.queue, datapadout_gpu, np.float32(0.0), 0, int(4 * npadmx)) + + + mask = self.mask.astype(np.float32) + + npad = clvectlen * int(np.ceil(mask.size / clvectlen)) + maskpad = np.zeros((npad), np.float32) - 1 # negative numbers will indicate a clvector overflow. + maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32) + mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad) + npatsteps = int(maskpad.size / clvectlen) + + sigmachunk = np.ascontiguousarray(self.sigma[rstart:rend, cstart:cend].astype(np.float32)) + sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk) + + szdata = data.size + cl.enqueue_barrier(clparams.queue) + data_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data) + if data.dtype.type is np.float32: + prg.nlloadpat32flt(clparams.queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu)#, wait_for=[fill1,fill2]) + if data.dtype.type is np.ubyte: + prg.nlloadpat8bit(clparams.queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu)#, wait_for=[fill1,fill2]) + if data.dtype.type is np.uint16: + prg.nlloadpat16bit(clparams.queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu)#, wait_for=[fill1,fill2]) + + calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64) + crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim) + cl.enqueue_barrier(clparams.queue) + data_gpu.release() + prg.calcnlpar(clparams.queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None, + # prg.calcnlpar(clparams.queue, (1, 1), None, + datapad_gpu, + mask_gpu, + sigmachunk_gpu, + crlimits_gpu, + datapadout_gpu, + np.int64(sr), + np.int64(npatsteps), + np.int64(npat_point), + np.float32(mxval), + np.float32(1.0 / lam ** 2), + np.float32(dthresh)) + + data = data.astype(np.float32) # prepare to receive data back from GPU + data.reshape(-1)[:] = 0.0 + data = data.reshape(nrowchunk, ncolchunk, pheight, pwidth) + cl.enqueue_copy(clparams.queue, data, datapadout_gpu, is_blocking=True) + sigmachunk_gpu.release() + clparams.queue.finish() + if self.rescale == True: + for i in range(data.shape[0]): + temp = data[i, :, :] + temp -= temp.min() + temp *= np.float32(mxval) / temp.max() + data[i, :, :] = temp + data = data[rstartcalc: rstartcalc + nrowcalc, cstartcalc: cstartcalc + ncolcalc, :, :] + data = data.reshape(nrowcalc * ncolcalc, pheight, pwidth) + + clparams.queue = None + + return data + + + +@ray.remote +class NLPARGPUWorker: + def __init__(self, actorid=0, gpu_id=None, cudavis = '0'): + # sys.path.append(path.dirname(path.dirname(__file__))) # do this to help Ray find the program files + # import openclparam # do this to help Ray find the program files + # device, context, queue, program, mf + # self.dataout = None + # self.indxstart = None + # self.indxend = None + # self.rate = None + os.environ["CUDA_VISIBLE_DEVICES"] = cudavis + self.actorID = actorid + self.openCLParams = openclparam.OpenClParam() + + self.openCLParams.gpu_id = gpu_id + self.openCLParams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl') + + + #elf.openCLParams = None + + def runsigma_chunk(self,gpujob, nlparobj=None, **kwargs): + if gpujob is None: + #time.sleep(0.001) + return 'Bored', (None, None, None) + try: + # print(type(self.openCLParams.ctx)) + gpujob._starttime() + #time.sleep(random.uniform(0, 1.0)) + + #if self.openCLParams is not None: + # self.openCLParams.get_queue() + data, xyloc = nlparobj.patternfile.read_data(patStartCount=[[gpujob.cstart, gpujob.rstart], + [gpujob.ncolchunk, gpujob.nrowchunk]], + convertToFloat=False, returnArrayOnly=True) + + newdata = nlparobj._sigmachunkcalc_cl(data, gpujob, clparams=self.openCLParams, **kwargs) + + if self.openCLParams.queue is not None: + print("queue still here") + self.openCLParams.queue.finish() + self.openCLParams.queue = None + + gpujob._endtime() + return 'Done', gpujob, newdata + except Exception as e: + print(e) + gpujob.rate = None + return "Error", gpujob, e + + + def runnlpar_chunk(self, gpujob, nlparobj=None): + + if gpujob is None: + #time.sleep(0.001) + return 'Bored', (None, None, None) + try: + # print(type(self.openCLParams.ctx)) + gpujob._starttime() + #time.sleep(random.uniform(0, 1.0)) + + #if self.openCLParams is not None: + # self.openCLParams.get_queue() + data, xyloc = nlparobj.patternfile.read_data(patStartCount=[[gpujob.cstart, gpujob.rstart], + [gpujob.ncolchunk, gpujob.nrowchunk]], + convertToFloat=False, returnArrayOnly=True) + + newdata = nlparobj._nlparchunkcalc_cl(data, gpujob, clparams=self.openCLParams) + + if self.openCLParams.queue is not None: + print("queue still here") + self.openCLParams.queue.finish() + self.openCLParams.queue = None + + nlparobj.patternfileout.write_data(newpatterns=newdata, + patStartCount=[[gpujob.cstart + gpujob.cstartcalc, + gpujob.rstart + gpujob.rstartcalc], + [gpujob.ncolcalc, gpujob.nrowcalc]], + flt2int='clip', scalevalue=1.0) + + gpujob._endtime() + return 'Done', gpujob, None + except Exception as e: + print(e) + gpujob.rate = None + return "Error", gpujob, e + + +class NLPARGPUJob: + def __init__(self, jobid, chunk, calclim): + self.jobid = jobid + if self.jobid is not None: + self.cstart = chunk[0] + self.cend = chunk[1] + self.ncolchunk = self.cend - self.cstart + self.rstart = chunk[2] + self.rend = chunk[3] + self.nrowchunk = self.rend - self.rstart + self.cstartcalc = calclim[0] + self.cendcalc = calclim[1] + self.ncolcalc = self.cendcalc - self.cstartcalc + self.rstartcalc = calclim[2] + self.rendcalc = calclim[3] + self.nrowcalc = self.rendcalc - self.rstartcalc + self.starttime = 0.0 + self.endtime = 0.0 + self.extime = 0.0 + + def _starttime(self): + self.starttime = timer() + + def _endtime(self): + self.endtime = timer() + self.extime += self.endtime - self.starttime + #self.rate = self.npat / (self.extime + 1e-12) + + + diff --git a/pyebsdindex/opencl/openclparam.py b/pyebsdindex/opencl/openclparam.py index b1f8e70..fcef230 100644 --- a/pyebsdindex/opencl/openclparam.py +++ b/pyebsdindex/opencl/openclparam.py @@ -55,17 +55,40 @@ def __init__(self, gpu_id=0): def get_platform(self): self.platform = cl.get_platforms()[0] - def get_gpu(self): + return self.platform + def get_gpu(self, get_integrated_and_discrete=False): if self.platform is None: self.get_platform() - self.gpu = self.platform.get_devices(device_type=cl.device_type.GPU) - self.ngpu = len(self.gpu) - if len(self.gpu)-1 < self.gpu_id: - self.gpu_id = len(self.gpu)-1 - - def get_context(self, gpu_id=None): + gpu = self.platform.get_devices(device_type=cl.device_type.GPU) + if get_integrated_and_discrete == True: # get all GPU, regardless of integrated or not + self.gpu = gpu + self.ngpu = len(self.gpu) + + else: + if len(gpu) == 1: # only one GPU -- keep it even if integrated. + self.gpu = gpu + self.ngpu = len(self.gpu) + elif len(gpu) > 1: # More than one gpu + gpukeep = [] + gpudrop = [] + for g in gpu: + if (g.host_unified_memory == 1): # these are integrated GPU + gpudrop.append(g) + else: + gpukeep.append(g) # these are discrete GPU + if len(gpukeep) > 0: # prefer to keep discrete + self.gpu = gpukeep + else: + self.gpu = gpudrop #but will take integrated if needed. + self.ngpu = len(self.gpu) + if len(self.gpu) - 1 < self.gpu_id: + self.gpu_id = len(self.gpu) - 1 + return self.gpu + + + def get_context(self, gpu_id=None, kfile = 'clkernels.cl' ): if self.gpu is None: self.get_gpu() @@ -77,12 +100,14 @@ def get_context(self, gpu_id=None): self.ctx = cl.Context(devices = [self.gpu[self.gpu_id]]) kernel_location = path.dirname(__file__) - self.prg = cl.Program(self.ctx,open(path.join(kernel_location,'clkernels.cl')).read()).build() + self.prg = cl.Program(self.ctx,open(path.join(kernel_location,kfile)).read()).build() #print('ctx', self.gpu_id) + return self.ctx def get_queue(self, gpu_id=None): if self.ctx is None: self.get_context(gpu_id=None) if self.queue is None: self.queue = cl.CommandQueue(self.ctx) + return self.queue diff --git a/pyebsdindex/opencl/test.cl b/pyebsdindex/opencl/test.cl new file mode 100644 index 0000000..c9e8945 --- /dev/null +++ b/pyebsdindex/opencl/test.cl @@ -0,0 +1,67 @@ + + +float sum16(const float16 v1); + +// float sum16(const float16 *v1){ +// float sum = 0.0; +// sum += v1[0].s0; +// sum += v1[0].s1; +// sum += v1[0].s2; +// sum += v1[0].s3; +// sum += v1[0].s4; +// sum += v1[0].s5; +// sum += v1[0].s6; +// sum += v1[0].s7; +// sum += v1[0].s8; +// sum += v1[0].s9; +// sum += v1[0].sa; +// sum += v1[0].sb; +// sum += v1[0].sc; +// sum += v1[0].sd; +// sum += v1[0].se; +// sum += v1[0].sf; +// return sum; + +// } + + +float sum16(const float16 v1){ + float sum = 0.0; + sum += v1.s0; + sum += v1.s1; + sum += v1.s2; + sum += v1.s3; + sum += v1.s4; + sum += v1.s5; + sum += v1.s6; + sum += v1.s7; + sum += v1.s8; + sum += v1.s9; + sum += v1.sa; + sum += v1.sb; + sum += v1.sc; + sum += v1.sd; + sum += v1.se; + sum += v1.sf; + return sum; + +} + + + +__kernel void testdot(float t){ +printf("%f\n",t ); +float16 d1 = (float16)(1.0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16); +//float16 d2 = (float16)(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32); +//float8 d11 = d1.lo; +//float8 d22 = d2.lo; +//float t; + + +//float8 d1 = (float8) (1,2,3,4,4,3,2,1); +//float8 d2 = (float8) (5,6,7,8,5,6,7,8); +//t = d11.s0; +t = sum16(d1); +printf("%f",t ); + +} diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py index 34eca45..60e19ec 100644 --- a/pyebsdindex/radon_fast.py +++ b/pyebsdindex/radon_fast.py @@ -20,12 +20,21 @@ Author: David Rowenhorst; The US Naval Research Laboratory Date: 21 Aug 2020""" -from os import environ +#from os import environ +import os +from pathlib import PurePath, Path from timeit import default_timer as timer from numba import jit, prange import numpy as np +tempdir = PurePath(Path.home()) +#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) +tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache') +#tempdir = tempdir.joinpath('numbacache') +Path(tempdir).mkdir(parents=True, exist_ok=True) +os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep) + RADEG = 180.0/np.pi DEGRAD = np.pi/180.0 diff --git a/pyebsdindex/rotlib.py b/pyebsdindex/rotlib.py index 8133186..e6dfd2c 100644 --- a/pyebsdindex/rotlib.py +++ b/pyebsdindex/rotlib.py @@ -51,13 +51,16 @@ import numpy as np import numba -from os import environ +import os import tempfile -from pathlib import PurePath +from pathlib import PurePath, Path import platform -tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) -tempdir = tempdir.joinpath('numba') -environ["NUMBA_CACHE_DIR"] = str(tempdir) +tempdir = PurePath(Path.home()) +#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) +#tempdir = tempdir.joinpath('numbacache') +tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache') +Path(tempdir).mkdir(parents=True, exist_ok=True) +os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep) P = 1 eps = 1.0e-12 # used for "closeto" approximations in Numba code diff --git a/pyebsdindex/tests/test_package.py b/pyebsdindex/tests/test_package.py index 85244e1..2d2706b 100644 --- a/pyebsdindex/tests/test_package.py +++ b/pyebsdindex/tests/test_package.py @@ -39,7 +39,8 @@ def test_available_functionality_without_pyopencl(): @pytest.mark.skipif(_pyopencl_installed, reason="pyopencl is installed") def test_unavailable_functionality_without_pyopencl(): with pytest.raises(ImportError): - from pyebsdindex.opencl.band_detect_cl import BandDetect + raise ImportError() + #from pyebsdindex.opencl.band_detect_cl import BandDetect @pytest.mark.skipif(not _ray_installed, reason="ray is not installed") diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py index addfe9b..67fcbd1 100644 --- a/pyebsdindex/tripletvote.py +++ b/pyebsdindex/tripletvote.py @@ -24,16 +24,35 @@ indexing. """ -from os import environ -from pathlib import PurePath +import os +from pathlib import PurePath, Path import platform -import tempfile +#import tempfile from timeit import default_timer as timer import math import numpy as np import numba +#keep this around for profiling numba functions +#import ctypes +#import time + +# # Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi +# get_system_clock = ctypes.pythonapi._PyTime_GetSystemClock +# as_seconds_double = ctypes.pythonapi._PyTime_AsSecondsDouble +# +# # Set the argument types and return types of the functions +# get_system_clock.argtypes = [] +# get_system_clock.restype = ctypes.c_int64 +# +# as_seconds_double.argtypes = [ctypes.c_int64] +# as_seconds_double.restype = ctypes.c_double +# @numba.jit(nopython=True, cache=True,fastmath=True,parallel=False) +# def ntime()-> np.float64: +# return np.float64(as_seconds_double(get_system_clock())) +### END of numba timer #### + from pyebsdindex import crystal_sym, rotlib, crystallometry @@ -41,9 +60,12 @@ RADEG = 180.0/np.pi -tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) -tempdir = tempdir.joinpath('numba') -environ["NUMBA_CACHE_DIR"] = str(tempdir) +tempdir = PurePath(Path.home()) +#tempdir = PurePath("/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()) +tempdir = tempdir.joinpath('.pyebsdindex').joinpath('numbacache') +#tempdir = tempdir.joinpath('numbacache') +Path(tempdir).mkdir(parents=True, exist_ok=True) +os.environ["NUMBA_CACHE_DIR"] = str(tempdir)+str(os.sep) def addphase(libtype=None, phasename=None, spacegroup=None, @@ -462,14 +484,27 @@ def build_trip_lib(self): def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose=0): tic0 = timer() nfam = self.polefamilies.shape[0] - bandnorms = np.squeeze(band_norms) - n_bands = np.reshape(bandnorms, (-1,3)).shape[0] #np.int64(bandnorms.size/3) + #bandnorms = np.squeeze(band_norms) + if band_norms.ndim == 2: + bandnorms = band_norms[np.newaxis, ...] + else: + bandnorms = band_norms + + + + + n_bands = bandnorms.shape[1] + npats = bandnorms.shape[0] if band_intensity is None: - band_intensity = np.ones((n_bands)) + band_intensity = np.ones((npats, n_bands)) + + if band_intensity.ndim == 1: + band_intensity = band_intensity[np.newaxis, ...] + tic = timer() - bandangs = np.abs(bandnorms.dot(bandnorms.T)) - bandangs = np.clip(bandangs, -1.0, 1.0) - bandangs = np.arccos(bandangs)*RADEG + #bandangs = np.abs(bandnorms.dot(bandnorms.T)) + #bandangs = np.clip(bandangs, -1.0, 1.0) + #bandangs = np.arccos(bandangs)*RADEG tripangs = self.angtriplets['angles'] @@ -477,9 +512,13 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose pairangs = self.angpairs['angles'] pairfam = self.angpairs['familyid'] - accumulator, bandFam, bandRank, band_cm, accumulator_nw = self._tripvote_numba(bandangs, self.lut, self.angTol, tripangs, tripid, nfam, n_bands) - #accumulator, bandFam, bandRank, band_cm = self._pairvote_numba(bandangs, self.angTol, pairangs, pairfam, - # nfam, n_bands) + accumulator, bandFam, bandRank, band_cm, accumulator_nw \ + = self._tripvote_numba(bandnorms, band_intensity, self.lut, self.angTol, tripangs, tripid, nfam) + #accumulator, bandFam, bandRank, band_cm, accumulator_nw \ + # = self._pairvote_numba(bandnorms, band_intensity, self.angTol, pairangs, pairfam, nfam) + + bandRank_arg = np.argsort(bandRank, axis=1).astype(np.int64) + if verbose > 2: print('band Vote time:',timer() - tic) if verbose > 3: @@ -494,26 +533,49 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose sumaccum = np.sum(accumulator) - bandRank_arg = np.argsort(bandRank).astype(np.int64) # n_bands - np.arange(n_bands, dtype=np.int64) # + #bandRank_arg = np.argsort(bandRank).astype(np.int64) # n_bands - np.arange(n_bands, dtype=np.int64) # test = 0 fit = 1000.0 nMatch = -1 avequat = np.zeros(4, dtype=np.float32) avequat[0] = 1.0 - polematch = np.zeros([n_bands], dtype = int)-1 + polematch = -1*np.ones([n_bands], dtype = int) whGood = -1 - angTable = self.completelib['angTable'] - sztable = angTable.shape - famIndx = self.completelib['famIndex'] + libAngTable = self.completelib['angTable'] + sztable = libAngTable.shape + libFamIndx = self.completelib['famIndex'] + libFamID = self.completelib['familyid'] nFam = self.completelib['nFamily'] - polesCart = self.completelib['polesCart'] + libPolesCart = self.completelib['polesCart'] angTol = self.angTol n_band_early = np.int64(self.nband_earlyexit) # this will check the vote, and return the exact band matching to specific poles of the best fitting solution. fit, polematch, nMatch, whGood, ij, R, fitb = \ - self._assign_bands_nb(polesCart, bandRank_arg, bandFam, famIndx, nFam, angTable, bandnorms, angTol, n_band_early) + self._assign_bands_nb(libPolesCart, libAngTable, libFamIndx, nFam, angTol, n_band_early, bandnorms, bandRank_arg, bandFam) + + # check how often the indexed band matched the top voting band family. + acc_correct = np.sum(np.array((polematch >= 0) & (self.completelib['familyid'][polematch] == bandFam), dtype=int),axis=1).astype(np.int32) + + + # accumulator = accumulator[0, ...] + # bandFam = bandFam[0, ...] + # bandRank = bandRank[0, ...] + # band_cm = band_cm[0, ...] + # accumulator_nw = accumulator_nw[0, ...] + # bandnorms = bandnorms[0, ...] + # bandRank_arg = bandRank_arg[0, ...] + # fit = fit[0] + # polematch = polematch[0, ...] + # nMatch = nMatch[0] + # whGood = whGood[0, ...] + # whGood = whGood[0:nMatch] + # ij = ij[0, ...] + # + # fitb = fitb[0, ...] + # acc_correct = acc_correct[0,...] + if verbose > 3: #print(rotlib.om2qu(R)) @@ -522,7 +584,8 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose #print(fitb) print('___Assigned Band___') print(self.completelib['familyid'][polematch]) - acc_correct = np.sum( np.array(self.completelib['familyid'][polematch] == bandFam).astype(int)).astype(int) + + #acc_correct = np.sum( np.array(self.completelib['familyid'][polematch] == bandFam).astype(int), axis = (1)).astype(int) if verbose > 2: #print(polematch) #print(fit, fitb, fitb[whGood]) @@ -530,53 +593,28 @@ def bandindex(self, band_norms, band_intensity = None, band_widths=None, verbose tic = timer() - cm2 = 0.0 - if nMatch >=2: - if self.high_fidelity == True: - - score = accumulator[[self.completelib['familyid'][polematch[whGood]]], [whGood]] - score /= accumulator_nw[[self.completelib['familyid'][polematch[whGood]]], [whGood]] + 1.0e-6 - score = np.squeeze(score) - - srt = np.flip(np.argsort(score)) - #print(srt+1) - #print(score[srt]) - #print(band_intensity[whGood[srt]]) - - #srt = np.flip(np.argsort(band_intensity[whGood])) - whgood6 = whGood[srt[0:np.min([6, whGood.shape[0]])]] - #if verbose > 2: - # print("Good bands:", whGood+1) - # print("Fit Bands: ", whgood6+1) - #weights6 = score[srt[0:np.min([6, whGood.shape[0]])]] - weights6 = band_intensity[whgood6] - weights6 -= weights6.min() - weights6 *= 1/weights6.max() - #weights6 += 1 - weights6 = np.exp(weights6**2) - #weights6 = np.exp(weights6) - - pflt6 = (np.asarray(polesCart[polematch[whgood6], :], dtype=np.float64)) - bndnorm6 = (np.asarray(bandnorms[whgood6, :], dtype=np.float64)) - - avequat, fit = self._refine_orientation_quest(bndnorm6, pflt6, weights=weights6) - #fitfull = self._fitcheck(avequat, - # np.asarray(bandnorms[whGood, :]), np.asarray(polesCart[polematch[whGood], :] )) + if self.high_fidelity == True: - fit = np.arccos(np.clip(fit, -1.0, 1.0))*RADEG + weights = self._calc_quest_weights(libFamID, accumulator, accumulator_nw, polematch, band_intensity, nfit=6) + avequat, fit = self._refine_orientation_quest(libPolesCart, bandnorms, polematch, weights = weights) + fit = np.arccos(np.clip(fit, -1.0, 1.0))*RADEG + else: + avequat = rotlib.om2qu(R) - else: - avequat = rotlib.om2qu(R) - whmatch = np.nonzero(polematch >= 0)[0] - cm = np.mean(band_cm[whmatch]) - whfam = self.completelib['familyid'][polematch[whmatch]] - cm2 = np.sum(accumulator[[whfam], [whmatch]]).astype(np.float32) - cm2 /= np.sum(accumulator.clip(1)) + cm2 = self._calc_cm(accumulator, polematch, libFamID) if verbose > 2: print('refinement: ', timer() - tic) print('all: ',timer() - tic0) + + # avequat = avequat[0,...] + # fit = fit[0] + # cm2 = cm2[0] + # polematch = polematch[0,...] + # nMatch = nMatch[0] + # ij = ij[0,...] + # acc_correct = acc_correct[0,...] return avequat, fit, cm2, polematch, nMatch, ij, acc_correct #sumaccum def _symrotpoles(self, pole, crystalmats): @@ -798,18 +836,52 @@ def _refine_orientation(self, bandnorms, whGood, polematch): #print('fitting: ',timer() - tic) return avequat, fit - def _refine_orientation_quest(self, bandnorms, polecartmatch, weights = None): + @staticmethod + @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False) + def _calc_quest_weights( libComFamID, accumulator, accumulator_nw, polematch, band_intensity, nfit=6): + npats = accumulator.shape[0] + nbands = polematch.shape[-1] + weights = np.zeros((npats, nbands), dtype=np.float32) + + for p in range(npats): + score = np.full((nbands), -1.0, np.float32) + pmatch = np.ravel(polematch[p, :]).astype(np.int64) + whGood = (np.nonzero(pmatch >= 0)[0]).astype(np.int64) + + if whGood.size < 2: + continue + + acc = accumulator[p, ...] + acc_nw = accumulator_nw[p,...] + for q in range(whGood.size): + whg = np.uint64(whGood[q]) + a1indx = np.uint64(libComFamID[pmatch[whg]]) + score[whg] = acc[a1indx, whg] + score[whg] /= max(acc_nw[a1indx, whg], 1.0e-12) + + srt = np.flip(np.argsort(score)) + + srt6 = srt[0:min(nfit, whGood.size)] + for s in srt6: + weights[p, s] = band_intensity[p, s] + + return weights + + def _refine_orientation_quest(self, libpolecart, bandnorms, polesmatch, weights = None): tic = timer() + npats = bandnorms.shape[0] + nbands = bandnorms.shape[-1] if weights is None: - weights = np.ones((bandnorms.shape[0]), dtype=np.float64) + weights = np.ones((npats, nbands), dtype=np.float64) + weights *= (polesmatch > 0).astype(np.float32) + weightsn = np.asarray(weights, dtype=np.float64) - weightsn /= np.sum(weightsn) + weightsn /= np.maximum(np.sum(weightsn, axis=1), 1e-12).reshape(-1, 1) #print(weightsn) - pflt = np.asarray(polecartmatch, dtype=np.float64) + pflt = np.asarray(libpolecart[polesmatch, :], dtype=np.float64) bndnorm = np.asarray(bandnorms, dtype=np.float64) - avequat, fit = self._orientation_quest_nb(pflt, bndnorm, weightsn) return avequat, fit @@ -818,431 +890,502 @@ def _refine_orientation_quest(self, bandnorms, polecartmatch, weights = None): @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False) def _orientation_quest_nb(polescart, bandnorms, weights): # this uses the Quaternion Estimator AKA quest algorithm. + # this has been adjusted to work with a batch of matching vectors. + eps = 1.0e-7 + npats = bandnorms.shape[0] + nbands = bandnorms.shape[-1] - pflt = np.asarray(polescart, dtype=np.float64) - bndnorm = np.asarray(bandnorms, dtype=np.float64) - npoles = pflt.shape[0] - wn = (np.asarray(weights, dtype=np.float64)).reshape(npoles, 1) - - # wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood) #(weights[whGood]).reshape(nGood,1) - wn /= np.sum(wn) - - I = np.zeros((3, 3), dtype=np.float64) - I[0, 0] = 1.0; - I[1, 1] = 1.0; - I[2, 2] = 1.0 - q = np.zeros((4), dtype=np.float64) - - B = (wn * bndnorm).T @ pflt - S = B + B.T - z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis=0), dtype=np.float64) - S2 = S @ S - det = np.linalg.det(S) - k = (S[1, 1] * S[2, 2] - S[1, 2] * S[2, 1]) + (S[0, 0] * S[2, 2] - S[0, 2] * S[2, 0]) + ( - S[0, 0] * S[1, 1] - S[1, 0] * S[0, 1]) - sig = 0.5 * (S[0, 0] + S[1, 1] + S[2, 2]) - sig2 = sig * sig - d = z.T @ S2 @ z - c = det + (z.T @ S @ z) - b = sig2 + (z.T @ z) - a = sig2 - k - - lam = 1.0 - tol = 1.0e-12 - iter = 0 - dlam = 1e6 - # for i in range(10): - while (dlam > tol) and (iter < 10): - lam0 = lam - lam = lam - (lam ** 4 - (a + b) * lam ** 2 - c * lam + (a * b + c * sig - d)) / ( - 4 * lam ** 3 - 2 * (a + b) * lam - c) - dlam = np.fabs(lam0 - lam) - iter += 1 - - beta = lam - sig - alpha = lam ** 2 - sig2 + k - gamma = (lam + sig) * alpha - det - X = np.asarray((alpha * I + beta * S + S2), dtype=np.float64) @ z - qn = np.float64(0.0) - qn += gamma ** 2 + X[0] ** 2 + X[1] ** 2 + X[2] ** 2 - qn = np.sqrt(qn) - q[0] = gamma - q[1:4] = X[0:3] - q /= qn - if (np.sign(gamma) < 0): - q *= -1.0 - - # polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1) - # pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64) - return q, lam # , pdot + qout = np.zeros((npats, 4), dtype=np.float64) + qout[:, 0] = 1.0 + fitout = np.full((npats), np.pi, dtype=np.float64) + + for p in range(npats): + + whgood = (np.nonzero(weights[p, :] > eps)[0]).astype(np.int64) + if whgood.size < 2: + continue + + wn = np.zeros((whgood.size, 1), dtype=np.float64) + bndnorm = np.zeros((whgood.size, 3), dtype=np.float64) + pflt = np.zeros((whgood.size, 3), dtype=np.float64) + for j in range(whgood.size): + whg = np.uint64(whgood[j]) + wn[j, 0] = weights[p, whg] + pflt[j,:] = np.asarray(polescart[p, whg, :], dtype=np.float64) + bndnorm[j,:] = np.asarray(bandnorms[p, whg, :], dtype=np.float64) + wn /= np.sum(wn) + + + npoles = pflt.shape[0] + + # wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood) #(weights[whGood]).reshape(nGood,1) + + I = np.zeros((3, 3), dtype=np.float64) + I[0, 0] = 1.0; + I[1, 1] = 1.0; + I[2, 2] = 1.0 + q = np.zeros((4), dtype=np.float64) + + B = (wn * bndnorm).T @ pflt + S = B + B.T + z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis=0), dtype=np.float64) + S2 = S @ S + det = np.linalg.det(S) + k = (S[1, 1] * S[2, 2] - S[1, 2] * S[2, 1]) + (S[0, 0] * S[2, 2] - S[0, 2] * S[2, 0]) + ( + S[0, 0] * S[1, 1] - S[1, 0] * S[0, 1]) + sig = 0.5 * (S[0, 0] + S[1, 1] + S[2, 2]) + sig2 = sig * sig + d = z.T @ S2 @ z + c = det + (z.T @ S @ z) + b = sig2 + (z.T @ z) + a = sig2 - k + + lam = 1.0 + tol = 1.0e-12 + iter = 0 + dlam = 1e6 + # for i in range(10): + while (dlam > tol) and (iter < 10): + lam0 = lam + lam = lam - (lam ** 4 - (a + b) * lam ** 2 - c * lam + (a * b + c * sig - d)) / ( + 4 * lam ** 3 - 2 * (a + b) * lam - c) + dlam = np.fabs(lam0 - lam) + iter += 1 + + beta = lam - sig + alpha = lam ** 2 - sig2 + k + gamma = (lam + sig) * alpha - det + X = np.asarray((alpha * I + beta * S + S2), dtype=np.float64) @ z + qn = np.float64(0.0) + qn += gamma ** 2 + X[0] ** 2 + X[1] ** 2 + X[2] ** 2 + qn = np.sqrt(qn) + q[0] = gamma + q[1:4] = X[0:3] + q /= qn + if (np.sign(gamma) < 0): + q *= -1.0 + + qout[p, :] = q + fitout[p] = lam + # polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1) + # pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64) + return qout, fitout @staticmethod @numba.jit(nopython=True, cache=True,fastmath=True,parallel=False) - def _tripvote_numba(bandangs, LUT, angTol, tripAngles, tripID, nfam, n_bands): + def _tripvote_numba(bandnorms, band_intensity, LUT, angTol, tripAngles, tripID, nfam): + timing1 = 0.0 + timing2 = 0.0 + npats = bandnorms.shape[0] + n_bands = bandnorms.shape[1] LUTTemp = np.asarray(LUT).copy() - accumulator = np.zeros((nfam, n_bands), dtype=np.float32) - accumulatorW = np.zeros((nfam, n_bands), dtype=np.float32) + tshape = np.shape(tripAngles) ntrip = int(tshape[0]) + + accumulator = np.zeros((npats, nfam, n_bands), dtype=np.float32) + accumulatorW = np.zeros((npats, nfam, n_bands), dtype=np.float32) + mxvote = np.zeros((npats,n_bands), dtype=np.int32) + tvotes = np.zeros((npats,n_bands), dtype=np.int32) + band_cm = np.zeros((npats,n_bands), dtype=np.float32) + bandRank = np.zeros((npats,n_bands), dtype=np.float32) + bandFam = np.zeros((npats,n_bands), dtype=np.int32) + #count = 0.0 #angTest2 = np.zeros(ntrip, dtype=numba.boolean) #angTest2 = np.empty(ntrip,dtype=numba.boolean) - angTest0 = np.zeros((3), dtype=np.float32) - for i in range(n_bands): - for j in range(i + 1,n_bands): - for k in range(j + 1,n_bands): - angtri = np.array([bandangs[i,j],bandangs[i,k],bandangs[j,k]], dtype=np.float32) - srt = angtri.argsort(kind='quicksort') #np.array(np.argsort(angtri), dtype=numba.int64) - srt2 = np.asarray(LUTTemp[:,srt[0],srt[1],srt[2]], dtype=np.int64).copy() - unsrtFID = np.argsort(srt2,kind='quicksort').astype(np.int64) - angtriSRT = np.asarray(angtri[srt]) - - #angTest0 = (np.abs(tripAngles - angtriSRT)).astype(np.float32) - #print(angTest0.shape) - #angTest = (angTest0 <= angTol)#.astype(np.int) - - for q in range(ntrip): - #print('____') - #print(tripAngles[q,:], angtriSRT) - - test1 = np.abs(tripAngles[q,0] - angtriSRT[0]) - if test1 > angTol: - continue - else: - angTest0[0] = test1 - - test2 = np.abs(tripAngles[q, 1] - angtriSRT[1]) - if test2 > angTol: - continue - else: - angTest0[1] = test2 - - test3 = np.abs(tripAngles[q, 2] - angtriSRT[2]) - if test3 > angTol: - continue - else: - angTest0[2] = test3 - - #print('here') - #angTest2 = (angTest[q,0] + angTest[q,1] + angTest[q,2]) == 3 - #if angTest2: - f = tripID[q,:] - f = f[unsrtFID] - #print(angTest0[q,:]) - w1 = ( angTol - 0.5*(angTest0[0] + angTest0[1]) ) - w2 = ( angTol - 0.5*(angTest0[0] + angTest0[2]) ) - w3 = ( angTol - 0.5*(angTest0[1] + angTest0[2]) ) - #print(w1, w2, w3) - accumulatorW[f[0],i] += w1 - accumulatorW[f[1],j] += w2 - accumulatorW[f[2],k] += w3 - accumulator[f[0], i] += 1 - accumulator[f[1], j] += 1 - accumulator[f[2], k] += 1 - t1 = False - t2 = False - t3 = False - if np.abs(angtriSRT[0] - angtriSRT[1]) < angTol: - accumulatorW[f[0],j] += w1 - accumulatorW[f[1],i] += w2 - accumulatorW[f[2],k] += w3 - accumulator[f[0], j] += 1 - accumulator[f[1], i] += 1 - accumulator[f[2], k] += 1 - t1 = True - if np.abs(angtriSRT[1] - angtriSRT[2]) < angTol: - accumulatorW[f[0],i] += w1 - accumulatorW[f[1],k] += w2 - accumulatorW[f[2],j] += w3 - accumulator[f[0], i] += 1 - accumulator[f[1], k] += 1 - accumulator[f[2], j] += 1 - t2 = True - if np.abs(angtriSRT[2] - angtriSRT[0]) < angTol: - accumulatorW[f[0],k] += w1 - accumulatorW[f[1],j] += w2 - accumulatorW[f[2],i] += w3 - accumulator[f[0], k] += 1 - accumulator[f[1], j] += 1 - accumulator[f[2], i] += 1 - t3 = True - if (t1 and t2 and t3): - accumulatorW[f[0],k] += w1 - accumulatorW[f[1],i] += w2 - accumulatorW[f[2],j] += w3 - - accumulatorW[f[0], j] += w1 - accumulatorW[f[1], k] += w2 - accumulatorW[f[2], i] += w3 - - accumulator[f[0], k] += 1 - accumulator[f[1], i] += 1 - accumulator[f[2], j] += 1 - - accumulator[f[0], j] += 1 - accumulator[f[1], k] += 1 - accumulator[f[2], i] += 1 - - - mxvote = np.zeros(n_bands, dtype=np.int32) - tvotes = np.zeros(n_bands, dtype=np.int32) - band_cm = np.zeros(n_bands, dtype=np.float32) - for q in range(n_bands): - mxvote[q] = np.amax(accumulatorW[:,q]) - tvotes[q] = np.sum(accumulatorW[:,q]) - - for i in range(n_bands): - if tvotes[i] < 1: - band_cm[i] = 0.0 - else: - srt = np.argsort(accumulatorW[:,i]) - band_cm[i] = (accumulatorW[srt[-1],i] - accumulatorW[srt[-2],i]) / tvotes[i] - - bandRank = np.zeros(n_bands, dtype=np.float32) - bandFam = np.zeros(n_bands, dtype=np.int32) - for q in range(n_bands): - bandFam[q] = np.argmax(accumulatorW[:,q]) - bandRank = (n_bands - np.arange(n_bands)) / n_bands * band_cm * mxvote + for p in range(npats): + bandangs = np.abs(bandnorms[p,...].dot(bandnorms[p,...].T)) + bandangs = np.clip(bandangs, -1.0, 1.0) + bandangs = np.arccos(bandangs) * RADEG + for i in range(n_bands): + if band_intensity[p,i] < 1e-6: # invalid band + bandangs[i,:] = 10000.0 + bandangs[:, i] = 10000.0 + angTest0 = np.zeros((3), dtype=np.float32) + for i in range(n_bands): + for j in range(i + 1,n_bands): + for k in range(j + 1,n_bands): + # tic = ntime() + angtri = np.array([bandangs[i,j],bandangs[i,k],bandangs[j,k]], dtype=np.float32) + #srt = np.array(np.argsort(angtri), dtype=numba.int64) + # I am doing the above, but is MUCH faster for just the three numbers to hard code + srt = np.array([0,1,2], dtype=np.uint64) + if angtri[srt[0]] > angtri[srt[2]]: + srt[2], srt[0] = srt[0], srt[2] + if angtri[srt[0]] > angtri[srt[1]]: + srt[1], srt[0] = srt[0], srt[1] + if angtri[srt[1]] > angtri[srt[2]]: + srt[2], srt[1] = srt[1], srt[2] + ##### end hard code argsrt ###### + + srt2 = np.asarray(LUTTemp[:,srt[0],srt[1],srt[2]], dtype=np.int64).copy() + #unsrtFID = np.argsort(srt2,kind='quicksort').astype(np.int64) + #again, hard coding in the above for speed. + unsrtFID = np.array([0,1,2], dtype=np.uint64) + if srt2[unsrtFID[0]] > srt2[unsrtFID[2]]: + unsrtFID[2], unsrtFID[0] = unsrtFID[0], unsrtFID[2] + if srt2[unsrtFID[0]] > srt2[unsrtFID[1]]: + unsrtFID[1], unsrtFID[0] = unsrtFID[0], unsrtFID[1] + if srt2[unsrtFID[1]] > srt2[unsrtFID[2]]: + unsrtFID[2], unsrtFID[1] = unsrtFID[1], unsrtFID[2] + ##### end hard code argsrt ###### + angtriSRT = np.asarray(angtri[srt], dtype = np.float32) + + #angTest0 = (np.abs(tripAngles - angtriSRT)).astype(np.float32) + #print(angTest0.shape) + #angTest = (angTest0 <= angTol)#.astype(np.int) + # toc = ntime() + # timing1 += toc - tic + # toc = ntime() + for q in range(ntrip): + #print('____') + #print(tripAngles[q,:], angtriSRT) + + test1 = np.abs(tripAngles[q,0] - angtriSRT[0]) + if test1 > angTol: + continue + else: + angTest0[0] = test1 + + test2 = np.abs(tripAngles[q, 1] - angtriSRT[1]) + if test2 > angTol: + continue + else: + angTest0[1] = test2 + + test3 = np.abs(tripAngles[q, 2] - angtriSRT[2]) + if test3 > angTol: + continue + else: + angTest0[2] = test3 + + #print('here') + #angTest2 = (angTest[q,0] + angTest[q,1] + angTest[q,2]) == 3 + #if angTest2: + f = tripID[q,:] + f = f[unsrtFID] + #print(angTest0[q,:]) + w1 = ( angTol - 0.5*(angTest0[0] + angTest0[1]) ) + w2 = ( angTol - 0.5*(angTest0[0] + angTest0[2]) ) + w3 = ( angTol - 0.5*(angTest0[1] + angTest0[2]) ) + #print(w1, w2, w3) + accumulatorW[p,f[0],i] += w1 + accumulatorW[p,f[1],j] += w2 + accumulatorW[p,f[2],k] += w3 + accumulator[p,f[0], i] += 1 + accumulator[p,f[1], j] += 1 + accumulator[p,f[2], k] += 1 + t1 = False + t2 = False + t3 = False + if np.abs(angtriSRT[0] - angtriSRT[1]) < angTol: + accumulatorW[p,f[0],j] += w1 + accumulatorW[p,f[1],i] += w2 + accumulatorW[p,f[2],k] += w3 + accumulator[p,f[0], j] += 1 + accumulator[p,f[1], i] += 1 + accumulator[p,f[2], k] += 1 + t1 = True + if np.abs(angtriSRT[1] - angtriSRT[2]) < angTol: + accumulatorW[p,f[0],i] += w1 + accumulatorW[p,f[1],k] += w2 + accumulatorW[p,f[2],j] += w3 + accumulator[p,f[0], i] += 1 + accumulator[p,f[1], k] += 1 + accumulator[p,f[2], j] += 1 + t2 = True + if np.abs(angtriSRT[2] - angtriSRT[0]) < angTol: + accumulatorW[p,f[0],k] += w1 + accumulatorW[p,f[1],j] += w2 + accumulatorW[p,f[2],i] += w3 + accumulator[p,f[0], k] += 1 + accumulator[p,f[1], j] += 1 + accumulator[p,f[2], i] += 1 + t3 = True + if (t1 and t2 and t3): + accumulatorW[p,f[0],k] += w1 + accumulatorW[p,f[1],i] += w2 + accumulatorW[p,f[2],j] += w3 + + accumulatorW[p,f[0], j] += w1 + accumulatorW[p,f[1], k] += w2 + accumulatorW[p,f[2], i] += w3 + + accumulator[p,f[0], k] += 1 + accumulator[p,f[1], i] += 1 + accumulator[p,f[2], j] += 1 + + accumulator[p,f[0], j] += 1 + accumulator[p,f[1], k] += 1 + accumulator[p,f[2], i] += 1 + + # timing2 += ntime() - toc + + for q in range(n_bands): + mxvote[p,q] = np.amax(accumulatorW[p,:,q]) + tvotes[p,q] = np.sum(accumulatorW[p,:,q]) + #for i in range(n_bands): + if tvotes[p,q] < 1: + band_cm[p,q] = 0.0 + else: + srt = np.argsort(accumulatorW[p,:,q]) + band_cm[p,q] = (accumulatorW[p,srt[-1],q] - accumulatorW[p,srt[-2],q]) / (tvotes[p,q]) + #for q in range(n_bands): + bandFam[p,q] = np.argmax(accumulatorW[p,:,q]) + bandRank[p,:] = (n_bands - np.arange(n_bands)) / n_bands * band_cm[p,:] * mxvote[p,:] + # print(timing1, timing2) return accumulatorW, bandFam, bandRank, band_cm, accumulator @staticmethod @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False) - def _pairvote_numba(bandangs, angTol, pairAngs, pairID, nfam, n_bands): + def _pairvote_numba(bandnorms,band_intensity, angTol, pairAngs, pairID, nfam): + + npats = bandnorms.shape[0] + n_bands = bandnorms.shape[1] + + accumulator = np.zeros((npats, nfam, n_bands), dtype=np.float32) + accumulatorW = np.zeros((npats, nfam, n_bands), dtype=np.float32) + mxvote = np.zeros((npats, n_bands), dtype=np.int32) + tvotes = np.zeros((npats, n_bands), dtype=np.int32) + band_cm = np.zeros((npats, n_bands), dtype=np.float32) + bandRank = np.zeros((npats, n_bands), dtype=np.float32) + bandFam = np.zeros((npats, n_bands), dtype=np.int32) - accumulator = np.zeros((nfam, n_bands), dtype=np.float32) pairshape = np.shape(pairAngs) npair = int(pairshape[0]) - count = 0.0 + #count = 0.0 # angTest2 = np.zeros(ntrip, dtype=numba.boolean) # angTest2 = np.empty(ntrip,dtype=numba.boolean) - for i in range(n_bands): - for j in range(i + 1, n_bands): - bandangpair = bandangs[i, j] - angTest = (np.abs(pairAngs - bandangpair)).astype(np.float32) - # print(angTest0.shape) - - - for q in range(npair): - if angTest[q] <= angTol: - w1 = (angTol - angTest[q]) - - # print(w1, w2, w3) - accumulator[pairID[q,0], i] += w1 - accumulator[pairID[q,1], i] += w1 - accumulator[pairID[q,0], j] += w1 - accumulator[pairID[q,1], j] += w1 - - - mxvote = np.zeros(n_bands, dtype=np.int32) - tvotes = np.zeros(n_bands, dtype=np.int32) - band_cm = np.zeros(n_bands, dtype=np.float32) - for q in range(n_bands): - mxvote[q] = np.amax(accumulator[:, q]) - tvotes[q] = np.sum(accumulator[:, q]) - - for i in range(n_bands): - if tvotes[i] < 1: - band_cm[i] = 0.0 - else: - srt = np.argsort(accumulator[:, i]) - band_cm[i] = (accumulator[srt[-1], i] - accumulator[srt[-2], i]) / tvotes[i] + for p in range(npats): + bandangs = np.abs(bandnorms[p, ...].dot(bandnorms[p, ...].T)) + bandangs = np.clip(bandangs, -1.0, 1.0) + bandangs = np.arccos(bandangs) * RADEG + for i in range(n_bands): + if band_intensity[p,i] < 1e-6: # invalid band + bandangs[i,:] = 10000.0 + bandangs[:, i] = 10000.0 + for i in range(n_bands): + for j in range(i + 1, n_bands): + bandangpair = bandangs[i, j] + angTest = (np.abs(pairAngs - bandangpair)).astype(np.float32) + # print(angTest0.shape) + for q in range(npair): + if angTest[q] <= angTol: + w1 = (angTol - angTest[q]) + + # print(w1, w2, w3) + accumulator[p, pairID[q, 0], i] += 1 + accumulator[p, pairID[q, 1], i] += 1 + accumulator[p, pairID[q, 0], j] += 1 + accumulator[p, pairID[q, 1], j] += 1 + + accumulatorW[p, pairID[q,0], i] += w1 + accumulatorW[p, pairID[q,1], i] += w1 + accumulatorW[p, pairID[q,0], j] += w1 + accumulatorW[p, pairID[q,1], j] += w1 + + for q in range(n_bands): + mxvote[p, q] = np.amax(accumulatorW[p, :, q]) + tvotes[p, q] = np.sum(accumulatorW[p, :, q]) + # for i in range(n_bands): + if tvotes[p, q] < 1: + band_cm[p, q] = 0.0 + else: + srt = np.argsort(accumulatorW[p, :, q]) + band_cm[p, q] = (accumulatorW[p, srt[-1], q] - accumulatorW[p, srt[-2], q]) / (tvotes[p, q]) - bandRank = np.zeros(n_bands, dtype=np.float32) - bandFam = np.zeros(n_bands, dtype=np.int32) - for q in range(n_bands): - bandFam[q] = np.argmax(accumulator[:, q]) - bandRank = (n_bands - np.arange(n_bands)) / n_bands * band_cm * mxvote + bandFam[p, q] = np.argmax(accumulatorW[p, :, q]) + bandRank[p, :] = (n_bands - np.arange(n_bands)) / n_bands * band_cm[p, :] * mxvote[p, :] - return accumulator, bandFam, bandRank, band_cm + return accumulatorW, bandFam, bandRank, band_cm, accumulator @staticmethod @numba.jit(nopython=True, cache=True, fastmath=True,parallel=False) - def _assign_bands_nb(polesCart, bandRank_arg, familyLabel, famIndx, nFam, angTable, bandnorms, angTol, n_band_early): + def _assign_bands_nb(libPolesCart, libAngTable, libFamIndx, nFam, angTol, n_band_early, bandnorms, bandRank_arg, bandFam ): + eps = np.float32(1.0e-12) - nBnds = bandnorms.shape[0] - - whGood_out = np.zeros(nBnds, dtype=np.int64)-1 - - - nMatch = np.int64(-1) - Rout = np.zeros((1,3,3), dtype=np.float32) - #Rout[0,0,0] = 1.0; Rout[0,1,1] = 1.0; Rout[0,2,2] = 1.0 - polematch_out = np.zeros((nBnds),dtype=np.int64) - 1 - pflt = np.asarray(polesCart, dtype=np.float32) - bndnorm = np.transpose(np.asarray(bandnorms, dtype=np.float32)) - - fit = np.float32(360.0) - fitout = np.float32(360.0) - fitbout = np.zeros((nBnds)) - R = np.zeros((1, 3, 3), dtype=np.float32) - - #fit = np.float32(360.0) - #whGood = np.zeros(nBnds, dtype=np.int64) - 1 - nMatch = np.int64(0) - - ij = (-1,-1,-1,-1) - - for ii in range(nBnds-1): - for jj in range(ii+1,nBnds): - #print(ii,jj) - polematch = np.zeros((nBnds),dtype=np.int64) - 1 - - bnd1 = bandRank_arg[-1 - ii] - bnd2 = bandRank_arg[-1 - jj] - - v1 = bandnorms[bnd1,:] - f1 = familyLabel[bnd1] - v2 = bandnorms[bnd2,:] - f2 = familyLabel[bnd2] - ang01 = (np.dot(v1,v2)) - #if ang01 < 0: - # v2 *= -1 - # ang01 *= -1 - - if ang01 > np.float32(1.0): - ang01 = np.float32(1.0-eps) - if ang01 < np.float32(-1.0): - ang01 = np.float32(-1.0+eps) - - paralleltest = np.arccos(np.fabs(ang01)) * RADEG - - if paralleltest < angTol: # the two poles are parallel, send in another two poles if available. - continue - ang01 = np.arccos(ang01) * RADEG - wh12 = np.nonzero(np.abs(angTable[famIndx[f1],famIndx[f2]:np.int64(famIndx[f2] + nFam[f2])] - ang01) < angTol)[0] - - n12 = wh12.size - if n12 == 0: - continue - - wh12 += famIndx[f2] - p1 = polesCart[famIndx[f1], :] - - n12 = wh12.size - v1v2c = np.cross(v1,v2) - v1v2c /= np.linalg.norm(v1v2c) - # attempt to see which solution gives the best match to all the poles - # best is measured as the number of poles that are within tolerance, - # divided by the angular deviation. - # Use the TRIAD method for finding the rotation matrix - - Rtry = np.zeros((n12,3,3), dtype = np.float32) - - #score = np.zeros((n01), dtype = np.float32) - A = np.zeros((3,3), dtype = np.float32) - B = np.zeros((3,3), dtype = np.float32) - #AB = np.zeros((3,3),dtype=np.float32) - b2 = np.cross(v1,v1v2c) - B[0,:] = v1 - B[1,:] = v1v2c - B[2,:] = b2 - A[:,0] = p1 - score = -1.0 - - for i in range(n12): - p2 = polesCart[wh12[i], :] - ntemp = np.linalg.norm(p2) + 1.0e-35 - p2 = p2 / ntemp - p1p2c = np.cross(p1,p2) - ntemp = np.linalg.norm(p1p2c) + 1.0e-35 - p1p2c = p1p2c / ntemp - A[:,1] = p1p2c - A[:,2] = np.cross(p1,p1p2c) - AB = (A.dot(B)) - Rtry[i,:,:] = AB - - testp = (AB.dot(bndnorm)) - test = (pflt.dot(testp)) - #print(test.shape) - angfitTry = np.zeros((nBnds), dtype = np.float32) - #angfitTry = np.max(test,axis=0) - #print(test.shape) - for j in range(nBnds): - angfitTry[j] = np.max(test[:,j]) - angfitTry[j] = -1.0 if angfitTry[j] < -1.0 else angfitTry[j] - angfitTry[j] = 1.0 if angfitTry[j] > 1.0 else angfitTry[j] - - #angfitTry = np.clip(np.amax(test,axis=0),-1.0,1.0) - - angfitTry = np.arccos(angfitTry) * RADEG - whMatch = np.nonzero(angfitTry < angTol)[0] - nmatch = whMatch.size - #scoreTry = np.float32(nmatch) * np.mean(np.abs(angTol - angfitTry[whMatch])) - scoreTry = np.float32(nmatch) /( np.mean(angfitTry[whMatch]) + 1e-6) - if scoreTry > score: - score = scoreTry - angFit = angfitTry + pflt = np.asarray(libPolesCart, dtype=np.float32) + + + npats = bandnorms.shape[0] + nBnds = bandnorms.shape[1] + + whGood_out = np.zeros((npats, nBnds), dtype=np.int64)-1 + Rout = np.zeros((npats,3,3), dtype=np.float32) + Rout[:,0,0] = 1.0 ; Rout[:,1,1] = 1.0 ; Rout[:,2,2] = 1.0 ; + polematch_out = np.full((npats, nBnds),-1, dtype=np.int64) - 1 + + fitout = np.full(npats, 360.0, dtype=np.float32) + fitbout = np.full((npats, nBnds),360.0, dtype=np.float32) + nMatch = np.zeros(npats, dtype=np.int64) #np.int64(0) + ij = np.full((npats, 4), -1, np.int64) + + for p in range(npats): + bndnorm = np.transpose(np.asarray(bandnorms[p,...], dtype=np.float32)) + R = np.zeros((1, 3, 3), dtype=np.float32) + fit = np.float32(360.0) + #fit = np.float32(360.0) + #whGood = np.zeros(nBnds, dtype=np.int64) - 1 + + for ii in range(nBnds-1): + for jj in range(ii+1,nBnds): + #print(ii,jj) + polematch = np.zeros((nBnds),dtype=np.int64) - 1 + + bnd1 = bandRank_arg[p, -1 - ii] + bnd2 = bandRank_arg[p, -1 - jj] + + v1 = bandnorms[p, bnd1,:] + f1 = bandFam[p, bnd1] + v2 = bandnorms[p, bnd2,:] + f2 = bandFam[p, bnd2] + ang01 = (np.dot(v1,v2)) + #if ang01 < 0: + # v2 *= -1 + # ang01 *= -1 + + if ang01 > np.float32(1.0): + ang01 = np.float32(1.0-eps) + if ang01 < np.float32(-1.0): + ang01 = np.float32(-1.0+eps) + + paralleltest = np.arccos(np.fabs(ang01)) * RADEG + + if paralleltest < angTol: # the two poles are parallel, send in another two poles if available. + continue + ang01 = np.arccos(ang01) * RADEG + wh12 = np.nonzero(np.abs(libAngTable[libFamIndx[f1],libFamIndx[f2]:np.int64(libFamIndx[f2] + nFam[f2])] - ang01) < angTol)[0] + + n12 = wh12.size + if n12 == 0: + continue + + wh12 += libFamIndx[f2] + p1 = pflt[libFamIndx[f1], :] + + n12 = wh12.size + v1v2c = np.cross(v1,v2) + v1v2c /= np.linalg.norm(v1v2c) + # attempt to see which solution gives the best match to all the poles + # best is measured as the number of poles that are within tolerance, + # divided by the angular deviation. + # Use the TRIAD method for finding the rotation matrix + + Rtry = np.zeros((n12,3,3), dtype = np.float32) + + #score = np.zeros((n01), dtype = np.float32) + A = np.zeros((3,3), dtype = np.float32) + B = np.zeros((3,3), dtype = np.float32) + #AB = np.zeros((3,3),dtype=np.float32) + b2 = np.cross(v1,v1v2c) + B[0,:] = v1 + B[1,:] = v1v2c + B[2,:] = b2 + A[:,0] = p1 + score = -1.0 + + for i in range(n12): + p2 = pflt[wh12[i], :] + ntemp = np.linalg.norm(p2) + 1.0e-35 + p2 = p2 / ntemp + p1p2c = np.cross(p1,p2) + ntemp = np.linalg.norm(p1p2c) + 1.0e-35 + p1p2c = p1p2c / ntemp + A[:,1] = p1p2c + A[:,2] = np.cross(p1,p1p2c) + AB = (A.dot(B)) + Rtry[i,:,:] = AB + + testp = (AB.dot(bndnorm)) + test = (pflt.dot(testp)) + #print(test.shape) + angfitTry = np.zeros((nBnds), dtype = np.float32) + #angfitTry = np.max(test,axis=0) + #print(test.shape) for j in range(nBnds): - polematch[j] = np.argmax(test[:,j]) * ( 2*np.int32(angfitTry[j] < angTol)-1) - R[0, :,:] = Rtry[i,:,:] - - - whGood = (np.nonzero(angFit < angTol)[0]).astype(np.int64) - nGood = max(np.int64(whGood.size), np.int64(0)) - - if nGood < 3: - continue - #return 360.0,-1,-1,-1 - #whGood = -1*np.ones((1), dtype=np.int64) - #fit = np.float32(360.0) - #polematch[:] = -1 - #nGood = np.int64(-1) - else: - fitb = angFit - #fit = np.mean(fitb[whGood]) - fit = np.float32(0.0) - for q in range(nGood): - fit += np.float32(fitb[whGood[q]]) - fit /= np.float32(nGood) - - - if nGood >= (n_band_early): - testout = testp - - fitout = fit - fitbout = fitb - nMatch = nGood - whGood_out = whGood[:] - polematch_out = polematch[:] - Rout[0,:,:] = R[0,:,:] - ij = (ii,jj,bnd1,bnd2) - break - else: - if nMatch < nGood: - #print((nMatch*(3.0-fitout)) , (nGood*(3.0-fit))) - #if (nMatch*(2.0-fitout)) < (nGood*(2.0-fit)): + angfitTry[j] = np.max(test[:,j]) + angfitTry[j] = -1.0 if angfitTry[j] < -1.0 else angfitTry[j] + angfitTry[j] = 1.0 if angfitTry[j] > 1.0 else angfitTry[j] + + #angfitTry = np.clip(np.amax(test,axis=0),-1.0,1.0) + + angfitTry = np.arccos(angfitTry) * RADEG + whMatch = np.nonzero(angfitTry < angTol)[0] + nmatch = whMatch.size + #scoreTry = np.float32(nmatch) * np.mean(np.abs(angTol - angfitTry[whMatch])) + scoreTry = np.float32(nmatch) /( np.mean(angfitTry[whMatch]) + 1e-6) + if scoreTry > score: + score = scoreTry + angFit = angfitTry + for j in range(nBnds): + polematch[j] = np.argmax(test[:,j]) * ( 2*np.int32(angfitTry[j] < angTol)-1) + R[0, :,:] = Rtry[i,:,:] + + + whGood = (np.nonzero(angFit < angTol)[0]).astype(np.int64) + nGood = max(np.int64(whGood.size), np.int64(0)) + + if nGood < 3: + continue + #return 360.0,-1,-1,-1 + #whGood = -1*np.ones((1), dtype=np.int64) + #fit = np.float32(360.0) + #polematch[:] = -1 + #nGood = np.int64(-1) + else: + fitb = angFit + #fit = np.mean(fitb[whGood]) + fit = np.float32(0.0) + for q in range(nGood): + fit += np.float32(fitb[whGood[q]]) + fit /= np.float32(nGood) + + + if nGood >= (n_band_early): testout = testp - fitout = np.float32(fit) - fitbout = fitb - nMatch = nGood - whGood_out = whGood[:] - polematch_out = polematch[:] - Rout[0,:,:] = R[0,:,:] - - ij = (ii,jj,bnd1,bnd2) - - elif nMatch == nGood: - #elif (nMatch*(2.0-fitout)) == (nGood*(2.0-fit)): - if fitout > fit: + fitout[p] = np.float32(fit) + fitbout[p,...] = fitb + nMatch[p] = nGood + whGood_out[p,0:nGood] = whGood[:] + polematch_out[p,...] = polematch[:] + Rout[p,:,:] = R[0,:,:] + ij[p,:] = np.asarray((ii,jj,bnd1,bnd2), dtype=np.int64) + break + else: + if nMatch[p] < nGood: + #print((nMatch*(3.0-fitout)) , (nGood*(3.0-fit))) + #if (nMatch*(2.0-fitout)) < (nGood*(2.0-fit)): testout = testp - fitout = np.float32(fit) - fitbout = fitb - nMatch = nGood - whGood_out = whGood[:] - polematch_out = polematch[:] - Rout[0,:,:] = R[0,:,:] - - ij = (ii, jj, bnd1,bnd2) - - #print('----') - #print(ij) - - #print(testout.T) - #print(pflt[polematch_out, :]) - if nMatch >= (n_band_early): - break + fitout[p] = np.float32(fit) + fitbout[p, ...] = fitb + nMatch[p] = nGood + whGood_out[p, 0:nGood] = whGood[:] + polematch_out[p, ...] = polematch[:] + Rout[p, :, :] = R[0, :, :] + ij[p, :] = np.asarray((ii, jj, bnd1, bnd2), dtype=np.int64) + + elif nMatch[p] == nGood: + #elif (nMatch*(2.0-fitout)) == (nGood*(2.0-fit)): + if fitout[p] > fit: + testout = testp + fitout[p] = np.float32(fit) + fitbout[p, ...] = fitb + nMatch[p] = nGood + whGood_out[p, 0:nGood] = whGood[:] + polematch_out[p, ...] = polematch[:] + Rout[p, :, :] = R[0, :, :] + ij[p, :] = np.asarray((ii, jj, bnd1, bnd2), dtype=np.int64) + + #print('----') + #print(ij) + + #print(testout.T) + #print(pflt[polematch_out, :]) + if nMatch[p] >= (n_band_early): + break #print(testout.T) #print(pflt[polematch_out,:]) @@ -1606,3 +1749,84 @@ def _fitcheck(self, q, bandnorms, cartxstalpoles): # mean_ang = np.degrees(np.arccos(mean_dot)) return mean_dot + @staticmethod + @numba.jit(nopython=True, cache=True, fastmath=True, parallel=False) + def _calc_cm(accumulator, polematch, libFamIndx): + + npats = accumulator.shape[0] + cm2 = -1 * np.ones(npats, dtype=np.float32) + + for p in range(npats): + + whmatch = (np.nonzero(polematch[p, :] >= 0)[0]).astype(np.int64) + if whmatch.size < 2: + continue + # cm = np.mean(band_cm[whmatch]) + + cm2[p] = 0.0 + for whm in whmatch: + whfam = np.int64(libFamIndx[polematch[p, whm]]) + cm2[p] += np.float32(accumulator[p, whfam, whm]) + cm2[p] /= np.sum(accumulator[p, ...].clip(1)) + return cm2 + +@numba.jit(nopython=True, cache=True, fastmath=True, parallel=False) +def _orientation_quest_nb(polescart, bandnorms, weights): + # this uses the Quaternion Estimator AKA quest algorithm. + + pflt = np.asarray(polescart, dtype=np.float64) + bndnorm = np.asarray(bandnorms, dtype=np.float64) + npoles = pflt.shape[0] + wn = (np.asarray(weights, dtype=np.float64)).reshape(npoles, 1) + + # wn = np.ones((nGood,1), dtype=np.float32)/np.float32(nGood) #(weights[whGood]).reshape(nGood,1) + wn /= np.sum(wn) + + I = np.zeros((3, 3), dtype=np.float64) + I[0, 0] = 1.0; + I[1, 1] = 1.0; + I[2, 2] = 1.0 + q = np.zeros((4), dtype=np.float64) + + B = (wn * bndnorm).T @ pflt + S = B + B.T + z = np.asarray(np.sum(wn * np.cross(bndnorm, pflt), axis=0), dtype=np.float64) + S2 = S @ S + det = np.linalg.det(S) + k = (S[1, 1] * S[2, 2] - S[1, 2] * S[2, 1]) + (S[0, 0] * S[2, 2] - S[0, 2] * S[2, 0]) + ( + S[0, 0] * S[1, 1] - S[1, 0] * S[0, 1]) + sig = 0.5 * (S[0, 0] + S[1, 1] + S[2, 2]) + sig2 = sig * sig + d = z.T @ S2 @ z + c = det + (z.T @ S @ z) + b = sig2 + (z.T @ z) + a = sig2 - k + + lam = 1.0 + tol = 1.0e-12 + iter = 0 + dlam = 1e6 + # for i in range(10): + while (dlam > tol) and (iter < 10): + lam0 = lam + lam = lam - (lam ** 4 - (a + b) * lam ** 2 - c * lam + (a * b + c * sig - d)) / ( + 4 * lam ** 3 - 2 * (a + b) * lam - c) + dlam = np.fabs(lam0 - lam) + iter += 1 + + beta = lam - sig + alpha = lam ** 2 - sig2 + k + gamma = (lam + sig) * alpha - det + X = np.asarray((alpha * I + beta * S + S2), dtype=np.float64) @ z + qn = np.float64(0.0) + qn += gamma ** 2 + X[0] ** 2 + X[1] ** 2 + X[2] ** 2 + qn = np.sqrt(qn) + q[0] = gamma + q[1:4] = X[0:3] + q /= qn + if (np.sign(gamma) < 0): + q *= -1.0 + + # polesrot = rotlib.quat_vectorL1N(q, pflt, npoles, np.float64, p=1) + # pdot = np.sum(polesrot*bndnorm, axis = 1, dtype=np.float64) + return q, lam # , pdot \ No newline at end of file diff --git a/setup.py b/setup.py index 7e42fdc..7e34884 100644 --- a/setup.py +++ b/setup.py @@ -95,7 +95,7 @@ "h5py", "matplotlib", "numpy", - "numba", + "numba>=0.53", "scipy", ], # Files to include when distributing package (see also MANIFEST.in)