Redefine run function as bw_auto

inti-abbate · Sep 16, 2024 · 1d55576 · 1d55576
1 parent 005ffc0
commit 1d55576
Show file tree

Hide file tree

Showing 9 changed files with 267 additions and 281 deletions.
diff --git a/docs/examples/Verification.ipynb b/docs/examples/Verification.ipynb
diff --git a/docs/examples/resampled.mcpl.gz b/docs/examples/resampled.mcpl.gz
diff --git a/docs/examples/samples.mcpl.gz b/docs/examples/samples.mcpl.gz
diff --git a/docs/examples/samples_bws b/docs/examples/samples_bws
diff --git a/docs/examples/source.xml b/docs/examples/source.xml
diff --git a/python/kdsource/__init__.py b/python/kdsource/__init__.py
@@ -30,10 +30,10 @@
 """
 
 __version__ = "0.1.0"
-__author__ = "inti-abbate"
+__author__ = "KDSource"
 
 from .geom import Geometry, Metric
-from .kde import bw_silv
-from .kdsource import KDSource, load, run
+from .kde import bw_silv, bw_methods
+from .kdsource import KDSource, load
 from .plist import PList, appendssv, convert2mcpl, join2mcpl, savessv
 from .surfsource import SurfaceSourceFile, create_source_file
diff --git a/python/kdsource/kde.py b/python/kdsource/kde.py
@@ -183,7 +183,7 @@ def bw_mlcv(data, weights=None, n_splits=10, seed=None, grid=None, show=True):
         Rule.
     grid: array-like
         Grid of factors for computing bandwidth grid. Default:
-        numpy.logspace(-1, 1, 20)
+        numpy.logspace(-1, 1, 10)
     show: bool
         Whether to show the scores plot.
 
@@ -200,7 +200,9 @@ def bw_mlcv(data, weights=None, n_splits=10, seed=None, grid=None, show=True):
         )
         seed = bw_silv(np.shape(data)[1], N_eff)
     if grid is None:
-        grid = np.logspace(-1, 1, 20)
+        print("No grid specified. Using np.logspace(-0.1, 0.1, 10).")
+        print("If fitting fails, change the grid.")
+        grid = np.logspace(-0.1, 0.1, 10)
     bw_grid = np.reshape(grid, (-1, *np.ones(np.ndim(seed), dtype=int))) * seed
     if n_splits > len(data):
         n_splits = len(data)
@@ -225,6 +227,65 @@ def bw_mlcv(data, weights=None, n_splits=10, seed=None, grid=None, show=True):
     return bw_mlcv
 
 
+def bw_auto(data, weights, grid=None, Nmlcv=1E4, Nbatch=1E4, Nknn=10, show=True,
+    n_splits=10):
+
+    """
+    Automatic bandwidth optimization with kNN and MLCV methods combined.
+
+    Parameters
+    ----------
+    data: array-like
+        Array of samples. Must have shape (obs, dim).
+    weights: array-like, optional
+        Array of sample weights. By default all weights are 1.
+    grid: array-like
+        Grid to be used for the MLCV bandwidth optimization. Default:
+        numpy.logspace(-1, 1, 10)
+    Nmlcv: int
+        Number of particles to use for the MLCV bandwidth optimization.
+    Nbatch: int
+        Number of particles per batch to use for the kNN bandwidth
+        optimization.
+    Nknn: int
+        Number of closest neighbors used for the kNN bandiwdth
+        optimization.
+    show: bool
+        Whether to show the scores plot of cross-validation.
+    n_splits: int, optional
+        Number of folds for cross-validation. Must be at least 2.
+    """
+
+    N, dim = data.shape
+
+    # First fit with kNN to generate a seed adaptative bandwidth
+    print('Fitting first with kNN method:')
+    if Nbatch == -1:
+        print("No batch size for kNN selected. Using all the particles list.")
+        Nbatch = N
+    BW_knn = bw_knn(data, weights=weights, k=Nknn, batch_size=Nbatch)
+    print('')
+
+    # Then fit with MLCV
+    print('Fitting now with MLCV using the previous fitting as seed:')
+    if Nmlcv == -1:
+        print("No size for MLCV selected. Using all the particles list.")
+        Nmlcv = N
+    seed = BW_knn[:Nmlcv]
+    print("If fitting takes too long, consider reducing Nmlcv.")
+    BW_mlcv = bw_mlcv(data[:Nmlcv], weights=weights, seed=seed, grid=grid,
+        show=show, n_splits=n_splits)
+    print('')
+
+    print('Extending the MLCV optimization to full kNN bandwidth:')
+    BW_knn_mlcv = BW_knn * BW_mlcv[0] / BW_knn[0]
+    BW_knn_mlcv *= bw_silv(dim, len(BW_knn)) / bw_silv(dim, len(BW_mlcv))
+    print('Done.')
+    return BW_knn_mlcv
+
+    return s
+
+
 def optimize_bw(
     bw_method, vecs, ws=None, weightfun=None, maskfun=None, **kwargs
 ):
@@ -275,9 +336,15 @@ def optimize_bw(
         return bw_knn(vecs, weights=ws, **kwargs)
     elif bw_method == "mlcv":  # Maximum Likelihood Cross-Validation method
         return bw_mlcv(vecs, weights=ws, **kwargs)
+    elif bw_method == "auto":  # kNN and MLCV combination
+        return bw_auto(vecs, weights=ws, **kwargs)
     else:
         keys = list(bw_methods.keys())
         raise Exception("Invalid bw_method. Available: {}".format(keys))
 
 
-bw_methods = {"silv": bw_silv, "knn": bw_knn, "mlcv": bw_mlcv}
+bw_methods = {
+    "auto": bw_auto,
+    "silv": bw_silv,
+    "knn": bw_knn,
+    "mlcv": bw_mlcv}
diff --git a/python/kdsource/kdsource.py b/python/kdsource/kdsource.py
@@ -15,7 +15,7 @@
 import numpy as np
 
 from .geom import Geometry
-from .kde import bw_silv
+from .kde import bw_silv, bw_methods
 from .kde import optimize_bw
 from .plist import PList
 
@@ -78,7 +78,7 @@ def load(xmlfilename, N=-1):
 
 
 class KDSource:
-    def __init__(self, plist, geom, bw="silv", J=1.0, kernel="gaussian"):
+    def __init__(self, plist, geom, bw="auto", J=1.0, kernel="gaussian"):
         """
         Object representing Kernel Density Estimation (KDE) sources.
 
@@ -130,7 +130,8 @@ def __init__(self, plist, geom, bw="silv", J=1.0, kernel="gaussian"):
         self.R = R[self.kernel[0]]
         self.fitted = False
 
-    def fit(self, N=-1, skip=0, scaling=None, **kwargs):
+    def fit(self, N=-1, skip=0, scaling=None, importance=None, bw_method=None,
+        **kwargs):
         """
         Fit KDE model to particle list.
 
@@ -151,8 +152,18 @@ def fit(self, N=-1, skip=0, scaling=None, **kwargs):
         scaling: array-like, optional
             Scaling to be applied to each variable. This means each
             particle variable will be divided by the corresponding
-            scaling element before applying KDE. By default, the
-            standard deviation of each variable is used.
+            scaling element before applying KDE. If set, overrides
+            importance parameter. By default, the standard deviation of
+            each variable is used.
+        importance: array-like, optional
+            Scaling to be applied to each variable. This means each
+            particle variable standard deviation will be divided by
+            the corresponding scaling element before applying KDE.
+            By default, the importance is set to 1, i.e.
+            the standard deviation of each variable is used.
+        bw_method: str
+            Bandwidth selection method. See bw_methods for available
+            methods. If None, self.bw_method will be used.
         **kwargs: optional
             Parameters to be passed to bandwidth selection method. Refer
             corresponding method for docs (see bw_methods for method
@@ -167,10 +178,23 @@ def fit(self, N=-1, skip=0, scaling=None, **kwargs):
         self.N_eff = np.sum(ws) ** 2 / np.sum(ws ** 2)
         if scaling is None:
             scaling = self.geom.std(vecs=vecs, weights=ws)
+            # If an importance is specified, apply to the scaling factors
+            if importance is None:
+                importance = [1] * self.geom.dim
+            elif len(importance) != self.geom.dim:
+                raise Exception(
+                    "importance must have len = {}.".format(self.geom.dim))
+            scaling /= importance
         else:
             scaling = np.array(scaling)
         scaling[scaling == 0] = STD_DEFAULT
         self.scaling = scaling
+        if bw_method is not None:
+            keys = list(bw_methods.keys())
+            if bw_method not in keys:
+                raise Exception(
+                    "Invalid bw_method. Available: {}".format(keys))
+            self.bw_method = bw_method
         if self.bw_method is not None:
             print("Calculating bw ... ")
             bw = optimize_bw(self.bw_method, vecs / self.scaling, ws, **kwargs)
@@ -881,105 +905,3 @@ def plot2D_integr(self, vrs, grids, vec0=None, vec1=None, **kwargs):
         plt.tight_layout()
 
         return [plt.gcf(), [scores, errs]]
-
-
-def run(plist, geom, kernel='gaussian', skip=0, importance=None,
-        N=-1, Nmlcv=-1, Nbatch=-1, Nknn=10, grid=None,
-        show=True):
-
-    """
-    Run KDSource to fit model to particle list with kNN and MLCV methods.
-
-    Parameters
-    ----------
-    plist: PList object
-        The PList wrapping the MCPL file containing the particle
-        list.
-    geom: Geometry object
-        The Geometry defining particle variables treatment.
-    kernel: string, optional
-        The function kernel to fit the variables. Available options
-        are 'gaussian', 'box', and 'epa'
-    skip: int
-        Number of particles to skip in the list before starting to
-        read.
-    importance: array-like, optional
-        Scaling to be applied to each variable. This means each
-        particle variable standard deviation will be divided by
-        the corresponding scaling element before applying KDE.
-        By default, the importance is set to 1, i.e.
-        the standard deviation of each variable is used.
-    N: int
-        Number of particles to use for fitting. The real number of
-        particles used may be lower if end of particle list is
-        reached or there are particles with zero weight. -1 to use
-        all particles.
-    Nmlcv: int
-        Number of particles to use for the MLCV bandwidth optimization.
-    Nbatch: int
-        Number of particles per batch to use for the kNN bandwidth
-        optimization.
-    Nknn: int
-        Number of closest neighbors used for the kNN bandiwdth
-        optimization.
-    grid: array-like
-        Grid to be used for the MLCV bandwidth optimization.
-    **kwargs: optional
-        Parameters to be passed to bandwidth selection method. Refer
-        corresponding method for docs (see bw_methods for method
-        names).
-    """
-
-    # Create KDSource
-    s = KDSource(plist, geom, kernel)
-
-    # Get the length of the particles list
-    parts, ws = s.plist.get(N=-1)
-    N = len(parts)
-    print('Using N={:d} particles from the original particles list.\n'.format(
-        N))
-
-    # Use the standard deviation as scaling factor
-    scaling = s.geom.std(parts=parts)
-
-    # If an importance is specified, apply to the scaling factors
-    if importance is None:
-        print('No importance selected. Using 1 for each variable.\n')
-        importance = [1] * s.geom.dim
-
-    scaling /= importance
-
-    # First fit with kNN to generate a seed adaptative bandwidth
-    print('Fitting first with kNN method:')
-    s.bw_method = "knn"
-    if Nbatch == -1:
-        Nbatch = int(N / Nknn)
-    s.fit(N, skip=skip, scaling=scaling, batch_size=Nbatch, k=Nknn)
-    bw_knn = s.kde.bw
-    print('')
-
-    # Then fit with MLCV
-    print('Fitting now with MLCV using the previous fitting as seed:')
-    s.bw_method = "mlcv"
-    if Nmlcv == -1:
-        print("No size for MLCV selected. Using all the particles list.")
-        Nmlcv = N
-    if grid is None:
-        print("No grid specified. Using np.logspace(-0.1, 0.1, 10).")
-        print("If fitting fails, change the grid.")
-        grid = np.logspace(-0.1, 0.1, 10)
-    seed = bw_knn[:Nmlcv]
-    print("If fitting takes too long, consider reducing Nmlcv.")
-    s.fit(Nmlcv, scaling=scaling, seed=seed, grid=grid, show=show)
-    bw_mlcv = s.kde.bw
-    print('')
-
-    print('Extending the MLCV optimization to full kNN bandwidth:')
-    bw_knn_mlcv = bw_knn * bw_mlcv[0] / bw_knn[0]
-    dim = s.geom.dim
-    bw_knn_mlcv *= bw_silv(dim, len(bw_knn)) / bw_silv(dim, len(bw_mlcv))
-    s = KDSource(plist, geom, bw=bw_knn_mlcv, kernel=kernel)
-    s.fit(N=N, scaling=scaling)
-    print('Done.')
-
-    return s
diff --git a/python/setup.py b/python/setup.py
@@ -30,7 +30,7 @@
     description="Python API for KDSource package",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    url="https://github.com/inti-abbate/KDSource",
+    url="https://github.com/KDSource/KDSource",
     project_urls={},
     classifiers=[
         "Programming Language :: Python :: 3",