Skip to content

Commit

Permalink
Redefine run function as bw_auto
Browse files Browse the repository at this point in the history
  • Loading branch information
inti-abbate committed Sep 16, 2024
1 parent 005ffc0 commit 1d55576
Show file tree
Hide file tree
Showing 9 changed files with 267 additions and 281 deletions.
296 changes: 164 additions & 132 deletions docs/examples/Verification.ipynb

Large diffs are not rendered by default.

Binary file removed docs/examples/resampled.mcpl.gz
Binary file not shown.
Binary file removed docs/examples/samples.mcpl.gz
Binary file not shown.
Binary file modified docs/examples/samples_bws
Binary file not shown.
35 changes: 0 additions & 35 deletions docs/examples/source.xml

This file was deleted.

6 changes: 3 additions & 3 deletions python/kdsource/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@
"""

__version__ = "0.1.0"
__author__ = "inti-abbate"
__author__ = "KDSource"

from .geom import Geometry, Metric
from .kde import bw_silv
from .kdsource import KDSource, load, run
from .kde import bw_silv, bw_methods
from .kdsource import KDSource, load
from .plist import PList, appendssv, convert2mcpl, join2mcpl, savessv
from .surfsource import SurfaceSourceFile, create_source_file
73 changes: 70 additions & 3 deletions python/kdsource/kde.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def bw_mlcv(data, weights=None, n_splits=10, seed=None, grid=None, show=True):
Rule.
grid: array-like
Grid of factors for computing bandwidth grid. Default:
numpy.logspace(-1, 1, 20)
numpy.logspace(-1, 1, 10)
show: bool
Whether to show the scores plot.
Expand All @@ -200,7 +200,9 @@ def bw_mlcv(data, weights=None, n_splits=10, seed=None, grid=None, show=True):
)
seed = bw_silv(np.shape(data)[1], N_eff)
if grid is None:
grid = np.logspace(-1, 1, 20)
print("No grid specified. Using np.logspace(-0.1, 0.1, 10).")
print("If fitting fails, change the grid.")
grid = np.logspace(-0.1, 0.1, 10)
bw_grid = np.reshape(grid, (-1, *np.ones(np.ndim(seed), dtype=int))) * seed
if n_splits > len(data):
n_splits = len(data)
Expand All @@ -225,6 +227,65 @@ def bw_mlcv(data, weights=None, n_splits=10, seed=None, grid=None, show=True):
return bw_mlcv


def bw_auto(data, weights, grid=None, Nmlcv=1E4, Nbatch=1E4, Nknn=10, show=True,
n_splits=10):

"""
Automatic bandwidth optimization with kNN and MLCV methods combined.
Parameters
----------
data: array-like
Array of samples. Must have shape (obs, dim).
weights: array-like, optional
Array of sample weights. By default all weights are 1.
grid: array-like
Grid to be used for the MLCV bandwidth optimization. Default:
numpy.logspace(-1, 1, 10)
Nmlcv: int
Number of particles to use for the MLCV bandwidth optimization.
Nbatch: int
Number of particles per batch to use for the kNN bandwidth
optimization.
Nknn: int
Number of closest neighbors used for the kNN bandiwdth
optimization.
show: bool
Whether to show the scores plot of cross-validation.
n_splits: int, optional
Number of folds for cross-validation. Must be at least 2.
"""

N, dim = data.shape

# First fit with kNN to generate a seed adaptative bandwidth
print('Fitting first with kNN method:')
if Nbatch == -1:
print("No batch size for kNN selected. Using all the particles list.")
Nbatch = N
BW_knn = bw_knn(data, weights=weights, k=Nknn, batch_size=Nbatch)
print('')

# Then fit with MLCV
print('Fitting now with MLCV using the previous fitting as seed:')
if Nmlcv == -1:
print("No size for MLCV selected. Using all the particles list.")
Nmlcv = N
seed = BW_knn[:Nmlcv]
print("If fitting takes too long, consider reducing Nmlcv.")
BW_mlcv = bw_mlcv(data[:Nmlcv], weights=weights, seed=seed, grid=grid,
show=show, n_splits=n_splits)
print('')

print('Extending the MLCV optimization to full kNN bandwidth:')
BW_knn_mlcv = BW_knn * BW_mlcv[0] / BW_knn[0]
BW_knn_mlcv *= bw_silv(dim, len(BW_knn)) / bw_silv(dim, len(BW_mlcv))
print('Done.')
return BW_knn_mlcv

return s


def optimize_bw(
bw_method, vecs, ws=None, weightfun=None, maskfun=None, **kwargs
):
Expand Down Expand Up @@ -275,9 +336,15 @@ def optimize_bw(
return bw_knn(vecs, weights=ws, **kwargs)
elif bw_method == "mlcv": # Maximum Likelihood Cross-Validation method
return bw_mlcv(vecs, weights=ws, **kwargs)
elif bw_method == "auto": # kNN and MLCV combination
return bw_auto(vecs, weights=ws, **kwargs)
else:
keys = list(bw_methods.keys())
raise Exception("Invalid bw_method. Available: {}".format(keys))


bw_methods = {"silv": bw_silv, "knn": bw_knn, "mlcv": bw_mlcv}
bw_methods = {
"auto": bw_auto,
"silv": bw_silv,
"knn": bw_knn,
"mlcv": bw_mlcv}
136 changes: 29 additions & 107 deletions python/kdsource/kdsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import numpy as np

from .geom import Geometry
from .kde import bw_silv
from .kde import bw_silv, bw_methods
from .kde import optimize_bw
from .plist import PList

Expand Down Expand Up @@ -78,7 +78,7 @@ def load(xmlfilename, N=-1):


class KDSource:
def __init__(self, plist, geom, bw="silv", J=1.0, kernel="gaussian"):
def __init__(self, plist, geom, bw="auto", J=1.0, kernel="gaussian"):
"""
Object representing Kernel Density Estimation (KDE) sources.
Expand Down Expand Up @@ -130,7 +130,8 @@ def __init__(self, plist, geom, bw="silv", J=1.0, kernel="gaussian"):
self.R = R[self.kernel[0]]
self.fitted = False

def fit(self, N=-1, skip=0, scaling=None, **kwargs):
def fit(self, N=-1, skip=0, scaling=None, importance=None, bw_method=None,
**kwargs):
"""
Fit KDE model to particle list.
Expand All @@ -151,8 +152,18 @@ def fit(self, N=-1, skip=0, scaling=None, **kwargs):
scaling: array-like, optional
Scaling to be applied to each variable. This means each
particle variable will be divided by the corresponding
scaling element before applying KDE. By default, the
standard deviation of each variable is used.
scaling element before applying KDE. If set, overrides
importance parameter. By default, the standard deviation of
each variable is used.
importance: array-like, optional
Scaling to be applied to each variable. This means each
particle variable standard deviation will be divided by
the corresponding scaling element before applying KDE.
By default, the importance is set to 1, i.e.
the standard deviation of each variable is used.
bw_method: str
Bandwidth selection method. See bw_methods for available
methods. If None, self.bw_method will be used.
**kwargs: optional
Parameters to be passed to bandwidth selection method. Refer
corresponding method for docs (see bw_methods for method
Expand All @@ -167,10 +178,23 @@ def fit(self, N=-1, skip=0, scaling=None, **kwargs):
self.N_eff = np.sum(ws) ** 2 / np.sum(ws ** 2)
if scaling is None:
scaling = self.geom.std(vecs=vecs, weights=ws)
# If an importance is specified, apply to the scaling factors
if importance is None:
importance = [1] * self.geom.dim
elif len(importance) != self.geom.dim:
raise Exception(
"importance must have len = {}.".format(self.geom.dim))
scaling /= importance
else:
scaling = np.array(scaling)
scaling[scaling == 0] = STD_DEFAULT
self.scaling = scaling
if bw_method is not None:
keys = list(bw_methods.keys())
if bw_method not in keys:
raise Exception(
"Invalid bw_method. Available: {}".format(keys))
self.bw_method = bw_method
if self.bw_method is not None:
print("Calculating bw ... ")
bw = optimize_bw(self.bw_method, vecs / self.scaling, ws, **kwargs)
Expand Down Expand Up @@ -881,105 +905,3 @@ def plot2D_integr(self, vrs, grids, vec0=None, vec1=None, **kwargs):
plt.tight_layout()

return [plt.gcf(), [scores, errs]]


def run(plist, geom, kernel='gaussian', skip=0, importance=None,
N=-1, Nmlcv=-1, Nbatch=-1, Nknn=10, grid=None,
show=True):

"""
Run KDSource to fit model to particle list with kNN and MLCV methods.
Parameters
----------
plist: PList object
The PList wrapping the MCPL file containing the particle
list.
geom: Geometry object
The Geometry defining particle variables treatment.
kernel: string, optional
The function kernel to fit the variables. Available options
are 'gaussian', 'box', and 'epa'
skip: int
Number of particles to skip in the list before starting to
read.
importance: array-like, optional
Scaling to be applied to each variable. This means each
particle variable standard deviation will be divided by
the corresponding scaling element before applying KDE.
By default, the importance is set to 1, i.e.
the standard deviation of each variable is used.
N: int
Number of particles to use for fitting. The real number of
particles used may be lower if end of particle list is
reached or there are particles with zero weight. -1 to use
all particles.
Nmlcv: int
Number of particles to use for the MLCV bandwidth optimization.
Nbatch: int
Number of particles per batch to use for the kNN bandwidth
optimization.
Nknn: int
Number of closest neighbors used for the kNN bandiwdth
optimization.
grid: array-like
Grid to be used for the MLCV bandwidth optimization.
**kwargs: optional
Parameters to be passed to bandwidth selection method. Refer
corresponding method for docs (see bw_methods for method
names).
"""

# Create KDSource
s = KDSource(plist, geom, kernel)

# Get the length of the particles list
parts, ws = s.plist.get(N=-1)
N = len(parts)
print('Using N={:d} particles from the original particles list.\n'.format(
N))

# Use the standard deviation as scaling factor
scaling = s.geom.std(parts=parts)

# If an importance is specified, apply to the scaling factors
if importance is None:
print('No importance selected. Using 1 for each variable.\n')
importance = [1] * s.geom.dim

scaling /= importance

# First fit with kNN to generate a seed adaptative bandwidth
print('Fitting first with kNN method:')
s.bw_method = "knn"
if Nbatch == -1:
Nbatch = int(N / Nknn)
s.fit(N, skip=skip, scaling=scaling, batch_size=Nbatch, k=Nknn)
bw_knn = s.kde.bw
print('')

# Then fit with MLCV
print('Fitting now with MLCV using the previous fitting as seed:')
s.bw_method = "mlcv"
if Nmlcv == -1:
print("No size for MLCV selected. Using all the particles list.")
Nmlcv = N
if grid is None:
print("No grid specified. Using np.logspace(-0.1, 0.1, 10).")
print("If fitting fails, change the grid.")
grid = np.logspace(-0.1, 0.1, 10)
seed = bw_knn[:Nmlcv]
print("If fitting takes too long, consider reducing Nmlcv.")
s.fit(Nmlcv, scaling=scaling, seed=seed, grid=grid, show=show)
bw_mlcv = s.kde.bw
print('')

print('Extending the MLCV optimization to full kNN bandwidth:')
bw_knn_mlcv = bw_knn * bw_mlcv[0] / bw_knn[0]
dim = s.geom.dim
bw_knn_mlcv *= bw_silv(dim, len(bw_knn)) / bw_silv(dim, len(bw_mlcv))
s = KDSource(plist, geom, bw=bw_knn_mlcv, kernel=kernel)
s.fit(N=N, scaling=scaling)
print('Done.')

return s
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
description="Python API for KDSource package",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/inti-abbate/KDSource",
url="https://github.com/KDSource/KDSource",
project_urls={},
classifiers=[
"Programming Language :: Python :: 3",
Expand Down

0 comments on commit 1d55576

Please sign in to comment.