Adding an option for target normalization in SVR #1853

Open
wants to merge 17 commits into base: devel
51 changes: 38 additions & 13 deletions dependencies.xml
@@ -35,18 +35,26 @@ Note all install methods after "main" take
-->
<dependencies>
<main>
<h5py>2.10</h5py>
<numpy>1.18</numpy>
<scipy>1.2</scipy>
<scikit-learn>0.24</scikit-learn>
<pandas>1.1</pandas>
<xarray>0.16</xarray>
<h5py/>
<numpy>1.21</numpy>
<scipy>1.7</scipy>
<scikit-learn>1.0</scikit-learn>
<pandas>1.3</pandas>
<!--
Note that xarray 0.20 throws this exception:
xarray/backends/plugins.py", line 105, in list_engines
entrypoints = entry_points().get("xarray.backends", ())
AttributeError: 'EntryPoints' object has no attribute 'get'
so should be skipped.
-->
<xarray>0.19</xarray>
<netcdf4>1.5</netcdf4>
<matplotlib>3.2</matplotlib>
<statsmodels/>
<cloudpickle>1.6</cloudpickle>
<tensorflow>2.0</tensorflow>
<python skip_check='True'>3</python>
<matplotlib>3.3</matplotlib>
<statsmodels>0.13</statsmodels>
<cloudpickle>2.2</cloudpickle>
<tensorflow>2.9</tensorflow>
<python skip_check='True' os='windows'>3.7</python>
<python skip_check='True' os='mac,linux'>3</python>
<hdf5 skip_check='True'/>
<swig skip_check='True'/>
<pylint/>
@@ -59,13 +67,22 @@ Note all install methods after "main" take
<nomkl os='linux' skip_check='True'/>
<numexpr os='linux'/>
<cmake skip_check='True' optional='True'/>
<ray os="mac,linux" source="pip" pip_extra="[default]">1.9</ray>
<imageio>2.9</imageio>
<ray source="pip" pip_extra="[default]">1.13</ray>
<!-- redis is needed by ray, but on windows, this seems to need to be explicitly stated -->
<redis source="pip" os='windows'/>
<imageio>2.22</imageio>
<line_profiler optional='True'/>
<!-- <ete3 optional='True'/> -->
<pywavelets optional='True'>1.1</pywavelets>
<numdifftools source="pip">0.9</numdifftools>
<fmpy optional='True'/>
<xmlschema source="pip"/>
<pyomo optional='True'>6.4</pyomo>
<glpk skip_check='True' optional='True'/>
<ipopt skip_check='True' optional='True'/>
<cyipopt optional='True'/>
<pyomo-extensions source="pyomo" skip_check='True' optional='True'/>
<protobuf source="pip">3</protobuf> <!-- needed because protobuf 4 incompatible with current ray -->
<setuptools/>
</main>
<alternate name="pip">
@@ -76,4 +93,12 @@ Note all install methods after "main" take
<nomkl>remove</nomkl>
<numexpr>remove</numexpr>
</alternate>
<alternate name="none">
<hdf5>remove</hdf5>
<swig>remove</swig>
<pip>remove</pip>
<python>remove</python>
<nomkl>remove</nomkl>
<numexpr>remove</numexpr>
</alternate>
</dependencies>
ravenframework/SupervisedLearning/ScikitLearn/SVM/LinearSVR.py
@@ -15,7 +15,7 @@
Created on Jan 21, 2020

@author: alfoa, wangc
Linear Support Vector Classifier
Linear Support Vector Regressor

"""
#Internal Modules (Lazy Importer)--------------------------------------------------------------------
@@ -33,7 +33,7 @@ class LinearSVR(ScikitLearnBase):
"""
Linear Support Vector Regressor
"""
info = {'problemtype':'regression', 'normalize':True}
info = {'problemtype':'regression', 'normalize':True, 'normalizeTargets':False}

def __init__(self):
"""
@@ -97,6 +97,10 @@ def _handleInput(self, paramInput):
super()._handleInput(paramInput)
settings, notFound = paramInput.findNodesAndExtractValues(['epsilon', 'dual', 'loss', 'tol', 'fit_intercept',
'intercept_scaling', 'max_iter'])

setting,_ = paramInput.findNodesAndExtractValues(['normalizeTargets'])
self.info['normalizeTargets'] = setting['normalizeTargets']

# notFound must be empty
assert(not notFound)
self.initializeModel(settings)
8 changes: 5 additions & 3 deletions ravenframework/SupervisedLearning/ScikitLearn/SVM/SVR.py
@@ -24,20 +24,18 @@

#External Modules------------------------------------------------------------------------------------
np = importModuleLazy("numpy")
import ast
#External Modules End--------------------------------------------------------------------------------

#Internal Modules------------------------------------------------------------------------------------
from ....SupervisedLearning.ScikitLearn import ScikitLearnBase
from ....utils import utils
from ....utils import InputData, InputTypes
#Internal Modules End--------------------------------------------------------------------------------

class SVR(ScikitLearnBase):
"""
Support Vector Regressor
"""
info = {'problemtype':'regression', 'normalize':True}
info = {'problemtype':'regression', 'normalize':True, 'normalizeTargets':False}

def __init__(self):
"""
@@ -101,6 +99,7 @@ class cls.
descr=r"""Enable verbose output. Note that this setting takes advantage
of a per-process runtime setting in libsvm that, if enabled, may not
work properly in a multithreaded context.""", default=False))

return specs

def _handleInput(self, paramInput):
@@ -113,6 +112,9 @@ def _handleInput(self, paramInput):
settings, notFound = paramInput.findNodesAndExtractValues(['C', 'kernel', 'degree', 'gamma', 'coef0',
'tol', 'cache_size', 'epsilon', 'shrinking', 'max_iter',
'verbose'])

setting,_ = paramInput.findNodesAndExtractValues(['normalizeTargets'])
self.info['normalizeTargets'] = setting['normalizeTargets']
Comment on lines +116 to +117 (Collaborator):

I think we can create a method to check whether we want to perform normalization or not. For SVR, we could compute the ratio between the normalized parameters and the default parameters; if the normalized parameters are too large, we can apply normalization to the targets.

# notFound must be empty
assert(not notFound)
self.initializeModel(settings)
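A rough sketch of the reviewer's idea above, under assumed names (`shouldNormalizeTargets`, the threshold, and the choice of epsilon as the reference scale are all hypothetical, not part of this PR): compare the spread of the raw targets against the default epsilon-tube width, and enable target normalization only when the two are badly mismatched.

  import numpy as np

  def shouldNormalizeTargets(targetValues, epsilon=0.1, ratioThreshold=10.0):
    """
      Hypothetical helper sketching the review suggestion: normalize the targets
      when they vary on a scale much larger than the default epsilon-tube.
      @ In, targetValues, np.ndarray, raw target samples
      @ In, epsilon, float, default SVR epsilon (half-width of the insensitive tube)
      @ In, ratioThreshold, float, assumed cutoff for "too large"
      @ Out, shouldNormalizeTargets, bool, True if target normalization is advisable
    """
    spread = np.std(targetValues)
    return spread / epsilon > ratioThreshold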
30 changes: 14 additions & 16 deletions ravenframework/SupervisedLearning/ScikitLearn/ScikitLearnBase.py
@@ -24,19 +24,17 @@

#External Modules------------------------------------------------------------------------------------
np = importModuleLazy("numpy")
import ast
#External Modules End--------------------------------------------------------------------------------

#Internal Modules------------------------------------------------------------------------------------
from ..SupervisedLearning import SupervisedLearning
from ...utils import utils
#Internal Modules End--------------------------------------------------------------------------------

class ScikitLearnBase(SupervisedLearning):
"""
Base Class for Scikitlearn-based surrogate models (classifiers and regressors)
"""
info = {'problemtype':None, 'normalize':None}
info = {'problemtype':None, 'normalize':None, 'normalizeTargets':None}

def __init__(self):
"""
@@ -50,7 +48,6 @@ def __init__(self):
self.model = None # Scikitlearn estimator/model
self.multioutputWrapper = True # If True, use MultiOutputRegressor or MultiOutputClassifier to wrap self.model else
# the self.model can handle multioutput/multi-targets prediction

def updateSettings(self, settings):
"""
Update the parameters of self.model if the model is wrapped by an sklearn.multioutput class
@@ -185,15 +182,16 @@ def __returnCurrentSettingLocal__(self):
"""
pass

def _localNormalizeData(self,values,names,feat):
"""
Overwrites default normalization procedure.
@ In, values, list(float), unused
@ In, names, list(string), unused
@ In, feat, string, feature to (not) normalize
@ Out, None
"""
if not self.info['normalize']:
self.muAndSigmaFeatures[feat] = (0.0,1.0)
else:
super()._localNormalizeData(values,names,feat)
# def _localNormalizeData(self,values,names,feat):
# """
# Overwrites default normalization procedure.
# @ In, values, list(float), unused
# @ In, names, list(string), unused
# @ In, feat, string, feature to (not) normalize
# @ Out, None
# """
# if not self.info['normalize']:
# self.muAndSigmaFeatures[feat] = (0.0,1.0)
# self.muAndSigmaTargets[self.target[0]] = (0.0,1.0)
# else:
# super()._localNormalizeData(values,names,feat)
75 changes: 51 additions & 24 deletions ravenframework/SupervisedLearning/SupervisedLearning.py
@@ -31,7 +31,7 @@
#External Modules End--------------------------------------------------------------------------------

#Internal Modules------------------------------------------------------------------------------------
from ..utils import utils, mathUtils, xmlUtils
from ..utils import mathUtils, xmlUtils
from ..utils import InputTypes, InputData
from ..BaseClasses import BaseInterface
#Internal Modules End--------------------------------------------------------------------------------
@@ -45,6 +45,7 @@ class SupervisedLearning(BaseInterface):
# 'boolean', 'integer', 'float'
qualityEstType = [] # this describes the type of estimator returned; known types are 'distance', 'probability'.
# The values are returned by the self.__confidenceLocal__(Features)
info = {'problemtype':'regression', 'normalize':None, 'normalizeTargets':None}
@classmethod
def getInputSpecification(cls):
"""
@@ -69,6 +70,10 @@ class cls.
spec.addSub(InputData.parameterInputFactory('pivotParameter',contentType=InputTypes.StringType,
descr=r"""If a time-dependent ROM is requested, please specifies the pivot
variable (e.g. time, etc) used in the input HistorySet.""", default='time'))

spec.addSub(InputData.parameterInputFactory("normalizeTargets", contentType=InputTypes.BoolType,
descr=r"""enables target normalization by centering (subtracting the mean) and dividing by the standard deviation.
This is known to make the ROM less sensitive to parameters such as epsilon, gamma, etc.""", default=False))
cvInput = InputData.parameterInputFactory("CV", contentType=InputTypes.StringType,
descr=r"""The text portion of this node needs to contain the name of the \xmlNode{PostProcessor} with \xmlAttr{subType}
``CrossValidation``.""")
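For intuition, here is a minimal numeric sketch (plain numpy, not the RAVEN API) of what enabling the new normalizeTargets option does: targets are z-scored before training and predictions are mapped back afterwards, so settings such as epsilon and gamma act on a dimensionless scale.

  import numpy as np

  y = np.array([100.0, 250.0, 175.0, 300.0])  # raw targets in physical units
  mu, sigma = y.mean(), y.std()               # the stored (mu, sigma) factors
  yNorm = (y - mu) / sigma                    # what the SVR actually trains on

  # ... train and evaluate the model in normalized space ...
  yHatNorm = yNorm.copy()                     # stand-in for model predictions
  yHat = yHatNorm * sigma + mu                # de-normalized back to physical units
  assert np.allclose(yHat, y)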
@@ -129,7 +134,8 @@ def __init__(self):
#average value and sigma are used for normalization of the feature data
#a dictionary where for each feature a tuple (average value, sigma)
#these need to be declared in the child classes!!!!
self.muAndSigmaFeatures = {} # normalization parameters
self.muAndSigmaFeatures = {} # normalizing features
self.muAndSigmaTargets = {} # normalizing targets
self.metadataKeys = set() # keys that can be passed to DataObject as meta information
self.metadataParams = {} # indexMap for metadataKeys to pass to a DataObject as meta dimensionality

@@ -247,30 +253,39 @@ def train(self, tdict, indexMap=None):
featureValues = np.zeros(shape=(len(targetValues), featLen,len(self.features)))
else:
featureValues = np.zeros(shape=(len(targetValues), len(self.features)))
for cnt, feat in enumerate(self.features):
if feat not in names:
self.raiseAnError(IOError,'The feature sought '+feat+' is not in the training set')
else:
valueToUse = values[names.index(feat)]
resp = self.checkArrayConsistency(valueToUse, self.isDynamic())
if not resp[0]:
self.raiseAnError(IOError,'In training set for feature '+feat+':'+resp[1])
valueToUse = np.asarray(valueToUse)
if len(valueToUse) != featureValues.shape[0]:
self.raiseAWarning('feature values:',featureValues.shape[0],tag='ERROR')
self.raiseAWarning('target values:',len(valueToUse),tag='ERROR')
self.raiseAnError(IOError,'In training set, the number of values provided for feature '+feat+' are != number of target outcomes!')
self._localNormalizeData(values,names,feat)
# valueToUse can be either a matrix (for who can handle time-dep data) or a vector (for who can not)
if self.dynamicFeatures:
featureValues[:, :, cnt] = (valueToUse[:, :]- self.muAndSigmaFeatures[feat][0])/self.muAndSigmaFeatures[feat][1]
for tgtCnt, targ in enumerate(self.target):
for cnt, feat in enumerate(self.features):
if feat not in names:
self.raiseAnError(IOError,'The feature sought '+feat+' is not in the training set')
elif targ not in names:
self.raiseAnError(IOError,'The target sought '+targ+' is not in the training set')
else:
featureValues[:,cnt] = ( (valueToUse[:,0] if len(valueToUse.shape) > 1 else valueToUse[:]) - self.muAndSigmaFeatures[feat][0])/self.muAndSigmaFeatures[feat][1]

valueToUse = values[names.index(feat)]
resp = self.checkArrayConsistency(valueToUse, self.isDynamic())
targetValueToUse = values[names.index(targ)]
tarResp = self.checkArrayConsistency(targetValueToUse, self.isDynamic())
if not resp[0]:
self.raiseAnError(IOError,'In training set for feature '+feat+':'+resp[1])
if not tarResp[0]:
self.raiseAnError(IOError,'In training set for target '+targ+':'+tarResp[1])
valueToUse = np.asarray(valueToUse)
targetValueToUse = np.asarray(targetValueToUse)
if len(valueToUse) != featureValues.shape[0]:
self.raiseAWarning('feature values:',featureValues.shape[0],tag='ERROR')
self.raiseAWarning('target values:',len(valueToUse),tag='ERROR')
self.raiseAnError(IOError,'In training set, the number of values provided for feature '+feat+' are != number of target outcomes!')
self._localNormalizeData(values,names,feat,targ)
# valueToUse can be either a matrix (for who can handle time-dep data) or a vector (for who can not)
if self.dynamicFeatures:
featureValues[:, :, cnt] = (valueToUse[:, :]- self.muAndSigmaFeatures[feat][0])/self.muAndSigmaFeatures[feat][1]
targetValues[:,tgtCnt] = (targetValueToUse[:]- self.muAndSigmaTargets[targ][0])/self.muAndSigmaTargets[targ][1]
else:
featureValues[:,cnt] = ( (valueToUse[:,0] if len(valueToUse.shape) > 1 else valueToUse[:]) - self.muAndSigmaFeatures[feat][0])/self.muAndSigmaFeatures[feat][1]
targetValues[:,tgtCnt] = ( (targetValueToUse[:,0] if len(targetValueToUse.shape) > 1 else targetValueToUse[:]) - self.muAndSigmaTargets[targ][0])/self.muAndSigmaTargets[targ][1]
self.__trainLocal__(featureValues,targetValues)
self.amITrained = True

def _localNormalizeData(self,values,names,feat):
def _localNormalizeData(self,values,names,feat,targ):
"""
Method to normalize data based on the mean and standard deviation. If undesired for a particular ROM,
this method can be overloaded to simply pass (see, e.g., GaussPolynomialRom).
@@ -279,7 +294,14 @@ def _localNormalizeData(self,values,names,feat):
@ In, feat, list, list of features (from ROM)
@ Out, None
"""
self.muAndSigmaFeatures[feat] = mathUtils.normalizationFactors(values[names.index(feat)])
if not self.info['normalize']:
self.muAndSigmaFeatures[feat] = (0.0,1.0)
else:
self.muAndSigmaFeatures[feat] = mathUtils.normalizationFactors(values[names.index(feat)])
if not self.info['normalizeTargets']:
self.muAndSigmaTargets[targ] = (0.0,1.0)
else:
self.muAndSigmaTargets[targ] = mathUtils.normalizationFactors(values[names.index(targ)])

def confidence(self, edict):
"""
@@ -349,6 +371,8 @@ def evaluate(self,edict):
else:
featureValues = np.zeros(shape=(values[0].size, len(self.features)))
for cnt, feat in enumerate(self.features):
# feat = featTarg[0]
# targ = featTarg[1]
if feat not in names:
self.raiseAnError(IOError,'The feature sought '+feat+' is not in the evaluate set')
else:
@@ -359,7 +383,10 @@ def evaluate(self,edict):
featureValues[:, :, cnt] = ((values[names.index(feat)] - self.muAndSigmaFeatures[feat][0]))/self.muAndSigmaFeatures[feat][1]
else:
featureValues[:,cnt] = ((values[names.index(feat)] - self.muAndSigmaFeatures[feat][0]))/self.muAndSigmaFeatures[feat][1]
return self.__evaluateLocal__(featureValues)
target = self.__evaluateLocal__(featureValues)
if ('normalizeTargets' in self.info.keys()) and self.info['normalizeTargets']:
target.update((x, y * self.muAndSigmaTargets[x][1] + self.muAndSigmaTargets[x][0]) for x, y in target.items())
return target

def reset(self):
"""
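The de-normalization step added to evaluate() above applies y * sigma + mu per target, using the factors stored at training time. A small self-contained illustration with made-up numbers:

  import numpy as np

  muAndSigmaTargets = {'power': (250.0, 75.0)}    # illustrative (mu, sigma) pair
  target = {'power': np.array([-0.5, 0.0, 1.2])}  # predictions in normalized space
  # same pattern as the PR: map each predicted target back to physical units
  target.update((name, yHat * muAndSigmaTargets[name][1] + muAndSigmaTargets[name][0])
                for name, yHat in target.items())
  print(target['power'])  # [212.5 250.  340. ]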
2 changes: 1 addition & 1 deletion ravenframework/utils/mathUtils.py
@@ -246,7 +246,7 @@ def normalizationFactors(values, mode='z'):
elif mode == 'scale':
offset = np.min(values)
scale = np.max(values) - offset
else:
else: ##TODO: this should be changed; currently, if the user enters anything other than 'z' or 'scale', the values are not normalized at all
offset = 0.0
scale = 1.0
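
One way to resolve the TODO above (an assumption, not part of this PR) is to reject unrecognized modes instead of silently returning identity factors. The 'z' branch below is reconstructed from the function's documented z-score behavior:

  import numpy as np

  def normalizationFactorsStrict(values, mode='z'):
    """
      Sketch of a stricter variant of normalizationFactors: unknown modes raise
      instead of silently skipping normalization.
      @ In, values, array-like, sample values
      @ In, mode, str, 'z' (z-score), 'scale' (min-max), or 'none' (identity)
      @ Out, (offset, scale), tuple(float, float), normalization factors
    """
    if mode == 'z':
      offset, scale = np.average(values), np.std(values)  # assumed 'z' branch
    elif mode == 'scale':
      offset = np.min(values)
      scale = np.max(values) - offset
    elif mode == 'none':
      offset, scale = 0.0, 1.0
    else:
      raise ValueError("unrecognized normalization mode '%s'" % mode)
    if scale == 0.0:  # constant data would otherwise divide by zero
      scale = 1.0
    return offset, scale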
