From 023d6ce48bf43a4d3d3b7f37613652a643ac36ff Mon Sep 17 00:00:00 2001 From: daholste <43974253+daholste@users.noreply.github.com> Date: Mon, 28 Jan 2019 23:50:57 -0800 Subject: [PATCH] Misc fixes (#39) * misc fixes -- fix bug where SMAC was returning already-seen values; fix param encoding return bug in pipeline object model; nit clean-up AutoFit; return in pipeline suggester when sweeper has no next proposal; null ref fix in public object model pipeline suggester * fix in BuildPipelineNodePropsLightGbm test, fix / use correct 'newTrainer' variable in PipelineSuggester * SMAC perf improvement --- src/AutoML/AutoFitter/AutoFitter.cs | 8 +-- .../PipelineSuggesters/PipelineSuggester.cs | 34 ++++++++-- src/AutoML/Sweepers/SmacSweeper.cs | 12 ++-- src/AutoML/Sweepers/SweeperBase.cs | 6 +- .../TrainerExtensions/TrainerExtensionUtil.cs | 6 +- src/Test/GetNextPipelineTests.cs | 4 ++ src/Test/InferredPipelineTests.cs | 64 +++++++++++++++++++ src/Test/TrainerExtensionsTests.cs | 34 +++++----- 8 files changed, 128 insertions(+), 40 deletions(-) create mode 100644 src/Test/InferredPipelineTests.cs diff --git a/src/AutoML/AutoFitter/AutoFitter.cs b/src/AutoML/AutoFitter/AutoFitter.cs index 689e8c3095f..a1d0aff01ff 100644 --- a/src/AutoML/AutoFitter/AutoFitter.cs +++ b/src/AutoML/AutoFitter/AutoFitter.cs @@ -41,12 +41,6 @@ public AutoFitter(MLContext context, OptimizingMetricInfo metricInfo, AutoFitSet } public InferredPipelineRunResult[] Fit() - { - IteratePipelinesAndFit(); - return _history.ToArray(); - } - - private void IteratePipelinesAndFit() { var stopwatch = Stopwatch.StartNew(); var columns = AutoMlUtils.GetColumnInfoTuples(_context, _trainData, _label, _purposeOverrides); @@ -68,6 +62,8 @@ private void IteratePipelinesAndFit() } while (_history.Count < _settings.StoppingCriteria.MaxIterations && stopwatch.Elapsed.TotalMinutes < _settings.StoppingCriteria.TimeOutInMinutes); + + return _history.ToArray(); } private void ProcessPipeline(InferredPipeline pipeline) diff --git 
a/src/AutoML/PipelineSuggesters/PipelineSuggester.cs b/src/AutoML/PipelineSuggesters/PipelineSuggester.cs index ff0e7c520cb..fc59151e0f8 100644 --- a/src/AutoML/PipelineSuggesters/PipelineSuggester.cs +++ b/src/AutoML/PipelineSuggesters/PipelineSuggester.cs @@ -21,7 +21,7 @@ public static Pipeline GetNextPipeline(IEnumerable history, { var inferredHistory = history.Select(r => InferredPipelineRunResult.FromPipelineRunResult(r)); var nextInferredPipeline = GetNextInferredPipeline(inferredHistory, columns, task, iterationsRemaining, isMaximizingMetric); - return nextInferredPipeline.ToPipeline(); + return nextInferredPipeline?.ToPipeline(); } public static InferredPipeline GetNextInferredPipeline(IEnumerable history, @@ -47,21 +47,31 @@ public static InferredPipeline GetNextInferredPipeline(IEnumerable(history.Select(h => h.Pipeline)); + // iterate over top trainers (from least run to most run), // to find next pipeline - foreach(var trainer in orderedTopTrainers) + foreach (var trainer in orderedTopTrainers) { var newTrainer = trainer.Clone(); - // make sure we have not seen pipeline before. // repeat until passes or runs out of chances - var visitedPipelines = new HashSet(history.Select(h => h.Pipeline)); const int maxNumberAttempts = 10; var count = 0; do { - SampleHyperparameters(newTrainer, history, isMaximizingMetric); + // sample new hyperparameters for the learner + if (!SampleHyperparameters(newTrainer, history, isMaximizingMetric)) + { + // if unable to sample new hyperparameters for the learner + // (ie SMAC returned 0 suggestions), break + break; + } + var pipeline = new InferredPipeline(transforms, newTrainer); + + // make sure we have not seen pipeline before if (!visitedPipelines.Contains(pipeline)) { return pipeline; @@ -169,7 +179,11 @@ private static IValueGenerator[] ConvertToValueGenerators(IEnumerable history, bool isMaximizingMetric) + /// + /// Samples new hyperparameters for the trainer, and sets them. 
+ /// Returns true if success (new hyperparams were suggested and set). Else, returns false. + /// + private static bool SampleHyperparameters(SuggestedTrainer trainer, IEnumerable history, bool isMaximizingMetric) { var sps = ConvertToValueGenerators(trainer.SweepParams); var sweeper = new SmacSweeper( @@ -179,14 +193,20 @@ private static void SampleHyperparameters(SuggestedTrainer trainer, IEnumerable< }); IEnumerable historyToUse = history - .Where(r => r.RunSucceded && r.Pipeline.Trainer.TrainerName == trainer.TrainerName && r.Pipeline.Trainer.HyperParamSet != null); + .Where(r => r.RunSucceded && r.Pipeline.Trainer.TrainerName == trainer.TrainerName && r.Pipeline.Trainer.HyperParamSet != null && r.Pipeline.Trainer.HyperParamSet.Any()); // get new set of hyperparameter values var proposedParamSet = sweeper.ProposeSweeps(1, historyToUse.Select(h => h.ToRunResult(isMaximizingMetric))).First(); + if(!proposedParamSet.Any()) + { + return false; + } // associate proposed param set with trainer, so that smart hyperparam // sweepers (like KDO) can map them back. 
trainer.SetHyperparamValues(proposedParamSet); + + return true; } private static IEnumerable CalculateTransforms(MLContext context, diff --git a/src/AutoML/Sweepers/SmacSweeper.cs b/src/AutoML/Sweepers/SmacSweeper.cs index 5f855854ded..490c3b80b14 100644 --- a/src/AutoML/Sweepers/SmacSweeper.cs +++ b/src/AutoML/Sweepers/SmacSweeper.cs @@ -190,13 +190,17 @@ private ParameterSet[] GreedyPlusRandomSearch(ParameterSet[] parents, FastForest for (int i = 0; i < randomConfigs.Length; i++) configurations.Add(new Tuple(randomEIs[i], randomConfigs[i])); - HashSet retainedConfigs = new HashSet(); IOrderedEnumerable> bestConfigurations = configurations.OrderByDescending(x => x.Item1); - foreach (Tuple t in bestConfigurations.Take(numOfCandidates)) - retainedConfigs.Add(t.Item2); + var retainedConfigs = new HashSet(bestConfigurations.Select(x => x.Item2)); - return retainedConfigs.ToArray(); + // remove configurations matching previous run + foreach(var previousRun in previousRuns) + { + retainedConfigs.Remove(previousRun.ParameterSet); + } + + return retainedConfigs.Take(numOfCandidates).ToArray(); } /// diff --git a/src/AutoML/Sweepers/SweeperBase.cs b/src/AutoML/Sweepers/SweeperBase.cs index ba990288d2b..402e4db9f91 100644 --- a/src/AutoML/Sweepers/SweeperBase.cs +++ b/src/AutoML/Sweepers/SweeperBase.cs @@ -44,7 +44,7 @@ protected SweeperBase(ArgumentsBase args, IValueGenerator[] sweepParameters, str public virtual ParameterSet[] ProposeSweeps(int maxSweeps, IEnumerable previousRuns = null) { - var prevParamSets = previousRuns?.Select(r => r.ParameterSet).ToList() ?? new List(); + var prevParamSets = new HashSet(previousRuns?.Select(r => r.ParameterSet).ToList() ?? 
new List()); var result = new HashSet(); for (int i = 0; i < maxSweeps; i++) { @@ -66,9 +66,9 @@ public virtual ParameterSet[] ProposeSweeps(int maxSweeps, IEnumerable previousRuns) + protected static bool AlreadyGenerated(ParameterSet paramSet, ISet previousRuns) { - return previousRuns.Any(previousRun => previousRun.Equals(paramSet)); + return previousRuns.Contains(paramSet); } } } diff --git a/src/AutoML/TrainerExtensions/TrainerExtensionUtil.cs b/src/AutoML/TrainerExtensions/TrainerExtensionUtil.cs index b70119e49c0..c7591386374 100644 --- a/src/AutoML/TrainerExtensions/TrainerExtensionUtil.cs +++ b/src/AutoML/TrainerExtensions/TrainerExtensionUtil.cs @@ -83,7 +83,7 @@ public static IDictionary BuildPipelineNodeProps(TrainerName tra return BuildLightGbmPipelineNodeProps(sweepParams); } - return sweepParams.ToDictionary(p => p.Name, p => (object)p.RawValue); + return sweepParams.ToDictionary(p => p.Name, p => (object)p.ProcessedValue()); } private static IDictionary BuildLightGbmPipelineNodeProps(IEnumerable sweepParams) @@ -91,10 +91,10 @@ private static IDictionary BuildLightGbmPipelineNodeProps(IEnume var treeBoosterParams = sweepParams.Where(p => _lightGbmTreeBoosterParamNames.Contains(p.Name)); var parentArgParams = sweepParams.Except(treeBoosterParams); - var treeBoosterProps = treeBoosterParams.ToDictionary(p => p.Name, p => (object)p.RawValue); + var treeBoosterProps = treeBoosterParams.ToDictionary(p => p.Name, p => (object)p.ProcessedValue()); var treeBoosterCustomProp = new CustomProperty("Microsoft.ML.LightGBM.TreeBooster", treeBoosterProps); - var props = parentArgParams.ToDictionary(p => p.Name, p => (object)p.RawValue); + var props = parentArgParams.ToDictionary(p => p.Name, p => (object)p.ProcessedValue()); props[LightGbmTreeBoosterPropName] = treeBoosterCustomProp; return props; diff --git a/src/Test/GetNextPipelineTests.cs b/src/Test/GetNextPipelineTests.cs index 51c07548040..1e7f78007db 100644 --- a/src/Test/GetNextPipelineTests.cs +++ 
b/src/Test/GetNextPipelineTests.cs @@ -50,6 +50,10 @@ public void GetNextPipelineMock() { // get next pipeline var pipeline = PipelineSuggester.GetNextPipeline(history, columns, TaskKind.BinaryClassification, maxIterations - i); + if(pipeline == null) + { + break; + } var result = new PipelineRunResult(pipeline, AutoMlUtils.Random.NextDouble(), true); history.Add(result); diff --git a/src/Test/InferredPipelineTests.cs b/src/Test/InferredPipelineTests.cs new file mode 100644 index 00000000000..313fe0f1fdc --- /dev/null +++ b/src/Test/InferredPipelineTests.cs @@ -0,0 +1,64 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.ML.Auto.Test +{ + [TestClass] + public class InferredPipelineTests + { + [TestMethod] + public void InferredPipelinesHashTest() + { + var context = new MLContext(); + + // test same learners with no hyperparams have the same hash code + var trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension()); + var trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension()); + var transforms1 = new List(); + var transforms2 = new List(); + var inferredPipeline1 = new InferredPipeline(transforms1, trainer1); + var inferredPipeline2 = new InferredPipeline(transforms2, trainer2); + Assert.AreEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // test same learners with hyperparams set vs empty hyperparams have different hash codes + var hyperparams1 = new ParameterSet(new List() { new LongParameterValue("NumLeaves", 2) }); + trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), hyperparams1); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension()); + inferredPipeline1 = new 
InferredPipeline(transforms1, trainer1); + inferredPipeline2 = new InferredPipeline(transforms2, trainer2); + Assert.AreNotEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // same learners with different hyperparams + hyperparams1 = new ParameterSet(new List() { new LongParameterValue("NumLeaves", 2) }); + var hyperparams2 = new ParameterSet(new List() { new LongParameterValue("NumLeaves", 6) }); + trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), hyperparams1); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), hyperparams2); + inferredPipeline1 = new InferredPipeline(transforms1, trainer1); + inferredPipeline2 = new InferredPipeline(transforms2, trainer2); + Assert.AreNotEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // same learners with same transforms + trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension()); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension()); + transforms1 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + transforms2 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + inferredPipeline1 = new InferredPipeline(transforms1, trainer1); + inferredPipeline2 = new InferredPipeline(transforms2, trainer2); + Assert.AreEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // same transforms with different learners + trainer1 = new SuggestedTrainer(context, new SdcaBinaryExtension()); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension()); + transforms1 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + transforms2 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + inferredPipeline1 = new InferredPipeline(transforms1, trainer1); + inferredPipeline2 = new 
InferredPipeline(transforms2, trainer2); + Assert.AreNotEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + } + } +} diff --git a/src/Test/TrainerExtensionsTests.cs b/src/Test/TrainerExtensionsTests.cs index 63d116f69c6..e00d0752893 100644 --- a/src/Test/TrainerExtensionsTests.cs +++ b/src/Test/TrainerExtensionsTests.cs @@ -63,22 +63,22 @@ public void BuildPipelineNodePropsLightGbm() var expectedJson = @" { - ""NumBoostRound"": 1, + ""NumBoostRound"": 20, ""LearningRate"": 1, ""NumLeaves"": 1, - ""MinDataPerLeaf"": 1, - ""UseSoftmax"": 1, - ""UseCat"": 1, - ""UseMissing"": 1, - ""MinDataPerGroup"": 1, - ""MaxCatThreshold"": 1, - ""CatSmooth"": 1, - ""CatL2"": 1, + ""MinDataPerLeaf"": 10, + ""UseSoftmax"": false, + ""UseCat"": false, + ""UseMissing"": false, + ""MinDataPerGroup"": 50, + ""MaxCatThreshold"": 16, + ""CatSmooth"": 10, + ""CatL2"": 0.5, ""TreeBooster"": { ""Name"": ""Microsoft.ML.LightGBM.TreeBooster"", ""Properties"": { - ""RegLambda"": 1, - ""RegAlpha"": 1 + ""RegLambda"": 0.5, + ""RegAlpha"": 0.5 } } }"; @@ -99,12 +99,12 @@ public void BuildPipelineNodePropsSdca() var sdcaBinaryProps = TrainerExtensionUtil.BuildPipelineNodeProps(TrainerName.SdcaBinary, sweepParams); var expectedJson = @" { - ""L2Const"": 1, - ""L1Threshold"": 1, - ""ConvergenceTolerance"": 1, - ""MaxIterations"": 1, - ""Shuffle"": 1, - ""BiasLearningRate"": 1 + ""L2Const"": 1E-07, + ""L1Threshold"": 0.0, + ""ConvergenceTolerance"": 0.01, + ""MaxIterations"": 10, + ""Shuffle"": true, + ""BiasLearningRate"": 0.01 }"; Util.AssertObjectMatchesJson(expectedJson, sdcaBinaryProps); }