diff --git a/docs/api-reference/io-columns-tree-featurization-binary-classification.md b/docs/api-reference/io-columns-tree-featurization-binary-classification.md new file mode 100644 index 0000000000..1fd9a68a89 --- /dev/null +++ b/docs/api-reference/io-columns-tree-featurization-binary-classification.md @@ -0,0 +1,14 @@ +### Input and Output Columns +The input label column data must be . +The input features column data must be a known-sized vector of . + +This estimator outputs the following columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Trees` | Known-sized vector of | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. | +| `Leaves` | Known-sized vector of | 0-1 vector representation to the IDs of all leaves where the input feature vector falls into. Its size is the number of total leaves in the tree ensemble model. | +| `Paths` | Known-sized vector of | 0-1 vector representation to the paths the input feature vector passed through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. | + +Those output columns are all optional and user can change their names. +Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/io-columns-tree-featurization-ranking.md b/docs/api-reference/io-columns-tree-featurization-ranking.md new file mode 100644 index 0000000000..375ad18f9c --- /dev/null +++ b/docs/api-reference/io-columns-tree-featurization-ranking.md @@ -0,0 +1,20 @@ +### Input and Output Columns +The input label data type must be [key](xref:Microsoft.ML.Data.KeyDataViewType) +type or . The value of the label determines relevance, where +higher values indicate higher relevance. If the label is a +[key](xref:Microsoft.ML.Data.KeyDataViewType) type, then the key index is the +relevance value, where the smallest index is the least relevant. 
If the label is a +, larger values indicate higher relevance. The feature +column must be a known-sized vector of and input row group +column must be [key](xref:Microsoft.ML.Data.KeyDataViewType) type. + +This estimator outputs the following columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Trees` | Known-sized vector of | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. | +| `Leaves` | Known-sized vector of | 0-1 vector representation to the IDs of all leaves where the input feature vector falls into. Its size is the number of total leaves in the tree ensemble model. | +| `Paths` | Known-sized vector of | 0-1 vector representation to the paths the input feature vector passed through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. | + +Those output columns are all optional and user can change their names. +Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/io-columns-tree-featurization-regression.md b/docs/api-reference/io-columns-tree-featurization-regression.md new file mode 100644 index 0000000000..d4acf06f39 --- /dev/null +++ b/docs/api-reference/io-columns-tree-featurization-regression.md @@ -0,0 +1,14 @@ +### Input and Output Columns +The input label column data must be . +The input features column data must be a known-sized vector of . + +This estimator outputs the following columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Trees` | Known-sized vector of | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. | +| `Leaves` | Known-sized vector of | 0-1 vector representation to the IDs of all leaves where the input feature vector falls into. Its size is the number of total leaves in the tree ensemble model. 
| +| `Paths` | Known-sized vector of | 0-1 vector representation to the paths the input feature vector passed through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. | + +Those output columns are all optional and user can change their names. +Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/tree-featurization-prediction.md b/docs/api-reference/tree-featurization-prediction.md new file mode 100644 index 0000000000..6516dfef8c --- /dev/null +++ b/docs/api-reference/tree-featurization-prediction.md @@ -0,0 +1,25 @@ +### Prediction Details +This estimator produces several output columns from a tree ensemble model. Assume that the model contains only one decision tree: + + Node 0 + / \ + / \ + / \ + / \ + Node 1 Node 2 + / \ / \ + / \ / \ + / \ Leaf -3 Node 3 + Leaf -1 Leaf -2 / \ + / \ + Leaf -4 Leaf -5 + +Assume that the input feature vector falls into `Leaf -1`. The output `Trees` may be a 1-element vector where +the only value is the decision value carried by `Leaf -1`. The output `Leaves` is a 0-1 vector. If the reached +leaf is the $i$-th (indexed by $-(i+1)$ so the first leaf is `Leaf -1`) leaf in the tree, the $i$-th value in `Leaves` +would be 1 and all other values would be 0. The output `Paths` is a 0-1 representation of the nodes passed +through before reaching the leaf. The $i$-th element in `Paths` indicates if the $i$-th node (indexed by $i$) is touched. +For example, reaching `Leaf -1` lead to $[1, 1, 0, 0]$ as the `Paths`. If there are multiple trees, this estimator +just concatenates `Trees`'s, `Leaves`'s, `Paths`'s from all trees (first tree's information comes first in the concatenated vectors). + +Check the See Also section for links to usage examples. 
\ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude new file mode 100644 index 0000000000..ec5507c3a7 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude @@ -0,0 +1,110 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +<# if (TrainerOptions != null) { #> +<#=OptionsInclude#> +<# } #> + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class <#=ClassName#> + {<#=Comments#> + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of data points to be transformed. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); +<# if (CacheData) { #> + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); +<# } #> + + // Define input and output columns of tree-based featurizer. 
+ string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new <#=TrainerOptions#>; + + // Define the tree-based featurizer's configuration. + var options = new <#=Options#>; + + // Define the featurizer. + var pipeline = mlContext.Transforms.<#=Trainer#>(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + <#=ExpectedOutput#> + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > <#=LabelThreshold#>; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + // For data points with false label, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + <#=DataSepValue#>).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs new file mode 100644 index 0000000000..6c9b100fa4 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs @@ -0,0 +1,139 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastForestBinaryFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of data points to be transformed. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastForestBinaryTrainer.Options + { + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastForestBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastForestBinary(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1111111,0.8823529]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,0,1,0]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.4545455,0.8]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.4545455,0.1111111]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > 0.5f; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + // For data points with false label, the feature values are slightly increased by adding a constant. 
+ Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + 0.03f).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt new file mode 100644 index 0000000000..a2640ea95e --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt @@ -0,0 +1,49 @@ +<#@ include file="BinaryClassificationFeaturization.ttinclude"#> +<#+ +string ClassName="FastForestBinaryFeaturizationWithOptions"; +string Trainer = "FeaturizeByFastForestBinary"; +bool CacheData = true; +string LabelThreshold = "0.5f"; +string DataSepValue = "0.03f"; +string OptionsInclude = "using Microsoft.ML.Trainers.FastTree;"; +string Comments= @" + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree."; + +string TrainerOptions = @"FastForestBinaryTrainer.Options + { + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. 
+ FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }"; + +string Options = @"FastForestBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1111111,0.8823529]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,0,1,0]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.4545455,0.8]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.4545455,0.1111111]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0]. 
+ // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs new file mode 100644 index 0000000000..ca6c5a27c8 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastForestRegressionFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastForestRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastForestRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastForestRegression(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Create testing data. Use different random seed to make it different from training data. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector[1.543569, 1.494266, 1.284405] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7291142,0.7825329,0.8764582]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]. + // The original feature vector[0.764918, 1.11206, 0.648211] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.3802337,0.584159,0.5648927]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,1,0,0]. + // The original feature vector[1.251254, 1.269456, 1.444864] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7591804,0.7825329,0.7443035]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. 
+ Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 50 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} + diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt new file mode 100644 index 0000000000..1d949629d4 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt @@ -0,0 +1,47 @@ +<#@ include file="RegressionFeaturization.ttinclude"#> + +<#+ +string ClassHeader = @"// This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. "; + +string ClassName="FastForestRegressionFeaturizationWithOptions"; +bool CacheData = true; +string ExtraUsing = "using Microsoft.ML.Trainers.FastTree;"; +string Trainer = @"FeaturizeByFastForestRegression"; +string TrainerOptions = @"FastForestRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. 
+ NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }"; + +string Options = @"FastForestRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector[1.543569, 1.494266, 1.284405] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7291142,0.7825329,0.8764582]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]. + // The original feature vector[0.764918, 1.11206, 0.648211] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.3802337,0.584159,0.5648927]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,1,0,0]. + // The original feature vector[1.251254, 1.269456, 1.444864] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7591804,0.7825329,0.7443035]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1]. 
+ // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs new file mode 100644 index 0000000000..c8c52e1490 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs @@ -0,0 +1,141 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeBinaryFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of data points to be transformed. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeBinaryTrainer.Options + { + // Use L2Norm for early stopping. + EarlyStoppingMetric = EarlyStoppingMetric.L2Norm, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeBinary(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.5714286,0.4636412,0.535588]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,0.535588]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,-0.2184284]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,0,0,0]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > 0.5f; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. 
+ // For data points with false label, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + 0.03f).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt new file mode 100644 index 0000000000..ec055986d5 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt @@ -0,0 +1,57 @@ +<#@ include file="BinaryClassificationFeaturization.ttinclude"#> +<#+ +string ClassName="FastTreeBinaryFeaturizationWithOptions"; +string Trainer = "FeaturizeByFastTreeBinary"; +bool CacheData = true; +string LabelThreshold = "0.5f"; +string DataSepValue = "0.03f"; +string OptionsInclude = "using Microsoft.ML.Trainers.FastTree;"; +string Comments= @" + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree."; +string TrainerOptions = @"FastTreeBinaryTrainer.Options + { + // Use L2Norm for early stopping. 
+ EarlyStoppingMetric = EarlyStoppingMetric.L2Norm, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }"; + +string Options = @"FastTreeBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutputPerInstance= @"// Expected output: + // Label: True, Prediction: True + // Label: False, Prediction: False + // Label: True, Prediction: True + // Label: True, Prediction: True + // Label: False, Prediction: False"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.5714286,0.4636412,0.535588]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,0.535588]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,-0.2184284]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0]. 
+ // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,0,0,0]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs new file mode 100644 index 0000000000..9e525643d3 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs @@ -0,0 +1,141 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeRankingFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeRankingTrainer.Options + { + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeRankingFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeRanking(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [1.117325,1.068023,0.8581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.4095458,0.2061437,0.2364294]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.6588848,1.006027,0.5421779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2543825,-0.06570309,-0.1456212]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,0]. + // The original feature vector [0.6737045,0.6919063,0.8673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2543825,-0.06570309,0.01300209]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. 
+ } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed = 0, int groupSize = 10) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = random.Next(0, 5); + yield return new DataPoint + { + Label = (uint)label, + GroupId = (uint)(i / groupSize), + // Create random features that are correlated with the label. + // For data points with larger labels, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => randomFloat() + x * 0.1f).ToArray() + }; + } + } + + // Example with label, groupId, and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + [KeyType(5)] + public uint Label { get; set; } + [KeyType(100)] + public uint GroupId { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt new file mode 100644 index 0000000000..8be69bf2df --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt @@ -0,0 +1,47 @@ +<#@ include file="RankingFeaturization.ttinclude"#> +<#+ +string ClassName = "FastTreeRankingFeaturizationWithOptions"; +string Trainer = "FeaturizeByFastTreeRanking"; +bool CacheData = true; + +string TrainerOptions = @"FastTreeRankingTrainer.Options + { + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }"; + +string Options = @"FastTreeRankingFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string OptionsInclude = "using Microsoft.ML.Trainers.FastTree;"; + +string Comments= @" + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree."; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [1.117325,1.068023,0.8581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.4095458,0.2061437,0.2364294]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. 
+ // The original feature vector [0.6588848,1.006027,0.5421779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2543825,-0.06570309,-0.1456212]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,0]. + // The original feature vector [0.6737045,0.6919063,0.8673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2543825,-0.06570309,0.01300209]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs new file mode 100644 index 0000000000..c8660e8127 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeRegressionFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. 
+ var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeRegression(options); + + // Train the model. 
+ var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. + for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1507567,0.1372715,0.1019326]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.07604675,0.08244576,0.03080027]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,1]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1507567,0.1090626,0.0731837]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0].
+ // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} + diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt new file mode 100644 index 0000000000..e22153c900 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt @@ -0,0 +1,47 @@ +<#@ include file="RegressionFeaturization.ttinclude"#> + +<#+ +string ClassHeader = @"// This example requires installation of additional NuGet package + // Microsoft.ML.FastTree.
"; + +string ClassName="FastTreeRegressionFeaturizationWithOptions"; +bool CacheData = true; +string ExtraUsing = "using Microsoft.ML.Trainers.FastTree;"; +string Trainer = @"FeaturizeByFastTreeRegression"; +string TrainerOptions = @"FastTreeRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }"; + +string Options = @"FastTreeRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1507567,0.1372715,0.1019326]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.07604675,0.08244576,0.03080027]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,1]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1507567,0.1090626,0.0731837]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0]. 
+ // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs new file mode 100644 index 0000000000..b6624560a3 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeTweedieFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeTweedieTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeTweedieFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeTweedie(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points.
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.05652997,-0.02312196,-0.01179363]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,0,0,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.1933938,-0.1042738,-0.2312837]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.05652997,-0.06082304,-0.04528879]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,1,0,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. 
+ Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} + diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt new file mode 100644 index 0000000000..a075887d1e --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt @@ -0,0 +1,54 @@ +<#@ include file="RegressionFeaturization.ttinclude"#> + +<#+ +string ClassHeader = @"// This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. "; + +string ClassName="FastTreeTweedieFeaturizationWithOptions"; +bool CacheData = true; +string ExtraUsing = "using Microsoft.ML.Trainers.FastTree;"; +string Trainer = @"FeaturizeByFastTreeTweedie"; +string TrainerOptions = @"FastTreeTweedieTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree.
+ NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }"; + +string Options = @"FastTreeTweedieFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutputPerInstance= @"// Expected output: + // Label: 0.985, Prediction: 0.866 + // Label: 0.155, Prediction: 0.171 + // Label: 0.515, Prediction: 0.470 + // Label: 0.566, Prediction: 0.476 + // Label: 0.096, Prediction: 0.140"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.05652997,-0.02312196,-0.01179363]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,0,0,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.1933938,-0.1042738,-0.2312837]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.05652997,-0.06082304,-0.04528879]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0]. 
+ // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,1,0,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs new file mode 100644 index 0000000000..f96e595a63 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs @@ -0,0 +1,148 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class PretrainedTreeEnsembleFeaturizationWithOptions + { + public static void Example() + { + // Create data set + int dataPointCount = 200; + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(dataPointCount).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. 
+ var trainer = mlContext.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1, + FeatureColumnName = featureColumnName, + LabelColumnName = labelColumnName + }); + + // Train the defined tree model. + var model = trainer.Fit(dataView); + var predicted = model.Transform(dataView); + + // Define the configuration of tree-based featurizer. + var options = new PretrainedTreeFeaturizationEstimator.Options() + { + InputColumnName = featureColumnName, + ModelParameters = model.Model.SubModel, // Pretrained tree model. + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName + }; + + // Fit the created featurizer. It doesn't perform actual training because a pretrained model is provided. + var treeFeaturizer = mlContext.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector[0.8173254, 0.7680227, 0.5581612] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.4172185]. + // Leave IDs' 0-1 representation: [1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1]. + // The original feature vector[0.7588848, 1.106027, 0.6421779] is transformed to three different tree - based feature vectors: + // Trees' output values: [-1]. + // Leave IDs' 0-1 representation: [0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,0]. + // The original feature vector[0.2737045, 0.2919063, 0.4673147] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.4172185]. + // Leave IDs' 0-1 representation: [1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1]. + // + // Note that the trained model contains only one tree. + // + // Node 0 + // / \ + // / Leaf -2 + // Node 1 + // / \ + // / Leaf -3 + // Node 2 + // / \ + // / Leaf -4 + // Leaf -1 + // + // Thus, if a data point reaches Leaf indexed by -1, its 0-1 path representation may be [1,1,1] because that data point + // went through all Node 0, Node 1, and Node 2. 
+ + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > 0.5; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + // For data points with false label, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + 0.2f).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude new file mode 100644 index 0000000000..16d6858c91 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude @@ -0,0 +1,114 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +<# if (TrainerOptions != null) { #> +<#=OptionsInclude#> +<# } #> + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class <#=ClassName#> + {<#=Comments#> + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); +<# if (CacheData) { #> + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); +<# } #> + + // Define input and output columns of tree-based featurizer. 
+ string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new <#=TrainerOptions#>; + + // Define the tree-based featurizer's configuration. + var options = new <#=Options#>; + + // Define the featurizer. + var pipeline = mlContext.Transforms.<#=Trainer#>(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + <#=ExpectedOutput#> + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed = 0, int groupSize = 10) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = random.Next(0, 5); + yield return new DataPoint + { + Label = (uint)label, + GroupId = (uint)(i / groupSize), + // Create random features that are correlated with the label. + // For data points with larger labels, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => randomFloat() + x * 0.1f).ToArray() + }; + } + } + + // Example with label, groupId, and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + [KeyType(5)] + public uint Label { get; set; } + [KeyType(100)] + public uint GroupId { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude new file mode 100644 index 0000000000..28ee91ffaf --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude @@ -0,0 +1,111 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +<# if (ExtraUsing != null) { #> +<#=ExtraUsing#> +<# } #> + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class <#=ClassName#> + { +<# if (ClassHeader != null) { #> + <#=ClassHeader#> +<# } #> + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(100).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); +<# if (CacheData) { #> + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); +<# } #> + + // Define input and output columns of tree-based featurizer. 
+ string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new <#=TrainerOptions#>; + + // Define the tree-based featurizer's configuration. + var options = new <#=Options#>; + + // Define the featurizer. + var pipeline = mlContext.Transforms.<#=Trainer#>(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Create testing data. Use different random seed to make it different from training data. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + <#=ExpectedOutput#> + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 50 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index b78cf41bfa..6793313c63 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -448,6 +448,30 @@ TextTemplatingFileGenerator SdcaWithOptions.cs + + TextTemplatingFileGenerator + FastForestBinaryFeaturizationWithOptions.cs + + + TextTemplatingFileGenerator + FastForestRegressionFeaturizationWithOptions.cs + + + TextTemplatingFileGenerator + FastTreeBinaryFeaturizationWithOptions.cs + + + TextTemplatingFileGenerator + FastTreeRankingFeaturizationWithOptions.cs + + + TextTemplatingFileGenerator + FastTreeRegressionFeaturizationWithOptions.cs + + + TextTemplatingFileGenerator + FastTreeTweedieFeaturizationWithOptions.cs + @@ -895,6 +919,36 @@ True SdcaWithOptions.tt + + True + True + FastForestBinaryFeaturizationWithOptions.tt + + + FastForestRegressionFeaturizationWithOptions.tt + True + True + + + FastTreeBinaryFeaturizationWithOptions.tt + True + True + + + True + True + FastTreeRankingFeaturizationWithOptions.tt + + + True + True + FastTreeRegressionFeaturizationWithOptions.tt + + + True + True + FastTreeTweedieFeaturizationWithOptions.tt + diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs new file mode 100644 index 0000000000..c7248fe3c1 --- /dev/null +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -0,0 +1,464 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Linq; +using Microsoft.ML.Data; +using Microsoft.ML.Runtime; + +namespace Microsoft.ML.Trainers.FastTree +{ + /// + /// This class encapsulates the common behavior of all tree-based featurizers such as , + /// , , + /// , and . + /// All tree-based featurizers share the same output schema computed by . All tree-based featurizers + /// requires an input feature column name and a suffix for all output columns. The returned by + /// produces three columns: (1) the prediction values of all trees, (2) the IDs of leaves the input feature vector falling into, and (3) + /// the binary vector which encodes the paths to those destination leaves. + /// + public abstract class TreeEnsembleFeaturizationEstimatorBase : IEstimator + { + /// + /// The common options of tree-based featurizations such as , , + /// , , and . + /// + public abstract class OptionsBase + { + /// + /// The name of feature column in the when calling . + /// The column type must be a vector of . The column called would be mapped + /// to columns called , , and in the output + /// of and its derived classes. Note that is not + /// necessary to be the same as the feature column used to train the underlying tree model. + /// + public string InputColumnName; + + /// + /// The name of the column that stores the prediction values of all trees. Its type is a vector of + /// and the i-th vector element is the prediction value predicted by the i-th tree. + /// If is , this output column may not be generated. + /// + public string TreesColumnName; + + /// + /// The 0-1 encoding of all leaf nodes' IDs. Its type is a vector of . If the given feature + /// vector falls into the first leaf of the first tree, the first element in the 0-1 encoding would be 1. + /// If is , this output column may not be generated. + /// + public string LeavesColumnName; + + /// + /// The 0-1 encoding of the paths to the leaves. 
If the path to the first tree's leaf is node 1 (2nd node in the first tree), + /// node 3 (4th node in the first tree), and node 5 (6th node in the first tree), the 2nd, 4th, and 6th element in that encoding + /// would be 1. + /// If is , this output column may not be generated. + /// + public string PathsColumnName; + }; + + /// + /// See . + /// + private protected readonly string FeatureColumnName; + + /// + /// See . + /// + private protected readonly string TreesColumnName; + + /// + /// See . + /// + private protected readonly string LeavesColumnName; + + /// + /// See . + /// + private protected readonly string PathsColumnName; + + /// + /// Environment of this instance. It controls error throwing and other environment settings. + /// + private protected readonly IHostEnvironment Env; + + private protected TreeEnsembleFeaturizationEstimatorBase(IHostEnvironment env, OptionsBase options) + { + Env = env; + if (options.InputColumnName == null) + throw Env.Except(nameof(options), "The " + nameof(options.InputColumnName) + " cannot be null."); + if (options.TreesColumnName == null && options.LeavesColumnName == null && options.PathsColumnName == null) + throw Env.Except($"{nameof(OptionsBase.TreesColumnName)}, {nameof(OptionsBase.LeavesColumnName)}, and {nameof(OptionsBase.PathsColumnName)} cannot be all null at the same time. " + + $"At least one output column name should be provided so at least one output column may be generated."); + + FeatureColumnName = options.InputColumnName; + TreesColumnName = options.TreesColumnName; + LeavesColumnName = options.LeavesColumnName; + PathsColumnName = options.PathsColumnName; + } + + /// + /// All derived class should implement to tell how to get a + /// out from and parameters inside this or derived classes. + /// + /// Data used to train a tree model. + /// The trees used in . 
+ private protected abstract TreeEnsembleModelParameters PrepareModel(IDataView input); + + /// + /// Produce a which maps the column called in + /// to three output columns. + /// + public TreeEnsembleFeaturizationTransformer Fit(IDataView input) + { + var model = PrepareModel(input); + return new TreeEnsembleFeaturizationTransformer(Env, input.Schema, input.Schema[FeatureColumnName], model, + TreesColumnName, LeavesColumnName, PathsColumnName); + } + + /// + /// adds three float-vector columns into . + /// Given a feature vector column, the added columns are the prediction values of all trees, the leaf IDs the feature + /// vector falls into, and the paths to those leaves. + /// + /// A schema which contains a feature column. Note that feature column name can be specified + /// by . + /// Output produced by . + public SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + Env.CheckValue(inputSchema, nameof(inputSchema)); + + if (!inputSchema.TryFindColumn(FeatureColumnName, out var col)) + throw Env.ExceptSchemaMismatch(nameof(inputSchema), "input", FeatureColumnName); + + var result = inputSchema.ToDictionary(x => x.Name); + + if (TreesColumnName != null) + result[TreesColumnName] = new SchemaShape.Column(TreesColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + + if (LeavesColumnName != null) + result[LeavesColumnName] = new SchemaShape.Column(LeavesColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + + if (PathsColumnName != null) + result[PathsColumnName] = new SchemaShape.Column(PathsColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + + return new SchemaShape(result.Values); + } + } + /// + /// A which contains a pre-trained and calling its + /// produces a featurizer based on the pre-trained model. + /// + /// + /// . + /// The input features column data must be a known-sized vector of. 
+ /// + /// This estimator outputs the following columns: + /// + /// | Output Column Name | Column Type | Description| + /// | -- | -- | -- | + /// | `Trees` | Vector of | The output values of all trees. | + /// | `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | + /// | `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | + /// + /// Those output columns are all optional and user can change their names. + /// Please set the names of skipped columns to null so that they would not be produced. + /// + /// [!include[algorithm](~/../docs/samples/docs/api-reference/tree-featurization-prediction.md)] + /// ]]> + /// + /// + /// + public sealed class PretrainedTreeFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase + { + /// + /// of as + /// used when calling . + /// + public sealed class Options : OptionsBase + { + /// + /// The pretrained tree model used to do tree-based featurization. Note that contains a collection of decision trees. + /// + public TreeEnsembleModelParameters ModelParameters; + }; + + private TreeEnsembleModelParameters _modelParameters; + + internal PretrainedTreeFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) + { + _modelParameters = options.ModelParameters; + } + + /// + /// Produce the for tree-based feature engineering. This function does not + /// invoke training procedure and just returns the pre-trained model passed in via . + /// + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) => _modelParameters; + } + + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// + public sealed class FastTreeBinaryFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase + { + private readonly FastTreeBinaryTrainer.Options _trainerOptions; + + /// + /// Options for the . 
+ /// + public sealed class Options : OptionsBase + { + /// + /// The configuration of used to train the underlying . + /// + public FastTreeBinaryTrainer.Options TrainerOptions; + } + + internal FastTreeBinaryFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastTreeBinaryTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model.SubModel; + } + } + + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// + public sealed class FastTreeRegressionFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase + { + private readonly FastTreeRegressionTrainer.Options _trainerOptions; + + /// + /// Options for the . + /// + public sealed class Options : OptionsBase + { + /// + /// The configuration of used to train the underlying . + /// + public FastTreeRegressionTrainer.Options TrainerOptions; + } + + internal FastTreeRegressionFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastTreeRegressionTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } + + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// + public sealed class FastForestBinaryFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase + { + private readonly FastForestBinaryTrainer.Options _trainerOptions; + + /// + /// Options for the . + /// + public sealed class Options : OptionsBase + { + /// + /// The configuration of used to train the underlying . 
+ /// + public FastForestBinaryTrainer.Options TrainerOptions; + } + + internal FastForestBinaryFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastForestBinaryTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } + + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// + public sealed class FastForestRegressionFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase + { + private readonly FastForestRegressionTrainer.Options _trainerOptions; + + /// + /// Options for the . + /// + public sealed class Options : OptionsBase + { + /// + /// The configuration of used to train the underlying . + /// + public FastForestRegressionTrainer.Options TrainerOptions; + } + + internal FastForestRegressionFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastForestRegressionTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } + + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// + public sealed class FastTreeRankingFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase + { + private readonly FastTreeRankingTrainer.Options _trainerOptions; + + /// + /// Options for the . + /// + public sealed class Options : OptionsBase + { + /// + /// The configuration of used to train the underlying . 
+ /// + public FastTreeRankingTrainer.Options TrainerOptions; + } + + internal FastTreeRankingFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastTreeRankingTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } + + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// + public sealed class FastTreeTweedieFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase + { + private readonly FastTreeTweedieTrainer.Options _trainerOptions; + + /// + /// Options for the . + /// + public sealed class Options : OptionsBase + { + /// + /// The configuration of used to train the underlying . + /// + public FastTreeTweedieTrainer.Options TrainerOptions; + } + + internal FastTreeTweedieFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastTreeTweedieTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } +} diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs new file mode 100644 index 0000000000..d30337b07e --- /dev/null +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -0,0 +1,184 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Data.IO; +using Microsoft.ML.Runtime; +using Microsoft.ML.Trainers.FastTree; + +[assembly: LoadableClass(typeof(TreeEnsembleFeaturizationTransformer), typeof(TreeEnsembleFeaturizationTransformer), + null, typeof(SignatureLoadModel), "", TreeEnsembleFeaturizationTransformer.LoaderSignature)] + +namespace Microsoft.ML.Trainers.FastTree +{ + /// + /// resulting from fitting any derived class of . + /// The derived classes include, for example, and + /// . + /// + public sealed class TreeEnsembleFeaturizationTransformer : PredictionTransformerBase + { + internal const string LoaderSignature = "TreeEnseFeat"; + private readonly TreeEnsembleFeaturizerBindableMapper.Arguments _scorerArgs; + private readonly DataViewSchema.DetachedColumn _featureDetachedColumn; + /// + /// See . + /// + private readonly string _treesColumnName; + /// + /// See . + /// + private readonly string _leavesColumnName; + /// + /// See . + /// + private readonly string _pathsColumnName; + /// + /// Check if is compatible with . + /// + /// A column checked against . + private void CheckFeatureColumnCompatibility(DataViewSchema.Column inspectedFeatureColumn) + { + string nameErrorMessage = $"The column called {inspectedFeatureColumn.Name} does not match the expected " + + $"feature column with name {_featureDetachedColumn.Name} and type {_featureDetachedColumn.Type}. " + + $"Please rename your column by calling CopyColumns defined in TransformExtensionsCatalog"; + // Check if column names are the same. + Host.Check(_featureDetachedColumn.Name == inspectedFeatureColumn.Name, nameErrorMessage); + + string typeErrorMessage = $"The column called {inspectedFeatureColumn.Name} has a type {inspectedFeatureColumn.Type}, " + + $"which does not match the expected feature column with name {_featureDetachedColumn.Name} and type {_featureDetachedColumn.Type}. 
" + + $"Please make sure your feature column type is {_featureDetachedColumn.Type}."; + // Check if column types are identical. + Host.Check(_featureDetachedColumn.Type.Equals(inspectedFeatureColumn.Type), typeErrorMessage); + } + + /// + /// Create from by using as the feature role. + /// + /// The original schema to be mapped. + private RoleMappedSchema MakeFeatureRoleMappedSchema(DataViewSchema schema) + { + var roles = new List>(); + roles.Add(new KeyValuePair(RoleMappedSchema.ColumnRole.Feature, _featureDetachedColumn.Name)); + return new RoleMappedSchema(schema, roles); + } + + internal TreeEnsembleFeaturizationTransformer(IHostEnvironment env, DataViewSchema inputSchema, + DataViewSchema.Column featureColumn, TreeEnsembleModelParameters modelParameters, + string treesColumnName, string leavesColumnName, string pathsColumnName) : + base(Contracts.CheckRef(env, nameof(env)).Register(nameof(TreeEnsembleFeaturizationTransformer)), modelParameters, inputSchema) + { + // Store featureColumn as a detached column because a fitted transformer can be applied to different IDataViews and different + // IDataView may have different schemas. + _featureDetachedColumn = new DataViewSchema.DetachedColumn(featureColumn); + + // Check if featureColumn matches a column in inputSchema. The answer is yes if they have the same name and type. + // The indexed column, inputSchema[featureColumn.Index], should match the detached column, _featureDetachedColumn. + CheckFeatureColumnCompatibility(inputSchema[featureColumn.Index]); + + // Store output column names so that this transformer can be saved into a file later. + _treesColumnName = treesColumnName; + _leavesColumnName = leavesColumnName; + _pathsColumnName = pathsColumnName; + + // Create an argument, _scorerArgs, to pass the output column names to the underlying scorer. 
+ _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { + TreesColumnName = _treesColumnName, LeavesColumnName = _leavesColumnName, PathsColumnName = _pathsColumnName }; + + // Create a bindable mapper. It provides the core computation and can be attached to any IDataView and produce + // a transformed IDataView. + BindableMapper = new TreeEnsembleFeaturizerBindableMapper(env, _scorerArgs, modelParameters); + + // Create a scorer. + var roleMappedSchema = MakeFeatureRoleMappedSchema(inputSchema); + Scorer = new GenericScorer(Host, _scorerArgs, new EmptyDataView(Host, inputSchema), BindableMapper.Bind(Host, roleMappedSchema), roleMappedSchema); + } + + private TreeEnsembleFeaturizationTransformer(IHostEnvironment host, ModelLoadContext ctx) + : base(Contracts.CheckRef(host, nameof(host)).Register(nameof(TreeEnsembleFeaturizationTransformer)), ctx) + { + // *** Binary format *** + // + // string: feature column's name. + // string: the name of the column where tree prediction values are stored. + // string: the name of the column where trees' leaves are stored. + // string: the name of the column where trees' paths are stored. + + // Load stored fields. + string featureColumnName = ctx.LoadString(); + _featureDetachedColumn = new DataViewSchema.DetachedColumn(TrainSchema[featureColumnName]); + _treesColumnName = ctx.LoadStringOrNull(); + _leavesColumnName = ctx.LoadStringOrNull(); + _pathsColumnName = ctx.LoadStringOrNull(); + + // Create an argument to specify output columns' names of this transformer. + _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { + TreesColumnName = _treesColumnName, LeavesColumnName = _leavesColumnName, PathsColumnName = _pathsColumnName }; + + // Create a bindable mapper. It provides the core computation and can be attached to any IDataView and produce + // a transformed IDataView. + BindableMapper = new TreeEnsembleFeaturizerBindableMapper(host, _scorerArgs, Model); + + // Create a scorer.
+ var roleMappedSchema = MakeFeatureRoleMappedSchema(TrainSchema); + Scorer = new GenericScorer(Host, _scorerArgs, new EmptyDataView(Host, TrainSchema), BindableMapper.Bind(Host, roleMappedSchema), roleMappedSchema); + } + + /// + /// appends three columns to the . + /// The three columns are all vectors. The first column stores the prediction values of all trees and + /// its default name is "Trees". The second column (default name: "Leaves") contains leaf IDs that the given feature vector falls into. + /// The third column (default name: "Paths") encodes the paths to those leaves via a 0-1 vector. + /// + /// of the data to be transformed. + /// of the transformed data if the input schema is . + public override DataViewSchema GetOutputSchema(DataViewSchema inputSchema) => Transform(new EmptyDataView(Host, inputSchema)).Schema; + + private protected override void SaveModel(ModelSaveContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // *** Binary format *** + // model: prediction model. + // stream: empty data view that contains train schema. + // string: feature column name. + // string: the name of the column where tree prediction values are stored. + // string: the name of the column where trees' leaves are stored. + // string: the name of the column where trees' paths are stored.
+ + ctx.SaveModel(Model, DirModel); + ctx.SaveBinaryStream(DirTransSchema, writer => + { + using (var ch = Host.Start("Saving train schema")) + { + var saver = new BinarySaver(Host, new BinarySaver.Arguments { Silent = true }); + DataSaverUtils.SaveDataView(ch, saver, new EmptyDataView(Host, TrainSchema), writer.BaseStream); + } + }); + + ctx.SaveString(_featureDetachedColumn.Name); + ctx.SaveStringOrNull(_treesColumnName); + ctx.SaveStringOrNull(_leavesColumnName); + ctx.SaveStringOrNull(_pathsColumnName); + } + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "TREEFEAT", // "TREE" ensemble "FEAT"urizer. + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature, + loaderAssemblyName: typeof(TreeEnsembleFeaturizationTransformer).Assembly.FullName); + } + + private static TreeEnsembleFeaturizationTransformer Create(IHostEnvironment env, ModelLoadContext ctx) + => new TreeEnsembleFeaturizationTransformer(env, ctx); + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index 38cbeda4f2..bbb69aa222 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -44,45 +44,65 @@ namespace Microsoft.ML.Data /// internal sealed class TreeEnsembleFeaturizerBindableMapper : ISchemaBindableMapper, ICanSaveModel { - public static class OutputColumnNames - { - public const string Trees = "Trees"; - public const string Paths = "Paths"; - public const string Leaves = "Leaves"; - } - + /// + /// In addition to options inherited from , + /// adds output columns' names of tree-based featurizer. + /// public sealed class Arguments : ScorerArgumentsBase - { - } - - private sealed class BoundMapper : ISchemaBoundRowMapper { /// - /// Column index of values predicted by all trees in an ensemble in . 
+ /// See . /// - private const int TreeValuesColumnId = 0; + public string TreesColumnName; /// - /// Column index of leaf IDs containing the considered example in . + /// See . /// - private const int LeafIdsColumnId = 1; + public string LeavesColumnName; /// - /// Column index of path IDs which specify the paths the considered example passing through per tree in . + /// See . /// - private const int PathIdsColumnId = 2; - - private readonly TreeEnsembleFeaturizerBindableMapper _owner; - private readonly IExceptionContext _ectx; + public string PathsColumnName; + } + private sealed class BoundMapper : ISchemaBoundRowMapper + { public RoleMappedSchema InputRoleMappedSchema { get; } - public DataViewSchema InputSchema => InputRoleMappedSchema.Schema; public DataViewSchema OutputSchema { get; } + public ISchemaBindableMapper Bindable => _owner; + + private readonly TreeEnsembleFeaturizerBindableMapper _owner; + private readonly IExceptionContext _ectx; + + /// + /// Feature vector to be mapped to tree-based features. + /// private DataViewSchema.Column FeatureColumn => InputRoleMappedSchema.Feature.Value; - public ISchemaBindableMapper Bindable => _owner; + /// + /// The name of the column that stores the prediction values of all trees. Its type is a vector of + /// and the i-th vector element is the prediction value predicted by the i-th tree. + /// If is , this output column may not be generated. + /// + private string _treesColumnName; + + /// + /// The 0-1 encoding of all leaf nodes' IDs. Its type is a vector of . If the given feature + /// vector falls into the first leaf of the first tree, the first element in the 0-1 encoding would be 1. + /// If is , this output column may not be generated. + /// + private string _leavesColumnName; - public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper owner, - RoleMappedSchema schema) + /// + /// The 0-1 encoding of the paths to the leaves. 
If the path to the first tree's leaf is node 1 (2nd node in the first tree), + /// node 3 (4th node in the first tree), and node 5 (6th node in the first tree), the 2nd, 4th, and 6th element in that encoding + /// would be 1. + /// If is , this output column may not be generated. + /// + private string _pathsColumnName; + + public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper owner, RoleMappedSchema schema, + string treesColumnName, string leavesColumnName, string pathsColumnName) { Contracts.AssertValue(ectx); ectx.AssertValue(owner); @@ -111,37 +131,45 @@ public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper // Start creating output schema with types derived above. var schemaBuilder = new DataViewSchema.Builder(); - // Metadata of tree values. - var treeIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); - treeIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(treeValueType.Size), - (ValueGetter>>)owner.GetTreeSlotNames); - // Add the column of trees' output values - schemaBuilder.AddColumn(OutputColumnNames.Trees, treeValueType, treeIdMetadataBuilder.ToAnnotations()); - - // Metadata of leaf IDs. - var leafIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); - leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(leafIdType.Size), - (ValueGetter>>)owner.GetLeafSlotNames); - leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); - // Add the column of leaves' IDs where the input example reaches. - schemaBuilder.AddColumn(OutputColumnNames.Leaves, leafIdType, leafIdMetadataBuilder.ToAnnotations()); - - // Metadata of path IDs. 
- var pathIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); - pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(pathIdType.Size), - (ValueGetter>>)owner.GetPathSlotNames); - pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); - // Add the column of encoded paths which the input example passes. - schemaBuilder.AddColumn(OutputColumnNames.Paths, pathIdType, pathIdMetadataBuilder.ToAnnotations()); + _treesColumnName = treesColumnName; + if (treesColumnName != null) + { + // Metadata of tree values. + var treeIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); + treeIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(treeValueType.Size), + (ValueGetter>>)owner.GetTreeSlotNames); - OutputSchema = schemaBuilder.ToSchema(); + // Add the column of trees' output values + schemaBuilder.AddColumn(treesColumnName, treeValueType, treeIdMetadataBuilder.ToAnnotations()); + } - // Tree values must be the first output column. - Contracts.Assert(OutputSchema[OutputColumnNames.Trees].Index == TreeValuesColumnId); - // leaf IDs must be the second output column. - Contracts.Assert(OutputSchema[OutputColumnNames.Leaves].Index == LeafIdsColumnId); - // Path IDs must be the third output column. - Contracts.Assert(OutputSchema[OutputColumnNames.Paths].Index == PathIdsColumnId); + _leavesColumnName = leavesColumnName; + if (leavesColumnName != null) + { + // Metadata of leaf IDs. + var leafIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); + leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(leafIdType.Size), + (ValueGetter>>)owner.GetLeafSlotNames); + leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); + + // Add the column of leaves' IDs where the input example reaches. 
+ schemaBuilder.AddColumn(leavesColumnName, leafIdType, leafIdMetadataBuilder.ToAnnotations()); + } + + _pathsColumnName = pathsColumnName; + if (pathsColumnName != null) + { + // Metadata of path IDs. + var pathIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); + pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(pathIdType.Size), + (ValueGetter>>)owner.GetPathSlotNames); + pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); + + // Add the column of encoded paths which the input example passes. + schemaBuilder.AddColumn(pathsColumnName, pathIdType, pathIdMetadataBuilder.ToAnnotations()); + } + + OutputSchema = schemaBuilder.ToSchema(); } DataViewRow ISchemaBoundRowMapper.GetRow(DataViewRow input, IEnumerable activeColumns) @@ -156,40 +184,41 @@ private Delegate[] CreateGetters(DataViewRow input, IEnumerable(); var activeIndices = activeColumns.Select(c => c.Index); - var treeValueActive = activeIndices.Contains(TreeValuesColumnId); - var leafIdActive = activeIndices.Contains(LeafIdsColumnId); - var pathIdActive = activeIndices.Contains(PathIdsColumnId); - - if (!treeValueActive && !leafIdActive && !pathIdActive) - return delegates; - var state = new State(_ectx, input, _owner._ensemble, _owner._totalLeafCount, FeatureColumn.Index); // Get the tree value getter. - if (treeValueActive) + if (_treesColumnName != null) { ValueGetter> fn = state.GetTreeValues; - delegates[TreeValuesColumnId] = fn; + if(activeIndices.Contains(OutputSchema[_treesColumnName].Index)) + delegates.Add(fn); + else + delegates.Add(null); } // Get the leaf indicator getter. - if (leafIdActive) + if (_leavesColumnName != null ) { ValueGetter> fn = state.GetLeafIds; - delegates[LeafIdsColumnId] = fn; + if (activeIndices.Contains(OutputSchema[_leavesColumnName].Index)) + delegates.Add(fn); + else + delegates.Add(null); } // Get the path indicators getter. 
- if (pathIdActive) + if (_pathsColumnName != null) { ValueGetter> fn = state.GetPathIds; - delegates[PathIdsColumnId] = fn; + if (activeIndices.Contains(OutputSchema[_pathsColumnName].Index)) + delegates.Add(fn); + else + delegates.Add(null); } - return delegates; + return delegates.ToArray(); } private sealed class State @@ -350,9 +379,10 @@ private static VersionInfo GetVersionInfo() return new VersionInfo( modelSignature: "TREEMAPR", // verWrittenCur: 0x00010001, // Initial - verWrittenCur: 0x00010002, // Add _defaultValueForMissing - verReadableCur: 0x00010002, - verWeCanReadBack: 0x00010001, + // verWrittenCur: 0x00010002, // Add _defaultValueForMissing + verWrittenCur: 0x00010003, // Add output column names (_treesColumnName, _leavesColumnName, _pathsColumnName) + verReadableCur: 0x00010003, + verWeCanReadBack: 0x00010002, loaderSignature: LoaderSignature, loaderAssemblyName: typeof(TreeEnsembleFeaturizerBindableMapper).Assembly.FullName); } @@ -360,6 +390,9 @@ private static VersionInfo GetVersionInfo() private readonly IHost _host; private readonly TreeEnsembleModelParameters _ensemble; private readonly int _totalLeafCount; + private readonly string _treesColumnName; + private readonly string _leavesColumnName; + private readonly string _pathsColumnName; public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, Arguments args, IPredictor predictor) { @@ -368,6 +401,11 @@ public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, Arguments args _host.CheckValue(args, nameof(args)); _host.CheckValue(predictor, nameof(predictor)); + // Store output columns specified by the user. + _treesColumnName = args.TreesColumnName; + _leavesColumnName = args.LeavesColumnName; + _pathsColumnName = args.PathsColumnName; + // This function accepts models trained by FastTreeTrainer family. There are four types that "predictor" can be. // 1. CalibratedPredictorBase // 2. 
FastTreeRankingModelParameters @@ -387,12 +425,19 @@ public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, ModelLoadConte Contracts.CheckValue(env, nameof(env)); _host = env.Register(LoaderSignature); _host.AssertValue(ctx); + ctx.CheckAtModel(GetVersionInfo()); // *** Binary format *** // ensemble + // string: treesColumnName + // string: leavesColumnName + // string: pathsColumnName ctx.LoadModel(env, out _ensemble, "Ensemble"); _totalLeafCount = CountLeaves(_ensemble); + _treesColumnName = ctx.LoadStringOrNull(); + _leavesColumnName = ctx.LoadStringOrNull(); + _pathsColumnName = ctx.LoadStringOrNull(); } void ICanSaveModel.Save(ModelSaveContext ctx) @@ -403,9 +448,15 @@ void ICanSaveModel.Save(ModelSaveContext ctx) // *** Binary format *** // ensemble + // string: treesColumnName + // string: leavesColumnName + // string: pathsColumnName _host.AssertValue(_ensemble); ctx.SaveModel(_ensemble, "Ensemble"); + ctx.SaveStringOrNull(_treesColumnName); + ctx.SaveStringOrNull(_leavesColumnName); + ctx.SaveStringOrNull(_pathsColumnName); } private static int CountLeaves(TreeEnsembleModelParameters ensemble) @@ -474,7 +525,7 @@ ISchemaBoundMapper ISchemaBindableMapper.Bind(IHostEnvironment env, RoleMappedSc env.AssertValue(schema); env.CheckParam(schema.Feature != null, nameof(schema), "Need a feature column"); - return new BoundMapper(env, this, schema); + return new BoundMapper(env, this, schema, _treesColumnName, _leavesColumnName, _pathsColumnName); } } @@ -572,7 +623,8 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData IDataTransform xf; using (var ch = host.Start("Create Tree Ensemble Scorer")) { - var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments() { Suffix = args.Suffix }; + var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments() { + Suffix = args.Suffix, TreesColumnName = "Trees", LeavesColumnName = "Leaves", PathsColumnName = "Paths" }; if 
(!string.IsNullOrWhiteSpace(args.TrainedModelFile)) { if (args.Trainer != null) @@ -644,7 +696,8 @@ public static IDataTransform CreateForEntryPoint(IHostEnvironment env, Arguments using (var ch = host.Start("Create Tree Ensemble Scorer")) { - var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments() { Suffix = args.Suffix }; + var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments() { + Suffix = args.Suffix, TreesColumnName = "Trees", LeavesColumnName = "Leaves", PathsColumnName = "Paths" }; var predictor = args.PredictorModel.Predictor; ch.Trace("Prepare data"); RoleMappedData data = null; diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 3909bca2ff..cdd4f5f87f 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -9,9 +9,9 @@ namespace Microsoft.ML { /// - /// Collection of extension methods used by , - /// , , - /// and to create instances of decision tree trainers. + /// Collection of extension methods used by , , + /// , , and + /// to create instances of decision tree trainers and featurizers. /// public static class TreeExtensions { @@ -427,7 +427,6 @@ public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalo /// ]]> /// /// - public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, FastForestBinaryTrainer.Options options) { @@ -437,5 +436,152 @@ public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalo var env = CatalogUtils.GetEnvironment(catalog); return new FastForestBinaryTrainer(env, options); } + + /// + /// Create , which produces tree-based features given a . + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. 
+ /// + /// + /// + /// + /// + public static PretrainedTreeFeaturizationEstimator FeaturizeByPretrainTreeEnsemble(this TransformsCatalog catalog, + PretrainedTreeFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new PretrainedTreeFeaturizationEstimator(env, options); + } + + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + /// + /// + /// + /// + /// + public static FastForestRegressionFeaturizationEstimator FeaturizeByFastForestRegression(this TransformsCatalog catalog, + FastForestRegressionFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastForestRegressionFeaturizationEstimator(env, options); + } + + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + /// + /// + /// + /// + /// + public static FastTreeRegressionFeaturizationEstimator FeaturizeByFastTreeRegression(this TransformsCatalog catalog, + FastTreeRegressionFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeRegressionFeaturizationEstimator(env, options); + } + + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. 
+ /// + /// + /// + /// + /// + public static FastForestBinaryFeaturizationEstimator FeaturizeByFastForestBinary(this TransformsCatalog catalog, + FastForestBinaryFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastForestBinaryFeaturizationEstimator(env, options); + } + + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + /// + /// + /// + /// + /// + public static FastTreeBinaryFeaturizationEstimator FeaturizeByFastTreeBinary(this TransformsCatalog catalog, + FastTreeBinaryFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeBinaryFeaturizationEstimator(env, options); + } + + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + /// + /// + /// + /// + /// + public static FastTreeRankingFeaturizationEstimator FeaturizeByFastTreeRanking(this TransformsCatalog catalog, + FastTreeRankingFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeRankingFeaturizationEstimator(env, options); + } + + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. 
+ /// + /// + /// + /// + /// + public static FastTreeTweedieFeaturizationEstimator FeaturizeByFastTreeTweedie(this TransformsCatalog catalog, + FastTreeTweedieFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeTweedieFeaturizationEstimator(env, options); + } } } diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 188a9ced46..ea8026f99e 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -3,6 +3,8 @@ // See the LICENSE file in the project root for more information. using System; +using System.Collections.Generic; +using System.IO; using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Trainers.FastTree; @@ -32,7 +34,12 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() var model = trainer.Fit(dataView); // From the trained tree model, a mapper of tree featurizer is created. - var treeFeaturizer = new TreeEnsembleFeaturizerBindableMapper(Env, new TreeEnsembleFeaturizerBindableMapper.Arguments(), model.Model); + const string treesColumnName = "MyTrees"; + const string leavesColumnName = "MyLeaves"; + const string pathsColumnName = "MyPaths"; + var args = new TreeEnsembleFeaturizerBindableMapper.Arguments() { + TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName }; + var treeFeaturizer = new TreeEnsembleFeaturizerBindableMapper(Env, args, model.Model); // To get output schema, we need to create RoleMappedSchema for calling Bind(...). var roleMappedSchema = new RoleMappedSchema(dataView.Schema, @@ -46,7 +53,7 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() { // Check if output schema is correct. 
var treeValuesColumn = outputSchema[0]; - Assert.Equal("Trees", treeValuesColumn.Name); + Assert.Equal(treesColumnName, treeValuesColumn.Name); VectorDataViewType treeValuesType = treeValuesColumn.Type as VectorDataViewType; Assert.NotNull(treeValuesType); Assert.Equal(NumberDataViewType.Single, treeValuesType.ItemType); @@ -64,7 +71,7 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() { var treeLeafIdsColumn = outputSchema[1]; // Check column of tree leaf IDs. - Assert.Equal("Leaves", treeLeafIdsColumn.Name); + Assert.Equal(leavesColumnName, treeLeafIdsColumn.Name); VectorDataViewType treeLeafIdsType = treeLeafIdsColumn.Type as VectorDataViewType; Assert.NotNull(treeLeafIdsType); Assert.Equal(NumberDataViewType.Single, treeLeafIdsType.ItemType); @@ -87,7 +94,7 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() { var treePathIdsColumn = outputSchema[2]; // Check column of path IDs. - Assert.Equal("Paths", treePathIdsColumn.Name); + Assert.Equal(pathsColumnName, treePathIdsColumn.Name); VectorDataViewType treePathIdsType = treePathIdsColumn.Type as VectorDataViewType; Assert.NotNull(treePathIdsType); Assert.Equal(NumberDataViewType.Single, treePathIdsType.ItemType); @@ -108,5 +115,710 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() } } + + [Fact] + public void TreeEnsembleFeaturizerTransformerFastTreeBinary() + { + // Create data set + int dataPointCount = 20; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. + var trainer = ML.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1 + }); + + // Train the defined tree model. 
+ var model = trainer.Fit(dataView); + var predicted = model.Transform(dataView); + + // From the trained tree model, a mapper of tree featurizer is created. + const string treesColumnName = "MyTrees"; + const string leavesColumnName = "MyLeaves"; + const string pathsColumnName = "MyPaths"; + var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model.SubModel, + treesColumnName: treesColumnName, leavesColumnName: leavesColumnName, pathsColumnName: pathsColumnName); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Extract the outputs of TreeEnsembleFeaturizer. + var features = transformed.GetColumn("Features").ToArray(); + var leafValues = transformed.GetColumn(treesColumnName).ToArray(); + var leafIds = transformed.GetColumn(leavesColumnName).ToArray(); + var paths = transformed.GetColumn(pathsColumnName).ToArray(); + + // Check if the TreeEnsembleFeaturizer produce expected values. + List path = null; + for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex) + { + int treeIndex = 0; + var leafId = model.Model.SubModel.GetLeaf(treeIndex, new VBuffer(10, features[dataPointIndex]), ref path); + var leafValue = model.Model.SubModel.GetLeafValue(0, leafId); + Assert.Equal(leafValues[dataPointIndex][treeIndex], leafValue); + Assert.Equal(1.0, leafIds[dataPointIndex][leafId]); + foreach (var nodeId in path) + Assert.Equal(1.0, paths[dataPointIndex][nodeId]); + } + } + + [Fact] + public void TreeEnsembleFeaturizerTransformerFastForestBinary() + { + // Create data set + int dataPointCount = 20; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. 
+ var trainer = ML.BinaryClassification.Trainers.FastForest( + new FastForestBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1 + }); + + // Train the defined tree model. + var model = trainer.Fit(dataView); + + // From the trained tree model, a mapper of tree featurizer is created. + const string treesColumnName = "MyTrees"; + const string leavesColumnName = "MyLeaves"; + const string pathsColumnName = "MyPaths"; + var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model, + treesColumnName: treesColumnName, leavesColumnName: leavesColumnName, pathsColumnName: pathsColumnName); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Extract the outputs of TreeEnsembleFeaturizer. + var features = transformed.GetColumn("Features").ToArray(); + var leafValues = transformed.GetColumn(treesColumnName).ToArray(); + var leafIds = transformed.GetColumn(leavesColumnName).ToArray(); + var paths = transformed.GetColumn(pathsColumnName).ToArray(); + + // Check if the TreeEnsembleFeaturizer produce expected values. + List path = null; + for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex) + { + int treeIndex = 0; + var leafId = model.Model.GetLeaf(treeIndex, new VBuffer(10, features[dataPointIndex]), ref path); + var leafValue = model.Model.GetLeafValue(0, leafId); + Assert.Equal(leafValues[dataPointIndex][treeIndex], leafValue); + Assert.Equal(1.0, leafIds[dataPointIndex][leafId]); + foreach (var nodeId in path) + Assert.Equal(1.0, paths[dataPointIndex][nodeId]); + } + } + + /// + /// A test of . 
+ /// + [Fact] + public void TestPretrainedTreeFeaturizationEstimator() + { + // Create data set + int dataPointCount = 20; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. + var trainer = ML.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1 + }); + + // Train the defined tree model. + var model = trainer.Fit(dataView); + var predicted = model.Transform(dataView); + + // From the trained tree model, a mapper of tree featurizer is created. + string featureColumnName = "Features"; + string treesColumnName = "MyTrees"; // a tree-based feature column. + string leavesColumnName = "MyLeaves"; // a tree-based feature column. + string pathsColumnName = "MyPaths"; // a tree-based feature column. + var options = new PretrainedTreeFeaturizationEstimator.Options() + { + InputColumnName = featureColumnName, + ModelParameters = model.Model.SubModel, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName + }; + var treeFeaturizer = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Extract the outputs of TreeEnsembleFeaturizer. + var features = transformed.GetColumn(featureColumnName).ToArray(); + var leafValues = transformed.GetColumn(treesColumnName).ToArray(); + var leafIds = transformed.GetColumn(leavesColumnName).ToArray(); + var paths = transformed.GetColumn(pathsColumnName).ToArray(); + + // Check if the TreeEnsembleFeaturizer produce expected values. 
+ List path = null; + for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex) + { + int treeIndex = 0; + var leafId = model.Model.SubModel.GetLeaf(treeIndex, new VBuffer(10, features[dataPointIndex]), ref path); + var leafValue = model.Model.SubModel.GetLeafValue(0, leafId); + Assert.Equal(leafValues[dataPointIndex][treeIndex], leafValue); + Assert.Equal(1.0, leafIds[dataPointIndex][leafId]); + foreach (var nodeId in path) + Assert.Equal(1.0, paths[dataPointIndex][nodeId]); + } + } + + /// + /// This test contains several steps. + /// 1. It first trains a using . + /// 2. Then, it creates the a from the trained . + /// 3. The feature produced in step 2 would be fed into to enhance the training accuracy of that linear model. + /// 4. We train another without features from trees and finally compare their scores. + /// + [Fact] + public void TreeEnsembleFeaturizingPipeline() + { + // Create data set + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. + var trainer = ML.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10 + }); + + // Train the defined tree model. This trained model will be used to construct TreeEnsembleFeaturizationEstimator. + var treeModel = trainer.Fit(dataView); + var predicted = treeModel.Transform(dataView); + + // Combine the output of TreeEnsembleFeaturizationTransformer and the original features as the final training features. + // Then train a linear model. 
+ var options = new PretrainedTreeFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + ModelParameters = treeModel.Model.SubModel + }; + var pipeline = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + // Then train the same linear model without tree features. + var naivePipeline = ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "Features"); + var naiveModel = naivePipeline.Fit(dataView); + var naivePrediction = naiveModel.Transform(dataView); + var naiveMetrics = ML.BinaryClassification.Evaluate(naivePrediction); + + // The linear model trained with tree features should perform better than that without tree features. 
+ Assert.True(metrics.Accuracy > naiveMetrics.Accuracy); + Assert.True(metrics.LogLoss < naiveMetrics.LogLoss); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > naiveMetrics.AreaUnderPrecisionRecallCurve); + } + + [Fact] + public void TestFastTreeBinaryFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeBinaryFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + Assert.True(metrics.Accuracy > 0.98); + Assert.True(metrics.LogLoss < 0.05); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); + } + + [Fact] + public void TestFastForestBinaryFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastForestBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + 
MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastForestBinaryFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FeaturizeByFastForestBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + Assert.True(metrics.Accuracy > 0.97); + Assert.True(metrics.LogLoss < 0.07); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); + } + + [Fact] + public void TestFastTreeRegressionFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastTreeRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FeaturizeByFastTreeRegression(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var 
metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.2); + Assert.True(metrics.MeanSquaredError < 0.05); + } + + [Fact] + public void TestFastForestRegressionFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + } + + [Fact] + public void TestFastTreeTweedieFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastTreeTweedieTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new 
FastTreeTweedieFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FeaturizeByFastTreeTweedie(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + } + + [Fact] + public void TestFastTreeRankingFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastTreeRankingTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeRankingFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FeaturizeByFastTreeRanking(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + } + + [Fact] + public void TestSaveAndLoadTreeFeaturizer() + { + int 
dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + + // Save the trained model into file. 
+ ITransformer loadedModel = null; + var tempPath = Path.GetTempFileName(); + using (var file = new SimpleFileHandle(Env, tempPath, true, true)) + { + using (var fs = file.CreateWriteStream()) + ML.Model.Save(model, null, fs); + + using (var fs = file.OpenReadStream()) + loadedModel = ML.Model.Load(fs, out var schema); + } + var loadedPrediction = loadedModel.Transform(dataView); + var loadedMetrics = ML.Regression.Evaluate(loadedPrediction); + + Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError); + Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError); + } + + [Fact] + public void TestSaveAndLoadDoubleTreeFeaturizer() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + // Trains tree featurization on "Features" and applies on "CopiedFeatures". 
+ var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "CopiedFeatures", + TrainerOptions = trainerOptions, + TreesColumnName = "OhMyTrees", + LeavesColumnName = "OhMyLeaves", + PathsColumnName = "OhMyPaths" + }; + + var pipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features") + .Append(ML.Transforms.FeaturizeByFastForestRegression(options)) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + + // Save the trained model into file and then load it back. + ITransformer loadedModel = null; + var tempPath = Path.GetTempFileName(); + using (var file = new SimpleFileHandle(Env, tempPath, true, true)) + { + using (var fs = file.CreateWriteStream()) + ML.Model.Save(model, null, fs); + + using (var fs = file.OpenReadStream()) + loadedModel = ML.Model.Load(fs, out var schema); + } + + // Compute prediction using the loaded model. + var loadedPrediction = loadedModel.Transform(dataView); + var loadedMetrics = ML.Regression.Evaluate(loadedPrediction); + + // Check if the loaded model produces the same result as the trained model. 
+ Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError); + Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError); + + var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features") + .Append(ML.Transforms.NormalizeBinning("CopiedFeatures")) + .Append(ML.Transforms.FeaturizeByFastForestRegression(options)) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var secondModel = secondPipeline.Fit(dataView); + var secondPrediction = secondModel.Transform(dataView); + var secondMetrics = ML.Regression.Evaluate(secondPrediction); + + // The second pipeline trains a tree featurizer on a bin-based normalized feature, so the second pipeline + // is different from the first pipeline. + Assert.NotEqual(metrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError); + Assert.NotEqual(metrics.MeanSquaredError, secondMetrics.MeanSquaredError); + } + + [Fact] + public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeBinaryFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions, + TreesColumnName = null, + PathsColumnName = null, + LeavesColumnName = "Leaves" + }; + + + bool isWrong = false; + try + { + var wrongPipeline = ML.Transforms.FeaturizeByFastTreeBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", 
"Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var wrongModel = wrongPipeline.Fit(dataView); + } + catch + { + isWrong = true; // Only "Leaves" is produced by the tree featurizer, so accessing "Trees" and "Paths" will lead to an error. + } + Assert.True(isWrong); + + var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Leaves")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + Assert.True(metrics.Accuracy > 0.98); + Assert.True(metrics.LogLoss < 0.05); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); + } + + /// + /// Apply tree-based featurization on multiclass classification by converting key-typed labels to floats and training + /// a regression tree model for featurization. 
+ /// + [Fact] + public void TreeEnsembleFeaturizingPipelineMulticlass() + { + int dataPointCount = 1000; + var data = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "FloatLabel", + ShuffleLabels = true + }; + + var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + Action actionConvertKeyToFloat = (RowWithKey rowWithKey, RowWithFloat rowWithFloat) => + { + rowWithFloat.FloatLabel = rowWithKey.KeyLabel == 0 ? float.NaN : rowWithKey.KeyLabel - 1; + }; + + var split = ML.Data.TrainTestSplit(dataView, 0.5); + var trainData = split.TrainSet; + var testData = split.TestSet; + + var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label") + .Append(ML.Transforms.CustomMapping(actionConvertKeyToFloat, "KeyLabel")) + .Append(ML.Transforms.FeaturizeByFastForestRegression(options)) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Trees", "Leaves", "Paths")) + .Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures")); + + var model = pipeline.Fit(trainData); + var prediction = model.Transform(testData); + var metrics = ML.MulticlassClassification.Evaluate(prediction, labelColumnName: "KeyLabel"); + + Assert.True(metrics.MacroAccuracy > 0.6); + Assert.True(metrics.MicroAccuracy > 0.6); + } + + private class RowWithKey + { + [KeyType()] + public uint KeyLabel { get; set; } + } + + private class RowWithFloat + { + public float FloatLabel { get; set; } + } } }