From bfac765e12e8fe5fa0b36df45e10b16c45421385 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 25 Jun 2018 11:45:19 -0700 Subject: [PATCH 1/8] Added convenience constructor for set of transforms (#371). --- .../Transforms/ConcatTransform.cs | 22 ++++ .../Transforms/CopyColumnsTransform.cs | 26 +++++ .../Transforms/DropColumnsTransform.cs | 10 ++ src/Microsoft.ML.Data/Transforms/NAFilter.cs | 6 + .../BootstrapSampleTransform.cs | 6 + .../CategoricalHashTransform.cs | 26 +++++ .../CategoricalTransform.cs | 26 +++++ .../CountFeatureSelection.cs | 10 ++ src/Microsoft.ML.Transforms/GcnTransform.cs | 66 ++++++++++- .../NormalizeColumn.cs | 107 ++++++++++++++++++ 10 files changed, 303 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs index 544bce0aeb..7274fb4edc 100644 --- a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs @@ -527,6 +527,28 @@ private static VersionInfo GetVersionInfo() public override ISchema Schema => _bindings; + public ConcatTransform(IHostEnvironment env, IDataView input, string outputColumn, params string[] inputColumns) + : base(env, RegistrationName, input) + { + var cols = new Column[1]; + cols[0] = new Column() + { + Name = outputColumn, + Source = inputColumns + }; + + var args = new Arguments() + { + Column = cols + }; + Host.CheckValue(args, nameof(args)); + Host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column)); + for (int i = 0; i < args.Column.Length; i++) + Host.CheckUserArg(Utils.Size(args.Column[i].Source) > 0, nameof(args.Column)); + + _bindings = new Bindings(args.Column, null, Source.Schema); + } + /// /// Public constructor corresponding to SignatureDataTransform. 
/// diff --git a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs index f365dd9e98..c5d8a647d7 100644 --- a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs @@ -64,6 +64,32 @@ private static VersionInfo GetVersionInfo() private const string RegistrationName = "CopyColumns"; + public static CopyColumnsTransform Create(IHostEnvironment env, IDataView input, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return Create(env, input, inputOutputColumns); + } + + public static CopyColumnsTransform Create(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + Column[] cols = new Column[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new Column(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new Arguments() + { + Column = cols + }; + return new CopyColumnsTransform(env,args,input); + } + public CopyColumnsTransform(IHostEnvironment env, Arguments args, IDataView input) : base(env, RegistrationName, env.CheckRef(args, nameof(args)).Column, input, null) { diff --git a/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs b/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs index 502e6f395d..51e432a653 100644 --- a/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs @@ -237,6 +237,16 @@ private static VersionInfo GetVersionInfo() private const string DropRegistrationName = "DropColumns"; private const string KeepRegistrationName = "KeepColumns"; + public 
DropColumnsTransform CreateColumnDroper(IHostEnvironment env, IDataView input, params string[] columnsToDrop) + { + return new DropColumnsTransform(env, new Arguments() { Column = columnsToDrop }, input); + } + + public static DropColumnsTransform CreateColumnSelector(IHostEnvironment env, IDataView input, params string[] columnsToKeep) + { + return new DropColumnsTransform(env, new KeepArguments() { Column = columnsToKeep }, input); + } + /// /// Public constructor corresponding to SignatureDataTransform. /// diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs index 96c2111366..711a1a07a2 100644 --- a/src/Microsoft.ML.Data/Transforms/NAFilter.cs +++ b/src/Microsoft.ML.Data/Transforms/NAFilter.cs @@ -72,6 +72,12 @@ private static VersionInfo GetVersionInfo() private readonly bool _complement; private const string RegistrationName = "MissingValueFilter"; + public NAFilter(IHostEnvironment env, IDataView input, params string[] inputColumns) + : this(env, new Arguments() { Column = inputColumns}, input) + { + + } + public NAFilter(IHostEnvironment env, Arguments args, IDataView input) : base(env, RegistrationName, input) { diff --git a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs index b7f03deeec..86dea7db21 100644 --- a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs +++ b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs @@ -76,6 +76,12 @@ public BootstrapSampleTransform(IHostEnvironment env, Arguments args, IDataView _poolSize = args.PoolSize; } + public BootstrapSampleTransform(IHostEnvironment env, IDataView input, bool complement = false, uint? 
seed = null, bool shuffleInput = true, int poolSize = 1000) + : this(env, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input) + { + + } + private BootstrapSampleTransform(IHost host, ModelLoadContext ctx, IDataView input) : base(host, input) { diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs index a30b9a6c84..c80ecfaa14 100644 --- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs @@ -120,6 +120,32 @@ public sealed class Arguments : TransformInputBase public const string UserName = "Categorical Hash Transform"; + public static IDataTransform Create(IHostEnvironment env, IDataView input, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return Create(env, input, inputOutputColumns); + } + + public static IDataTransform Create(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + Column[] cols = new Column[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new Column(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new Arguments() + { + Column = cols + }; + return Create(env, args, input); + } + public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs index 621bbe6c1d..ff0b8a8ed8 100644 --- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs +++ 
b/src/Microsoft.ML.Transforms/CategoricalTransform.cs @@ -118,6 +118,32 @@ public Arguments() public const string UserName = "Categorical Transform"; + public static IDataTransform Create(IHostEnvironment env, IDataView input, params string[] inputColumns) + { + var inputOutputColumns = new (string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return Create(env, input, inputOutputColumns); + } + + public static IDataTransform Create(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + Column[] cols = new Column[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new Column(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new Arguments() + { + Column = cols + }; + return Create(env, args, input); + } + public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs index 72131902b5..aac9c9e05f 100644 --- a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs @@ -39,6 +39,16 @@ public sealed class Arguments : TransformInputBase internal static string RegistrationName = "CountFeatureSelectionTransform"; + public static IDataTransform Create(IHostEnvironment env, IDataView input, long count = 1, params string[] columns) + { + var args = new Arguments() + { + Column = columns, + Count = count + }; + return Create(env, args, input); + } + /// /// Create method corresponding to SignatureDataTransform. 
/// diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 2fee0e8ef3..3450faa682 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -237,6 +237,37 @@ private static VersionInfo GetVersionInfo() private readonly ColInfoEx[] _exes; + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return CreateGlobalContrastNormalizer(env, input, inputOutputColumns); + } + + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + GcnColumn[] cols = new GcnColumn[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new GcnColumn(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new GcnArguments() + { + Column = cols + }; + return new LpNormNormalizerTransform(env, args, input); + } + + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, GcnArguments args) + { + return new LpNormNormalizerTransform(env, args, input); + } + /// /// Public constructor corresponding to SignatureDataTransform. 
/// @@ -263,9 +294,40 @@ public LpNormNormalizerTransform(IHostEnvironment env, GcnArguments args, IDataV SetMetadata(); } + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return CreateLpNormNormalizer(env, input, inputOutputColumns); + } + + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + Column[] cols = new Column[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new Column(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new Arguments() + { + Column = cols + }; + return new LpNormNormalizerTransform(env, args, input); + } + + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, Arguments args) + { + return new LpNormNormalizerTransform(env, args, input); + } + public LpNormNormalizerTransform(IHostEnvironment env, Arguments args, IDataView input) - : base(env, RegistrationName, env.CheckRef(args, nameof(args)).Column, - input, TestIsFloatVector) + : base(env, RegistrationName, env.CheckRef(args, nameof(args)).Column, + input, TestIsFloatVector) { Host.AssertNonEmpty(Infos); Host.Assert(Infos.Length == Utils.Size(args.Column)); diff --git a/src/Microsoft.ML.Transforms/NormalizeColumn.cs b/src/Microsoft.ML.Transforms/NormalizeColumn.cs index a5769ec90a..c8ffb23c9d 100644 --- a/src/Microsoft.ML.Transforms/NormalizeColumn.cs +++ b/src/Microsoft.ML.Transforms/NormalizeColumn.cs @@ -218,6 +218,32 @@ public sealed class SupervisedBinArguments : BinArgumentsBase public const 
string BinNormalizerShortName = "Bin"; public const string SupervisedBinNormalizerShortName = "SupBin"; + public static NormalizeTransform CreateMinMaxNormalizer(IHostEnvironment env, IDataView input, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return CreateMinMaxNormalizer(env, input, inputOutputColumns); + } + + public static NormalizeTransform CreateMinMaxNormalizer(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + AffineColumn[] cols = new AffineColumn[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new AffineColumn(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new MinMaxArguments() + { + Column = cols + }; + return Create(env, args, input); + } + /// /// Public create method corresponding to SignatureDataTransform. 
/// @@ -234,6 +260,33 @@ public static NormalizeTransform Create(IHostEnvironment env, MinMaxArguments ar return func; } + public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return CreateMeanVarNormalizer(env, input, UseCdf, inputOutputColumns); + } + + public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + AffineColumn[] cols = new AffineColumn[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new AffineColumn(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new MeanVarArguments() + { + Column = cols, + UseCdf = UseCdf + }; + return Create(env, args, input); + } + /// /// Public create method corresponding to SignatureDataTransform. 
/// @@ -250,6 +303,33 @@ public static NormalizeTransform Create(IHostEnvironment env, MeanVarArguments a return func; } + public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf = true, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return CreateLogMeanVarNormalizer(env, input, UseCdf, inputOutputColumns); + } + + public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf = true, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + LogNormalColumn[] cols = new LogNormalColumn[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new LogNormalColumn(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new LogMeanVarArguments() + { + Column = cols, + UseCdf = UseCdf + }; + return Create(env, args, input); + } + /// /// Public create method corresponding to SignatureDataTransform. 
/// @@ -266,6 +346,33 @@ public static NormalizeTransform Create(IHostEnvironment env, LogMeanVarArgument return func; } + public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, int numBins = 1024, params string[] inputColumns) + { + var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; + for (int i = 0; i < inputColumns.Length; i++) + { + inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; + } + return CreateBinningNormalizer(env, input, numBins, inputOutputColumns); + } + + public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, int numBins = 1024, params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + BinColumn[] cols = new BinColumn[inputOutputColumns.Length]; + for (int i = 0; i < inputOutputColumns.Length; i++) + { + cols[i] = new BinColumn(); + cols[i].Source = inputOutputColumns[i].inputColumn; + cols[i].Name = inputOutputColumns[i].outputColumn; + } + var args = new BinArguments() + { + Column = cols, + NumBins = numBins + }; + return Create(env, args, input); + } + /// /// Public create method corresponding to SignatureDataTransform. /// From 20d91943df7fcea791629e3f69fa7e86def7bbf8 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 25 Jun 2018 12:14:04 -0700 Subject: [PATCH 2/8] Removed useless validation from Concate transform. 
--- src/Microsoft.ML.Data/Transforms/ConcatTransform.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs index 7274fb4edc..70363951e5 100644 --- a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs @@ -541,10 +541,6 @@ public ConcatTransform(IHostEnvironment env, IDataView input, string outputColum { Column = cols }; - Host.CheckValue(args, nameof(args)); - Host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column)); - for (int i = 0; i < args.Column.Length; i++) - Host.CheckUserArg(Utils.Size(args.Column[i].Source) > 0, nameof(args.Column)); _bindings = new Bindings(args.Column, null, Source.Schema); } From 44e8e8d9ca35cecbe1e08ffc962219610974606d Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 25 Jun 2018 13:40:02 -0700 Subject: [PATCH 3/8] Added more parameters to some transforms. --- .../Transforms/ConcatTransform.cs | 7 +------ src/Microsoft.ML.Data/Transforms/NAFilter.cs | 4 ++-- .../CategoricalHashTransform.cs | 17 +++++++++++++---- .../CategoricalTransform.cs | 9 +++++---- src/Microsoft.ML.Transforms/GcnTransform.cs | 17 ++++++++++------- 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs index 70363951e5..79b7a1fd21 100644 --- a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs @@ -537,12 +537,7 @@ public ConcatTransform(IHostEnvironment env, IDataView input, string outputColum Source = inputColumns }; - var args = new Arguments() - { - Column = cols - }; - - _bindings = new Bindings(args.Column, null, Source.Schema); + _bindings = new Bindings(cols, null, Source.Schema); } /// diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs index 
711a1a07a2..7c711d6e3e 100644 --- a/src/Microsoft.ML.Data/Transforms/NAFilter.cs +++ b/src/Microsoft.ML.Data/Transforms/NAFilter.cs @@ -72,8 +72,8 @@ private static VersionInfo GetVersionInfo() private readonly bool _complement; private const string RegistrationName = "MissingValueFilter"; - public NAFilter(IHostEnvironment env, IDataView input, params string[] inputColumns) - : this(env, new Arguments() { Column = inputColumns}, input) + public NAFilter(IHostEnvironment env, IDataView input, bool complement = false, params string[] inputColumns) + : this(env, new Arguments() { Column = inputColumns, Complement = complement }, input) { } diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs index c80ecfaa14..af1eec96cb 100644 --- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs @@ -120,17 +120,21 @@ public sealed class Arguments : TransformInputBase public const string UserName = "Categorical Hash Transform"; - public static IDataTransform Create(IHostEnvironment env, IDataView input, params string[] inputColumns) + public static IDataTransform Create(IHostEnvironment env, IDataView input, int hashBits = 16, uint seed = 314489979, + bool ordered = true, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag, + params string[] inputColumns) { var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; for (int i = 0; i < inputColumns.Length; i++) { inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; } - return Create(env, input, inputOutputColumns); + return Create(env, input, hashBits, seed, ordered, invertHash, outputKind, inputOutputColumns); } - public static IDataTransform Create(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + public static 
IDataTransform Create(IHostEnvironment env, IDataView input, int hashBits = 16, uint seed = 314489979, + bool ordered = true, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag, + params(string inputColumn, string outputColumn)[] inputOutputColumns) { Column[] cols = new Column[inputOutputColumns.Length]; for (int i = 0; i < inputOutputColumns.Length; i++) @@ -141,7 +145,12 @@ public static IDataTransform Create(IHostEnvironment env, IDataView input, param } var args = new Arguments() { - Column = cols + Column = cols, + HashBits = hashBits, + Seed = seed, + Ordered = ordered, + InvertHash = invertHash, + OutputKind = outputKind }; return Create(env, args, input); } diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs index ff0b8a8ed8..2dfc0727ff 100644 --- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs @@ -118,17 +118,17 @@ public Arguments() public const string UserName = "Categorical Transform"; - public static IDataTransform Create(IHostEnvironment env, IDataView input, params string[] inputColumns) + public static IDataTransform Create(IHostEnvironment env, IDataView input, OutputKind outputKind = OutputKind.Ind, params string[] inputColumns) { var inputOutputColumns = new (string inputColumn, string outputColumn)[inputColumns.Length]; for (int i = 0; i < inputColumns.Length; i++) { inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; } - return Create(env, input, inputOutputColumns); + return Create(env, input, outputKind, inputOutputColumns); } - public static IDataTransform Create(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + public static IDataTransform Create(IHostEnvironment env, IDataView input, OutputKind outputKind = OutputKind.Ind, params (string inputColumn, string 
outputColumn)[] inputOutputColumns) { Column[] cols = new Column[inputOutputColumns.Length]; for (int i = 0; i < inputOutputColumns.Length; i++) @@ -139,7 +139,8 @@ public static IDataTransform Create(IHostEnvironment env, IDataView input, param } var args = new Arguments() { - Column = cols + Column = cols, + OutputKind = outputKind }; return Create(env, args, input); } diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 3450faa682..55d1b55952 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -237,17 +237,17 @@ private static VersionInfo GetVersionInfo() private readonly ColInfoEx[] _exes; - public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, params string[] inputColumns) + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, bool subMean = true, bool useStdDev = false, Float scale = 1, params string[] inputColumns) { var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; for (int i = 0; i < inputColumns.Length; i++) { inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; } - return CreateGlobalContrastNormalizer(env, input, inputOutputColumns); + return CreateGlobalContrastNormalizer(env, input, subMean, useStdDev, scale, inputOutputColumns); } - public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, bool subMean = true, bool useStdDev = false, Float scale = 1, params (string inputColumn, string outputColumn)[] inputOutputColumns) { GcnColumn[] cols = new GcnColumn[inputOutputColumns.Length]; for (int i = 0; i < inputOutputColumns.Length; i++) @@ -258,7 +258,10 @@ public 
static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env } var args = new GcnArguments() { - Column = cols + Column = cols, + SubMean = subMean, + UseStdDev = useStdDev, + Scale = scale }; return new LpNormNormalizerTransform(env, args, input); } @@ -294,17 +297,17 @@ public LpNormNormalizerTransform(IHostEnvironment env, GcnArguments args, IDataV SetMetadata(); } - public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, params string[] inputColumns) + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false, params string[] inputColumns) { var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; for (int i = 0; i < inputColumns.Length; i++) { inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; } - return CreateLpNormNormalizer(env, input, inputOutputColumns); + return CreateLpNormNormalizer(env, input, normKind, subMean, inputOutputColumns); } - public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false, params (string inputColumn, string outputColumn)[] inputOutputColumns) { Column[] cols = new Column[inputOutputColumns.Length]; for (int i = 0; i < inputOutputColumns.Length; i++) From d3c627e0e76f85e890139a3408f2ae0829ca7348 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 25 Jun 2018 18:36:39 -0700 Subject: [PATCH 4/8] Addressed reviewers' comments. 
--- .../Transforms/ConcatTransform.cs | 25 +++-- .../Transforms/CopyColumnsTransform.cs | 26 +---- .../Transforms/DropColumnsTransform.cs | 15 +-- src/Microsoft.ML.Data/Transforms/NAFilter.cs | 4 +- .../CategoricalHashTransform.cs | 33 ++---- .../CategoricalTransform.cs | 25 ++--- src/Microsoft.ML.Transforms/GcnTransform.cs | 62 +++-------- .../NormalizeColumn.cs | 100 +++++------------- 8 files changed, 79 insertions(+), 211 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs index 79b7a1fd21..2812523f28 100644 --- a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs @@ -90,6 +90,19 @@ public bool TryUnparse(StringBuilder sb) public sealed class Arguments : TransformInputBase { + public Arguments() + { + } + + public Arguments(string name, params string[] source) + { + Column = new[] { new Column() + { + Name = name, + Source = source + }}; + } + [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:srcs)", ShortName = "col", SortOrder = 1)] public Column[] Column; } @@ -527,17 +540,9 @@ private static VersionInfo GetVersionInfo() public override ISchema Schema => _bindings; - public ConcatTransform(IHostEnvironment env, IDataView input, string outputColumn, params string[] inputColumns) - : base(env, RegistrationName, input) + public ConcatTransform(IHostEnvironment env, IDataView input, string name, params string[] source) + : this(env, new Arguments(name, source), input) { - var cols = new Column[1]; - cols[0] = new Column() - { - Name = outputColumn, - Source = inputColumns - }; - - _bindings = new Bindings(cols, null, Source.Schema); } /// diff --git a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs index c5d8a647d7..35a845b118 100644 --- a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs 
+++ b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs @@ -64,30 +64,10 @@ private static VersionInfo GetVersionInfo() private const string RegistrationName = "CopyColumns"; - public static CopyColumnsTransform Create(IHostEnvironment env, IDataView input, params string[] inputColumns) + public CopyColumnsTransform(IHostEnvironment env, IDataView input, string name, string source) + : this(env, new Arguments(){ Column = new[] { new Column() { Source = source, Name = name }}}, input) { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return Create(env, input, inputOutputColumns); - } - - public static CopyColumnsTransform Create(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - Column[] cols = new Column[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new Column(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } - var args = new Arguments() - { - Column = cols - }; - return new CopyColumnsTransform(env,args,input); + } public CopyColumnsTransform(IHostEnvironment env, Arguments args, IDataView input) diff --git a/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs b/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs index 51e432a653..d22c250fbb 100644 --- a/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs @@ -237,14 +237,9 @@ private static VersionInfo GetVersionInfo() private const string DropRegistrationName = "DropColumns"; private const string KeepRegistrationName = "KeepColumns"; - public DropColumnsTransform CreateColumnDroper(IHostEnvironment env, IDataView input, params string[] columnsToDrop) + public 
DropColumnsTransform(IHostEnvironment env, IDataView input, params string[] columnsToDrop) + :this(env, new Arguments() { Column = columnsToDrop }, input) { - return new DropColumnsTransform(env, new Arguments() { Column = columnsToDrop }, input); - } - - public static DropColumnsTransform CreateColumnSelector(IHostEnvironment env, IDataView input, params string[] columnsToKeep) - { - return new DropColumnsTransform(env, new KeepArguments() { Column = columnsToKeep }, input); } /// @@ -393,4 +388,10 @@ public ValueGetter GetGetter(int col) } } } + + public class KeepColumnsTransform + { + public static IDataTransform Create(IHostEnvironment env, IDataView input, params string[] columnsToKeep) + => new DropColumnsTransform(env, new DropColumnsTransform.KeepArguments() { Column = columnsToKeep }, input); + } } diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs index 7c711d6e3e..4137a8277b 100644 --- a/src/Microsoft.ML.Data/Transforms/NAFilter.cs +++ b/src/Microsoft.ML.Data/Transforms/NAFilter.cs @@ -72,8 +72,8 @@ private static VersionInfo GetVersionInfo() private readonly bool _complement; private const string RegistrationName = "MissingValueFilter"; - public NAFilter(IHostEnvironment env, IDataView input, bool complement = false, params string[] inputColumns) - : this(env, new Arguments() { Column = inputColumns, Complement = complement }, input) + public NAFilter(IHostEnvironment env, IDataView input, bool complement = false, params string[] columns) + : this(env, new Arguments() { Column = columns, Complement = complement }, input) { } diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs index af1eec96cb..c0e27635e7 100644 --- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs @@ -119,36 +119,17 @@ public sealed class Arguments : TransformInputBase + "bag. 
If the input column is a vector, a single indicator bag is returned for it."; public const string UserName = "Categorical Hash Transform"; - - public static IDataTransform Create(IHostEnvironment env, IDataView input, int hashBits = 16, uint seed = 314489979, - bool ordered = true, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag, - params string[] inputColumns) - { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return Create(env, input, hashBits, seed, ordered, invertHash, outputKind, inputOutputColumns); - } - - public static IDataTransform Create(IHostEnvironment env, IDataView input, int hashBits = 16, uint seed = 314489979, - bool ordered = true, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag, - params(string inputColumn, string outputColumn)[] inputOutputColumns) + + public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source =null, int hashBits = 16, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag) { - Column[] cols = new Column[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new Column(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new Arguments() { - Column = cols, + Column = new[] { new Column(){ + Source = source ?? 
name, + Name = name + } + }, HashBits = hashBits, - Seed = seed, - Ordered = ordered, InvertHash = invertHash, OutputKind = outputKind }; diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs index 2dfc0727ff..f8cd635468 100644 --- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs @@ -118,28 +118,15 @@ public Arguments() public const string UserName = "Categorical Transform"; - public static IDataTransform Create(IHostEnvironment env, IDataView input, OutputKind outputKind = OutputKind.Ind, params string[] inputColumns) + public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source = null, OutputKind outputKind = OutputKind.Ind) { - var inputOutputColumns = new (string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return Create(env, input, outputKind, inputOutputColumns); - } - - public static IDataTransform Create(IHostEnvironment env, IDataView input, OutputKind outputKind = OutputKind.Ind, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - Column[] cols = new Column[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new Column(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new Arguments() { - Column = cols, + Column = new[] { new Column(){ + Source = source ?? 
name, + Name = name + } + }, OutputKind = outputKind }; return Create(env, args, input); diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 55d1b55952..cb90ea775a 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -237,28 +237,15 @@ private static VersionInfo GetVersionInfo() private readonly ColInfoEx[] _exes; - public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, bool subMean = true, bool useStdDev = false, Float scale = 1, params string[] inputColumns) + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, bool subMean = true, bool useStdDev = false, Float scale = 1) { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return CreateGlobalContrastNormalizer(env, input, subMean, useStdDev, scale, inputOutputColumns); - } - - public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, bool subMean = true, bool useStdDev = false, Float scale = 1, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - GcnColumn[] cols = new GcnColumn[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new GcnColumn(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new GcnArguments() { - Column = cols, + Column = new[] { new GcnColumn(){ + Source = source ?? 
name, + Name = name + } + }, SubMean = subMean, UseStdDev = useStdDev, Scale = scale @@ -266,11 +253,6 @@ public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env return new LpNormNormalizerTransform(env, args, input); } - public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, GcnArguments args) - { - return new LpNormNormalizerTransform(env, args, input); - } - /// /// Public constructor corresponding to SignatureDataTransform. /// @@ -297,37 +279,21 @@ public LpNormNormalizerTransform(IHostEnvironment env, GcnArguments args, IDataV SetMetadata(); } - public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false, params string[] inputColumns) + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false) { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return CreateLpNormNormalizer(env, input, normKind, subMean, inputOutputColumns); - } - - public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - Column[] cols = new Column[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new Column(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new Arguments() { - Column = cols + Column = new[] { new Column(){ + Source = source ?? 
name, + Name = name + } + }, + SubMean = subMean, + NormKind = normKind }; return new LpNormNormalizerTransform(env, args, input); } - public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, Arguments args) - { - return new LpNormNormalizerTransform(env, args, input); - } - public LpNormNormalizerTransform(IHostEnvironment env, Arguments args, IDataView input) : base(env, RegistrationName, env.CheckRef(args, nameof(args)).Column, input, TestIsFloatVector) diff --git a/src/Microsoft.ML.Transforms/NormalizeColumn.cs b/src/Microsoft.ML.Transforms/NormalizeColumn.cs index c8ffb23c9d..54654af8a2 100644 --- a/src/Microsoft.ML.Transforms/NormalizeColumn.cs +++ b/src/Microsoft.ML.Transforms/NormalizeColumn.cs @@ -218,28 +218,15 @@ public sealed class SupervisedBinArguments : BinArgumentsBase public const string BinNormalizerShortName = "Bin"; public const string SupervisedBinNormalizerShortName = "SupBin"; - public static NormalizeTransform CreateMinMaxNormalizer(IHostEnvironment env, IDataView input, params string[] inputColumns) + public static NormalizeTransform CreateMinMaxNormalizer(IHostEnvironment env, IDataView input, string name, string source = null) { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return CreateMinMaxNormalizer(env, input, inputOutputColumns); - } - - public static NormalizeTransform CreateMinMaxNormalizer(IHostEnvironment env, IDataView input, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - AffineColumn[] cols = new AffineColumn[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new AffineColumn(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new MinMaxArguments() { - 
Column = cols + Column = new[] { new AffineColumn(){ + Source = source ?? name, + Name = name + } + } }; return Create(env, args, input); } @@ -260,28 +247,15 @@ public static NormalizeTransform Create(IHostEnvironment env, MinMaxArguments ar return func; } - public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf, params string[] inputColumns) + public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool UseCdf = false) { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return CreateMeanVarNormalizer(env, input, UseCdf, inputOutputColumns); - } - - public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - AffineColumn[] cols = new AffineColumn[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new AffineColumn(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new MeanVarArguments() { - Column = cols, + Column = new[] { new AffineColumn(){ + Source = source ?? 
name, + Name = name + } + }, UseCdf = UseCdf }; return Create(env, args, input); @@ -303,28 +277,15 @@ public static NormalizeTransform Create(IHostEnvironment env, MeanVarArguments a return func; } - public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf = true, params string[] inputColumns) + public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool UseCdf = true) { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return CreateLogMeanVarNormalizer(env, input, UseCdf, inputOutputColumns); - } - - public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, bool UseCdf = true, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - LogNormalColumn[] cols = new LogNormalColumn[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new LogNormalColumn(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new LogMeanVarArguments() { - Column = cols, + Column = new[] { new LogNormalColumn(){ + Source = source ?? 
name, + Name = name + } + }, UseCdf = UseCdf }; return Create(env, args, input); @@ -346,28 +307,15 @@ public static NormalizeTransform Create(IHostEnvironment env, LogMeanVarArgument return func; } - public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, int numBins = 1024, params string[] inputColumns) + public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, int numBins = 1024) { - var inputOutputColumns = new(string inputColumn, string outputColumn)[inputColumns.Length]; - for (int i = 0; i < inputColumns.Length; i++) - { - inputOutputColumns[i].inputColumn = inputOutputColumns[i].outputColumn = inputColumns[i]; - } - return CreateBinningNormalizer(env, input, numBins, inputOutputColumns); - } - - public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, int numBins = 1024, params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - BinColumn[] cols = new BinColumn[inputOutputColumns.Length]; - for (int i = 0; i < inputOutputColumns.Length; i++) - { - cols[i] = new BinColumn(); - cols[i].Source = inputOutputColumns[i].inputColumn; - cols[i].Name = inputOutputColumns[i].outputColumn; - } var args = new BinArguments() { - Column = cols, + Column = new[] { new BinColumn(){ + Source = source ?? name, + Name = name + } + }, NumBins = numBins }; return Create(env, args, input); From 5a34a16f3fd39f08f42a34fd8bc8a8eca8b3dc6e Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 26 Jun 2018 09:58:14 -0700 Subject: [PATCH 5/8] XML Comments added to constructors/helper methods. 
--- .../Transforms/ConcatTransform.cs | 7 +++++ .../Transforms/CopyColumnsTransform.cs | 7 +++++ .../Transforms/DropColumnsTransform.cs | 13 ++++++++ src/Microsoft.ML.Data/Transforms/NAFilter.cs | 7 +++++ .../BootstrapSampleTransform.cs | 9 ++++++ .../CategoricalHashTransform.cs | 12 ++++++- .../CategoricalTransform.cs | 8 +++++ .../CountFeatureSelection.cs | 8 +++++ src/Microsoft.ML.Transforms/GcnTransform.cs | 19 ++++++++++++ .../NormalizeColumn.cs | 31 ++++++++++++++++--- 10 files changed, 116 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs index 2812523f28..a51f502ecd 100644 --- a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs @@ -540,6 +540,13 @@ private static VersionInfo GetVersionInfo() public override ISchema Schema => _bindings; + /// + /// Convenience constructor for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Input columns to concatenate. public ConcatTransform(IHostEnvironment env, IDataView input, string name, params string[] source) : this(env, new Arguments(name, source), input) { diff --git a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs index 35a845b118..c2352f9e41 100644 --- a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs @@ -64,6 +64,13 @@ private static VersionInfo GetVersionInfo() private const string RegistrationName = "CopyColumns"; + /// + /// Convenience constructor for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be copied. 
public CopyColumnsTransform(IHostEnvironment env, IDataView input, string name, string source) : this(env, new Arguments(){ Column = new[] { new Column() { Source = source, Name = name }}}, input) { diff --git a/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs b/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs index d22c250fbb..3e15199ff7 100644 --- a/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs @@ -237,6 +237,12 @@ private static VersionInfo GetVersionInfo() private const string DropRegistrationName = "DropColumns"; private const string KeepRegistrationName = "KeepColumns"; + /// + /// Convenience constructor for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the columns to be dropped. public DropColumnsTransform(IHostEnvironment env, IDataView input, params string[] columnsToDrop) :this(env, new Arguments() { Column = columnsToDrop }, input) { @@ -391,6 +397,13 @@ public ValueGetter GetGetter(int col) public class KeepColumnsTransform { + /// + /// A helper method to create for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the columns to be kept. All other columns will be removed. 
+ /// public static IDataTransform Create(IHostEnvironment env, IDataView input, params string[] columnsToKeep) => new DropColumnsTransform(env, new DropColumnsTransform.KeepArguments() { Column = columnsToKeep }, input); } diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs index 4137a8277b..690d996834 100644 --- a/src/Microsoft.ML.Data/Transforms/NAFilter.cs +++ b/src/Microsoft.ML.Data/Transforms/NAFilter.cs @@ -72,6 +72,13 @@ private static VersionInfo GetVersionInfo() private readonly bool _complement; private const string RegistrationName = "MissingValueFilter"; + /// + /// Convenience constructor for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// If true, keep only rows that contain NA values, and filter the rest. + /// Name of the columns. Only these columns will be used to filter rows having 'NA' values. public NAFilter(IHostEnvironment env, IDataView input, bool complement = false, params string[] columns) : this(env, new Arguments() { Column = columns, Complement = complement }, input) { diff --git a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs index 86dea7db21..f50f197628 100644 --- a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs +++ b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs @@ -76,6 +76,15 @@ public BootstrapSampleTransform(IHostEnvironment env, Arguments args, IDataView _poolSize = args.PoolSize; } + /// + /// Convenience constructor for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform. + /// The random seed. If unspecified random state will be instead derived from the environment. + /// Whether we should attempt to shuffle the source data. 
By default on, but can be turned off for efficiency. + /// When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input. public BootstrapSampleTransform(IHostEnvironment env, IDataView input, bool complement = false, uint? seed = null, bool shuffleInput = true, int poolSize = 1000) : this(env, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input) { diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs index c0e27635e7..a3129e7345 100644 --- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs @@ -119,7 +119,17 @@ public sealed class Arguments : TransformInputBase + "bag. If the input column is a vector, a single indicator bag is returned for it."; public const string UserName = "Categorical Hash Transform"; - + + /// + /// A helper method to create for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be transformed. If this is null '' will be used. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit. + /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index). 
public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source =null, int hashBits = 16, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag) { var args = new Arguments() diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs index f8cd635468..9eb9a1f05e 100644 --- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs @@ -118,6 +118,14 @@ public Arguments() public const string UserName = "Categorical Transform"; + /// + /// A helper method to create for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be transformed. If this is null '' will be used. + /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index). public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source = null, OutputKind outputKind = OutputKind.Ind) { var args = new Arguments() diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs index aac9c9e05f..2f555ea5ec 100644 --- a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs @@ -39,6 +39,14 @@ public sealed class Arguments : TransformInputBase internal static string RegistrationName = "CountFeatureSelectionTransform"; + /// + /// A helper method to create CountFeatureSelection transform for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. + /// Columns to use for feature selection. 
+ /// public static IDataTransform Create(IHostEnvironment env, IDataView input, long count = 1, params string[] columns) { var args = new Arguments() diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index cb90ea775a..8ebf94303c 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -237,6 +237,16 @@ private static VersionInfo GetVersionInfo() private readonly ColInfoEx[] _exes; + /// + /// A helper method to create GlobalContrastNormalizer transform for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be transformed. If this is null '' will be used. + /// Subtract mean from each value before normalizing. + /// Normalize by standard deviation rather than L2 norm. + /// Scale features by this value. public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, bool subMean = true, bool useStdDev = false, Float scale = 1) { var args = new GcnArguments() @@ -279,6 +289,15 @@ public LpNormNormalizerTransform(IHostEnvironment env, GcnArguments args, IDataV SetMetadata(); } + /// + /// A helper method to create LpNormNormalizer transform for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be transformed. If this is null '' will be used. + /// The norm to use to normalize each sample. + /// Subtract mean from each value before normalizing.
public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false) { var args = new Arguments() diff --git a/src/Microsoft.ML.Transforms/NormalizeColumn.cs b/src/Microsoft.ML.Transforms/NormalizeColumn.cs index 54654af8a2..d0d3303e1a 100644 --- a/src/Microsoft.ML.Transforms/NormalizeColumn.cs +++ b/src/Microsoft.ML.Transforms/NormalizeColumn.cs @@ -218,6 +218,13 @@ public sealed class SupervisedBinArguments : BinArgumentsBase public const string BinNormalizerShortName = "Bin"; public const string SupervisedBinNormalizerShortName = "SupBin"; + /// + /// A helper method to create MinMaxNormalizer transform for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be transformed. If this is null '' will be used. public static NormalizeTransform CreateMinMaxNormalizer(IHostEnvironment env, IDataView input, string name, string source = null) { var args = new MinMaxArguments() @@ -247,7 +254,15 @@ public static NormalizeTransform Create(IHostEnvironment env, MinMaxArguments ar return func; } - public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool UseCdf = false) + /// + /// A helper method to create MeanVarNormalizer transform for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be transformed. If this is null '' will be used. + /// Whether to use CDF as the output.
+ public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = false) { var args = new MeanVarArguments() { @@ -256,7 +271,7 @@ public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, I Name = name } }, - UseCdf = UseCdf + UseCdf = useCdf }; return Create(env, args, input); } @@ -277,7 +292,15 @@ public static NormalizeTransform Create(IHostEnvironment env, MeanVarArguments a return func; } - public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool UseCdf = true) + /// + /// A helper method to create LogMeanVarNormalizer transform for public facing API. + /// + /// Host Environment. + /// Input . This is the output from previous transform or loader. + /// Name of the output column. + /// Name of the column to be transformed. If this is null '' will be used. + /// Whether to use CDF as the output. + public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = true) { var args = new LogMeanVarArguments() { @@ -286,7 +309,7 @@ public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env Name = name } }, - UseCdf = UseCdf + UseCdf = useCdf }; return Create(env, args, input); } From 1566ed0c3bdb10fc27513a7e962c729b3711259f Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 26 Jun 2018 11:02:50 -0700 Subject: [PATCH 6/8] Created private static class for managing default values.
--- src/Microsoft.ML.Data/Transforms/NAFilter.cs | 9 +++- .../BootstrapSampleTransform.cs | 15 +++++-- .../CategoricalHashTransform.cs | 23 +++++++---- .../CategoricalTransform.cs | 23 +++++++++-- .../CountFeatureSelection.cs | 9 +++- src/Microsoft.ML.Transforms/GcnTransform.cs | 23 +++++++---- .../NormalizeColumn.cs | 41 +++++++++++++++---- 7 files changed, 110 insertions(+), 33 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs index 690d996834..1f92101bf3 100644 --- a/src/Microsoft.ML.Data/Transforms/NAFilter.cs +++ b/src/Microsoft.ML.Data/Transforms/NAFilter.cs @@ -28,13 +28,18 @@ namespace Microsoft.ML.Runtime.Data { public sealed class NAFilter : FilterBase { + private static class Defaults + { + public const bool Complement = false; + } + public sealed class Arguments : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "Column", ShortName = "col", SortOrder = 1)] public string[] Column; [Argument(ArgumentType.Multiple, HelpText = "If true, keep only rows that contain NA values, and filter the rest.")] - public bool Complement; + public bool Complement = Defaults.Complement; } private sealed class ColInfo @@ -79,7 +84,7 @@ private static VersionInfo GetVersionInfo() /// Input . This is the output from previous transform or loader. /// If true, keep only rows that contain NA values, and filter the rest. /// Name of the columns. Only these columns will be used to filter rows having 'NA' values. 
- public NAFilter(IHostEnvironment env, IDataView input, bool complement = false, params string[] columns) + public NAFilter(IHostEnvironment env, IDataView input, bool complement = Defaults.Complement, params string[] columns) : this(env, new Arguments() { Column = columns, Complement = complement }, input) { diff --git a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs index f50f197628..856f9cebe1 100644 --- a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs +++ b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs @@ -25,20 +25,27 @@ namespace Microsoft.ML.Runtime.Data /// public sealed class BootstrapSampleTransform : FilterBase { + private static class Defaults + { + public const bool Complement = false; + public const bool ShuffleInput = true; + public const int PoolSize = 1000; + } + public sealed class Arguments : TransformInputBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.", ShortName = "comp")] - public bool Complement; + public bool Complement = Defaults.Complement; [Argument(ArgumentType.AtMostOnce, HelpText = "The random seed. If unspecified random state will be instead derived from the environment.")] public uint? Seed; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency.", ShortName = "si")] - public bool ShuffleInput = true; + public bool ShuffleInput = Defaults.ShuffleInput; [Argument(ArgumentType.LastOccurenceWins, HelpText = "When shuffling the output, the number of output rows to keep in that pool. 
Note that shuffling of output is completely distinct from shuffling of input.", ShortName = "pool")] - public int PoolSize = 1000; + public int PoolSize = Defaults.PoolSize; } internal const string Summary = "Approximate bootstrap sampling."; @@ -85,7 +92,7 @@ public BootstrapSampleTransform(IHostEnvironment env, Arguments args, IDataView /// The random seed. If unspecified random state will be instead derived from the environment. /// Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency. /// When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input. - public BootstrapSampleTransform(IHostEnvironment env, IDataView input, bool complement = false, uint? seed = null, bool shuffleInput = true, int poolSize = 1000) + public BootstrapSampleTransform(IHostEnvironment env, IDataView input, bool complement = Defaults.Complement, uint? seed = null, bool shuffleInput = Defaults.ShuffleInput, int poolSize = Defaults.PoolSize) : this(env, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input) { diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs index a3129e7345..38706ca190 100644 --- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs @@ -86,6 +86,15 @@ public bool TryUnparse(StringBuilder sb) } } + private static class Defaults + { + public const int HashBits = 16; + public const uint Seed = 314489979; + public const bool Ordered = true; + public const int InvertHash = 0; + public const CategoricalTransform.OutputKind OutputKind = CategoricalTransform.OutputKind.Bag; + } + /// /// This class is a merger of and /// with join option removed @@ -97,22 +106,22 @@ public sealed class Arguments : TransformInputBase 
[Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 30, inclusive.", ShortName = "bits", SortOrder = 2)] - public int HashBits = 16; + public int HashBits = Defaults.HashBits; [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")] - public uint Seed = 314489979; + public uint Seed = Defaults.Seed; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the position of each term should be included in the hash", ShortName = "ord")] - public bool Ordered = true; + public bool Ordered = Defaults.Ordered; [Argument(ArgumentType.AtMostOnce, HelpText = "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", ShortName = "ih")] - public int InvertHash; + public int InvertHash = Defaults.InvertHash; [Argument(ArgumentType.AtMostOnce, HelpText = "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", ShortName = "kind", SortOrder = 102)] - public CategoricalTransform.OutputKind OutputKind = CategoricalTransform.OutputKind.Bag; + public CategoricalTransform.OutputKind OutputKind = Defaults.OutputKind; } internal const string Summary = "Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the " @@ -129,8 +138,8 @@ public sealed class Arguments : TransformInputBase /// Name of the column to be transformed. If this is null '' will be used. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit. - /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index). - public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source =null, int hashBits = 16, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag) + /// The type of output expected. 
+ public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source =null, int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash, CategoricalTransform.OutputKind outputKind = Defaults.OutputKind) { var args = new Arguments() { diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs index 9eb9a1f05e..dfc6849427 100644 --- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs @@ -38,15 +38,27 @@ public static class CategoricalTransform { public enum OutputKind : byte { + /// + /// Output is a bag (multi-set) vector + /// [TGUI(Label = "Output is a bag (multi-set) vector")] Bag = 1, + /// + /// Output is an indicator vector + /// [TGUI(Label = "Output is an indicator vector")] Ind = 2, + /// + /// Output is a key value + /// [TGUI(Label = "Output is a key value")] Key = 3, + /// + /// Output is binary encoded + /// [TGUI(Label = "Output is binary encoded")] Bin = 4, } @@ -96,6 +108,11 @@ public bool TryUnparse(StringBuilder sb) } } + private static class Defaults + { + public const OutputKind OutKind = OutputKind.Ind; + } + public sealed class Arguments : TermTransform.ArgumentsBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", ShortName = "col", SortOrder = 1)] @@ -103,7 +120,7 @@ public sealed class Arguments : TermTransform.ArgumentsBase [Argument(ArgumentType.AtMostOnce, HelpText = "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", ShortName = "kind", SortOrder = 102)] - public OutputKind OutputKind = OutputKind.Ind; + public OutputKind OutputKind = Defaults.OutKind; public Arguments() { @@ -125,8 +142,8 @@ public Arguments() /// Input . This is the output from previous transform or loader. /// Name of the output column. /// Name of the column to be transformed. 
If this is null '' will be used. - /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index). - public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source = null, OutputKind outputKind = OutputKind.Ind) + /// The type of output expected. + public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source = null, OutputKind outputKind = Defaults.OutKind) { var args = new Arguments() { diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs index 2f555ea5ec..79adda882e 100644 --- a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs @@ -28,13 +28,18 @@ public static class CountFeatureSelectionTransform public const string Summary = "Selects the slots for which the count of non-default values is greater than or equal to a threshold."; public const string UserName = "Count Feature Selection Transform"; + private static class Defaults + { + public const long Count = 1; + } + public sealed class Arguments : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "Columns to use for feature selection", ShortName = "col", SortOrder = 1)] public string[] Column; [Argument(ArgumentType.Required, HelpText = "If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved", ShortName = "c", SortOrder = 1)] - public long Count = 1; + public long Count = Defaults.Count; } internal static string RegistrationName = "CountFeatureSelectionTransform"; @@ -47,7 +52,7 @@ public sealed class Arguments : TransformInputBase /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. /// Columns to use for feature selection. 
/// - public static IDataTransform Create(IHostEnvironment env, IDataView input, long count = 1, params string[] columns) + public static IDataTransform Create(IHostEnvironment env, IDataView input, long count = Defaults.Count, params string[] columns) { var args = new Arguments() { diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 8ebf94303c..38e7cc15bc 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -53,16 +53,25 @@ public enum NormalizerKind : byte LInf = 3 } + private static class Defaults + { + public const NormalizerKind NormKind = NormalizerKind.L2Norm; + public const bool LpSubMean = false; + public const bool GcnSubMean = true; + public const bool UseStdDev = false; + public const Float Scale = 1; + } + public sealed class Arguments : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", ShortName = "col", SortOrder = 1)] public Column[] Column; [Argument(ArgumentType.AtMostOnce, HelpText = "The norm to use to normalize each sample", ShortName = "norm", SortOrder = 1)] - public NormalizerKind NormKind = NormalizerKind.L2Norm; + public NormalizerKind NormKind = Defaults.NormKind; [Argument(ArgumentType.AtMostOnce, HelpText = "Subtract mean from each value before normalizing", SortOrder = 2)] - public bool SubMean = false; + public bool SubMean = Defaults.LpSubMean; } public sealed class GcnArguments : TransformInputBase @@ -71,13 +80,13 @@ public sealed class GcnArguments : TransformInputBase public GcnColumn[] Column; [Argument(ArgumentType.AtMostOnce, HelpText = "Subtract mean from each value before normalizing", SortOrder = 1)] - public bool SubMean = true; + public bool SubMean = Defaults.GcnSubMean; [Argument(ArgumentType.AtMostOnce, HelpText = "Normalize by standard deviation rather than L2 norm", ShortName = "useStd")] - public bool UseStdDev = 
false; + public bool UseStdDev = Defaults.UseStdDev; [Argument(ArgumentType.AtMostOnce, HelpText = "Scale features by this value")] - public Float Scale = 1; + public Float Scale = Defaults.Scale; } public abstract class ColumnBase : OneToOneColumn @@ -247,7 +256,7 @@ private static VersionInfo GetVersionInfo() /// Subtract mean from each value before normalizing. /// Normalize by standard deviation rather than L2 norm. /// Scale features by this value. - public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, bool subMean = true, bool useStdDev = false, Float scale = 1) + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, bool subMean = Defaults.GcnSubMean, bool useStdDev = Defaults.UseStdDev, Float scale = Defaults.Scale) { var args = new GcnArguments() { @@ -298,7 +307,7 @@ public LpNormNormalizerTransform(IHostEnvironment env, GcnArguments args, IDataV /// Name of the column to be transformed. If this is null '' will be used. /// /// The norm to use to normalize each sample. /// Subtract mean from each value before normalizing. 
- public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false) + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, NormalizerKind normKind = Defaults.NormKind, bool subMean = Defaults.LpSubMean) { var args = new Arguments() { diff --git a/src/Microsoft.ML.Transforms/NormalizeColumn.cs b/src/Microsoft.ML.Transforms/NormalizeColumn.cs index d0d3303e1a..c876812011 100644 --- a/src/Microsoft.ML.Transforms/NormalizeColumn.cs +++ b/src/Microsoft.ML.Transforms/NormalizeColumn.cs @@ -135,12 +135,21 @@ public bool TryUnparse(StringBuilder sb) } } + private static class Defaults + { + public const bool FixZero = true; + public const bool MeanVarCdf = false; + public const bool LogMeanVarCdf = true; + public const int NumBins = 1024; + public const int MinBinSize = 10; + } + public abstract class FixZeroArgumentsBase : ArgumentsBase { // REVIEW: This only allows mapping either zero or min to zero. It might make sense to allow also max, midpoint and mean to be mapped to zero. // REVIEW: Convert this to bool? or even an enum{Auto, No, Yes}, and automatically map zero to zero when it is null/Auto. 
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to map zero to zero, preserving sparsity", ShortName = "zero")] - public bool FixZero = true; + public bool FixZero = Defaults.FixZero; } public abstract class AffineArgumentsBase : FixZeroArgumentsBase @@ -158,13 +167,13 @@ public sealed class MinMaxArguments : AffineArgumentsBase public sealed class MeanVarArguments : AffineArgumentsBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to use CDF as the output", ShortName = "cdf")] - public bool UseCdf; + public bool UseCdf = Defaults.MeanVarCdf; } public sealed class LogMeanVarArguments : ArgumentsBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to use CDF as the output", ShortName = "cdf")] - public bool UseCdf = true; + public bool UseCdf = Defaults.LogMeanVarCdf; [Argument(ArgumentType.Multiple, HelpText = "New column definition(s) (optional form: name:src)", ShortName = "col", SortOrder = 1)] public LogNormalColumn[] Column; @@ -179,7 +188,7 @@ public abstract class BinArgumentsBase : FixZeroArgumentsBase [Argument(ArgumentType.AtMostOnce, HelpText = "Max number of bins, power of 2 recommended", ShortName = "bins")] [TGUI(Label = "Max number of bins")] - public int NumBins = 1024; + public int NumBins = Defaults.NumBins; public override OneToOneColumn[] GetColumns() => Column; } @@ -196,7 +205,7 @@ public sealed class SupervisedBinArguments : BinArgumentsBase public string LabelColumn; [Argument(ArgumentType.AtMostOnce, HelpText = "Minimum number of examples per bin")] - public int MinBinSize = 10; + public int MinBinSize = Defaults.MinBinSize; } public const string MinMaxNormalizerSummary = "Normalizes the data based on the observed minimum and maximum values of the data."; @@ -262,7 +271,7 @@ public static NormalizeTransform Create(IHostEnvironment env, MinMaxArguments ar /// Name of the output column. /// Name of the column to be transformed. If this is null '' will be used. /// /// Whether to use CDF as the output. 
- public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = false) + public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = Defaults.MeanVarCdf) { var args = new MeanVarArguments() { @@ -300,7 +309,7 @@ public static NormalizeTransform Create(IHostEnvironment env, MeanVarArguments a /// Name of the output column. /// Name of the column to be transformed. If this is null '' will be used. /// /// Whether to use CDF as the output. - public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = true) + public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = Defaults.LogMeanVarCdf) { var args = new LogMeanVarArguments() { @@ -330,7 +339,7 @@ public static NormalizeTransform Create(IHostEnvironment env, LogMeanVarArgument return func; } - public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, int numBins = 1024) + public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, int numBins = Defaults.NumBins) { var args = new BinArguments() { @@ -360,6 +369,22 @@ public static NormalizeTransform Create(IHostEnvironment env, BinArguments args, return func; } + public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, string labelColumn, string name, string source = null, int numBins = Defaults.NumBins, int minBinSize = Defaults.MinBinSize) + { + var args = new SupervisedBinArguments() + { + Column = new[] { new BinColumn(){ + Source = source ?? 
name, + Name = name + } + }, + LabelColumn = labelColumn, + NumBins = numBins, + MinBinSize = minBinSize + }; + return Create(env, args, input); + } + /// /// Public create method corresponding to SignatureDataTransform. /// From e9b10234c09cf483a6d5c0bf2101a69783d5c3cc Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 26 Jun 2018 16:14:11 -0700 Subject: [PATCH 7/8] Addressed reviewers' comments. --- .../BootstrapSampleTransform.cs | 7 ++++- .../CategoricalHashTransform.cs | 8 +++++- src/Microsoft.ML.Transforms/GcnTransform.cs | 15 +++++++++-- .../NormalizeColumn.cs | 26 ++++++++++++++++--- 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs index 856f9cebe1..83f5681326 100644 --- a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs +++ b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs @@ -92,7 +92,12 @@ public BootstrapSampleTransform(IHostEnvironment env, Arguments args, IDataView /// The random seed. If unspecified random state will be instead derived from the environment. /// Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency. /// When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input. - public BootstrapSampleTransform(IHostEnvironment env, IDataView input, bool complement = Defaults.Complement, uint? seed = null, bool shuffleInput = Defaults.ShuffleInput, int poolSize = Defaults.PoolSize) + public BootstrapSampleTransform(IHostEnvironment env, + IDataView input, + bool complement = Defaults.Complement, + uint? 
seed = null, + bool shuffleInput = Defaults.ShuffleInput, + int poolSize = Defaults.PoolSize) : this(env, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input) { diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs index 38706ca190..5aec5658b2 100644 --- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs +++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs @@ -139,7 +139,13 @@ public sealed class Arguments : TransformInputBase /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit. /// The type of output expected. - public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source =null, int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash, CategoricalTransform.OutputKind outputKind = Defaults.OutputKind) + public static IDataTransform Create(IHostEnvironment env, + IDataView input, + string name, + string source =null, + int hashBits = Defaults.HashBits, + int invertHash = Defaults.InvertHash, + CategoricalTransform.OutputKind outputKind = Defaults.OutputKind) { var args = new Arguments() { diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 38e7cc15bc..fdb1d26c25 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -256,7 +256,13 @@ private static VersionInfo GetVersionInfo() /// Subtract mean from each value before normalizing. /// Normalize by standard deviation rather than L2 norm. /// Scale features by this value. 
- public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, bool subMean = Defaults.GcnSubMean, bool useStdDev = Defaults.UseStdDev, Float scale = Defaults.Scale) + public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, + IDataView input, + string name, + string source = null, + bool subMean = Defaults.GcnSubMean, + bool useStdDev = Defaults.UseStdDev, + Float scale = Defaults.Scale) { var args = new GcnArguments() { @@ -307,7 +313,12 @@ public LpNormNormalizerTransform(IHostEnvironment env, GcnArguments args, IDataV /// Name of the column to be transformed. If this is null '' will be used. /// /// The norm to use to normalize each sample. /// Subtract mean from each value before normalizing. - public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, NormalizerKind normKind = Defaults.NormKind, bool subMean = Defaults.LpSubMean) + public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, + IDataView input, + string name, + string source = null, + NormalizerKind normKind = Defaults.NormKind, + bool subMean = Defaults.LpSubMean) { var args = new Arguments() { diff --git a/src/Microsoft.ML.Transforms/NormalizeColumn.cs b/src/Microsoft.ML.Transforms/NormalizeColumn.cs index c876812011..f6e8851f51 100644 --- a/src/Microsoft.ML.Transforms/NormalizeColumn.cs +++ b/src/Microsoft.ML.Transforms/NormalizeColumn.cs @@ -271,7 +271,11 @@ public static NormalizeTransform Create(IHostEnvironment env, MinMaxArguments ar /// Name of the output column. /// Name of the column to be transformed. If this is null '' will be used. /// /// Whether to use CDF as the output. 
- public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = Defaults.MeanVarCdf) + public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, + IDataView input, + string name, + string source=null, + bool useCdf = Defaults.MeanVarCdf) { var args = new MeanVarArguments() { @@ -309,7 +313,11 @@ public static NormalizeTransform Create(IHostEnvironment env, MeanVarArguments a /// Name of the output column. /// Name of the column to be transformed. If this is null '' will be used. /// /// Whether to use CDF as the output. - public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = Defaults.LogMeanVarCdf) + public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, + IDataView input, + string name, + string source=null, + bool useCdf = Defaults.LogMeanVarCdf) { var args = new LogMeanVarArguments() { @@ -339,7 +347,11 @@ public static NormalizeTransform Create(IHostEnvironment env, LogMeanVarArgument return func; } - public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, int numBins = Defaults.NumBins) + public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, + IDataView input, + string name, + string source=null, + int numBins = Defaults.NumBins) { var args = new BinArguments() { @@ -369,7 +381,13 @@ public static NormalizeTransform Create(IHostEnvironment env, BinArguments args, return func; } - public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, string labelColumn, string name, string source = null, int numBins = Defaults.NumBins, int minBinSize = Defaults.MinBinSize) + public static NormalizeTransform CreateSupervisedBinningNormalizer(IHostEnvironment env, + IDataView input, + string labelColumn, + string name, + 
string source = null, + int numBins = Defaults.NumBins, + int minBinSize = Defaults.MinBinSize) { var args = new SupervisedBinArguments() { From 05637d0c2407bcfd34d391cb670c4037d4edddeb Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 26 Jun 2018 17:20:44 -0700 Subject: [PATCH 8/8] Resolved some formatting issues. --- src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs | 1 - src/Microsoft.ML.Data/Transforms/NAFilter.cs | 1 - src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs | 1 - 3 files changed, 3 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs index c2352f9e41..2729a48e3e 100644 --- a/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs @@ -74,7 +74,6 @@ private static VersionInfo GetVersionInfo() public CopyColumnsTransform(IHostEnvironment env, IDataView input, string name, string source) : this(env, new Arguments(){ Column = new[] { new Column() { Source = source, Name = name }}}, input) { - } public CopyColumnsTransform(IHostEnvironment env, Arguments args, IDataView input) diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs index 1f92101bf3..7b94ff1e07 100644 --- a/src/Microsoft.ML.Data/Transforms/NAFilter.cs +++ b/src/Microsoft.ML.Data/Transforms/NAFilter.cs @@ -87,7 +87,6 @@ private static VersionInfo GetVersionInfo() public NAFilter(IHostEnvironment env, IDataView input, bool complement = Defaults.Complement, params string[] columns) : this(env, new Arguments() { Column = columns, Complement = complement }, input) { - } public NAFilter(IHostEnvironment env, Arguments args, IDataView input) diff --git a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs index 83f5681326..72e7030604 100644 --- a/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs +++ 
b/src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs @@ -100,7 +100,6 @@ public BootstrapSampleTransform(IHostEnvironment env, int poolSize = Defaults.PoolSize) : this(env, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input) { - } private BootstrapSampleTransform(IHost host, ModelLoadContext ctx, IDataView input)