Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions src/Microsoft.ML.Data/Transforms/ConcatTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,19 @@ public bool TryUnparse(StringBuilder sb)

public sealed class Arguments : TransformInputBase
{
public Arguments()
{
}

public Arguments(string name, params string[] source)
{
Column = new[] { new Column()
{
Name = name,
Source = source
}};
}

[Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:srcs)", ShortName = "col", SortOrder = 1)]
public Column[] Column;
}
Expand Down Expand Up @@ -527,6 +540,18 @@ private static VersionInfo GetVersionInfo()

public override ISchema Schema => _bindings;

/// <summary>
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Input columns to concatenate.</param>
public ConcatTransform(IHostEnvironment env, IDataView input, string name, params string[] source)
: this(env, new Arguments(name, source), input)
{
}

/// <summary>
/// Public constructor corresponding to SignatureDataTransform.
/// </summary>
Expand Down
13 changes: 13 additions & 0 deletions src/Microsoft.ML.Data/Transforms/CopyColumnsTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ private static VersionInfo GetVersionInfo()

private const string RegistrationName = "CopyColumns";

/// <summary>
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be copied.</param>
public CopyColumnsTransform(IHostEnvironment env, IDataView input, string name, string source)
: this(env, new Arguments(){ Column = new[] { new Column() { Source = source, Name = name }}}, input)
{

}
Copy link
Contributor

@TomFinley TomFinley Jun 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Blank lines with trailing whitespace on them is evil. #Resolved


public CopyColumnsTransform(IHostEnvironment env, Arguments args, IDataView input)
: base(env, RegistrationName, env.CheckRef(args, nameof(args)).Column, input, null)
{
Expand Down
24 changes: 24 additions & 0 deletions src/Microsoft.ML.Data/Transforms/DropColumnsTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,17 @@ private static VersionInfo GetVersionInfo()
private const string DropRegistrationName = "DropColumns";
private const string KeepRegistrationName = "KeepColumns";

/// <summary>
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="columnsToDrop">Name of the columns to be dropped.</param>
public DropColumnsTransform(IHostEnvironment env, IDataView input, params string[] columnsToDrop)
:this(env, new Arguments() { Column = columnsToDrop }, input)
{
}
Copy link
Contributor

@TomFinley TomFinley Jun 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In order to keep columns, we do DropColumnsTransform.CreateColumnSelector. :) The way that we resolve this on the command line, entry-points, the GUI, and whatnot, is we have this "KeepColumnsTransform" loadname declared, to make it look like there is another transform.

The fact that we do so in the underlying C# code is just because previously this little bit of confusion didn't matter, since we weren't advocating that people use this directly. However since we now are, we ought to adjust accordingly.

For that reason, I wonder if we can make the relationship explicit...

public static class KeepColumnsTransform
{
    public IDataTransform Create(IHostEnvironment env, IDataView input, params string[] columnsToKeep)
        => new DropColumnsTransform(env, new KeepArguments() { Column = columnsToKeep }, input);
}

or something along these lines.

Once that ambiguity is resolved, we can change "drop" creator back to a plain old constructor. #Closed


/// <summary>
/// Public constructor corresponding to SignatureDataTransform.
/// </summary>
Expand Down Expand Up @@ -383,4 +394,17 @@ public ValueGetter<TValue> GetGetter<TValue>(int col)
}
}
}

public class KeepColumnsTransform
{
/// <summary>
/// A helper method to create <see cref="KeepColumnsTransform"/> for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="columnsToKeep">Name of the columns to be kept. All other columns will be removed.</param>
/// <returns></returns>
public static IDataTransform Create(IHostEnvironment env, IDataView input, params string[] columnsToKeep)
=> new DropColumnsTransform(env, new DropColumnsTransform.KeepArguments() { Column = columnsToKeep }, input);
}
}
13 changes: 13 additions & 0 deletions src/Microsoft.ML.Data/Transforms/NAFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ private static VersionInfo GetVersionInfo()
private readonly bool _complement;
private const string RegistrationName = "MissingValueFilter";

/// <summary>
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="complement">If true, keep only rows that contain NA values, and filter the rest.</param>
/// <param name="columns">Name of the columns. Only these columns will be used to filter rows having 'NA' values.</param>
public NAFilter(IHostEnvironment env, IDataView input, bool complement = false, params string[] columns)
: this(env, new Arguments() { Column = columns, Complement = complement }, input)
{

}

public NAFilter(IHostEnvironment env, Arguments args, IDataView input)
: base(env, RegistrationName, input)
{
Expand Down
15 changes: 15 additions & 0 deletions src/Microsoft.ML.Transforms/BootstrapSampleTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,21 @@ public BootstrapSampleTransform(IHostEnvironment env, Arguments args, IDataView
_poolSize = args.PoolSize;
}

/// <summary>
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="complement">Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.</param>
/// <param name="seed">The random seed. If unspecified random state will be instead derived from the environment.</param>
/// <param name="shuffleInput">Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency.</param>
/// <param name="poolSize">When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input.</param>
public BootstrapSampleTransform(IHostEnvironment env, IDataView input, bool complement = false, uint? seed = null, bool shuffleInput = true, int poolSize = 1000)
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Jun 25, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bool complement = false, uint? seed = null, bool shuffleInput = true, int poolSize = 1000 [](start = 79, length = 89)

out of curiosity, why argument object is bad solution? why we want to flatten that object in set of parameters?
I'm also concern what we will have to support this defaults in two places, and it really error prone. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so that transform can be neatly instantiated on one line. Otherwise for every transform a relevant "Argument" object is needed to be created first which is also another option.


In reply to: 197909492 [](ancestors = 197909492)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see why it can be helpful in case if you have only Column object inside argument class, you add one more constructor which is nice to use. But in this particular case, i don't see any advantages of using this constructor, and it makes code maintainability much harder.
Also compare this constructor with Cathash constructor.
In cathash you hide all other parameters from argument class other than columns, and here you expose them.
Can we be consistent?

I don't mind to have constructor which will take only Columns information and will use default parameters for internal argument class, but make sure to proper comment which indicates what if user needs more, it need to use Arguments class


In reply to: 197912065 [](ancestors = 197912065,197909492)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but again, feel free to bring heavy artillery (@TomFinley)


In reply to: 197921908 [](ancestors = 197921908,197912065,197909492)

Copy link
Contributor Author

@zeahmed zeahmed Jun 25, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this case, definitely constructor can be removed. But I have a concern regarding if user has to instantiate an Argument object for a very simple case e.g. for all default values.

Another option could be

public BootstrapSampleTransform(IHostEnvironment env, IDataView input)
: this(env, new Arguments() { }, input)
{
}

Yes, for CatHash more parameters can be added. Waiting for more comments on this. #Closed

Copy link
Contributor

@TomFinley TomFinley Jun 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TBH I actually rather liked the constructor as it stands -- I do not consider the proposed replacement BootstrapSampleTransform(IHostEnvironment env, IDataView input) acceptable because the bootstrap sampler I expect is one of those things that, when it is used, will be used in a pair: first to get the in-bag sample (for training), then to get the out of bag sample (for validation and testing). Whether we expose the other things as parameters or not, I am less certain about the utility of these.

@Ivanidzo4ka , is the concern with having a convenience constructor with default arguments, that the defaults could diverge? If that is the case, would perhaps prefer constants (either directly as private const in the transform class itself, or perhaps more ideally as public const in a nested private static class named Defaults or something).

If the point is rather that there should be exactly one way to do this, I'm not entirely certain I agree -- the arguments object may need to exist for the command line parser and entry-points which (being reflection driven) require total uniformity in how things are instantiated, but this bears very little resemblance to how someone, if trying to write a .NET API, would actually write code. If it is a small "shim" as it is now over the arguments, I'm not sure I see the harm, and I see a considerable benefit. #Resolved

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just don't see much reason to have constructor which duplicate other constructor with one difference: instead of one object we accept all the fields of this object.
I just don't understand why it's better and what convenience it brings.
I'm perfectly fine with something like

class(main_option)
class(main_option, secondary_parameter)
i just don't understand why you want to have
class(A,B,C,D,E,F)
and
class(argument_object) where argument_object has fields A,B,C,D,E,F


In reply to: 197983331 [](ancestors = 197983331)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I disagree. Where they can be applied, the new idioms are clearly easier to use and understand.


In reply to: 198273557 [](ancestors = 198273557,197983331)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is based on the convenience you get in calling constructor. In my point of view, calling

new BootstrapSampleTransform(env, input, Complement,  Seed, ShuffleInput, PoolSize);

is more convenient and elegant than calling

new BootstrapSampleTransform(env, input, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize });

In reply to: 198273557 [](ancestors = 198273557,197983331)

: this(env, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input)
{

}

private BootstrapSampleTransform(IHost host, ModelLoadContext ctx, IDataView input)
: base(host, input)
{
Expand Down
26 changes: 26 additions & 0 deletions src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,32 @@ public sealed class Arguments : TransformInputBase

public const string UserName = "Categorical Hash Transform";

/// <summary>
/// A helper method to create <see cref="CategoricalHashTransform"/> for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
/// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
/// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index).</param>
Copy link
Contributor

@TomFinley TomFinley Jun 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bag (multi-set vector), Ind (indicator vector), or Key (index [](start = 50, length = 61)

This would be better handled by XML comments on the enum itself, I'd think. #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added comments to enum directly and updated comments here.


In reply to: 198231959 [](ancestors = 198231959)

public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source =null, int hashBits = 16, int invertHash = 0, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Bag)
{
var args = new Arguments()
{
Column = new[] { new Column(){
Source = source ?? name,
Name = name
}
},
HashBits = hashBits,
InvertHash = invertHash,
OutputKind = outputKind
};
return Create(env, args, input);
}

public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
22 changes: 22 additions & 0 deletions src/Microsoft.ML.Transforms/CategoricalTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,28 @@ public Arguments()

public const string UserName = "Categorical Transform";

/// <summary>
/// A helper method to create <see cref="CategoricalTransform"/> for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
/// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index).</param>
public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source = null, OutputKind outputKind = OutputKind.Ind)
{
var args = new Arguments()
{
Column = new[] { new Column(){
Source = source ?? name,
Name = name
}
},
OutputKind = outputKind
};
return Create(env, args, input);
}

public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
18 changes: 18 additions & 0 deletions src/Microsoft.ML.Transforms/CountFeatureSelection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,24 @@ public sealed class Arguments : TransformInputBase

internal static string RegistrationName = "CountFeatureSelectionTransform";

/// <summary>
/// A helper method to create CountFeatureSelection transform for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="count">If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved.</param>
/// <param name="columns">Columns to use for feature selection.</param>
/// <returns></returns>
public static IDataTransform Create(IHostEnvironment env, IDataView input, long count = 1, params string[] columns)
{
var args = new Arguments()
{
Column = columns,
Count = count
};
return Create(env, args, input);
}

/// <summary>
/// Create method corresponding to SignatureDataTransform.
/// </summary>
Expand Down
54 changes: 52 additions & 2 deletions src/Microsoft.ML.Transforms/GcnTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,32 @@ private static VersionInfo GetVersionInfo()

private readonly ColInfoEx[] _exes;

/// <summary>
/// A helper method to create GlobalContrastNormalizer transform for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
/// <param name="subMean">Subtract mean from each value before normalizing.</param>
/// <param name="useStdDev">Normalize by standard deviation rather than L2 norm.</param>
/// <param name="scale">Scale features by this value.</param>
public static IDataTransform CreateGlobalContrastNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, bool subMean = true, bool useStdDev = false, Float scale = 1)
{
var args = new GcnArguments()
{
Column = new[] { new GcnColumn(){
Source = source ?? name,
Name = name
}
},
SubMean = subMean,
UseStdDev = useStdDev,
Scale = scale
};
return new LpNormNormalizerTransform(env, args, input);
}

/// <summary>
/// Public constructor corresponding to SignatureDataTransform.
/// </summary>
Expand All @@ -263,9 +289,33 @@ public LpNormNormalizerTransform(IHostEnvironment env, GcnArguments args, IDataV
SetMetadata();
}

/// <summary>
/// A helper method to create LpNormNormalizer transform for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
/// /// <param name="normKind">The norm to use to normalize each sample.</param>
/// <param name="subMean">Subtract mean from each value before normalizing.</param>
public static IDataTransform CreateLpNormNormalizer(IHostEnvironment env, IDataView input, string name, string source = null, NormalizerKind normKind = NormalizerKind.L2Norm, bool subMean = false)
{
var args = new Arguments()
{
Column = new[] { new Column(){
Source = source ?? name,
Name = name
}
},
SubMean = subMean,
NormKind = normKind
};
return new LpNormNormalizerTransform(env, args, input);
}

public LpNormNormalizerTransform(IHostEnvironment env, Arguments args, IDataView input)
: base(env, RegistrationName, env.CheckRef(args, nameof(args)).Column,
input, TestIsFloatVector)
: base(env, RegistrationName, env.CheckRef(args, nameof(args)).Column,
input, TestIsFloatVector)
{
Host.AssertNonEmpty(Infos);
Host.Assert(Infos.Length == Utils.Size(args.Column));
Expand Down
78 changes: 78 additions & 0 deletions src/Microsoft.ML.Transforms/NormalizeColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,26 @@ public sealed class SupervisedBinArguments : BinArgumentsBase
public const string BinNormalizerShortName = "Bin";
public const string SupervisedBinNormalizerShortName = "SupBin";

/// <summary>
/// A helper method to create MinMaxNormalizer transform for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
public static NormalizeTransform CreateMinMaxNormalizer(IHostEnvironment env, IDataView input, string name, string source = null)
{
var args = new MinMaxArguments()
{
Column = new[] { new AffineColumn(){
Source = source ?? name,
Name = name
}
}
};
return Create(env, args, input);
}

/// <summary>
/// Public create method corresponding to SignatureDataTransform.
/// </summary>
Expand All @@ -234,6 +254,28 @@ public static NormalizeTransform Create(IHostEnvironment env, MinMaxArguments ar
return func;
}

/// <summary>
/// A helper method to create MeanVarNormalizer transform for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
/// /// <param name="useCdf">Whether to use CDF as the output.</param>
public static NormalizeTransform CreateMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = false)
{
var args = new MeanVarArguments()
{
Column = new[] { new AffineColumn(){
Source = source ?? name,
Name = name
}
},
UseCdf = useCdf
};
return Create(env, args, input);
}

/// <summary>
/// Public create method corresponding to SignatureDataTransform.
/// </summary>
Expand All @@ -250,6 +292,28 @@ public static NormalizeTransform Create(IHostEnvironment env, MeanVarArguments a
return func;
}

/// <summary>
/// A helper method to create LogMeanVarNormalizer transform for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
/// /// <param name="useCdf">Whether to use CDF as the output.</param>
public static NormalizeTransform CreateLogMeanVarNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, bool useCdf = true)
{
var args = new LogMeanVarArguments()
{
Column = new[] { new LogNormalColumn(){
Source = source ?? name,
Name = name
}
},
UseCdf = useCdf
};
return Create(env, args, input);
}

/// <summary>
/// Public create method corresponding to SignatureDataTransform.
/// </summary>
Expand All @@ -266,6 +330,20 @@ public static NormalizeTransform Create(IHostEnvironment env, LogMeanVarArgument
return func;
}

public static NormalizeTransform CreateBinningNormalizer(IHostEnvironment env, IDataView input, string name, string source=null, int numBins = 1024)
{
var args = new BinArguments()
{
Column = new[] { new BinColumn(){
Source = source ?? name,
Name = name
}
},
NumBins = numBins
};
return Create(env, args, input);
}

/// <summary>
/// Public create method corresponding to SignatureDataTransform.
/// </summary>
Expand Down