Skip to content

Commit b25c9f8

Browse files
sfilipieerhardt
authored and committed
XML docs for trainers and minor infrastructure changes (dotnet#455)
* updating the C# API generator to append the Remarks XML to the generated Summary class XML. Adding documentation details and references for another batch of the trainers. * correcting test trigger condition * typo * More documentation * substituting the <see> and <seealso> tags with <a> tags, since there is no official documentation on using href with those tags. * addressing PR comments. Fixing new line character in the ep_list and manifest. * Fixing the list and code generation merges from master
1 parent e363c98 commit b25c9f8

34 files changed

+527
-121
lines changed

src/Microsoft.ML.Core/EntryPoints/ModuleArgs.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,11 @@ public sealed class EntryPointAttribute : Attribute
527527
/// Short name of the Entry Point
528528
/// </summary>
529529
public string ShortName { get; set; }
530+
531+
/// <summary>
532+
/// Remarks on the Entry Point, for more extensive XML documentation on the C# API
533+
/// </summary>
534+
public string Remarks { get; set; }
530535
}
531536

532537
/// <summary>

src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ public sealed class EntryPointInfo
4444
public readonly string Description;
4545
public readonly string ShortName;
4646
public readonly string FriendlyName;
47+
public readonly string Remarks;
4748
public readonly MethodInfo Method;
4849
public readonly Type InputType;
4950
public readonly Type OutputType;
@@ -63,6 +64,7 @@ internal EntryPointInfo(IExceptionContext ectx, MethodInfo method,
6364
Method = method;
6465
ShortName = attribute.ShortName;
6566
FriendlyName = attribute.UserName;
67+
Remarks = attribute.Remarks;
6668
ObsoleteAttribute = obsoleteAttribute;
6769

6870
// There are supposed to be 2 parameters, env and input for non-macro nodes.

src/Microsoft.ML.FastTree/FastTree.cs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,31 @@ public abstract class FastTreeTrainerBase<TArgs, TPredictor> :
8282

8383
protected string InnerArgs => CmdParser.GetSettings(Host, Args, new TArgs());
8484

85+
internal const string Remarks = @"<remarks>
86+
<para>FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm.
87+
Gradient boosting is a machine learning technique for regression problems.
88+
It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next.
89+
So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
90+
</para>
91+
<para>
92+
MART learns an ensemble of regression trees, each of which is a decision tree with scalar values in its leaves.
93+
A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input.
94+
At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x <= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature.
95+
The functions that can be produced by a regression tree are all the piece-wise constant functions.
96+
</para>
97+
<para>
98+
The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
99+
The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
100+
</para>
101+
<list type='bullet'>
102+
<item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
103+
<item>In case of a regression problem, the output is the predicted value of the function.</item>
104+
<item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
105+
</list>
106+
<a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
107+
<a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a>.
108+
</remarks>";
109+
85110
public override bool NeedNormalization => false;
86111

87112
public override bool WantCaching => false;

src/Microsoft.ML.FastTree/FastTreeClassification.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,11 @@ public void AdjustTreeOutputs(IChannel ch, RegressionTree tree,
338338

339339
public static partial class FastTree
340340
{
341-
[TlcModule.EntryPoint(Name = "Trainers.FastTreeBinaryClassifier", Desc = FastTreeBinaryClassificationTrainer.Summary, UserName = FastTreeBinaryClassificationTrainer.UserNameValue, ShortName = FastTreeBinaryClassificationTrainer.ShortName)]
341+
[TlcModule.EntryPoint(Name = "Trainers.FastTreeBinaryClassifier",
342+
Desc = FastTreeBinaryClassificationTrainer.Summary,
343+
Remarks = FastTreeBinaryClassificationTrainer.Remarks,
344+
UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
345+
ShortName = FastTreeBinaryClassificationTrainer.ShortName)]
342346
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
343347
{
344348
Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.FastTree/FastTreeRanking.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1096,7 +1096,11 @@ public static FastTreeRankingPredictor Create(IHostEnvironment env, ModelLoadCon
10961096

10971097
public static partial class FastTree
10981098
{
1099-
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRanker", Desc = FastTreeRankingTrainer.Summary, UserName = FastTreeRankingTrainer.UserNameValue, ShortName = FastTreeRankingTrainer.ShortName)]
1099+
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRanker",
1100+
Desc = FastTreeRankingTrainer.Summary,
1101+
Remarks = FastTreeRankingTrainer.Remarks,
1102+
UserName = FastTreeRankingTrainer.UserNameValue,
1103+
ShortName = FastTreeRankingTrainer.ShortName)]
11001104
public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
11011105
{
11021106
Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.FastTree/FastTreeRegression.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,11 @@ public static FastTreeRegressionPredictor Create(IHostEnvironment env, ModelLoad
448448

449449
public static partial class FastTree
450450
{
451-
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRegressor", Desc = FastTreeRegressionTrainer.Summary, UserName = FastTreeRegressionTrainer.UserNameValue, ShortName = FastTreeRegressionTrainer.ShortName)]
451+
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRegressor",
452+
Desc = FastTreeRegressionTrainer.Summary,
453+
Remarks = FastTreeRegressionTrainer.Remarks,
454+
UserName = FastTreeRegressionTrainer.UserNameValue,
455+
ShortName = FastTreeRegressionTrainer.ShortName)]
452456
public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
453457
{
454458
Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.FastTree/FastTreeTweedie.cs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,11 @@ public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase
3636
{
3737
public const string LoadNameValue = "FastTreeTweedieRegression";
3838
public const string UserNameValue = "FastTree (Boosted Trees) Tweedie Regression";
39-
public const string Summary = "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner " +
40-
"is a generalization of Poisson, compound Poisson, and gamma regression.";
39+
public const string Summary = "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.";
40+
new public const string Remarks = @"<remarks>
41+
<a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>
42+
<a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a>
43+
</remarks>";
4144

4245
public const string ShortName = "fttweedie";
4346

@@ -460,7 +463,10 @@ protected override void Map(ref VBuffer<float> src, ref float dst)
460463

461464
public static partial class FastTree
462465
{
463-
[TlcModule.EntryPoint(Name = "Trainers.FastTreeTweedieRegressor", Desc = FastTreeTweedieTrainer.Summary, UserName = FastTreeTweedieTrainer.UserNameValue, ShortName = FastTreeTweedieTrainer.ShortName)]
466+
[TlcModule.EntryPoint(Name = "Trainers.FastTreeTweedieRegressor",
467+
Desc = FastTreeTweedieTrainer.Summary,
468+
UserName = FastTreeTweedieTrainer.UserNameValue,
469+
ShortName = FastTreeTweedieTrainer.ShortName)]
464470
public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
465471
{
466472
Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.FastTree/RandomForest.cs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,28 @@ public abstract class RandomForestTrainerBase<TArgs, TPredictor> : FastTreeTrain
1212
where TArgs : FastForestArgumentsBase, new()
1313
where TPredictor : IPredictorProducing<Float>
1414
{
15+
new internal const string Remarks = @"<remarks>
16+
Decision trees are non-parametric models that perform a sequence of simple tests on inputs.
17+
This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed.
18+
A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
19+
<para>Decision trees have several advantages:</para>
20+
<list type='bullet'>
21+
<item>They are efficient in both computation and memory usage during training and prediction. </item>
22+
<item>They can represent non-linear decision boundaries.</item>
23+
<item>They perform integrated feature selection and classification. </item>
24+
<item>They are resilient in the presence of noisy features.</item>
25+
</list>
26+
Fast forest is a random forest implementation.
27+
The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction.
28+
An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
29+
This decision forest classifier consists of an ensemble of decision trees.
30+
Generally, ensemble models provide better coverage and accuracy than single decision trees.
31+
Each tree in a decision forest outputs a Gaussian distribution.
32+
<a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
33+
<a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
34+
<a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
35+
</remarks>";
36+
1537
private readonly bool _quantileEnabled;
1638

1739
protected RandomForestTrainerBase(IHostEnvironment env, TArgs args, bool quantileEnabled = false)

src/Microsoft.ML.FastTree/RandomForestClassification.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,11 @@ protected override void GetGradientInOneQuery(int query, int threadIndex)
208208

209209
public static partial class FastForest
210210
{
211-
[TlcModule.EntryPoint(Name = "Trainers.FastForestBinaryClassifier", Desc = FastForestClassification.Summary, UserName = FastForestClassification.UserNameValue, ShortName = FastForestClassification.ShortName)]
211+
[TlcModule.EntryPoint(Name = "Trainers.FastForestBinaryClassifier",
212+
Desc = FastForestClassification.Summary,
213+
Remarks = FastForestClassification.Remarks,
214+
UserName = FastForestClassification.UserNameValue,
215+
ShortName = FastForestClassification.ShortName)]
212216
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
213217
{
214218
Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.FastTree/RandomForestRegression.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,11 @@ public BasicImpl(Dataset trainData, Arguments args)
280280

281281
public static partial class FastForest
282282
{
283-
[TlcModule.EntryPoint(Name = "Trainers.FastForestRegressor", Desc = FastForestRegression.Summary, UserName = FastForestRegression.LoadNameValue, ShortName = FastForestRegression.ShortName)]
283+
[TlcModule.EntryPoint(Name = "Trainers.FastForestRegressor",
284+
Desc = FastForestRegression.Summary,
285+
Remarks = FastForestRegression.Remarks,
286+
UserName = FastForestRegression.LoadNameValue,
287+
ShortName = FastForestRegression.ShortName)]
284288
public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
285289
{
286290
Contracts.CheckValue(env, nameof(env));

0 commit comments

Comments
 (0)