Skip to content

Commit

Permalink
[release/4.0] Fix up docs for MLContext (#7363)
Browse files Browse the repository at this point in the history
* fix up docs for MLContext

* some more fixes

* text class and sentence similarity trainers

---------

Co-authored-by: Genevieve Warren <24882762+gewarren@users.noreply.github.com>
  • Loading branch information
github-actions[bot] and gewarren authored Jan 14, 2025
1 parent b489340 commit 1fc5296
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 61 deletions.
40 changes: 20 additions & 20 deletions src/Microsoft.ML.AutoML/API/ColumnInference.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace Microsoft.ML.AutoML
public sealed class ColumnInferenceResults
{
/// <summary>
/// Inferred <see cref="TextLoader.Options" /> for the dataset.
/// Gets the inferred <see cref="TextLoader.Options" /> for the dataset.
/// </summary>
/// <remarks>
/// Can be used to instantiate a new <see cref="TextLoader" /> to load
Expand All @@ -25,69 +25,69 @@ public sealed class ColumnInferenceResults
public TextLoader.Options TextLoaderOptions { get; internal set; }

/// <summary>
/// Information about the inferred columns in the dataset.
/// Gets information about the inferred columns in the dataset.
/// </summary>
/// <remarks>
/// <para>Contains the inferred purposes of each column. See <see cref="AutoML.ColumnInformation"/> for more details.</para>
/// <para>This can be fed to the AutoML API when running an experiment.
/// See <typeref cref="ExperimentBase{TMetrics, TExperimentSettings}.Execute(IDataView, ColumnInformation, IEstimator{ITransformer}, System.IProgress{RunDetail{TMetrics}})" />
/// for example.</para>
/// <para>This value can be fed to the AutoML API when running an experiment.
/// See <see cref="ExperimentBase{TMetrics, TExperimentSettings}.Execute(IDataView, ColumnInformation, IEstimator{ITransformer}, System.IProgress{RunDetail{TMetrics}})" />, for example.</para>
/// </remarks>
[JsonProperty(DefaultValueHandling = DefaultValueHandling.Include)]
public ColumnInformation ColumnInformation { get; internal set; }
}

/// <summary>
/// Information about the columns in a dataset.
/// Provides information about the columns in a dataset.
/// </summary>
/// <remarks>
/// <para>Contains information about the purpose of each column in the dataset. For instance,
/// it enumerates the dataset columns that AutoML should treat as categorical,
/// the columns AutoML should ignore, which column is the label, etc.</para>
/// <para><see cref="ColumnInformation"/> can be fed to the AutoML API when running an experiment.
/// See <typeref cref="ExperimentBase{TMetrics, TExperimentSettings}.Execute(IDataView, ColumnInformation, IEstimator{ITransformer}, System.IProgress{RunDetail{TMetrics}})" />
/// for example.</para>
/// See <see cref="ExperimentBase{TMetrics, TExperimentSettings}.Execute(IDataView, ColumnInformation, IEstimator{ITransformer}, System.IProgress{RunDetail{TMetrics}})" />, for example.</para>
/// </remarks>
public sealed class ColumnInformation
{
/// <summary>
/// The dataset column to use as the label.
/// Gets or sets the dataset column to use as the label.
/// </summary>
/// <value>The default value is "Label".</value>
public string LabelColumnName { get; set; }

/// <summary>
/// The dataset column to use as a user ID for computation.
/// Gets or sets the dataset column to use as a user ID for computation.
/// </summary>
public string UserIdColumnName { get; set; }

/// <summary>
/// The dataset column to use as a group ID for computation in a Ranking Task.
/// Gets or sets the dataset column to use as a group ID for computation in a Ranking Task.
/// If a SamplingKeyColumnName is provided, then it should be the same as this column.
/// </summary>
public string GroupIdColumnName { get; set; }

/// <summary>
/// The dataset column to use as a item ID for computation.
/// Gets or sets the dataset column to use as an item ID for computation.
/// </summary>
public string ItemIdColumnName { get; set; }

/// <summary>
/// The dataset column to use for example weight.
/// Gets or sets the dataset column to use for example weight.
/// </summary>
public string ExampleWeightColumnName { get; set; }

/// <summary>
/// The dataset column to use for grouping rows.
/// Gets or sets the dataset column to use for grouping rows.
/// </summary>
/// <remarks>
/// If two examples share the same value in the sampling key column,
/// they are guaranteed to appear in the same subset (train or test).
/// This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/>, no row grouping will be performed.
/// </summary>
/// </remarks>
public string SamplingKeyColumnName { get; set; }

/// <summary>
/// The dataset columns that are categorical.
/// Gets the dataset columns that are categorical.
/// </summary>
/// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
/// <remarks>
Expand All @@ -97,28 +97,28 @@ public sealed class ColumnInformation
public ICollection<string> CategoricalColumnNames { get; private set; }

/// <summary>
/// The dataset columns that are numeric.
/// Gets the dataset columns that are numeric.
/// </summary>
/// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
[JsonProperty]
public ICollection<string> NumericColumnNames { get; private set; }

/// <summary>
/// The dataset columns that are text.
/// Gets the dataset columns that are text.
/// </summary>
/// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
[JsonProperty]
public ICollection<string> TextColumnNames { get; private set; }

/// <summary>
/// The dataset columns that AutoML should ignore.
/// Gets the dataset columns that AutoML should ignore.
/// </summary>
/// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
[JsonProperty]
public ICollection<string> IgnoredColumnNames { get; private set; }

/// <summary>
/// The dataset columns that are image paths.
/// Gets the dataset columns that are image paths.
/// </summary>
/// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
[JsonProperty]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ public static ColumnInformation BuildColumnInfo(IEnumerable<DatasetColumnInfo> c
}

/// <summary>
/// Get all column names that are in <paramref name="columnInformation"/>.
/// Gets all column names that are in <paramref name="columnInformation"/>.
/// </summary>
/// <param name="columnInformation">Column information.</param>
public static IEnumerable<string> GetColumnNames(ColumnInformation columnInformation)
Expand Down
47 changes: 27 additions & 20 deletions src/Microsoft.ML.Data/MLContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,72 +11,78 @@
namespace Microsoft.ML
{
/// <summary>
/// The common context for all ML.NET operations. Once instantiated by the user, it provides a way to
/// Represents the common context for all ML.NET operations.
/// </summary>
/// <remarks>
/// Once instantiated by the user, this class provides a way to
/// create components for data preparation, feature engineering, training, prediction, and model evaluation.
/// It also allows logging, execution control, and the ability to set repeatable random numbers.
/// </summary>
/// </remarks>
public sealed class MLContext : IHostEnvironmentInternal
{
// REVIEW: consider making LocalEnvironment and MLContext the same class instead of encapsulation.
private readonly LocalEnvironment _env;

/// <summary>
/// Trainers and tasks specific to binary classification problems.
/// Gets the trainers and tasks specific to binary classification problems.
/// </summary>
public BinaryClassificationCatalog BinaryClassification { get; }

/// <summary>
/// Trainers and tasks specific to multiclass classification problems.
/// Gets the trainers and tasks specific to multiclass classification problems.
/// </summary>
public MulticlassClassificationCatalog MulticlassClassification { get; }

/// <summary>
/// Trainers and tasks specific to regression problems.
/// Gets the trainers and tasks specific to regression problems.
/// </summary>
public RegressionCatalog Regression { get; }

/// <summary>
/// Trainers and tasks specific to clustering problems.
/// Gets the trainers and tasks specific to clustering problems.
/// </summary>
public ClusteringCatalog Clustering { get; }

/// <summary>
/// Trainers and tasks specific to ranking problems.
/// Gets the trainers and tasks specific to ranking problems.
/// </summary>
public RankingCatalog Ranking { get; }

/// <summary>
/// Trainers and tasks specific to anomaly detection problems.
/// Gets the trainers and tasks specific to anomaly detection problems.
/// </summary>
public AnomalyDetectionCatalog AnomalyDetection { get; }

/// <summary>
/// Trainers and tasks specific to forecasting problems.
/// Gets the trainers and tasks specific to forecasting problems.
/// </summary>
public ForecastingCatalog Forecasting { get; }

/// <summary>
/// Data processing operations.
/// Gets the data processing operations.
/// </summary>
public TransformsCatalog Transforms { get; }

/// <summary>
/// Operations with trained models.
/// Gets the operations with trained models.
/// </summary>
public ModelOperationsCatalog Model { get; }

/// <summary>
/// Data loading and saving.
/// Gets the data loading and saving operations.
/// </summary>
public DataOperationsCatalog Data { get; }

// REVIEW: I think it's valuable to have the simplest possible interface for logging interception here,
// and expand if and when necessary. Exposing classes like ChannelMessage, MessageSensitivity and so on
// looks premature at this point.
/// <summary>
/// The handler for the log messages.
/// Represents the callback method that will handle the log messages.
/// </summary>
public event EventHandler<LoggingEventArgs> Log;

/// <summary>
/// This is a catalog of components that will be used for model loading.
/// Gets the catalog of components that will be used for model loading.
/// </summary>
public ComponentCatalog ComponentCatalog => _env.ComponentCatalog;

Expand All @@ -90,7 +96,8 @@ public string TempFilePath
}

/// <summary>
/// Allow falling back to run on CPU if couldn't run on GPU.
/// Gets or sets a value that indicates whether the CPU will
/// be used if the task couldn't run on GPU.
/// </summary>
public bool FallbackToCpu
{
Expand All @@ -99,7 +106,7 @@ public bool FallbackToCpu
}

/// <summary>
/// GPU device ID to run execution on, <see langword="null" /> to run on CPU.
/// Gets or sets the GPU device ID to run execution on, <see langword="null" /> to run on CPU.
/// </summary>
public int? GpuDeviceId
{
Expand All @@ -120,17 +127,17 @@ public int? GpuDeviceId
///
/// If a fixed seed is provided by <paramref name="seed"/>, MLContext environment becomes
/// deterministic, meaning that the results are repeatable and will remain the same across multiple runs.
/// For instance in many of ML.NET's API reference example code snippets, a seed is provided.
/// For instance, in many of ML.NET's API reference example code snippets, a seed is provided.
/// That's because we want the users to get the same output as what's included in example comments,
/// when they run the example on their own machine.
///
/// Generally though, repeatability is not a requirement and that's the default behavior.
/// If a seed is not provided by <paramref name="seed"/>, i.e. it's set to <see langword="null"/>,
/// If a seed is not provided by <paramref name="seed"/>, that is, it's set to <see langword="null"/>,
/// MLContext environment becomes non-deterministic and outputs change across multiple runs.
///
/// There are many operations in ML.NET that don't use any randomness, such as
/// min-max normalization, concatenating columns, missing value indication, etc.
/// The behavior of those operations are deterministic regardless of the seed value.
/// min-max normalization, concatenating columns, and missing value indication.
/// The behavior of those operations is deterministic regardless of the seed value.
///
/// Also, ML.NET trainers don't use randomness *after* the training is finished.
/// So, the predictions from a loaded model don't depend on the seed value.
Expand Down
21 changes: 11 additions & 10 deletions src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,31 +27,32 @@
namespace Microsoft.ML.TorchSharp.NasBert
{
/// <summary>
/// The <see cref="IEstimator{TTransformer}"/> for training a Deep Neural Network(DNN) to classify text.
/// Represents the <see cref="IEstimator{TTransformer}"/> for training a Deep Neural Network (DNN) to determine sentence similarity.
/// </summary>
/// <remarks>
/// <format type="text/markdown"><![CDATA[
/// To create this trainer, use [TextClassification](xref:Microsoft.ML.TorchSharpCatalog.TextClassification(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,Int32,System.String,System.String,System.String,System.String,Int32,Int32,Int32,Microsoft.ML.TorchSharp.NasBert.BertArchitecture,Microsoft.ML.IDataView)).
///
/// ### Input and Output Columns
/// The input label column data must be type<xref:System.Single> type and the sentence columns must be of type<xref:Microsoft.ML.Data.TextDataViewType>.
/// ### Input and output columns
/// The input label column data must be type <xref:System.Single> and the sentence columns must be of type <xref:Microsoft.ML.Data.TextDataViewType>.
///
/// This trainer outputs the following columns:
///
/// | Output Column Name | Column Type | Description|
/// | Output column name | Column type | Description|
/// | -- | -- | -- |
/// | `Score` | <xref:System.Single> | The degree of similarity between the 2 sentences. |
/// ### Trainer Characteristics
/// | | |
/// | `Score` | <xref:System.Single> | The degree of similarity between the two sentences. |
///
/// ### Trainer characteristics
/// | Characteristic | Value |
/// | -- | -- |
/// | Machine learning task | Rregression |
/// | Machine learning task | Regression |
/// | Is normalization required? | No |
/// | Is caching required? | No |
/// | Required NuGet in addition to Microsoft.ML | Microsoft.ML.TorchSharp and libtorch-cpu or libtorch-cuda-11.3 or any of the OS-specific variants. |
/// | Exportable to ONNX | No |
///
/// ### Training Algorithm Details
/// Trains a Deep Neural Network(DNN) by leveraging an existing pre-trained NAS-BERT roBERTa model for the purpose of determining sentence similarity.
/// ### Training algorithm details
/// Trains a Deep Neural Network (DNN) by leveraging an existing, pretrained NAS-BERT roBERTa model for the purpose of determining sentence similarity.
/// ]]>
/// </format>
/// </remarks>
Expand Down
21 changes: 11 additions & 10 deletions src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,32 +28,33 @@
namespace Microsoft.ML.TorchSharp.NasBert
{
/// <summary>
/// The <see cref="IEstimator{TTransformer}"/> for training a Deep Neural Network(DNN) to classify text.
/// The <see cref="IEstimator{TTransformer}"/> for training a Deep Neural Network (DNN) to classify text.
/// </summary>
/// <remarks>
/// <format type="text/markdown"><![CDATA[
/// To create this trainer, use [TextClassification](xref:Microsoft.ML.TorchSharpCatalog.TextClassification(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,Int32,System.String,System.String,System.String,System.String,Int32,Int32,Int32,Microsoft.ML.TorchSharp.NasBert.BertArchitecture,Microsoft.ML.IDataView)).
///
/// ### Input and Output Columns
/// The input label column data must be [key](xref:Microsoft.ML.Data.KeyDataViewType) type and the sentence columns must be of type<xref:Microsoft.ML.Data.TextDataViewType>.
/// ### Input and output columns
/// The input label column data must be [key](xref:Microsoft.ML.Data.KeyDataViewType) type and the sentence columns must be of type <xref:Microsoft.ML.Data.TextDataViewType>.
///
/// This trainer outputs the following columns:
///
/// | Output Column Name | Column Type | Description|
/// | Output column name | Column type | Description|
/// | -- | -- | -- |
/// | `PredictedLabel` | [key](xref:Microsoft.ML.Data.KeyDataViewType) type | The predicted label's index. If its value is i, the actual label would be the i-th category in the key-valued input label type. |
/// | `Score` | Vector of<xref:System.Single> | The scores of all classes.Higher value means higher probability to fall into the associated class. If the i-th element has the largest value, the predicted label index would be i.Note that i is zero-based index. |
/// ### Trainer Characteristics
/// | | |
/// | `PredictedLabel` | [key](xref:Microsoft.ML.Data.KeyDataViewType) type | The predicted label's index. If its value is `i`, the actual label would be the `i`-th category in the key-valued input label type. |
/// | `Score` | Vector of <xref:System.Single> | The scores of all classes. Higher value means higher probability to fall into the associated class. If the `i`-th element has the largest value, the predicted label index would be `i`. Note that `i` is a zero-based index. |
///
/// ### Trainer characteristics
/// | Characteristic | Value |
/// | -- | -- |
/// | Machine learning task | Multiclass classification |
/// | Is normalization required? | No |
/// | Is caching required? | No |
/// | Required NuGet in addition to Microsoft.ML | Microsoft.ML.TorchSharp and libtorch-cpu or libtorch-cuda-11.3 or any of the OS-specific variants. |
/// | Exportable to ONNX | No |
///
/// ### Training Algorithm Details
/// Trains a Deep Neural Network(DNN) by leveraging an existing pre-trained NAS-BERT roBERTa model for the purpose of classifying text.
/// ### Training algorithm details
/// Trains a Deep Neural Network (DNN) by leveraging an existing, pretrained NAS-BERT roBERTa model for the purpose of classifying text.
/// ]]>
/// </format>
/// </remarks>
Expand Down

0 comments on commit 1fc5296

Please sign in to comment.