diff --git a/src/Microsoft.ML.Data/DataLoadSave/FakeSchema.cs b/src/Microsoft.ML.Data/DataLoadSave/FakeSchema.cs
new file mode 100644
index 0000000000..d94219a453
--- /dev/null
+++ b/src/Microsoft.ML.Data/DataLoadSave/FakeSchema.cs
@@ -0,0 +1,110 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Core.Data;
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Internal.Utilities;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Microsoft.ML.Data.DataLoadSave
+{
+
+    /// <summary>
+    /// A fake schema that is manufactured out of a SchemaShape.
+    /// It will pretend that all vector sizes are equal to 10, all key value counts are equal to 10,
+    /// and all values are defaults (for metadata).
+    /// </summary>
+    internal sealed class FakeSchema : ISchema
+    {
+        private const int AllVectorSizes = 10;
+        private const int AllKeySizes = 10;
+
+        private readonly IHostEnvironment _env;
+        private readonly SchemaShape _shape;
+        private readonly Dictionary<string, int> _colMap;
+
+        public FakeSchema(IHostEnvironment env, SchemaShape inputShape)
+        {
+            _env = env;
+            _shape = inputShape;
+            _colMap = Enumerable.Range(0, _shape.Columns.Length)
+                .ToDictionary(idx => _shape.Columns[idx].Name, idx => idx);
+        }
+
+        public int ColumnCount => _shape.Columns.Length;
+
+        public string GetColumnName(int col)
+        {
+            _env.Check(0 <= col && col < ColumnCount);
+            return _shape.Columns[col].Name;
+        }
+
+        public ColumnType GetColumnType(int col)
+        {
+            _env.Check(0 <= col && col < ColumnCount);
+            var inputCol = _shape.Columns[col];
+            return MakeColumnType(inputCol);
+        }
+
+        public bool TryGetColumnIndex(string name, out int col) => _colMap.TryGetValue(name, out col);
+
+        private static ColumnType MakeColumnType(SchemaShape.Column inputCol)
+        {
+            ColumnType curType = inputCol.ItemType;
+            if (inputCol.IsKey)
+                curType = new KeyType(curType.AsPrimitive.RawKind, 0, AllKeySizes);
+            if (inputCol.Kind == SchemaShape.Column.VectorKind.VariableVector)
+                curType = new VectorType(curType.AsPrimitive, 0);
+            else if (inputCol.Kind == SchemaShape.Column.VectorKind.Vector)
+                curType = new VectorType(curType.AsPrimitive, AllVectorSizes);
+            return curType;
+        }
+
+        public void GetMetadata<TValue>(string kind, int col, ref TValue value)
+        {
+            _env.Check(0 <= col && col < ColumnCount);
+            var inputCol = _shape.Columns[col];
+            var metaShape = inputCol.Metadata;
+            if (metaShape == null || !metaShape.TryFindColumn(kind, out var metaColumn))
+                throw _env.ExceptGetMetadata();
+
+            var colType = MakeColumnType(metaColumn);
+            _env.Check(colType.RawType.Equals(typeof(TValue)));
+
+            if (colType.IsVector)
+            {
+                // This is an atypical use of VBuffer: we create it in GetMetadataVec, and then pass through
+                // via boxing to be returned out of this method. This is intentional.
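                // For clarity: the call below has to instantiate the generic helper GetMetadataVec<TValue>
                // with an item type that is only known at runtime (colType.ItemType.RawType). Utils.MarshalInvoke
                // plays roughly the role of the reflection sketch below (a hedged illustration only, not the
                // actual implementation of MarshalInvoke):
                //
                //     var def = typeof(FakeSchema).GetMethod(nameof(GetMetadataVec),
                //         BindingFlags.NonPublic | BindingFlags.Instance);
                //     object boxedVBuffer = def.MakeGenericMethod(colType.ItemType.RawType).Invoke(this, null);
                //
                // The boxed VBuffer is then cast back to TValue when it is assigned to 'value'.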
+ value = (TValue)Utils.MarshalInvoke(GetMetadataVec, colType.ItemType.RawType); + } + else + value = default; + } + + private object GetMetadataVec() => new VBuffer(AllVectorSizes, 0, null, null); + + public ColumnType GetMetadataTypeOrNull(string kind, int col) + { + _env.Check(0 <= col && col < ColumnCount); + var inputCol = _shape.Columns[col]; + var metaShape = inputCol.Metadata; + if (metaShape == null || !metaShape.TryFindColumn(kind, out var metaColumn)) + return null; + return MakeColumnType(metaColumn); + } + + public IEnumerable> GetMetadataTypes(int col) + { + _env.Check(0 <= col && col < ColumnCount); + var inputCol = _shape.Columns[col]; + var metaShape = inputCol.Metadata; + if (metaShape == null) + return Enumerable.Empty>(); + + return metaShape.Columns.Select(c => new KeyValuePair(c.Name, MakeColumnType(c))); + } + } +} diff --git a/src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs b/src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs new file mode 100644 index 0000000000..64130ed80e --- /dev/null +++ b/src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs @@ -0,0 +1,152 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Core.Data; +using Microsoft.ML.Data.DataLoadSave; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Data.IO; +using Microsoft.ML.Runtime.Model; +using System.Collections.Generic; + +[assembly: LoadableClass(typeof(TransformWrapper), null, typeof(SignatureLoadModel), + "Transform wrapper", TransformWrapper.LoaderSignature)] + +namespace Microsoft.ML.Runtime.Data +{ + // REVIEW: this class is public, as long as the Wrappers.cs in tests still rely on it. + // It needs to become internal. + public sealed class TransformWrapper : ITransformer, ICanSaveModel + { + public const string LoaderSignature = "TransformWrapper"; + private const string TransformDirTemplate = "Step_{0:000}"; + + private readonly IHost _host; + private readonly IDataView _xf; + + public TransformWrapper(IHostEnvironment env, IDataView xf) + { + Contracts.CheckValue(env, nameof(env)); + _host = env.Register(nameof(TransformWrapper)); + _host.CheckValue(xf, nameof(xf)); + _xf = xf; + } + + public ISchema GetOutputSchema(ISchema inputSchema) + { + _host.CheckValue(inputSchema, nameof(inputSchema)); + + var dv = new EmptyDataView(_host, inputSchema); + var output = ApplyTransformUtils.ApplyAllTransformsToData(_host, _xf, dv); + return output.Schema; + } + + public void Save(ModelSaveContext ctx) + { + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + var dataPipe = _xf; + var transforms = new List(); + while (dataPipe is IDataTransform xf) + { + // REVIEW: a malicious user could construct a loop in the Source chain, that would + // cause this method to iterate forever (and throw something when the list overflows). There's + // no way to insulate from ALL malicious behavior. 
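                // (Descriptive note: each IDataTransform peeled off this chain is written out below as its own
                // sub-model, so the persisted model ends up looking roughly like
                //     Loader/    - a BinaryLoader sub-model carrying only the input schema
                //     Step_000/  - the innermost transform
                //     Step_001/  - the next one, and so on,
                // which is the order the loading constructor replays them in. The sub-model names come from
                // "Loader" and TransformDirTemplate; the layout shown here is illustrative.)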
+ transforms.Add(xf); + dataPipe = xf.Source; + Contracts.AssertValue(dataPipe); + } + transforms.Reverse(); + + ctx.SaveSubModel("Loader", c => BinaryLoader.SaveInstance(_host, c, dataPipe.Schema)); + + ctx.Writer.Write(transforms.Count); + for (int i = 0; i < transforms.Count; i++) + { + var dirName = string.Format(TransformDirTemplate, i); + ctx.SaveModel(transforms[i], dirName); + } + } + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "XF WRPR", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + // Factory for SignatureLoadModel. + public TransformWrapper(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + _host = env.Register(nameof(TransformWrapper)); + _host.CheckValue(ctx, nameof(ctx)); + + ctx.CheckAtModel(GetVersionInfo()); + int n = ctx.Reader.ReadInt32(); + _host.CheckDecode(n >= 0); + + ctx.LoadModel(env, out var loader, "Loader", new MultiFileSource(null)); + + IDataView data = loader; + for (int i = 0; i < n; i++) + { + var dirName = string.Format(TransformDirTemplate, i); + ctx.LoadModel(env, out var xf, dirName, data); + data = xf; + } + + _xf = data; + } + + public IDataView Transform(IDataView input) => ApplyTransformUtils.ApplyAllTransformsToData(_host, _xf, input); + } + + /// + /// Estimator for trained wrapped transformers. + /// + internal abstract class TrainedWrapperEstimatorBase : IEstimator + { + private readonly IHost _host; + + protected TrainedWrapperEstimatorBase(IHost host) + { + Contracts.CheckValue(host, nameof(host)); + _host = host; + } + + public abstract TransformWrapper Fit(IDataView input); + + public SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + _host.CheckValue(inputSchema, nameof(inputSchema)); + + var fakeSchema = new FakeSchema(_host, inputSchema); + var transformer = Fit(new EmptyDataView(_host, fakeSchema)); + return SchemaShape.Create(transformer.GetOutputSchema(fakeSchema)); + } + } + + /// + /// Estimator for untrained wrapped transformers. + /// + public abstract class TrivialWrapperEstimator : TrivialEstimator + { + protected TrivialWrapperEstimator(IHost host, TransformWrapper transformer) + : base(host, transformer) + { + } + + public override SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + Host.CheckValue(inputSchema, nameof(inputSchema)); + var fakeSchema = new FakeSchema(Host, inputSchema); + return SchemaShape.Create(Transformer.GetOutputSchema(fakeSchema)); + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs b/src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs new file mode 100644 index 0000000000..4e0821241b --- /dev/null +++ b/src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs @@ -0,0 +1,116 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Core.Data; +using Microsoft.ML.Data.StaticPipe.Runtime; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.Transforms.Text +{ + /// + /// Extensions for statically typed word tokenizer. 
+ /// + public static class WordTokenizerExtensions + { + private sealed class OutPipelineColumn : VarVector + { + public readonly Scalar Input; + + public OutPipelineColumn(Scalar input, string separators) + : base(new Reconciler(separators), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly string _separators; + + public Reconciler(string separators) + { + _separators = separators; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var pairs = new List<(string input, string output)>(); + foreach (var outCol in toOutput) + pairs.Add((inputNames[((OutPipelineColumn)outCol).Input], outputNames[outCol])); + + return new WordTokenizer(env, pairs.ToArray(), _separators); + } + } + + /// + /// Tokenize incoming text using and output the tokens. + /// + /// The column to apply to. + /// The separators to use (comma separated). + public static VarVector TokenizeText(this Scalar input, string separators = "space") => new OutPipelineColumn(input, separators); + } + + /// + /// Extensions for statically typed character tokenizer. + /// + public static class CharacterTokenizerExtensions + { + private sealed class OutPipelineColumn : VarVector> + { + public readonly Scalar Input; + + public OutPipelineColumn(Scalar input, bool useMarkerChars) + : base(new Reconciler(useMarkerChars), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler, IEquatable + { + private readonly bool _useMarker; + + public Reconciler(bool useMarkerChars) + { + _useMarker = useMarkerChars; + } + + public bool Equals(Reconciler other) + { + return _useMarker == other._useMarker; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var pairs = new List<(string input, string output)>(); + foreach (var outCol in toOutput) + pairs.Add((inputNames[((OutPipelineColumn)outCol).Input], outputNames[outCol])); + + return new CharacterTokenizer(env, pairs.ToArray(), _useMarker); + } + } + + /// + /// Tokenize incoming text into a sequence of characters. + /// + /// The column to apply to. + /// Whether to use marker characters to separate words. + public static VarVector> TokenizeIntoCharacters(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); + } +} diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs new file mode 100644 index 0000000000..8d4330212d --- /dev/null +++ b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs @@ -0,0 +1,122 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.TextAnalytics; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Microsoft.ML.Transforms +{ + /// + /// Word tokenizer splits text into tokens using the delimiter. 
+ /// For each text input, the output column is a variable vector of text. + /// + public sealed class WordTokenizer : TrivialWrapperEstimator + { + /// + /// Tokenize incoming text in and output the tokens as . + /// + /// The environment. + /// The column containing text to tokenize. + /// The column containing output tokens. Null means is replaced. + /// The separators to use (comma separated). + public WordTokenizer(IHostEnvironment env, string inputColumn, string outputColumn = null, string separators = "space") + : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, separators) + { + } + + /// + /// Tokenize incoming text in input columns and output the tokens as output columns. + /// + /// The environment. + /// Pairs of columns to run the tokenization on. + /// The separators to use (comma separated). + public WordTokenizer(IHostEnvironment env, (string input, string output)[] columns, string separators = "space") + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(WordTokenizer)), MakeTransformer(env, columns, separators)) + { + } + + private static TransformWrapper MakeTransformer(IHostEnvironment env, (string input, string output)[] columns, string separators) + { + Contracts.AssertValue(env); + env.CheckNonEmpty(columns, nameof(columns)); + foreach (var (input, output) in columns) + { + env.CheckValue(input, nameof(input)); + env.CheckValue(output, nameof(input)); + } + + // Create arguments. + // REVIEW: enable multiple separators via something other than parsing strings. + var args = new DelimitedTokenizeTransform.Arguments + { + Column = columns.Select(x => new DelimitedTokenizeTransform.Column { Source = x.input, Name = x.output }).ToArray(), + TermSeparators = separators + }; + + // Create a valid instance of data. + var schema = new SimpleSchema(env, columns.Select(x => new KeyValuePair(x.input, TextType.Instance)).ToArray()); + var emptyData = new EmptyDataView(env, schema); + + return new TransformWrapper(env, new DelimitedTokenizeTransform(env, args, emptyData)); + } + } + + /// + /// Character tokenizer splits text into sequences of characters using a sliding window. + /// + public sealed class CharacterTokenizer : TrivialWrapperEstimator + { + /// + /// Tokenize incoming text in and output the tokens as . + /// + /// The environment. + /// The column containing text to tokenize. + /// The column containing output tokens. Null means is replaced. + /// Whether to use marker characters to separate words. + public CharacterTokenizer(IHostEnvironment env, string inputColumn, string outputColumn = null, bool useMarkerCharacters = true) + : this (env, new[] { (inputColumn, outputColumn ?? inputColumn) }, useMarkerCharacters) + { + } + + /// + /// Tokenize incoming text in input columns and output the tokens as output columns. + /// + /// The environment. + /// Pairs of columns to run the tokenization on. + /// Whether to use marker characters to separate words. 
+ public CharacterTokenizer(IHostEnvironment env, (string input, string output)[] columns, bool useMarkerCharacters = true) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(CharacterTokenizer)), MakeTransformer(env, columns, useMarkerCharacters)) + { + } + + private static TransformWrapper MakeTransformer(IHostEnvironment env, (string input, string output)[] columns, bool useMarkerChars) + { + Contracts.AssertValue(env); + env.CheckNonEmpty(columns, nameof(columns)); + foreach (var (input, output) in columns) + { + env.CheckValue(input, nameof(input)); + env.CheckValue(output, nameof(input)); + } + + // Create arguments. + var args = new CharTokenizeTransform.Arguments + { + Column = columns.Select(x => new CharTokenizeTransform.Column { Source = x.input, Name = x.output }).ToArray(), + UseMarkerChars = useMarkerChars + }; + + // Create a valid instance of data. + var schema = new SimpleSchema(env, columns.Select(x => new KeyValuePair(x.input, TextType.Instance)).ToArray()); + var emptyData = new EmptyDataView(env, schema); + + return new TransformWrapper(env, new CharTokenizeTransform(env, args, emptyData)); + } + } +} diff --git a/test/BaselineOutput/SingleDebug/Text/tokenized.tsv b/test/BaselineOutput/SingleDebug/Text/tokenized.tsv new file mode 100644 index 0000000000..eccf8ca0e6 --- /dev/null +++ b/test/BaselineOutput/SingleDebug/Text/tokenized.tsv @@ -0,0 +1,12 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=text:TX:0 +#@ col=words:TX:1-** +#@ col={name=chars type=TX src={ min=-1 var=+}} +#@ } +text +==RUDE== Dude, you are rude upload that carl picture back, or else. ==RUDE== Dude, you are rude upload that carl picture back, or else. <␂> = = R U D E = = <␠> D u d e , <␠> y o u <␠> a r e <␠> r u d e <␠> u p l o a d <␠> t h a t <␠> c a r l <␠> p i c t u r e <␠> b a c k , <␠> o r <␠> e l s e . <␃> +== OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!! == OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!! <␂> = = <␠> O K ! <␠> = = <␠> <␠> I M <␠> G O I N G <␠> T O <␠> V A N D A L I Z E <␠> W I L D <␠> O N E S <␠> W I K I <␠> T H E N ! ! ! <␠> <␠> <␠> <␃> +Stop trolling, zapatancas, calling me a liar merely demonstartes that you arer Zapatancas. You may choose to chase every legitimate editor from this site and ignore me but I am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. The consensus is overwhelmingly against you and your trollin g lover Zapatancas, Stop trolling, zapatancas, calling me a liar merely demonstartes that you arer Zapatancas. You may choose to chase every legitimate editor from this site and ignore me but I am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. The consensus is overwhelmingly against you and your trollin g lover Zapatancas, <␂> S t o p <␠> t r o l l i n g , <␠> z a p a t a n c a s , <␠> c a l l i n g <␠> m e <␠> a <␠> l i a r <␠> m e r e l y <␠> d e m o n s t a r t e s <␠> t h a t <␠> y o u <␠> a r e r <␠> Z a p a t a n c a s . 
<␠> Y o u <␠> m a y <␠> c h o o s e <␠> t o <␠> c h a s e <␠> e v e r y <␠> l e g i t i m a t e <␠> e d i t o r <␠> f r o m <␠> t h i s <␠> s i t e <␠> a n d <␠> i g n o r e <␠> m e <␠> b u t <␠> I <␠> a m <␠> a n <␠> e d i t o r <␠> w i t h <␠> a <␠> r e c o r d <␠> t h a t <␠> i s n t <␠> 9 9 % <␠> t r o l l i n g <␠> a n d <␠> t h e r e f o r e <␠> m y <␠> w i s h e s <␠> a r e <␠> n o t <␠> t o <␠> b e <␠> c o m p l e t e l y <␠> i g n o r e d <␠> b y <␠> a <␠> s o c k p u p p e t <␠> l i k e <␠> y o u r s e l f . <␠> T h e <␠> c o n s e n s u s <␠> i s <␠> o v e r w h e l m i n g l y <␠> a g a i n s t <␠> y o u <␠> a n d <␠> y o u r <␠> t r o l l i n <␠> g <␠> l o v e r <␠> Z a p a t a n c a s , <␠> <␠> <␃> +==You're cool== You seem like a really cool guy... *bursts out laughing at sarcasm*. ==You're cool== You seem like a really cool guy... *bursts out laughing at sarcasm*. <␂> = = Y o u ' r e <␠> c o o l = = <␠> <␠> Y o u <␠> s e e m <␠> l i k e <␠> a <␠> r e a l l y <␠> c o o l <␠> g u y . . . <␠> * b u r s t s <␠> o u t <␠> l a u g h i n g <␠> a t <␠> s a r c a s m * . <␃> diff --git a/test/BaselineOutput/SingleRelease/Text/tokenized.tsv b/test/BaselineOutput/SingleRelease/Text/tokenized.tsv new file mode 100644 index 0000000000..eccf8ca0e6 --- /dev/null +++ b/test/BaselineOutput/SingleRelease/Text/tokenized.tsv @@ -0,0 +1,12 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=text:TX:0 +#@ col=words:TX:1-** +#@ col={name=chars type=TX src={ min=-1 var=+}} +#@ } +text +==RUDE== Dude, you are rude upload that carl picture back, or else. ==RUDE== Dude, you are rude upload that carl picture back, or else. <␂> = = R U D E = = <␠> D u d e , <␠> y o u <␠> a r e <␠> r u d e <␠> u p l o a d <␠> t h a t <␠> c a r l <␠> p i c t u r e <␠> b a c k , <␠> o r <␠> e l s e . <␃> +== OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!! == OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!! <␂> = = <␠> O K ! <␠> = = <␠> <␠> I M <␠> G O I N G <␠> T O <␠> V A N D A L I Z E <␠> W I L D <␠> O N E S <␠> W I K I <␠> T H E N ! ! ! <␠> <␠> <␠> <␃> +Stop trolling, zapatancas, calling me a liar merely demonstartes that you arer Zapatancas. You may choose to chase every legitimate editor from this site and ignore me but I am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. The consensus is overwhelmingly against you and your trollin g lover Zapatancas, Stop trolling, zapatancas, calling me a liar merely demonstartes that you arer Zapatancas. You may choose to chase every legitimate editor from this site and ignore me but I am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. The consensus is overwhelmingly against you and your trollin g lover Zapatancas, <␂> S t o p <␠> t r o l l i n g , <␠> z a p a t a n c a s , <␠> c a l l i n g <␠> m e <␠> a <␠> l i a r <␠> m e r e l y <␠> d e m o n s t a r t e s <␠> t h a t <␠> y o u <␠> a r e r <␠> Z a p a t a n c a s . 
<␠> Y o u <␠> m a y <␠> c h o o s e <␠> t o <␠> c h a s e <␠> e v e r y <␠> l e g i t i m a t e <␠> e d i t o r <␠> f r o m <␠> t h i s <␠> s i t e <␠> a n d <␠> i g n o r e <␠> m e <␠> b u t <␠> I <␠> a m <␠> a n <␠> e d i t o r <␠> w i t h <␠> a <␠> r e c o r d <␠> t h a t <␠> i s n t <␠> 9 9 % <␠> t r o l l i n g <␠> a n d <␠> t h e r e f o r e <␠> m y <␠> w i s h e s <␠> a r e <␠> n o t <␠> t o <␠> b e <␠> c o m p l e t e l y <␠> i g n o r e d <␠> b y <␠> a <␠> s o c k p u p p e t <␠> l i k e <␠> y o u r s e l f . <␠> T h e <␠> c o n s e n s u s <␠> i s <␠> o v e r w h e l m i n g l y <␠> a g a i n s t <␠> y o u <␠> a n d <␠> y o u r <␠> t r o l l i n <␠> g <␠> l o v e r <␠> Z a p a t a n c a s , <␠> <␠> <␃> +==You're cool== You seem like a really cool guy... *bursts out laughing at sarcasm*. ==You're cool== You seem like a really cool guy... *bursts out laughing at sarcasm*. <␂> = = Y o u ' r e <␠> c o o l = = <␠> <␠> Y o u <␠> s e e m <␠> l i k e <␠> a <␠> r e a l l y <␠> c o o l <␠> g u y . . . <␠> * b u r s t s <␠> o u t <␠> l a u g h i n g <␠> a t <␠> s a r c a s m * . <␃> diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 0095d2f4cd..4b8194afe1 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -3,11 +3,11 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data.StaticPipe; -using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.TestFramework; +using Microsoft.ML.Transforms.Text; using System; using System.Collections.Generic; using System.Collections.Immutable; @@ -403,5 +403,35 @@ public void ConcatWith() Assert.Equal(NumberType.Float, types[2].ItemType); Assert.Equal(NumberType.Float, types[3].ItemType); } + + [Fact] + public void Tokenize() + { + var env = new ConsoleEnvironment(seed: 0); + var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); + var reader = TextLoader.CreateReader(env, ctx => ( + label: ctx.LoadBool(0), + text: ctx.LoadText(1)), hasHeader: true); + var dataSource = new MultiFileSource(dataPath); + var data = reader.Read(dataSource); + + var est = data.MakeNewEstimator() + .Append(r => ( + r.label, + tokens: r.text.TokenizeText(), + chars: r.text.TokenizeIntoCharacters())); + + var tdata = est.Fit(data).Transform(data); + var schema = tdata.AsDynamic.Schema; + + Assert.True(schema.TryGetColumnIndex("tokens", out int tokensCol)); + var type = schema.GetColumnType(tokensCol); + Assert.True(type.IsVector && !type.IsKnownSizeVector && type.ItemType.IsText); + + Assert.True(schema.TryGetColumnIndex("chars", out int charsCol)); + type = schema.GetColumnType(charsCol); + Assert.True(type.IsVector && !type.IsKnownSizeVector && type.ItemType.IsKey); + Assert.True(type.ItemType.AsKey.RawKind == DataKind.U2); + } } } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Wrappers.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Wrappers.cs index 86799ce445..c5a6a40703 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Wrappers.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Wrappers.cs @@ -6,29 +6,16 @@ using Microsoft.ML.Legacy.Models; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Api; -using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; -using 
Microsoft.ML.Runtime.Internal.Internallearn; -using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.Model; -using Microsoft.ML.Runtime.Training; -using Microsoft.ML.Tests.Scenarios.Api; using System; using System.Collections.Generic; using System.IO; using System.Linq; -[assembly: LoadableClass(typeof(TransformWrapper), null, typeof(SignatureLoadModel), - "Transform wrapper", TransformWrapper.LoaderSignature)] -[assembly: LoadableClass(typeof(LoaderWrapper), null, typeof(SignatureLoadModel), - "Loader wrapper", LoaderWrapper.LoaderSignature)] - namespace Microsoft.ML.Tests.Scenarios.Api { - using TScalarPredictor = IPredictorProducing; - using TWeightsPredictor = IPredictorWithFeatureWeights; - public sealed class LoaderWrapper : IDataReader, ICanSaveModel { public const string LoaderSignature = "LoaderWrapper"; @@ -93,212 +80,6 @@ public LoaderWrapper(IHostEnvironment env, ModelLoadContext ctx) } } - public class TransformWrapper : ITransformer, ICanSaveModel - { - public const string LoaderSignature = "TransformWrapper"; - private const string TransformDirTemplate = "Step_{0:000}"; - - protected readonly IHostEnvironment _env; - protected readonly IDataView _xf; - - public TransformWrapper(IHostEnvironment env, IDataView xf) - { - _env = env; - _xf = xf; - } - - public ISchema GetOutputSchema(ISchema inputSchema) - { - var dv = new EmptyDataView(_env, inputSchema); - var output = ApplyTransformUtils.ApplyAllTransformsToData(_env, _xf, dv); - return output.Schema; - } - - public void Save(ModelSaveContext ctx) - { - ctx.CheckAtModel(); - ctx.SetVersionInfo(GetVersionInfo()); - - var dataPipe = _xf; - var transforms = new List(); - while (dataPipe is IDataTransform xf) - { - // REVIEW: a malicious user could construct a loop in the Source chain, that would - // cause this method to iterate forever (and throw something when the list overflows). There's - // no way to insulate from ALL malicious behavior. 
- transforms.Add(xf); - dataPipe = xf.Source; - Contracts.AssertValue(dataPipe); - } - transforms.Reverse(); - - ctx.SaveSubModel("Loader", c => BinaryLoader.SaveInstance(_env, c, dataPipe.Schema)); - - ctx.Writer.Write(transforms.Count); - for (int i = 0; i < transforms.Count; i++) - { - var dirName = string.Format(TransformDirTemplate, i); - ctx.SaveModel(transforms[i], dirName); - } - } - - private static VersionInfo GetVersionInfo() - { - return new VersionInfo( - modelSignature: "XF WRPR", - verWrittenCur: 0x00010001, // Initial - verReadableCur: 0x00010001, - verWeCanReadBack: 0x00010001, - loaderSignature: LoaderSignature); - } - - public TransformWrapper(IHostEnvironment env, ModelLoadContext ctx) - { - ctx.CheckAtModel(GetVersionInfo()); - int n = ctx.Reader.ReadInt32(); - - ctx.LoadModel(env, out var loader, "Loader", new MultiFileSource(null)); - - IDataView data = loader; - for (int i = 0; i < n; i++) - { - var dirName = string.Format(TransformDirTemplate, i); - ctx.LoadModel(env, out var xf, dirName, data); - data = xf; - } - - _env = env; - _xf = data; - } - - public IDataView Transform(IDataView input) => ApplyTransformUtils.ApplyAllTransformsToData(_env, _xf, input); - } - - public class ScorerWrapper : TransformWrapper, IPredictionTransformer - where TModel : IPredictor - { - protected readonly string _featureColumn; - - public ScorerWrapper(IHostEnvironment env, IDataView scorer, TModel trainedModel, string featureColumn) - : base(env, scorer) - { - _featureColumn = featureColumn; - Model = trainedModel; - } - - public TModel Model { get; } - - public string FeatureColumn => _featureColumn; - - public ColumnType FeatureColumnType => throw _env.ExceptNotSupp(); - } - - public class BinaryScorerWrapper : ScorerWrapper - where TModel : IPredictor - { - public BinaryScorerWrapper(IHostEnvironment env, TModel model, ISchema inputSchema, string featureColumn, BinaryClassifierScorer.Arguments args) - : base(env, MakeScorer(env, inputSchema, featureColumn, model, args), model, featureColumn) - { - } - - private static IDataView MakeScorer(IHostEnvironment env, ISchema schema, string featureColumn, TModel model, BinaryClassifierScorer.Arguments args) - { - var settings = $"Binary{{{CmdParser.GetSettings(env, args, new BinaryClassifierScorer.Arguments())}}}"; - - var scorerFactorySettings = CmdParser.CreateComponentFactory( - typeof(IComponentFactory), - typeof(SignatureDataScorer), - settings); - - var bindable = ScoreUtils.GetSchemaBindableMapper(env, model, scorerFactorySettings: scorerFactorySettings); - var edv = new EmptyDataView(env, schema); - var data = new RoleMappedData(edv, "Label", featureColumn, opt: true); - - return new BinaryClassifierScorer(env, args, data.Data, bindable.Bind(env, data.Schema), data.Schema); - } - - public BinaryScorerWrapper Clone(BinaryClassifierScorer.Arguments scorerArgs) - { - var scorer = _xf as IDataScorerTransform; - return new BinaryScorerWrapper(_env, Model, scorer.Source.Schema, _featureColumn, scorerArgs); - } - } - - public abstract class TrainerBase : ITrainerEstimator - where TTransformer : ScorerWrapper - where TModel : IPredictor - { - protected readonly IHostEnvironment _env; - protected readonly string _featureCol; - protected readonly string _labelCol; - - public abstract PredictionKind PredictionKind { get; } - - public TrainerInfo Info { get; } - - protected TrainerBase(IHostEnvironment env, TrainerInfo trainerInfo, string featureColumn, string labelColumn) - { - _env = env; - _featureCol = featureColumn; - _labelCol = 
labelColumn; - Info = trainerInfo; - } - - public TTransformer Fit(IDataView input) - { - return TrainTransformer(input); - } - - protected TTransformer TrainTransformer(IDataView trainSet, - IDataView validationSet = null, IPredictor initPredictor = null) - { - var cachedTrain = Info.WantCaching ? new CacheDataView(_env, trainSet, prefetch: null) : trainSet; - - var trainRoles = new RoleMappedData(cachedTrain, label: _labelCol, feature: _featureCol); - var emptyData = new EmptyDataView(_env, trainSet.Schema); - IDataView normalizer = emptyData; - - if (Info.NeedNormalization && trainRoles.Schema.FeaturesAreNormalized() == false) - { - var view = NormalizeTransform.CreateMinMaxNormalizer(_env, trainRoles.Data, name: trainRoles.Schema.Feature.Name); - normalizer = ApplyTransformUtils.ApplyAllTransformsToData(_env, view, emptyData, cachedTrain); - - trainRoles = new RoleMappedData(view, trainRoles.Schema.GetColumnRoleNames()); - } - - RoleMappedData validRoles; - - if (validationSet == null) - validRoles = null; - else - { - var cachedValid = Info.WantCaching ? new CacheDataView(_env, validationSet, prefetch: null) : validationSet; - cachedValid = ApplyTransformUtils.ApplyAllTransformsToData(_env, normalizer, cachedValid); - validRoles = new RoleMappedData(cachedValid, label: _labelCol, feature: _featureCol); - } - - var pred = TrainCore(new TrainContext(trainRoles, validRoles, initPredictor)); - - var scoreRoles = new RoleMappedData(normalizer, label: _labelCol, feature: _featureCol); - return MakeScorer(pred, scoreRoles); - } - - public SchemaShape GetOutputSchema(SchemaShape inputSchema) - { - throw new NotImplementedException(); - } - - protected abstract TModel TrainCore(TrainContext trainContext); - - protected abstract TTransformer MakeScorer(TModel predictor, RoleMappedData data); - - protected ScorerWrapper MakeScorerBasic(TModel predictor, RoleMappedData data) - { - var scorer = ScoreUtils.GetScorer(predictor, data, _env, data.Schema); - return (TTransformer)(new ScorerWrapper(_env, scorer, predictor, data.Schema.Feature.Name)); - } - } - public sealed class MyBinaryClassifierEvaluator { private readonly IHostEnvironment _env; diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 74e0fe39ec..9946996bb3 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -6,6 +6,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.RunTests; +using Microsoft.ML.Transforms; using System.IO; using Xunit; using Xunit.Abstractions; @@ -53,5 +54,39 @@ public void TextFeaturizerWorkout() CheckEquality("Text", "featurized.tsv"); Done(); } + + [Fact] + public void TextTokenizationWorkout() + { + string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); + var data = TextLoader.CreateReader(Env, ctx => ( + label: ctx.LoadBool(0), + text: ctx.LoadText(1)), hasHeader: true) + .Read(new MultiFileSource(sentimentDataPath)); + + var invalidData = TextLoader.CreateReader(Env, ctx => ( + label: ctx.LoadBool(0), + text: ctx.LoadFloat(1)), hasHeader: true) + .Read(new MultiFileSource(sentimentDataPath)); + + var est = new WordTokenizer(Env, "text", "words") + .Append(new CharacterTokenizer(Env, "text", "chars")) + .Append(new KeyToValueEstimator(Env, "chars")); + TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + + var outputPath = 
GetOutputPath("Text", "tokenized.tsv"); + using (var ch = Env.Start("save")) + { + var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true }); + IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + savedData = new ChooseColumnsTransform(Env, savedData, "text", "words", "chars"); + + using (var fs = File.Create(outputPath)) + DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true); + } + + CheckEquality("Text", "tokenized.tsv"); + Done(); + } } }
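
For context, here is a minimal usage sketch of the surface area this change adds, assembled from the Tokenize and TextTokenizationWorkout tests above. The file name "sentiment.tsv" and the column indices are illustrative placeholders, not part of the change; the namespaces involved are Microsoft.ML.Runtime.Data, Microsoft.ML.Transforms, Microsoft.ML.Transforms.Text and Microsoft.ML.Data.StaticPipe.

    var env = new ConsoleEnvironment(seed: 0);

    // Read (label, text) pairs; the path and column indices are placeholders.
    var data = TextLoader.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource("sentiment.tsv"));

    // Dynamic API: the new wrapped estimators, chained as in TextTokenizationWorkout.
    var est = new WordTokenizer(env, "text", "words")
        .Append(new CharacterTokenizer(env, "text", "chars"));
    var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);

    // Static API: the same tokenizers through the new extension methods.
    var staticEst = data.MakeNewEstimator()
        .Append(r => (
            r.label,
            tokens: r.text.TokenizeText(),
            chars: r.text.TokenizeIntoCharacters()));
    var staticData = staticEst.Fit(data).Transform(data);

In both cases the word output is a variable-length vector of text and the character output is a variable-length vector of keys, as asserted in the Tokenize test.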