Skip to content

Commit 2b5bd21

Browse files
Fixes #4571. About memory leak when using FeaturizeText. (#4576)
* Fixes issue 4571, a memory leak when using Featurize Text, by always creating a new ReadOnlyMemory<char> before adding a NormStr to a NormStr.Pool. * Added a benchmark
1 parent 290e069 commit 2b5bd21

File tree

3 files changed

+184
-3
lines changed

3 files changed

+184
-3
lines changed

src/Microsoft.ML.Core/Utilities/NormStr.cs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ public NormStr Get(string str, bool add = false)
116116
return add ? AddCore(str.AsMemory(), hash) : null;
117117
}
118118

119-
public NormStr Get(ReadOnlyMemory<char> str, bool add = false)
119+
public NormStr Get(ReadOnlyMemory<char> str, bool add = false, bool duplicateStr = true)
120120
{
121121
AssertValid();
122122

@@ -136,6 +136,15 @@ public NormStr Get(ReadOnlyMemory<char> str, bool add = false)
136136
}
137137
Contracts.Assert(ins == -1);
138138

139+
if (duplicateStr)
140+
{
141+
// To avoid the case where 'str' actually stores a string with the
142+
// content of a whole row in the dataset, a new 'str' is created
143+
// See issue https://github.com/dotnet/machinelearning/issues/4571
144+
// and PR https://github.com/dotnet/machinelearning/pull/4576
145+
return add ? AddCore(str.ToString().AsMemory(), hash) : null;
146+
}
147+
139148
return add ? AddCore(str, hash) : null;
140149
}
141150

@@ -147,9 +156,9 @@ public NormStr Add(string str)
147156
return Get(str, true);
148157
}
149158

150-
public NormStr Add(ReadOnlyMemory<char> str)
159+
public NormStr Add(ReadOnlyMemory<char> str, bool duplicateStr = true)
151160
{
152-
return Get(str, true);
161+
return Get(str, true, duplicateStr);
153162
}
154163

155164
/// <summary>
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System;
6+
using System.IO;
7+
using System.Collections.Generic;
8+
using System.Linq;
9+
using Microsoft.ML.Data;
10+
using BenchmarkDotNet.Attributes;
11+
using Microsoft.ML.Transforms.Text;
12+
using Xunit;
13+
14+
namespace Microsoft.ML.Benchmarks
15+
{
16+
[Config(typeof(TrainConfig))]
/// <summary>
/// Benchmark that fits a chain of <see cref="TextFeaturizingEstimator"/>s over a randomly
/// generated, comma-separated text dataset. It was added together with the fix for
/// https://github.com/dotnet/machinelearning/issues/4571 (memory leak when using FeaturizeText).
/// </summary>
public class FeaturizeTextBench
{
    // Shape of the generated dataset: NumRows lines, each with NumColumns text fields.
    private const int NumColumns = 1000;
    private const int NumRows = 300;

    // Exclusive upper bound handed to Random.Next(1, MaxWordLength), so generated words
    // are between 1 and MaxWordLength - 1 characters long.
    private const int MaxWordLength = 15;

    // Only this many leading columns are featurized by the benchmark.
    private const int NumTextColumns = 20;

    private MLContext _mlContext;
    private IDataView _dataset;

    // Path of the temporary dataset file, remembered so that it can be deleted in cleanup.
    private string _datasetPath;

    /// <summary>
    /// Writes the random dataset to a temporary file and loads it through a text loader
    /// that declares <c>NumColumns</c> string columns named <c>Column0</c>, <c>Column1</c>, ...
    /// </summary>
    [GlobalSetup]
    public void SetupData()
    {
        _mlContext = new MLContext(seed: 1);

        // Path.GetTempFileName() creates a file on disk every time it is called,
        // so call it exactly once and keep the result.
        _datasetPath = CreateRandomFile(Path.GetTempFileName());
        Console.WriteLine($"Created dataset in temporary file:\n{_datasetPath}\n");

        var columns = new TextLoader.Column[NumColumns];
        for (int i = 0; i < NumColumns; i++)
        {
            columns[i] = new TextLoader.Column($"Column{i}", DataKind.String, i);
        }

        var textLoader = _mlContext.Data.CreateTextLoader(new TextLoader.Options()
        {
            Columns = columns,
            HasHeader = false,
            Separators = new char[] { ',' }
        });

        _dataset = textLoader.Load(_datasetPath);
    }

    /// <summary>
    /// Deletes the temporary dataset file created by <see cref="SetupData"/>.
    /// Deletion is best effort: a failure is reported on the console and does not fail the run.
    /// </summary>
    [GlobalCleanup]
    public void CleanupData()
    {
        if (_datasetPath == null)
        {
            return;
        }

        try
        {
            File.Delete(_datasetPath);
        }
        catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
        {
            // The file may still be in use or protected; leaving it behind is harmless.
            Console.WriteLine($"Could not delete temporary dataset file:\n{_datasetPath}\n{ex.Message}\n");
        }

        _datasetPath = null;
    }

    /// <summary>
    /// Builds one word-bag text featurizer (bigrams, no char n-grams) for each of the first
    /// <c>NumTextColumns</c> columns, chains them into a single pipeline and fits it on the dataset.
    /// </summary>
    /// <returns>The fitted transformer chain.</returns>
    [Benchmark]
    public ITransformer TrainFeaturizeText()
    {
        IEstimator<ITransformer> pipeline = null;
        for (int i = 0; i < NumTextColumns; i++)
        {
            var featurizer = _mlContext.Transforms.Text.FeaturizeText($"Column{i}", new TextFeaturizingEstimator.Options()
            {
                CharFeatureExtractor = null,
                WordFeatureExtractor = new WordBagEstimator.Options()
                {
                    NgramLength = 2,
                    MaximumNgramsCount = new int[] { 200000 }
                }
            });

            // The first featurizer starts the pipeline; every later one is appended to it.
            if (pipeline == null)
            {
                pipeline = featurizer;
            }
            else
            {
                pipeline = pipeline.Append(featurizer);
            }
        }

        var model = pipeline.Fit(_dataset);

        // BENCHMARK OUTPUT
        // * Summary *

        //BenchmarkDotNet = v0.11.3, OS = Windows 10.0.18363
        //Intel Xeon W - 2133 CPU 3.60GHz, 1 CPU, 12 logical and 6 physical cores
        //.NET Core SDK = 3.0.100
        //[Host] : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), 64bit RyuJIT
        //Job - KDKCUJ : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), 64bit RyuJIT

        //Arguments =/ p:Configuration = Release Toolchain = netcoreapp2.1 IterationCount = 1
        //LaunchCount = 3 MaxIterationCount = 20 RunStrategy = ColdStart
        //UnrollFactor = 1 WarmupCount = 1

        // Method | Mean | Error | StdDev | Extra Metric | Gen 0 / 1k Op | Gen 1 / 1k Op | Gen 2 / 1k Op | Allocated Memory / Op |
        //------------------- | --------:| --------:| ---------:| -------------:| -------------:| ------------: | ------------: | --------------------: |
        // TrainFeaturizeText | 17.00 s | 6.337 s | 0.3474 s | - | 1949000.0000 | 721000.0000 | 36000.0000 | 315.48 MB |

        //// * Legends *
        // Mean : Arithmetic mean of all measurements
        // Error : Half of 99.9 % confidence interval
        // StdDev : Standard deviation of all measurements
        // Extra Metric: Value of the provided extra metric
        // Gen 0 / 1k Op : GC Generation 0 collects per 1k Operations
        // Gen 1 / 1k Op : GC Generation 1 collects per 1k Operations
        // Gen 2 / 1k Op : GC Generation 2 collects per 1k Operations
        // Allocated Memory/ Op : Allocated memory per single operation(managed only, inclusive, 1KB = 1024B)
        // 1 s: 1 Second(1 sec)

        //// * Diagnostic Output - MemoryDiagnoser *
        //// ***** BenchmarkRunner: End *****
        // Run time: 00:01:52(112.92 sec), executed benchmarks: 1

        //// * Artifacts cleanup *
        // Global total time: 00:01:59(119.89 sec), executed benchmarks: 1

        return model;
    }

    /// <summary>
    /// Fills the file at <paramref name="path"/> with <c>NumRows</c> lines of random text
    /// to be used as the benchmark dataset. The generator is seeded with a fixed value.
    /// </summary>
    /// <param name="path">File to write; it is created or overwritten.</param>
    /// <returns>The same <paramref name="path"/> that was passed in.</returns>
    public static string CreateRandomFile(string path)
    {
        var random = new Random(1);

        using (var file = new StreamWriter(path))
        {
            for (int i = 0; i < NumRows; i++)
            {
                file.WriteLine(CreateRandomLine(NumColumns, random));
            }
        }

        return path;
    }

    /// <summary>
    /// Creates one dataset line made of <paramref name="columns"/> random fields, each holding
    /// between 0 and 99 words. Every field, including the last one, is followed by a comma.
    /// </summary>
    /// <param name="columns">Number of fields to generate.</param>
    /// <param name="random">Random source; it is advanced by this call.</param>
    /// <returns>The generated line, without a line terminator.</returns>
    public static string CreateRandomLine(int columns, Random random)
    {
        var lineSB = new System.Text.StringBuilder();
        for (int i = 0; i < columns; i++)
        {
            lineSB.Append(CreateRandomColumn(random, random.Next(100)));
            lineSB.Append(',');
        }

        return lineSB.ToString();
    }

    /// <summary>
    /// Creates the text of a single field: <paramref name="numwords"/> random alphanumeric
    /// words, each followed by a space. About half of the time the text is lowercased.
    /// </summary>
    /// <param name="random">Random source; it is advanced by this call.</param>
    /// <param name="numwords">Number of words to generate.</param>
    /// <returns>The generated field text; empty when <paramref name="numwords"/> is 0.</returns>
    public static string CreateRandomColumn(Random random, int numwords)
    {
        // NOTE: '0' appears twice in the digit run. It is left untouched on purpose, because
        // changing the character set would change the seeded dataset.
        const string characters =
            "01234567890" +
            "abcdefghijklmnopqrstuvwxyz" +
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

        var columnSB = new System.Text.StringBuilder();

        for (int i = 0; i < numwords; i++)
        {
            int wordLength = random.Next(1, MaxWordLength);
            for (int j = 0; j < wordLength; j++)
            {
                columnSB.Append(characters[random.Next(characters.Length)]);
            }

            columnSB.Append(' ');
        }

        // Drawn after the words so the sequence of random draws stays the same.
        bool lowercase = random.Next(2) == 0;
        string column = columnSB.ToString();

        // Invariant casing keeps the generated data identical regardless of the current culture.
        return lowercase ? column.ToLowerInvariant() : column;
    }
}
171+
}

test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Collections.Generic;
67
using System.IO;
78
using System.Linq;
89
using System.Text.RegularExpressions;

0 commit comments

Comments
 (0)