Skip to content

Commit

Permalink
Fixes data invariant format problems
Browse files Browse the repository at this point in the history
The tests do not pass on machines whose culture settings use formatting
different from English. The error happens because the results are written in a
format different from the expected one.

1. The main fix is to apply the en-US culture to the test thread so that results
are output in a format that is comparable with the test format.

2. A secondary fix is to make comparisons between culture-sensitive data type
representations invariant when they do not have human-readable dimensions. In
the case of OptimizationMonitor.cs, the culture-sensitive conversion between
floating point and string causes errors of orders of magnitude in the output results.

The intention of this patch is not to offer a robust solution or to prevent all
future issues. There is room for refactoring where, for instance, locale information
would be applied to input and output, and logging/tracing would be clearly
separated from other kinds of locale-sensitive handling. This way, culture-sensitive
parts would be separated and particular output formats could be
tested as separate cases if so desired.

Fixes #74
  • Loading branch information
veikkoeeva committed May 11, 2018
1 parent 3780923 commit 135791d
Show file tree
Hide file tree
Showing 13 changed files with 74 additions and 59 deletions.
23 changes: 12 additions & 11 deletions src/Microsoft.ML.Core/Environment/TlcEnvironment.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Threading;
Expand Down Expand Up @@ -212,7 +213,7 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra
PrintOperationStop(_out, ev);
break;
case ProgressReporting.ProgressEvent.EventKind.Progress:
_out.Write("[{0}] ", ev.Index);
_out.Write(string.Format(CultureInfo.InvariantCulture, "[{0}] ", ev.Index));
PrintProgressLine(_out, ev);
break;
}
Expand All @@ -225,7 +226,7 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra

if (PrintDot())
{
// We need to print an extended status line. At this point, every event should be
// We need to print an extended status line. At this point, every event should be
// a non-checkpoint progress event.
bool needPrepend = entries.Count > 1;
foreach (var ev in entries)
Expand All @@ -236,7 +237,7 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra
{
EnsureNewLine();
WriteAndReturnLinePrefix(MessageSensitivity.None, _out);
_out.Write("[{0}] ", ev.Index);
_out.Write(string.Format(CultureInfo.InvariantCulture, "[{0}] ", ev.Index));
}
else
{
Expand All @@ -252,24 +253,24 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra

private static void PrintOperationStart(TextWriter writer, ProgressReporting.ProgressEvent ev)
{
writer.WriteLine("[{0}] '{1}' started.", ev.Index, ev.Name);
writer.WriteLine(string.Format(CultureInfo.InvariantCulture, "[{0}] '{1}' started.", ev.Index, ev.Name));
}

private static void PrintOperationStop(TextWriter writer, ProgressReporting.ProgressEvent ev)
{
writer.WriteLine("[{0}] '{1}' finished in {2}.", ev.Index, ev.Name, ev.EventTime - ev.StartTime);
writer.WriteLine(string.Format(CultureInfo.InvariantCulture, "[{0}] '{1}' finished in {2}.", ev.Index, ev.Name, ev.EventTime - ev.StartTime));
}

private void PrintProgressLine(TextWriter writer, ProgressReporting.ProgressEvent ev)
{
// Elapsed time.
var elapsed = ev.EventTime - ev.StartTime;
if (elapsed.TotalMinutes < 1)
writer.Write("(00:{0:00.00})", elapsed.TotalSeconds);
writer.Write(string.Format(CultureInfo.InvariantCulture, "(00:{0:00.00})", elapsed.TotalSeconds));
else if (elapsed.TotalHours < 1)
writer.Write("({0:00}:{1:00.0})", elapsed.Minutes, elapsed.TotalSeconds - 60 * elapsed.Minutes);
writer.Write(string.Format(CultureInfo.InvariantCulture, "({0:00}:{1:00.0})", elapsed.Minutes, elapsed.TotalSeconds - 60 * elapsed.Minutes));
else
writer.Write("({0:00}:{1:00}:{2:00})", elapsed.Hours, elapsed.Minutes, elapsed.Seconds);
writer.Write(string.Format(CultureInfo.InvariantCulture, "({0:00}:{1:00}:{2:00})", elapsed.Hours, elapsed.Minutes, elapsed.Seconds));

// Progress units.
bool first = true;
Expand All @@ -281,7 +282,7 @@ private void PrintProgressLine(TextWriter writer, ProgressReporting.ProgressEven
first = false;
writer.Write("{0}", ev.ProgressEntry.Progress[i]);
if (ev.ProgressEntry.ProgressLim[i] != null)
writer.Write("/{0}", ev.ProgressEntry.ProgressLim[i].Value);
writer.Write("/{0}", ev.ProgressEntry.ProgressLim[i].Value.ToString(CultureInfo.InvariantCulture));
writer.Write(" {0}", ev.ProgressEntry.Header.UnitNames[i]);
}

Expand All @@ -291,7 +292,7 @@ private void PrintProgressLine(TextWriter writer, ProgressReporting.ProgressEven
if (ev.ProgressEntry.Metrics[i] == null)
continue;
// REVIEW: print metrics prettier.
writer.Write("\t{0}: {1}", ev.ProgressEntry.Header.MetricNames[i], ev.ProgressEntry.Metrics[i].Value);
writer.Write("\t{0}: {1}", ev.ProgressEntry.Header.MetricNames[i], ev.ProgressEntry.Metrics[i].Value.ToString(CultureInfo.InvariantCulture));
}

writer.WriteLine();
Expand All @@ -306,7 +307,7 @@ private void EnsureNewLine(bool isError = false)
return;

// If _err and _out is the same writer, we need to print new line as well.
// If _out and _err writes to Console.Out and Console.Error respectively,
// If _out and _err writes to Console.Out and Console.Error respectively,
// in the general user scenario they ends up with writing to the same underlying stream,.
// so write a new line to the stream anyways.
if (isError && _err != _out && (_out != Console.Out || _err != Console.Error))
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.Data/Utilities/TimerScope.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Globalization;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;

Expand Down Expand Up @@ -46,7 +47,7 @@ public void Dispose()

// REVIEW: This is \n\n is to prevent changes across bunch of baseline files.
// Ideally we should change our comparison method to ignore empty lines.
_ch.Info("{0}\t Time elapsed(s): {1}\n\n", DateTime.Now, elapsedSeconds);
_ch.Info("{0}\t Time elapsed(s): {1}\n\n", DateTime.Now.ToString(CultureInfo.InvariantCulture), elapsedSeconds.ToString(CultureInfo.InvariantCulture));

using (var pipe = _host.StartPipe<TelemetryMessage>("TelemetryPipe"))
{
Expand Down
7 changes: 4 additions & 3 deletions src/Microsoft.ML.FastTree/Training/Test.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
Expand Down Expand Up @@ -191,7 +192,7 @@ public virtual string FormatInfoString()
var sb = new System.Text.StringBuilder();
foreach (var r in ComputeTests())
{
sb.AppendFormat("{0}.{1}={2}\n", ScoreTracker.DatasetName, r.LossFunctionName, r.FinalValue);
sb.AppendFormat(CultureInfo.InvariantCulture, "{0}.{1}={2}\n", ScoreTracker.DatasetName, r.LossFunctionName, r.FinalValue);
}
return sb.ToString();
}
Expand Down Expand Up @@ -377,7 +378,7 @@ public override string FormatInfoString()
{
if (i > 1)
sb.Append("\t");
sb.AppendFormat("@{0}:{1:00.00}", i++, 100.0 * t.FinalValue);
sb.AppendFormat(CultureInfo.InvariantCulture, "@{0}:{1:00.00}", i++, 100.0 * t.FinalValue);
}
sb.AppendLine();
return sb.ToString();
Expand Down Expand Up @@ -512,7 +513,7 @@ public override string FormatInfoString()
{
if (i > 1)
sb.Append("\t");
sb.AppendFormat("{0}:{1:00.00}", t.LossFunctionName, t.FinalValue);
sb.AppendFormat(CultureInfo.InvariantCulture, "{0}:{1:00.00}", t.LossFunctionName, t.FinalValue);
i++;
}
sb.AppendLine();
Expand Down
19 changes: 10 additions & 9 deletions src/Microsoft.ML.FastTree/TreeEnsemble/Ensemble.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
Expand Down Expand Up @@ -128,13 +129,13 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,

numNodes += evaluatorCounter;

sb.AppendFormat("[TreeEnsemble]\nInputs={0}\nEvaluators={1}\n", featureToID.Count, evaluatorCounter + 1);
sb.AppendFormat(CultureInfo.InvariantCulture, "[TreeEnsemble]\nInputs={0}\nEvaluators={1}\n", featureToID.Count, evaluatorCounter + 1);

sb.Append(sbInput);
sb.Append(sbEvaluator);

// Append the final aggregator
sb.AppendFormat("\n[Evaluator:{0}]\nEvaluatorType=Aggregator\nNumNodes={1}\nNodes=", evaluatorCounter + 1, numNodes);
sb.AppendFormat(CultureInfo.InvariantCulture, "\n[Evaluator:{0}]\nEvaluatorType=Aggregator\nNumNodes={1}\nNodes=", evaluatorCounter + 1, numNodes);

// Nodes
if (_firstInputInitializationContent != null)
Expand Down Expand Up @@ -163,7 +164,7 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,
{
if (_firstInputInitializationContent != null)
sb.Append("\t");
sb.AppendFormat("{0}", _trees[0].Weight);
sb.AppendFormat(CultureInfo.InvariantCulture, "{0}", _trees[0].Weight);
}

for (int w = 1; w < NumTrees; ++w)
Expand All @@ -172,7 +173,7 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,
{
sb.Append("\t");
}
sb.Append(_trees[w].Weight);
sb.Append(_trees[w].Weight.ToString(CultureInfo.InvariantCulture));
}

sb.AppendFormat("\nBias={0}", Bias);
Expand All @@ -193,15 +194,15 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,

protected int AppendComments(StringBuilder sb, string trainingParams)
{
sb.AppendFormat("\n\n[Comments]\nC:0=Regression Tree Ensemble\nC:1=Generated using FastTree\nC:2=Created on {0}\n", DateTime.Now);
sb.AppendFormat("\n\n[Comments]\nC:0=Regression Tree Ensemble\nC:1=Generated using FastTree\nC:2=Created on {0}\n", DateTime.Now.ToString(CultureInfo.InvariantCulture));

string[] trainingParamsList = trainingParams.Split(new char[] { '\n' });
int i = 0;
for (; i < trainingParamsList.Length; ++i)
{
if (trainingParamsList[i].Length > 0)
{
sb.AppendFormat("C:{0}=PARAM:{1}\n", i + 3, trainingParamsList[i]);
sb.AppendFormat(CultureInfo.InvariantCulture, "C:{0}=PARAM:{1}\n", i + 3, trainingParamsList[i]);
}
}
return i + 3;
Expand Down Expand Up @@ -328,15 +329,15 @@ public string ToGainSummary(FeaturesToContentMap fmap, Dictionary<int, int> feat
foreach (var pair in sortedByGain)
{
int outputInputId = featureToID.ContainsKey(pair.Key) ? featureToID[pair.Key] : 0;
output.Append(string.Format("C:{0}=FG:I{1}:{2}:{3}\n", startingCommentNumber++, outputInputId,
fmap.GetName(pair.Key), Math.Pow(pair.Value, power) / normalizingFactor));
output.AppendFormat(CultureInfo.InvariantCulture, "C:{0}=FG:I{1}:{2}:{3}\n", startingCommentNumber++, outputInputId,
fmap.GetName(pair.Key), Math.Pow(pair.Value, power) / normalizingFactor);
}
return output.ToString();
}

/// <summary>
/// Returns a vector of feature contributions for a given example.
/// <paramref name="builder"/> is used as a buffer to accumulate the contributions across trees.
/// <paramref name="builder"/> is used as a buffer to accumulate the contributions across trees.
/// If <paramref name="builder"/> is null, it will be created, otherwise it will be reused.
/// </summary>
internal void GetFeatureContributions(ref VBuffer<float> features, ref VBuffer<float> contribs, ref BufferBuilder<float> builder)
Expand Down
18 changes: 10 additions & 8 deletions src/Microsoft.ML.FastTree/TreeEnsemble/RegressionTree.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

using Float = System.Single;

using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
Expand All @@ -18,7 +20,7 @@
using LotusvNext.Expressions;*/
using Microsoft.ML.Runtime.Model.Pfa;
using Microsoft.ML.Runtime.Internal.Internallearn;
using Newtonsoft.Json.Linq;


namespace Microsoft.ML.Runtime.FastTree.Internal
{
Expand All @@ -45,12 +47,12 @@ public class RegressionTree
/// </summary>
public bool[] CategoricalSplit { get; }
/// <summary>
/// Array of categorical values for the categorical feature that might be chosen as
/// Array of categorical values for the categorical feature that might be chosen as
/// a split feature for a node.
/// </summary>
public int[][] CategoricalSplitFeatures;
/// <summary>
/// For a given categorical feature that is chosen as a split feature for a node, this
/// For a given categorical feature that is chosen as a split feature for a node, this
/// array contains it's start and end range in the input feature vector at prediction time.
/// </summary>
public int[][] CategoricalSplitFeatureRanges;
Expand Down Expand Up @@ -1184,7 +1186,7 @@ public void ToTreeEnsembleFormat(StringBuilder sbEvaluator, StringBuilder sbInpu
private void ToTreeEnsembleFormatForCategoricalSplit(StringBuilder sbEvaluator, StringBuilder sbInput, FeaturesToContentMap featureContents,
ref int evaluatorCounter, Dictionary<int, int> featureToId, Dictionary<int, int> categoricalSplitNodeToId)
{
//REVIEW: Can all these conditions even be true?
//REVIEW: Can all these conditions even be true?
if (CategoricalSplitFeatures == null ||
CategoricalSplitFeatures.Length == 0 ||
CategoricalSplitFeatures.All(val => val == null))
Expand Down Expand Up @@ -1234,7 +1236,7 @@ private void ToTreeEnsembleFormatForCategoricalSplit(StringBuilder sbEvaluator,
sbLteChild.Append((n + 1) + toAppend);
sbGtChild.Append(~n + toAppend);
sbOutput.Append("1\t");
sbThreshold.Append(((double)0.5).ToString("R") + toAppend);
sbThreshold.Append(((double)0.5).ToString("R", CultureInfo.InvariantCulture) + toAppend);
}

sbOutput.Append("0");
Expand Down Expand Up @@ -1266,12 +1268,12 @@ public string ToOldIni(FeatureNameCollection featureNames)
if (gtChildCorrected < 0)
gtChildCorrected = numNonLeaves + (~gtChildCorrected);

output.AppendFormat("\nNodeType:{0}=Branch\nNodeDecision:{0}={1}\nNodeThreshold:{0}={2}\nNodeLTE:{0}={3}\nNodeGT:{0}={4}\n", n, name, currentThreshold, lteChildCorrected, gtChildCorrected);
output.AppendFormat(CultureInfo.InvariantCulture, "\nNodeType:{0}=Branch\nNodeDecision:{0}={1}\nNodeThreshold:{0}={2}\nNodeLTE:{0}={3}\nNodeGT:{0}={4}\n", n, name, currentThreshold, lteChildCorrected, gtChildCorrected);
}

for (int n = 0; n < NumLeaves; ++n)
{
output.AppendFormat("\nNodeType:{0}=Value\nNodeValue:{0}={1}\n", numNonLeaves + n, LeafValues[n]);
output.AppendFormat(CultureInfo.InvariantCulture, "\nNodeType:{0}=Value\nNodeValue:{0}={1}\n", numNonLeaves + n, LeafValues[n]);
}

return output.ToString();
Expand Down Expand Up @@ -1552,7 +1554,7 @@ public void AppendFeatureContributions(ref VBuffer<Float> src, BufferBuilder<Flo
var ghostLeaf = GetLeafFrom(ref src, otherWay);
var ghostOutput = GetOutput(ghostLeaf);

// If the ghost got a smaller output, the contribution of the feature is positive, so
// If the ghost got a smaller output, the contribution of the feature is positive, so
// the contribution is true minus ghost.
contributions.AddFeature(ifeat, (Float)(trueOutput - ghostOutput));
}
Expand Down
7 changes: 4 additions & 3 deletions src/Microsoft.ML.FastTree/Utils/Timer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Globalization;
using System.Text;
using System.Threading;

Expand Down Expand Up @@ -156,7 +157,7 @@ public override string ToString()

string padded = "Name".PadRight(MaxEventNameLen);

sb.AppendFormat("{0} {1,10}{2,10}{3,8}{4,11}\n", padded, "Time", "%", "#Calls", "Time/Call");
sb.AppendFormat(CultureInfo.InvariantCulture, "{0} {1,10}{2,10}{3,8}{4,11}\n", padded, "Time", "%", "#Calls", "Time/Call");
foreach (TimerEvent n in Enum.GetValues(typeof(TimerEvent)))
{
double time = (double)TickTotals[(int)n] / Stopwatch.Frequency;
Expand All @@ -167,7 +168,7 @@ public override string ToString()

padded = n.ToString().PadRight(MaxEventNameLen);

sb.AppendFormat("{0} {1,10:0.000}{2,9:00.00}%{3,8}{4,11:0.000}\n", padded, time, perc, numCalls, timePerCall);
sb.AppendFormat(CultureInfo.InvariantCulture, "{0} {1,10:0.000}{2,9:00.00}%{3,8}{4,11:0.000}\n", padded, time, perc, numCalls, timePerCall);
}
sb.AppendFormat("Count Statistics:\n");
padded = "Name".PadRight(MaxEventNameLen);
Expand All @@ -178,7 +179,7 @@ public override string ToString()

padded = n.ToString().PadRight(MaxEventNameLen);

sb.AppendFormat("{0} {1,10}\n", padded, count);
sb.AppendFormat(CultureInfo.InvariantCulture, "{0} {1,10}\n", padded, count);
}
return sb.ToString();
}
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.FastTree/Utils/VectorUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Globalization;
using System.Text;

namespace Microsoft.ML.Runtime.FastTree.Internal
Expand Down Expand Up @@ -338,7 +339,7 @@ public static string ToString(double[] vector)
{
sb.Append(", ");
}
sb.Append(vector[f]);
sb.Append(vector[f].ToString(CultureInfo.InvariantCulture));
}
return sb.ToString();
}
Expand Down
Loading

0 comments on commit 135791d

Please sign in to comment.