From 2d5fbf6766e93330edbd18a9f770a2e5aa544aa0 Mon Sep 17 00:00:00 2001
From: Senja Filipi
Date: Thu, 25 Oct 2018 23:53:10 -0700
Subject: [PATCH 01/14] Adding another flavor of LR in the HAL learners package. This will compute the standard errors.

---
 .../LRWithTrainingStatistics.cs               | 346 ++++++++++++++++++
 .../LogisticRegression/LbfgsPredictorBase.cs  |  14 +-
 .../LogisticRegression/LogisticRegression.cs  |  22 +-
 .../Standard/ModelStatistics.cs               |   4 +-
 4 files changed, 366 insertions(+), 20 deletions(-)
 create mode 100644 src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs

diff --git a/src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs b/src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs
new file mode 100644
index 0000000000..a8a9ee0d3a
--- /dev/null
+++ b/src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs
@@ -0,0 +1,346 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.EntryPoints;
+using Microsoft.ML.Runtime.Internal.Internallearn;
+using Microsoft.ML.Runtime.Internal.Utilities;
+using Microsoft.ML.Runtime.Learners;
+using Microsoft.ML.Runtime.Numeric;
+using Microsoft.ML.Runtime.Training;
+using Microsoft.ML.Trainers.HalLearners;
+using System;
+using System.Collections.Generic;
+
+[assembly: LoadableClass(LogisticRegressionWithStats.Summary, typeof(LogisticRegressionWithStats), typeof(LogisticRegression.Arguments),
+    new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer), typeof(SignatureFeatureScorerTrainer) },
+    LogisticRegressionWithStats.UserNameValue,
+    LogisticRegressionWithStats.LoadNameValue,
+    LogisticRegressionWithStats.ShortName,
+    "logisticregressionwrapper")]
+
+[assembly: LoadableClass(typeof(void), typeof(LogisticRegressionWithStats), null, typeof(SignatureEntryPointModule), LogisticRegressionWithStats.LoadNameValue)]
+
+namespace Microsoft.ML.Trainers.HalLearners
+{
+    using Mkl = OlsLinearRegressionTrainer.Mkl;
+
+    /// <summary>
+    /// A flavor of binary logistic regression that also computes the standard errors of the trained coefficients.
+    /// </summary>
+    public sealed partial class LogisticRegressionWithStats : LogisticRegression
+    {
+        public new const string LoadNameValue = "LogisticRegressionWithStats";
+        internal const string UserNameValue = "Logistic Regression";
+        internal const string ShortName = "lrwstats";
+        internal const string Summary = "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can "
+            + "be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistic function.";
+
+        /// <summary>
+        /// Initializes a new instance of <see cref="LogisticRegressionWithStats"/>.
+        /// </summary>
+        /// <param name="env">The environment to use.</param>
+        /// <param name="labelColumn">The name of the label column.</param>
+        /// <param name="featureColumn">The name of the feature column.</param>
+        /// <param name="weightColumn">The name for the example weight column.</param>
+        /// <param name="enforceNoNegativity">Enforce non-negative weights.</param>
+        /// <param name="l1Weight">Weight of L1 regularizer term.</param>
+        /// <param name="l2Weight">Weight of L2 regularizer term.</param>
+        /// <param name="memorySize">Memory size for <see cref="LogisticRegressionWithStats"/>. Lower=faster, less accurate.</param>
+        /// <param name="optimizationTolerance">Threshold for optimizer convergence.</param>
+        /// <param name="advancedSettings">A delegate to apply all the advanced arguments to the algorithm.</param>
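+        /// <example>
+        /// A minimal usage sketch; the environment and column names below are illustrative placeholders, not part of this change:
+        /// <code>
+        /// var trainer = new LogisticRegressionWithStats(env, "Features", "Label");
+        /// </code>
+        /// </example>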
+        public LogisticRegressionWithStats(IHostEnvironment env,
+            string featureColumn,
+            string labelColumn,
+            string weightColumn = null,
+            float l1Weight = Arguments.Defaults.L1Weight,
+            float l2Weight = Arguments.Defaults.L2Weight,
+            float optimizationTolerance = Arguments.Defaults.OptTol,
+            int memorySize = Arguments.Defaults.MemorySize,
+            bool enforceNoNegativity = Arguments.Defaults.EnforceNonNegativity,
+            Action<Arguments> advancedSettings = null)
+            : base(env, featureColumn, labelColumn, weightColumn,
+                  l1Weight, l2Weight, optimizationTolerance, memorySize, enforceNoNegativity, advancedSettings)
+        {
+        }
+
+        /// <summary>
+        /// Initializes a new instance of <see cref="LogisticRegressionWithStats"/>.
+        /// </summary>
+        internal LogisticRegressionWithStats(IHostEnvironment env, Arguments args)
+            : base(env, args)
+        {
+        }
+
+        protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams)
+        {
+            Contracts.AssertValue(ch);
+            Contracts.AssertValue(cursorFactory);
+            Contracts.Assert(NumGoodRows > 0);
+            Contracts.Assert(WeightSum > 0);
+            Contracts.Assert(BiasCount == 1);
+            Contracts.Assert(loss >= 0);
+            Contracts.Assert(numParams >= BiasCount);
+            Contracts.Assert(CurrentWeights.IsDense);
+
+            ch.Info("Model trained with {0} training examples.", NumGoodRows);
+
+            // Compute deviance: start with the loss function.
+            float deviance = (float)(2 * loss * WeightSum);
+
+            if (L2Weight > 0)
+            {
+                // Need to subtract the L2 regularization loss.
+                // The bias term is not regularized.
+                var regLoss = VectorUtils.NormSquared(CurrentWeights.Values, 1, CurrentWeights.Length - 1) * L2Weight;
+                deviance -= regLoss;
+            }
+
+            if (L1Weight > 0)
+            {
+                // Need to subtract the L1 regularization loss.
+                // The bias term is not regularized.
+                Double regLoss = 0;
+                VBufferUtils.ForEachDefined(ref CurrentWeights, (ind, value) => { if (ind >= BiasCount) regLoss += Math.Abs(value); });
+                deviance -= (float)regLoss * L1Weight * 2;
+            }
+
+            ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0));
+
+            // Compute the null deviance, i.e., the deviance of the null hypothesis.
+            // Cap the prior positive rate at 1e-15.
+            Double priorPosRate = PosWeight / WeightSum;
+            Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1);
+            float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15) ?
+                0f : (float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true));
+            ch.Info("Null Deviance: \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1);
+
+            // Compute AIC.
+            ch.Info("AIC: \t{0}", 2 * numParams + deviance);
+
+            // Show the coefficients statistics table.
+            var featureColIdx = cursorFactory.Data.Schema.Feature.Index;
+            var schema = cursorFactory.Data.Data.Schema;
+            var featureLength = CurrentWeights.Length - BiasCount;
+            var namesSpans = VBufferUtils.CreateEmpty<ReadOnlyMemory<char>>(featureLength);
+            if (schema.HasSlotNames(featureColIdx, featureLength))
+                schema.GetMetadata(MetadataUtils.Kinds.SlotNames, featureColIdx, ref namesSpans);
+            Host.Assert(namesSpans.Length == featureLength);
+
+            // Inverse mapping of non-zero weight slots.
+            Dictionary<int, int> weightIndicesInvMap = null;
+
+            // Indices of the bias and non-zero weight slots.
+            int[] weightIndices = null;
+
+            // Whether all weights are non-zero.
+            bool denseWeight = numParams == CurrentWeights.Length;
+
+            // Extract the non-zero indices of the weights.
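+            // Illustrative example (values assumed, not from the source): with
+            // CurrentWeights = (bias, 0, 0.7, 0, 0.2) and numParams = 3, the loop below fills
+            // weightIndices = [0, 2, 4] and weightIndicesInvMap = {0 -> 0, 2 -> 1, 4 -> 2};
+            // slot 0 is always reserved for the bias.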
+            if (!denseWeight)
+            {
+                weightIndices = new int[numParams];
+                weightIndicesInvMap = new Dictionary<int, int>(numParams);
+                weightIndices[0] = 0;
+                weightIndicesInvMap[0] = 0;
+                int j = 1;
+                for (int i = 1; i < CurrentWeights.Length; i++)
+                {
+                    if (CurrentWeights.Values[i] != 0)
+                    {
+                        weightIndices[j] = i;
+                        weightIndicesInvMap[i] = j++;
+                    }
+                }
+
+                Contracts.Assert(j == numParams);
+            }
+
+            // Compute the standard error of coefficients.
+            long hessianDimension = (long)numParams * (numParams + 1) / 2;
+            if (hessianDimension > int.MaxValue)
+            {
+                ch.Warning("The number of parameters is too large. Cannot hold the variance-covariance matrix in memory. " +
+                    "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer " +
+                    "to reduce the number of parameters.");
+                Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance);
+                return;
+            }
+
+            // Building the variance-covariance matrix for parameters.
+            // The layout is a packed row-major lower triangular matrix.
+            // For example, the layout of indices for 4-by-4:
+            // 0
+            // 1 2
+            // 3 4 5
+            // 6 7 8 9
+            var hessian = new Double[hessianDimension];
+
+            // Initialize diagonal elements with L2 regularizers except for the first entry (index 0),
+            // since the bias is not regularized.
+            if (L2Weight > 0)
+            {
+                // i is the array index of the diagonal entry at the iRow-th row and iRow-th column.
+                // iRow is one-based.
+                int i = 0;
+                for (int iRow = 2; iRow <= numParams; iRow++)
+                {
+                    i += iRow;
+                    hessian[i] = L2Weight;
+                }
+
+                Contracts.Assert(i == hessian.Length - 1);
+            }
+
+            // Initialize the remaining entries.
+            var bias = CurrentWeights.Values[0];
+            using (var cursor = cursorFactory.Create())
+            {
+                while (cursor.MoveNext())
+                {
+                    var label = cursor.Label;
+                    var weight = cursor.Weight;
+                    var score = bias + VectorUtils.DotProductWithOffset(ref CurrentWeights, 1, ref cursor.Features);
+                    // Compute the Bernoulli variance n_i * p_i * (1 - p_i) for the i-th training example.
+                    var variance = weight / (2 + 2 * Math.Cosh(score));
+
+                    // Increment the first entry of hessian.
+                    hessian[0] += variance;
+
+                    var values = cursor.Features.Values;
+                    if (cursor.Features.IsDense)
+                    {
+                        int ioff = 1;
+
+                        // Increment the remaining entries of hessian.
+                        for (int i = 1; i < numParams; i++)
+                        {
+                            ch.Assert(ioff == i * (i + 1) / 2);
+                            int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1;
+                            Contracts.Assert(0 <= wi && wi < cursor.Features.Length);
+                            var val = values[wi] * variance;
+                            // Add the implicit first bias term to X'X.
+                            hessian[ioff++] += val;
+                            // Add the remainder of X'X.
+                            for (int j = 0; j < i; j++)
+                            {
+                                int wj = weightIndices == null ? j : weightIndices[j + 1] - 1;
+                                Contracts.Assert(0 <= wj && wj < cursor.Features.Length);
+                                hessian[ioff++] += val * values[wj];
+                            }
+                        }
+                        ch.Assert(ioff == hessian.Length);
+                    }
+                    else
+                    {
+                        var indices = cursor.Features.Indices;
+                        for (int ii = 0; ii < cursor.Features.Count; ++ii)
+                        {
+                            int i = indices[ii];
+                            int wi = i + 1;
+                            if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(i + 1, out wi))
+                                continue;
+
+                            Contracts.Assert(0 < wi && wi <= cursor.Features.Length);
+                            int ioff = wi * (wi + 1) / 2;
+                            var val = values[ii] * variance;
+                            // Add the implicit first bias term to X'X.
+                            hessian[ioff] += val;
+                            // Add the remainder of X'X.
+                            for (int jj = 0; jj <= ii; jj++)
+                            {
+                                int j = indices[jj];
+                                int wj = j + 1;
+                                if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(j + 1, out wj))
+                                    continue;
+
+                                Contracts.Assert(0 < wj && wj <= cursor.Features.Length);
+                                hessian[ioff + wj] += val * values[jj];
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Apply a Cholesky decomposition to find the inverse of the Hessian.
+            Double[] invHessian = null;
+            try
+            {
+                // First, find the Cholesky decomposition LL' of the Hessian.
+                Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
+                // Note that hessian is already modified at this point. It is no longer the original Hessian,
+                // but instead represents the Cholesky decomposition L.
+                // Also note that the following routine is supposed to consume the Cholesky decomposition L instead
+                // of the original information matrix.
+                Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
+                // At this point, hessian should contain the inverse of the original Hessian matrix.
+                // Swap hessian with invHessian to avoid confusion in the following context.
+                Utils.Swap(ref hessian, ref invHessian);
+                Contracts.Assert(hessian == null);
+            }
+            catch (DllNotFoundException)
+            {
+                throw ch.ExceptNotSupp("The MKL library (Microsoft.ML.MklImports.dll) or one of its dependencies is missing.");
+            }
+
+            float[] stdErrorValues = new float[numParams];
+            stdErrorValues[0] = (float)Math.Sqrt(invHessian[0]);
+
+            for (int i = 1; i < numParams; i++)
+            {
+                // Initialize with the diagonal entries of the inverse Hessian.
+                stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i];
+            }
+
+            if (L2Weight > 0)
+            {
+                // Iterate through all entries of the inverse Hessian to adjust the variance.
+ // A discussion on ridge regularized LR coefficient covariance matrix can be found here: + // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ + // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf + int ioffset = 1; + for (int iRow = 1; iRow < numParams; iRow++) + { + for (int iCol = 0; iCol <= iRow; iCol++) + { + var entry = (Single)invHessian[ioffset]; + var adjustment = -L2Weight * entry * entry; + stdErrorValues[iRow] -= adjustment; + if (0 < iCol && iCol < iRow) + stdErrorValues[iCol] -= adjustment; + ioffset++; + } + } + + Contracts.Assert(ioffset == invHessian.Length); + } + + for (int i = 1; i < numParams; i++) + stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]); + + VBuffer stdErrors = new VBuffer(CurrentWeights.Length, numParams, stdErrorValues, weightIndices); + Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, ref stdErrors); + } + + [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionBinaryClassifierWithStats", + Desc = Summary, + UserName = UserNameValue, + ShortName = ShortName, + XmlInclude = new[] { @"", + @""})] + + public static new CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("TrainLRBinary"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + return LearnerEntryPointsUtils.Train(host, input, + () => new LogisticRegressionWithStats(host, input), + () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn), + () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); + } + } +} diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs index 5f7a40c5d2..c7656822a7 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs @@ -92,14 +92,14 @@ public abstract class ArgumentsBase : LearnerInputBaseWithWeight [Argument(ArgumentType.AtMostOnce, HelpText = "Enforce non-negative weights", ShortName = "nn", SortOrder = 90)] public bool EnforceNonNegativity = Defaults.EnforceNonNegativity; - internal static class Defaults + public static class Defaults { - internal const float L2Weight = 1; - internal const float L1Weight = 1; - internal const float OptTol = 1e-7f; - internal const int MemorySize = 20; - internal const int MaxIterations = int.MaxValue; - internal const bool EnforceNonNegativity = false; + public const float L2Weight = 1; + public const float L1Weight = 1; + public const float OptTol = 1e-7f; + public const int MemorySize = 20; + public const int MaxIterations = int.MaxValue; + public const bool EnforceNonNegativity = false; } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 48757b347a..1b827c1e10 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -30,7 +30,7 @@ namespace Microsoft.ML.Runtime.Learners /// /// - public sealed partial class LogisticRegression : LbfgsTrainerBase, ParameterMixingCalibratedPredictor> + public partial 
class LogisticRegression : LbfgsTrainerBase, ParameterMixingCalibratedPredictor> { public const string LoadNameValue = "LogisticRegression"; internal const string UserNameValue = "Logistic Regression"; @@ -44,8 +44,8 @@ public sealed class Arguments : ArgumentsBase public bool ShowTrainingStats = false; } - private Double _posWeight; - private LinearModelStatistics _stats; + protected Double PosWeight; + protected LinearModelStatistics Stats; /// /// Initializes a new instance of @@ -76,17 +76,17 @@ public LogisticRegression(IHostEnvironment env, Host.CheckNonEmpty(featureColumn, nameof(featureColumn)); Host.CheckNonEmpty(labelColumn, nameof(labelColumn)); - _posWeight = 0; + PosWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; } /// /// Initializes a new instance of /// - internal LogisticRegression(IHostEnvironment env, Arguments args) + public LogisticRegression(IHostEnvironment env, Arguments args) : base(env, args, TrainerUtils.MakeBoolScalarLabel(args.LabelColumn)) { - _posWeight = 0; + PosWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; } @@ -177,7 +177,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. // Compute null deviance, i.e., the deviance of null hypothesis. // Cap the prior positive rate at 1e-15. - Double priorPosRate = _posWeight / WeightSum; + Double priorPosRate = PosWeight / WeightSum; Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1); float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15) ? 0f : (float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true)); @@ -231,7 +231,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. ch.Warning("The number of parameter is too large. Cannot hold the variance-covariance matrix in memory. " + "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer" + "to reduce the number of parameters."); - _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); + Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); return; } @@ -330,13 +330,13 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. 
} } - _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); + Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); } protected override void ProcessPriorDistribution(float label, float weight) { if (label > 0) - _posWeight += weight; + PosWeight += weight; } //Override default termination criterion MeanRelativeImprovementCriterion with @@ -373,7 +373,7 @@ protected override ParameterMixingCalibratedPredictor CreatePredictor() CurrentWeights.GetItemOrDefault(0, ref bias); CurrentWeights.CopyTo(ref weights, 1, CurrentWeights.Length - 1); return new ParameterMixingCalibratedPredictor(Host, - new LinearBinaryPredictor(Host, ref weights, bias, _stats), + new LinearBinaryPredictor(Host, ref weights, bias, Stats), new PlattCalibrator(Host, -1, 0)); } diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index 8e12b04c3f..1c735d0c66 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -92,7 +92,7 @@ private static VersionInfo GetVersionInfo() public int ParametersCount { get { return _paramCount; } } - internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance) + public LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance) { Contracts.AssertValue(env); env.Assert(trainingExampleCount > 0); @@ -104,7 +104,7 @@ internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, _nullDeviance = nullDeviance; } - internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance, ref VBuffer coeffStdError) + public LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance, ref VBuffer coeffStdError) : this(env, trainingExampleCount, paramCount, deviance, nullDeviance) { _env.Assert(coeffStdError.Count == _paramCount); From 68dd4a66e63bb66923050151082006de585df35c Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 30 Oct 2018 16:33:00 -0700 Subject: [PATCH 02/14] Adding ComputeExtendedTrainingStatistics as an extension method on LR., rather than a separate trainer. --- .../LRWithTrainingStatistics.cs | 346 ------------------ .../LogisticRegressionTrainingStats.cs | 88 +++++ .../LogisticRegression/LbfgsPredictorBase.cs | 30 +- .../LogisticRegression/LogisticRegression.cs | 42 ++- .../MulticlassLogisticRegression.cs | 8 +- .../Standard/ModelStatistics.cs | 22 +- .../PoissonRegression/PoissonRegression.cs | 2 +- .../TrainerEstimators/LbfgsTests.cs | 15 + 8 files changed, 170 insertions(+), 383 deletions(-) delete mode 100644 src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs create mode 100644 src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs diff --git a/src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs b/src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs deleted file mode 100644 index a8a9ee0d3a..0000000000 --- a/src/Microsoft.ML.HalLearners/LRWithTrainingStatistics.cs +++ /dev/null @@ -1,346 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using Microsoft.ML.Runtime; -using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.EntryPoints; -using Microsoft.ML.Runtime.Internal.Internallearn; -using Microsoft.ML.Runtime.Internal.Utilities; -using Microsoft.ML.Runtime.Learners; -using Microsoft.ML.Runtime.Numeric; -using Microsoft.ML.Runtime.Training; -using Microsoft.ML.Trainers.HalLearners; -using System; -using System.Collections.Generic; - -[assembly: LoadableClass(LogisticRegressionWithStats.Summary, typeof(LogisticRegressionWithStats), typeof(LogisticRegression.Arguments), - new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer), typeof(SignatureFeatureScorerTrainer) }, - LogisticRegressionWithStats.UserNameValue, - LogisticRegressionWithStats.LoadNameValue, - LogisticRegressionWithStats.ShortName, - "logisticregressionwrapper")] - -[assembly: LoadableClass(typeof(void), typeof(LogisticRegressionWithStats), null, typeof(SignatureEntryPointModule), LogisticRegressionWithStats.LoadNameValue)] - -namespace Microsoft.ML.Trainers.HalLearners -{ - using Mkl = OlsLinearRegressionTrainer.Mkl; - - /// - /// - public sealed partial class LogisticRegressionWithStats : LogisticRegression - { - public new const string LoadNameValue = "LogisticRegressionWithStats"; - internal const string UserNameValue = "Logistic Regression"; - internal const string ShortName = "lrwstats"; - internal const string Summary = "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can " - + "be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function."; - - /// - /// Initializes a new instance of - /// - /// The environment to use. - /// The name of the label column. - /// The name of the feature column. - /// The name for the example weight column. - /// Enforce non-negative weights. - /// Weight of L1 regularizer term. - /// Weight of L2 regularizer term. - /// Memory size for . Lower=faster, less accurate. - /// Threshold for optimizer convergence. - /// A delegate to apply all the advanced arguments to the algorithm. - public LogisticRegressionWithStats(IHostEnvironment env, - string featureColumn, - string labelColumn, - string weightColumn = null, - float l1Weight = Arguments.Defaults.L1Weight, - float l2Weight = Arguments.Defaults.L2Weight, - float optimizationTolerance = Arguments.Defaults.OptTol, - int memorySize = Arguments.Defaults.MemorySize, - bool enforceNoNegativity = Arguments.Defaults.EnforceNonNegativity, - Action advancedSettings = null) - : base(env, featureColumn, labelColumn, weightColumn, - l1Weight, l2Weight, optimizationTolerance, memorySize, enforceNoNegativity, advancedSettings) - { - } - - /// - /// Initializes a new instance of - /// - internal LogisticRegressionWithStats(IHostEnvironment env, Arguments args) - : base(env, args) - { - } - - protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams) - { - Contracts.AssertValue(ch); - Contracts.AssertValue(cursorFactory); - Contracts.Assert(NumGoodRows > 0); - Contracts.Assert(WeightSum > 0); - Contracts.Assert(BiasCount == 1); - Contracts.Assert(loss >= 0); - Contracts.Assert(numParams >= BiasCount); - Contracts.Assert(CurrentWeights.IsDense); - - ch.Info("Model trained with {0} training examples.", NumGoodRows); - - // Compute deviance: start with loss function. 
- float deviance = (float)(2 * loss * WeightSum); - - if (L2Weight > 0) - { - // Need to subtract L2 regularization loss. - // The bias term is not regularized. - var regLoss = VectorUtils.NormSquared(CurrentWeights.Values, 1, CurrentWeights.Length - 1) * L2Weight; - deviance -= regLoss; - } - - if (L1Weight > 0) - { - // Need to subtract L1 regularization loss. - // The bias term is not regularized. - Double regLoss = 0; - VBufferUtils.ForEachDefined(ref CurrentWeights, (ind, value) => { if (ind >= BiasCount) regLoss += Math.Abs(value); }); - deviance -= (float)regLoss * L1Weight * 2; - } - - ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0)); - - // Compute null deviance, i.e., the deviance of null hypothesis. - // Cap the prior positive rate at 1e-15. - Double priorPosRate = PosWeight / WeightSum; - Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1); - float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15) ? - 0f : (float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true)); - ch.Info("Null Deviance: \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1); - - // Compute AIC. - ch.Info("AIC: \t{0}", 2 * numParams + deviance); - - // Show the coefficients statistics table. - var featureColIdx = cursorFactory.Data.Schema.Feature.Index; - var schema = cursorFactory.Data.Data.Schema; - var featureLength = CurrentWeights.Length - BiasCount; - var namesSpans = VBufferUtils.CreateEmpty>(featureLength); - if (schema.HasSlotNames(featureColIdx, featureLength)) - schema.GetMetadata(MetadataUtils.Kinds.SlotNames, featureColIdx, ref namesSpans); - Host.Assert(namesSpans.Length == featureLength); - - // Inverse mapping of non-zero weight slots. - Dictionary weightIndicesInvMap = null; - - // Indices of bias and non-zero weight slots. - int[] weightIndices = null; - - // Whether all weights are non-zero. - bool denseWeight = numParams == CurrentWeights.Length; - - // Extract non-zero indices of weight. - if (!denseWeight) - { - weightIndices = new int[numParams]; - weightIndicesInvMap = new Dictionary(numParams); - weightIndices[0] = 0; - weightIndicesInvMap[0] = 0; - int j = 1; - for (int i = 1; i < CurrentWeights.Length; i++) - { - if (CurrentWeights.Values[i] != 0) - { - weightIndices[j] = i; - weightIndicesInvMap[i] = j++; - } - } - - Contracts.Assert(j == numParams); - } - - // Compute the standard error of coefficients. - long hessianDimension = (long)numParams * (numParams + 1) / 2; - if (hessianDimension > int.MaxValue) - { - ch.Warning("The number of parameter is too large. Cannot hold the variance-covariance matrix in memory. " + - "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer" + - "to reduce the number of parameters."); - Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); - return; - } - - // Building the variance-covariance matrix for parameters. - // The layout of this algorithm is a packed row-major lower triangular matrix. - // For example, layout of indices for 4-by-4: - // 0 - // 1 2 - // 3 4 5 - // 6 7 8 9 - var hessian = new Double[hessianDimension]; - - // Initialize diagonal elements with L2 regularizers except for the first entry (index 0) - // Since bias is not regularized. - if (L2Weight > 0) - { - // i is the array index of the diagonal entry at iRow-th row and iRow-th column. - // iRow is one-based. 
- int i = 0; - for (int iRow = 2; iRow <= numParams; iRow++) - { - i += iRow; - hessian[i] = L2Weight; - } - - Contracts.Assert(i == hessian.Length - 1); - } - - // Initialize the remaining entries. - var bias = CurrentWeights.Values[0]; - using (var cursor = cursorFactory.Create()) - { - while (cursor.MoveNext()) - { - var label = cursor.Label; - var weight = cursor.Weight; - var score = bias + VectorUtils.DotProductWithOffset(ref CurrentWeights, 1, ref cursor.Features); - // Compute Bernoulli variance n_i * p_i * (1 - p_i) for the i-th training example. - var variance = weight / (2 + 2 * Math.Cosh(score)); - - // Increment the first entry of hessian. - hessian[0] += variance; - - var values = cursor.Features.Values; - if (cursor.Features.IsDense) - { - int ioff = 1; - - // Increment remaining entries of hessian. - for (int i = 1; i < numParams; i++) - { - ch.Assert(ioff == i * (i + 1) / 2); - int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1; - Contracts.Assert(0 <= wi && wi < cursor.Features.Length); - var val = values[wi] * variance; - // Add the implicit first bias term to X'X - hessian[ioff++] += val; - // Add the remainder of X'X - for (int j = 0; j < i; j++) - { - int wj = weightIndices == null ? j : weightIndices[j + 1] - 1; - Contracts.Assert(0 <= wj && wj < cursor.Features.Length); - hessian[ioff++] += val * values[wj]; - } - } - ch.Assert(ioff == hessian.Length); - } - else - { - var indices = cursor.Features.Indices; - for (int ii = 0; ii < cursor.Features.Count; ++ii) - { - int i = indices[ii]; - int wi = i + 1; - if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(i + 1, out wi)) - continue; - - Contracts.Assert(0 < wi && wi <= cursor.Features.Length); - int ioff = wi * (wi + 1) / 2; - var val = values[ii] * variance; - // Add the implicit first bias term to X'X - hessian[ioff] += val; - // Add the remainder of X'X - for (int jj = 0; jj <= ii; jj++) - { - int j = indices[jj]; - int wj = j + 1; - if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(j + 1, out wj)) - continue; - - Contracts.Assert(0 < wj && wj <= cursor.Features.Length); - hessian[ioff + wj] += val * values[jj]; - } - } - } - } - } - - // Apply Cholesky Decomposition to find the inverse of the Hessian. - Double[] invHessian = null; - try - { - // First, find the Cholesky decomposition LL' of the Hessian. - Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian); - // Note that hessian is already modified at this point. It is no longer the original Hessian, - // but instead represents the Cholesky decomposition L. - // Also note that the following routine is supposed to consume the Cholesky decomposition L instead - // of the original information matrix. - Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian); - // At this point, hessian should contain the inverse of the original Hessian matrix. - // Swap hessian with invHessian to avoid confusion in the following context. - Utils.Swap(ref hessian, ref invHessian); - Contracts.Assert(hessian == null); - } - catch (DllNotFoundException) - { - throw ch.ExceptNotSupp("The MKL library (Microsoft.ML.MklImports.dll) or one of its dependencies is missing."); - } - - float[] stdErrorValues = new float[numParams]; - stdErrorValues[0] = (float)Math.Sqrt(invHessian[0]); - - for (int i = 1; i < numParams; i++) - { - // Initialize with inverse Hessian. 
- stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i]; - } - - if (L2Weight > 0) - { - // Iterate through all entries of inverse Hessian to make adjustment to variance. - // A discussion on ridge regularized LR coefficient covariance matrix can be found here: - // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ - // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf - int ioffset = 1; - for (int iRow = 1; iRow < numParams; iRow++) - { - for (int iCol = 0; iCol <= iRow; iCol++) - { - var entry = (Single)invHessian[ioffset]; - var adjustment = -L2Weight * entry * entry; - stdErrorValues[iRow] -= adjustment; - if (0 < iCol && iCol < iRow) - stdErrorValues[iCol] -= adjustment; - ioffset++; - } - } - - Contracts.Assert(ioffset == invHessian.Length); - } - - for (int i = 1; i < numParams; i++) - stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]); - - VBuffer stdErrors = new VBuffer(CurrentWeights.Length, numParams, stdErrorValues, weightIndices); - Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, ref stdErrors); - } - - [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionBinaryClassifierWithStats", - Desc = Summary, - UserName = UserNameValue, - ShortName = ShortName, - XmlInclude = new[] { @"", - @""})] - - public static new CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input) - { - Contracts.CheckValue(env, nameof(env)); - var host = env.Register("TrainLRBinary"); - host.CheckValue(input, nameof(input)); - EntryPointUtils.CheckInputArgs(host, input); - - return LearnerEntryPointsUtils.Train(host, input, - () => new LogisticRegressionWithStats(host, input), - () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn), - () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); - } - } -} diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs new file mode 100644 index 0000000000..5f7bf7441f --- /dev/null +++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs @@ -0,0 +1,88 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Trainers.HalLearners; +using System; + +namespace Microsoft.ML.Runtime.Learners +{ + using Mkl = OlsLinearRegressionTrainer.Mkl; + + /// + /// + public static class LogisticRegressionTrainingStats + { + + public static void ComputeExtendedTrainingStatistics(this LogisticRegression trainer, IChannel ch) + { + Contracts.AssertValue(ch); + Contracts.AssertValue(trainer.Stats, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.Assert(trainer.GetL2Weight > 0); + Contracts.Assert(trainer.GetNumGoodRows > 0); + + ch.Info("Model trained with {0} training examples.", trainer.GetNumGoodRows); + + // Apply Cholesky Decomposition to find the inverse of the Hessian. + Double[] invHessian = null; + try + { + // First, find the Cholesky decomposition LL' of the Hessian. + Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, trainer.GetNumSelectedParams, trainer.Stats.Hessian); + // Note that hessian is already modified at this point. 
It is no longer the original Hessian, + // but instead represents the Cholesky decomposition L. + // Also note that the following routine is supposed to consume the Cholesky decomposition L instead + // of the original information matrix. + Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, trainer.GetNumSelectedParams, trainer.Stats.Hessian); + // At this point, hessian should contain the inverse of the original Hessian matrix. + // Swap hessian with invHessian to avoid confusion in the following context. + Utils.Swap(ref trainer.Stats.Hessian, ref invHessian); + Contracts.Assert(trainer.Stats.Hessian == null); + } + catch (DllNotFoundException) + { + throw ch.ExceptNotSupp("The MKL library (MklImports.dll) or one of its dependencies is missing."); + } + + float[] stdErrorValues = new float[trainer.GetNumSelectedParams]; + stdErrorValues[0] = (float)Math.Sqrt(invHessian[0]); + + for (int i = 1; i < trainer.GetNumSelectedParams; i++) + { + // Initialize with inverse Hessian. + stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i]; + } + + if (trainer.GetL2Weight > 0) + { + // Iterate through all entries of inverse Hessian to make adjustment to variance. + // A discussion on ridge regularized LR coefficient covariance matrix can be found here: + // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ + // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf + int ioffset = 1; + for (int iRow = 1; iRow < trainer.GetNumSelectedParams; iRow++) + { + for (int iCol = 0; iCol <= iRow; iCol++) + { + var entry = (Single)invHessian[ioffset]; + var adjustment = -trainer.GetL2Weight * entry * entry; + stdErrorValues[iRow] -= adjustment; + if (0 < iCol && iCol < iRow) + stdErrorValues[iCol] -= adjustment; + ioffset++; + } + } + + Contracts.Assert(ioffset == invHessian.Length); + } + + for (int i = 1; i < trainer.GetNumSelectedParams; i++) + stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]); + + VBuffer stdErrors = new VBuffer(trainer.GetWeights.Length, trainer.GetNumSelectedParams, stdErrorValues, trainer.Stats.WeightIndices); + trainer.Stats.SetCoeffStdError(ref stdErrors); + } + } +} diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs index c7656822a7..5bbf80eb51 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs @@ -136,6 +136,7 @@ public static class Defaults private VBuffer[] _features; private float[] _labels; private float[] _weights; + protected int NumParams; // Stores the bounds of the chunk to be used by each thread. The 0th slot is 0. The length // is one more than the number of threads to use. @@ -151,6 +152,23 @@ public static class Defaults private static readonly TrainerInfo _info = new TrainerInfo(caching: true, supportIncrementalTrain: true); public override TrainerInfo Info => _info; + /// + /// Gets the number of useful training rows. + /// + public long GetNumGoodRows => NumGoodRows; + + /// + /// Gets the L2weight + /// + public float GetL2Weight => L2Weight; + + public int GetNumSelectedParams => NumParams; + + /// + /// Gets the training weights. 
+ /// + public VBuffer GetWeights => CurrentWeights; + internal LbfgsTrainerBase(IHostEnvironment env, string featureColumn, SchemaShape.Column labelColumn, string weightColumn, Action advancedSettings, float l1Weight, float l2Weight, @@ -230,7 +248,7 @@ private static TArgs ArgsInit(string featureColumn, SchemaShape.Column labelColu } protected virtual int ClassCount => 1; - protected int BiasCount => ClassCount; + public int BiasCount => ClassCount; protected int WeightCount => ClassCount * NumFeatures; protected virtual Optimizer InitializeOptimizer(IChannel ch, FloatLabelCursor.Factory cursorFactory, out VBuffer init, out ITerminationCriterion terminationCriterion) @@ -525,16 +543,16 @@ protected virtual void TrainCore(IChannel ch, RoleMappedData data) ch.Assert(CurrentWeights.Length == BiasCount + WeightCount); - int numParams = BiasCount; + NumParams = BiasCount; if ((L1Weight > 0 && !Quiet) || ShowTrainingStats) { - VBufferUtils.ForEachDefined(ref CurrentWeights, (index, value) => { if (index >= BiasCount && value != 0) numParams++; }); + VBufferUtils.ForEachDefined(ref CurrentWeights, (index, value) => { if (index >= BiasCount && value != 0) NumParams++; }); if (L1Weight > 0 && !Quiet) - ch.Info("L1 regularization selected {0} of {1} weights.", numParams, BiasCount + WeightCount); + ch.Info("L1 regularization selected {0} of {1} weights.", NumParams, BiasCount + WeightCount); } if (ShowTrainingStats) - ComputeTrainingStatistics(ch, cursorFactory, loss, numParams); + ComputeTrainingStatistics(ch, cursorFactory, loss); } // Ensure that the bias portion of vec is represented in vec. @@ -550,7 +568,7 @@ protected void EnsureBiases(ref VBuffer vec) protected abstract float AccumulateOneGradient(ref VBuffer feat, float label, float weight, ref VBuffer xDense, ref VBuffer grad, ref float[] scratch); - protected abstract void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams); + protected abstract void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss); protected abstract void ProcessPriorDistribution(float label, float weight); /// diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 1b827c1e10..1394a7f3f5 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -15,6 +15,7 @@ using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.Numeric; using Microsoft.ML.Runtime.Training; +//using Microsoft.ML.Trainers.HalLearners; [assembly: LoadableClass(LogisticRegression.Summary, typeof(LogisticRegression), typeof(LogisticRegression.Arguments), new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer), typeof(SignatureFeatureScorerTrainer) }, @@ -44,8 +45,8 @@ public sealed class Arguments : ArgumentsBase public bool ShowTrainingStats = false; } - protected Double PosWeight; - protected LinearModelStatistics Stats; + private Double _posWeight; + public LinearModelStatistics Stats; /// /// Initializes a new instance of @@ -76,7 +77,7 @@ public LogisticRegression(IHostEnvironment env, Host.CheckNonEmpty(featureColumn, nameof(featureColumn)); Host.CheckNonEmpty(labelColumn, nameof(labelColumn)); - PosWeight = 0; + _posWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; } @@ -86,7 +87,7 @@ public 
LogisticRegression(IHostEnvironment env, public LogisticRegression(IHostEnvironment env, Arguments args) : base(env, args, TrainerUtils.MakeBoolScalarLabel(args.LabelColumn)) { - PosWeight = 0; + _posWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; } @@ -140,7 +141,7 @@ protected override float AccumulateOneGradient(ref VBuffer feat, float la return weight * datumLoss; } - protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams) + protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss) { Contracts.AssertValue(ch); Contracts.AssertValue(cursorFactory); @@ -148,7 +149,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. Contracts.Assert(WeightSum > 0); Contracts.Assert(BiasCount == 1); Contracts.Assert(loss >= 0); - Contracts.Assert(numParams >= BiasCount); + Contracts.Assert(NumParams >= BiasCount); Contracts.Assert(CurrentWeights.IsDense); ch.Info("Model trained with {0} training examples.", NumGoodRows); @@ -173,18 +174,18 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. deviance -= (float)regLoss * L1Weight * 2; } - ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0)); + ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - NumParams, 0)); // Compute null deviance, i.e., the deviance of null hypothesis. // Cap the prior positive rate at 1e-15. - Double priorPosRate = PosWeight / WeightSum; + Double priorPosRate = _posWeight / WeightSum; Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1); float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15) ? 0f : (float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true)); ch.Info("Null Deviance: \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1); // Compute AIC. - ch.Info("AIC: \t{0}", 2 * numParams + deviance); + ch.Info("AIC: \t{0}", 2 * NumParams + deviance); // Show the coefficients statistics table. var featureColIdx = cursorFactory.Data.Schema.Feature.Index; @@ -202,13 +203,13 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. int[] weightIndices = null; // Whether all weights are non-zero. - bool denseWeight = numParams == CurrentWeights.Length; + bool denseWeight = NumParams == CurrentWeights.Length; // Extract non-zero indices of weight. if (!denseWeight) { - weightIndices = new int[numParams]; - weightIndicesInvMap = new Dictionary(numParams); + weightIndices = new int[NumParams]; + weightIndicesInvMap = new Dictionary(NumParams); weightIndices[0] = 0; weightIndicesInvMap[0] = 0; int j = 1; @@ -221,17 +222,17 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. } } - Contracts.Assert(j == numParams); + Contracts.Assert(j == NumParams); } // Compute the standard error of coefficients. - long hessianDimension = (long)numParams * (numParams + 1) / 2; + long hessianDimension = (long)NumParams * (NumParams + 1) / 2; if (hessianDimension > int.MaxValue) { ch.Warning("The number of parameter is too large. Cannot hold the variance-covariance matrix in memory. " + "Skipping computation of standard errors and z-statistics of coefficients. 
Consider choosing a larger L1 regularizer" + "to reduce the number of parameters."); - Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); + Stats = new LinearModelStatistics(Host, NumGoodRows, NumParams, deviance, nullDeviance); return; } @@ -251,7 +252,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. // i is the array index of the diagonal entry at iRow-th row and iRow-th column. // iRow is one-based. int i = 0; - for (int iRow = 2; iRow <= numParams; iRow++) + for (int iRow = 2; iRow <= NumParams; iRow++) { i += iRow; hessian[i] = L2Weight; @@ -281,7 +282,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. int ioff = 1; // Increment remaining entries of hessian. - for (int i = 1; i < numParams; i++) + for (int i = 1; i < NumParams; i++) { ch.Assert(ioff == i * (i + 1) / 2); int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1; @@ -329,14 +330,15 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. } } } - - Stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); + Stats = new LinearModelStatistics(Host, NumGoodRows, NumParams, deviance, nullDeviance); + Stats.Hessian = hessian; + Stats.WeightIndices = weightIndices; } protected override void ProcessPriorDistribution(float label, float weight) { if (label > 0) - PosWeight += weight; + _posWeight += weight; } //Override default termination criterion MeanRelativeImprovementCriterion with diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs index 5e2003f3e1..33d70c5653 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs @@ -251,7 +251,7 @@ protected override MulticlassLogisticRegressionPredictor CreatePredictor() return new MulticlassLogisticRegressionPredictor(Host, ref CurrentWeights, _numClasses, NumFeatures, _labelNames, _stats); } - protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams) + protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss) { Contracts.AssertValue(ch); Contracts.AssertValue(cursorFactory); @@ -259,7 +259,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. Contracts.Assert(WeightSum > 0); Contracts.Assert(BiasCount == _numClasses); Contracts.Assert(loss >= 0); - Contracts.Assert(numParams >= BiasCount); + Contracts.Assert(NumParams >= BiasCount); Contracts.Assert(CurrentWeights.IsDense); ch.Info("Model trained with {0} training examples.", NumGoodRows); @@ -299,10 +299,10 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. ch.Info("Null Deviance: \t{0}", nullDeviance); // Compute AIC. - ch.Info("AIC: \t{0}", 2 * numParams + deviance); + ch.Info("AIC: \t{0}", 2 * NumParams + deviance); // REVIEW: Figure out how to compute the statistics for the coefficients. 
- _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); + _stats = new LinearModelStatistics(Host, NumGoodRows, NumParams, deviance, nullDeviance); } protected override void ProcessPriorDistribution(float label, float weight) diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index 1c735d0c66..e8ab9c5d79 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -2,17 +2,16 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; -using System.Collections.Generic; -using System.ComponentModel; -using System.IO; -using System.Linq; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Internal.CpuMath; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.Model; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; // This is for deserialization from a model repository. [assembly: LoadableClass(typeof(LinearModelStatistics), null, typeof(SignatureLoadModel), @@ -82,7 +81,7 @@ private static VersionInfo GetVersionInfo() // It could be null when there are too many non-zero weights so that // the memory is insufficient to hold the Hessian matrix necessary for the computation // of the variance-covariance matrix. - private readonly VBuffer? _coeffStdError; + private VBuffer? _coeffStdError; public long TrainingExampleCount { get { return _trainingExampleCount; } } @@ -92,6 +91,11 @@ private static VersionInfo GetVersionInfo() public int ParametersCount { get { return _paramCount; } } + public Double[] Hessian; + + // Indices of bias and non-zero weight slots. + public int[] WeightIndices; + public LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance) { Contracts.AssertValue(env); @@ -285,6 +289,12 @@ private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stat }; } + public void SetCoeffStdError(ref VBuffer coeffStdError) + { + _env.Assert(coeffStdError.Count == _paramCount); + _coeffStdError = coeffStdError; + } + private IEnumerable GetUnorderedCoefficientStatistics(LinearBinaryPredictor parent, RoleMappedSchema schema) { Contracts.AssertValue(_env); diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs index 4de37a209e..e714756de9 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs @@ -155,7 +155,7 @@ protected override PoissonRegressionPredictor CreatePredictor() return new PoissonRegressionPredictor(Host, ref weights, bias); } - protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory factory, float loss, int numParams) + protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory factory, float loss) { // No-op by design. 
         }

diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
index fc3b458145..295c2c2387 100644
--- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
+++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
@@ -38,5 +38,20 @@ public void TestEstimatorPoissonRegression()
             TestEstimatorCore(pipe, dataView);
             Done();
         }
+
+        [Fact]
+        public void TestLogisticRegressionStats()
+        {
+            (IEstimator<ITransformer> pipe, IDataView dataView) = GetBinaryClassificationPipeline();
+
+            var trainer = new LogisticRegression(Env, "Features", "Label", advancedSettings: s => { s.ShowTrainingStats = true; });
+            pipe = pipe.Append(trainer);
+            var transformer = pipe.Fit(dataView);
+
+            using (var ch = Env.Start("Calculating STD for LR."))
+                trainer.ComputeExtendedTrainingStatistics(ch);
+
+            Done();
+        }
     }
 }

From e8fede2499358fe911dcb8629a95c533cae33d46 Mon Sep 17 00:00:00 2001
From: Senja Filipi
Date: Wed, 31 Oct 2018 11:06:03 -0700
Subject: [PATCH 03/14] Adding test checks

---
 .../LogisticRegressionTrainingStats.cs        |  2 +-
 .../Standard/ModelStatistics.cs               |  4 ++--
 .../TrainerEstimators/LbfgsTests.cs           | 11 +++++++++++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs
index 5f7bf7441f..70554ad53d 100644
--- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs
+++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs
@@ -82,7 +82,7 @@ public static void ComputeExtendedTrainingStatistics(this LogisticRegression tra
                 stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]);
 
             VBuffer<float> stdErrors = new VBuffer<float>(trainer.GetWeights.Length, trainer.GetNumSelectedParams, stdErrorValues, trainer.Stats.WeightIndices);
-            trainer.Stats.SetCoeffStdError(ref stdErrors);
+            trainer.Stats.SetCoeffStdError(stdErrors);
         }
     }
 }
diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs
index e8ab9c5d79..fd3f775e29 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs
@@ -226,7 +226,7 @@ public static bool TryGetBiasStatistics(LinearModelStatistics stats, Single bias
             stdError = stats._coeffStdError.Value.Values[0];
             Contracts.Assert(stdError == stats._coeffStdError.Value.GetItemOrDefault(0));
             zScore = bias / stdError;
-            pValue = 1 - (Single)ProbabilityFunctions.Erf(Math.Abs(zScore / sqrt2));
+            pValue = 1.0f - (Single)ProbabilityFunctions.Erf(Math.Abs(zScore / sqrt2));
             return true;
         }
 
@@ -289,7 +289,7 @@ private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stat
             };
         }
 
-        public void SetCoeffStdError(ref VBuffer<float> coeffStdError)
+        public void SetCoeffStdError(VBuffer<float> coeffStdError)
         {
             _env.Assert(coeffStdError.Count == _paramCount);
             _coeffStdError = coeffStdError;
         }
diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
index 295c2c2387..0f290fd0dd 100644
--- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
+++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
@@ -48,9 +48,20 @@ public void TestLogisticRegressionStats()
             pipe = pipe.Append(trainer);
             var transformer = pipe.Fit(dataView);
 
+            LinearModelStatistics.TryGetBiasStatistics(trainer.Stats, 2, out float stdError, out float zScore, out float pValue);
+
+            Assert.Equal(0.0f, stdError);
+            Assert.Equal(0.0f,
zScore); + Assert.Equal(0.0f, pValue); + using (var ch = Env.Start("Calcuating STD for LR.")) trainer.ComputeExtendedTrainingStatistics(ch); + LinearModelStatistics.TryGetBiasStatistics(trainer.Stats, 2, out stdError, out zScore, out pValue); + + Assert.True(stdError > 0); + Assert.True(zScore > 0); + Done(); } } From 06f97042eb6bd3dc9f05c17aa06b8c70145713ff Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 31 Oct 2018 15:11:58 -0700 Subject: [PATCH 04/14] moving the calculations of the extended training statistics in the predictor. --- .../LogisticRegressionTrainingStats.cs | 33 ++++++++------- .../LogisticRegression/LbfgsPredictorBase.cs | 31 +++----------- .../LogisticRegression/LogisticRegression.cs | 41 +++++++++---------- .../MulticlassLogisticRegression.cs | 8 ++-- .../Standard/ModelStatistics.cs | 8 ++-- .../PoissonRegression/PoissonRegression.cs | 2 +- .../TrainerEstimators/LbfgsTests.cs | 15 ++++--- 7 files changed, 59 insertions(+), 79 deletions(-) diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs index 70554ad53d..9c40d864ab 100644 --- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs +++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs @@ -16,58 +16,57 @@ namespace Microsoft.ML.Runtime.Learners public static class LogisticRegressionTrainingStats { - public static void ComputeExtendedTrainingStatistics(this LogisticRegression trainer, IChannel ch) + public static void ComputeExtendedTrainingStatistics(this LinearBinaryPredictor model, IChannel ch, float l2Weight = LogisticRegression.Arguments.Defaults.L2Weight) { Contracts.AssertValue(ch); - Contracts.AssertValue(trainer.Stats, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); - Contracts.Assert(trainer.GetL2Weight > 0); - Contracts.Assert(trainer.GetNumGoodRows > 0); + Contracts.AssertValue(model.Statistics, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.Assert(l2Weight > 0); - ch.Info("Model trained with {0} training examples.", trainer.GetNumGoodRows); + int numSelectedParams = model.Statistics.ParametersCount; // Apply Cholesky Decomposition to find the inverse of the Hessian. Double[] invHessian = null; try { // First, find the Cholesky decomposition LL' of the Hessian. - Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, trainer.GetNumSelectedParams, trainer.Stats.Hessian); + Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, model.Statistics.Hessian); // Note that hessian is already modified at this point. It is no longer the original Hessian, // but instead represents the Cholesky decomposition L. // Also note that the following routine is supposed to consume the Cholesky decomposition L instead // of the original information matrix. - Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, trainer.GetNumSelectedParams, trainer.Stats.Hessian); + Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, model.Statistics.Hessian); // At this point, hessian should contain the inverse of the original Hessian matrix. // Swap hessian with invHessian to avoid confusion in the following context. 
- Utils.Swap(ref trainer.Stats.Hessian, ref invHessian); - Contracts.Assert(trainer.Stats.Hessian == null); + Utils.Swap(ref model.Statistics.Hessian, ref invHessian); + Contracts.Assert(model.Statistics.Hessian == null); } catch (DllNotFoundException) { throw ch.ExceptNotSupp("The MKL library (MklImports.dll) or one of its dependencies is missing."); } - float[] stdErrorValues = new float[trainer.GetNumSelectedParams]; + float[] stdErrorValues = new float[numSelectedParams]; stdErrorValues[0] = (float)Math.Sqrt(invHessian[0]); - for (int i = 1; i < trainer.GetNumSelectedParams; i++) + for (int i = 1; i < numSelectedParams; i++) { // Initialize with inverse Hessian. stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i]; } - if (trainer.GetL2Weight > 0) + if (l2Weight > 0) { // Iterate through all entries of inverse Hessian to make adjustment to variance. // A discussion on ridge regularized LR coefficient covariance matrix can be found here: // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf int ioffset = 1; - for (int iRow = 1; iRow < trainer.GetNumSelectedParams; iRow++) + for (int iRow = 1; iRow < numSelectedParams; iRow++) { for (int iCol = 0; iCol <= iRow; iCol++) { var entry = (Single)invHessian[ioffset]; - var adjustment = -trainer.GetL2Weight * entry * entry; + var adjustment = -l2Weight * entry * entry; stdErrorValues[iRow] -= adjustment; if (0 < iCol && iCol < iRow) stdErrorValues[iCol] -= adjustment; @@ -78,11 +77,11 @@ public static void ComputeExtendedTrainingStatistics(this LogisticRegression tra Contracts.Assert(ioffset == invHessian.Length); } - for (int i = 1; i < trainer.GetNumSelectedParams; i++) + for (int i = 1; i < numSelectedParams; i++) stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]); - VBuffer stdErrors = new VBuffer(trainer.GetWeights.Length, trainer.GetNumSelectedParams, stdErrorValues, trainer.Stats.WeightIndices); - trainer.Stats.SetCoeffStdError(stdErrors); + VBuffer stdErrors = new VBuffer(model.Weights2.Count, numSelectedParams, stdErrorValues, model.Statistics.WeightIndices); + model.Statistics.SetCoeffStdError(stdErrors); } } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs index b93c3a86c0..766cf06152 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs @@ -136,7 +136,6 @@ public static class Defaults private VBuffer[] _features; private float[] _labels; private float[] _weights; - protected int NumParams; // Stores the bounds of the chunk to be used by each thread. The 0th slot is 0. The length // is one more than the number of threads to use. @@ -147,26 +146,6 @@ public static class Defaults private VBuffer[] _localGradients; private float[] _localLosses; - /// - /// Gets the number of useful training rows. - /// - public long GetNumGoodRows => NumGoodRows; - - /// - /// Gets the L2weight - /// - public float GetL2Weight => L2Weight; - - /// - /// Gets the number of parameters selected - /// - public int GetNumSelectedParams => NumParams; - - /// - /// Gets the training weights. - /// - public VBuffer GetWeights => CurrentWeights; - // REVIEW: It's pointless to request caching when we're going to load everything into // memory, that is, when using multiple threads. 
So should caching not be requested? private static readonly TrainerInfo _info = new TrainerInfo(caching: true, supportIncrementalTrain: true); @@ -574,16 +553,16 @@ protected virtual void TrainCore(IChannel ch, RoleMappedData data) ch.Assert(CurrentWeights.Length == BiasCount + WeightCount); - NumParams = BiasCount; + int numParams = BiasCount; if ((L1Weight > 0 && !Quiet) || ShowTrainingStats) { - VBufferUtils.ForEachDefined(ref CurrentWeights, (index, value) => { if (index >= BiasCount && value != 0) NumParams++; }); + VBufferUtils.ForEachDefined(ref CurrentWeights, (index, value) => { if (index >= BiasCount && value != 0) numParams++; }); if (L1Weight > 0 && !Quiet) - ch.Info("L1 regularization selected {0} of {1} weights.", NumParams, BiasCount + WeightCount); + ch.Info("L1 regularization selected {0} of {1} weights.", numParams, BiasCount + WeightCount); } if (ShowTrainingStats) - ComputeTrainingStatistics(ch, cursorFactory, loss); + ComputeTrainingStatistics(ch, cursorFactory, loss, numParams); } // Ensure that the bias portion of vec is represented in vec. @@ -599,7 +578,7 @@ protected void EnsureBiases(ref VBuffer vec) protected abstract float AccumulateOneGradient(ref VBuffer feat, float label, float weight, ref VBuffer xDense, ref VBuffer grad, ref float[] scratch); - protected abstract void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss); + protected abstract void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams); protected abstract void ProcessPriorDistribution(float label, float weight); /// diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 1394a7f3f5..370810eb3c 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -15,7 +15,6 @@ using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.Numeric; using Microsoft.ML.Runtime.Training; -//using Microsoft.ML.Trainers.HalLearners; [assembly: LoadableClass(LogisticRegression.Summary, typeof(LogisticRegression), typeof(LogisticRegression.Arguments), new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer), typeof(SignatureFeatureScorerTrainer) }, @@ -31,7 +30,7 @@ namespace Microsoft.ML.Runtime.Learners /// /// - public partial class LogisticRegression : LbfgsTrainerBase, ParameterMixingCalibratedPredictor> + public sealed partial class LogisticRegression : LbfgsTrainerBase, ParameterMixingCalibratedPredictor> { public const string LoadNameValue = "LogisticRegression"; internal const string UserNameValue = "Logistic Regression"; @@ -45,8 +44,8 @@ public sealed class Arguments : ArgumentsBase public bool ShowTrainingStats = false; } - private Double _posWeight; - public LinearModelStatistics Stats; + private double _posWeight; + private LinearModelStatistics _stats; /// /// Initializes a new instance of @@ -84,7 +83,7 @@ public LogisticRegression(IHostEnvironment env, /// /// Initializes a new instance of /// - public LogisticRegression(IHostEnvironment env, Arguments args) + internal LogisticRegression(IHostEnvironment env, Arguments args) : base(env, args, TrainerUtils.MakeBoolScalarLabel(args.LabelColumn)) { _posWeight = 0; @@ -141,7 +140,7 @@ protected override float AccumulateOneGradient(ref VBuffer feat, float la return weight * 
datumLoss; } - protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss) + protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams) { Contracts.AssertValue(ch); Contracts.AssertValue(cursorFactory); @@ -149,7 +148,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. Contracts.Assert(WeightSum > 0); Contracts.Assert(BiasCount == 1); Contracts.Assert(loss >= 0); - Contracts.Assert(NumParams >= BiasCount); + Contracts.Assert(numParams >= BiasCount); Contracts.Assert(CurrentWeights.IsDense); ch.Info("Model trained with {0} training examples.", NumGoodRows); @@ -174,7 +173,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. deviance -= (float)regLoss * L1Weight * 2; } - ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - NumParams, 0)); + ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0)); // Compute null deviance, i.e., the deviance of null hypothesis. // Cap the prior positive rate at 1e-15. @@ -185,7 +184,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. ch.Info("Null Deviance: \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1); // Compute AIC. - ch.Info("AIC: \t{0}", 2 * NumParams + deviance); + ch.Info("AIC: \t{0}", 2 * numParams + deviance); // Show the coefficients statistics table. var featureColIdx = cursorFactory.Data.Schema.Feature.Index; @@ -203,13 +202,13 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. int[] weightIndices = null; // Whether all weights are non-zero. - bool denseWeight = NumParams == CurrentWeights.Length; + bool denseWeight = numParams == CurrentWeights.Length; // Extract non-zero indices of weight. if (!denseWeight) { - weightIndices = new int[NumParams]; - weightIndicesInvMap = new Dictionary(NumParams); + weightIndices = new int[numParams]; + weightIndicesInvMap = new Dictionary(numParams); weightIndices[0] = 0; weightIndicesInvMap[0] = 0; int j = 1; @@ -222,17 +221,17 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. } } - Contracts.Assert(j == NumParams); + Contracts.Assert(j == numParams); } // Compute the standard error of coefficients. - long hessianDimension = (long)NumParams * (NumParams + 1) / 2; + long hessianDimension = (long)numParams * (numParams + 1) / 2; if (hessianDimension > int.MaxValue) { ch.Warning("The number of parameter is too large. Cannot hold the variance-covariance matrix in memory. " + "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer" + "to reduce the number of parameters."); - Stats = new LinearModelStatistics(Host, NumGoodRows, NumParams, deviance, nullDeviance); + _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); return; } @@ -252,7 +251,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. // i is the array index of the diagonal entry at iRow-th row and iRow-th column. // iRow is one-based. int i = 0; - for (int iRow = 2; iRow <= NumParams; iRow++) + for (int iRow = 2; iRow <= numParams; iRow++) { i += iRow; hessian[i] = L2Weight; @@ -282,7 +281,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. 
int ioff = 1; // Increment remaining entries of hessian. - for (int i = 1; i < NumParams; i++) + for (int i = 1; i < numParams; i++) { ch.Assert(ioff == i * (i + 1) / 2); int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1; @@ -330,9 +329,9 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. } } } - Stats = new LinearModelStatistics(Host, NumGoodRows, NumParams, deviance, nullDeviance); - Stats.Hessian = hessian; - Stats.WeightIndices = weightIndices; + _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); + _stats.Hessian = hessian; + _stats.WeightIndices = weightIndices; } protected override void ProcessPriorDistribution(float label, float weight) @@ -375,7 +374,7 @@ protected override ParameterMixingCalibratedPredictor CreatePredictor() CurrentWeights.GetItemOrDefault(0, ref bias); CurrentWeights.CopyTo(ref weights, 1, CurrentWeights.Length - 1); return new ParameterMixingCalibratedPredictor(Host, - new LinearBinaryPredictor(Host, ref weights, bias, Stats), + new LinearBinaryPredictor(Host, ref weights, bias, _stats), new PlattCalibrator(Host, -1, 0)); } diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs index 617bd717f0..70e896feef 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs @@ -251,7 +251,7 @@ protected override MulticlassLogisticRegressionPredictor CreatePredictor() return new MulticlassLogisticRegressionPredictor(Host, ref CurrentWeights, _numClasses, NumFeatures, _labelNames, _stats); } - protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss) + protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams) { Contracts.AssertValue(ch); Contracts.AssertValue(cursorFactory); @@ -259,7 +259,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. Contracts.Assert(WeightSum > 0); Contracts.Assert(BiasCount == _numClasses); Contracts.Assert(loss >= 0); - Contracts.Assert(NumParams >= BiasCount); + Contracts.Assert(numParams >= BiasCount); Contracts.Assert(CurrentWeights.IsDense); ch.Info("Model trained with {0} training examples.", NumGoodRows); @@ -299,10 +299,10 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. ch.Info("Null Deviance: \t{0}", nullDeviance); // Compute AIC. - ch.Info("AIC: \t{0}", 2 * NumParams + deviance); + ch.Info("AIC: \t{0}", 2 * numParams + deviance); // REVIEW: Figure out how to compute the statistics for the coefficients. - _stats = new LinearModelStatistics(Host, NumGoodRows, NumParams, deviance, nullDeviance); + _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); } protected override void ProcessPriorDistribution(float label, float weight) diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index fd3f775e29..b9ccbe47cc 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -83,13 +83,13 @@ private static VersionInfo GetVersionInfo() // of the variance-covariance matrix. 
private VBuffer? _coeffStdError; - public long TrainingExampleCount { get { return _trainingExampleCount; } } + public long TrainingExampleCount => _trainingExampleCount; - public Single Deviance { get { return _deviance; } } + public Single Deviance => _deviance; - public Single NullDeviance { get { return _nullDeviance; } } + public Single NullDeviance => _nullDeviance; - public int ParametersCount { get { return _paramCount; } } + public int ParametersCount => _paramCount; public Double[] Hessian; diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs index e714756de9..4de37a209e 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs @@ -155,7 +155,7 @@ protected override PoissonRegressionPredictor CreatePredictor() return new PoissonRegressionPredictor(Host, ref weights, bias); } - protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory factory, float loss) + protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory factory, float loss, int numParams) { // No-op by design. } diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs index 0f290fd0dd..7f8c1e6f59 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs @@ -4,6 +4,7 @@ using Microsoft.ML.Core.Data; using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Calibration; using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Trainers; using Xunit; @@ -44,20 +45,22 @@ public void TestLogisticRegressionStats() { (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline(); - var trainer = new LogisticRegression(Env, "Features", "Label", advancedSettings: s=> { s.ShowTrainingStats = true; }); - pipe = pipe.Append(trainer); - var transformer = pipe.Fit(dataView); + pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label", advancedSettings: s => { s.ShowTrainingStats = true; })); + var transformerChain = pipe.Fit(dataView) as TransformerChain>; - LinearModelStatistics.TryGetBiasStatistics(trainer.Stats, 2, out float stdError, out float zScore, out float pValue); + var linearModel = transformerChain.LastTransformer.Model.SubPredictor as LinearBinaryPredictor; + var stats = linearModel.Statistics; + + LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue); Assert.Equal(0.0f, stdError); Assert.Equal(0.0f, zScore); Assert.Equal(0.0f, pValue); using (var ch = Env.Start("Calcuating STD for LR.")) - trainer.ComputeExtendedTrainingStatistics(ch); + linearModel.ComputeExtendedTrainingStatistics(ch); - LinearModelStatistics.TryGetBiasStatistics(trainer.Stats, 2, out stdError, out zScore, out pValue); + LinearModelStatistics.TryGetBiasStatistics(stats, 2, out stdError, out zScore, out pValue); Assert.True(stdError > 0); Assert.True(zScore > 0); From 3831f5d7feeccecf7348b680ef301b33cad73eb8 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 7 Nov 2018 00:32:36 -0800 Subject: [PATCH 05/14] Creating two separate methods to compute the matrix of standartDeviations, one the old MKl way in the HAL Learners package, and the other making use of Math.Numerics --- build/Dependencies.props | 1 + 
.../LogisticRegressionTrainingStats.cs | 21 ++--
 .../Microsoft.ML.StandardLearners.csproj | 6 +-
 .../Standard/ModelStatistics.cs | 98 ++++++++++++++++++-
 .../TrainerEstimators/LbfgsTests.cs | 32 +++++-
 5 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/build/Dependencies.props b/build/Dependencies.props
index 7a79b3a087..3f053444db 100644
--- a/build/Dependencies.props
+++ b/build/Dependencies.props
@@ -22,6 +22,7 @@
 4.5.0
 4.5.0
 1.10.0
+ 4.6.0

diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs
index 9c40d864ab..adf823c42c 100644
--- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs
+++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs
@@ -11,12 +11,17 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using Mkl = OlsLinearRegressionTrainer.Mkl;

-    ///
-    ///
     public static class LogisticRegressionTrainingStats
     {
-
-        public static void ComputeExtendedTrainingStatistics(this LinearBinaryPredictor model, IChannel ch, float l2Weight = LogisticRegression.Arguments.Defaults.L2Weight)
+        /// <summary>
+        /// Computes the standard deviation matrix of each of the non-zero training weights, needed to further calculate the standard deviation,
+        /// p-value and z-Score.
+        /// This function performs the same calculations as the managed LinearModelStatistics.ComputeStd, but faster, because it makes use of Intel's MKL.
+        /// </summary>
+        /// <param name="model">A LinearBinaryPredictor obtained as a result of training with LogisticRegression.</param>
+        /// <param name="ch">The IChannel used for messaging.</param>
+        /// <param name="l2Weight">The L2Weight used for training. (Supply the same value that was used during training.)</param>
+        public static void ComputeStd(LinearBinaryPredictor model, IChannel ch, float l2Weight = LogisticRegression.Arguments.Defaults.L2Weight)
         {
             Contracts.AssertValue(ch);
             Contracts.AssertValue(model.Statistics, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true.");
             Contracts.Assert(l2Weight > 0);
@@ -58,8 +63,8 @@ public static void ComputeStd(LinearBinaryPredictor model, IChannel ch, float l2
         {
             // Iterate through all entries of inverse Hessian to make adjustment to variance.
// A discussion on ridge regularized LR coefficient covariance matrix can be found here: - // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ - // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf + // http://www.aloki.hu/pdf/0402_171179.pdf (Equations 11 and 25) + // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf (Section "Significance testing in ridge logistic regression") int ioffset = 1; for (int iRow = 1; iRow < numSelectedParams; iRow++) { @@ -80,7 +85,9 @@ public static void ComputeExtendedTrainingStatistics(this LinearBinaryPredictor for (int i = 1; i < numSelectedParams; i++) stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]); - VBuffer stdErrors = new VBuffer(model.Weights2.Count, numSelectedParams, stdErrorValues, model.Statistics.WeightIndices); + // currentWeights vector size is Weights2 + the bias + var currentWeightsCount = model.Weights2.Count + 1; + VBuffer stdErrors = new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues, model.Statistics.WeightIndices); model.Statistics.SetCoeffStdError(stdErrors); } } diff --git a/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj b/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj index 8bd76945aa..9eb8c8cc58 100644 --- a/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj +++ b/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj @@ -1,4 +1,4 @@ - + netstandard2.0 @@ -6,6 +6,10 @@ true + + + + diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index 0a128881dc..ff29b6375a 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using MathNet.Numerics.LinearAlgebra; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Internal.CpuMath; @@ -96,7 +97,7 @@ private static VersionInfo GetVersionInfo() // Indices of bias and non-zero weight slots. 
public int[] WeightIndices; - public LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance) + internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance) { Contracts.AssertValue(env); env.Assert(trainingExampleCount > 0); @@ -115,7 +116,7 @@ internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, _coeffStdError = coeffStdError; } - public LinearModelStatistics(IHostEnvironment env, ModelLoadContext ctx) + internal LinearModelStatistics(IHostEnvironment env, ModelLoadContext ctx) { Contracts.CheckValue(env, nameof(env)); _env = env; @@ -161,7 +162,7 @@ public LinearModelStatistics(IHostEnvironment env, ModelLoadContext ctx) _coeffStdError = new VBuffer(length, _paramCount, stdErrorValues, stdErrorIndices); } - public static LinearModelStatistics Create(IHostEnvironment env, ModelLoadContext ctx) + internal static LinearModelStatistics Create(IHostEnvironment env, ModelLoadContext ctx) { Contracts.CheckValue(env, nameof(env)); env.CheckValue(ctx, nameof(ctx)); @@ -212,6 +213,10 @@ private void SaveCore(ModelSaveContext ctx) ctx.Writer.WriteIntsNoCount(_coeffStdError.Value.Indices, _paramCount); } + /// + /// Computes the standart deviation, Z-Score and p-Value. + /// Should be called after . + /// public static bool TryGetBiasStatistics(LinearModelStatistics stats, Single bias, out Single stdError, out Single zScore, out Single pValue) { if (!stats._coeffStdError.HasValue) @@ -230,6 +235,93 @@ public static bool TryGetBiasStatistics(LinearModelStatistics stats, Single bias return true; } + /// + /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, + /// p-value and z-Score. + /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. + /// + /// A obtained as a result of training with . + /// The used for messaging. + /// The L2Weight used for training. (Supply the same one that got used during training.) + public static void ComputeStd(LinearBinaryPredictor model, IChannel ch, float l2Weight = LogisticRegression.Arguments.Defaults.L2Weight) + { + Contracts.AssertValue(ch); + Contracts.AssertValue(model.Statistics, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.Assert(l2Weight > 0); + + int numSelectedParams = model.Statistics.ParametersCount; + + double[] hessian = model.Statistics.Hessian; + double[,] matrixHessian = new double[numSelectedParams, numSelectedParams]; + + int hessianLength = 0; + int dimention = numSelectedParams - 1; + + for (int row = dimention; row >= 0; row--) + { + for (int col = 0; col <= dimention; col++) + { + if ((row + col) <= dimention) + { + if ((row + col) == dimention) + { + matrixHessian[row, col] = hessian[hessianLength]; + } + else + { + matrixHessian[row, col] = hessian[hessianLength]; + matrixHessian[dimention - col, dimention - row] = hessian[hessianLength]; + } + hessianLength++; + } + else + continue; + } + } + + var h = Matrix.Build.DenseOfArray(matrixHessian); + var invers = h.Inverse(); + + float[] stdErrorValues2 = new float[numSelectedParams]; + stdErrorValues2[0] = (float)Math.Sqrt(invers[0, numSelectedParams - 1]); + + for (int i = 1; i < numSelectedParams; i++) + { + // Initialize with inverse Hessian. 
+ // The diagonal of the inverse Hessian. + stdErrorValues2[i] = (Single)invers[i, numSelectedParams - i - 1]; + } + + if (l2Weight > 0) + { + // Iterate through all entries of inverse Hessian to make adjustment to variance. + // A discussion on ridge regularized LR coefficient covariance matrix can be found here: + // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ + // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf + int ioffset = 1; + for (int iRow = 1; iRow < numSelectedParams; iRow++) + { + for (int iCol = 0; iCol <= iRow; iCol++) + { + float entry = (float)invers[iRow, numSelectedParams - iCol - 1]; + var adjustment = -l2Weight * entry * entry; + stdErrorValues2[iRow] -= adjustment; + + if (0 < iCol && iCol < iRow) + stdErrorValues2[iCol] -= adjustment; + ioffset++; + } + } + } + + for (int i = 1; i < numSelectedParams; i++) + stdErrorValues2[i] = (float)Math.Sqrt(stdErrorValues2[i]); + + var currentWeightsCount = model.Weights2.Count + 1; // adding one for the bias + VBuffer stdErrors = new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues2, model.Statistics.WeightIndices); + model.Statistics.SetCoeffStdError(stdErrors); + } + private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stats, in VBuffer weights, in VBuffer> names, ref VBuffer estimate, ref VBuffer stdErr, ref VBuffer zScore, ref VBuffer pValue, out ValueGetter>> getSlotNames) { diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs index 7f8c1e6f59..15949e1c9f 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs @@ -50,7 +50,6 @@ public void TestLogisticRegressionStats() var linearModel = transformerChain.LastTransformer.Model.SubPredictor as LinearBinaryPredictor; var stats = linearModel.Statistics; - LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue); Assert.Equal(0.0f, stdError); @@ -58,14 +57,37 @@ public void TestLogisticRegressionStats() Assert.Equal(0.0f, pValue); using (var ch = Env.Start("Calcuating STD for LR.")) - linearModel.ComputeExtendedTrainingStatistics(ch); + LinearModelStatistics.ComputeStd(linearModel, ch); LinearModelStatistics.TryGetBiasStatistics(stats, 2, out stdError, out zScore, out pValue); - Assert.True(stdError > 0); - Assert.True(zScore > 0); + Assert.True(stdError == 0.250672936f); + Assert.True(zScore == 7.97852373f); + } - Done(); + [Fact] + public void TestLogisticRegressionStats_MKL() + { + (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline(); + + pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label", advancedSettings: s => { s.ShowTrainingStats = true; })); + var transformerChain = pipe.Fit(dataView) as TransformerChain>; + + var linearModel = transformerChain.LastTransformer.Model.SubPredictor as LinearBinaryPredictor; + var stats = linearModel.Statistics; + LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue); + + Assert.Equal(0.0f, stdError); + Assert.Equal(0.0f, zScore); + Assert.Equal(0.0f, pValue); + + using (var ch = Env.Start("Calcuating STD for LR.")) + LogisticRegressionTrainingStats.ComputeStd(linearModel, ch); + + LinearModelStatistics.TryGetBiasStatistics(stats, 2, out stdError, out zScore, out pValue); + + Assert.True(stdError == 0.250672936f); + Assert.True(zScore == 7.97852373f); } } } From 
c638cbd23c88b5f228017b55b6aa60265201b62e Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 7 Nov 2018 15:48:01 -0800 Subject: [PATCH 06/14] refactoring the std computations in an interface --- .../LogisticRegressionTrainingStats.cs | 31 ++--- .../LogisticRegression/LogisticRegression.cs | 116 +++++++++++++++++- .../Standard/ModelStatistics.cs | 101 +-------------- .../TrainerEstimators/LbfgsTests.cs | 24 +--- .../InstanceInitializerAnalyzer.cs | 2 +- 5 files changed, 136 insertions(+), 138 deletions(-) diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs index adf823c42c..21a48dc746 100644 --- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs +++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs @@ -11,39 +11,42 @@ namespace Microsoft.ML.Runtime.Learners { using Mkl = OlsLinearRegressionTrainer.Mkl; - public static class LogisticRegressionTrainingStats + public sealed class ComputeLRTrainingStdThroughHal : IComputeLRTrainingStd { /// /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, - /// p-value and z-Score. - /// This function performs the same calculations as but it is faster than it, because it makes use of Intel's MKL. + /// p-value and z-Score, making use of Intel's MKL for the matrix operations. /// - /// A obtained as a result of training with . + /// + /// + /// + /// /// The used for messaging. /// The L2Weight used for training. (Supply the same one that got used during training.) - public static void ComputeStd(LinearBinaryPredictor model, IChannel ch, float l2Weight = LogisticRegression.Arguments.Defaults.L2Weight) + public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) { Contracts.AssertValue(ch); - Contracts.AssertValue(model.Statistics, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.AssertValue(hessian, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.AssertNonEmpty(weightIndices); + Contracts.Assert(numSelectedParams > 0); + Contracts.Assert(currentWeightsCount > 0); Contracts.Assert(l2Weight > 0); - int numSelectedParams = model.Statistics.ParametersCount; - // Apply Cholesky Decomposition to find the inverse of the Hessian. Double[] invHessian = null; try { // First, find the Cholesky decomposition LL' of the Hessian. - Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, model.Statistics.Hessian); + Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian); // Note that hessian is already modified at this point. It is no longer the original Hessian, // but instead represents the Cholesky decomposition L. // Also note that the following routine is supposed to consume the Cholesky decomposition L instead // of the original information matrix. - Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, model.Statistics.Hessian); + Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian); // At this point, hessian should contain the inverse of the original Hessian matrix. // Swap hessian with invHessian to avoid confusion in the following context. 
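// Illustrative aside (editor's sketch, not part of this patch): one MKL-free
// way to perform the same inversion, using MathNet.Numerics (the dependency
// patch 05 adds), is to unpack the packed lower triangle into a dense
// symmetric matrix and invert it through its Cholesky factorization; names
// below are illustrative, with n standing for numSelectedParams:
//
//     var h = Matrix<double>.Build.Dense(n, n);
//     for (int i = 0, k = 0; i < n; i++)
//         for (int j = 0; j <= i; j++, k++)
//             h[i, j] = h[j, i] = hessian[k];  // mirror the packed lower triangle
//     var inverse = h.Cholesky().Solve(Matrix<double>.Build.DenseIdentity(n));
//
// The diagonal of `inverse` then holds the unadjusted coefficient variances.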
- Utils.Swap(ref model.Statistics.Hessian, ref invHessian); - Contracts.Assert(model.Statistics.Hessian == null); + Utils.Swap(ref hessian, ref invHessian); + Contracts.Assert(hessian == null); } catch (DllNotFoundException) { @@ -86,9 +89,7 @@ public static void ComputeStd(LinearBinaryPredictor model, IChannel ch, float l2 stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]); // currentWeights vector size is Weights2 + the bias - var currentWeightsCount = model.Weights2.Count + 1; - VBuffer stdErrors = new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues, model.Statistics.WeightIndices); - model.Statistics.SetCoeffStdError(stdErrors); + return new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues, weightIndices); } } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index a6476fd04b..8318b6ae5f 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using MathNet.Numerics.LinearAlgebra; using Microsoft.ML.Core.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.CommandLine; @@ -42,6 +43,8 @@ public sealed class Arguments : ArgumentsBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Show statistics of training examples.", ShortName = "stat", SortOrder = 50)] public bool ShowTrainingStats = false; + + public IComputeLRTrainingStd StdComputer; } private double _posWeight; @@ -78,6 +81,9 @@ public LogisticRegression(IHostEnvironment env, _posWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; + + if (Args.StdComputer == null) + Args.StdComputer = new ComputeLRTrainingStd(); } /// @@ -88,6 +94,9 @@ internal LogisticRegression(IHostEnvironment env, Arguments args) { _posWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; + + if (Args.StdComputer == null) + Args.StdComputer = new ComputeLRTrainingStd(); } public override PredictionKind PredictionKind => PredictionKind.BinaryClassification; @@ -329,9 +338,14 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. } } } - _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); - _stats.Hessian = hessian; - _stats.WeightIndices = weightIndices; + + if (Args.StdComputer == null) + _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); + else + { + var std = Args.StdComputer.ComputeStd(hessian, weightIndices, numParams, CurrentWeights.Count, ch, L2Weight); + _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, std); + } } protected override void ProcessPriorDistribution(float label, float weight) @@ -398,4 +412,100 @@ public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironm () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); } } + + public interface IComputeLRTrainingStd + { + VBuffer ComputeStd(double[] hessian, int[] weightIndices, int parametersCount, int currentWeightsCount, IChannel ch, float l2Weight); + } + + public sealed class ComputeLRTrainingStd: IComputeLRTrainingStd + { + /// + /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, + /// p-value and z-Score. 
+ /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. + /// + /// + /// + /// + /// + /// The used for messaging. + /// The L2Weight used for training. (Supply the same one that got used during training.) + public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) + { + Contracts.AssertValue(ch); + Contracts.AssertValue(hessian, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.AssertNonEmpty(weightIndices); + Contracts.Assert(numSelectedParams > 0); + Contracts.Assert(currentWeightsCount > 0); + Contracts.Assert(l2Weight > 0); + + double[,] matrixHessian = new double[numSelectedParams, numSelectedParams]; + + int hessianLength = 0; + int dimention = numSelectedParams - 1; + + for (int row = dimention; row >= 0; row--) + { + for (int col = 0; col <= dimention; col++) + { + if ((row + col) <= dimention) + { + if ((row + col) == dimention) + { + matrixHessian[row, col] = hessian[hessianLength]; + } + else + { + matrixHessian[row, col] = hessian[hessianLength]; + matrixHessian[dimention - col, dimention - row] = hessian[hessianLength]; + } + hessianLength++; + } + else + continue; + } + } + + var h = Matrix.Build.DenseOfArray(matrixHessian); + var invers = h.Inverse(); + + float[] stdErrorValues2 = new float[numSelectedParams]; + stdErrorValues2[0] = (float)Math.Sqrt(invers[0, numSelectedParams - 1]); + + for (int i = 1; i < numSelectedParams; i++) + { + // Initialize with inverse Hessian. + // The diagonal of the inverse Hessian. + stdErrorValues2[i] = (Single)invers[i, numSelectedParams - i - 1]; + } + + if (l2Weight > 0) + { + // Iterate through all entries of inverse Hessian to make adjustment to variance. + // A discussion on ridge regularized LR coefficient covariance matrix can be found here: + // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ + // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf + int ioffset = 1; + for (int iRow = 1; iRow < numSelectedParams; iRow++) + { + for (int iCol = 0; iCol <= iRow; iCol++) + { + float entry = (float)invers[iRow, numSelectedParams - iCol - 1]; + var adjustment = -l2Weight * entry * entry; + stdErrorValues2[iRow] -= adjustment; + + if (0 < iCol && iCol < iRow) + stdErrorValues2[iCol] -= adjustment; + ioffset++; + } + } + } + + for (int i = 1; i < numSelectedParams; i++) + stdErrorValues2[i] = (float)Math.Sqrt(stdErrorValues2[i]); + + return new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues2, weightIndices); + } + } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index ff29b6375a..90158e5fdc 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -82,7 +82,7 @@ private static VersionInfo GetVersionInfo() // It could be null when there are too many non-zero weights so that // the memory is insufficient to hold the Hessian matrix necessary for the computation // of the variance-covariance matrix. - private VBuffer? _coeffStdError; + private readonly VBuffer? 
_coeffStdError; public long TrainingExampleCount => _trainingExampleCount; @@ -92,11 +92,6 @@ private static VersionInfo GetVersionInfo() public int ParametersCount => _paramCount; - public Double[] Hessian; - - // Indices of bias and non-zero weight slots. - public int[] WeightIndices; - internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance) { Contracts.AssertValue(env); @@ -215,7 +210,6 @@ private void SaveCore(ModelSaveContext ctx) /// /// Computes the standart deviation, Z-Score and p-Value. - /// Should be called after . /// public static bool TryGetBiasStatistics(LinearModelStatistics stats, Single bias, out Single stdError, out Single zScore, out Single pValue) { @@ -235,93 +229,6 @@ public static bool TryGetBiasStatistics(LinearModelStatistics stats, Single bias return true; } - /// - /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, - /// p-value and z-Score. - /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. - /// - /// A obtained as a result of training with . - /// The used for messaging. - /// The L2Weight used for training. (Supply the same one that got used during training.) - public static void ComputeStd(LinearBinaryPredictor model, IChannel ch, float l2Weight = LogisticRegression.Arguments.Defaults.L2Weight) - { - Contracts.AssertValue(ch); - Contracts.AssertValue(model.Statistics, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); - Contracts.Assert(l2Weight > 0); - - int numSelectedParams = model.Statistics.ParametersCount; - - double[] hessian = model.Statistics.Hessian; - double[,] matrixHessian = new double[numSelectedParams, numSelectedParams]; - - int hessianLength = 0; - int dimention = numSelectedParams - 1; - - for (int row = dimention; row >= 0; row--) - { - for (int col = 0; col <= dimention; col++) - { - if ((row + col) <= dimention) - { - if ((row + col) == dimention) - { - matrixHessian[row, col] = hessian[hessianLength]; - } - else - { - matrixHessian[row, col] = hessian[hessianLength]; - matrixHessian[dimention - col, dimention - row] = hessian[hessianLength]; - } - hessianLength++; - } - else - continue; - } - } - - var h = Matrix.Build.DenseOfArray(matrixHessian); - var invers = h.Inverse(); - - float[] stdErrorValues2 = new float[numSelectedParams]; - stdErrorValues2[0] = (float)Math.Sqrt(invers[0, numSelectedParams - 1]); - - for (int i = 1; i < numSelectedParams; i++) - { - // Initialize with inverse Hessian. - // The diagonal of the inverse Hessian. - stdErrorValues2[i] = (Single)invers[i, numSelectedParams - i - 1]; - } - - if (l2Weight > 0) - { - // Iterate through all entries of inverse Hessian to make adjustment to variance. 
- // A discussion on ridge regularized LR coefficient covariance matrix can be found here: - // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ - // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf - int ioffset = 1; - for (int iRow = 1; iRow < numSelectedParams; iRow++) - { - for (int iCol = 0; iCol <= iRow; iCol++) - { - float entry = (float)invers[iRow, numSelectedParams - iCol - 1]; - var adjustment = -l2Weight * entry * entry; - stdErrorValues2[iRow] -= adjustment; - - if (0 < iCol && iCol < iRow) - stdErrorValues2[iCol] -= adjustment; - ioffset++; - } - } - } - - for (int i = 1; i < numSelectedParams; i++) - stdErrorValues2[i] = (float)Math.Sqrt(stdErrorValues2[i]); - - var currentWeightsCount = model.Weights2.Count + 1; // adding one for the bias - VBuffer stdErrors = new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues2, model.Statistics.WeightIndices); - model.Statistics.SetCoeffStdError(stdErrors); - } - private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stats, in VBuffer weights, in VBuffer> names, ref VBuffer estimate, ref VBuffer stdErr, ref VBuffer zScore, ref VBuffer pValue, out ValueGetter>> getSlotNames) { @@ -381,12 +288,6 @@ private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stat }; } - public void SetCoeffStdError(VBuffer coeffStdError) - { - _env.Assert(coeffStdError.Count == _paramCount); - _coeffStdError = coeffStdError; - } - private IEnumerable GetUnorderedCoefficientStatistics(LinearBinaryPredictor parent, RoleMappedSchema schema) { Contracts.AssertValue(_env); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs index 15949e1c9f..8587cd6961 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs @@ -52,15 +52,6 @@ public void TestLogisticRegressionStats() var stats = linearModel.Statistics; LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue); - Assert.Equal(0.0f, stdError); - Assert.Equal(0.0f, zScore); - Assert.Equal(0.0f, pValue); - - using (var ch = Env.Start("Calcuating STD for LR.")) - LinearModelStatistics.ComputeStd(linearModel, ch); - - LinearModelStatistics.TryGetBiasStatistics(stats, 2, out stdError, out zScore, out pValue); - Assert.True(stdError == 0.250672936f); Assert.True(zScore == 7.97852373f); } @@ -70,22 +61,17 @@ public void TestLogisticRegressionStats_MKL() { (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline(); - pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label", advancedSettings: s => { s.ShowTrainingStats = true; })); + pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label", advancedSettings: s => { + s.ShowTrainingStats = true; + s.StdComputer = new ComputeLRTrainingStdThroughHal(); + })); + var transformerChain = pipe.Fit(dataView) as TransformerChain>; var linearModel = transformerChain.LastTransformer.Model.SubPredictor as LinearBinaryPredictor; var stats = linearModel.Statistics; LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue); - Assert.Equal(0.0f, stdError); - Assert.Equal(0.0f, zScore); - Assert.Equal(0.0f, pValue); - - using (var ch = Env.Start("Calcuating STD for LR.")) - LogisticRegressionTrainingStats.ComputeStd(linearModel, ch); - - LinearModelStatistics.TryGetBiasStatistics(stats, 2, out stdError, out zScore, out pValue); 
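// Editor's note (not part of this patch): the statistics checked in this test
// follow from the coefficient standard errors via the formulas in
// ModelStatistics.cs:
//
//     zScore = bias / stdError;
//     pValue = 1 - Erf(Math.Abs(zScore) / Math.Sqrt(2));  // two-sided normal p-value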
- Assert.True(stdError == 0.250672936f); Assert.True(zScore == 7.97852373f); } diff --git a/tools-local/Microsoft.ML.InternalCodeAnalyzer/InstanceInitializerAnalyzer.cs b/tools-local/Microsoft.ML.InternalCodeAnalyzer/InstanceInitializerAnalyzer.cs index c7ee67537a..c4bd2fe38c 100644 --- a/tools-local/Microsoft.ML.InternalCodeAnalyzer/InstanceInitializerAnalyzer.cs +++ b/tools-local/Microsoft.ML.InternalCodeAnalyzer/InstanceInitializerAnalyzer.cs @@ -18,7 +18,7 @@ public sealed class InstanceInitializerAnalyzer : DiagnosticAnalyzer internal const string DiagnosticId = "MSML_NoInstanceInitializers"; private const string Title = "No initializers on instance fields or properties"; - private const string Format = "Member {0} has a {1} initialier outside the constructor"; + private const string Format = "Member {0} has a {1} initializer outside the constructor"; private static DiagnosticDescriptor Rule = new DiagnosticDescriptor(DiagnosticId, Title, Format, Category, From dd9524edf1bfdae3998fc51370ecbdaec9a4ca8e Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 8 Nov 2018 10:34:41 -0800 Subject: [PATCH 07/14] fix visibility and xml comments --- .../LogisticRegressionTrainingStats.cs | 4 ++- .../LogisticRegression/LbfgsPredictorBase.cs | 16 +++++------ .../LogisticRegression/LogisticRegression.cs | 28 +++++++++++++++++++ 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs index 21a48dc746..071296c112 100644 --- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs +++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs @@ -15,7 +15,9 @@ public sealed class ComputeLRTrainingStdThroughHal : IComputeLRTrainingStd { /// /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, - /// p-value and z-Score, making use of Intel's MKL for the matrix operations. + /// p-value and z-Score. + /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. + /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. 
/// 
 /// 
 /// 
 /// The used for messaging. 
 /// The L2Weight used for training. (Supply the same one that got used during training.) 
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
index 6b3f3fa734..f17e617e29 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
@@ -92,14 +92,14 @@ public abstract class ArgumentsBase : LearnerInputBaseWithWeight
         [Argument(ArgumentType.AtMostOnce, HelpText = "Enforce non-negative weights", ShortName = "nn", SortOrder = 90)]
         public bool EnforceNonNegativity = Defaults.EnforceNonNegativity;

-        public static class Defaults
+        internal static class Defaults
         {
-            public const float L2Weight = 1;
-            public const float L1Weight = 1;
-            public const float OptTol = 1e-7f;
-            public const int MemorySize = 20;
-            public const int MaxIterations = int.MaxValue;
-            public const bool EnforceNonNegativity = false;
+            internal const float L2Weight = 1;
+            internal const float L1Weight = 1;
+            internal const float OptTol = 1e-7f;
+            internal const int MemorySize = 20;
+            internal const int MaxIterations = int.MaxValue;
+            internal const bool EnforceNonNegativity = false;
         }
     }

@@ -258,7 +258,7 @@ private static TArgs ArgsInit(string featureColumn, SchemaShape.Column labelColu
         }

         protected virtual int ClassCount => 1;
-        public int BiasCount => ClassCount;
+        protected int BiasCount => ClassCount;
         protected int WeightCount => ClassCount * NumFeatures;

         protected virtual Optimizer InitializeOptimizer(IChannel ch, FloatLabelCursor.Factory cursorFactory, out VBuffer init, out ITerminationCriterion terminationCriterion)
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
index 8318b6ae5f..bc177dcc63 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
@@ -41,9 +41,23 @@ public sealed partial class LogisticRegression : LbfgsTrainerBase
+        /// If set to true, training statistics will be generated at the end of training.
+        /// If you have a large number of learned training parameters (more than 500),
+        /// generating the training statistics might take a few seconds.
+        /// More than 1000 weights might take a few minutes. For those cases, consider using the instance of ComputeLRTrainingStdThroughHal
+        /// present in the Microsoft.ML.HalLearners package. That computes the statistics using hardware acceleration.
+ /// public IComputeLRTrainingStd StdComputer; } @@ -413,8 +427,21 @@ public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironm } } + /// + /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, + /// p-value and z-Score. + /// If you need fast calculations, use the implementation in the Microsoft.ML.HALLearners package, + /// which makes use of hardware acceleration. + /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. + /// public interface IComputeLRTrainingStd { + /// + /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, + /// p-value and z-Score. + /// If you need fast calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. + /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. + /// VBuffer ComputeStd(double[] hessian, int[] weightIndices, int parametersCount, int currentWeightsCount, IChannel ch, float l2Weight); } @@ -424,6 +451,7 @@ public sealed class ComputeLRTrainingStd: IComputeLRTrainingStd /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, /// p-value and z-Score. /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. + /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. /// /// /// From e540d6343206b781f69a2809675f59571aa4f00c Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 8 Nov 2018 10:58:58 -0800 Subject: [PATCH 08/14] tolerance around test numbr comparisons --- .../BaseTestBaseline.cs | 51 ++++++++++--------- .../TrainerEstimators/LbfgsTests.cs | 8 +-- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/test/Microsoft.ML.TestFramework/BaseTestBaseline.cs b/test/Microsoft.ML.TestFramework/BaseTestBaseline.cs index f4443f6fc9..3fb0867696 100644 --- a/test/Microsoft.ML.TestFramework/BaseTestBaseline.cs +++ b/test/Microsoft.ML.TestFramework/BaseTestBaseline.cs @@ -535,29 +535,7 @@ private bool MatchNumberWithTolerance(MatchCollection firstCollection, MatchColl double f1 = double.Parse(firstCollection[i].ToString()); double f2 = double.Parse(secondCollection[i].ToString()); - // this follows the IEEE recommendations for how to compare floating point numbers - double allowedVariance = Math.Pow(10, -digitsOfPrecision); - double delta = Round(f1, digitsOfPrecision) - Round(f2, digitsOfPrecision); - // limitting to the digits we care about. - delta = Math.Round(delta, digitsOfPrecision); - - bool inRange = delta > -allowedVariance && delta < allowedVariance; - - // for some cases, rounding up is not beneficial - // so checking on whether the difference is significant prior to rounding, and failing only then. - // example, for 5 digits of precision. 
- // F1 = 1.82844949 Rounds to 1.8284
- // F2 = 1.8284502 Rounds to 1.8285
- // would fail the inRange == true check, but would suceed the following, and we doconsider those two numbers
- // (1.82844949 - 1.8284502) = -0.00000071
-
- if (!inRange)
- {
- delta = Math.Round(f1 - f2, digitsOfPrecision);
- inRange = delta >= -allowedVariance && delta <= allowedVariance;
- }
-
- if(!inRange)
+ if (!CompareNumbersWithTolerance(f1, f2, digitsOfPrecision))
 {
 return false;
 }
@@ -566,6 +544,33 @@ private bool MatchNumberWithTolerance(MatchCollection firstCollection, MatchColl
 return true;
 }

+ public bool CompareNumbersWithTolerance(double expected, double actual, int digitsOfPrecision = DigitsOfPrecision)
+ {
+ // This follows the IEEE recommendations for how to compare floating-point numbers.
+ double allowedVariance = Math.Pow(10, -digitsOfPrecision);
+ double delta = Round(expected, digitsOfPrecision) - Round(actual, digitsOfPrecision);
+ // Limiting the delta to the digits we care about.
+ delta = Math.Round(delta, digitsOfPrecision);
+
+ bool inRange = delta > -allowedVariance && delta < allowedVariance;
+
+ // In some cases rounding is not beneficial, so we also check whether the
+ // difference is significant prior to rounding, and fail only then.
+ // Example, for 5 digits of precision:
+ // F1 = 1.82844949 rounds to 1.8284
+ // F2 = 1.8284502 rounds to 1.8285
+ // which would fail the inRange check, but would succeed the following one,
+ // and we do consider those two numbers equal:
+ // (1.82844949 - 1.8284502) = -0.00000071
+
+ if (!inRange)
+ {
+ delta = Math.Round(expected - actual, digitsOfPrecision);
+ inRange = delta >= -allowedVariance && delta <= allowedVariance;
+ }
+
+ return inRange;
+ }
+
 private static double Round(double value, int digitsOfPrecision)
 {
 if ((value == 0) || double.IsInfinity(value) || double.IsNaN(value))
diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
index 8587cd6961..5ebf27edae 100644
--- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
+++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
@@ -52,8 +52,8 @@ public void TestLogisticRegressionStats()
 var stats = linearModel.Statistics;
 LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue);

- Assert.True(stdError == 0.250672936f);
- Assert.True(zScore == 7.97852373f);
+ Assert.True(CompareNumbersWithTolerance(stdError, 0.250672936));
+ Assert.True(CompareNumbersWithTolerance(zScore, 7.97852373));
 }

 [Fact]
 public void TestLogisticRegressionStats_MKL()
 {
 (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline();

 pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label", advancedSettings: s => {
 s.ShowTrainingStats = true;
 s.StdComputer = new ComputeLRTrainingStdThroughHal();
 }));

 var transformerChain = pipe.Fit(dataView) as TransformerChain>;

 var linearModel = transformerChain.LastTransformer.Model.SubPredictor as LinearBinaryPredictor;
 var stats = linearModel.Statistics;
 LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue);

- Assert.True(stdError == 0.250672936f);
- Assert.True(zScore == 7.97852373f);
+ Assert.True(CompareNumbersWithTolerance(stdError, 0.250672936));
+ Assert.True(CompareNumbersWithTolerance(zScore, 7.97852373));
 }
 }
}

From fe29307efc072cec96255e10483919c1c6a5bd3c Mon Sep 17 00:00:00 2001
From: Senja Filipi
Date: Thu, 8 Nov 2018 23:47:20 -0800
Subject: [PATCH 09/14] addressing PR comments

---
 pkg/Microsoft.ML/Microsoft.ML.nupkgproj | 1 +
 .../LogisticRegressionTrainingStats.cs | 8 ++++----
 .../Standard/LogisticRegression/LogisticRegression.cs | 10 +++++-----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/pkg/Microsoft.ML/Microsoft.ML.nupkgproj b/pkg/Microsoft.ML/Microsoft.ML.nupkgproj
index 75517c587e..f479d0e970 100644
--- a/pkg/Microsoft.ML/Microsoft.ML.nupkgproj
+++ b/pkg/Microsoft.ML/Microsoft.ML.nupkgproj
@@
-8,6 +8,7 @@ + diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs index 071296c112..9aca098a35 100644 --- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs +++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs @@ -28,7 +28,7 @@ public sealed class ComputeLRTrainingStdThroughHal : IComputeLRTrainingStd public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) { Contracts.AssertValue(ch); - Contracts.AssertValue(hessian, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.AssertValue(hessian, nameof(hessian)); Contracts.AssertNonEmpty(weightIndices); Contracts.Assert(numSelectedParams > 0); Contracts.Assert(currentWeightsCount > 0); @@ -61,7 +61,7 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS for (int i = 1; i < numSelectedParams; i++) { // Initialize with inverse Hessian. - stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i]; + stdErrorValues[i] = (float)invHessian[i * (i + 1) / 2 + i]; } if (l2Weight > 0) @@ -75,8 +75,8 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS { for (int iCol = 0; iCol <= iRow; iCol++) { - var entry = (Single)invHessian[ioffset]; - var adjustment = -l2Weight * entry * entry; + var entry = (float)invHessian[ioffset]; + var adjustment = l2Weight * entry * entry; stdErrorValues[iRow] -= adjustment; if (0 < iCol && iCol < iRow) stdErrorValues[iCol] -= adjustment; diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index bc177dcc63..6e29b88dcd 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -96,7 +96,7 @@ public LogisticRegression(IHostEnvironment env, _posWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; - if (Args.StdComputer == null) + if (ShowTrainingStats && Args.StdComputer == null) Args.StdComputer = new ComputeLRTrainingStd(); } @@ -109,7 +109,7 @@ internal LogisticRegression(IHostEnvironment env, Arguments args) _posWeight = 0; ShowTrainingStats = Args.ShowTrainingStats; - if (Args.StdComputer == null) + if (ShowTrainingStats && Args.StdComputer == null) Args.StdComputer = new ComputeLRTrainingStd(); } @@ -462,7 +462,7 @@ public sealed class ComputeLRTrainingStd: IComputeLRTrainingStd public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) { Contracts.AssertValue(ch); - Contracts.AssertValue(hessian, $"Training Statistics can get generated after training finishes. Train with setting: ShowTrainigStats set to true."); + Contracts.AssertValue(hessian, nameof(hessian)); Contracts.AssertNonEmpty(weightIndices); Contracts.Assert(numSelectedParams > 0); Contracts.Assert(currentWeightsCount > 0); @@ -505,7 +505,7 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS { // Initialize with inverse Hessian. // The diagonal of the inverse Hessian. 
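// (The Hessian matrix was assembled with its columns in reverse order — see the matrixHessian construction shown in a later patch — so the diagonal of the true inverse lands at [i, numSelectedParams - i - 1] rather than [i, i].)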
- stdErrorValues2[i] = (Single)invers[i, numSelectedParams - i - 1]; + stdErrorValues2[i] = (float)invers[i, numSelectedParams - i - 1]; } if (l2Weight > 0) @@ -520,7 +520,7 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS for (int iCol = 0; iCol <= iRow; iCol++) { float entry = (float)invers[iRow, numSelectedParams - iCol - 1]; - var adjustment = -l2Weight * entry * entry; + var adjustment = l2Weight * entry * entry; stdErrorValues2[iRow] -= adjustment; if (0 < iCol && iCol < iRow) From 5386a8cc065a33b4b7332c44067e00a60f56bd10 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 9 Nov 2018 10:23:50 -0800 Subject: [PATCH 10/14] baselines have more information because of the added calculations --- .../EntryPoints/ensemble-model0-stats.txt | 12 ++++++-- .../EntryPoints/ensemble-model2-stats.txt | 12 ++++++-- .../ensemble-summary-key-value-pairs.txt | 20 +++++++++++++ .../Common/EntryPoints/ensemble-summary.txt | 30 +++++++++++++++++++ 4 files changed, 70 insertions(+), 4 deletions(-) diff --git a/test/BaselineOutput/Common/EntryPoints/ensemble-model0-stats.txt b/test/BaselineOutput/Common/EntryPoints/ensemble-model0-stats.txt index 5c5d36e4b6..057ef0ff87 100644 --- a/test/BaselineOutput/Common/EntryPoints/ensemble-model0-stats.txt +++ b/test/BaselineOutput/Common/EntryPoints/ensemble-model0-stats.txt @@ -5,6 +5,14 @@ #@ col={name={Residual Deviance} type=R4 src=1} #@ col={name={Null Deviance} type=R4 src=2} #@ col=AIC:R4:3 +#@ col=BiasEstimate:R4:4 +#@ col=BiasStandardError:R4:5 +#@ col=BiasZScore:R4:6 +#@ col=BiasPValue:R4:7 +#@ col=Estimate:R4:8-16 +#@ col=StandardError:R4:17-25 +#@ col=ZScore:R4:26-34 +#@ col=PValue:R4:35-43 #@ } -Count of training examples Residual Deviance Null Deviance AIC -521 98.29433 669.0935 118.294327 +Count of training examples Residual Deviance Null Deviance AIC BiasEstimate BiasStandardError BiasZScore BiasPValue Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 +521 98.29433 669.0935 118.294327 -5.120674 0.699818552 -7.31714535 0 2.353567 1.78653753 1.9442488 1.38072 1.0831089 2.43588924 1.61141682 1.34575915 -0.7715381 0.4267568 0.42040658 0.41370967 0.482155383 0.456691444 0.451504 0.4605175 0.478413582 0.342069477 5.5150075 4.249547 4.69954872 2.86364126 2.37164259 5.395056 3.4991436 2.81296182 -2.255501 5.96046448E-08 2.14576721E-05 2.62260437E-06 0.00418818 0.0177091956 5.96046448E-08 0.000466823578 0.00490885973 0.0241017938 diff --git a/test/BaselineOutput/Common/EntryPoints/ensemble-model2-stats.txt b/test/BaselineOutput/Common/EntryPoints/ensemble-model2-stats.txt index 152e94f64d..dbb2224574 100644 --- a/test/BaselineOutput/Common/EntryPoints/ensemble-model2-stats.txt +++ b/test/BaselineOutput/Common/EntryPoints/ensemble-model2-stats.txt @@ -5,6 +5,14 @@ #@ col={name={Residual Deviance} type=R4 src=1} #@ col={name={Null Deviance} type=R4 src=2} #@ col=AIC:R4:3 +#@ 
col=BiasEstimate:R4:4 +#@ col=BiasStandardError:R4:5 +#@ col=BiasZScore:R4:6 +#@ col=BiasPValue:R4:7 +#@ col=Estimate:R4:8-16 +#@ col=StandardError:R4:17-25 +#@ col=ZScore:R4:26-34 +#@ col=PValue:R4:35-43 #@ } -Count of training examples Residual Deviance Null Deviance AIC -520 94.1969452 673.3445 114.196945 +Count of training examples Residual Deviance Null Deviance AIC BiasEstimate BiasStandardError BiasZScore BiasPValue Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 +520 94.1969452 673.3445 114.196945 -4.860323 0.712811947 -6.81852055 0 2.143086 1.49418533 1.71121442 1.38318741 0.883200347 3.16845965 1.38684654 1.51904845 -0.8226236 0.430655479 0.4099987 0.4222687 0.4832917 0.457050323 0.457937717 0.445124656 0.4728626 0.338379949 4.976335 3.64436626 4.05243 2.86201358 1.93239188 6.918975 3.11563635 3.21245217 -2.43106484 6.556511E-07 0.0002681017 5.07235527E-05 0.00420969725 0.05331099 0 0.00183564425 0.00131618977 0.0150545239 diff --git a/test/BaselineOutput/Common/EntryPoints/ensemble-summary-key-value-pairs.txt b/test/BaselineOutput/Common/EntryPoints/ensemble-summary-key-value-pairs.txt index beeec64d77..d89d7a7619 100644 --- a/test/BaselineOutput/Common/EntryPoints/ensemble-summary-key-value-pairs.txt +++ b/test/BaselineOutput/Common/EntryPoints/ensemble-summary-key-value-pairs.txt @@ -14,6 +14,16 @@ Count of training examples: 521 Residual Deviance: 98.29433 Null Deviance: 669.0935 AIC: 118.2943 +(Bias): System.Single[] +Features.thickness: System.Single[] +Features.bare_nuclei: System.Single[] +Features.uniform_shape: System.Single[] +Features.uniform_size: System.Single[] +Features.bland_chromatin: System.Single[] +Features.adhesion: System.Single[] +Features.normal_nucleoli: System.Single[] +Features.epit_size: System.Single[] +Cat.1: System.Single[] Partition model 1 summary: Per-feature gain summary for the boosted tree ensemble: Features.uniform_size: 1 @@ -43,6 +53,16 @@ Count of training examples: 520 Residual Deviance: 94.19695 Null Deviance: 673.3445 AIC: 114.1969 +(Bias): System.Single[] +Features.bare_nuclei: System.Single[] +Features.thickness: System.Single[] +Features.uniform_shape: System.Single[] +Features.uniform_size: System.Single[] +Features.normal_nucleoli: System.Single[] +Features.bland_chromatin: System.Single[] +Features.adhesion: System.Single[] +Features.epit_size: System.Single[] +Cat.1: System.Single[] Partition model 3 summary: Per-feature gain summary for the boosted tree ensemble: Features.uniform_size: 1 diff --git a/test/BaselineOutput/Common/EntryPoints/ensemble-summary.txt b/test/BaselineOutput/Common/EntryPoints/ensemble-summary.txt index 50abe9df54..fadb2e27c8 100644 --- a/test/BaselineOutput/Common/EntryPoints/ensemble-summary.txt +++ b/test/BaselineOutput/Common/EntryPoints/ensemble-summary.txt @@ -17,6 +17,21 @@ Count of training examples: 521 Residual Deviance: 98.29433 Null Deviance: 
669.0935 AIC: 118.2943 + +Coefficients statistics: +Coefficient Estimate Std. Error z value Pr(>|z|) +(Bias) -5.120674 0.6998186 -7.317145 0 *** +Features.thickness 2.353567 0.4267568 5.515007 5.960464E-08 *** +Features.bare_nuclei 2.435889 0.451504 5.395056 5.960464E-08 *** +Features.uniform_shape 1.944249 0.4137097 4.699549 2.622604E-06 *** +Features.uniform_size 1.786538 0.4204066 4.249547 2.145767E-05 *** +Features.bland_chromatin 1.611417 0.4605175 3.499144 0.0004668236 *** +Features.adhesion 1.38072 0.4821554 2.863641 0.00418818 ** +Features.normal_nucleoli 1.345759 0.4784136 2.812962 0.00490886 ** +Features.epit_size 1.083109 0.4566914 2.371643 0.0177092 * +Cat.1 -0.7715381 0.3420695 -2.255501 0.02410179 * +--- +Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Partition model 1 summary: Per-feature gain summary for the boosted tree ensemble: @@ -50,6 +65,21 @@ Count of training examples: 520 Residual Deviance: 94.19695 Null Deviance: 673.3445 AIC: 114.1969 + +Coefficients statistics: +Coefficient Estimate Std. Error z value Pr(>|z|) +(Bias) -4.860323 0.7128119 -6.818521 0 *** +Features.bare_nuclei 3.16846 0.4579377 6.918975 0 *** +Features.thickness 2.143086 0.4306555 4.976335 6.556511E-07 *** +Features.uniform_shape 1.711214 0.4222687 4.05243 5.072355E-05 *** +Features.uniform_size 1.494185 0.4099987 3.644366 0.0002681017 *** +Features.normal_nucleoli 1.519048 0.4728626 3.212452 0.00131619 ** +Features.bland_chromatin 1.386847 0.4451247 3.115636 0.001835644 ** +Features.adhesion 1.383187 0.4832917 2.862014 0.004209697 ** +Features.epit_size 0.8832003 0.4570503 1.932392 0.05331099 . +Cat.1 -0.8226236 0.3383799 -2.431065 0.01505452 * +--- +Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Partition model 3 summary: Per-feature gain summary for the boosted tree ensemble: From fb897ed0734135356dd45cdd9a2a916008c7bdee Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 9 Nov 2018 11:26:08 -0800 Subject: [PATCH 11/14] more baseline updates. --- .../LogisticRegressionTrainingStats.cs | 1 - .../LogisticRegression/LogisticRegression.cs | 3 +-- .../Command/CommandTrainingLrWithStats-summary.txt | 12 ++++++++++++ test/BaselineOutput/Common/EntryPoints/lr-stats.txt | 12 ++++++++++-- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs index 9aca098a35..5b834edd72 100644 --- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs +++ b/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs @@ -29,7 +29,6 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS { Contracts.AssertValue(ch); Contracts.AssertValue(hessian, nameof(hessian)); - Contracts.AssertNonEmpty(weightIndices); Contracts.Assert(numSelectedParams > 0); Contracts.Assert(currentWeightsCount > 0); Contracts.Assert(l2Weight > 0); diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 6e29b88dcd..08317430ce 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -357,7 +357,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. 
_stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); else { - var std = Args.StdComputer.ComputeStd(hessian, weightIndices, numParams, CurrentWeights.Count, ch, L2Weight); + var std = Args.StdComputer.ComputeStd(hessian, weightIndices, numParams, CurrentWeights.Length, ch, L2Weight); _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, std); } } @@ -463,7 +463,6 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS { Contracts.AssertValue(ch); Contracts.AssertValue(hessian, nameof(hessian)); - Contracts.AssertNonEmpty(weightIndices); Contracts.Assert(numSelectedParams > 0); Contracts.Assert(currentWeightsCount > 0); Contracts.Assert(l2Weight > 0); diff --git a/test/BaselineOutput/Common/Command/CommandTrainingLrWithStats-summary.txt b/test/BaselineOutput/Common/Command/CommandTrainingLrWithStats-summary.txt index 6d58cb8d2d..4bd1c57233 100644 --- a/test/BaselineOutput/Common/Command/CommandTrainingLrWithStats-summary.txt +++ b/test/BaselineOutput/Common/Command/CommandTrainingLrWithStats-summary.txt @@ -13,3 +13,15 @@ Count of training examples: 32561 Residual Deviance: 26705.74 Null Deviance: 35948.08 AIC: 26719.74 + +Coefficients statistics: +Coefficient Estimate Std. Error z value Pr(>|z|) +(Bias) -8.228298 0.1161297 -70.85435 0 *** +education-num 5.066041 0.1048074 48.33666 0 *** +capital-gain 18.58347 0.4694776 39.5833 0 *** +age 3.86064 0.1061118 36.38277 0 *** +hours-per-week 3.946534 0.1258723 31.35349 0 *** +capital-loss 2.81616 0.13793 20.41732 0 *** +fnlwgt 0.7489593 0.2048056 3.656927 0.0002553463 *** +--- +Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 diff --git a/test/BaselineOutput/Common/EntryPoints/lr-stats.txt b/test/BaselineOutput/Common/EntryPoints/lr-stats.txt index 8e04238c73..c467f102be 100644 --- a/test/BaselineOutput/Common/EntryPoints/lr-stats.txt +++ b/test/BaselineOutput/Common/EntryPoints/lr-stats.txt @@ -5,6 +5,14 @@ #@ col={name={Residual Deviance} type=R4 src=1} #@ col={name={Null Deviance} type=R4 src=2} #@ col=AIC:R4:3 +#@ col=BiasEstimate:R4:4 +#@ col=BiasStandardError:R4:5 +#@ col=BiasZScore:R4:6 +#@ col=BiasPValue:R4:7 +#@ col=Estimate:R4:8-16 +#@ col=StandardError:R4:17-25 +#@ col=ZScore:R4:26-34 +#@ col=PValue:R4:35-43 #@ } -Count of training examples Residual Deviance Null Deviance AIC -683 126.83107 884.350159 146.83107 +Count of training examples Residual Deviance Null Deviance AIC BiasEstimate BiasStandardError BiasZScore BiasPValue thickness uniform_size uniform_shape adhesion epit_size bare_nuclei bland_chromatin normal_nucleoli mitoses thickness uniform_size uniform_shape adhesion epit_size bare_nuclei bland_chromatin normal_nucleoli mitoses thickness uniform_size uniform_shape adhesion epit_size bare_nuclei bland_chromatin normal_nucleoli mitoses thickness uniform_size uniform_shape adhesion epit_size bare_nuclei bland_chromatin normal_nucleoli mitoses +683 126.83107 884.350159 146.83107 -6.186806 0.459383339 -13.4676332 0 2.65800762 1.68089855 1.944068 1.42514718 0.8536965 2.9325006 1.74816787 1.58165014 0.595681 0.455618978 0.429146379 0.431570023 0.479817748 0.470442533 0.4381438 0.469593167 0.4714128 0.467883229 5.83383846 3.916842 4.504641 2.97018433 1.814667 6.69301 3.72272849 3.35512757 1.27314031 0 8.9764595E-05 6.67572E-06 0.002976358 0.06957501 0 0.00019711256 0.0007933974 0.202968419 From 89301b4ec82ec0c2e3337a43e64090ffc97d0c0c Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 9 Nov 2018 23:41:42 -0800 Subject: 
[PATCH 12/14] post merge test fixes. --- test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs index 3c3b739c9a..30906c8940 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs @@ -45,7 +45,7 @@ public void TestLogisticRegressionStats() { (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline(); - pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label", advancedSettings: s => { s.ShowTrainingStats = true; })); + pipe = pipe.Append(new LogisticRegression(Env, "Label", "Features", advancedSettings: s => { s.ShowTrainingStats = true; })); var transformerChain = pipe.Fit(dataView) as TransformerChain>; var linearModel = transformerChain.LastTransformer.Model.SubPredictor as LinearBinaryPredictor; @@ -61,7 +61,7 @@ public void TestLogisticRegressionStats_MKL() { (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline(); - pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label", advancedSettings: s => { + pipe = pipe.Append(new LogisticRegression(Env, "Label", "Features", advancedSettings: s => { s.ShowTrainingStats = true; s.StdComputer = new ComputeLRTrainingStdThroughHal(); })); From c8d060ac09b6dc1d343a8e0656ae54e62b984bb5 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 12 Nov 2018 09:50:15 -0800 Subject: [PATCH 13/14] Eric's comments --- build/Dependencies.props | 2 +- ...s.cs => ComputeLRTrainingStdThroughHal.cs} | 10 +--- .../AssemblyInfo.cs | 1 + .../LogisticRegression/LogisticRegression.cs | 59 +++++++++++-------- .../Standard/ModelStatistics.cs | 1 - 5 files changed, 38 insertions(+), 35 deletions(-) rename src/Microsoft.ML.HalLearners/{LogisticRegressionTrainingStats.cs => ComputeLRTrainingStdThroughHal.cs} (89%) diff --git a/build/Dependencies.props b/build/Dependencies.props index 3f053444db..47f34e0e1e 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -9,6 +9,7 @@ 4.3.0 4.8.0 4.5.0 + 4.6.0 @@ -22,7 +23,6 @@ 4.5.0 4.5.0 1.10.0 - 4.6.0 diff --git a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs b/src/Microsoft.ML.HalLearners/ComputeLRTrainingStdThroughHal.cs similarity index 89% rename from src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs rename to src/Microsoft.ML.HalLearners/ComputeLRTrainingStdThroughHal.cs index 5b834edd72..d55526d19d 100644 --- a/src/Microsoft.ML.HalLearners/LogisticRegressionTrainingStats.cs +++ b/src/Microsoft.ML.HalLearners/ComputeLRTrainingStdThroughHal.cs @@ -25,7 +25,7 @@ public sealed class ComputeLRTrainingStdThroughHal : IComputeLRTrainingStd /// /// The used for messaging. /// The L2Weight used for training. (Supply the same one that got used during training.) 
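/// As a usage sketch, the trainer invokes this at the end of training; the call below is copied from the LogisticRegression.cs hunk in an earlier patch, and the surrounding variables (hessian, weightIndices, numParams, CurrentWeights, ch, L2Weight) come from that trainer context:
/// var std = Args.StdComputer.ComputeStd(hessian, weightIndices, numParams, CurrentWeights.Length, ch, L2Weight);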
- public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) + public override VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) { Contracts.AssertValue(ch); Contracts.AssertValue(hessian, nameof(hessian)); @@ -74,12 +74,8 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS { for (int iCol = 0; iCol <= iRow; iCol++) { - var entry = (float)invHessian[ioffset]; - var adjustment = l2Weight * entry * entry; - stdErrorValues[iRow] -= adjustment; - if (0 < iCol && iCol < iRow) - stdErrorValues[iCol] -= adjustment; - ioffset++; + var entry = (float)invHessian[ioffset++]; + AdjustVariance(entry, iRow, iCol, l2Weight, stdErrorValues); } } diff --git a/src/Microsoft.ML.StandardLearners/AssemblyInfo.cs b/src/Microsoft.ML.StandardLearners/AssemblyInfo.cs index 415752aa8d..671913b203 100644 --- a/src/Microsoft.ML.StandardLearners/AssemblyInfo.cs +++ b/src/Microsoft.ML.StandardLearners/AssemblyInfo.cs @@ -6,5 +6,6 @@ using Microsoft.ML; [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Legacy" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.HalLearners" + PublicKey.Value)] [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 08317430ce..cdc1aec49c 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -428,27 +428,40 @@ public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironm } /// - /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, + /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation, /// p-value and z-Score. /// If you need fast calculations, use the implementation in the Microsoft.ML.HALLearners package, /// which makes use of hardware acceleration. /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. /// - public interface IComputeLRTrainingStd + public abstract class IComputeLRTrainingStd { /// - /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, + /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation, /// p-value and z-Score. /// If you need fast calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. /// - VBuffer ComputeStd(double[] hessian, int[] weightIndices, int parametersCount, int currentWeightsCount, IChannel ch, float l2Weight); + public abstract VBuffer ComputeStd(double[] hessian, int[] weightIndices, int parametersCount, int currentWeightsCount, IChannel ch, float l2Weight); + + /// + /// Adjust the variance for regularized cases. 
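+ /// For a ridge weight l2Weight and an inverse-Hessian entry h, each affected variance estimate is reduced by l2Weight * h * h, as the method body below shows.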
+ /// + [BestFriend] + internal void AdjustVariance(float inverseEntry, int iRow, int iCol, float l2Weight, float[] stdErrorValues2) + { + var adjustment = l2Weight * inverseEntry * inverseEntry; + stdErrorValues2[iRow] -= adjustment; + + if (0 < iCol && iCol < iRow) + stdErrorValues2[iCol] -= adjustment; + } } public sealed class ComputeLRTrainingStd: IComputeLRTrainingStd { /// - /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, + /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation, /// p-value and z-Score. /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration. /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. @@ -459,7 +472,7 @@ public sealed class ComputeLRTrainingStd: IComputeLRTrainingStd /// /// The used for messaging. /// The L2Weight used for training. (Supply the same one that got used during training.) - public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) + public override VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight) { Contracts.AssertValue(ch); Contracts.AssertValue(hessian, nameof(hessian)); @@ -470,22 +483,22 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS double[,] matrixHessian = new double[numSelectedParams, numSelectedParams]; int hessianLength = 0; - int dimention = numSelectedParams - 1; + int dimension = numSelectedParams - 1; - for (int row = dimention; row >= 0; row--) + for (int row = dimension; row >= 0; row--) { - for (int col = 0; col <= dimention; col++) + for (int col = 0; col <= dimension; col++) { - if ((row + col) <= dimention) + if ((row + col) <= dimension) { - if ((row + col) == dimention) + if ((row + col) == dimension) { matrixHessian[row, col] = hessian[hessianLength]; } else { matrixHessian[row, col] = hessian[hessianLength]; - matrixHessian[dimention - col, dimention - row] = hessian[hessianLength]; + matrixHessian[dimension - col, dimension - row] = hessian[hessianLength]; } hessianLength++; } @@ -497,42 +510,36 @@ public VBuffer ComputeStd(double[] hessian, int[] weightIndices, int numS var h = Matrix.Build.DenseOfArray(matrixHessian); var invers = h.Inverse(); - float[] stdErrorValues2 = new float[numSelectedParams]; - stdErrorValues2[0] = (float)Math.Sqrt(invers[0, numSelectedParams - 1]); + float[] stdErrorValues = new float[numSelectedParams]; + stdErrorValues[0] = (float)Math.Sqrt(invers[0, numSelectedParams - 1]); for (int i = 1; i < numSelectedParams; i++) { // Initialize with inverse Hessian. // The diagonal of the inverse Hessian. - stdErrorValues2[i] = (float)invers[i, numSelectedParams - i - 1]; + stdErrorValues[i] = (float)invers[i, numSelectedParams - i - 1]; } if (l2Weight > 0) { // Iterate through all entries of inverse Hessian to make adjustment to variance. 
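// (Each inverse-Hessian entry h read here contributes an l2Weight * h * h reduction through AdjustVariance, defined in the base class above.)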
// A discussion on ridge regularized LR coefficient covariance matrix can be found here: - // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/ - // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf - int ioffset = 1; + // http://www.aloki.hu/pdf/0402_171179.pdf (Equations 11 and 25) + // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf (Section "Significance testing in ridge logistic regression") for (int iRow = 1; iRow < numSelectedParams; iRow++) { for (int iCol = 0; iCol <= iRow; iCol++) { float entry = (float)invers[iRow, numSelectedParams - iCol - 1]; - var adjustment = l2Weight * entry * entry; - stdErrorValues2[iRow] -= adjustment; - - if (0 < iCol && iCol < iRow) - stdErrorValues2[iCol] -= adjustment; - ioffset++; + AdjustVariance(entry, iRow, iCol, l2Weight, stdErrorValues); } } } for (int i = 1; i < numSelectedParams; i++) - stdErrorValues2[i] = (float)Math.Sqrt(stdErrorValues2[i]); + stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]); - return new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues2, weightIndices); + return new VBuffer(currentWeightsCount, numSelectedParams, stdErrorValues, weightIndices); } } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index 67c588a93c..1eeb043c01 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using MathNet.Numerics.LinearAlgebra; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Internal.CpuMath; From 39ca55e899a27dc8474522f57f1e2acdcf6ac1cb Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 12 Nov 2018 10:37:34 -0800 Subject: [PATCH 14/14] renaming interface. --- .../ComputeLRTrainingStdThroughHal.cs | 2 +- .../LogisticRegression/LogisticRegression.cs | 23 +++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/Microsoft.ML.HalLearners/ComputeLRTrainingStdThroughHal.cs b/src/Microsoft.ML.HalLearners/ComputeLRTrainingStdThroughHal.cs index d55526d19d..66868c1c9a 100644 --- a/src/Microsoft.ML.HalLearners/ComputeLRTrainingStdThroughHal.cs +++ b/src/Microsoft.ML.HalLearners/ComputeLRTrainingStdThroughHal.cs @@ -11,7 +11,7 @@ namespace Microsoft.ML.Runtime.Learners { using Mkl = OlsLinearRegressionTrainer.Mkl; - public sealed class ComputeLRTrainingStdThroughHal : IComputeLRTrainingStd + public sealed class ComputeLRTrainingStdThroughHal : ComputeLRTrainingStd { /// /// Computes the standart deviation matrix of each of the non-zero training weights, needed to calculate further the standart deviation, diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 9c3e0595d5..60c81b0ed1 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -45,20 +45,20 @@ public sealed class Arguments : ArgumentsBase /// If set to true, training statistics will be generated at the end of training.
/// If you have a large number of learned training parameters (more than 500), /// generating the training statistics might take a few seconds. - /// More than 1000 weights might take a few minutes. For those cases consider using the instance of + /// More than 1000 weights might take a few minutes. For those cases consider using the instance of /// present in the Microsoft.ML.HalLearners package. That computes the statistics using hardware acceleration. /// [Argument(ArgumentType.AtMostOnce, HelpText = "Show statistics of training examples.", ShortName = "stat", SortOrder = 50)] public bool ShowTrainingStats = false; /// - /// The instance of that computes the training statistics at the end of training. + /// The instance of that computes the training statistics at the end of training. /// If you have a large number of learned training parameters (more than 500), /// generating the training statistics might take a few seconds. - /// More than 1000 weights might take a few minutes. For those cases consider using the instance of + /// More than 1000 weights might take a few minutes. For those cases consider using the instance of /// present in the Microsoft.ML.HalLearners package. That computes the statistics using hardware acceleration. /// - public IComputeLRTrainingStd StdComputer; + public ComputeLRTrainingStd StdComputer; } private double _posWeight; @@ -97,7 +97,7 @@ public LogisticRegression(IHostEnvironment env, ShowTrainingStats = Args.ShowTrainingStats; if (ShowTrainingStats && Args.StdComputer == null) - Args.StdComputer = new ComputeLRTrainingStd(); + Args.StdComputer = new ComputeLRTrainingStdImpl(); } /// @@ -110,7 +110,7 @@ internal LogisticRegression(IHostEnvironment env, Arguments args) ShowTrainingStats = Args.ShowTrainingStats; if (ShowTrainingStats && Args.StdComputer == null) - Args.StdComputer = new ComputeLRTrainingStd(); + Args.StdComputer = new ComputeLRTrainingStdImpl(); } public override PredictionKind PredictionKind => PredictionKind.BinaryClassification; @@ -430,11 +430,11 @@ public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironm /// /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation, /// p-value and z-Score. - /// If you need fast calculations, use the implementation in the Microsoft.ML.HALLearners package, + /// If you need fast calculations, use the implementation in the Microsoft.ML.HALLearners package, /// which makes use of hardware acceleration. /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients. /// - public abstract class IComputeLRTrainingStd + public abstract class ComputeLRTrainingStd { /// /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation, @@ -458,7 +458,12 @@ internal void AdjustVariance(float inverseEntry, int iRow, int iCol, float l2Wei } } - public sealed class ComputeLRTrainingStd: IComputeLRTrainingStd + /// + /// Extends ComputeLRTrainingStd, implementing ComputeStd making use of Math.Net Numerics. + /// If you need faster calculations (have non-sparse weight vectors of more than 300 features), use the instance of ComputeLRTrainingStd from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration
+ /// + public sealed class ComputeLRTrainingStdImpl : ComputeLRTrainingStd { /// /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,