Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
Add azureml-dataprep support for dataflow objects (#181)
Browse files Browse the repository at this point in the history
* draft code

* draft

* delete

* add dprep dependency

* rollback

* rollback

* rollback

* test & example on using DprepDataStream

* add dprep path

* add dprep path

* fix mlnetpath

* optional dependency on dprep

* run dprep tests optionally

* fix typo

* Up sdk version

* fix linux dprep tests
  • Loading branch information
ganik authored Jul 12, 2019
1 parent ab27816 commit c2f2b6b
Show file tree
Hide file tree
Showing 20 changed files with 176 additions and 43 deletions.
4 changes: 3 additions & 1 deletion build.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ if /i [%1] == [DbgWinPy2.7] (
:Build
:: Install dotnet SDK version, see https://docs.microsoft.com/en-us/dotnet/core/tools/dotnet-install-script
echo Installing dotnet SDK ...
powershell -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; &([scriptblock]::Create((Invoke-WebRequest -useb 'https://dot.net/v1/dotnet-install.ps1'))) -Version 2.1.200 -InstallDir ./cli"
powershell -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; &([scriptblock]::Create((Invoke-WebRequest -useb 'https://dot.net/v1/dotnet-install.ps1'))) -Version 2.1.701 -InstallDir ./cli"

set _dotnetRoot=%__currentScriptDir%cli

Expand Down Expand Up @@ -339,6 +339,8 @@ echo "Running tests ... "
echo "#################################"
call "%PythonExe%" -m pip install --upgrade nose pytest graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0"
if %PythonVersion% == 2.7 ( call "%PythonExe%" -m pip install --upgrade pyzmq )
:: Run azureml-dataprep tests only in Python 3.7, as it is an optional dependency
if %PythonVersion% == 3.7 ( call "%PythonExe%" -m pip install --upgrade azureml-dataprep )
call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%"
call "%PythonExe%" -m pip install "scikit-learn==0.19.2"

Expand Down
7 changes: 5 additions & 2 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ if [ ${__buildDotNetBridge} = true ]
then
# Install dotnet SDK version, see https://docs.microsoft.com/en-us/dotnet/core/tools/dotnet-install-script
echo "Installing dotnet SDK ... "
curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.200 -InstallDir ./cli
curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.701 -InstallDir ./cli

# Build managed code
echo "Building managed code ... "
Expand Down Expand Up @@ -266,7 +266,10 @@ then
elif [ ${PythonVersion} = 3.6 ] && [ "$(uname -s)" = "Darwin" ]
then
"${PythonExe}" -m pip install --upgrade pytest-remotedata
fi
elif [ ${PythonVersion} = 3.7 ]
then
"${PythonExe}" -m pip install --upgrade azureml-dataprep
fi
"${PythonExe}" -m pip install --upgrade "${Wheel}"
"${PythonExe}" -m pip install "scikit-learn==0.19.2"

Expand Down
2 changes: 2 additions & 0 deletions build/libs_linux.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ lib_lightgbm.so
libtensorflow.so
libtensorflow_framework.so
System.Drawing.Common.dll
Microsoft.DataPrep.dll
Microsoft.DPrep.*
Microsoft.ML.*
2 changes: 2 additions & 0 deletions build/libs_mac.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ lib_lightgbm.dylib
libtensorflow.dylib
libtensorflow_framework.dylib
System.Drawing.Common.dll
Microsoft.DataPrep.dll
Microsoft.DPrep.*
Microsoft.ML.*
2 changes: 2 additions & 0 deletions build/libs_win.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ MklImports.dll
SymSgdNative.dll
tensorflow.dll
System.Drawing.Common.dll
Microsoft.DataPrep.dll
Microsoft.DPrep.*
Microsoft.ML.*
3 changes: 2 additions & 1 deletion src/DotNetBridge/DotNetBridge.csproj
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<TargetFramework>netcoreapp2.1</TargetFramework>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<Platforms>x64</Platforms>
<DefineConstants>CORECLR</DefineConstants>
Expand Down Expand Up @@ -42,5 +42,6 @@
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.2.0" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.14.0" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.2.0" />
<PackageReference Include="Microsoft.DataPrep" Version="0.0.1.5-preview" />
</ItemGroup>
</Project>
4 changes: 3 additions & 1 deletion src/DotNetBridge/RunGraph.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using Microsoft.DataPrep.Common;
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
Expand Down Expand Up @@ -146,7 +147,8 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s
var extension = Path.GetExtension(path);
if (extension == ".txt")
dv = TextLoader.LoadFile(host, new TextLoader.Options(), new MultiFileSource(path));

else if(extension == ".dprep")
dv = DataFlow.FromDPrepFile(path).ToDataView();
else
dv = new BinaryLoader(host, new BinaryLoader.Arguments(), path);
}
Expand Down
15 changes: 10 additions & 5 deletions src/NativeBridge/UnixInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,15 +144,20 @@ class UnixMlNetInterface
{
}

FNGETTER EnsureGetter(const char *nimbuslibspath, const char *coreclrpath)
FNGETTER EnsureGetter(const char *mlnetpath, const char *coreclrpath, const char *dpreppath)
{
if (_getter != nullptr)
return _getter;

std::string libsroot(nimbuslibspath);
std::string libsroot(mlnetpath);
std::string coreclrdir(coreclrpath);
if (strlen(dpreppath) == 0)
{
dpreppath = mlnetpath;
}
std::string dprepdir(dpreppath);

ICLRRuntimeHost2* host = EnsureClrHost(libsroot.c_str(), coreclrdir.c_str());
ICLRRuntimeHost2* host = EnsureClrHost(libsroot.c_str(), coreclrdir.c_str(), dprepdir.c_str());
if (host == nullptr)
return nullptr;

Expand Down Expand Up @@ -246,7 +251,7 @@ class UnixMlNetInterface
closedir(dir);
}

ICLRRuntimeHost2* EnsureClrHost(const char * libsRoot, const char * coreclrDirRoot)
ICLRRuntimeHost2* EnsureClrHost(const char * libsRoot, const char * coreclrDirRoot, const char * dprepDirRoot)
{
if (_host != nullptr)
return _host;
Expand Down Expand Up @@ -284,7 +289,7 @@ class UnixMlNetInterface
// TRUSTED_PLATFORM_ASSEMBLIES
tpaList.c_str(),
// APP_PATHS
libsRoot,
dprepDirRoot,
// AppDomainCompatSwitch
W("UseLatestBehaviorWhenTFMNotSpecified")
};
Expand Down
23 changes: 13 additions & 10 deletions src/NativeBridge/WinInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ class WinMlNetInterface
FindClose(findHandle);
}

ICLRRuntimeHost2* EnsureClrHost(const wchar_t * libsRoot, const wchar_t * coreclrDirRoot)
ICLRRuntimeHost2* EnsureClrHost(const wchar_t * libsRoot, const wchar_t * coreclrDirRoot, const wchar_t * dprepDirRoot)
{
if (_host != nullptr)
return _host;
Expand Down Expand Up @@ -228,7 +228,7 @@ class WinMlNetInterface
// TRUSTED_PLATFORM_ASSEMBLIES
tpaList.c_str(),
// APP_PATHS
libsRoot,
dprepDirRoot,
// AppDomainCompatSwitch
W("UseLatestBehaviorWhenTFMNotSpecified")
};
Expand Down Expand Up @@ -267,26 +267,29 @@ class WinMlNetInterface
}

public:
FNGETTER EnsureGetter(const char *nimbuslibspath, const char *coreclrpath)
FNGETTER EnsureGetter(const char *mlnetpath, const char *coreclrpath, const char *dpreppath)
{
if (_getter != nullptr)
return _getter;

std::wstring libsdir = Utf8ToUtf16le(nimbuslibspath);
std::wstring libsdir = Utf8ToUtf16le(mlnetpath);
ConvertToWinPath(libsdir);

std::wstring coreclrdir;
if (strlen(coreclrpath) != 0)
std::wstring coreclrdir = Utf8ToUtf16le(coreclrpath);
ConvertToWinPath(coreclrdir);

std::wstring dprepdir;
if (strlen(dpreppath) != 0)
{
coreclrdir = Utf8ToUtf16le(coreclrpath);
ConvertToWinPath(coreclrdir);
dprepdir = Utf8ToUtf16le(dpreppath);
ConvertToWinPath(dprepdir);
}
else
{
coreclrdir = libsdir;
dprepdir = libsdir;
}

ICLRRuntimeHost2* host = EnsureClrHost(libsdir.c_str(), coreclrdir.c_str());
ICLRRuntimeHost2* host = EnsureClrHost(libsdir.c_str(), coreclrdir.c_str(), dprepdir.c_str());
if (host == nullptr)
return nullptr;

Expand Down
22 changes: 13 additions & 9 deletions src/NativeBridge/dllmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
#define PARAM_SEED "seed"
#define PARAM_GRAPH "graph"
#define PARAM_VERBOSE "verbose"
#define PARAM_NIMBUSML_PATH "nimbusmlPath"
#define PARAM_MLNET_PATH "mlnetPath"
#define PARAM_DOTNETCLR_PATH "dotnetClrPath"
#define PARAM_DPREP_PATH "dprepPath"
#define PARAM_DATA "data"


Expand Down Expand Up @@ -44,14 +45,14 @@ static MlNetInterface *g_mlnetInterface = nullptr;
static GENERICEXEC g_exec = nullptr;

// Ensure that we have the DotNetBridge managed code entry point.
GENERICEXEC EnsureExec(const char *nimbuslibspath, const char *coreclrpath)
GENERICEXEC EnsureExec(const char *mlnetpath, const char *coreclrpath, const char *dpreppath)
{
if (g_mlnetInterface == nullptr)
g_mlnetInterface = new MlNetInterface();

if (g_exec == nullptr)
{
FNGETTER getter = g_mlnetInterface->EnsureGetter(nimbuslibspath, coreclrpath);
FNGETTER getter = g_mlnetInterface->EnsureGetter(mlnetpath, coreclrpath, dpreppath);
if (getter != nullptr)
g_exec = (GENERICEXEC)getter(FnIdGenericExec);
}
Expand All @@ -70,20 +71,23 @@ bp::dict pxCall(bp::dict& params)
try
{
bp::extract<std::string> graph(params[PARAM_GRAPH]);
bp::extract<std::string> nimbusmlPath(params[PARAM_NIMBUSML_PATH]);
bp::extract<std::string> mlnetPath(params[PARAM_MLNET_PATH]);
bp::extract<std::string> dotnetClrPath(params[PARAM_DOTNETCLR_PATH]);
bp::extract<std::string> dprepPath(params[PARAM_DPREP_PATH]);
bp::extract<std::int32_t> verbose(params[PARAM_VERBOSE]);
std::int32_t i_verbose = std::int32_t(verbose);
std::string s_nimbusmlPath = std::string(nimbusmlPath);
std::string s_mlnetPath = std::string(mlnetPath);
std::string s_dotnetClrPath = std::string(dotnetClrPath);
std::string s_dprepPath = std::string(dprepPath);
std::string s_graph = std::string(graph);
const char *nimbuslibspath = s_nimbusmlPath.c_str();
const char *mlnetpath = s_mlnetPath.c_str();
const char *coreclrpath = s_dotnetClrPath.c_str();
const char *dpreppath = s_dprepPath.c_str();

GENERICEXEC exec = EnsureExec(nimbuslibspath, coreclrpath);
GENERICEXEC exec = EnsureExec(mlnetpath, coreclrpath, dpreppath);
if (exec == nullptr)
throw std::invalid_argument("Failed to communicate with the managed library. Path searched: "
+ s_nimbusmlPath + " and " + s_dotnetClrPath);
throw std::invalid_argument("Failed to communicate with the managed library. Paths searched: "
+ s_mlnetPath + " and " + s_dotnetClrPath);

int seed = 42;
if (params.has_key(PARAM_SEED))
Expand Down
3 changes: 2 additions & 1 deletion src/Platforms/build.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<PropertyGroup>
<AssemblyName>dummy</AssemblyName>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.0</TargetFramework>
<TargetFramework>netcoreapp2.1</TargetFramework>
<Platforms>x64</Platforms>
<Configurations>DbgWinPy3.7;DbgWinPy3.6;DbgWinPy3.5;DbgWinPy2.7;RlsWinPy3.7;RlsWinPy3.6;RlsWinPy3.5;RlsWinPy2.7;DbgLinPy3.7;DbgLinPy3.6;DbgLinPy3.5;DbgLinPy2.7;RlsLinPy3.7;RlsLinPy3.6;RlsLinPy3.5;RlsLinPy2.7;RlsMacPy3.7;RlsMacPy3.6</Configurations>
<OutputPath>$(ProjectDir)..\..\x64\$(Configuration)\Platform\</OutputPath>
Expand All @@ -21,6 +21,7 @@
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.2.0" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.14.0" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.2.0" />
<PackageReference Include="Microsoft.DataPrep" Version="0.0.1.5-preview" />
</ItemGroup>

</Project>
8 changes: 6 additions & 2 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
<ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
<LaunchProvider>Standard Python launcher</LaunchProvider>
<Name>nimbusml</Name>
<InterpreterId>Global|VisualStudio|Py3.7</InterpreterId>
<InterpreterId>Global|VisualStudio|MinePy37</InterpreterId>
<InterpreterPath>..\..\dependencies\Python3.7\python.exe</InterpreterPath>
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
<StartupFile>nimbusml\tests\dprep\test_dprep.py</StartupFile>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)' == 'Debug'" />
<PropertyGroup Condition="'$(Configuration)' == 'Release'" />
Expand Down Expand Up @@ -581,6 +582,8 @@
<Compile Include="nimbusml\tests\decomposition\test_pcaanomalydetector.py" />
<Compile Include="nimbusml\tests\decomposition\test_pcatransformer.py" />
<Compile Include="nimbusml\tests\decomposition\__init__.py" />
<Compile Include="nimbusml\tests\dprep\test_dprep.py" />
<Compile Include="nimbusml\tests\dprep\__init__.py" />
<Compile Include="nimbusml\tests\ensemble\test_fasttreesbinaryclassifier.py" />
<Compile Include="nimbusml\tests\ensemble\test_fasttreestweedieregressor.py" />
<Compile Include="nimbusml\tests\ensemble\test_gambinaryclassifier.py" />
Expand Down Expand Up @@ -735,6 +738,7 @@
<Folder Include="docs\sphinx\modules\svm\kernel\" />
<Folder Include="docs\sphinx\_static\" />
<Folder Include="docs\sphinx\_static\images\" />
<Folder Include="nimbusml\tests\dprep\" />
<Folder Include="tests_extended\" />
<Folder Include="nimbusml\" />
<Folder Include="nimbusml\cluster\" />
Expand Down Expand Up @@ -1136,7 +1140,7 @@
<Content Include="tools\manifest_diff.json" />
</ItemGroup>
<ItemGroup>
<InterpreterReference Include="Global|VisualStudio|Py3.7" />
<InterpreterReference Include="Global|VisualStudio|MinePy37" />
</ItemGroup>
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
</Project>
1 change: 1 addition & 0 deletions src/python/nimbusml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .internal.utils.data_roles import Role
from .internal.utils.data_schema import DataSchema
from .internal.utils.data_stream import BinaryDataStream
from .internal.utils.data_stream import DprepDataStream
from .internal.utils.data_stream import FileDataStream
from .internal.utils.utils import run_tests
from .pipeline import Pipeline
Expand Down
33 changes: 33 additions & 0 deletions src/python/nimbusml/internal/utils/data_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
"""
Owns nimbusml's containers.
"""
import os
import tempfile
from shutil import copyfile

from .data_roles import DataRoles
Expand Down Expand Up @@ -467,3 +469,34 @@ def clone(self):
"Method clone was not overwritten for class '{0}'".format(
type(self)))
return BinaryDataStream(self._filename)


class DprepDataStream(BinaryDataStream):
    """
    Defines a data view over a dprep file.

    The stream can be built in two ways:

    * from a ``dataflow`` object: the result of ``dataflow.to_json()`` is
      written to a newly created temporary file with the ``.dprep``
      suffix, and the stream points at that file (a ``filename``
      argument passed alongside is ignored);
    * from ``filename`` alone: the stream points at that path as-is.

    :param dataflow: object exposing a ``to_json()`` method
        (presumably an azureml-dataprep dataflow returning a JSON
        string -- TODO confirm against the dprep API).
    :param filename: path of a dprep file; used only when
        ``dataflow`` is None.
    :raises ValueError: when both ``dataflow`` and ``filename`` are None.
    """

    def __init__(self, dataflow=None, filename=None):
        if dataflow is None and filename is None:
            raise ValueError('Both dataflow object and filename are None')
        super(DprepDataStream, self).__init__(DataSchema(""))
        if dataflow is not None:
            # Serialize first so that a failure in to_json() does not
            # leave an empty temporary file and an open descriptor behind.
            serialized = dataflow.to_json()
            (fd, filename) = tempfile.mkstemp(suffix='.dprep')
            # The context manager closes the descriptor even when the
            # write fails.
            with os.fdopen(fd, "wt") as fl:
                fl.write(serialized)
            # NOTE(review): nothing in this class removes the temporary
            # file afterwards; confirm whether a caller is expected to.
        self._filename = filename

    def __repr__(self):
        return "DprepDataStream('{2}',\n    '{0}',\n    {1})".format(
            self._schema, self._roles, self._filename.replace('\\', '\\\\'))

    def clone(self):
        """
        Copy/clone the object.

        The clone refers to the same dprep file as this stream.
        """
        if not isinstance(self, DprepDataStream):
            raise NotImplementedError(
                "Method clone was not overwritten for class '{0}'".format(
                    type(self)))
        # The path must be passed by keyword: the first positional
        # parameter of __init__ is ``dataflow``, and passing a path there
        # would end up calling ``to_json()`` on a string.
        return DprepDataStream(filename=self._filename)
Loading

0 comments on commit c2f2b6b

Please sign in to comment.