From 53488362ec845bb210579c0f21ee613980385f68 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 4 Jun 2019 19:55:59 -0700 Subject: [PATCH 1/4] upgrade to ML.NET 1.1 --- src/DotNetBridge/DotNetBridge.csproj | 20 ++++++++++---------- src/Platforms/build.csproj | 18 +++++++++--------- src/python/nimbusml/__init__.py | 2 +- src/python/setup.py | 2 +- version.txt | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index 1c1cb0e6..92365878 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -31,15 +31,15 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - - - + + + + + + + + + + diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index b9b3ae1a..7491fac8 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -11,15 +11,15 @@ - - - - - - - - - + + + + + + + + + diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py index 249155ed..d8da5d6d 100644 --- a/src/python/nimbusml/__init__.py +++ b/src/python/nimbusml/__init__.py @@ -2,7 +2,7 @@ Microsoft Machine Learning for Python """ -__version__ = '1.0.2' +__version__ = '1.1.0' # CoreCLR version of MicrosoftML is built on Windows. # But file permissions are not preserved when it's copied to Linux. diff --git a/src/python/setup.py b/src/python/setup.py index 60979dd7..2ed6c93d 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.0.2', + version='1.1.0', description='NimbusML', long_description=long_description, diff --git a/version.txt b/version.txt index e6d5cb83..1cc5f657 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.0.2 \ No newline at end of file +1.1.0 \ No newline at end of file From 6eedc73c8033813570a50a5e6a7746c9a962ecb0 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 4 Jun 2019 20:28:35 -0700 Subject: [PATCH 2/4] by default quote is + --- .../nimbusml/internal/utils/data_schema.py | 2 + .../text/test_ngramfeaturizer.py | 6 +- src/python/nimbusml/tests/test_data_schema.py | 82 +++++++++---------- src/python/nimbusml/tests/test_data_stream.py | 10 +-- 4 files changed, 51 insertions(+), 49 deletions(-) diff --git a/src/python/nimbusml/internal/utils/data_schema.py b/src/python/nimbusml/internal/utils/data_schema.py index 51ff5c82..0fb409e1 100644 --- a/src/python/nimbusml/internal/utils/data_schema.py +++ b/src/python/nimbusml/internal/utils/data_schema.py @@ -472,6 +472,8 @@ def format_options(self, add_sep=False): val = ['quote+'] for k, v in sorted(opts.items()): + if k == 'quote': + continue if isinstance(v, bool): v = "+" if v else '-' elif k == 'sep' and v == '\t': diff --git a/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py index 592d1665..084e38af 100644 --- a/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py +++ b/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py @@ -198,20 +198,20 @@ def test_ngramfeaturizer_single(self): path = get_dataset('infert').as_filepath() file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \ 'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \ - 'col=spontaneous:R4:6 header=+' + 'col=spontaneous:R4:6 quote+ header=+' data = FileDataStream(path, schema=file_schema) xf = NGramFeaturizer(word_feature_extractor=n_gram(), columns={'features': ['id', 'education']}) features = xf.fit_transform(data) - assert features.shape == (248, 637) + assert features.shape == (248, 652) def test_ngramfeaturizer_multi(self): path = get_dataset('infert').as_filepath() file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \ 'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \ - 'col=spontaneous:R4:6 header=+' + 'col=spontaneous:R4:6 quote+ header=+' data = FileDataStream(path, schema=file_schema) try: xf = NGramFeaturizer(word_feature_extractor=n_gram(), diff --git a/src/python/nimbusml/tests/test_data_schema.py b/src/python/nimbusml/tests/test_data_schema.py index 3b48266e..d2a59439 100644 --- a/src/python/nimbusml/tests/test_data_schema.py +++ b/src/python/nimbusml/tests/test_data_schema.py @@ -73,7 +73,7 @@ def test_data_schema_collapse_no(self): self.assertEqual( s, 'col=tt:TX:0 col=ff:R8:1 col=ff2:R8:2 col=tt1:TX:3 ' - 'col=ii:I8:4 col=gg:R8:5 header=+') + 'col=ii:I8:4 col=gg:R8:5 quote+ header=+') def test_data_schema_collapse_yes(self): @@ -92,7 +92,7 @@ def test_data_schema_collapse_yes(self): self.assertEqual( s, 'col=tt:TX:0 col=ff:R8:1-2 col=tt1:TX:3 col=ii:I8:4 ' - 'col=gg:R8:5 header=+') + 'col=gg:R8:5 quote+ header=+') def test_data_schema_collapse_no_file(self): @@ -110,7 +110,7 @@ def test_data_schema_collapse_no_file(self): self.assertEqual( s, 'col=tt:TX:0 col=ff:R8:1 col=ff2:R8:2 col=tt1:TX:3 ' - 'col=ii:I8:4 col=gg:R8:5 header=+') + 'col=ii:I8:4 col=gg:R8:5 quote+ header=+') def test_data_schema_collapse_yes_file(self): @@ -128,7 +128,7 @@ def test_data_schema_collapse_yes_file(self): self.assertEqual( s, 'col=tt:TX:0 col=ff:R8:1-2 col=tt1:TX:3 col=ii:I8:4 ' - 'col=gg:R8:5 header=+') + 'col=gg:R8:5 quote+ header=+') @unittest.skip( reason="needs another entrypoint to guess the schema with nimbusml, " @@ -178,9 +178,9 @@ def test_data_schema(self): s2 = DataSchema([DataColumn(name='text', type='TX', pos=5)]) assert list(s0.columns.keys()) == ['text'] assert list(s1.columns.keys()) == ['text'] - assert str(s1) == 'col=text:TX:5' - assert str(s2) == 'col=text:TX:5' - assert str(s0) == 'col=text:TX:5' + assert str(s1) == 'col=text:TX:5 quote+' + assert str(s2) == 'col=text:TX:5 quote+' + assert str(s0) == 'col=text:TX:5 quote+' assert s1 == s2 assert s1 == s0 assert s1 == DataSchema(s0) @@ -196,31 +196,31 @@ def test_data_schema_read_schema(self): d=[False, True])) sch = DataSchema.read_schema(df) assert str( - sch) == 'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 header=+' + sch) == 'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 quote+ header=+' sch = DataSchema.read_schema(df, sep=',') assert str( sch) == 'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 ' \ - 'header=+ sep=,' + 'quote+ header=+ sep=,' csr = csr_matrix([[0, 1], [1, 0]], dtype='int32') sch = DataSchema.read_schema(csr, sep=',') - assert str(sch) == 'col=Data:I4:0-1 header=+ sep=,' + assert str(sch) == 'col=Data:I4:0-1 quote+ header=+ sep=,' csr = matrix([[0, 1], [1, 0]], dtype='int32') sch = DataSchema.read_schema(csr, sep=',') - assert str(sch) == 'col=Data:I4:0-1 header=+ sep=,' + assert str(sch) == 'col=Data:I4:0-1 quote+ header=+ sep=,' csr = matrix([[0, 1], [1.5, 0.5]]) sch = DataSchema.read_schema(csr, sep=',') - assert str(sch) == 'col=Data:R8:0-1 header=+ sep=,' + assert str(sch) == 'col=Data:R8:0-1 quote+ header=+ sep=,' def test_data_schema_read_schema_tab(self): df = pandas.DataFrame(dict(a=[0, 1], b=[0.1, 1.1], c=['r', 'd'], d=[False, True])) sch = DataSchema.read_schema(df) assert str( - sch) == 'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 header=+' + sch) == 'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 quote+ header=+' sch = DataSchema.read_schema(df, sep='\t') assert str( sch) == 'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 ' \ - 'header=+ sep=tab' + 'quote+ header=+ sep=tab' def test_schema_infert(self): train_file = get_dataset("infert").as_filepath() @@ -228,7 +228,7 @@ def test_schema_infert(self): schema = "col=row_num:I8:0 col=education:TX:1 col=age:I8:2 " \ "col=parity:I8:3 col=induced:I8:4 " + \ "col=case:I8:5 col=spontaneous:I8:6 col=stratum:I8:7 " \ - "col=pooled.stratum:I8:8 header=+" + "col=pooled.stratum:I8:8 quote+ header=+" assert str(found) == schema fds = FileDataStream(train_file, schema) assert str(fds.schema) == schema @@ -242,7 +242,7 @@ def test_schema_infert_R4(self): schema = "col=row_num:R4:0 col=education:TX:1 col=age:R4:2 " \ "col=parity:R4:3 col=induced:R4:4 " + \ "col=case:R4:5 col=spontaneous:R4:6 col=stratum:R4:7 " \ - "col=pooled.stratum:R4:8 header=+" + "col=pooled.stratum:R4:8 quote+ header=+" assert str(found) == schema fds = FileDataStream(train_file, schema) assert str(fds.schema) == schema @@ -257,7 +257,7 @@ def test_schema_infert_R4one(self): schema = "col=row_num:I8:0 col=education:TX:1 col=age:R4:2 " \ "col=parity:I8:3 col=induced:I8:4 " + \ "col=case:I8:5 col=spontaneous:I8:6 col=stratum:I8:7 " \ - "col=pooled.stratum:I8:8 header=+" + "col=pooled.stratum:I8:8 quote+ header=+" assert str(found) == schema fds = FileDataStream(train_file, schema) assert str(fds.schema) == schema @@ -270,7 +270,7 @@ def test_schema_airquality(self): found = DataSchema.read_schema(train_file) schema = "col=Unnamed0:I8:0 col=Ozone:R8:1 col=Solar_R:R8:2 " \ "col=Wind:R8:3 col=Temp:I8:4 col=Month:I8:5 " \ - "col=Day:I8:6 header=+" + "col=Day:I8:6 quote+ header=+" assert str(found) == schema fds = FileDataStream(train_file, schema) assert str(fds.schema) == schema @@ -288,7 +288,7 @@ def test_schema_collapse_all(self): assert str( file_schema) == "col=row_num:R4:0 col=education:TX:1 " \ "col=Features:R4:2-4,6-8 col=case:R4:5 " \ - "header=+ sep=," + "quote+ header=+ sep=," def test_schema_documentation(self): @@ -303,7 +303,7 @@ def test_schema_documentation(self): if sys.version_info[:2] >= (3, 6): assert str( schema) == 'col=real:R8:0 col=integer:I8:1 col=text:TX:2 ' \ - 'col=real32:R4:3 header=+' + 'col=real32:R4:3 quote+ header=+' data = DataFrame( OrderedDict( @@ -316,7 +316,7 @@ def test_schema_documentation(self): if sys.version_info[:2] >= (3, 6): assert str( schema) == 'col=real:R8:0 col=integer:I8:1 col=text:TX:2' \ - ' header=+' + ' quote+ header=+' data = DataFrame( OrderedDict( @@ -329,7 +329,7 @@ def test_schema_documentation(self): if sys.version_info[:2] >= (3, 6): assert str( schema) == 'col=real:R8:0 col=integer:I8:1 col=text:TX:2' \ - ' header=+' + ' quote+ header=+' data = DataFrame( OrderedDict( @@ -343,7 +343,7 @@ def test_schema_documentation(self): if sys.version_info[:2] >= (3, 6): assert str( schema) == 'col=real:R8:0-1 col=integer:I8:2 ' \ - 'col=text:TX:3 header=+' + 'col=text:TX:3 quote+ header=+' data = DataFrame( OrderedDict( @@ -357,7 +357,7 @@ def test_schema_documentation(self): 1: 'newname2'}) if sys.version_info[:2] >= (3, 6): assert str( - schema) == 'col=newname:R8:0 col=newname2:TX:1-2 header=+' + schema) == 'col=newname:R8:0 col=newname2:TX:1-2 quote+ header=+' data = DataFrame( OrderedDict( @@ -371,7 +371,7 @@ def test_schema_documentation(self): if sys.version_info[:2] >= (3, 6): assert str( schema) == 'col=real:R8:0 col=text_0:TX:1 ' \ - 'col=text_1:TX:2 header=+' + 'col=text_1:TX:2 quote+ header=+' data = DataFrame(OrderedDict(real=[0.1, 0.2], text1=["a", "b"])) data.to_csv('data.txt', index=False) @@ -379,7 +379,7 @@ def test_schema_documentation(self): 'data.txt', collapse=True, dtype={ 'real': numpy.float32}) if sys.version_info[:2] >= (3, 6): - assert str(schema) == 'col=real:R4:0 col=text1:TX:1 header=+' + assert str(schema) == 'col=real:R4:0 col=text1:TX:1 quote+ header=+' for c in schema: assert repr(c).startswith("DataColumn(name='") assert repr(schema).startswith("DataSchema([DataColumn(name='") @@ -399,7 +399,7 @@ def test_schema_tab(self): assert str( train_file_stream.schema) == 'col=review:TX:0 ' \ 'col=review_reverse:TX:1 ' \ - 'col=label:I8:2 header=+ sep=,' + 'col=label:I8:2 quote+ header=+ sep=,' train_file_stream = FileDataStream.read_csv( train_file, sep=',', names={ @@ -408,7 +408,7 @@ def test_schema_tab(self): assert str( train_file_stream.schema) == 'col=review:TX:0 ' \ 'col=review_reverse:TX:1 ' \ - 'col=label:U4:2 header=+ sep=,' + 'col=label:U4:2 quote+ header=+ sep=,' def test_schema_dtype_regex(self): path = get_dataset('gen_tickettrain').as_filepath() @@ -431,7 +431,7 @@ def test_schema_dtype_regex(self): assert str( file_schema) == 'col=Label:R4:0 col=GroupId:TX:1 ' \ 'col=carrier:TX:2 col=Features:R4:3-7 ' \ - 'header=+ sep=,' + 'quote+ header=+ sep=,' def test_schema_dtype_slice(self): path = get_dataset('gen_tickettrain').as_filepath() @@ -443,20 +443,20 @@ def test_schema_dtype_slice(self): assert str( file_schema) == 'col=Label:R4:0 col=GroupId:TX:1 ' \ 'col=carrier:TX:2 col=price:R4:3 ' \ - 'col=Class:I8:4-6 col=duration:R8:7 header=+ ' \ + 'col=Class:I8:4-6 col=duration:R8:7 quote+ header=+ ' \ 'sep=,' def test_schema_dtype_list_int(self): li = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]] schema = DataSchema.read_schema(li) assert str( - schema) == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+' + schema) == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 quote+ header=+' def test_schema_dtype_list_trueint(self): li = [[1, 1, 2], [3, 5, 6]] schema = DataSchema.read_schema(li) assert str( - schema) == 'col=c0:I8:0 col=c1:I8:1 col=c2:I8:2 header=+' + schema) == 'col=c0:I8:0 col=c1:I8:1 col=c2:I8:2 quote+ header=+' def test_schema_dtype_numpy_trueint(self): li = [[1, 1, 2], [3, 5, 6]] @@ -465,9 +465,9 @@ def test_schema_dtype_numpy_trueint(self): schema = DataSchema.read_schema(mat) # The behavior is not the same on every OS. if dt == numpy.int64: - assert str(schema) == 'col=Data:I8:0-2 header=+' + assert str(schema) == 'col=Data:I8:0-2 quote+ header=+' elif dt == numpy.int32: - assert str(schema) == 'col=Data:I4:0-2 header=+' + assert str(schema) == 'col=Data:I4:0-2 quote+ header=+' else: raise TypeError("unexpected type {0}".format(dt)) @@ -475,7 +475,7 @@ def test_schema_dtype_numpy_float(self): li = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]] mat = numpy.array(li) schema = DataSchema.read_schema(mat) - assert str(schema) == 'col=Data:R8:0-2 header=+' + assert str(schema) == 'col=Data:R8:0-2 quote+ header=+' def test_schema_sep_default(self): data = pandas.DataFrame( @@ -490,12 +490,12 @@ def test_schema_sep_default(self): collapse=False, numeric_dtype=numpy.float32) assert str( - ds.schema) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 header=+" + ds.schema) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 quote+ header=+" assert ds.schema.to_string() == "col=real:R4:0 col=text:TX:1 " \ - "col=y:R4:2 header=+" + "col=y:R4:2 quote+ header=+" assert ds.schema.to_string( add_sep=True) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 " \ - "header=+ sep=," + "quote+ header=+ sep=," exp = Pipeline([OneHotVectorizer(columns=['text']), LightGbmRegressor(minimum_example_count_per_leaf=1)]) exp.fit(ds, 'y') @@ -512,7 +512,7 @@ def test_schema__repr(self): "col=age:R4:2 col=parity:R4:3 " \ "col=induced:R4:4 col=case:R4:5 " \ "col=spontaneous:R4:6 col=stratum:R4:7 " \ - "col=pooled.stratum:R4:8 header=+ sep=," + "col=pooled.stratum:R4:8 quote+ header=+ sep=," assert "DataSchema([DataColumn(name='row_num', type='R4', " \ "pos=0)" in str(repr(data.schema)) @@ -520,7 +520,7 @@ def test_schema__repr(self): data = FileDataStream.read_csv( path, sep=',', numeric_dtype=numpy.float32, collapse=True) assert str( - data.schema) == "col=review:TX:0-1 col=label:R4:2 header=+ " \ + data.schema) == "col=review:TX:0-1 col=label:R4:2 quote+ header=+ " \ "sep=," assert "DataSchema([DataColumn(name='review', type='TX', pos=(0," \ " 1))" in str(repr(data.schema)) @@ -530,7 +530,7 @@ def test_schema__repr(self): path, sep=',', numeric_dtype=numpy.float32, collapse=False) assert str( data.schema) == "col=review:TX:0 col=review_reverse:TX:1 " \ - "col=label:R4:2 header=+ sep=," + "col=label:R4:2 quote+ header=+ sep=," assert "DataSchema([DataColumn(name='review', type='TX', pos=0)," \ in str(repr(data.schema)) diff --git a/src/python/nimbusml/tests/test_data_stream.py b/src/python/nimbusml/tests/test_data_stream.py index 42f9e140..744c1854 100644 --- a/src/python/nimbusml/tests/test_data_stream.py +++ b/src/python/nimbusml/tests/test_data_stream.py @@ -34,24 +34,24 @@ def test_data_header_no_dataframe(self): li = [1.0, 1.0, 2.0] df = pandas.DataFrame(li) schema0 = DataSchema.read_schema(df) - assert str(schema0) == 'col=c0:R8:0 header=+' + assert str(schema0) == 'col=c0:R8:0 quote+ header=+' li = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]] schema1 = DataSchema.read_schema(li) - assert str(schema1) == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+' + assert str(schema1) == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 quote+ header=+' df = pandas.DataFrame([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]) schema2 = DataSchema.read_schema(df) - assert str(schema2) == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+' + assert str(schema2) == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 quote+ header=+' mat = numpy.array([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]) schema3 = DataSchema.read_schema(mat) - assert str(schema3) == 'col=Data:R8:0-2 header=+' + assert str(schema3) == 'col=Data:R8:0-2 quote+ header=+' li = [1.0, 1.0, 2.0] df = pandas.DataFrame(li) schema0 = DataSchema.read_schema(df, header=False) - assert str(schema0) == 'col=c0:R8:0 header=-' + assert str(schema0) == 'col=c0:R8:0 quote+ header=-' def test_data_stream_head_file(self): df = pandas.DataFrame(dict(a=[0, 1], b=[0.1, 0.2])) From b0db93e1c0e187dcd31fd33b36a8a6af076763d3 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 4 Jun 2019 20:44:21 -0700 Subject: [PATCH 3/4] assert changes due to quote --- .../feature_extraction/text/test_wordembedding.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py b/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py index 31d46f9a..805fec02 100644 --- a/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py +++ b/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py @@ -76,7 +76,7 @@ def test_word_embedding_example(self): path = get_dataset('infert').as_filepath() file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \ 'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \ - 'col=spontaneous:R4:6 header=+' + 'col=spontaneous:R4:6 quote+ header=+' data = FileDataStream(path, schema=file_schema) # transform usage @@ -92,7 +92,7 @@ def test_word_embedding_example(self): ]) features = pipeline.fit_transform(data) - assert features.shape == (248, 787) + assert features.shape == (248, 802) # TODO: fix ssl issue on test centos7 & ubuntu14 boxes. # Test works on ubuntu16. @@ -117,7 +117,7 @@ def test_word_embedding_example2(self): path = get_dataset('infert').as_filepath() file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \ 'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \ - 'col=spontaneous:R4:6 header=+' + 'col=spontaneous:R4:6 quote+ header=+' data = FileDataStream(path, schema=file_schema) pipeline = Pipeline([ @@ -129,7 +129,7 @@ def test_word_embedding_example2(self): ]) features = pipeline.fit_transform(data) - assert features.shape == (248, 787) + assert features.shape == (248, 802) assert 'features_TransformedText.94' in list(features.columns) # TODO: fix ssl issue on test centos7 & ubuntu14 boxes. @@ -155,7 +155,7 @@ def test_word_embedding_example_dict_same_name(self): path = get_dataset('infert').as_filepath() file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \ 'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \ - 'col=spontaneous:R4:6 header=+' + 'col=spontaneous:R4:6 quote+ header=+' data = FileDataStream(path, schema=file_schema) pipeline = Pipeline([ NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens_column_name='features_TransformedText', @@ -168,14 +168,14 @@ def test_word_embedding_example_dict_same_name(self): ]) features = pipeline.fit_transform(data) - assert features.shape == (248, 787) + assert features.shape == (248, 802) @unittest.skip('System.ArgumentOutOfRangeException') def test_word_embedding_example_dict_newname(self): path = get_dataset('infert').as_filepath() file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \ 'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \ - 'col=spontaneous:R4:6 header=+' + 'col=spontaneous:R4:6 quote+ header=+' data = FileDataStream(path, schema=file_schema) pipeline = Pipeline([ NGramFeaturizer(word_feature_extractor=Ngram(), From f21a02d831d2fb95faf2a7d06600b189ccbd31d2 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 4 Jun 2019 21:30:13 -0700 Subject: [PATCH 4/4] fix tensor flow example --- src/python/tests/test_docs_example.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/python/tests/test_docs_example.py b/src/python/tests/test_docs_example.py index f8c8eeac..50333cd9 100644 --- a/src/python/tests/test_docs_example.py +++ b/src/python/tests/test_docs_example.py @@ -118,6 +118,10 @@ def test_examples(self): "Your CPU supports instructions that this TensorFlow", "CacheClassesFromAssembly: can't map name " "OLSLinearRegression to Void, already mapped to Void", + # TensorFlowScorer.py + "tensorflow/compiler/xla/service/service.cc:150] XLA service", + "tensorflow/compiler/xla/service/service.cc:158] StreamExecutor device", + "tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency:", # Binner.py "from collections import Mapping, defaultdict", "DeprecationWarning: Using or importing the ABCs",