
Commit

Merge pull request #21 from dato-code/1.6
Update to version 1.6
Yucheng Low committed Sep 23, 2015
2 parents f54302f + b75c9e7 commit c8947e6
Showing 9 changed files with 35 additions and 26 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -41,7 +41,7 @@ We release an updated version about once every 1.5 months or so. You can download
the most recent version directly from [pypi](https://pypi.python.org/pypi/SFrame)
or using pip:

-    pip install sframe
+    pip install -U sframe
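The `-U` here is shorthand for pip's `--upgrade` flag: without it, pip leaves an already-installed sframe untouched, so existing users would never pick up the new 1.6 release.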

**Requirements**

1 change: 1 addition & 0 deletions oss_local_scripts/conda_requirements.txt
@@ -16,5 +16,6 @@ scikit-learn==0.16.1
scipy==0.15.1
six==1.9.0
tornado==4.1
wheel==0.24.0
statsmodels
PIL
4 changes: 2 additions & 2 deletions oss_src/sframe/CMakeLists.txt
@@ -55,6 +55,6 @@ make_copy_target(_local_sys_util_
pylambda
)
add_dependencies(spark_unity _local_sys_util_)
-file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.1.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar
-    EXPECTED_MD5 f5e0653648f0b474cee57bd134d9ee83)
+file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.3.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar
+    EXPECTED_MD5 b6261c8614da3da1c89dce64fed09420)
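This bumps the prebuilt spark_unity jar from 0.1 to 0.3. CMake's `file(DOWNLOAD ... EXPECTED_MD5 ...)` fetches the artifact at configure time and fails if the MD5 of the downloaded file does not match, which is why the checksum has to be updated in lockstep with the URL.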

8 changes: 2 additions & 6 deletions oss_src/sframe/spark_unity.cpp
@@ -625,12 +625,8 @@ int concat_main(std::string & _output_directory, std::string & _prefix) {

}

-auto first_sframe_ptr = std::make_shared<sframe>(list_filenames[0]);
-sframe append_sframe;
-append_sframe.open_for_write(first_sframe_ptr->column_names(),first_sframe_ptr->column_types(), "", 1, false);
-append_sframe.close();
-
-for(int index=0;index<list_filenames.size();index++) {
+sframe append_sframe(list_filenames[0]);
+for(int index=1;index<list_filenames.size();index++) {
auto sframe_ptr = std::make_shared<sframe>(list_filenames[index]);
append_sframe = append_sframe.append(*sframe_ptr);
}
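The old code built an empty output SFrame by copying the first input's column names and types via open_for_write(), then appended every input, including the first, in the loop. The new code simply loads the first input as the starting frame and appends the rest, dropping the open/close dance and the redundant re-read of the first file. A minimal sketch of the same first-frame-then-append pattern, written against the Python sframe package (the part-file paths are hypothetical):

    from sframe import SFrame

    # Hypothetical on-disk SFrame parts produced by an earlier job.
    part_files = ['parts/part-0', 'parts/part-1', 'parts/part-2']

    # Start from the first part, then append the remaining ones;
    # append() returns a new SFrame rather than mutating in place.
    result = SFrame(part_files[0])
    for path in part_files[1:]:
        result = result.append(SFrame(path))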
@@ -40,7 +40,6 @@ Spark RDD

SFrame.from_rdd
SFrame.to_rdd
-SFrame.to_schema_rdd

SQL Database
----------------
2 changes: 1 addition & 1 deletion oss_src/unity/python/setup.py
@@ -160,7 +160,7 @@ def run(self):
exclude=["*.tests", "*.tests.*", "tests.*", "tests", "*.test", "*.test.*", "test.*", "test"]),
url='https://dato.com',
license='LICENSE.txt',
-description='SFrame enables developers and data scientists to apply machine learning to build state of the art data products.',
+description='SFrame is a scalable, out-of-core dataframe, which allows you to work with datasets that are larger than the amount of RAM on your system.',
# long_description=open('README.txt').read(),
classifiers=classifiers,
install_requires=[
2 changes: 1 addition & 1 deletion oss_src/unity/python/sframe/data_structures/image.py
@@ -223,7 +223,7 @@ def show(self):
Displays the image. Requires PIL/Pillow.
Alternatively, you can create an :class:`graphlab.SArray` of this image
-and use :func:`graphlab.SArray.show()`
+and use py:func:`graphlab.SArray.show()`
See Also
--------
36 changes: 25 additions & 11 deletions oss_src/unity/python/sframe/data_structures/sframe.py
@@ -1697,10 +1697,11 @@ def to_spark_dataframe(self,sc,sql,number_of_partitions=4):
>>> from pyspark import SparkContext, SQLContext
>>> from graphlab import SFrame
>>> from pyspark.sql import SQLContext
+>>> sc = SparkContext('local')
->>> sqlc = SQLContext(sc)
+>>> sql = SQLContext(sc)
>>> sf = SFrame({'x': [1,2,3], 'y': ['fish', 'chips', 'salad']})
->>> df = sf.to_spark_dataframe(sc, sqlc)
+>>> df = sf.to_spark_dataframe(sc, sql)
>>> df.show()
x y
1 fish
@@ -1902,15 +1903,28 @@ def from_rdd(cls, rdd, cur_sc):
df, tmp_loc, finalSFramePrefix)
else:
if encoding == 'utf8':
-finalSFrameFilename = graphlab_util_ref.toSFrame(
-    rdd._jrdd.rdd(),tmp_loc, finalSFramePrefix)
-else:
-# Prep the additional arguments to feed into the pySparkToSFrame function in Java
-# that will call the spark_unity binary which does the actual encoding
-additiona_args = os.path.join(" --encoding=%s " % encoding +\
-    " --type=rdd ")
-finalSFrameFilename = graphlab_util_ref.pySparkToSFrame(
-    rdd._jrdd, tmp_loc, finalSFramePrefix, additiona_args)
+## TODO: This is a temporary solution. Here we are completely bypassing the
+## toSFrame() codepath when encoding is 'utf8', because of a Spark 1.5
+## closure-cleaning error on deeply nested functions.
+
+def f(iterator):
+    for obj in iterator:
+        yield obj.encode("utf-8")
+
+rdd = rdd.mapPartitions(f)
+encoding = "batch"
+if(rdd._jrdd_deserializer.__class__.__name__ == 'PickleSerializer'):
+    encoding = "pickle"
+
+#finalSFrameFilename = graphlab_util_ref.toSFrame(
+#    rdd._jrdd.rdd(),tmp_loc, finalSFramePrefix)
+#else:
+# Prep the additional arguments to feed into the pySparkToSFrame function in Java
+# that will call the spark_unity binary which does the actual encoding
+additiona_args = os.path.join(" --encoding=%s " % encoding +\
+    " --type=rdd ")
+finalSFrameFilename = graphlab_util_ref.pySparkToSFrame(
+    rdd._jrdd, tmp_loc, finalSFramePrefix, additiona_args)

# Load and return the sframe
sf = SFrame()
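In short: the workaround re-encodes every element to UTF-8 bytes on the Python side with mapPartitions, flips the encoding flag to batch (or pickle, depending on the RDD's deserializer), and then routes everything through the pySparkToSFrame/spark_unity path instead of the JVM-side toSFrame() call. A minimal, hypothetical round trip through this codepath, in the doctest style of the examples above (it assumes a local Spark installation with the SFrame/Spark integration jars configured; the RDD contents are made up):

    >>> from pyspark import SparkContext
    >>> from graphlab import SFrame
    >>> sc = SparkContext('local')
    >>> rdd = sc.parallelize([u'fish', u'chips', u'salad'])
    >>> sf = SFrame.from_rdd(rdd, sc)  # unicode elements take the patched codepath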
5 changes: 2 additions & 3 deletions oss_src/unity/python/sframe/toolkits/_model.py
@@ -427,10 +427,9 @@ def show(self, view=None, model_type='base'):
view : str, optional
The name of the Model view to show. Can be one of:
-- 'Summary': The summary description of a Model.
+- Summary: Shows statistics of the training process, such as the size of the data and the time cost. The summary view also shows the parameters and settings used for model training, if available.
+- Evaluation: Shows the precision-recall plot as a line chart, with a tooltip for pointwise analysis: precision and recall values are displayed in the tooltip at whatever cutoff value the mouse points to.
-- 'Evaluation': A visual representation of the evaluation results for
-    a Model.
Returns
-------
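A short usage sketch against the signature above (the fitted `model` object is hypothetical; any trained model exposing show() would do):

    >>> model.show(view='Summary')     # training statistics, parameters, settings
    >>> model.show(view='Evaluation')  # precision-recall line chart with tooltips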
