
Commit

Merge pull request #21 from dato-code/1.6
Update to version 1.6
Yucheng Low committed Sep 23, 2015
2 parents f54302f + b75c9e7 commit c8947e6
Showing 9 changed files with 35 additions and 26 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -41,7 +41,7 @@ We release an updated version about once every 1.5 months or so. You can download
the most recent version directly from [pypi](https://pypi.python.org/pypi/SFrame)
or using pip:

-    pip install sframe
+    pip install -U sframe
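The `-U` here is shorthand for pip's `--upgrade` flag: without it, pip leaves an already-installed sframe untouched, so existing users would never pick up the new 1.6 release.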

**Requirements**

1 change: 1 addition & 0 deletions oss_local_scripts/conda_requirements.txt
@@ -16,5 +16,6 @@ scikit-learn==0.16.1
scipy==0.15.1
six==1.9.0
tornado==4.1
wheel==0.24.0
statsmodels
PIL
4 changes: 2 additions & 2 deletions oss_src/sframe/CMakeLists.txt
@@ -55,6 +55,6 @@ make_copy_target(_local_sys_util_
pylambda
)
add_dependencies(spark_unity _local_sys_util_)
-file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.1.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar
-    EXPECTED_MD5 f5e0653648f0b474cee57bd134d9ee83)
+file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.3.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar
+    EXPECTED_MD5 b6261c8614da3da1c89dce64fed09420)
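This bumps the prebuilt spark_unity jar from 0.1 to 0.3. CMake's `file(DOWNLOAD ... EXPECTED_MD5 ...)` fetches the artifact at configure time and fails if the MD5 of the downloaded file does not match, which is why the checksum has to be updated in lockstep with the URL.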

8 changes: 2 additions & 6 deletions oss_src/sframe/spark_unity.cpp
@@ -625,12 +625,8 @@ int concat_main(std::string & _output_directory, std::string & _prefix) {

}

-auto first_sframe_ptr = std::make_shared<sframe>(list_filenames[0]);
-sframe append_sframe;
-append_sframe.open_for_write(first_sframe_ptr->column_names(),first_sframe_ptr->column_types(), "", 1, false);
-append_sframe.close();
-
-for(int index=0;index<list_filenames.size();index++) {
+sframe append_sframe(list_filenames[0]);
+for(int index=1;index<list_filenames.size();index++) {
auto sframe_ptr = std::make_shared<sframe>(list_filenames[index]);
append_sframe = append_sframe.append(*sframe_ptr);
}
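The old code built an empty output SFrame by copying the first input's column names and types via open_for_write(), then appended every input, including the first, in the loop. The new code simply loads the first input as the starting frame and appends the rest, dropping the open/close dance and the redundant re-read of the first file. A minimal sketch of the same first-frame-then-append pattern, written against the Python sframe package (the part-file paths are hypothetical):

    from sframe import SFrame

    # Hypothetical on-disk SFrame parts produced by an earlier job.
    part_files = ['parts/part-0', 'parts/part-1', 'parts/part-2']

    # Start from the first part, then append the remaining ones;
    # append() returns a new SFrame rather than mutating in place.
    result = SFrame(part_files[0])
    for path in part_files[1:]:
        result = result.append(SFrame(path))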
@@ -40,7 +40,6 @@ Spark RDD

SFrame.from_rdd
SFrame.to_rdd
-SFrame.to_schema_rdd

SQL Database
----------------
2 changes: 1 addition & 1 deletion oss_src/unity/python/setup.py
@@ -160,7 +160,7 @@ def run(self):
exclude=["*.tests", "*.tests.*", "tests.*", "tests", "*.test", "*.test.*", "test.*", "test"]),
url='https://dato.com',
license='LICENSE.txt',
-description='SFrame enables developers and data scientists to apply machine learning to build state of the art data products.',
+description='SFrame is a scalable, out-of-core dataframe, which allows you to work with datasets that are larger than the amount of RAM on your system.',
# long_description=open('README.txt').read(),
classifiers=classifiers,
install_requires=[
2 changes: 1 addition & 1 deletion oss_src/unity/python/sframe/data_structures/image.py
@@ -223,7 +223,7 @@ def show(self):
Displays the image. Requires PIL/Pillow.
Alternatively, you can create an :class:`graphlab.SArray` of this image
-and use :func:`graphlab.SArray.show()`
+and use py:func:`graphlab.SArray.show()`
See Also
--------
36 changes: 25 additions & 11 deletions oss_src/unity/python/sframe/data_structures/sframe.py
@@ -1697,10 +1697,11 @@ def to_spark_dataframe(self,sc,sql,number_of_partitions=4):
>>> from pyspark import SparkContext, SQLContext
>>> from graphlab import SFrame
>>> from pyspark.sql import SQLContext
+>>> sc = SparkContext('local')
->>> sqlc = SQLContext(sc)
+>>> sql = SQLContext(sc)
>>> sf = SFrame({'x': [1,2,3], 'y': ['fish', 'chips', 'salad']})
->>> df = sf.to_spark_dataframe(sc, sqlc)
+>>> df = sf.to_spark_dataframe(sc, sql)
>>> df.show()
x y
1 fish
@@ -1902,15 +1903,28 @@ def from_rdd(cls, rdd, cur_sc):
df, tmp_loc, finalSFramePrefix)
else:
if encoding == 'utf8':
-finalSFrameFilename = graphlab_util_ref.toSFrame(
-    rdd._jrdd.rdd(),tmp_loc, finalSFramePrefix)
-else:
-# Prep the additional arguments to feed into the pySparkToSFrame function in Java
-# that will call the spark_unity binary which does the actual encoding
-additiona_args = os.path.join(" --encoding=%s " % encoding +\
-    " --type=rdd ")
-finalSFrameFilename = graphlab_util_ref.pySparkToSFrame(
-    rdd._jrdd, tmp_loc, finalSFramePrefix, additiona_args)
+## TODO: This is a temporary solution. Here we are completely bypassing the
+## toSFrame() codepath when encoding is 'utf8', because of a Spark 1.5
+## closure-cleaning error on deeply nested functions.
+
+def f(iterator):
+    for obj in iterator:
+        yield obj.encode("utf-8")
+
+rdd = rdd.mapPartitions(f)
+encoding = "batch"
+if(rdd._jrdd_deserializer.__class__.__name__ == 'PickleSerializer'):
+    encoding = "pickle"
+
+#finalSFrameFilename = graphlab_util_ref.toSFrame(
+#    rdd._jrdd.rdd(),tmp_loc, finalSFramePrefix)
+#else:
+# Prep the additional arguments to feed into the pySparkToSFrame function in Java
+# that will call the spark_unity binary which does the actual encoding
+additiona_args = os.path.join(" --encoding=%s " % encoding +\
+    " --type=rdd ")
+finalSFrameFilename = graphlab_util_ref.pySparkToSFrame(
+    rdd._jrdd, tmp_loc, finalSFramePrefix, additiona_args)

# Load and return the sframe
sf = SFrame()
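In short: the workaround re-encodes every element to UTF-8 bytes on the Python side with mapPartitions, flips the encoding flag to batch (or pickle, depending on the RDD's deserializer), and then routes everything through the pySparkToSFrame/spark_unity path instead of the JVM-side toSFrame() call. A minimal, hypothetical round trip through this codepath, in the doctest style of the examples above (it assumes a local Spark installation with the SFrame/Spark integration jars configured; the RDD contents are made up):

    >>> from pyspark import SparkContext
    >>> from graphlab import SFrame
    >>> sc = SparkContext('local')
    >>> rdd = sc.parallelize([u'fish', u'chips', u'salad'])
    >>> sf = SFrame.from_rdd(rdd, sc)  # unicode elements take the patched codepath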
5 changes: 2 additions & 3 deletions oss_src/unity/python/sframe/toolkits/_model.py
@@ -427,10 +427,9 @@ def show(self, view=None, model_type='base'):
view : str, optional
The name of the Model view to show. Can be one of:
-- 'Summary': The summary description of a Model.
+- Summary: Shows statistics of the training process, such as the size of the data and the time cost. The summary view also shows the parameters and settings used for model training, if available.
+- Evaluation: Shows the precision-recall plot as a line chart, with a tooltip for pointwise analysis: precision and recall values are displayed in the tooltip at whatever cutoff value the mouse points to.
-- 'Evaluation': A visual representation of the evaluation results for
-    a Model.
Returns
-------
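A short usage sketch against the signature above (the fitted `model` object is hypothetical; any trained model exposing show() would do):

    >>> model.show(view='Summary')     # training statistics, parameters, settings
    >>> model.show(view='Evaluation')  # precision-recall line chart with tooltips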
