From a4044bb7bc5ba2d02bed6b18197433a0a257b4f3 Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 16:28:27 +0800 Subject: [PATCH 01/18] add cifar --- python/paddle/data/__init__.py | 0 python/paddle/data/cifar_10.py | 100 +++++++++++++++++++++++++++++++++ python/setup.py.in | 1 + 3 files changed, 101 insertions(+) create mode 100644 python/paddle/data/__init__.py create mode 100644 python/paddle/data/cifar_10.py diff --git a/python/paddle/data/__init__.py b/python/paddle/data/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py new file mode 100644 index 00000000000000..2c5d40810b40e8 --- /dev/null +++ b/python/paddle/data/cifar_10.py @@ -0,0 +1,100 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import tarfile +import zipfile +import collections +import numpy as np +from six.moves import urllib + +source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' +source_file = "cifar-10-batches-py" +label_map = { +0: "airplane", +1: "automobile", +2: "bird", +3: "cat", +4: "deer", +5: "dog", +6: "frog", +7: "horse", +8: "ship", +9: "truck" +} + +def fetch(): + num_images_train = 50000 + num_batch = 5 + source_name = "cifar" + file_source = "cifar-10-batches-py" + #Set the download dir for cifar. 
+ data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + +def set_data_path(source_name): + data_base = os.path.expanduser(os.path.join('~','.paddle')) + print data_base + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + +def data_download(download_dir,source_url): + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir,src_file) + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name,src_file) + move_files(src_file,download_dir) + print("Download finished,Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Data has been already downloaded and unpacked!") + return download_dir + +def move_files(source_dire,target_dire): + shutil.move(source_dire,target_dire) + +def download_with_urlretrieve(url, filename=None): + return urllib.request.urlretrieve(url, filename) + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/setup.py.in b/python/setup.py.in index b66a42e87c7870..5b25b3ab350903 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,6 +1,7 @@ from setuptools import setup packages=['paddle', + 'paddle.data', 'paddle.proto', 'paddle.trainer', 'paddle.trainer_config_helpers', From 22a8d068d8a613273dbefcf2e67c378d7bdf7ff3 Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 16:48:56 +0800 Subject: [PATCH 02/18] update cifar --- python/paddle/data/cifar_10.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py index 2c5d40810b40e8..02608c79911e30 100644 --- a/python/paddle/data/cifar_10.py +++ b/python/paddle/data/cifar_10.py @@ -47,7 +47,7 @@ def fetch(): file_source = "cifar-10-batches-py" #Set the download dir for cifar. 
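The `_unpickle` helper added above calls `cPickle` without importing it, and the commented-out loop in `fetch()` never reads the extracted batches. A minimal sketch of how one downloaded batch could be loaded, assuming the import is added and the archive was unpacked into `~/.paddle/cifar` as `set_data_path()` and the tarfile call above would do; the reshape to `(N, 3, 32, 32)` is the standard CIFAR-10 layout, not something this patch performs.

```python
# Sketch: read one CIFAR-10 batch produced by the download above.
# Assumes the tar was unpacked into ~/.paddle/cifar as set_data_path() does.
import os
import numpy as np
from six.moves import cPickle  # the patch uses cPickle but never imports it

def load_batch(batch_path):
    """Return (images, labels) from a single data_batch_* file."""
    with open(batch_path, 'rb') as f:
        batch = cPickle.load(f)  # Python 2; on Python 3 pass encoding='bytes'
    images = np.asarray(batch['data'], dtype=np.float32)
    images = images.reshape(-1, 3, 32, 32)   # channel, height, width
    labels = np.asarray(batch['labels'], dtype=np.int32)
    return images, labels

data_dir = os.path.expanduser('~/.paddle/cifar/cifar-10-batches-py')
images, labels = load_batch(os.path.join(data_dir, 'data_batch_1'))
print(images.shape, labels[:10])
```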
data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) + filepath = data_download(data_home, source_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) From 7192a6bce9a358876ef208fee8289149cdecccaf Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 16:53:17 +0800 Subject: [PATCH 03/18] update cifar --- python/paddle/data/cifar_10.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py index 02608c79911e30..762d4b2d40ca52 100644 --- a/python/paddle/data/cifar_10.py +++ b/python/paddle/data/cifar_10.py @@ -62,7 +62,7 @@ def _unpickle(file_path): return data def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~','.paddle')) + data_base = os.path.expanduser(os.path.join('~', '.paddle')) print data_base if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') @@ -72,14 +72,14 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir,source_url): +def data_download(download_dir, source_url): src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir,src_file) + file_path = os.path.join(download_dir, src_file) if not os.path.exists(file_path): temp_file_name,_ = download_with_urlretrieve(source_url) temp_file_path = os.getcwd() - os.rename(temp_file_name,src_file) - move_files(src_file,download_dir) + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) print("Download finished,Extracting files.") tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) print("Unpacking done!") @@ -88,8 +88,8 @@ def data_download(download_dir,source_url): print("Data has been already downloaded and unpacked!") return download_dir -def move_files(source_dire,target_dire): - shutil.move(source_dire,target_dire) +def move_files(source_dire, target_dire): + shutil.move(source_dire, target_dire) def download_with_urlretrieve(url, filename=None): return urllib.request.urlretrieve(url, filename) From 0913bbc83faf9a675548558545333018e6469e23 Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 21:57:26 +0800 Subject: [PATCH 04/18] add mnist and amazon --- python/paddle/data/amazon.py | 103 +++++++++++++++++++++++++++++++++++ python/paddle/data/mnist.py | 83 ++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 python/paddle/data/amazon.py create mode 100644 python/paddle/data/mnist.py diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py new file mode 100644 index 00000000000000..c3c4cde65abc5d --- /dev/null +++ b/python/paddle/data/amazon.py @@ -0,0 +1,103 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' +moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +file_source = "mosesdecoder-master" +def fetch(): + source_name = "amazon" + #file_source = "mosesdecoder-master" + #Set the download dir for cifar. + data_home = set_data_path(source_name) + #filepath = data_download(data_home,moses_url) + filepath = data_download(data_home,source_url) + filepath = data_download(data_home,moses_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + +def set_data_path(source_name): + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + +def data_download(download_dir,source_url): + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path,'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + print("Unpacking done!") + else: + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path,'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + +def move_files(source_dire,target_dire): + shutil.move(source_dire,target_dire) + +def download_with_urlretrieve(url, filename=None): + return urllib.request.urlretrieve(url, filename) + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py new file mode 100644 index 00000000000000..7e04440f3765b4 --- /dev/null +++ b/python/paddle/data/mnist.py @@ -0,0 +1,83 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
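The Amazon loader above only downloads `reviews_Electronics_5.json.gz` and leaves it compressed. A hedged sketch of streaming that file in place follows; the field names (`overall`, `reviewText`) are those documented for the SNAP review dumps, and the directory is an assumption, since `set_data_path` above actually joins `'~'` with `' .paddle'` (note the leading space), so the real location may differ.

```python
# Sketch: stream the gzipped Amazon reviews file without unpacking it.
# The dump is one JSON object per line, so gzip.open + json.loads suffices.
import gzip
import json
import os

reviews_path = os.path.expanduser(
    '~/.paddle/amazon/reviews_Electronics_5.json.gz')  # assumed download location

def iter_reviews(path, limit=None):
    """Yield review dicts, optionally stopping after `limit` records."""
    with gzip.open(path, 'rb') as f:
        for i, line in enumerate(f):
            if limit is not None and i >= limit:
                break
            yield json.loads(line)

for review in iter_reviews(reviews_path, limit=3):
    print(review['overall'], review['reviewText'][:60])
```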
+ +import shutil +import os +import sys +import collections +import numpy as np +from six.moves import urllib +import urlparse +import gzip + +source_url = 'http://yann.lecun.com/exdb/mnist/' +filename = ['train-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-images-idx3-ubyte.gz','t10k-labels-idx1-ubyte.gz'] + +def fetch(): + source_name = "mnist" + file_source = "cifar-10-batches-py" + #Set the download dir for cifar. + data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + +def set_data_path(source_name): + data_base = os.path.expanduser(os.path.join('~','.paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + +def data_download(download_dir,source_url): + for file in filename: + data_url = urlparse.urljoin(source_url,file) + file_path = os.path.join(download_dir,file) + untar_path = os.path.join(download_dir,file.replace(".gz","")) + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(data_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name,file) + move_files(file,download_dir) + print("Download finished,Extracting files.") + g_file = gzip.GzipFile(file_path) + open(untar_path,'w+').write(g_file.read()) + g_file.close() + print("Unpacking done!") + else: + g_file = gzip.GzipFile(file_path) + open(untar_path,'w+').write(g_file.read()) + g_file.close() + print("Data has been already downloaded and unpacked!") + os.remove(file_path) + return download_dir + +def move_files(source_dire,target_dire): + shutil.move(source_dire,target_dire) + +def download_with_urlretrieve(url, filename=None): + return urllib.request.urlretrieve(url, filename) + + +if __name__ == '__main__': + path = fetch() + print path From a4ed79877fa614c61978a5d11184afcaa22b166a Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 22:00:18 +0800 Subject: [PATCH 05/18] update amazon and mnist --- python/paddle/data/amazon.py | 10 +++++----- python/paddle/data/mnist.py | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index c3c4cde65abc5d..7e4985b4cc1415 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -34,8 +34,8 @@ def fetch(): #Set the download dir for cifar. 
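The mnist.py module above gunzips the four idx-ubyte files but never parses them (numpy is imported yet unused). A sketch of how the extracted files might be read, assuming the standard IDX layout (big-endian int32 header words, then uint8 data); the data directory is again an assumption based on `set_data_path`.

```python
# Sketch: parse the gunzipped MNIST idx-ubyte files produced above.
# IDX layout: big-endian int32 header (magic, count[, rows, cols]), then uint8 data.
import os
import struct
import numpy as np

def read_idx_images(path):
    with open(path, 'rb') as f:
        magic, count, rows, cols = struct.unpack('>IIII', f.read(16))
        assert magic == 2051, 'not an IDX image file'
        data = np.frombuffer(f.read(), dtype=np.uint8)
    return data.reshape(count, rows, cols)

def read_idx_labels(path):
    with open(path, 'rb') as f:
        magic, count = struct.unpack('>II', f.read(8))
        assert magic == 2049, 'not an IDX label file'
        return np.frombuffer(f.read(), dtype=np.uint8)

data_dir = os.path.expanduser('~/.paddle/mnist')  # assumed from set_data_path()
images = read_idx_images(os.path.join(data_dir, 'train-images-idx3-ubyte'))
labels = read_idx_labels(os.path.join(data_dir, 'train-labels-idx1-ubyte'))
print(images.shape, labels[:10])
```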
data_home = set_data_path(source_name) #filepath = data_download(data_home,moses_url) - filepath = data_download(data_home,source_url) - filepath = data_download(data_home,moses_url) + filepath = data_download(data_home, source_url) + filepath = data_download(data_home, moses_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) @@ -59,7 +59,7 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir,source_url): +def data_download(download_dir, source_url): src_file = source_url.strip().split('/')[-1] file_path = os.path.join(download_dir, src_file) @@ -91,8 +91,8 @@ def data_download(download_dir,source_url): print("Data has been already downloaded and unpacked!") return download_dir -def move_files(source_dire,target_dire): - shutil.move(source_dire,target_dire) +def move_files(source_dire, target_dire): + shutil.move(source_dire, target_dire) def download_with_urlretrieve(url, filename=None): return urllib.request.urlretrieve(url, filename) diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 7e04440f3765b4..959875889c2322 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -32,14 +32,14 @@ def fetch(): file_source = "cifar-10-batches-py" #Set the download dir for cifar. data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) + filepath = data_download(data_home, source_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) """ def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~','.paddle')) + data_base = os.path.expanduser(os.path.join('~', '.paddle')) if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') datadir = os.path.join(data_base, source_name) @@ -48,16 +48,16 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir,source_url): +def data_download(download_dir, source_url): for file in filename: - data_url = urlparse.urljoin(source_url,file) - file_path = os.path.join(download_dir,file) - untar_path = os.path.join(download_dir,file.replace(".gz","")) + data_url = urlparse.urljoin(source_url, file) + file_path = os.path.join(download_dir, file) + untar_path = os.path.join(download_dir, file.replace(".gz", "")) if not os.path.exists(file_path): temp_file_name,_ = download_with_urlretrieve(data_url) temp_file_path = os.getcwd() - os.rename(temp_file_name,file) - move_files(file,download_dir) + os.rename(temp_file_name, file) + move_files(file, download_dir) print("Download finished,Extracting files.") g_file = gzip.GzipFile(file_path) open(untar_path,'w+').write(g_file.read()) @@ -65,14 +65,14 @@ def data_download(download_dir,source_url): print("Unpacking done!") else: g_file = gzip.GzipFile(file_path) - open(untar_path,'w+').write(g_file.read()) + open(untar_path, 'w+').write(g_file.read()) g_file.close() print("Data has been already downloaded and unpacked!") os.remove(file_path) return download_dir -def move_files(source_dire,target_dire): - shutil.move(source_dire,target_dire) +def move_files(source_dire, target_dire): + shutil.move(source_dire, target_dire) def download_with_urlretrieve(url, filename=None): return urllib.request.urlretrieve(url, filename) From ce0f5b0db14cf5e08d59a4732f577018a7d7a52e Mon Sep 17 00:00:00 2001 From: qibin Date: Wed, 11 Jan 2017 21:32:38 +0800 Subject: [PATCH 06/18] add other data --- python/paddle/data/__init__.py | 5 + 
python/paddle/data/amazon.py | 84 +++++++++++-- python/paddle/data/cifar10.py | 172 ++++++++++++++++++++++++++ python/paddle/data/mnist.py | 102 ++++++++++++--- python/paddle/data/recommendation.py | 168 +++++++++++++++++++++++++ python/paddle/data/semantic.py | 164 +++++++++++++++++++++++++ python/paddle/data/sentiment.py | 177 +++++++++++++++++++++++++++ python/paddle/data/seqToseq.py | 162 ++++++++++++++++++++++++ 8 files changed, 1008 insertions(+), 26 deletions(-) create mode 100644 python/paddle/data/cifar10.py create mode 100644 python/paddle/data/recommendation.py create mode 100644 python/paddle/data/semantic.py create mode 100644 python/paddle/data/sentiment.py create mode 100644 python/paddle/data/seqToseq.py diff --git a/python/paddle/data/__init__.py b/python/paddle/data/__init__.py index e69de29bb2d1d6..970e56f072fdf9 100644 --- a/python/paddle/data/__init__.py +++ b/python/paddle/data/__init__.py @@ -0,0 +1,5 @@ +""" +The :mod:`paddle.datasets` module includes utilities to load datasets, +including methods to load and fetch popular reference datasets. It also +features some artificial data generators. +""" diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index 7e4985b4cc1415..54e90e83e8be63 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -28,18 +28,28 @@ source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' file_source = "mosesdecoder-master" + + def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ source_name = "amazon" - #file_source = "mosesdecoder-master" - #Set the download dir for cifar. data_home = set_data_path(source_name) - #filepath = data_download(data_home,moses_url) - filepath = data_download(data_home, source_url) - filepath = data_download(data_home, moses_url) + filepath = data_download(data_home,source_url) + filepath = data_download(data_home,moses_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) """ + return filepath + def _unpickle(file_path): with open(file_path, mode='rb') as file: @@ -49,7 +59,17 @@ def _unpickle(file_path): data = cPickle.load(file, encoding='bytes') return data + def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ data_base = os.path.expanduser(os.path.join('~',' .paddle')) if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') @@ -59,7 +79,19 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir, source_url): + +def data_download(download_dir,source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
+ """ src_file = source_url.strip().split('/')[-1] file_path = os.path.join(download_dir, src_file) @@ -91,11 +123,49 @@ def data_download(download_dir, source_url): print("Data has been already downloaded and unpacked!") return download_dir + def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ shutil.move(source_dire, target_dire) + def download_with_urlretrieve(url, filename=None): - return urllib.request.urlretrieve(url, filename) + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() if __name__ == '__main__': diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py new file mode 100644 index 00000000000000..1d461ba4466a49 --- /dev/null +++ b/python/paddle/data/cifar10.py @@ -0,0 +1,172 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import tarfile +import zipfile +import collections +import numpy as np +from six.moves import urllib + +source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' +source_file = "cifar-10-batches-py" +label_map = { +0: "airplane", +1: "automobile", +2: "bird", +3: "cat", +4: "deer", +5: "dog", +6: "frog", +7: "horse", +8: "ship", +9: "truck" +} + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + num_images_train = 50000 + num_batch = 5 + source_name = "cifar" + file_source = "cifar-10-batches-py" + #Set the download dir for cifar. + data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. 
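`download_with_urlretrieve` above passes `check_download_progress` as urlretrieve's `reporthook`. A small self-contained sketch of that pattern follows, with a guard for servers that omit Content-Length: urlretrieve then reports `total_size <= 0`, which would make the percentage computed in the patch zero or negative.

```python
# Sketch: the reporthook pattern used by download_with_urlretrieve above.
# urlretrieve calls the hook as hook(block_count, block_size, total_size).
import sys
from six.moves import urllib

def report_progress(count, block_size, total_size):
    if total_size <= 0:
        # Content-Length missing: fall back to a plain byte counter.
        sys.stdout.write("\r- Downloaded %d bytes" % (count * block_size))
    else:
        percent = min(float(count * block_size) / total_size, 1.0)
        sys.stdout.write("\r- Download progress: {:.1%}".format(percent))
    sys.stdout.flush()

# Usage sketch (URL taken from the cifar module above):
# urllib.request.urlretrieve(
#     'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
#     'cifar-10-python.tar.gz', reporthook=report_progress)
```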
+ """ + data_base = os.path.expanduser(os.path.join('~','.paddle')) + print data_base + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file,download_dir) + print("Download finished, Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + + def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 959875889c2322..ac16cf1919350b 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -25,21 +25,35 @@ import gzip source_url = 'http://yann.lecun.com/exdb/mnist/' -filename = ['train-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-images-idx3-ubyte.gz','t10k-labels-idx1-ubyte.gz'] +filename = ['train-images-idx3-ubyte.gz','t10k-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-labels-idx1-ubyte.gz'] def fetch(): - source_name = "mnist" - file_source = "cifar-10-batches-py" - #Set the download dir for cifar. - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. """ + source_name = "mnist" + data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + return filepath + def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~', '.paddle')) + """ + Set the path for download according to the source name. 
+ + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~','.paddle')) if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') datadir = os.path.join(data_base, source_name) @@ -48,16 +62,28 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir, source_url): + +def data_download(download_dir,source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ for file in filename: - data_url = urlparse.urljoin(source_url, file) - file_path = os.path.join(download_dir, file) - untar_path = os.path.join(download_dir, file.replace(".gz", "")) + data_url = urlparse.urljoin(source_url,file) + file_path = os.path.join(download_dir,file) + untar_path = os.path.join(download_dir,file.replace(".gz","")) if not os.path.exists(file_path): temp_file_name,_ = download_with_urlretrieve(data_url) temp_file_path = os.getcwd() - os.rename(temp_file_name, file) - move_files(file, download_dir) + os.rename(temp_file_name,file) + move_files(file,download_dir) print("Download finished,Extracting files.") g_file = gzip.GzipFile(file_path) open(untar_path,'w+').write(g_file.read()) @@ -65,17 +91,55 @@ def data_download(download_dir, source_url): print("Unpacking done!") else: g_file = gzip.GzipFile(file_path) - open(untar_path, 'w+').write(g_file.read()) + open(untar_path,'w+').write(g_file.read()) g_file.close() print("Data has been already downloaded and unpacked!") os.remove(file_path) return download_dir -def move_files(source_dire, target_dire): - shutil.move(source_dire, target_dire) + +def move_files(source_dire,target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire,target_dire) + def download_with_urlretrieve(url, filename=None): - return urllib.request.urlretrieve(url, filename) + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() if __name__ == '__main__': diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py new file mode 100644 index 00000000000000..1e93b6dc161224 --- /dev/null +++ b/python/paddle/data/recommendation.py @@ -0,0 +1,168 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + + +source_url='http://files.grouplens.org/datasets/movielens/ml-1m.zip' +file_source = "mosesdecoder-master" + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + source_name = "recommendation" + #Set the download dir for recommendation. + data_home = set_data_path(source_name) + filepath = data_download(data_home, source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.remove(file_path) + print("Unpacking done!") + else: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. 
+ + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py new file mode 100644 index 00000000000000..087e6e6640fea3 --- /dev/null +++ b/python/paddle/data/semantic.py @@ -0,0 +1,164 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url=['http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' + ] + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + source_name = "semantic" + data_home = set_data_path(source_name) + model_path = data_download(data_home, model_url) + for url in source_url: + filepath = data_download(data_home, moses_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
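The recommendation.py module above unpacks the MovieLens `ml-1m.zip` archive and stops there. A hedged sketch of reading the extracted `ratings.dat` (the `::`-separated format documented by GroupLens); the directory is an assumption, since its `set_data_path` also joins `'~'` with `' .paddle'` (leading space).

```python
# Sketch: read ratings.dat from the ml-1m folder unpacked above.
# MovieLens 1M uses '::' separated fields: UserID::MovieID::Rating::Timestamp.
import os

def iter_ratings(data_dir):
    ratings_path = os.path.join(data_dir, 'ml-1m', 'ratings.dat')
    with open(ratings_path) as f:
        for line in f:
            user_id, movie_id, rating, timestamp = line.strip().split('::')
            yield int(user_id), int(movie_id), float(rating), int(timestamp)

data_dir = os.path.expanduser('~/.paddle/recommendation')  # assumed location
for i, record in enumerate(iter_ratings(data_dir)):
    print(record)
    if i == 2:
        break
```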
+ """ + src_file = url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py new file mode 100644 index 00000000000000..18146ba93803a7 --- /dev/null +++ b/python/paddle/data/sentiment.py @@ -0,0 +1,177 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url='http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +file_source = "mosesdecoder-master" + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. 
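The semantic.py `fetch()` above loops over several source URLs, but the loop body downloads `moses_url` rather than the loop variable, and its `data_download` builds `src_file` from an undefined `url` while the parameter is named `source_url`. A corrected sketch of the intended multi-URL loop, written as a standalone helper; `downloader` is a hypothetical callable standing in for the download routine above.

```python
# Sketch of the multi-URL fetch intended by semantic.py above: download each
# source in turn and only untar the ones that are actually tar.gz archives.
import os
import tarfile

def fetch_all(urls, download_dir, downloader):
    """`downloader(url, dest_path)` is assumed to fetch a single file."""
    paths = []
    for url in urls:                       # iterate the loop variable, not a fixed URL
        filename = url.strip().split('/')[-1]
        dest = os.path.join(download_dir, filename)
        if not os.path.exists(dest):
            downloader(url, dest)
        if filename.endswith('.tar.gz') or filename.endswith('.tgz'):
            tarfile.open(dest, 'r:gz').extractall(download_dir)
        paths.append(dest)
    return paths

# Usage sketch:
# from six.moves import urllib
# fetch_all(source_url, '/tmp/.paddle/semantic',
#           lambda u, dest: urllib.request.urlretrieve(u, dest))
```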
+ """ + source_name = "sentiment" + data_home = set_data_path(source_name) + filepath = data_download(data_home, source_url) + filepath = data_download(data_home, moses_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath, stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, rereporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. 
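sentiment.py's `data_download` above branches on whether the downloaded file is a zip (`mosesdecoder-master.zip`) or a tar.gz (`aclImdb_v1.tar.gz`), but the tar branch relies on `tarfile` without importing it in that module. A compact sketch of the same dispatch as a standalone helper, assuming only the two archive types that appear in these modules.

```python
# Sketch: unpack either a .zip or a .tar.gz archive, mirroring the two
# branches of data_download above (mosesdecoder zip vs aclImdb tarball).
import os
import stat
import tarfile
import zipfile

def extract_archive(file_path, target_dir):
    if file_path.endswith('.zip'):
        archive = zipfile.ZipFile(file_path, 'r')
        for info in archive.infolist():
            archive.extract(info, target_dir)
            extracted = os.path.join(target_dir, info.filename)
            # zipfile does not preserve permission bits; make entries readable.
            os.chmod(extracted, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH)
    elif file_path.endswith(('.tar.gz', '.tgz')):
        tarfile.open(name=file_path, mode='r:gz').extractall(target_dir)
    else:
        raise ValueError('unsupported archive type: %s' % file_path)
```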
+ + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py new file mode 100644 index 00000000000000..a9bdd0bf73cd2c --- /dev/null +++ b/python/paddle/data/seqToseq.py @@ -0,0 +1,162 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url=['http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', + 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' + ] +model_url='http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + source_name = "seqToseq" + data_home = set_data_path(source_name) + model_path = data_download(data_home, model_url) + for url in source_url: + filepath = data_download(data_home, source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
+ """ + src_file = url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path From 1373977e2c066f9d8509691da0b37f234cb05d0d Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 13 Jan 2017 11:45:27 +0800 Subject: [PATCH 07/18] update code --- python/paddle/data/DATA.md | 28 +++++ python/paddle/data/amazon.py | 165 ++++++--------------------- python/paddle/data/cifar10.py | 158 +++++-------------------- python/paddle/data/http_download.py | 124 ++++++++++++++++++++ python/paddle/data/mnist.py | 129 ++++----------------- python/paddle/data/recommendation.py | 145 ++++------------------- python/paddle/data/semantic.py | 152 ++++++------------------ python/paddle/data/sentiment.py | 155 +++++-------------------- python/paddle/data/seqToseq.py | 129 +++------------------ 9 files changed, 340 insertions(+), 845 deletions(-) create mode 100644 python/paddle/data/DATA.md create mode 100644 python/paddle/data/http_download.py diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md new file mode 100644 index 00000000000000..ce186d42619509 --- /dev/null +++ b/python/paddle/data/DATA.md @@ -0,0 +1,28 @@ +## 需求 + +Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载其数据,并进行复杂的预处理过程,整个过程会耗费大量时间。 + +所以我们需要数据封装接口,采用import数据源的方式(如\:import paddle.data.amazon.review.GetJSON)来简化获取训练所需数据的时间;但是如果你习惯自己处理原生数据,我们依然提供原生数据接口来满足你的需求。 + +## 整体思路 + +数据封装接口的目的是提供数据。不论是原生数据,还是预处理数据都通过import方式导入各模型进行训练;考虑到某些模型的预处理后的数据量依然很大,或有时就仅仅想训练相对较小的网络模型,没必要考虑全量数据,自动配置数据量大小必然更符合不同需求。整个接口初步设想如下: +* 开关来控制数据来源 + * 导入数据接口时,带有开关(如:src\_from = True,来自预处理源;否则,来自原生数据源) +* 预处理数据部分添加配置train和test的数据量的大小 +* 原生数据部分的数据下载数据模块化 + * 开关(src\_from = False)和<模型,数据源>对完成相关数据的下载 +* 原生数据的预处理部分保持原状,通过<模型,预处理过程>对完成数据的预处理 +* 在paddle的train的配置文件中修改数据源的导入方式 + +整个过程在tensorflow的mnist模型已有人实现,借鉴此思想,实现paddle的各demo数据接口的通用化。 + +```python +amazon = input_data.load_dataset( + 'Amazon', + '/Users/baidu/git/test_package/data', + data_unneed=False, + src_flag=False) +batch = amazon.train.shrink_txt('train',10) +``` + diff --git 
a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index 54e90e83e8be63..361a3fa79ded6e 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -15,22 +15,43 @@ # See the License for the specific language governing permissions and # limitations under the License. +######################################################################## +# +# Function for fetch the data untar directory for amazon training api. +# As the python can read the data in "reviews_Electronics_5.json.gz", +#here is no need to untar the data. +# +# +# First,we let the data download path is "~/paddle_data_directory" +# when u no special the download path. +# +# +# Then,download the data,according to the speical source url. +# Here,no need to untar the "reviews_Electronics_5.json.gz". +# +# After download the data,return the path of data. +# +# +######################################################################### + import shutil import os import sys import zipfile import collections -import numpy as np -from six.moves import urllib import stat +from six.moves import urllib +from http_download import data_download + source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' -file_source = "mosesdecoder-master" + +mose_source = "mosesdecoder-master" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -38,136 +59,20 @@ def fetch(): Args: Returns: - path to downloaded file. + path for the data untar. """ source_name = "amazon" - data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) - filepath = data_download(data_home,moses_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) -def data_download(download_dir,source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. 
- """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path,'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - os.remove(file_path) - print("Unpacking done!") - else: - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path,'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. + moses_src = data_download(download_path, moses_url) + moses_path = os.path.join(moses_src, mose_source) - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + filepath = data_download(download_path, source_url) + return filepath -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py index 1d461ba4466a49..72c4c9bcbc11ce 100644 --- a/python/paddle/data/cifar10.py +++ b/python/paddle/data/cifar10.py @@ -15,33 +15,38 @@ # See the License for the specific language governing permissions and # limitations under the License. +######################################################################## +# +# Function for fetch the data untar directory for cifar10 training api. +# you can use this data for image classifation and gun traing. +# As the python can read the data in "cifar-10-python.tar.gz",herer is +# no need to untar the data. +# +# +# First,we let the data download path is "~/paddle_data_directory", +# when u no special the download path. +# +# +# Then,download the cifar10 dataset,and returns the data directory for +# training api. 
+# +######################################################################## + import shutil import os import sys -import tarfile -import zipfile import collections import numpy as np from six.moves import urllib +from http_download import data_download + source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' source_file = "cifar-10-batches-py" -label_map = { -0: "airplane", -1: "automobile", -2: "bird", -3: "cat", -4: "deer", -5: "dog", -6: "frog", -7: "horse", -8: "ship", -9: "truck" -} -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -49,124 +54,17 @@ def fetch(): Args: Returns: - path to downloaded file. + path to untar file. """ - num_images_train = 50000 - num_batch = 5 source_name = "cifar" - file_source = "cifar-10-batches-py" - #Set the download dir for cifar. - data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~','.paddle')) - print data_base - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file,download_dir) - print("Download finished, Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. 
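The progress printing that each dataset module used to carry, and that http_download.py now centralises, is urlretrieve's reporthook protocol: the hook is invoked with the block count, block size, and total size after every chunk received. A self-contained sketch, with the URL given only as an example:

```python
import sys
from six.moves import urllib


def report(count, block_size, total_size):
    # reporthook protocol: urlretrieve calls this repeatedly while streaming.
    percent = min(float(count * block_size) / total_size, 1.0)
    sys.stdout.write("\r- Download progress: {:.1%}".format(percent))
    sys.stdout.flush()

# Example (any HTTP URL will do):
# urllib.request.urlretrieve(
#     'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
#     'cifar-10-python.tar.gz', reporthook=report)
```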
+ if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - Args: - count: - block_size: - total_size: + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) + filepath = data_download(download_path, source_url) - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + return filepath -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/http_download.py b/python/paddle/data/http_download.py new file mode 100644 index 00000000000000..bef850da841e48 --- /dev/null +++ b/python/paddle/data/http_download.py @@ -0,0 +1,124 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +######################################################################## +# +# Funciton for data download,it use the urllib urlretrieve and we can +# see the download process when downloading the source. +# +# download process like: - Download progress:10% +# +######################################################################## + + +import os +import sys +import shutil +import zipfile +import tarfile +import stat +from six.moves import urllib + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +def data_download(download_dir, source_url): + """ + Download data according to the url for source_name. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
+ """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + print file_path + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + shutil.move(src_file, download_dir) + print("Download finished, Extracting files.") + + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + if 'master' in src_file: + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + elif src_file in ['.json.gz','txt','emb','python.tar.gz']: + pass + elif src_file.split('.')[-1] is 'gz': + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + if 'master' in src_file: + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + elif src_file in ['.json.gz','txt','emb']: + pass + elif src_file.split('.')[-1] is 'gz': + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + + return download_dir + diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index ac16cf1919350b..c084954848b985 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -15,6 +15,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for mnist training api. +# you can use this data for Digital identification. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the mnist dataset,it untar the dataset,and returns the untar +# directory for training api. +# +############################################################################ + + import shutil import os import sys @@ -24,10 +36,12 @@ import urlparse import gzip + source_url = 'http://yann.lecun.com/exdb/mnist/' filename = ['train-images-idx3-ubyte.gz','t10k-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-labels-idx1-ubyte.gz'] -def fetch(): + +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -35,113 +49,20 @@ def fetch(): Args: Returns: - path to downloaded file. + path for untar file. """ source_name = "mnist" - data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) - return filepath - -def set_data_path(source_name): - """ - Set the path for download according to the source name. + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - Args: - source_name:the source + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) - Returns: - the data directory for data download. 
- """ - data_base = os.path.expanduser(os.path.join('~','.paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir,source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ for file in filename: - data_url = urlparse.urljoin(source_url,file) - file_path = os.path.join(download_dir,file) - untar_path = os.path.join(download_dir,file.replace(".gz","")) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(data_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name,file) - move_files(file,download_dir) - print("Download finished,Extracting files.") - g_file = gzip.GzipFile(file_path) - open(untar_path,'w+').write(g_file.read()) - g_file.close() - print("Unpacking done!") - else: - g_file = gzip.GzipFile(file_path) - open(untar_path,'w+').write(g_file.read()) - g_file.close() - print("Data has been already downloaded and unpacked!") - os.remove(file_path) - return download_dir - - -def move_files(source_dire,target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire,target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() - + url = urlparse.urljoin(source_url, file) + filepath = data_download(download_path, url) + data_dir = os.path.join(filepath, file.split('.')[0]) + return data_dir -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index 1e93b6dc161224..ca7dce8a9bfb10 100644 --- a/python/paddle/data/recommendation.py +++ b/python/paddle/data/recommendation.py @@ -15,6 +15,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for amazon training api. +# you can use this data for movie recommendation. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the movie recommendation dataset,it untar the dataset,and returns the +# untar directory for training api. 
+# +############################################################################## + import shutil import os @@ -27,10 +38,10 @@ source_url='http://files.grouplens.org/datasets/movielens/ml-1m.zip' -file_source = "mosesdecoder-master" +file_source = "ml-1m" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -41,128 +52,14 @@ def fetch(): path to downloaded file. """ source_name = "recommendation" - #Set the download dir for recommendation. - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.remove(file_path) - print("Unpacking done!") - else: - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. 
- - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) + filepath = data_download(download_path, source_url) + data_path = os.path.join(filepath, file_source) -if __name__ == '__main__': - path = fetch() - print path + return data_path diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index 087e6e6640fea3..8950bb4c98e3b4 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -16,6 +16,18 @@ # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for semantic_role_labeling +# training api.you can use this data for semantic. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the semantic role labeling,it untar the dataset,and returns the untar +# directory for training api. +# +############################################################################ + + import shutil import os import sys @@ -32,8 +44,10 @@ 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' ] +file_source = "conll05st-release" + -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -44,121 +58,31 @@ def fetch(): path to downloaded file. """ source_name = "semantic" - data_home = set_data_path(source_name) - model_path = data_download(data_home, model_url) - for url in source_url: - filepath = data_download(data_home, moses_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) + for url in source_url: + file_name = url.split('/')[-1] + if 'gz' in file_name: + filepath = data_download(download_path, url) + data_path = os.path.join(filepath, file_source) + + sub_file = ['est.wsj.words.gz', 'test.wsj.props.gz'] + words_path = os.path.join(data_path, "test.wsj/words/test.wsj.words.gz") + props_path = os.path.join(data_path, "test.wsj/props/test.wsj.props.gz") + + sub_path = [words_path, props_path] + for sub_file in sub_path: + new_sub_path = os.path.join(download_path, sub_file) + shutil.move(sub_path, new_subpath) + tarfile.open(name=new_subpath, mode="r:gz").extractall(download_path) + os.remove(new_subpath) else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. 
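In the semantic.py fetch() above, the move-and-extract loop looks like it has a few slips: the sub_file list starts with 'est.wsj.words.gz' (presumably 'test.wsj.words.gz'), shutil.move receives the whole sub_path list rather than the current entry, and new_subpath is read where new_sub_path was assigned. A corrected sketch of that step, assuming each listed archive should be moved next to download_path and unpacked with the same tarfile call the patch uses:

```python
import os
import shutil
import tarfile


def relocate_and_unpack(archives, download_path):
    # archives: absolute paths such as
    #   <data_path>/test.wsj/words/test.wsj.words.gz
    for src in archives:
        dst = os.path.join(download_path, os.path.basename(src))
        shutil.move(src, dst)
        tarfile.open(name=dst, mode="r:gz").extractall(download_path)
        os.remove(dst)
```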
- """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ - src_file = url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + filepath = data_download(download_path, url) + return filepath -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index 18146ba93803a7..c1d74c51b60a88 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -16,6 +16,18 @@ # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for sentiment training api. +# you can use this data for sentiment analasis. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the sentiment dataset,it untar the dataset,and returns the untar +# directory for training api. +# +############################################################################ + + import shutil import os import sys @@ -27,10 +39,12 @@ source_url='http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' -file_source = "mosesdecoder-master" + +moses_source = "mosesdecoder-master" +file_source = "aclImdb" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. 
@@ -41,137 +55,20 @@ def fetch(): path to downloaded file. """ source_name = "sentiment" - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) - filepath = data_download(data_home, moses_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - Returns: - the path after data downloaded. - """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Unpacking done!") - else: - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath, stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) + moses_path = data_download(download_path, moses_url) + moses_data = os.path.join(moses_path, moses_source) + filepath = data_download(download_path, source_url) + data_path = os.path.join(filepath, file_source) -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. + return data_path - Returns: - the temp name after urlretrieve downloaded. 
- """ - return urllib.request.urlretrieve(url, filename, rereporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index a9bdd0bf73cd2c..4ead9def25563a 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -25,13 +25,16 @@ from six.moves import urllib import stat + source_url=['http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' ] model_url='http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' +model_source = "wmt14_model" +file_source = "bitexts.selected" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -42,121 +45,19 @@ def fetch(): path to downloaded file. """ source_name = "seqToseq" - data_home = set_data_path(source_name) - model_path = data_download(data_home, model_url) - for url in source_url: - filepath = data_download(data_home, source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. 
- """ - src_file = url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. + model_data = data_download(download_path, model_url) + model_path = os.path.join(model_data, model_source) - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + for url in source_url: + filepath = data_download(download_path, url) + data_path = os.path.join(filepath, file_source) + return data_path -if __name__ == '__main__': - path = fetch() - print path From ee9b1c639470c619cfcbb33e4f51cd589ba48e2b Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 13 Jan 2017 12:29:48 +0800 Subject: [PATCH 08/18] update --- python/paddle/data/DATA.md | 28 -------- python/paddle/data/amazon.py | 11 ++- python/paddle/data/cifar10.py | 8 +-- python/paddle/data/cifar_10.py | 100 --------------------------- python/paddle/data/http_download.py | 15 ++-- python/paddle/data/mnist.py | 11 +-- python/paddle/data/recommendation.py | 7 +- python/paddle/data/semantic.py | 28 ++++---- python/paddle/data/sentiment.py | 12 ++-- python/paddle/data/seqToseq.py | 17 +++-- 10 files changed, 50 insertions(+), 187 deletions(-) delete mode 100644 python/paddle/data/DATA.md delete mode 100644 python/paddle/data/cifar_10.py diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md deleted file mode 100644 index ce186d42619509..00000000000000 --- a/python/paddle/data/DATA.md +++ /dev/null @@ -1,28 +0,0 @@ -## 需求 - -Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载其数据,并进行复杂的预处理过程,整个过程会耗费大量时间。 - -所以我们需要数据封装接口,采用import数据源的方式(如\:import paddle.data.amazon.review.GetJSON)来简化获取训练所需数据的时间;但是如果你习惯自己处理原生数据,我们依然提供原生数据接口来满足你的需求。 - -## 整体思路 - -数据封装接口的目的是提供数据。不论是原生数据,还是预处理数据都通过import方式导入各模型进行训练;考虑到某些模型的预处理后的数据量依然很大,或有时就仅仅想训练相对较小的网络模型,没必要考虑全量数据,自动配置数据量大小必然更符合不同需求。整个接口初步设想如下: -* 开关来控制数据来源 - * 导入数据接口时,带有开关(如:src\_from = True,来自预处理源;否则,来自原生数据源) -* 预处理数据部分添加配置train和test的数据量的大小 -* 原生数据部分的数据下载数据模块化 - * 开关(src\_from = 
False)和<模型,数据源>对完成相关数据的下载 -* 原生数据的预处理部分保持原状,通过<模型,预处理过程>对完成数据的预处理 -* 在paddle的train的配置文件中修改数据源的导入方式 - -整个过程在tensorflow的mnist模型已有人实现,借鉴此思想,实现paddle的各demo数据接口的通用化。 - -```python -amazon = input_data.load_dataset( - 'Amazon', - '/Users/baidu/git/test_package/data', - data_unneed=False, - src_flag=False) -batch = amazon.train.shrink_txt('train',10) -``` - diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index 361a3fa79ded6e..a284e1dc4fd8d8 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -34,7 +34,6 @@ # ######################################################################### - import shutil import os import sys @@ -44,9 +43,8 @@ from six.moves import urllib from http_download import data_download - -source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' -moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +source_url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' +moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' mose_source = "mosesdecoder-master" @@ -63,7 +61,8 @@ def fetch(directory=None): """ source_name = "amazon" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -74,5 +73,3 @@ def fetch(directory=None): filepath = data_download(download_path, source_url) return filepath - - diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py index 72c4c9bcbc11ce..d6d893288a851c 100644 --- a/python/paddle/data/cifar10.py +++ b/python/paddle/data/cifar10.py @@ -32,7 +32,6 @@ # ######################################################################## - import shutil import os import sys @@ -41,8 +40,7 @@ from six.moves import urllib from http_download import data_download - -source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' +source_url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' source_file = "cifar-10-batches-py" @@ -59,7 +57,8 @@ def fetch(directory=None): source_name = "cifar" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -67,4 +66,3 @@ def fetch(directory=None): filepath = data_download(download_path, source_url) return filepath - diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py deleted file mode 100644 index 762d4b2d40ca52..00000000000000 --- a/python/paddle/data/cifar_10.py +++ /dev/null @@ -1,100 +0,0 @@ -#/usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
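The obsolete cifar_10.py deleted here (its remaining body follows below) defined an _unpickle helper that referenced cPickle without importing it. For reference, a self-contained version of that helper; the batch layout noted in the trailing comment is the standard CIFAR-10 python format:

```python
import sys


def unpickle(file_path):
    # cPickle only exists on Python 2; fall back to pickle elsewhere.
    if sys.version_info < (3,):
        import cPickle as pickle
        load = pickle.load
    else:
        import pickle
        load = lambda f: pickle.load(f, encoding='bytes')
    with open(file_path, 'rb') as f:
        return load(f)

# e.g. batch = unpickle('cifar-10-batches-py/data_batch_1')
# batch[b'data'] holds 10000 rows of 3072 uint8 pixels, batch[b'labels'] the 0-9 ids.
```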
- - -import shutil -import os -import sys -import tarfile -import zipfile -import collections -import numpy as np -from six.moves import urllib - -source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' -source_file = "cifar-10-batches-py" -label_map = { -0: "airplane", -1: "automobile", -2: "bird", -3: "cat", -4: "deer", -5: "dog", -6: "frog", -7: "horse", -8: "ship", -9: "truck" -} - -def fetch(): - num_images_train = 50000 - num_batch = 5 - source_name = "cifar" - file_source = "cifar-10-batches-py" - #Set the download dir for cifar. - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - -def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~', '.paddle')) - print data_base - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - -def data_download(download_dir, source_url): - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished,Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Data has been already downloaded and unpacked!") - return download_dir - -def move_files(source_dire, target_dire): - shutil.move(source_dire, target_dire) - -def download_with_urlretrieve(url, filename=None): - return urllib.request.urlretrieve(url, filename) - - -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/http_download.py b/python/paddle/data/http_download.py index bef850da841e48..0f128c19727582 100644 --- a/python/paddle/data/http_download.py +++ b/python/paddle/data/http_download.py @@ -24,7 +24,6 @@ # ######################################################################## - import os import sys import shutil @@ -45,7 +44,8 @@ def download_with_urlretrieve(url, filename=None): Returns: the temp name after urlretrieve downloaded. 
""" - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + return urllib.request.urlretrieve( + url, filename, reporthook=check_download_progress) def check_download_progress(count, block_size, total_size): @@ -82,7 +82,7 @@ def data_download(download_dir, source_url): print file_path if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_name, _ = download_with_urlretrieve(source_url) temp_file_path = os.getcwd() os.rename(temp_file_name, src_file) shutil.move(src_file, download_dir) @@ -95,9 +95,9 @@ def data_download(download_dir, source_url): tar.extract(file, download_dir) fpath = os.path.join(download_dir, file.filename) if 'master' in src_file: - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.chmod(fpath, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) os.remove(file_path) - elif src_file in ['.json.gz','txt','emb','python.tar.gz']: + elif src_file in ['.json.gz', 'txt', 'emb', 'python.tar.gz']: pass elif src_file.split('.')[-1] is 'gz': tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) @@ -111,9 +111,9 @@ def data_download(download_dir, source_url): tar.extract(file, download_dir) fpath = os.path.join(download_dir, file.filename) if 'master' in src_file: - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.chmod(fpath, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) os.remove(file_path) - elif src_file in ['.json.gz','txt','emb']: + elif src_file in ['.json.gz', 'txt', 'emb']: pass elif src_file.split('.')[-1] is 'gz': tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) @@ -121,4 +121,3 @@ def data_download(download_dir, source_url): print("Data has been already downloaded and unpacked!") return download_dir - diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index c084954848b985..151e50c3d65a29 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -26,7 +26,6 @@ # ############################################################################ - import shutil import os import sys @@ -36,9 +35,11 @@ import urlparse import gzip - source_url = 'http://yann.lecun.com/exdb/mnist/' -filename = ['train-images-idx3-ubyte.gz','t10k-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-labels-idx1-ubyte.gz'] +filename = [ + 'train-images-idx3-ubyte.gz', 't10k-images-idx3-ubyte.gz', + 'train-labels-idx1-ubyte.gz', 't10k-labels-idx1-ubyte.gz' +] def fetch(directory=None): @@ -54,7 +55,8 @@ def fetch(directory=None): source_name = "mnist" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -65,4 +67,3 @@ def fetch(directory=None): filepath = data_download(download_path, url) data_dir = os.path.join(filepath, file.split('.')[0]) return data_dir - diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index ca7dce8a9bfb10..387180231c9fcb 100644 --- a/python/paddle/data/recommendation.py +++ b/python/paddle/data/recommendation.py @@ -26,7 +26,6 @@ # ############################################################################## - import shutil import os import sys @@ -36,8 +35,7 @@ from six.moves import urllib import stat - -source_url='http://files.grouplens.org/datasets/movielens/ml-1m.zip' +source_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' file_source = 
"ml-1m" @@ -53,7 +51,8 @@ def fetch(directory=None): """ source_name = "recommendation" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index 8950bb4c98e3b4..d9b7367044579e 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - ############################################################################ # # Function for fetch the data untar directory for semantic_role_labeling @@ -27,7 +26,6 @@ # ############################################################################ - import shutil import os import sys @@ -37,12 +35,13 @@ from six.moves import urllib import stat -source_url=['http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' - ] +source_url = [ + 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +] file_source = "conll05st-release" @@ -59,7 +58,8 @@ def fetch(directory=None): """ source_name = "semantic" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -72,17 +72,19 @@ def fetch(directory=None): data_path = os.path.join(filepath, file_source) sub_file = ['est.wsj.words.gz', 'test.wsj.props.gz'] - words_path = os.path.join(data_path, "test.wsj/words/test.wsj.words.gz") - props_path = os.path.join(data_path, "test.wsj/props/test.wsj.props.gz") + words_path = os.path.join(data_path, + "test.wsj/words/test.wsj.words.gz") + props_path = os.path.join(data_path, + "test.wsj/props/test.wsj.props.gz") sub_path = [words_path, props_path] for sub_file in sub_path: new_sub_path = os.path.join(download_path, sub_file) shutil.move(sub_path, new_subpath) - tarfile.open(name=new_subpath, mode="r:gz").extractall(download_path) + tarfile.open( + name=new_subpath, mode="r:gz").extractall(download_path) os.remove(new_subpath) else: filepath = data_download(download_path, url) return filepath - diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index c1d74c51b60a88..ea4193e53ed299 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - ############################################################################ # # Function for fetch the data untar directory for sentiment training api. 
@@ -27,7 +26,6 @@ # ############################################################################ - import shutil import os import sys @@ -37,8 +35,8 @@ from six.moves import urllib import stat -source_url='http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' -moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +source_url = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' moses_source = "mosesdecoder-master" file_source = "aclImdb" @@ -56,7 +54,8 @@ def fetch(directory=None): """ source_name = "sentiment" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -69,6 +68,3 @@ def fetch(directory=None): data_path = os.path.join(filepath, file_source) return data_path - - - diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index 4ead9def25563a..8850292f0dda87 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import shutil import os import sys @@ -25,15 +24,16 @@ from six.moves import urllib import stat - -source_url=['http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', - 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' - ] -model_url='http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' +source_url = [ + 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', + 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' +] +model_url = 'http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' model_source = "wmt14_model" file_source = "bitexts.selected" + def fetch(directory=None): """ According to the source name,set the download path for source, @@ -46,7 +46,8 @@ def fetch(directory=None): """ source_name = "seqToseq" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -59,5 +60,3 @@ def fetch(directory=None): filepath = data_download(download_path, url) data_path = os.path.join(filepath, file_source) return data_path - - From c53599f1d78aecec3e619d576e09df04df38fe78 Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 13 Jan 2017 12:33:36 +0800 Subject: [PATCH 09/18] update --- python/paddle/data/mnist.py | 1 + python/paddle/data/recommendation.py | 2 ++ python/paddle/data/semantic.py | 3 ++- python/paddle/data/sentiment.py | 1 + python/paddle/data/seqToseq.py | 2 ++ 5 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 151e50c3d65a29..5fe6f6dccc0875 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -34,6 +34,7 @@ from six.moves import urllib import urlparse import gzip +from http_download import data_download source_url = 'http://yann.lecun.com/exdb/mnist/' filename = [ diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index 387180231c9fcb..6c3fba55c8919b 100644 --- a/python/paddle/data/recommendation.py +++ 
b/python/paddle/data/recommendation.py @@ -34,6 +34,8 @@ import numpy as np from six.moves import urllib import stat +from http_download import data_download + source_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' file_source = "ml-1m" diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index d9b7367044579e..dfafb5120cf88c 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -33,7 +33,8 @@ import collections import numpy as np from six.moves import urllib -import stat +from http_download import data_download + source_url = [ 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index ea4193e53ed299..e0a72e0d9b9809 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -34,6 +34,7 @@ import numpy as np from six.moves import urllib import stat +from http_download import data_download source_url = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index 8850292f0dda87..ced53be2d1a9d5 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -23,6 +23,8 @@ import numpy as np from six.moves import urllib import stat +from http_download import data_download + source_url = [ 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', From 6173153da0006699ab020dde8418cf54575eb77f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Jan 2017 16:20:53 +0800 Subject: [PATCH 10/18] Refine amazon_product_reviews.py --- python/paddle/data/amazon.py | 75 ------------ python/paddle/data/amazon_product_reviews.py | 119 +++++++++++++++++++ python/paddle/data/http_download.py | 48 ++++---- python/paddle/data/logger.py | 5 + 4 files changed, 147 insertions(+), 100 deletions(-) delete mode 100644 python/paddle/data/amazon.py create mode 100644 python/paddle/data/amazon_product_reviews.py create mode 100644 python/paddle/data/logger.py diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py deleted file mode 100644 index a284e1dc4fd8d8..00000000000000 --- a/python/paddle/data/amazon.py +++ /dev/null @@ -1,75 +0,0 @@ -#/usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -######################################################################## -# -# Function for fetch the data untar directory for amazon training api. -# As the python can read the data in "reviews_Electronics_5.json.gz", -#here is no need to untar the data. -# -# -# First,we let the data download path is "~/paddle_data_directory" -# when u no special the download path. -# -# -# Then,download the data,according to the speical source url. -# Here,no need to untar the "reviews_Electronics_5.json.gz". 
-# -# After download the data,return the path of data. -# -# -######################################################################### - -import shutil -import os -import sys -import zipfile -import collections -import stat -from six.moves import urllib -from http_download import data_download - -source_url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' -moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' - -mose_source = "mosesdecoder-master" - - -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - - Args: - - Returns: - path for the data untar. - """ - source_name = "amazon" - if directory is None: - directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) - - moses_src = data_download(download_path, moses_url) - moses_path = os.path.join(moses_src, mose_source) - - filepath = data_download(download_path, source_url) - return filepath diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py new file mode 100644 index 00000000000000..5a282293eb1be4 --- /dev/null +++ b/python/paddle/data/amazon_product_reviews.py @@ -0,0 +1,119 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A utility for fetching, reading amazon product review data set. 
+ +http://jmcauley.ucsd.edu/data/amazon/ +""" + +import os +from http_download import download +from logger import logger +import hashlib + +BASE_URL = 'http://snap.stanford.edu/data/' \ + 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' + + +class Categories(object): + Books = "Books" + Electronics = "Electronics" + MoviesAndTV = "Movies_and_TV" + CDsAndVinyl = "CDs_and_Vinyl" + ClothingShoesAndJewelry = "Clothing_Shoes_and_Jewelry" + HomeAndKitchen = "Home_and_Kitchen" + KindleStore = "Kindle_Store" + SportsAndOutdoors = "Sports_and_Outdoors" + CellPhonesAndAccessories = "Cell_Phones_and_Accessories" + HealthAndPersonalCare = "Health_and_Personal_Care" + ToysAndGames = "Toys_and_Games" + VideoGames = "Video_Games" + ToolsAndHomeImprovement = "Tools_and_Home_Improvement" + Beauty = "Beauty" + AppsForAndroid = "Apps_for_Android" + OfficeProducts = "Office_Products" + PetSupplies = "Pet_Supplies" + Automotive = "Automotive" + GroceryAndGourmetFood = "Grocery_and_Gourmet" + PatioLawnAndGarden = "Patio_Lawn_and_Garden" + Baby = "Baby" + DigitalMusic = "Digital_Music" + MusicalInstruments = "Musical_Instruments" + AmazonInstantVideo = "Amazon_Instant_Video" + + __md5__ = dict() + + __md5__[AmazonInstantVideo] = '10812e43e99c345f63333d8ee10aef6a' + __md5__[AppsForAndroid] = 'a7d1ae198b862eea6910fe45c842b0c6' + __md5__[Automotive] = '757fdb1ab2c5e2fc0934047721082011' + __md5__[Baby] = '7698a4179a1d8385e946ed9083490d22' + __md5__[Beauty] = '5d2ccdcd86641efcfbae344317c10829' + + +__all__ = ['fetch', 'Categories'] + + +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + + Args: + + Returns: + path for the data untar. + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + + if not os.path.exists(directory): + os.makedirs(directory) + logger.info("Downloading amazon review dataset for %s category" % category) + return download(BASE_URL % category, + os.path.join(directory, '%s.json.gz' % category)) + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def main(): + categories = filter( + lambda c: getattr(Categories, c) not in Categories.__md5__.keys(), + filter(lambda c: c[0] != '_', dir(Categories))) + + for each in categories: + try: + filename = fetch(category=getattr(Categories, each)) + except Exception as e: + print type(e) + continue + print each, calculate_md5(filename) + os.remove(filename) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/data/http_download.py b/python/paddle/data/http_download.py index 0f128c19727582..668dc9966778a8 100644 --- a/python/paddle/data/http_download.py +++ b/python/paddle/data/http_download.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -33,36 +33,31 @@ from six.moves import urllib -def download_with_urlretrieve(url, filename=None): +def download_with_urlretrieve(url, filename=None, with_progress=True): """ Download each file with urlretrieve,and the download process can be seen. - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. 
+ :param url: the url for data download. + :type url: basestring + :param filename: Output file name. None if use default file name. + :type filename: basestring + :param with_progress: with progress bar or not. Default is true. + :type with_progress: bool + :return: the downloaded filename + :rtype: basestring """ - return urllib.request.urlretrieve( - url, filename, reporthook=check_download_progress) + def check_download_progress(count, block_size, total_size): + percent = float(count * block_size) / total_size + msg = "\r- Downloading {1} progress: {0:.1%}".format(percent, filename) + sys.stdout.write(msg) + sys.stdout.flush() -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: + hook = None + if with_progress: + hook = check_download_progress - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + return urllib.request.urlretrieve(url, filename, reporthook=hook)[0] def data_download(download_dir, source_url): @@ -82,7 +77,7 @@ def data_download(download_dir, source_url): print file_path if not os.path.exists(file_path): - temp_file_name, _ = download_with_urlretrieve(source_url) + temp_file_name = download_with_urlretrieve(source_url) temp_file_path = os.getcwd() os.rename(temp_file_name, src_file) shutil.move(src_file, download_dir) @@ -121,3 +116,6 @@ def data_download(download_dir, source_url): print("Data has been already downloaded and unpacked!") return download_dir + + +download = download_with_urlretrieve diff --git a/python/paddle/data/logger.py b/python/paddle/data/logger.py new file mode 100644 index 00000000000000..52b0df4535a491 --- /dev/null +++ b/python/paddle/data/logger.py @@ -0,0 +1,5 @@ +import logging + +__all__ = ['__logger__'] + +logger = logging.getLogger("paddle.data") From 7972f74f141e7b8f37495f2c351294c8b2b32a33 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Jan 2017 16:53:34 +0800 Subject: [PATCH 11/18] Add md5 checks. 
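Record the published md5 checksum for each review category and skip the download whenever a cached archive already matches it. For reference, a minimal usage sketch under the defaults this patch introduces (the call site is hypothetical; fetch, Categories and calculate_md5 are the helpers added here):

    from paddle.data import amazon_product_reviews as amazon

    # The first call downloads the Electronics review archive into
    # ~/paddle_data/amazon; later calls return the cached file as soon as
    # calculate_md5(fn) matches Categories.__md5__[category].
    path = amazon.fetch(category=amazon.Categories.Electronics)
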
--- python/paddle/data/amazon_product_reviews.py | 61 +++++++++++--------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index 5a282293eb1be4..2f459bf98b2b60 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -62,11 +62,38 @@ class Categories(object): __md5__[Automotive] = '757fdb1ab2c5e2fc0934047721082011' __md5__[Baby] = '7698a4179a1d8385e946ed9083490d22' __md5__[Beauty] = '5d2ccdcd86641efcfbae344317c10829' + __md5__[Books] = 'bc1e2aa650fe51f978e9d3a7a4834bc6' + __md5__[CDsAndVinyl] = '82bffdc956e76c32fa655b98eca9576b' + __md5__[CellPhonesAndAccessories] = '903a19524d874970a2f0ae32a175a48f' + __md5__[ClothingShoesAndJewelry] = 'b333fba48651ea2309288aeb51f8c6e4' + __md5__[DigitalMusic] = '35e62f7a7475b53714f9b177d9dae3e7' + __md5__[Electronics] = 'e4524af6c644cd044b1969bac7b62b2a' + __md5__[GroceryAndGourmetFood] = 'd8720f98ea82c71fa5c1223f39b6e3d9' + __md5__[HealthAndPersonalCare] = '352ea1f780a8629783220c7c9a9f7575' + __md5__[HomeAndKitchen] = '90221797ccc4982f57e6a5652bea10fc' + __md5__[KindleStore] = 'b608740c754287090925a1a186505353' + __md5__[MoviesAndTV] = 'd3bb01cfcda2602c07bcdbf1c4222997' + __md5__[MusicalInstruments] = '8035b6e3f9194844785b3f4cee296577' + __md5__[OfficeProducts] = '1b7e64c707ecbdcdeca1efa09b716499' + __md5__[PatioLawnAndGarden] = '4d2669abc5319d0f073ec3c3a85f18af' + __md5__[PetSupplies] = '40568b32ca1536a4292e8410c5b9de12' + __md5__[SportsAndOutdoors] = '1df6269552761c82aaec9667bf9a0b1d' + __md5__[ToolsAndHomeImprovement] = '80bca79b84621d4848a88dcf37a1c34b' + __md5__[ToysAndGames] = 'dbd07c142c47473c6ee22b535caee81f' + __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' __all__ = ['fetch', 'Categories'] +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + def fetch(category=None, directory=None): """ According to the source name,set the download path for source, @@ -87,33 +114,13 @@ def fetch(category=None, directory=None): if not os.path.exists(directory): os.makedirs(directory) - logger.info("Downloading amazon review dataset for %s category" % category) - return download(BASE_URL % category, - os.path.join(directory, '%s.json.gz' % category)) - -def calculate_md5(fn): - h = hashlib.md5() - with open(fn, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - h.update(chunk) - return h.hexdigest() - - -def main(): - categories = filter( - lambda c: getattr(Categories, c) not in Categories.__md5__.keys(), - filter(lambda c: c[0] != '_', dir(Categories))) + fn = os.path.join(directory, '%s.json.gz' % category) - for each in categories: - try: - filename = fetch(category=getattr(Categories, each)) - except Exception as e: - print type(e) - continue - print each, calculate_md5(filename) - os.remove(filename) + if os.path.exists(fn) and \ + calculate_md5(category) == Categories.__md5__[category]: + # already download. 
+ return fn - -if __name__ == '__main__': - main() + logger.info("Downloading amazon review dataset for %s category" % category) + return download(BASE_URL % category, fn) From 294f2981a8c3c512f535bc6c0c3258022d6f48ab Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Jan 2017 17:40:41 +0800 Subject: [PATCH 12/18] Add preprocess method for amazon reviews --- paddle/setup.py.in | 2 + python/paddle/data/amazon_product_reviews.py | 91 +++++++++++++++++++- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/paddle/setup.py.in b/paddle/setup.py.in index e3650bf1c0c469..75c8325aec7bdc 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -69,6 +69,8 @@ setup(name="py_paddle", packages=['py_paddle'], include_dirs = include_dirs, install_requires = [ + 'h5py', + 'nltk', 'numpy>=1.8.0', # The numpy is required. 'protobuf>=2.4.1' # The paddle protobuf version ], diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index 2f459bf98b2b60..4cc56e4c6f77fa 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -23,7 +23,13 @@ import os from http_download import download from logger import logger +import gzip +import json import hashlib +import nltk +import collections +import h5py +import numpy BASE_URL = 'http://snap.stanford.edu/data/' \ 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' @@ -83,7 +89,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories'] +__all__ = ['fetch', 'Categories', 'preprocess'] def calculate_md5(fn): @@ -118,9 +124,90 @@ def fetch(category=None, directory=None): fn = os.path.join(directory, '%s.json.gz' % category) if os.path.exists(fn) and \ - calculate_md5(category) == Categories.__md5__[category]: + calculate_md5(fn) == Categories.__md5__[category]: # already download. return fn logger.info("Downloading amazon review dataset for %s category" % category) return download(BASE_URL % category, fn) + + +def preprocess(category=None, directory=None): + """ + Download and preprocess amazon reviews data set. Save the preprocessed + result to hdf5 file. + + In preprocess, it uses nltk to tokenize english sentence. It is slightly + different from moses. But nltk is a pure python library, it could be + integrated well with Paddle. + + :return: hdf5 file name. + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + + preprocess_fn = os.path.join(directory, '%s.hdf5' % category) + raw_file_fn = fetch(category, directory) + + word_dict = collections.defaultdict(int) + if not os.path.exists(preprocess_fn): # already preprocessed + with gzip.open(raw_file_fn, mode='r') as f: + for sample_num, line in enumerate(f): + txt = json.loads(line)['reviewText'] + try: # automatically download nltk tokenizer data. 
+ words = nltk.tokenize.word_tokenize(txt, 'english') + except LookupError: + nltk.download('punkt') + words = nltk.tokenize.word_tokenize(txt, 'english') + for each_word in words: + word_dict[each_word] += 1 + sample_num += 1 + + word_dict_sorted = [] + for each in word_dict: + word_dict_sorted.append((each, word_dict[each])) + + word_dict_sorted.sort(cmp=lambda a, b: a[1] > b[1]) + + word_dict = dict() + + h5file = h5py.File(preprocess_fn, 'w') + try: + word_dict_h5 = h5file.create_dataset( + 'word_dict', + shape=(len(word_dict_sorted), ), + dtype=h5py.special_dtype(vlen=str)) + for i, each in enumerate(word_dict_sorted): + word_dict_h5[i] = each[0] + word_dict[each[0]] = i + + sentence = h5file.create_dataset( + 'sentence', + shape=(sample_num, ), + dtype=h5py.special_dtype(vlen=numpy.int32)) + + label = h5file.create_dataset( + 'label', shape=(sample_num, 1), dtype=numpy.int8) + + with gzip.open(raw_file_fn, mode='r') as f: + for i, line in enumerate(f): + obj = json.loads(line) + txt = obj['reviewText'] + score = numpy.int8(obj['overall']) + words = nltk.tokenize.word_tokenize(txt, 'english') + words = numpy.array( + [word_dict[w] for w in words], dtype=numpy.int32) + sentence[i] = words + label[i] = score + + finally: + h5file.close() + return preprocess_fn + + +if __name__ == '__main__': + preprocess(category=Categories.AmazonInstantVideo) From 20c96b73040f4b880618ddab0f195d8b8babd655 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 14 Jan 2017 15:53:05 +0800 Subject: [PATCH 13/18] Done with amazon product reviews. --- python/paddle/data/amazon_product_reviews.py | 84 ++++++++++++++++++- python/paddle/data/base.py | 15 ++++ python/test/__init__.py | 0 python/test/data/__init__.py | 0 .../test/data/test_amazon_product_reviews.py | 24 ++++++ 5 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 python/paddle/data/base.py create mode 100644 python/test/__init__.py create mode 100644 python/test/data/__init__.py create mode 100644 python/test/data/test_amazon_product_reviews.py diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index 4cc56e4c6f77fa..bce74cb1a8bd8e 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -23,6 +23,7 @@ import os from http_download import download from logger import logger +from base import BaseDataSet import gzip import json import hashlib @@ -34,6 +35,9 @@ BASE_URL = 'http://snap.stanford.edu/data/' \ 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' + class Categories(object): Books = "Books" @@ -89,7 +93,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories', 'preprocess'] +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] def calculate_md5(fn): @@ -186,12 +190,12 @@ def preprocess(category=None, directory=None): word_dict[each[0]] = i sentence = h5file.create_dataset( - 'sentence', + DATASET_SENTENCE, shape=(sample_num, ), dtype=h5py.special_dtype(vlen=numpy.int32)) label = h5file.create_dataset( - 'label', shape=(sample_num, 1), dtype=numpy.int8) + DATASET_LABEL, shape=(sample_num, 1), dtype=numpy.int8) with gzip.open(raw_file_fn, mode='r') as f: for i, line in enumerate(f): @@ -209,5 +213,77 @@ def preprocess(category=None, directory=None): return preprocess_fn +class AmazonProductReviewsDataSet(BaseDataSet): + def __init__(self, + category=None, + directory=None, + test_ratio=0.1, + 
positive_threshold=5, + negative_threshold=2, + random_seed=0): + super(AmazonProductReviewsDataSet, self).__init__( + random_seed=random_seed) + + fn = preprocess(category=category, directory=directory) + + self.__h5file__ = h5py.File(fn, 'r') + + self.__label__ = self.__h5file__[DATASET_LABEL] + self.__sentence__ = self.__h5file__[DATASET_SENTENCE] + + positive_idx = [] + negative_idx = [] + for i, lbl in enumerate(self.__label__): + if lbl >= positive_threshold: + positive_idx.append(i) + elif lbl <= negative_threshold: + negative_idx.append(i) + + positive_len = int(test_ratio * len(positive_idx)) + negative_len = int(test_ratio * len(negative_idx)) + + self.__train_set__ = positive_idx[positive_len:] + negative_idx[ + negative_len:] + self.__test_set__ = positive_idx[: + positive_len] + negative_idx[: + negative_len] + self.__test_set__.sort() + self.__positive_threshold__ = positive_threshold + self.__negative_threshold__ = negative_threshold + self.__is_reading_train_data__ = False + + def __read_data__(self, idx): + return self.__sentence__[ + idx], self.__label__ >= self.__positive_threshold__ + + def train_data(self): + if self.__is_reading_train_data__: + raise RuntimeError("Should not get multiple train_data generators") + + self.__is_reading_train_data__ = True + try: + self.__random__.shuffle(self.__train_set__) + for each_id in self.__train_set__: + yield self.__read_data__(each_id) + finally: + self.__is_reading_train_data__ = False + + def test_data(self): + for each_id in self.__test_set__: + yield self.__read_data__(each_id) + + def __del__(self): + self.__h5file__.close() + + +dataset = AmazonProductReviewsDataSet + if __name__ == '__main__': - preprocess(category=Categories.AmazonInstantVideo) + ds = dataset(category=Categories.AmazonInstantVideo) + + for each_train_data in ds.train_data(): + # print each_train_data + pass + + for each_test_data in ds.test_data(): + pass diff --git a/python/paddle/data/base.py b/python/paddle/data/base.py new file mode 100644 index 00000000000000..3f049527a762c8 --- /dev/null +++ b/python/paddle/data/base.py @@ -0,0 +1,15 @@ +import random + +__all__ = ['BaseDataSet'] + + +class BaseDataSet(object): + def __init__(self, random_seed): + self.__random__ = random.Random() + self.__random__.seed(random_seed) + + def train_data(self): + raise NotImplemented() + + def test_data(self): + raise NotImplemented() diff --git a/python/test/__init__.py b/python/test/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/test/data/__init__.py b/python/test/data/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/test/data/test_amazon_product_reviews.py b/python/test/data/test_amazon_product_reviews.py new file mode 100644 index 00000000000000..8efcfd1f8268fc --- /dev/null +++ b/python/test/data/test_amazon_product_reviews.py @@ -0,0 +1,24 @@ +import unittest +from paddle.data import amazon_product_reviews + + +class AmazonReviewsTest(unittest.TestCase): + def test_read_data(self): + dataset = amazon_product_reviews.dataset( + category=amazon_product_reviews.Categories.AmazonInstantVideo, + positive_threshold=4, + negative_threshold=3) + + sample_num = 0 + + for _ in dataset.train_data(): + sample_num += 1 + + for _ in dataset.test_data(): + sample_num += 1 + + self.assertEqual(37126, sample_num) + + +if __name__ == '__main__': + unittest.main() From 0c76c644fe2575a367c286aa6ded3b6e57ac79d0 Mon Sep 17 00:00:00 2001 From: baidu Date: Mon, 16 Jan 2017 18:42:50 +0800 Subject: 
[PATCH 14/18] add new file path --- python/paddle/data/cifar.py | 77 +++++++++++++++++ python/paddle/data/mnist.py | 110 +++++++++++++++--------- python/paddle/data/recommendation.py | 98 +++++++++++++-------- python/paddle/data/semantic.py | 122 +++++++++++++-------------- python/paddle/data/seqToseq.py | 100 ++++++++++++++-------- 5 files changed, 338 insertions(+), 169 deletions(-) create mode 100644 python/paddle/data/cifar.py diff --git a/python/paddle/data/cifar.py b/python/paddle/data/cifar.py new file mode 100644 index 00000000000000..e038ebd76b0fd4 --- /dev/null +++ b/python/paddle/data/cifar.py @@ -0,0 +1,77 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A utility for fetching, reading CIFAR-10 dataset. + +https://www.cs.toronto.edu/~kriz/cifar.html +""" + +import os +from http_download import download +from logger import logger +import hashlib + +BASE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-%s-python.tar.gz' + + +class Categories(object): + Ten = 10 + Hundred = 100 + + __md5__ = dict() + + __md5__[Ten] = 'c58f30108f718f92721af3b95e74349a' + __md5__[Hundred] = 'eb9058c3a382ffc7106e4002c42a8d85' + +__all__ = ['fetch', 'Categories'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to untar file. + """ + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory', 'cifar')) + + if not os.path.exists(directory): + os.makedirs(directory) + + cn = 'cifar' + category + fn = os.path.join(directory, '%s.tar.gz' % cn) + + if os.path.exists(fn) and calculate_md5(fn) == Categories.__md5__[category]: + return fn + + logger.info("Downloading cifar dataset for %s category" % cn) + return download(BASE_URL % category, + os.path.join(directory, '%s.tar.gz' % cn)) diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 5fe6f6dccc0875..23a7ed46b633e8 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -14,57 +14,89 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading mnist handwritten digit dataset. -############################################################################ -# -# Function for fetch the data untar directory for mnist training api. -# you can use this data for Digital identification. -# -# First,we special the data download directory is "~/paddle_data_directory". 
-# For the mnist dataset,it untar the dataset,and returns the untar -# directory for training api. -# -############################################################################ +http://yann.lecun.com/exdb/mnist/ +""" -import shutil import os -import sys -import collections -import numpy as np -from six.moves import urllib -import urlparse +from http_download import download +from logger import logger +from base import BaseDataSet import gzip -from http_download import data_download +import json +import hashlib +import nltk +import collections +import h5py +import numpy -source_url = 'http://yann.lecun.com/exdb/mnist/' -filename = [ - 'train-images-idx3-ubyte.gz', 't10k-images-idx3-ubyte.gz', - 'train-labels-idx1-ubyte.gz', 't10k-labels-idx1-ubyte.gz' -] +BASE_URL = 'http://yann.lecun.com/exdb/mnist/%s-ubyte.gz' -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. +class Categories(object): + TrainImage = 'train-images-idx3' + TrainLabels = 'train-labels-idx1' + TestImage = 't10k-images-idx3' + TestLabels = 't10k-labels-idx1' + + All = [TrainImage, TrainLabels, TestImage, TestLabels] + + __md5__ = dict() + + __md5__[TrainImage] = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' + __md5__[TrainLabels] = 'd53e105ee54ea40749a09fcbcd1e9432' + __md5__[TestImage] = '9fb629c4189551a2d022fa330f9573f3' + __md5__[TestLabels] = 'ec29112dd5afa0611ce80d1b7f02629c' - Args: - Returns: - path for untar file. +__all__ = ['fetch', 'Categories'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch_data(category=None, directory=None): + """ + Calculate each md5 value. + :param category: + :param directory: + :return: """ - source_name = "mnist" + cn = category + '-ubyte' + fn = os.path.join(directory, '%s.gz' % cn) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + return fn + logger.info("Downloading mnist handwritten digit dataset for %s category" % cn) + return download(BASE_URL % category, fn) + +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + :param category: + :param directory: + :return: + """ if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) + os.path.join('~', 'paddle_data', 'mnist')) - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + if not os.path.exists(directory): + os.makedirs(directory) - for file in filename: - url = urlparse.urljoin(source_url, file) - filepath = data_download(download_path, url) - data_dir = os.path.join(filepath, file.split('.')[0]) - return data_dir + if category is None: + category = [category for category in Categories.All] + fl = [] # download file list + for index, line in range(len(category)): + fl.append(fetch_data(line, directory)) + return fl diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index 6c3fba55c8919b..602ac257ea9b21 100644 --- a/python/paddle/data/recommendation.py +++ b/python/paddle/data/recommendation.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved @@ -14,53 +14,81 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading MovieLens dataset. -############################################################################ -# -# Function for fetch the data untar directory for amazon training api. -# you can use this data for movie recommendation. -# -# First,we special the data download directory is "~/paddle_data_directory". -# For the movie recommendation dataset,it untar the dataset,and returns the -# untar directory for training api. -# -############################################################################## +http://files.grouplens.org/datasets/movielens +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -import stat -from http_download import data_download +import h5py +import numpy +BASE_URL = 'http://files.grouplens.org/datasets/movielens/%s.zip' -source_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' -file_source = "ml-1m" +class Categories(object): + M1m = "ml-1m" + M10m = "ml-10m" + M20m = "ml-20m" + M100k = "ml-100k" + MLatestSmall = "ml-latest-small" + MLatest = "ml-latest" + + __md5__ = dict() + + __md5__[M1m] = 'c4d9eecfca2ab87c1945afe126590906' + __md5__[M10m] = 'ce571fd55effeba0271552578f2648bd' + __md5__[M20m] = 'cd245b17a1ae2cc31bb14903e1204af3' + __md5__[M100k] = '0e33842e24a9c977be4e0107933c0723' + __md5__[MLatestSmall] = 'be5b02baacd9e70dd97734ea0e19528a' + __md5__[MLatest] = '0c827eaafc7e89c455986510827662bd' -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - Args: +__all__ = ['fetch', 'Categories', 'preprocess'] - Returns: - path to downloaded file. + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch(category=None, directory=None): """ - source_name = "recommendation" + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.M1m + if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) + os.path.join('~', 'paddle_data', 'recommendation')) + + if not os.path.exists(directory): + os.makedirs(directory) - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + fn = os.path.join(directory, '%s.zip' % category) - filepath = data_download(download_path, source_url) - data_path = os.path.join(filepath, file_source) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + # already download. 
+ return fn - return data_path + logger.info("Downloading MovieLens dataset for %s category" % category) + return download(BASE_URL % category, fn) diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index dfafb5120cf88c..bf04c044c6322e 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -14,78 +14,76 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading semantic data set. -############################################################################ -# -# Function for fetch the data untar directory for semantic_role_labeling -# training api.you can use this data for semantic. -# -# First,we special the data download directory is "~/paddle_data_directory". -# For the semantic role labeling,it untar the dataset,and returns the untar -# directory for training api. -# -############################################################################ +http://www.cs.upc.edu/~srlconll +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -from http_download import data_download +import h5py +import numpy +BASE_URL = 'http://www.cs.upc.edu/~srlconll/%s.tar.gz' -source_url = [ - 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' -] +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' -file_source = "conll05st-release" +class Categories(object): + Conll05test = "conll05st-tests" -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. + __md5__ = dict() + + __md5__[Conll05test] = '387719152ae52d60422c016e92a742fc' + + +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() - Args: - Returns: - path to downloaded file. +def fetch(category=None, directory=None): """ - source_name = "semantic" + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. 
+ :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Conll05test + if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) - - for url in source_url: - file_name = url.split('/')[-1] - if 'gz' in file_name: - filepath = data_download(download_path, url) - data_path = os.path.join(filepath, file_source) - - sub_file = ['est.wsj.words.gz', 'test.wsj.props.gz'] - words_path = os.path.join(data_path, - "test.wsj/words/test.wsj.words.gz") - props_path = os.path.join(data_path, - "test.wsj/props/test.wsj.props.gz") - - sub_path = [words_path, props_path] - for sub_file in sub_path: - new_sub_path = os.path.join(download_path, sub_file) - shutil.move(sub_path, new_subpath) - tarfile.open( - name=new_subpath, mode="r:gz").extractall(download_path) - os.remove(new_subpath) - else: - filepath = data_download(download_path, url) - - return filepath + os.path.join('~', 'paddle_data', 'amazon')) + + if not os.path.exists(directory): + os.makedirs(directory) + + fn = os.path.join(directory, '%s.json.gz' % category) + + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + # already download. + return fn + + logger.info("Downloading amazon review dataset for %s category" % category) + return download(BASE_URL % category, fn) + + diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index ced53be2d1a9d5..8cdfd4ee68b959 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -14,51 +14,85 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading sequence to sequence data set. 
+ +http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -import stat -from http_download import data_download +import h5py +import numpy + +BASE_URL = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/%s.tgz' + +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' + + +class Categories(object): + BiTexts = "bitexts" + DevTest = "dev+test" + All = [BiTexts, DevTest] + + __md5__ = dict() + __md5__[BiTexts] = '15861dbac4a52c8c75561d5027062d7d' + __md5__[DevTest] = '7d7897317ddd8ba0ae5c5fa7248d3ff5' -source_url = [ - 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', - 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' -] -model_url = 'http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] -model_source = "wmt14_model" -file_source = "bitexts.selected" +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() -def fetch(directory=None): + +def fetch_data(category=None, directory=None): """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. + Calculate each md5 value. + :param category: + :param directory: + :return: + """ + fn = os.path.join(directory, '%s.tgz' % category) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + return fn + logger.info("Downloading mnist handwritten digit dataset for %s category" % category) + return download(BASE_URL % category, fn) - Args: - Returns: - path to downloaded file. +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. 
+ :param category: + :param directory: + :return: """ - source_name = "seqToseq" if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + os.path.join('~', 'paddle_data', 'seqToseq')) - model_data = data_download(download_path, model_url) - model_path = os.path.join(model_data, model_source) + if not os.path.exists(directory): + os.makedirs(directory) - for url in source_url: - filepath = data_download(download_path, url) - data_path = os.path.join(filepath, file_source) - return data_path + if category is None: + category = [category for category in Categories.All] + fl = [] # download file list + for index, line in range(len(category)): + fl.append(fetch_data(line, directory)) + return fl From 9a803d091ca84dc1457d4f19ecb34a0d4a5e6ffd Mon Sep 17 00:00:00 2001 From: baidu Date: Mon, 16 Jan 2017 18:46:54 +0800 Subject: [PATCH 15/18] update file path --- python/paddle/data/sentiment.py | 94 +++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index e0a72e0d9b9809..ef3300f2678d29 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -14,58 +14,74 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading sentiment data set. -############################################################################ -# -# Function for fetch the data untar directory for sentiment training api. -# you can use this data for sentiment analasis. -# -# First,we special the data download directory is "~/paddle_data_directory". -# For the sentiment dataset,it untar the dataset,and returns the untar -# directory for training api. -# -############################################################################ +http://ai.stanford.edu/%7Eamaas/data/sentiment +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -import stat -from http_download import data_download +import h5py +import numpy -source_url = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' -moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' +BASE_URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/%s.tar.gz' -moses_source = "mosesdecoder-master" -file_source = "aclImdb" +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - Args: +class Categories(object): + AclImdb = "aclImdb_v1" + + __md5__ = dict() + + __md5__[AclImdb] = '7c2ac02c03563afcf9b574c7e56c153a' - Returns: - path to downloaded file. 
+__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch(category=None, directory=None): """ - source_name = "sentiment" + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.AclImdb + if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) + os.path.join('~', 'paddle_data', 'sentiment')) - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + if not os.path.exists(directory): + os.makedirs(directory) - moses_path = data_download(download_path, moses_url) - moses_data = os.path.join(moses_path, moses_source) + fn = os.path.join(directory, '%s.tar.gz' % category) - filepath = data_download(download_path, source_url) - data_path = os.path.join(filepath, file_source) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + # already download. + return fn - return data_path + logger.info("Downloading binary sentiment classification dataset for %s category" % category) + return download(BASE_URL % category, fn) From c6a260068df61679a9f588744f821b3a7f476492 Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 20 Jan 2017 11:13:47 +0800 Subject: [PATCH 16/18] add Data md --- python/paddle/data/DATA.md | 34 +++++++++++++++++ python/paddle/data/cifar10.py | 68 ---------------------------------- python/paddle/data/semantic.py | 2 +- 3 files changed, 35 insertions(+), 69 deletions(-) create mode 100644 python/paddle/data/DATA.md delete mode 100644 python/paddle/data/cifar10.py diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md new file mode 100644 index 00000000000000..1752d294b9e879 --- /dev/null +++ b/python/paddle/data/DATA.md @@ -0,0 +1,34 @@ +### 数据集 + +Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载其数据,并进行复杂的预处理过程,整个过程会耗费大量时间。同时为了方便大家用Paddle做实验的时候,可以直接访问这些预处理好的数据,我们提供一套Python库。采用import数据源的方式(如:paddle.data.amazon_product_reviews)来简化获取训练所需数据的时间;但是如果你习惯自己处理原生数据,我们依然提供原生数据接口来满足你的需求。 + +## 接口设计 +数据集的导入通过import paddle.data.amazon_product_reviews 来实现,你可以直接通过load_data(category=None, +directory=None)获取你所需的数据集。考虑到类似Amazon的数据类型不止一种,通过category你可以选择控制所需要的数据源;如果你不指定数据源,默认为"Electronics"。directory用来指定下载路径,如果你不指定下载路径,默认为"~/paddle_data/amazon"。通过load_data()导入的数据源data为object,他是我们预处理的numpy格式数据,直接通过data.train_data()获取训练数据或者通过data.test_data()获取测试数据。你还可以打印训练数据和测试数据的数据信息, +```python + for each_train_data in data.train_data(): + print each_train_data +``` +即可。 + +具体的demo使用情况如下: +```python +import paddle.data.amazon_product_reviews as raw + +data = raw.load_data() +train_data = data.train_data() +test_data = data.test_data() +``` +你也可以打印出各数据集的数据信息: +```python +for each_train_data in data.train_data(): + print each_train_data +``` +打印出来的数据信息都是预处理之后的numpy格式的数据: +```python +(array([ 730143, 452087, 369164, 1128311, 1451292, 294749, 1370072, + 1202482, 1522860, 1055269, 39557, 1579, 1184187, 1410234, + 362445, 1133007, 1400596, 216811, 540527, 489771, 208467, + 369164, 311153, 387289, 801432, 433138, 179848, 320757, + 1410234], dtype=int32), True) +``` diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py deleted file mode 100644 index d6d893288a851c..00000000000000 --- 
a/python/paddle/data/cifar10.py +++ /dev/null @@ -1,68 +0,0 @@ -#/usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -######################################################################## -# -# Function for fetch the data untar directory for cifar10 training api. -# you can use this data for image classifation and gun traing. -# As the python can read the data in "cifar-10-python.tar.gz",herer is -# no need to untar the data. -# -# -# First,we let the data download path is "~/paddle_data_directory", -# when u no special the download path. -# -# -# Then,download the cifar10 dataset,and returns the data directory for -# training api. -# -######################################################################## - -import shutil -import os -import sys -import collections -import numpy as np -from six.moves import urllib -from http_download import data_download - -source_url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' -source_file = "cifar-10-batches-py" - - -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - - Args: - - Returns: - path to untar file. 
- """ - source_name = "cifar" - - if directory is None: - directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) - filepath = data_download(download_path, source_url) - - return filepath diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index bf04c044c6322e..a5fb39ccdbffe4 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -71,7 +71,7 @@ def fetch(category=None, directory=None): if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data', 'amazon')) + os.path.join('~', 'paddle_data', 'semantic')) if not os.path.exists(directory): os.makedirs(directory) From 3fa7f21a9a2eee7a79d589706b1faf1a3ced8630 Mon Sep 17 00:00:00 2001 From: baidu Date: Mon, 6 Feb 2017 20:11:07 +0800 Subject: [PATCH 17/18] new --- python/paddle/data/DATA.md | 64 +++--- python/paddle/data/amazon_product_reviews.py | 196 ++++++++++++------- 2 files changed, 159 insertions(+), 101 deletions(-) diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md index 1752d294b9e879..cf326406705fd9 100644 --- a/python/paddle/data/DATA.md +++ b/python/paddle/data/DATA.md @@ -5,30 +5,40 @@ Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载 ## 接口设计 数据集的导入通过import paddle.data.amazon_product_reviews 来实现,你可以直接通过load_data(category=None, directory=None)获取你所需的数据集。考虑到类似Amazon的数据类型不止一种,通过category你可以选择控制所需要的数据源;如果你不指定数据源,默认为"Electronics"。directory用来指定下载路径,如果你不指定下载路径,默认为"~/paddle_data/amazon"。通过load_data()导入的数据源data为object,他是我们预处理的numpy格式数据,直接通过data.train_data()获取训练数据或者通过data.test_data()获取测试数据。你还可以打印训练数据和测试数据的数据信息, -```python - for each_train_data in data.train_data(): - print each_train_data -``` -即可。 - -具体的demo使用情况如下: -```python -import paddle.data.amazon_product_reviews as raw - -data = raw.load_data() -train_data = data.train_data() -test_data = data.test_data() -``` -你也可以打印出各数据集的数据信息: -```python -for each_train_data in data.train_data(): - print each_train_data -``` -打印出来的数据信息都是预处理之后的numpy格式的数据: -```python -(array([ 730143, 452087, 369164, 1128311, 1451292, 294749, 1370072, - 1202482, 1522860, 1055269, 39557, 1579, 1184187, 1410234, - 362445, 1133007, 1400596, 216811, 540527, 489771, 208467, - 369164, 311153, 387289, 801432, 433138, 179848, 320757, - 1410234], dtype=int32), True) -``` + + ```python + for each_train_data in data.train_data(): + print each_train_data + ``` + 即可。 + + 具体的demo使用情况如下: + ```python + import paddle.data.amazon_product_reviews as raw + + raw.data(batch_size=10) + ``` + 你也可以打印出各数据集的数据信息: + 如果是测试集或者训练数据集,可以这么打印 + ```python + import paddle.data.amazon_product_reviews as raw + + raw.test_data(batch_size=10) + raw.train_data(batch_size=10) + + ``` + + 打印出来的数据信息都是预处理之后的numpy格式的数据: + ```python + (array([1370072, 884914, 1658622, 1562803, 1579, 369164, 1129091, + 1073545, 1410234, 857854, 672274, 884920, 1078270, 1410234, + 777903, 1352600, 497103, 132906, 239745, 65294, 1502324, + 1165610, 204273, 1610806, 942942, 709056, 452087, 118093, + 1410234], dtype=int32), array([ True], dtype=bool)) + (array([ 777903, 713632, 452087, 1647686, 877980, 294749, 1575945, + 662947, 1431519, 462950, 452087, 902916, 479242, 294749, + 1278816, 672274, 1579, 394865, 1129091, 1352600, 294749, + 1073545], dtype=int32), array([ True], dtype=bool)) + + ``` + diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index bce74cb1a8bd8e..ce6b52071d2fec 100644 --- 
a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -23,7 +23,6 @@ import os from http_download import download from logger import logger -from base import BaseDataSet import gzip import json import hashlib @@ -31,6 +30,8 @@ import collections import h5py import numpy +import random + BASE_URL = 'http://snap.stanford.edu/data/' \ 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' @@ -38,6 +39,10 @@ DATASET_LABEL = 'label' DATASET_SENTENCE = 'sentence' +positive_threshold = 5 +negative_threshold = 2 + + class Categories(object): Books = "Books" @@ -93,7 +98,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset', 'load_data'] def calculate_md5(fn): @@ -109,11 +114,9 @@ def fetch(category=None, directory=None): According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. - - Args: - - Returns: - path for the data untar. + :param category: + :param directory: + :return: """ if category is None: category = Categories.Electronics @@ -213,77 +216,122 @@ def preprocess(category=None, directory=None): return preprocess_fn -class AmazonProductReviewsDataSet(BaseDataSet): - def __init__(self, - category=None, - directory=None, - test_ratio=0.1, - positive_threshold=5, - negative_threshold=2, - random_seed=0): - super(AmazonProductReviewsDataSet, self).__init__( - random_seed=random_seed) - - fn = preprocess(category=category, directory=directory) - - self.__h5file__ = h5py.File(fn, 'r') - - self.__label__ = self.__h5file__[DATASET_LABEL] - self.__sentence__ = self.__h5file__[DATASET_SENTENCE] - - positive_idx = [] - negative_idx = [] - for i, lbl in enumerate(self.__label__): - if lbl >= positive_threshold: - positive_idx.append(i) - elif lbl <= negative_threshold: - negative_idx.append(i) - - positive_len = int(test_ratio * len(positive_idx)) - negative_len = int(test_ratio * len(negative_idx)) - - self.__train_set__ = positive_idx[positive_len:] + negative_idx[ - negative_len:] - self.__test_set__ = positive_idx[: - positive_len] + negative_idx[: - negative_len] - self.__test_set__.sort() - self.__positive_threshold__ = positive_threshold - self.__negative_threshold__ = negative_threshold - self.__is_reading_train_data__ = False - - def __read_data__(self, idx): - return self.__sentence__[ - idx], self.__label__ >= self.__positive_threshold__ - - def train_data(self): - if self.__is_reading_train_data__: - raise RuntimeError("Should not get multiple train_data generators") - - self.__is_reading_train_data__ = True - try: - self.__random__.shuffle(self.__train_set__) - for each_id in self.__train_set__: - yield self.__read_data__(each_id) - finally: - self.__is_reading_train_data__ = False +def data(batch_size, category=None, directory=None): + """ - def test_data(self): - for each_id in self.__test_set__: - yield self.__read_data__(each_id) + :param batch_size: + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Electronics - def __del__(self): - self.__h5file__.close() + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + fn = preprocess(category=category, directory=directory) + datasets = h5py.File(fn, 'r') -dataset = AmazonProductReviewsDataSet + label = datasets[DATASET_LABEL] + sentence = 
datasets[DATASET_SENTENCE] -if __name__ == '__main__': - ds = dataset(category=Categories.AmazonInstantVideo) + if label.shape[0] <= batch_size: + lens = label.shape[0] + else: + lens = batch_size + + for index in range(lens): + if label[index] >= positive_threshold: + print (numpy.array(sentence[index]), label[index] >= positive_threshold) + elif label[index] <= negative_threshold: + print (numpy.array(sentence[index]), label[index] <= negative_threshold) + + +def test_data(batch_size, category=None, directory=None): + """ + + :param batch_size: + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) - for each_train_data in ds.train_data(): - # print each_train_data - pass + fn = preprocess(category=category, directory=directory) + datasets = h5py.File(fn, 'r') + + label = datasets[DATASET_LABEL] + sentence = datasets[DATASET_SENTENCE] + + if label.shape[0] <= batch_size: + lens = label.shape[0] + else: + lens = batch_size + + positive_idx = [] + negative_idx = [] + for i, lbl in enumerate(label): + if label[i] >= positive_threshold: + positive_idx.append(i) + elif lbl <= negative_threshold: + negative_idx.append(i) + + __test_set__ = positive_idx[:lens] + negative_idx[:lens] + + random.shuffle(__test_set__) + + for index in range(lens): + print (numpy.array(sentence[index]), label[index] >= positive_threshold) + + +def train_data(batch_size, category=None, directory=None): + """ + + :param batch_size: + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + + fn = preprocess(category=category, directory=directory) + datasets = h5py.File(fn, 'r') + + label = datasets[DATASET_LABEL] + sentence = datasets[DATASET_SENTENCE] + + if label.shape[0] <= batch_size: + lens = label.shape[0] + else: + lens = batch_size + + positive_idx = [] + negative_idx = [] + for i, lbl in enumerate(label): + if label[i] >= positive_threshold: + positive_idx.append(i) + elif lbl <= negative_threshold: + negative_idx.append(i) + __train_set__ = positive_idx[lens:] + negative_idx[lens:] + + random.shuffle(__train_set__) + + for index in range(lens): + print (numpy.array(sentence[index]), label[index] >= positive_threshold) + + +if __name__ == '__main__': + data(10) - for each_test_data in ds.test_data(): - pass From bee88c9f24c1f738d676a3dba58ceb4fdcf672fc Mon Sep 17 00:00:00 2001 From: baidu Date: Thu, 9 Feb 2017 15:42:56 +0800 Subject: [PATCH 18/18] updata amazon & cifar & mnist --- python/paddle/data/amazon_product_reviews.py | 4 +- python/paddle/data/cifar.py | 101 +++++++++++- python/paddle/data/mnist.py | 153 +++++++++++++------ 3 files changed, 203 insertions(+), 55 deletions(-) diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index ce6b52071d2fec..fba9db6ee9f588 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -98,7 +98,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories', 'preprocess', 'dataset', 'load_data'] +__all__ = ['fetch', 'data', 'train_data', 'test_data'] def calculate_md5(fn): @@ -112,7 +112,7 @@ def calculate_md5(fn): def fetch(category=None, directory=None): """ According to the 
source name,set the download path for source, - download the data from the source url,and return the download path to fetch + download the data from the source url, and return the download path to fetch for training api. :param category: :param directory: diff --git a/python/paddle/data/cifar.py b/python/paddle/data/cifar.py index e038ebd76b0fd4..8baf073709e3ef 100644 --- a/python/paddle/data/cifar.py +++ b/python/paddle/data/cifar.py @@ -22,23 +22,25 @@ """ import os +import cPickle from http_download import download from logger import logger import hashlib +import tarfile +import numpy BASE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-%s-python.tar.gz' - +DATA = "cifar-10-batches-py" class Categories(object): - Ten = 10 - Hundred = 100 + Ten = '10' + Hundred = '100' __md5__ = dict() - __md5__[Ten] = 'c58f30108f718f92721af3b95e74349a' __md5__[Hundred] = 'eb9058c3a382ffc7106e4002c42a8d85' -__all__ = ['fetch', 'Categories'] +__all__ = ['fetch', 'Categories', 'train_data', 'test_data'] def calculate_md5(fn): @@ -59,9 +61,11 @@ def fetch(category=None, directory=None): Returns: path to untar file. """ + if category is None: + category = Categories.Ten if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory', 'cifar')) + os.path.join('~', 'paddle_data', 'cifar')) if not os.path.exists(directory): os.makedirs(directory) @@ -69,9 +73,92 @@ def fetch(category=None, directory=None): cn = 'cifar' + category fn = os.path.join(directory, '%s.tar.gz' % cn) - if os.path.exists(fn) and calculate_md5(fn) == Categories.__md5__[category]: + if os.path.exists(fn) and calculate_md5(fn) == \ + Categories.__md5__[category]: return fn logger.info("Downloading cifar dataset for %s category" % cn) return download(BASE_URL % category, os.path.join(directory, '%s.tar.gz' % cn)) + + +def untar(category=None, directory=None): + """ + + :param category: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'cifar')) + raw_file_fn = fetch(category, directory) + #raw_file_fn = os.path.join(directory, 'cifar10.tar.gz') + tar = tarfile.open(raw_file_fn, "r:gz") + names = tar.getnames() + for file in names: + tar.extract(file, directory) + tar.close() + + +def create_mean(dataset, directory=None): + """ + + :param dataset: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'cifar')) + + if not os.path.isfile("mean.meta"): + mean = numpy.zeros(3 * 32 * 3) + num = 0 + for f in dataset: + batch = numpy.load(f) + mean += batch['data'].sum(0) + num += len(batch['data']) + mean /= num + print mean.size + data = {"mean": mean, "size": mean.size} + cPickle.dump( + data, open("mean.meta", 'w'), protocol=cPickle.HIGHEST_PROTOCOL) + + +def train_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'cifar')) + + untar() + datatset = [DATA + "/data_batch_%d" % (i + 1) for i in xrange(0, 5)] + for f in datatset: + train_set = os.path.join(directory, f) + fo = open(train_set, 'rb') + dict = cPickle.load(fo) + fo.close() + print dict + + +def test_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'cifar')) + untar() + test_set = os.path.join(directory, DATA + "/test_batch") + fo = open(test_set, 'rb') + dict = cPickle.load(fo) + fo.close() + 
print dict + + +if __name__ == '__main__': + train_data() + #test_data() \ No newline at end of file diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 23a7ed46b633e8..8308ae16eb9e2e 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -23,35 +23,22 @@ import os from http_download import download from logger import logger -from base import BaseDataSet import gzip -import json import hashlib -import nltk -import collections -import h5py import numpy +import struct -BASE_URL = 'http://yann.lecun.com/exdb/mnist/%s-ubyte.gz' +BASE_URL = 'http://yann.lecun.com/exdb/mnist/%s.gz' +FILE_NAME = { + 'train-images-idx3-ubyte': 'f68b3c2dcbeaaa9fbdd348bbdeb94873', + 'train-labels-idx1-ubyte': 'd53e105ee54ea40749a09fcbcd1e9432', + 't10k-images-idx3-ubyte': '9fb629c4189551a2d022fa330f9573f3', + 't10k-labels-idx1-ubyte': 'ec29112dd5afa0611ce80d1b7f02629c' +} -class Categories(object): - TrainImage = 'train-images-idx3' - TrainLabels = 'train-labels-idx1' - TestImage = 't10k-images-idx3' - TestLabels = 't10k-labels-idx1' - All = [TrainImage, TrainLabels, TestImage, TestLabels] - - __md5__ = dict() - - __md5__[TrainImage] = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' - __md5__[TrainLabels] = 'd53e105ee54ea40749a09fcbcd1e9432' - __md5__[TestImage] = '9fb629c4189551a2d022fa330f9573f3' - __md5__[TestLabels] = 'ec29112dd5afa0611ce80d1b7f02629c' - - -__all__ = ['fetch', 'Categories'] +__all__ = ['train_data', 'test_data', 'fetch'] def calculate_md5(fn): @@ -62,28 +49,11 @@ def calculate_md5(fn): return h.hexdigest() -def fetch_data(category=None, directory=None): - """ - Calculate each md5 value. - :param category: - :param directory: - :return: - """ - cn = category + '-ubyte' - fn = os.path.join(directory, '%s.gz' % cn) - if os.path.exists(fn) and \ - calculate_md5(fn) == Categories.__md5__[category]: - return fn - logger.info("Downloading mnist handwritten digit dataset for %s category" % cn) - return download(BASE_URL % category, fn) - - -def fetch(category=None, directory=None): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. 
- :param category: :param directory: :return: """ @@ -94,9 +64,100 @@ def fetch(category=None, directory=None): if not os.path.exists(directory): os.makedirs(directory) - if category is None: - category = [category for category in Categories.All] - fl = [] # download file list - for index, line in range(len(category)): - fl.append(fetch_data(line, directory)) - return fl + fl = [] + for index in range(len(FILE_NAME.keys())): + fn = os.path.join(directory, '%s.gz' % FILE_NAME.keys()[index]) + if os.path.exists(fn) and calculate_md5(fn) == FILE_NAME.keys()[0]: + return fn + logger.info("Downloading digital handwritten digit dataset for %s " % FILE_NAME.keys()[index]) + fl.append(download(BASE_URL % FILE_NAME.keys()[index], fn)) + + return fl + + +def preprocess(directory=None): + """ + :param category: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + raw_file_list = fetch(directory) + print raw_file_list + + for cn in raw_file_list: + sz = cn.split('.')[0] + print sz + g = gzip.GzipFile(fileobj=open(cn, 'rb')) + open(sz, 'wb').write(g.read()) + + +def data(filename, directory=None): + """ + :param filename: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + image = '-images-idx3-ubyte' + label = '-labels-idx1-ubyte' + + if filename is 'train': + image_file = os.path.join(directory, filename + image) + label_file = os.path.join(directory, filename + label) + else: + image_file = os.path.join(directory, 't10' + image) + label_file = os.path.join(directory, 't10' + label) + + if os.path.exists(image_file) and os.path.exists(label_file): + print "File is exists!" + else: + preprocess() + + print image_file + print label_file + + with open(image_file, "rb") as f: + num_magic, n, num_row, num_col = struct.unpack(">IIII", f.read(16)) + images = numpy.fromfile(f, 'ubyte', count=n * num_row * num_col).\ + reshape(n, num_row, num_col).astype('float32') + images = images / 255.0 * 2.0 - 1.0 + + with open(label_file, "rb") as fn: + num_magic, num_label = struct.unpack(">II", fn.read(8)) + labels = numpy.fromfile(fn, 'ubyte', count=num_label).astype('int') + + return images, labels + + +def train_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + train_images, train_labels = data('train') + print train_images, train_labels + + +def test_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + test_images, test_labels = data('test') + print test_images, test_labels + + +if __name__ == '__main__': + train_data() + #test_data() \ No newline at end of file
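
A hedged usage sketch of the readers added in this last patch, assuming the paddle.data package is installed as declared in setup.py (both calls download into ~/paddle_data/<dataset> on first use and, as currently implemented, simply print the decoded arrays):

    from paddle.data import cifar, mnist

    # Fetches the CIFAR-10 python archive (md5-checked), untars it under
    # ~/paddle_data/cifar and prints each unpickled data_batch_<n> dict.
    cifar.train_data()

    # Downloads the four MNIST archives, gunzips them and prints the
    # [-1, 1]-normalized image array together with the label vector.
    mnist.train_data()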