From a4044bb7bc5ba2d02bed6b18197433a0a257b4f3 Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 16:28:27 +0800 Subject: [PATCH 01/18] add cifar --- python/paddle/data/__init__.py | 0 python/paddle/data/cifar_10.py | 100 +++++++++++++++++++++++++++++++++ python/setup.py.in | 1 + 3 files changed, 101 insertions(+) create mode 100644 python/paddle/data/__init__.py create mode 100644 python/paddle/data/cifar_10.py diff --git a/python/paddle/data/__init__.py b/python/paddle/data/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py new file mode 100644 index 00000000000000..2c5d40810b40e8 --- /dev/null +++ b/python/paddle/data/cifar_10.py @@ -0,0 +1,100 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import tarfile +import zipfile +import collections +import numpy as np +from six.moves import urllib + +source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' +source_file = "cifar-10-batches-py" +label_map = { +0: "airplane", +1: "automobile", +2: "bird", +3: "cat", +4: "deer", +5: "dog", +6: "frog", +7: "horse", +8: "ship", +9: "truck" +} + +def fetch(): + num_images_train = 50000 + num_batch = 5 + source_name = "cifar" + file_source = "cifar-10-batches-py" + #Set the download dir for cifar. 
+ data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + +def set_data_path(source_name): + data_base = os.path.expanduser(os.path.join('~','.paddle')) + print data_base + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + +def data_download(download_dir,source_url): + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir,src_file) + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name,src_file) + move_files(src_file,download_dir) + print("Download finished,Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Data has been already downloaded and unpacked!") + return download_dir + +def move_files(source_dire,target_dire): + shutil.move(source_dire,target_dire) + +def download_with_urlretrieve(url, filename=None): + return urllib.request.urlretrieve(url, filename) + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/setup.py.in b/python/setup.py.in index b66a42e87c7870..5b25b3ab350903 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,6 +1,7 @@ from setuptools import setup packages=['paddle', + 'paddle.data', 'paddle.proto', 'paddle.trainer', 'paddle.trainer_config_helpers', From 22a8d068d8a613273dbefcf2e67c378d7bdf7ff3 Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 16:48:56 +0800 Subject: [PATCH 02/18] update cifar --- python/paddle/data/cifar_10.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py index 2c5d40810b40e8..02608c79911e30 100644 --- a/python/paddle/data/cifar_10.py +++ b/python/paddle/data/cifar_10.py @@ -47,7 +47,7 @@ def fetch(): file_source = "cifar-10-batches-py" #Set the download dir for cifar. 
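The `_unpickle` helper added above calls `cPickle` without importing it, and the commented-out loop in `fetch()` never reads the extracted batches. A minimal sketch of how one downloaded batch could be loaded, assuming the import is added and the archive was unpacked into `~/.paddle/cifar` as `set_data_path()` and the tarfile call above would do; the reshape to `(N, 3, 32, 32)` is the standard CIFAR-10 layout, not something this patch performs.

```python
# Sketch: read one CIFAR-10 batch produced by the download above.
# Assumes the tar was unpacked into ~/.paddle/cifar as set_data_path() does.
import os
import numpy as np
from six.moves import cPickle  # the patch uses cPickle but never imports it

def load_batch(batch_path):
    """Return (images, labels) from a single data_batch_* file."""
    with open(batch_path, 'rb') as f:
        batch = cPickle.load(f)  # Python 2; on Python 3 pass encoding='bytes'
    images = np.asarray(batch['data'], dtype=np.float32)
    images = images.reshape(-1, 3, 32, 32)   # channel, height, width
    labels = np.asarray(batch['labels'], dtype=np.int32)
    return images, labels

data_dir = os.path.expanduser('~/.paddle/cifar/cifar-10-batches-py')
images, labels = load_batch(os.path.join(data_dir, 'data_batch_1'))
print(images.shape, labels[:10])
```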
data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) + filepath = data_download(data_home, source_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) From 7192a6bce9a358876ef208fee8289149cdecccaf Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 16:53:17 +0800 Subject: [PATCH 03/18] update cifar --- python/paddle/data/cifar_10.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py index 02608c79911e30..762d4b2d40ca52 100644 --- a/python/paddle/data/cifar_10.py +++ b/python/paddle/data/cifar_10.py @@ -62,7 +62,7 @@ def _unpickle(file_path): return data def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~','.paddle')) + data_base = os.path.expanduser(os.path.join('~', '.paddle')) print data_base if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') @@ -72,14 +72,14 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir,source_url): +def data_download(download_dir, source_url): src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir,src_file) + file_path = os.path.join(download_dir, src_file) if not os.path.exists(file_path): temp_file_name,_ = download_with_urlretrieve(source_url) temp_file_path = os.getcwd() - os.rename(temp_file_name,src_file) - move_files(src_file,download_dir) + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) print("Download finished,Extracting files.") tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) print("Unpacking done!") @@ -88,8 +88,8 @@ def data_download(download_dir,source_url): print("Data has been already downloaded and unpacked!") return download_dir -def move_files(source_dire,target_dire): - shutil.move(source_dire,target_dire) +def move_files(source_dire, target_dire): + shutil.move(source_dire, target_dire) def download_with_urlretrieve(url, filename=None): return urllib.request.urlretrieve(url, filename) From 0913bbc83faf9a675548558545333018e6469e23 Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 21:57:26 +0800 Subject: [PATCH 04/18] add mnist and amazon --- python/paddle/data/amazon.py | 103 +++++++++++++++++++++++++++++++++++ python/paddle/data/mnist.py | 83 ++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 python/paddle/data/amazon.py create mode 100644 python/paddle/data/mnist.py diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py new file mode 100644 index 00000000000000..c3c4cde65abc5d --- /dev/null +++ b/python/paddle/data/amazon.py @@ -0,0 +1,103 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' +moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +file_source = "mosesdecoder-master" +def fetch(): + source_name = "amazon" + #file_source = "mosesdecoder-master" + #Set the download dir for cifar. + data_home = set_data_path(source_name) + #filepath = data_download(data_home,moses_url) + filepath = data_download(data_home,source_url) + filepath = data_download(data_home,moses_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + +def set_data_path(source_name): + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + +def data_download(download_dir,source_url): + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path,'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + print("Unpacking done!") + else: + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path,'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + +def move_files(source_dire,target_dire): + shutil.move(source_dire,target_dire) + +def download_with_urlretrieve(url, filename=None): + return urllib.request.urlretrieve(url, filename) + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py new file mode 100644 index 00000000000000..7e04440f3765b4 --- /dev/null +++ b/python/paddle/data/mnist.py @@ -0,0 +1,83 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
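The Amazon loader above only downloads `reviews_Electronics_5.json.gz` and leaves it compressed. A hedged sketch of streaming that file in place follows; the field names (`overall`, `reviewText`) are those documented for the SNAP review dumps, and the directory is an assumption, since `set_data_path` above actually joins `'~'` with `' .paddle'` (note the leading space), so the real location may differ.

```python
# Sketch: stream the gzipped Amazon reviews file without unpacking it.
# The dump is one JSON object per line, so gzip.open + json.loads suffices.
import gzip
import json
import os

reviews_path = os.path.expanduser(
    '~/.paddle/amazon/reviews_Electronics_5.json.gz')  # assumed download location

def iter_reviews(path, limit=None):
    """Yield review dicts, optionally stopping after `limit` records."""
    with gzip.open(path, 'rb') as f:
        for i, line in enumerate(f):
            if limit is not None and i >= limit:
                break
            yield json.loads(line)

for review in iter_reviews(reviews_path, limit=3):
    print(review['overall'], review['reviewText'][:60])
```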
+ +import shutil +import os +import sys +import collections +import numpy as np +from six.moves import urllib +import urlparse +import gzip + +source_url = 'http://yann.lecun.com/exdb/mnist/' +filename = ['train-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-images-idx3-ubyte.gz','t10k-labels-idx1-ubyte.gz'] + +def fetch(): + source_name = "mnist" + file_source = "cifar-10-batches-py" + #Set the download dir for cifar. + data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + +def set_data_path(source_name): + data_base = os.path.expanduser(os.path.join('~','.paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + +def data_download(download_dir,source_url): + for file in filename: + data_url = urlparse.urljoin(source_url,file) + file_path = os.path.join(download_dir,file) + untar_path = os.path.join(download_dir,file.replace(".gz","")) + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(data_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name,file) + move_files(file,download_dir) + print("Download finished,Extracting files.") + g_file = gzip.GzipFile(file_path) + open(untar_path,'w+').write(g_file.read()) + g_file.close() + print("Unpacking done!") + else: + g_file = gzip.GzipFile(file_path) + open(untar_path,'w+').write(g_file.read()) + g_file.close() + print("Data has been already downloaded and unpacked!") + os.remove(file_path) + return download_dir + +def move_files(source_dire,target_dire): + shutil.move(source_dire,target_dire) + +def download_with_urlretrieve(url, filename=None): + return urllib.request.urlretrieve(url, filename) + + +if __name__ == '__main__': + path = fetch() + print path From a4ed79877fa614c61978a5d11184afcaa22b166a Mon Sep 17 00:00:00 2001 From: qibin Date: Tue, 10 Jan 2017 22:00:18 +0800 Subject: [PATCH 05/18] update amazon and mnist --- python/paddle/data/amazon.py | 10 +++++----- python/paddle/data/mnist.py | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index c3c4cde65abc5d..7e4985b4cc1415 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -34,8 +34,8 @@ def fetch(): #Set the download dir for cifar. 
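The mnist.py module above gunzips the four idx-ubyte files but never parses them (numpy is imported yet unused). A sketch of how the extracted files might be read, assuming the standard IDX layout (big-endian int32 header words, then uint8 data); the data directory is again an assumption based on `set_data_path`.

```python
# Sketch: parse the gunzipped MNIST idx-ubyte files produced above.
# IDX layout: big-endian int32 header (magic, count[, rows, cols]), then uint8 data.
import os
import struct
import numpy as np

def read_idx_images(path):
    with open(path, 'rb') as f:
        magic, count, rows, cols = struct.unpack('>IIII', f.read(16))
        assert magic == 2051, 'not an IDX image file'
        data = np.frombuffer(f.read(), dtype=np.uint8)
    return data.reshape(count, rows, cols)

def read_idx_labels(path):
    with open(path, 'rb') as f:
        magic, count = struct.unpack('>II', f.read(8))
        assert magic == 2049, 'not an IDX label file'
        return np.frombuffer(f.read(), dtype=np.uint8)

data_dir = os.path.expanduser('~/.paddle/mnist')  # assumed from set_data_path()
images = read_idx_images(os.path.join(data_dir, 'train-images-idx3-ubyte'))
labels = read_idx_labels(os.path.join(data_dir, 'train-labels-idx1-ubyte'))
print(images.shape, labels[:10])
```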
data_home = set_data_path(source_name) #filepath = data_download(data_home,moses_url) - filepath = data_download(data_home,source_url) - filepath = data_download(data_home,moses_url) + filepath = data_download(data_home, source_url) + filepath = data_download(data_home, moses_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) @@ -59,7 +59,7 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir,source_url): +def data_download(download_dir, source_url): src_file = source_url.strip().split('/')[-1] file_path = os.path.join(download_dir, src_file) @@ -91,8 +91,8 @@ def data_download(download_dir,source_url): print("Data has been already downloaded and unpacked!") return download_dir -def move_files(source_dire,target_dire): - shutil.move(source_dire,target_dire) +def move_files(source_dire, target_dire): + shutil.move(source_dire, target_dire) def download_with_urlretrieve(url, filename=None): return urllib.request.urlretrieve(url, filename) diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 7e04440f3765b4..959875889c2322 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -32,14 +32,14 @@ def fetch(): file_source = "cifar-10-batches-py" #Set the download dir for cifar. data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) + filepath = data_download(data_home, source_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) """ def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~','.paddle')) + data_base = os.path.expanduser(os.path.join('~', '.paddle')) if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') datadir = os.path.join(data_base, source_name) @@ -48,16 +48,16 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir,source_url): +def data_download(download_dir, source_url): for file in filename: - data_url = urlparse.urljoin(source_url,file) - file_path = os.path.join(download_dir,file) - untar_path = os.path.join(download_dir,file.replace(".gz","")) + data_url = urlparse.urljoin(source_url, file) + file_path = os.path.join(download_dir, file) + untar_path = os.path.join(download_dir, file.replace(".gz", "")) if not os.path.exists(file_path): temp_file_name,_ = download_with_urlretrieve(data_url) temp_file_path = os.getcwd() - os.rename(temp_file_name,file) - move_files(file,download_dir) + os.rename(temp_file_name, file) + move_files(file, download_dir) print("Download finished,Extracting files.") g_file = gzip.GzipFile(file_path) open(untar_path,'w+').write(g_file.read()) @@ -65,14 +65,14 @@ def data_download(download_dir,source_url): print("Unpacking done!") else: g_file = gzip.GzipFile(file_path) - open(untar_path,'w+').write(g_file.read()) + open(untar_path, 'w+').write(g_file.read()) g_file.close() print("Data has been already downloaded and unpacked!") os.remove(file_path) return download_dir -def move_files(source_dire,target_dire): - shutil.move(source_dire,target_dire) +def move_files(source_dire, target_dire): + shutil.move(source_dire, target_dire) def download_with_urlretrieve(url, filename=None): return urllib.request.urlretrieve(url, filename) From ce0f5b0db14cf5e08d59a4732f577018a7d7a52e Mon Sep 17 00:00:00 2001 From: qibin Date: Wed, 11 Jan 2017 21:32:38 +0800 Subject: [PATCH 06/18] add other data --- python/paddle/data/__init__.py | 5 + 
python/paddle/data/amazon.py | 84 +++++++++++-- python/paddle/data/cifar10.py | 172 ++++++++++++++++++++++++++ python/paddle/data/mnist.py | 102 ++++++++++++--- python/paddle/data/recommendation.py | 168 +++++++++++++++++++++++++ python/paddle/data/semantic.py | 164 +++++++++++++++++++++++++ python/paddle/data/sentiment.py | 177 +++++++++++++++++++++++++++ python/paddle/data/seqToseq.py | 162 ++++++++++++++++++++++++ 8 files changed, 1008 insertions(+), 26 deletions(-) create mode 100644 python/paddle/data/cifar10.py create mode 100644 python/paddle/data/recommendation.py create mode 100644 python/paddle/data/semantic.py create mode 100644 python/paddle/data/sentiment.py create mode 100644 python/paddle/data/seqToseq.py diff --git a/python/paddle/data/__init__.py b/python/paddle/data/__init__.py index e69de29bb2d1d6..970e56f072fdf9 100644 --- a/python/paddle/data/__init__.py +++ b/python/paddle/data/__init__.py @@ -0,0 +1,5 @@ +""" +The :mod:`paddle.datasets` module includes utilities to load datasets, +including methods to load and fetch popular reference datasets. It also +features some artificial data generators. +""" diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index 7e4985b4cc1415..54e90e83e8be63 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -28,18 +28,28 @@ source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' file_source = "mosesdecoder-master" + + def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ source_name = "amazon" - #file_source = "mosesdecoder-master" - #Set the download dir for cifar. data_home = set_data_path(source_name) - #filepath = data_download(data_home,moses_url) - filepath = data_download(data_home, source_url) - filepath = data_download(data_home, moses_url) + filepath = data_download(data_home,source_url) + filepath = data_download(data_home,moses_url) """ for i in range(1, num_batch + 1): fpath = os.path.join(filepath, "data_batch_%d" % i) """ + return filepath + def _unpickle(file_path): with open(file_path, mode='rb') as file: @@ -49,7 +59,17 @@ def _unpickle(file_path): data = cPickle.load(file, encoding='bytes') return data + def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ data_base = os.path.expanduser(os.path.join('~',' .paddle')) if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') @@ -59,7 +79,19 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir, source_url): + +def data_download(download_dir,source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
+ """ src_file = source_url.strip().split('/')[-1] file_path = os.path.join(download_dir, src_file) @@ -91,11 +123,49 @@ def data_download(download_dir, source_url): print("Data has been already downloaded and unpacked!") return download_dir + def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ shutil.move(source_dire, target_dire) + def download_with_urlretrieve(url, filename=None): - return urllib.request.urlretrieve(url, filename) + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() if __name__ == '__main__': diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py new file mode 100644 index 00000000000000..1d461ba4466a49 --- /dev/null +++ b/python/paddle/data/cifar10.py @@ -0,0 +1,172 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import tarfile +import zipfile +import collections +import numpy as np +from six.moves import urllib + +source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' +source_file = "cifar-10-batches-py" +label_map = { +0: "airplane", +1: "automobile", +2: "bird", +3: "cat", +4: "deer", +5: "dog", +6: "frog", +7: "horse", +8: "ship", +9: "truck" +} + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + num_images_train = 50000 + num_batch = 5 + source_name = "cifar" + file_source = "cifar-10-batches-py" + #Set the download dir for cifar. + data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. 
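`download_with_urlretrieve` above passes `check_download_progress` as urlretrieve's `reporthook`. A small self-contained sketch of that pattern follows, with a guard for servers that omit Content-Length: urlretrieve then reports `total_size <= 0`, which would make the percentage computed in the patch zero or negative.

```python
# Sketch: the reporthook pattern used by download_with_urlretrieve above.
# urlretrieve calls the hook as hook(block_count, block_size, total_size).
import sys
from six.moves import urllib

def report_progress(count, block_size, total_size):
    if total_size <= 0:
        # Content-Length missing: fall back to a plain byte counter.
        sys.stdout.write("\r- Downloaded %d bytes" % (count * block_size))
    else:
        percent = min(float(count * block_size) / total_size, 1.0)
        sys.stdout.write("\r- Download progress: {:.1%}".format(percent))
    sys.stdout.flush()

# Usage sketch (URL taken from the cifar module above):
# urllib.request.urlretrieve(
#     'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
#     'cifar-10-python.tar.gz', reporthook=report_progress)
```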
+ """ + data_base = os.path.expanduser(os.path.join('~','.paddle')) + print data_base + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file,download_dir) + print("Download finished, Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + + def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 959875889c2322..ac16cf1919350b 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -25,21 +25,35 @@ import gzip source_url = 'http://yann.lecun.com/exdb/mnist/' -filename = ['train-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-images-idx3-ubyte.gz','t10k-labels-idx1-ubyte.gz'] +filename = ['train-images-idx3-ubyte.gz','t10k-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-labels-idx1-ubyte.gz'] def fetch(): - source_name = "mnist" - file_source = "cifar-10-batches-py" - #Set the download dir for cifar. - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. """ + source_name = "mnist" + data_home = set_data_path(source_name) + filepath = data_download(data_home,source_url) + return filepath + def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~', '.paddle')) + """ + Set the path for download according to the source name. 
+ + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~','.paddle')) if not os.access(data_base, os.W_OK): data_base = os.path.join('/tmp', '.paddle') datadir = os.path.join(data_base, source_name) @@ -48,16 +62,28 @@ def set_data_path(source_name): os.makedirs(datadir) return datadir -def data_download(download_dir, source_url): + +def data_download(download_dir,source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ for file in filename: - data_url = urlparse.urljoin(source_url, file) - file_path = os.path.join(download_dir, file) - untar_path = os.path.join(download_dir, file.replace(".gz", "")) + data_url = urlparse.urljoin(source_url,file) + file_path = os.path.join(download_dir,file) + untar_path = os.path.join(download_dir,file.replace(".gz","")) if not os.path.exists(file_path): temp_file_name,_ = download_with_urlretrieve(data_url) temp_file_path = os.getcwd() - os.rename(temp_file_name, file) - move_files(file, download_dir) + os.rename(temp_file_name,file) + move_files(file,download_dir) print("Download finished,Extracting files.") g_file = gzip.GzipFile(file_path) open(untar_path,'w+').write(g_file.read()) @@ -65,17 +91,55 @@ def data_download(download_dir, source_url): print("Unpacking done!") else: g_file = gzip.GzipFile(file_path) - open(untar_path, 'w+').write(g_file.read()) + open(untar_path,'w+').write(g_file.read()) g_file.close() print("Data has been already downloaded and unpacked!") os.remove(file_path) return download_dir -def move_files(source_dire, target_dire): - shutil.move(source_dire, target_dire) + +def move_files(source_dire,target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire,target_dire) + def download_with_urlretrieve(url, filename=None): - return urllib.request.urlretrieve(url, filename) + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() if __name__ == '__main__': diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py new file mode 100644 index 00000000000000..1e93b6dc161224 --- /dev/null +++ b/python/paddle/data/recommendation.py @@ -0,0 +1,168 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + + +source_url='http://files.grouplens.org/datasets/movielens/ml-1m.zip' +file_source = "mosesdecoder-master" + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + source_name = "recommendation" + #Set the download dir for recommendation. + data_home = set_data_path(source_name) + filepath = data_download(data_home, source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.remove(file_path) + print("Unpacking done!") + else: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. 
+ + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py new file mode 100644 index 00000000000000..087e6e6640fea3 --- /dev/null +++ b/python/paddle/data/semantic.py @@ -0,0 +1,164 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url=['http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' + ] + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + source_name = "semantic" + data_home = set_data_path(source_name) + model_path = data_download(data_home, model_url) + for url in source_url: + filepath = data_download(data_home, moses_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
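The recommendation.py module above unpacks the MovieLens `ml-1m.zip` archive and stops there. A hedged sketch of reading the extracted `ratings.dat` (the `::`-separated format documented by GroupLens); the directory is an assumption, since its `set_data_path` also joins `'~'` with `' .paddle'` (leading space).

```python
# Sketch: read ratings.dat from the ml-1m folder unpacked above.
# MovieLens 1M uses '::' separated fields: UserID::MovieID::Rating::Timestamp.
import os

def iter_ratings(data_dir):
    ratings_path = os.path.join(data_dir, 'ml-1m', 'ratings.dat')
    with open(ratings_path) as f:
        for line in f:
            user_id, movie_id, rating, timestamp = line.strip().split('::')
            yield int(user_id), int(movie_id), float(rating), int(timestamp)

data_dir = os.path.expanduser('~/.paddle/recommendation')  # assumed location
for i, record in enumerate(iter_ratings(data_dir)):
    print(record)
    if i == 2:
        break
```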
+ """ + src_file = url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py new file mode 100644 index 00000000000000..18146ba93803a7 --- /dev/null +++ b/python/paddle/data/sentiment.py @@ -0,0 +1,177 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url='http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +file_source = "mosesdecoder-master" + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. 
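The semantic.py `fetch()` above loops over several source URLs, but the loop body downloads `moses_url` rather than the loop variable, and its `data_download` builds `src_file` from an undefined `url` while the parameter is named `source_url`. A corrected sketch of the intended multi-URL loop, written as a standalone helper; `downloader` is a hypothetical callable standing in for the download routine above.

```python
# Sketch of the multi-URL fetch intended by semantic.py above: download each
# source in turn and only untar the ones that are actually tar.gz archives.
import os
import tarfile

def fetch_all(urls, download_dir, downloader):
    """`downloader(url, dest_path)` is assumed to fetch a single file."""
    paths = []
    for url in urls:                       # iterate the loop variable, not a fixed URL
        filename = url.strip().split('/')[-1]
        dest = os.path.join(download_dir, filename)
        if not os.path.exists(dest):
            downloader(url, dest)
        if filename.endswith('.tar.gz') or filename.endswith('.tgz'):
            tarfile.open(dest, 'r:gz').extractall(download_dir)
        paths.append(dest)
    return paths

# Usage sketch:
# from six.moves import urllib
# fetch_all(source_url, '/tmp/.paddle/semantic',
#           lambda u, dest: urllib.request.urlretrieve(u, dest))
```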
+ """ + source_name = "sentiment" + data_home = set_data_path(source_name) + filepath = data_download(data_home, source_url) + filepath = data_download(data_home, moses_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. + """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + os.chmod(fpath, stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, rereporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. 
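sentiment.py's `data_download` above branches on whether the downloaded file is a zip (`mosesdecoder-master.zip`) or a tar.gz (`aclImdb_v1.tar.gz`), but the tar branch relies on `tarfile` without importing it in that module. A compact sketch of the same dispatch as a standalone helper, assuming only the two archive types that appear in these modules.

```python
# Sketch: unpack either a .zip or a .tar.gz archive, mirroring the two
# branches of data_download above (mosesdecoder zip vs aclImdb tarball).
import os
import stat
import tarfile
import zipfile

def extract_archive(file_path, target_dir):
    if file_path.endswith('.zip'):
        archive = zipfile.ZipFile(file_path, 'r')
        for info in archive.infolist():
            archive.extract(info, target_dir)
            extracted = os.path.join(target_dir, info.filename)
            # zipfile does not preserve permission bits; make entries readable.
            os.chmod(extracted, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH)
    elif file_path.endswith(('.tar.gz', '.tgz')):
        tarfile.open(name=file_path, mode='r:gz').extractall(target_dir)
    else:
        raise ValueError('unsupported archive type: %s' % file_path)
```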
+ + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py new file mode 100644 index 00000000000000..a9bdd0bf73cd2c --- /dev/null +++ b/python/paddle/data/seqToseq.py @@ -0,0 +1,162 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import shutil +import os +import sys +import zipfile +import collections +import numpy as np +from six.moves import urllib +import stat + +source_url=['http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', + 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' + ] +model_url='http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' + + +def fetch(): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to downloaded file. + """ + source_name = "seqToseq" + data_home = set_data_path(source_name) + model_path = data_download(data_home, model_url) + for url in source_url: + filepath = data_download(data_home, source_url) + """ + for i in range(1, num_batch + 1): + fpath = os.path.join(filepath, "data_batch_%d" % i) + """ + return filepath + + +def _unpickle(file_path): + with open(file_path, mode='rb') as file: + if sys.version_info < (3,): + data = cPickle.load(file) + else: + data = cPickle.load(file, encoding='bytes') + return data + + +def set_data_path(source_name): + """ + Set the path for download according to the source name. + + Args: + source_name:the source + + Returns: + the data directory for data download. + """ + data_base = os.path.expanduser(os.path.join('~',' .paddle')) + if not os.access(data_base, os.W_OK): + data_base = os.path.join('/tmp', '.paddle') + datadir = os.path.join(data_base, source_name) + print datadir + if not os.path.exists(datadir): + os.makedirs(datadir) + return datadir + + +def data_download(download_dir, source_url): + """ + Download data according to the url for mnist. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
+ """ + src_file = url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + move_files(src_file, download_dir) + print("Download finished, Extracting files.") + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + return download_dir + + +def move_files(source_dire, target_dire): + """ + Renaming the source file to other name. + + Args: + source_dire:the source name of file + target_dire:the target name of file. + + Returns: + """ + shutil.move(source_dire, target_dire) + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +if __name__ == '__main__': + path = fetch() + print path From 1373977e2c066f9d8509691da0b37f234cb05d0d Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 13 Jan 2017 11:45:27 +0800 Subject: [PATCH 07/18] update code --- python/paddle/data/DATA.md | 28 +++++ python/paddle/data/amazon.py | 165 ++++++--------------------- python/paddle/data/cifar10.py | 158 +++++-------------------- python/paddle/data/http_download.py | 124 ++++++++++++++++++++ python/paddle/data/mnist.py | 129 ++++----------------- python/paddle/data/recommendation.py | 145 ++++------------------- python/paddle/data/semantic.py | 152 ++++++------------------ python/paddle/data/sentiment.py | 155 +++++-------------------- python/paddle/data/seqToseq.py | 129 +++------------------ 9 files changed, 340 insertions(+), 845 deletions(-) create mode 100644 python/paddle/data/DATA.md create mode 100644 python/paddle/data/http_download.py diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md new file mode 100644 index 00000000000000..ce186d42619509 --- /dev/null +++ b/python/paddle/data/DATA.md @@ -0,0 +1,28 @@ +## 需求 + +Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载其数据,并进行复杂的预处理过程,整个过程会耗费大量时间。 + +所以我们需要数据封装接口,采用import数据源的方式(如\:import paddle.data.amazon.review.GetJSON)来简化获取训练所需数据的时间;但是如果你习惯自己处理原生数据,我们依然提供原生数据接口来满足你的需求。 + +## 整体思路 + +数据封装接口的目的是提供数据。不论是原生数据,还是预处理数据都通过import方式导入各模型进行训练;考虑到某些模型的预处理后的数据量依然很大,或有时就仅仅想训练相对较小的网络模型,没必要考虑全量数据,自动配置数据量大小必然更符合不同需求。整个接口初步设想如下: +* 开关来控制数据来源 + * 导入数据接口时,带有开关(如:src\_from = True,来自预处理源;否则,来自原生数据源) +* 预处理数据部分添加配置train和test的数据量的大小 +* 原生数据部分的数据下载数据模块化 + * 开关(src\_from = False)和<模型,数据源>对完成相关数据的下载 +* 原生数据的预处理部分保持原状,通过<模型,预处理过程>对完成数据的预处理 +* 在paddle的train的配置文件中修改数据源的导入方式 + +整个过程在tensorflow的mnist模型已有人实现,借鉴此思想,实现paddle的各demo数据接口的通用化。 + +```python +amazon = input_data.load_dataset( + 'Amazon', + '/Users/baidu/git/test_package/data', + data_unneed=False, + src_flag=False) +batch = amazon.train.shrink_txt('train',10) +``` + diff --git 
a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index 54e90e83e8be63..361a3fa79ded6e 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -15,22 +15,43 @@ # See the License for the specific language governing permissions and # limitations under the License. +######################################################################## +# +# Function for fetch the data untar directory for amazon training api. +# As the python can read the data in "reviews_Electronics_5.json.gz", +#here is no need to untar the data. +# +# +# First,we let the data download path is "~/paddle_data_directory" +# when u no special the download path. +# +# +# Then,download the data,according to the speical source url. +# Here,no need to untar the "reviews_Electronics_5.json.gz". +# +# After download the data,return the path of data. +# +# +######################################################################### + import shutil import os import sys import zipfile import collections -import numpy as np -from six.moves import urllib import stat +from six.moves import urllib +from http_download import data_download + source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' -file_source = "mosesdecoder-master" + +mose_source = "mosesdecoder-master" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -38,136 +59,20 @@ def fetch(): Args: Returns: - path to downloaded file. + path for the data untar. """ source_name = "amazon" - data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) - filepath = data_download(data_home,moses_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) -def data_download(download_dir,source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. 
- """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path,'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - os.remove(file_path) - print("Unpacking done!") - else: - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path,'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. + moses_src = data_download(download_path, moses_url) + moses_path = os.path.join(moses_src, mose_source) - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + filepath = data_download(download_path, source_url) + return filepath -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py index 1d461ba4466a49..72c4c9bcbc11ce 100644 --- a/python/paddle/data/cifar10.py +++ b/python/paddle/data/cifar10.py @@ -15,33 +15,38 @@ # See the License for the specific language governing permissions and # limitations under the License. +######################################################################## +# +# Function for fetch the data untar directory for cifar10 training api. +# you can use this data for image classifation and gun traing. +# As the python can read the data in "cifar-10-python.tar.gz",herer is +# no need to untar the data. +# +# +# First,we let the data download path is "~/paddle_data_directory", +# when u no special the download path. +# +# +# Then,download the cifar10 dataset,and returns the data directory for +# training api. 
+# +######################################################################## + import shutil import os import sys -import tarfile -import zipfile import collections import numpy as np from six.moves import urllib +from http_download import data_download + source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' source_file = "cifar-10-batches-py" -label_map = { -0: "airplane", -1: "automobile", -2: "bird", -3: "cat", -4: "deer", -5: "dog", -6: "frog", -7: "horse", -8: "ship", -9: "truck" -} -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -49,124 +54,17 @@ def fetch(): Args: Returns: - path to downloaded file. + path to untar file. """ - num_images_train = 50000 - num_batch = 5 source_name = "cifar" - file_source = "cifar-10-batches-py" - #Set the download dir for cifar. - data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~','.paddle')) - print data_base - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file,download_dir) - print("Download finished, Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. 
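The progress printing that each dataset module used to carry, and that http_download.py now centralises, is urlretrieve's reporthook protocol: the hook is invoked with the block count, block size, and total size after every chunk received. A self-contained sketch, with the URL given only as an example:

```python
import sys
from six.moves import urllib


def report(count, block_size, total_size):
    # reporthook protocol: urlretrieve calls this repeatedly while streaming.
    percent = min(float(count * block_size) / total_size, 1.0)
    sys.stdout.write("\r- Download progress: {:.1%}".format(percent))
    sys.stdout.flush()

# Example (any HTTP URL will do):
# urllib.request.urlretrieve(
#     'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
#     'cifar-10-python.tar.gz', reporthook=report)
```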
+ if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - Args: - count: - block_size: - total_size: + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) + filepath = data_download(download_path, source_url) - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + return filepath -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/http_download.py b/python/paddle/data/http_download.py new file mode 100644 index 00000000000000..bef850da841e48 --- /dev/null +++ b/python/paddle/data/http_download.py @@ -0,0 +1,124 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +######################################################################## +# +# Funciton for data download,it use the urllib urlretrieve and we can +# see the download process when downloading the source. +# +# download process like: - Download progress:10% +# +######################################################################## + + +import os +import sys +import shutil +import zipfile +import tarfile +import stat +from six.moves import urllib + + +def download_with_urlretrieve(url, filename=None): + """ + Download each file with urlretrieve,and the download process can be seen. + + Args: + url:the url for data downoad. + filename:the target name for download. + + Returns: + the temp name after urlretrieve downloaded. + """ + return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + + +def check_download_progress(count, block_size, total_size): + """ + Print and check the download process. + + Args: + count: + block_size: + total_size: + + Returns: + """ + percent = float(count * block_size) / total_size + msg = "\r- Download progress: {:.1%}".format(percent) + sys.stdout.write(msg) + sys.stdout.flush() + + +def data_download(download_dir, source_url): + """ + Download data according to the url for source_name. + when downloading,it can see each download process. + + Args: + download_dir:the directory for data download. + source_url:the url for data download. + + Returns: + the path after data downloaded. 
+ """ + src_file = source_url.strip().split('/')[-1] + file_path = os.path.join(download_dir, src_file) + + print file_path + if not os.path.exists(file_path): + temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_path = os.getcwd() + os.rename(temp_file_name, src_file) + shutil.move(src_file, download_dir) + print("Download finished, Extracting files.") + + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + if 'master' in src_file: + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + elif src_file in ['.json.gz','txt','emb','python.tar.gz']: + pass + elif src_file.split('.')[-1] is 'gz': + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Unpacking done!") + else: + if 'zip' in src_file: + tar = zipfile.ZipFile(file_path, 'r') + infos = tar.infolist() + for file in infos: + tar.extract(file, download_dir) + fpath = os.path.join(download_dir, file.filename) + if 'master' in src_file: + os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.remove(file_path) + elif src_file in ['.json.gz','txt','emb']: + pass + elif src_file.split('.')[-1] is 'gz': + tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) + os.remove(file_path) + print("Data has been already downloaded and unpacked!") + + return download_dir + diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index ac16cf1919350b..c084954848b985 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -15,6 +15,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for mnist training api. +# you can use this data for Digital identification. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the mnist dataset,it untar the dataset,and returns the untar +# directory for training api. +# +############################################################################ + + import shutil import os import sys @@ -24,10 +36,12 @@ import urlparse import gzip + source_url = 'http://yann.lecun.com/exdb/mnist/' filename = ['train-images-idx3-ubyte.gz','t10k-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-labels-idx1-ubyte.gz'] -def fetch(): + +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -35,113 +49,20 @@ def fetch(): Args: Returns: - path to downloaded file. + path for untar file. """ source_name = "mnist" - data_home = set_data_path(source_name) - filepath = data_download(data_home,source_url) - return filepath - -def set_data_path(source_name): - """ - Set the path for download according to the source name. + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - Args: - source_name:the source + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) - Returns: - the data directory for data download. 
- """ - data_base = os.path.expanduser(os.path.join('~','.paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir,source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ for file in filename: - data_url = urlparse.urljoin(source_url,file) - file_path = os.path.join(download_dir,file) - untar_path = os.path.join(download_dir,file.replace(".gz","")) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(data_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name,file) - move_files(file,download_dir) - print("Download finished,Extracting files.") - g_file = gzip.GzipFile(file_path) - open(untar_path,'w+').write(g_file.read()) - g_file.close() - print("Unpacking done!") - else: - g_file = gzip.GzipFile(file_path) - open(untar_path,'w+').write(g_file.read()) - g_file.close() - print("Data has been already downloaded and unpacked!") - os.remove(file_path) - return download_dir - - -def move_files(source_dire,target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire,target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() - + url = urlparse.urljoin(source_url, file) + filepath = data_download(download_path, url) + data_dir = os.path.join(filepath, file.split('.')[0]) + return data_dir -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index 1e93b6dc161224..ca7dce8a9bfb10 100644 --- a/python/paddle/data/recommendation.py +++ b/python/paddle/data/recommendation.py @@ -15,6 +15,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for amazon training api. +# you can use this data for movie recommendation. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the movie recommendation dataset,it untar the dataset,and returns the +# untar directory for training api. 
+# +############################################################################## + import shutil import os @@ -27,10 +38,10 @@ source_url='http://files.grouplens.org/datasets/movielens/ml-1m.zip' -file_source = "mosesdecoder-master" +file_source = "ml-1m" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -41,128 +52,14 @@ def fetch(): path to downloaded file. """ source_name = "recommendation" - #Set the download dir for recommendation. - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.remove(file_path) - print("Unpacking done!") - else: - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. 
- - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) + filepath = data_download(download_path, source_url) + data_path = os.path.join(filepath, file_source) -if __name__ == '__main__': - path = fetch() - print path + return data_path diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index 087e6e6640fea3..8950bb4c98e3b4 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -16,6 +16,18 @@ # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for semantic_role_labeling +# training api.you can use this data for semantic. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the semantic role labeling,it untar the dataset,and returns the untar +# directory for training api. +# +############################################################################ + + import shutil import os import sys @@ -32,8 +44,10 @@ 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' ] +file_source = "conll05st-release" + -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -44,121 +58,31 @@ def fetch(): path to downloaded file. """ source_name = "semantic" - data_home = set_data_path(source_name) - model_path = data_download(data_home, model_url) - for url in source_url: - filepath = data_download(data_home, moses_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) + for url in source_url: + file_name = url.split('/')[-1] + if 'gz' in file_name: + filepath = data_download(download_path, url) + data_path = os.path.join(filepath, file_source) + + sub_file = ['est.wsj.words.gz', 'test.wsj.props.gz'] + words_path = os.path.join(data_path, "test.wsj/words/test.wsj.words.gz") + props_path = os.path.join(data_path, "test.wsj/props/test.wsj.props.gz") + + sub_path = [words_path, props_path] + for sub_file in sub_path: + new_sub_path = os.path.join(download_path, sub_file) + shutil.move(sub_path, new_subpath) + tarfile.open(name=new_subpath, mode="r:gz").extractall(download_path) + os.remove(new_subpath) else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. 
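In the semantic.py fetch() above, the move-and-extract loop looks like it has a few slips: the sub_file list starts with 'est.wsj.words.gz' (presumably 'test.wsj.words.gz'), shutil.move receives the whole sub_path list rather than the current entry, and new_subpath is read where new_sub_path was assigned. A corrected sketch of that step, assuming each listed archive should be moved next to download_path and unpacked with the same tarfile call the patch uses:

```python
import os
import shutil
import tarfile


def relocate_and_unpack(archives, download_path):
    # archives: absolute paths such as
    #   <data_path>/test.wsj/words/test.wsj.words.gz
    for src in archives:
        dst = os.path.join(download_path, os.path.basename(src))
        shutil.move(src, dst)
        tarfile.open(name=dst, mode="r:gz").extractall(download_path)
        os.remove(dst)
```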
- """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. - """ - src_file = url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + filepath = data_download(download_path, url) + return filepath -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index 18146ba93803a7..c1d74c51b60a88 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -16,6 +16,18 @@ # limitations under the License. +############################################################################ +# +# Function for fetch the data untar directory for sentiment training api. +# you can use this data for sentiment analasis. +# +# First,we special the data download directory is "~/paddle_data_directory". +# For the sentiment dataset,it untar the dataset,and returns the untar +# directory for training api. +# +############################################################################ + + import shutil import os import sys @@ -27,10 +39,12 @@ source_url='http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' -file_source = "mosesdecoder-master" + +moses_source = "mosesdecoder-master" +file_source = "aclImdb" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. 
@@ -41,137 +55,20 @@ def fetch(): path to downloaded file. """ source_name = "sentiment" - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) - filepath = data_download(data_home, moses_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) - Returns: - the path after data downloaded. - """ - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Unpacking done!") - else: - if 'zip' in src_file: - tar = zipfile.ZipFile(file_path, 'r') - infos = tar.infolist() - for file in infos: - tar.extract(file, download_dir) - fpath = os.path.join(download_dir, file.filename) - os.chmod(fpath, stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir - - -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) + moses_path = data_download(download_path, moses_url) + moses_data = os.path.join(moses_path, moses_source) + filepath = data_download(download_path, source_url) + data_path = os.path.join(filepath, file_source) -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. + return data_path - Returns: - the temp name after urlretrieve downloaded. 
- """ - return urllib.request.urlretrieve(url, filename, rereporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index a9bdd0bf73cd2c..4ead9def25563a 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -25,13 +25,16 @@ from six.moves import urllib import stat + source_url=['http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' ] model_url='http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' +model_source = "wmt14_model" +file_source = "bitexts.selected" -def fetch(): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. @@ -42,121 +45,19 @@ def fetch(): path to downloaded file. """ source_name = "seqToseq" - data_home = set_data_path(source_name) - model_path = data_download(data_home, model_url) - for url in source_url: - filepath = data_download(data_home, source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - return filepath - - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - - -def set_data_path(source_name): - """ - Set the path for download according to the source name. - - Args: - source_name:the source - - Returns: - the data directory for data download. - """ - data_base = os.path.expanduser(os.path.join('~',' .paddle')) - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - - -def data_download(download_dir, source_url): - """ - Download data according to the url for mnist. - when downloading,it can see each download process. - - Args: - download_dir:the directory for data download. - source_url:the url for data download. - - Returns: - the path after data downloaded. 
- """ - src_file = url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished, Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - os.remove(file_path) - print("Data has been already downloaded and unpacked!") - return download_dir + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + download_path = os.path.join(directory, source_name) + if not os.path.exists(download_path): + os.makedirs(download_path) -def move_files(source_dire, target_dire): - """ - Renaming the source file to other name. - - Args: - source_dire:the source name of file - target_dire:the target name of file. - - Returns: - """ - shutil.move(source_dire, target_dire) - - -def download_with_urlretrieve(url, filename=None): - """ - Download each file with urlretrieve,and the download process can be seen. - - Args: - url:the url for data downoad. - filename:the target name for download. + model_data = data_download(download_path, model_url) + model_path = os.path.join(model_data, model_source) - Returns: - the temp name after urlretrieve downloaded. - """ - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) - - -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: - - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + for url in source_url: + filepath = data_download(download_path, url) + data_path = os.path.join(filepath, file_source) + return data_path -if __name__ == '__main__': - path = fetch() - print path From ee9b1c639470c619cfcbb33e4f51cd589ba48e2b Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 13 Jan 2017 12:29:48 +0800 Subject: [PATCH 08/18] update --- python/paddle/data/DATA.md | 28 -------- python/paddle/data/amazon.py | 11 ++- python/paddle/data/cifar10.py | 8 +-- python/paddle/data/cifar_10.py | 100 --------------------------- python/paddle/data/http_download.py | 15 ++-- python/paddle/data/mnist.py | 11 +-- python/paddle/data/recommendation.py | 7 +- python/paddle/data/semantic.py | 28 ++++---- python/paddle/data/sentiment.py | 12 ++-- python/paddle/data/seqToseq.py | 17 +++-- 10 files changed, 50 insertions(+), 187 deletions(-) delete mode 100644 python/paddle/data/DATA.md delete mode 100644 python/paddle/data/cifar_10.py diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md deleted file mode 100644 index ce186d42619509..00000000000000 --- a/python/paddle/data/DATA.md +++ /dev/null @@ -1,28 +0,0 @@ -## 需求 - -Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载其数据,并进行复杂的预处理过程,整个过程会耗费大量时间。 - -所以我们需要数据封装接口,采用import数据源的方式(如\:import paddle.data.amazon.review.GetJSON)来简化获取训练所需数据的时间;但是如果你习惯自己处理原生数据,我们依然提供原生数据接口来满足你的需求。 - -## 整体思路 - -数据封装接口的目的是提供数据。不论是原生数据,还是预处理数据都通过import方式导入各模型进行训练;考虑到某些模型的预处理后的数据量依然很大,或有时就仅仅想训练相对较小的网络模型,没必要考虑全量数据,自动配置数据量大小必然更符合不同需求。整个接口初步设想如下: -* 开关来控制数据来源 - * 导入数据接口时,带有开关(如:src\_from = True,来自预处理源;否则,来自原生数据源) -* 预处理数据部分添加配置train和test的数据量的大小 -* 原生数据部分的数据下载数据模块化 - * 开关(src\_from = 
False)和<模型,数据源>对完成相关数据的下载 -* 原生数据的预处理部分保持原状,通过<模型,预处理过程>对完成数据的预处理 -* 在paddle的train的配置文件中修改数据源的导入方式 - -整个过程在tensorflow的mnist模型已有人实现,借鉴此思想,实现paddle的各demo数据接口的通用化。 - -```python -amazon = input_data.load_dataset( - 'Amazon', - '/Users/baidu/git/test_package/data', - data_unneed=False, - src_flag=False) -batch = amazon.train.shrink_txt('train',10) -``` - diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py index 361a3fa79ded6e..a284e1dc4fd8d8 100644 --- a/python/paddle/data/amazon.py +++ b/python/paddle/data/amazon.py @@ -34,7 +34,6 @@ # ######################################################################### - import shutil import os import sys @@ -44,9 +43,8 @@ from six.moves import urllib from http_download import data_download - -source_url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' -moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +source_url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' +moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' mose_source = "mosesdecoder-master" @@ -63,7 +61,8 @@ def fetch(directory=None): """ source_name = "amazon" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -74,5 +73,3 @@ def fetch(directory=None): filepath = data_download(download_path, source_url) return filepath - - diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py index 72c4c9bcbc11ce..d6d893288a851c 100644 --- a/python/paddle/data/cifar10.py +++ b/python/paddle/data/cifar10.py @@ -32,7 +32,6 @@ # ######################################################################## - import shutil import os import sys @@ -41,8 +40,7 @@ from six.moves import urllib from http_download import data_download - -source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' +source_url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' source_file = "cifar-10-batches-py" @@ -59,7 +57,8 @@ def fetch(directory=None): source_name = "cifar" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -67,4 +66,3 @@ def fetch(directory=None): filepath = data_download(download_path, source_url) return filepath - diff --git a/python/paddle/data/cifar_10.py b/python/paddle/data/cifar_10.py deleted file mode 100644 index 762d4b2d40ca52..00000000000000 --- a/python/paddle/data/cifar_10.py +++ /dev/null @@ -1,100 +0,0 @@ -#/usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
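The obsolete cifar_10.py deleted here (its remaining body follows below) defined an _unpickle helper that referenced cPickle without importing it. For reference, a self-contained version of that helper; the batch layout noted in the trailing comment is the standard CIFAR-10 python format:

```python
import sys


def unpickle(file_path):
    # cPickle only exists on Python 2; fall back to pickle elsewhere.
    if sys.version_info < (3,):
        import cPickle as pickle
        load = pickle.load
    else:
        import pickle
        load = lambda f: pickle.load(f, encoding='bytes')
    with open(file_path, 'rb') as f:
        return load(f)

# e.g. batch = unpickle('cifar-10-batches-py/data_batch_1')
# batch[b'data'] holds 10000 rows of 3072 uint8 pixels, batch[b'labels'] the 0-9 ids.
```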
- - -import shutil -import os -import sys -import tarfile -import zipfile -import collections -import numpy as np -from six.moves import urllib - -source_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' -source_file = "cifar-10-batches-py" -label_map = { -0: "airplane", -1: "automobile", -2: "bird", -3: "cat", -4: "deer", -5: "dog", -6: "frog", -7: "horse", -8: "ship", -9: "truck" -} - -def fetch(): - num_images_train = 50000 - num_batch = 5 - source_name = "cifar" - file_source = "cifar-10-batches-py" - #Set the download dir for cifar. - data_home = set_data_path(source_name) - filepath = data_download(data_home, source_url) - """ - for i in range(1, num_batch + 1): - fpath = os.path.join(filepath, "data_batch_%d" % i) - """ - -def _unpickle(file_path): - with open(file_path, mode='rb') as file: - if sys.version_info < (3,): - data = cPickle.load(file) - else: - data = cPickle.load(file, encoding='bytes') - return data - -def set_data_path(source_name): - data_base = os.path.expanduser(os.path.join('~', '.paddle')) - print data_base - if not os.access(data_base, os.W_OK): - data_base = os.path.join('/tmp', '.paddle') - datadir = os.path.join(data_base, source_name) - print datadir - if not os.path.exists(datadir): - os.makedirs(datadir) - return datadir - -def data_download(download_dir, source_url): - src_file = source_url.strip().split('/')[-1] - file_path = os.path.join(download_dir, src_file) - if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) - temp_file_path = os.getcwd() - os.rename(temp_file_name, src_file) - move_files(src_file, download_dir) - print("Download finished,Extracting files.") - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Unpacking done!") - else: - tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) - print("Data has been already downloaded and unpacked!") - return download_dir - -def move_files(source_dire, target_dire): - shutil.move(source_dire, target_dire) - -def download_with_urlretrieve(url, filename=None): - return urllib.request.urlretrieve(url, filename) - - -if __name__ == '__main__': - path = fetch() - print path diff --git a/python/paddle/data/http_download.py b/python/paddle/data/http_download.py index bef850da841e48..0f128c19727582 100644 --- a/python/paddle/data/http_download.py +++ b/python/paddle/data/http_download.py @@ -24,7 +24,6 @@ # ######################################################################## - import os import sys import shutil @@ -45,7 +44,8 @@ def download_with_urlretrieve(url, filename=None): Returns: the temp name after urlretrieve downloaded. 
""" - return urllib.request.urlretrieve(url, filename, reporthook=check_download_progress) + return urllib.request.urlretrieve( + url, filename, reporthook=check_download_progress) def check_download_progress(count, block_size, total_size): @@ -82,7 +82,7 @@ def data_download(download_dir, source_url): print file_path if not os.path.exists(file_path): - temp_file_name,_ = download_with_urlretrieve(source_url) + temp_file_name, _ = download_with_urlretrieve(source_url) temp_file_path = os.getcwd() os.rename(temp_file_name, src_file) shutil.move(src_file, download_dir) @@ -95,9 +95,9 @@ def data_download(download_dir, source_url): tar.extract(file, download_dir) fpath = os.path.join(download_dir, file.filename) if 'master' in src_file: - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.chmod(fpath, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) os.remove(file_path) - elif src_file in ['.json.gz','txt','emb','python.tar.gz']: + elif src_file in ['.json.gz', 'txt', 'emb', 'python.tar.gz']: pass elif src_file.split('.')[-1] is 'gz': tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) @@ -111,9 +111,9 @@ def data_download(download_dir, source_url): tar.extract(file, download_dir) fpath = os.path.join(download_dir, file.filename) if 'master' in src_file: - os.chmod(fpath,stat.S_IRWXU|stat.S_IRGRP|stat.S_IROTH) + os.chmod(fpath, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) os.remove(file_path) - elif src_file in ['.json.gz','txt','emb']: + elif src_file in ['.json.gz', 'txt', 'emb']: pass elif src_file.split('.')[-1] is 'gz': tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) @@ -121,4 +121,3 @@ def data_download(download_dir, source_url): print("Data has been already downloaded and unpacked!") return download_dir - diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index c084954848b985..151e50c3d65a29 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -26,7 +26,6 @@ # ############################################################################ - import shutil import os import sys @@ -36,9 +35,11 @@ import urlparse import gzip - source_url = 'http://yann.lecun.com/exdb/mnist/' -filename = ['train-images-idx3-ubyte.gz','t10k-images-idx3-ubyte.gz','train-labels-idx1-ubyte.gz','t10k-labels-idx1-ubyte.gz'] +filename = [ + 'train-images-idx3-ubyte.gz', 't10k-images-idx3-ubyte.gz', + 'train-labels-idx1-ubyte.gz', 't10k-labels-idx1-ubyte.gz' +] def fetch(directory=None): @@ -54,7 +55,8 @@ def fetch(directory=None): source_name = "mnist" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -65,4 +67,3 @@ def fetch(directory=None): filepath = data_download(download_path, url) data_dir = os.path.join(filepath, file.split('.')[0]) return data_dir - diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index ca7dce8a9bfb10..387180231c9fcb 100644 --- a/python/paddle/data/recommendation.py +++ b/python/paddle/data/recommendation.py @@ -26,7 +26,6 @@ # ############################################################################## - import shutil import os import sys @@ -36,8 +35,7 @@ from six.moves import urllib import stat - -source_url='http://files.grouplens.org/datasets/movielens/ml-1m.zip' +source_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' file_source = 
"ml-1m" @@ -53,7 +51,8 @@ def fetch(directory=None): """ source_name = "recommendation" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index 8950bb4c98e3b4..d9b7367044579e 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - ############################################################################ # # Function for fetch the data untar directory for semantic_role_labeling @@ -27,7 +26,6 @@ # ############################################################################ - import shutil import os import sys @@ -37,12 +35,13 @@ from six.moves import urllib import stat -source_url=['http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' - ] +source_url = [ + 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', + 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +] file_source = "conll05st-release" @@ -59,7 +58,8 @@ def fetch(directory=None): """ source_name = "semantic" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -72,17 +72,19 @@ def fetch(directory=None): data_path = os.path.join(filepath, file_source) sub_file = ['est.wsj.words.gz', 'test.wsj.props.gz'] - words_path = os.path.join(data_path, "test.wsj/words/test.wsj.words.gz") - props_path = os.path.join(data_path, "test.wsj/props/test.wsj.props.gz") + words_path = os.path.join(data_path, + "test.wsj/words/test.wsj.words.gz") + props_path = os.path.join(data_path, + "test.wsj/props/test.wsj.props.gz") sub_path = [words_path, props_path] for sub_file in sub_path: new_sub_path = os.path.join(download_path, sub_file) shutil.move(sub_path, new_subpath) - tarfile.open(name=new_subpath, mode="r:gz").extractall(download_path) + tarfile.open( + name=new_subpath, mode="r:gz").extractall(download_path) os.remove(new_subpath) else: filepath = data_download(download_path, url) return filepath - diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index c1d74c51b60a88..ea4193e53ed299 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - ############################################################################ # # Function for fetch the data untar directory for sentiment training api. 
@@ -27,7 +26,6 @@ # ############################################################################ - import shutil import os import sys @@ -37,8 +35,8 @@ from six.moves import urllib import stat -source_url='http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' -moses_url='https://github.com/moses-smt/mosesdecoder/archive/master.zip' +source_url = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' moses_source = "mosesdecoder-master" file_source = "aclImdb" @@ -56,7 +54,8 @@ def fetch(directory=None): """ source_name = "sentiment" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -69,6 +68,3 @@ def fetch(directory=None): data_path = os.path.join(filepath, file_source) return data_path - - - diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index 4ead9def25563a..8850292f0dda87 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import shutil import os import sys @@ -25,15 +24,16 @@ from six.moves import urllib import stat - -source_url=['http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', - 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' - ] -model_url='http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' +source_url = [ + 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', + 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' +] +model_url = 'http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' model_source = "wmt14_model" file_source = "bitexts.selected" + def fetch(directory=None): """ According to the source name,set the download path for source, @@ -46,7 +46,8 @@ def fetch(directory=None): """ source_name = "seqToseq" if directory is None: - directory = os.path.expanduser(os.path.join('~', 'paddle_data_directory')) + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory')) download_path = os.path.join(directory, source_name) if not os.path.exists(download_path): @@ -59,5 +60,3 @@ def fetch(directory=None): filepath = data_download(download_path, url) data_path = os.path.join(filepath, file_source) return data_path - - From c53599f1d78aecec3e619d576e09df04df38fe78 Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 13 Jan 2017 12:33:36 +0800 Subject: [PATCH 09/18] update --- python/paddle/data/mnist.py | 1 + python/paddle/data/recommendation.py | 2 ++ python/paddle/data/semantic.py | 3 ++- python/paddle/data/sentiment.py | 1 + python/paddle/data/seqToseq.py | 2 ++ 5 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 151e50c3d65a29..5fe6f6dccc0875 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -34,6 +34,7 @@ from six.moves import urllib import urlparse import gzip +from http_download import data_download source_url = 'http://yann.lecun.com/exdb/mnist/' filename = [ diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index 387180231c9fcb..6c3fba55c8919b 100644 --- a/python/paddle/data/recommendation.py +++ 
b/python/paddle/data/recommendation.py @@ -34,6 +34,8 @@ import numpy as np from six.moves import urllib import stat +from http_download import data_download + source_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' file_source = "ml-1m" diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index d9b7367044579e..dfafb5120cf88c 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -33,7 +33,8 @@ import collections import numpy as np from six.moves import urllib -import stat +from http_download import data_download + source_url = [ 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index ea4193e53ed299..e0a72e0d9b9809 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -34,6 +34,7 @@ import numpy as np from six.moves import urllib import stat +from http_download import data_download source_url = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index 8850292f0dda87..ced53be2d1a9d5 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -23,6 +23,8 @@ import numpy as np from six.moves import urllib import stat +from http_download import data_download + source_url = [ 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', From 6173153da0006699ab020dde8418cf54575eb77f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Jan 2017 16:20:53 +0800 Subject: [PATCH 10/18] Refine amazon_product_reviews.py --- python/paddle/data/amazon.py | 75 ------------ python/paddle/data/amazon_product_reviews.py | 119 +++++++++++++++++++ python/paddle/data/http_download.py | 48 ++++---- python/paddle/data/logger.py | 5 + 4 files changed, 147 insertions(+), 100 deletions(-) delete mode 100644 python/paddle/data/amazon.py create mode 100644 python/paddle/data/amazon_product_reviews.py create mode 100644 python/paddle/data/logger.py diff --git a/python/paddle/data/amazon.py b/python/paddle/data/amazon.py deleted file mode 100644 index a284e1dc4fd8d8..00000000000000 --- a/python/paddle/data/amazon.py +++ /dev/null @@ -1,75 +0,0 @@ -#/usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -######################################################################## -# -# Function for fetch the data untar directory for amazon training api. -# As the python can read the data in "reviews_Electronics_5.json.gz", -#here is no need to untar the data. -# -# -# First,we let the data download path is "~/paddle_data_directory" -# when u no special the download path. -# -# -# Then,download the data,according to the speical source url. -# Here,no need to untar the "reviews_Electronics_5.json.gz". 
-# -# After download the data,return the path of data. -# -# -######################################################################### - -import shutil -import os -import sys -import zipfile -import collections -import stat -from six.moves import urllib -from http_download import data_download - -source_url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz' -moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' - -mose_source = "mosesdecoder-master" - - -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - - Args: - - Returns: - path for the data untar. - """ - source_name = "amazon" - if directory is None: - directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) - - moses_src = data_download(download_path, moses_url) - moses_path = os.path.join(moses_src, mose_source) - - filepath = data_download(download_path, source_url) - return filepath diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py new file mode 100644 index 00000000000000..5a282293eb1be4 --- /dev/null +++ b/python/paddle/data/amazon_product_reviews.py @@ -0,0 +1,119 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A utility for fetching, reading amazon product review data set. 
+ +http://jmcauley.ucsd.edu/data/amazon/ +""" + +import os +from http_download import download +from logger import logger +import hashlib + +BASE_URL = 'http://snap.stanford.edu/data/' \ + 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' + + +class Categories(object): + Books = "Books" + Electronics = "Electronics" + MoviesAndTV = "Movies_and_TV" + CDsAndVinyl = "CDs_and_Vinyl" + ClothingShoesAndJewelry = "Clothing_Shoes_and_Jewelry" + HomeAndKitchen = "Home_and_Kitchen" + KindleStore = "Kindle_Store" + SportsAndOutdoors = "Sports_and_Outdoors" + CellPhonesAndAccessories = "Cell_Phones_and_Accessories" + HealthAndPersonalCare = "Health_and_Personal_Care" + ToysAndGames = "Toys_and_Games" + VideoGames = "Video_Games" + ToolsAndHomeImprovement = "Tools_and_Home_Improvement" + Beauty = "Beauty" + AppsForAndroid = "Apps_for_Android" + OfficeProducts = "Office_Products" + PetSupplies = "Pet_Supplies" + Automotive = "Automotive" + GroceryAndGourmetFood = "Grocery_and_Gourmet" + PatioLawnAndGarden = "Patio_Lawn_and_Garden" + Baby = "Baby" + DigitalMusic = "Digital_Music" + MusicalInstruments = "Musical_Instruments" + AmazonInstantVideo = "Amazon_Instant_Video" + + __md5__ = dict() + + __md5__[AmazonInstantVideo] = '10812e43e99c345f63333d8ee10aef6a' + __md5__[AppsForAndroid] = 'a7d1ae198b862eea6910fe45c842b0c6' + __md5__[Automotive] = '757fdb1ab2c5e2fc0934047721082011' + __md5__[Baby] = '7698a4179a1d8385e946ed9083490d22' + __md5__[Beauty] = '5d2ccdcd86641efcfbae344317c10829' + + +__all__ = ['fetch', 'Categories'] + + +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + + Args: + + Returns: + path for the data untar. + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + + if not os.path.exists(directory): + os.makedirs(directory) + logger.info("Downloading amazon review dataset for %s category" % category) + return download(BASE_URL % category, + os.path.join(directory, '%s.json.gz' % category)) + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def main(): + categories = filter( + lambda c: getattr(Categories, c) not in Categories.__md5__.keys(), + filter(lambda c: c[0] != '_', dir(Categories))) + + for each in categories: + try: + filename = fetch(category=getattr(Categories, each)) + except Exception as e: + print type(e) + continue + print each, calculate_md5(filename) + os.remove(filename) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/data/http_download.py b/python/paddle/data/http_download.py index 0f128c19727582..668dc9966778a8 100644 --- a/python/paddle/data/http_download.py +++ b/python/paddle/data/http_download.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -33,36 +33,31 @@ from six.moves import urllib -def download_with_urlretrieve(url, filename=None): +def download_with_urlretrieve(url, filename=None, with_progress=True): """ Download each file with urlretrieve,and the download process can be seen. - Args: - url:the url for data downoad. - filename:the target name for download. - - Returns: - the temp name after urlretrieve downloaded. 
+ :param url: the url for data download. + :type url: basestring + :param filename: Output file name. None if use default file name. + :type filename: basestring + :param with_progress: with progress bar or not. Default is true. + :type with_progress: bool + :return: the downloaded filename + :rtype: basestring """ - return urllib.request.urlretrieve( - url, filename, reporthook=check_download_progress) + def check_download_progress(count, block_size, total_size): + percent = float(count * block_size) / total_size + msg = "\r- Downloading {1} progress: {0:.1%}".format(percent, filename) + sys.stdout.write(msg) + sys.stdout.flush() -def check_download_progress(count, block_size, total_size): - """ - Print and check the download process. - - Args: - count: - block_size: - total_size: + hook = None + if with_progress: + hook = check_download_progress - Returns: - """ - percent = float(count * block_size) / total_size - msg = "\r- Download progress: {:.1%}".format(percent) - sys.stdout.write(msg) - sys.stdout.flush() + return urllib.request.urlretrieve(url, filename, reporthook=hook)[0] def data_download(download_dir, source_url): @@ -82,7 +77,7 @@ def data_download(download_dir, source_url): print file_path if not os.path.exists(file_path): - temp_file_name, _ = download_with_urlretrieve(source_url) + temp_file_name = download_with_urlretrieve(source_url) temp_file_path = os.getcwd() os.rename(temp_file_name, src_file) shutil.move(src_file, download_dir) @@ -121,3 +116,6 @@ def data_download(download_dir, source_url): print("Data has been already downloaded and unpacked!") return download_dir + + +download = download_with_urlretrieve diff --git a/python/paddle/data/logger.py b/python/paddle/data/logger.py new file mode 100644 index 00000000000000..52b0df4535a491 --- /dev/null +++ b/python/paddle/data/logger.py @@ -0,0 +1,5 @@ +import logging + +__all__ = ['__logger__'] + +logger = logging.getLogger("paddle.data") From 7972f74f141e7b8f37495f2c351294c8b2b32a33 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Jan 2017 16:53:34 +0800 Subject: [PATCH 11/18] Add md5 checks. 
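Record the published md5 checksum for each review category and skip the download whenever a cached archive already matches it. For reference, a minimal usage sketch under the defaults this patch introduces (the call site is hypothetical; fetch, Categories and calculate_md5 are the helpers added here):

    from paddle.data import amazon_product_reviews as amazon

    # The first call downloads the Electronics review archive into
    # ~/paddle_data/amazon; later calls return the cached file as soon as
    # calculate_md5(fn) matches Categories.__md5__[category].
    path = amazon.fetch(category=amazon.Categories.Electronics)
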
--- python/paddle/data/amazon_product_reviews.py | 61 +++++++++++--------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index 5a282293eb1be4..2f459bf98b2b60 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -62,11 +62,38 @@ class Categories(object): __md5__[Automotive] = '757fdb1ab2c5e2fc0934047721082011' __md5__[Baby] = '7698a4179a1d8385e946ed9083490d22' __md5__[Beauty] = '5d2ccdcd86641efcfbae344317c10829' + __md5__[Books] = 'bc1e2aa650fe51f978e9d3a7a4834bc6' + __md5__[CDsAndVinyl] = '82bffdc956e76c32fa655b98eca9576b' + __md5__[CellPhonesAndAccessories] = '903a19524d874970a2f0ae32a175a48f' + __md5__[ClothingShoesAndJewelry] = 'b333fba48651ea2309288aeb51f8c6e4' + __md5__[DigitalMusic] = '35e62f7a7475b53714f9b177d9dae3e7' + __md5__[Electronics] = 'e4524af6c644cd044b1969bac7b62b2a' + __md5__[GroceryAndGourmetFood] = 'd8720f98ea82c71fa5c1223f39b6e3d9' + __md5__[HealthAndPersonalCare] = '352ea1f780a8629783220c7c9a9f7575' + __md5__[HomeAndKitchen] = '90221797ccc4982f57e6a5652bea10fc' + __md5__[KindleStore] = 'b608740c754287090925a1a186505353' + __md5__[MoviesAndTV] = 'd3bb01cfcda2602c07bcdbf1c4222997' + __md5__[MusicalInstruments] = '8035b6e3f9194844785b3f4cee296577' + __md5__[OfficeProducts] = '1b7e64c707ecbdcdeca1efa09b716499' + __md5__[PatioLawnAndGarden] = '4d2669abc5319d0f073ec3c3a85f18af' + __md5__[PetSupplies] = '40568b32ca1536a4292e8410c5b9de12' + __md5__[SportsAndOutdoors] = '1df6269552761c82aaec9667bf9a0b1d' + __md5__[ToolsAndHomeImprovement] = '80bca79b84621d4848a88dcf37a1c34b' + __md5__[ToysAndGames] = 'dbd07c142c47473c6ee22b535caee81f' + __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' __all__ = ['fetch', 'Categories'] +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + def fetch(category=None, directory=None): """ According to the source name,set the download path for source, @@ -87,33 +114,13 @@ def fetch(category=None, directory=None): if not os.path.exists(directory): os.makedirs(directory) - logger.info("Downloading amazon review dataset for %s category" % category) - return download(BASE_URL % category, - os.path.join(directory, '%s.json.gz' % category)) - -def calculate_md5(fn): - h = hashlib.md5() - with open(fn, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - h.update(chunk) - return h.hexdigest() - - -def main(): - categories = filter( - lambda c: getattr(Categories, c) not in Categories.__md5__.keys(), - filter(lambda c: c[0] != '_', dir(Categories))) + fn = os.path.join(directory, '%s.json.gz' % category) - for each in categories: - try: - filename = fetch(category=getattr(Categories, each)) - except Exception as e: - print type(e) - continue - print each, calculate_md5(filename) - os.remove(filename) + if os.path.exists(fn) and \ + calculate_md5(category) == Categories.__md5__[category]: + # already download. 
+ return fn - -if __name__ == '__main__': - main() + logger.info("Downloading amazon review dataset for %s category" % category) + return download(BASE_URL % category, fn) From 294f2981a8c3c512f535bc6c0c3258022d6f48ab Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Jan 2017 17:40:41 +0800 Subject: [PATCH 12/18] Add preprocess method for amazon reviews --- paddle/setup.py.in | 2 + python/paddle/data/amazon_product_reviews.py | 91 +++++++++++++++++++- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/paddle/setup.py.in b/paddle/setup.py.in index e3650bf1c0c469..75c8325aec7bdc 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -69,6 +69,8 @@ setup(name="py_paddle", packages=['py_paddle'], include_dirs = include_dirs, install_requires = [ + 'h5py', + 'nltk', 'numpy>=1.8.0', # The numpy is required. 'protobuf>=2.4.1' # The paddle protobuf version ], diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index 2f459bf98b2b60..4cc56e4c6f77fa 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -23,7 +23,13 @@ import os from http_download import download from logger import logger +import gzip +import json import hashlib +import nltk +import collections +import h5py +import numpy BASE_URL = 'http://snap.stanford.edu/data/' \ 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' @@ -83,7 +89,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories'] +__all__ = ['fetch', 'Categories', 'preprocess'] def calculate_md5(fn): @@ -118,9 +124,90 @@ def fetch(category=None, directory=None): fn = os.path.join(directory, '%s.json.gz' % category) if os.path.exists(fn) and \ - calculate_md5(category) == Categories.__md5__[category]: + calculate_md5(fn) == Categories.__md5__[category]: # already download. return fn logger.info("Downloading amazon review dataset for %s category" % category) return download(BASE_URL % category, fn) + + +def preprocess(category=None, directory=None): + """ + Download and preprocess amazon reviews data set. Save the preprocessed + result to hdf5 file. + + In preprocess, it uses nltk to tokenize english sentence. It is slightly + different from moses. But nltk is a pure python library, it could be + integrated well with Paddle. + + :return: hdf5 file name. + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + + preprocess_fn = os.path.join(directory, '%s.hdf5' % category) + raw_file_fn = fetch(category, directory) + + word_dict = collections.defaultdict(int) + if not os.path.exists(preprocess_fn): # already preprocessed + with gzip.open(raw_file_fn, mode='r') as f: + for sample_num, line in enumerate(f): + txt = json.loads(line)['reviewText'] + try: # automatically download nltk tokenizer data. 
+ words = nltk.tokenize.word_tokenize(txt, 'english') + except LookupError: + nltk.download('punkt') + words = nltk.tokenize.word_tokenize(txt, 'english') + for each_word in words: + word_dict[each_word] += 1 + sample_num += 1 + + word_dict_sorted = [] + for each in word_dict: + word_dict_sorted.append((each, word_dict[each])) + + word_dict_sorted.sort(cmp=lambda a, b: a[1] > b[1]) + + word_dict = dict() + + h5file = h5py.File(preprocess_fn, 'w') + try: + word_dict_h5 = h5file.create_dataset( + 'word_dict', + shape=(len(word_dict_sorted), ), + dtype=h5py.special_dtype(vlen=str)) + for i, each in enumerate(word_dict_sorted): + word_dict_h5[i] = each[0] + word_dict[each[0]] = i + + sentence = h5file.create_dataset( + 'sentence', + shape=(sample_num, ), + dtype=h5py.special_dtype(vlen=numpy.int32)) + + label = h5file.create_dataset( + 'label', shape=(sample_num, 1), dtype=numpy.int8) + + with gzip.open(raw_file_fn, mode='r') as f: + for i, line in enumerate(f): + obj = json.loads(line) + txt = obj['reviewText'] + score = numpy.int8(obj['overall']) + words = nltk.tokenize.word_tokenize(txt, 'english') + words = numpy.array( + [word_dict[w] for w in words], dtype=numpy.int32) + sentence[i] = words + label[i] = score + + finally: + h5file.close() + return preprocess_fn + + +if __name__ == '__main__': + preprocess(category=Categories.AmazonInstantVideo) From 20c96b73040f4b880618ddab0f195d8b8babd655 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 14 Jan 2017 15:53:05 +0800 Subject: [PATCH 13/18] Done with amazon product reviews. --- python/paddle/data/amazon_product_reviews.py | 84 ++++++++++++++++++- python/paddle/data/base.py | 15 ++++ python/test/__init__.py | 0 python/test/data/__init__.py | 0 .../test/data/test_amazon_product_reviews.py | 24 ++++++ 5 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 python/paddle/data/base.py create mode 100644 python/test/__init__.py create mode 100644 python/test/data/__init__.py create mode 100644 python/test/data/test_amazon_product_reviews.py diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index 4cc56e4c6f77fa..bce74cb1a8bd8e 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -23,6 +23,7 @@ import os from http_download import download from logger import logger +from base import BaseDataSet import gzip import json import hashlib @@ -34,6 +35,9 @@ BASE_URL = 'http://snap.stanford.edu/data/' \ 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' + class Categories(object): Books = "Books" @@ -89,7 +93,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories', 'preprocess'] +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] def calculate_md5(fn): @@ -186,12 +190,12 @@ def preprocess(category=None, directory=None): word_dict[each[0]] = i sentence = h5file.create_dataset( - 'sentence', + DATASET_SENTENCE, shape=(sample_num, ), dtype=h5py.special_dtype(vlen=numpy.int32)) label = h5file.create_dataset( - 'label', shape=(sample_num, 1), dtype=numpy.int8) + DATASET_LABEL, shape=(sample_num, 1), dtype=numpy.int8) with gzip.open(raw_file_fn, mode='r') as f: for i, line in enumerate(f): @@ -209,5 +213,77 @@ def preprocess(category=None, directory=None): return preprocess_fn +class AmazonProductReviewsDataSet(BaseDataSet): + def __init__(self, + category=None, + directory=None, + test_ratio=0.1, + 
positive_threshold=5, + negative_threshold=2, + random_seed=0): + super(AmazonProductReviewsDataSet, self).__init__( + random_seed=random_seed) + + fn = preprocess(category=category, directory=directory) + + self.__h5file__ = h5py.File(fn, 'r') + + self.__label__ = self.__h5file__[DATASET_LABEL] + self.__sentence__ = self.__h5file__[DATASET_SENTENCE] + + positive_idx = [] + negative_idx = [] + for i, lbl in enumerate(self.__label__): + if lbl >= positive_threshold: + positive_idx.append(i) + elif lbl <= negative_threshold: + negative_idx.append(i) + + positive_len = int(test_ratio * len(positive_idx)) + negative_len = int(test_ratio * len(negative_idx)) + + self.__train_set__ = positive_idx[positive_len:] + negative_idx[ + negative_len:] + self.__test_set__ = positive_idx[: + positive_len] + negative_idx[: + negative_len] + self.__test_set__.sort() + self.__positive_threshold__ = positive_threshold + self.__negative_threshold__ = negative_threshold + self.__is_reading_train_data__ = False + + def __read_data__(self, idx): + return self.__sentence__[ + idx], self.__label__ >= self.__positive_threshold__ + + def train_data(self): + if self.__is_reading_train_data__: + raise RuntimeError("Should not get multiple train_data generators") + + self.__is_reading_train_data__ = True + try: + self.__random__.shuffle(self.__train_set__) + for each_id in self.__train_set__: + yield self.__read_data__(each_id) + finally: + self.__is_reading_train_data__ = False + + def test_data(self): + for each_id in self.__test_set__: + yield self.__read_data__(each_id) + + def __del__(self): + self.__h5file__.close() + + +dataset = AmazonProductReviewsDataSet + if __name__ == '__main__': - preprocess(category=Categories.AmazonInstantVideo) + ds = dataset(category=Categories.AmazonInstantVideo) + + for each_train_data in ds.train_data(): + # print each_train_data + pass + + for each_test_data in ds.test_data(): + pass diff --git a/python/paddle/data/base.py b/python/paddle/data/base.py new file mode 100644 index 00000000000000..3f049527a762c8 --- /dev/null +++ b/python/paddle/data/base.py @@ -0,0 +1,15 @@ +import random + +__all__ = ['BaseDataSet'] + + +class BaseDataSet(object): + def __init__(self, random_seed): + self.__random__ = random.Random() + self.__random__.seed(random_seed) + + def train_data(self): + raise NotImplemented() + + def test_data(self): + raise NotImplemented() diff --git a/python/test/__init__.py b/python/test/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/test/data/__init__.py b/python/test/data/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/test/data/test_amazon_product_reviews.py b/python/test/data/test_amazon_product_reviews.py new file mode 100644 index 00000000000000..8efcfd1f8268fc --- /dev/null +++ b/python/test/data/test_amazon_product_reviews.py @@ -0,0 +1,24 @@ +import unittest +from paddle.data import amazon_product_reviews + + +class AmazonReviewsTest(unittest.TestCase): + def test_read_data(self): + dataset = amazon_product_reviews.dataset( + category=amazon_product_reviews.Categories.AmazonInstantVideo, + positive_threshold=4, + negative_threshold=3) + + sample_num = 0 + + for _ in dataset.train_data(): + sample_num += 1 + + for _ in dataset.test_data(): + sample_num += 1 + + self.assertEqual(37126, sample_num) + + +if __name__ == '__main__': + unittest.main() From 0c76c644fe2575a367c286aa6ded3b6e57ac79d0 Mon Sep 17 00:00:00 2001 From: baidu Date: Mon, 16 Jan 2017 18:42:50 +0800 Subject: 
[PATCH 14/18] add new file path --- python/paddle/data/cifar.py | 77 +++++++++++++++++ python/paddle/data/mnist.py | 110 +++++++++++++++--------- python/paddle/data/recommendation.py | 98 +++++++++++++-------- python/paddle/data/semantic.py | 122 +++++++++++++-------------- python/paddle/data/seqToseq.py | 100 ++++++++++++++-------- 5 files changed, 338 insertions(+), 169 deletions(-) create mode 100644 python/paddle/data/cifar.py diff --git a/python/paddle/data/cifar.py b/python/paddle/data/cifar.py new file mode 100644 index 00000000000000..e038ebd76b0fd4 --- /dev/null +++ b/python/paddle/data/cifar.py @@ -0,0 +1,77 @@ +#/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A utility for fetching, reading CIFAR-10 dataset. + +https://www.cs.toronto.edu/~kriz/cifar.html +""" + +import os +from http_download import download +from logger import logger +import hashlib + +BASE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-%s-python.tar.gz' + + +class Categories(object): + Ten = 10 + Hundred = 100 + + __md5__ = dict() + + __md5__[Ten] = 'c58f30108f718f92721af3b95e74349a' + __md5__[Hundred] = 'eb9058c3a382ffc7106e4002c42a8d85' + +__all__ = ['fetch', 'Categories'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch for training api. + + Args: + + Returns: + path to untar file. + """ + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data_directory', 'cifar')) + + if not os.path.exists(directory): + os.makedirs(directory) + + cn = 'cifar' + category + fn = os.path.join(directory, '%s.tar.gz' % cn) + + if os.path.exists(fn) and calculate_md5(fn) == Categories.__md5__[category]: + return fn + + logger.info("Downloading cifar dataset for %s category" % cn) + return download(BASE_URL % category, + os.path.join(directory, '%s.tar.gz' % cn)) diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 5fe6f6dccc0875..23a7ed46b633e8 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -14,57 +14,89 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading mnist handwritten digit dataset. -############################################################################ -# -# Function for fetch the data untar directory for mnist training api. -# you can use this data for Digital identification. -# -# First,we special the data download directory is "~/paddle_data_directory". 
-# For the mnist dataset,it untar the dataset,and returns the untar -# directory for training api. -# -############################################################################ +http://yann.lecun.com/exdb/mnist/ +""" -import shutil import os -import sys -import collections -import numpy as np -from six.moves import urllib -import urlparse +from http_download import download +from logger import logger +from base import BaseDataSet import gzip -from http_download import data_download +import json +import hashlib +import nltk +import collections +import h5py +import numpy -source_url = 'http://yann.lecun.com/exdb/mnist/' -filename = [ - 'train-images-idx3-ubyte.gz', 't10k-images-idx3-ubyte.gz', - 'train-labels-idx1-ubyte.gz', 't10k-labels-idx1-ubyte.gz' -] +BASE_URL = 'http://yann.lecun.com/exdb/mnist/%s-ubyte.gz' -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. +class Categories(object): + TrainImage = 'train-images-idx3' + TrainLabels = 'train-labels-idx1' + TestImage = 't10k-images-idx3' + TestLabels = 't10k-labels-idx1' + + All = [TrainImage, TrainLabels, TestImage, TestLabels] + + __md5__ = dict() + + __md5__[TrainImage] = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' + __md5__[TrainLabels] = 'd53e105ee54ea40749a09fcbcd1e9432' + __md5__[TestImage] = '9fb629c4189551a2d022fa330f9573f3' + __md5__[TestLabels] = 'ec29112dd5afa0611ce80d1b7f02629c' - Args: - Returns: - path for untar file. +__all__ = ['fetch', 'Categories'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch_data(category=None, directory=None): + """ + Calculate each md5 value. + :param category: + :param directory: + :return: """ - source_name = "mnist" + cn = category + '-ubyte' + fn = os.path.join(directory, '%s.gz' % cn) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + return fn + logger.info("Downloading mnist handwritten digit dataset for %s category" % cn) + return download(BASE_URL % category, fn) + +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + :param category: + :param directory: + :return: + """ if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) + os.path.join('~', 'paddle_data', 'mnist')) - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + if not os.path.exists(directory): + os.makedirs(directory) - for file in filename: - url = urlparse.urljoin(source_url, file) - filepath = data_download(download_path, url) - data_dir = os.path.join(filepath, file.split('.')[0]) - return data_dir + if category is None: + category = [category for category in Categories.All] + fl = [] # download file list + for index, line in range(len(category)): + fl.append(fetch_data(line, directory)) + return fl diff --git a/python/paddle/data/recommendation.py b/python/paddle/data/recommendation.py index 6c3fba55c8919b..602ac257ea9b21 100644 --- a/python/paddle/data/recommendation.py +++ b/python/paddle/data/recommendation.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved @@ -14,53 +14,81 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading MovieLens dataset. -############################################################################ -# -# Function for fetch the data untar directory for amazon training api. -# you can use this data for movie recommendation. -# -# First,we special the data download directory is "~/paddle_data_directory". -# For the movie recommendation dataset,it untar the dataset,and returns the -# untar directory for training api. -# -############################################################################## +http://files.grouplens.org/datasets/movielens +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -import stat -from http_download import data_download +import h5py +import numpy +BASE_URL = 'http://files.grouplens.org/datasets/movielens/%s.zip' -source_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' -file_source = "ml-1m" +class Categories(object): + M1m = "ml-1m" + M10m = "ml-10m" + M20m = "ml-20m" + M100k = "ml-100k" + MLatestSmall = "ml-latest-small" + MLatest = "ml-latest" + + __md5__ = dict() + + __md5__[M1m] = 'c4d9eecfca2ab87c1945afe126590906' + __md5__[M10m] = 'ce571fd55effeba0271552578f2648bd' + __md5__[M20m] = 'cd245b17a1ae2cc31bb14903e1204af3' + __md5__[M100k] = '0e33842e24a9c977be4e0107933c0723' + __md5__[MLatestSmall] = 'be5b02baacd9e70dd97734ea0e19528a' + __md5__[MLatest] = '0c827eaafc7e89c455986510827662bd' -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - Args: +__all__ = ['fetch', 'Categories', 'preprocess'] - Returns: - path to downloaded file. + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch(category=None, directory=None): """ - source_name = "recommendation" + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.M1m + if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) + os.path.join('~', 'paddle_data', 'recommendation')) + + if not os.path.exists(directory): + os.makedirs(directory) - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + fn = os.path.join(directory, '%s.zip' % category) - filepath = data_download(download_path, source_url) - data_path = os.path.join(filepath, file_source) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + # already download. 
+ return fn - return data_path + logger.info("Downloading MovieLens dataset for %s category" % category) + return download(BASE_URL % category, fn) diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index dfafb5120cf88c..bf04c044c6322e 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -14,78 +14,76 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading semantic data set. -############################################################################ -# -# Function for fetch the data untar directory for semantic_role_labeling -# training api.you can use this data for semantic. -# -# First,we special the data download directory is "~/paddle_data_directory". -# For the semantic role labeling,it untar the dataset,and returns the untar -# directory for training api. -# -############################################################################ +http://www.cs.upc.edu/~srlconll +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -from http_download import data_download +import h5py +import numpy +BASE_URL = 'http://www.cs.upc.edu/~srlconll/%s.tar.gz' -source_url = [ - 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt', - 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' -] +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' -file_source = "conll05st-release" +class Categories(object): + Conll05test = "conll05st-tests" -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. + __md5__ = dict() + + __md5__[Conll05test] = '387719152ae52d60422c016e92a742fc' + + +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() - Args: - Returns: - path to downloaded file. +def fetch(category=None, directory=None): """ - source_name = "semantic" + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. 
+ :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Conll05test + if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) - - for url in source_url: - file_name = url.split('/')[-1] - if 'gz' in file_name: - filepath = data_download(download_path, url) - data_path = os.path.join(filepath, file_source) - - sub_file = ['est.wsj.words.gz', 'test.wsj.props.gz'] - words_path = os.path.join(data_path, - "test.wsj/words/test.wsj.words.gz") - props_path = os.path.join(data_path, - "test.wsj/props/test.wsj.props.gz") - - sub_path = [words_path, props_path] - for sub_file in sub_path: - new_sub_path = os.path.join(download_path, sub_file) - shutil.move(sub_path, new_subpath) - tarfile.open( - name=new_subpath, mode="r:gz").extractall(download_path) - os.remove(new_subpath) - else: - filepath = data_download(download_path, url) - - return filepath + os.path.join('~', 'paddle_data', 'amazon')) + + if not os.path.exists(directory): + os.makedirs(directory) + + fn = os.path.join(directory, '%s.json.gz' % category) + + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + # already download. + return fn + + logger.info("Downloading amazon review dataset for %s category" % category) + return download(BASE_URL % category, fn) + + diff --git a/python/paddle/data/seqToseq.py b/python/paddle/data/seqToseq.py index ced53be2d1a9d5..8cdfd4ee68b959 100644 --- a/python/paddle/data/seqToseq.py +++ b/python/paddle/data/seqToseq.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -14,51 +14,85 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading sequence to sequence data set. 
+ +http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -import stat -from http_download import data_download +import h5py +import numpy + +BASE_URL = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/%s.tgz' + +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' + + +class Categories(object): + BiTexts = "bitexts" + DevTest = "dev+test" + All = [BiTexts, DevTest] + + __md5__ = dict() + __md5__[BiTexts] = '15861dbac4a52c8c75561d5027062d7d' + __md5__[DevTest] = '7d7897317ddd8ba0ae5c5fa7248d3ff5' -source_url = [ - 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz', - 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' -] -model_url = 'http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz' +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] -model_source = "wmt14_model" -file_source = "bitexts.selected" +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() -def fetch(directory=None): + +def fetch_data(category=None, directory=None): """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. + Calculate each md5 value. + :param category: + :param directory: + :return: + """ + fn = os.path.join(directory, '%s.tgz' % category) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + return fn + logger.info("Downloading mnist handwritten digit dataset for %s category" % category) + return download(BASE_URL % category, fn) - Args: - Returns: - path to downloaded file. +def fetch(category=None, directory=None): + """ + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. 
+ :param category: + :param directory: + :return: """ - source_name = "seqToseq" if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + os.path.join('~', 'paddle_data', 'seqToseq')) - model_data = data_download(download_path, model_url) - model_path = os.path.join(model_data, model_source) + if not os.path.exists(directory): + os.makedirs(directory) - for url in source_url: - filepath = data_download(download_path, url) - data_path = os.path.join(filepath, file_source) - return data_path + if category is None: + category = [category for category in Categories.All] + fl = [] # download file list + for index, line in range(len(category)): + fl.append(fetch_data(line, directory)) + return fl From 9a803d091ca84dc1457d4f19ecb34a0d4a5e6ffd Mon Sep 17 00:00:00 2001 From: baidu Date: Mon, 16 Jan 2017 18:46:54 +0800 Subject: [PATCH 15/18] update file path --- python/paddle/data/sentiment.py | 94 +++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/python/paddle/data/sentiment.py b/python/paddle/data/sentiment.py index e0a72e0d9b9809..ef3300f2678d29 100644 --- a/python/paddle/data/sentiment.py +++ b/python/paddle/data/sentiment.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +# /usr/bin/env python # -*- coding:utf-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved @@ -14,58 +14,74 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +A utility for fetching, reading sentiment data set. -############################################################################ -# -# Function for fetch the data untar directory for sentiment training api. -# you can use this data for sentiment analasis. -# -# First,we special the data download directory is "~/paddle_data_directory". -# For the sentiment dataset,it untar the dataset,and returns the untar -# directory for training api. -# -############################################################################ +http://ai.stanford.edu/%7Eamaas/data/sentiment +""" -import shutil import os -import sys -import zipfile +from http_download import download +from logger import logger +from base import BaseDataSet +import gzip +import json +import hashlib +import nltk import collections -import numpy as np -from six.moves import urllib -import stat -from http_download import data_download +import h5py +import numpy -source_url = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' -moses_url = 'https://github.com/moses-smt/mosesdecoder/archive/master.zip' +BASE_URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/%s.tar.gz' -moses_source = "mosesdecoder-master" -file_source = "aclImdb" +DATASET_LABEL = 'label' +DATASET_SENTENCE = 'sentence' -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - Args: +class Categories(object): + AclImdb = "aclImdb_v1" + + __md5__ = dict() + + __md5__[AclImdb] = '7c2ac02c03563afcf9b574c7e56c153a' - Returns: - path to downloaded file. 
+__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] + + +def calculate_md5(fn): + h = hashlib.md5() + with open(fn, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + h.update(chunk) + return h.hexdigest() + + +def fetch(category=None, directory=None): """ - source_name = "sentiment" + According to the source name,set the download path for source, + download the data from the source url,and return the download path to fetch + for training api. + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.AclImdb + if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) + os.path.join('~', 'paddle_data', 'sentiment')) - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) + if not os.path.exists(directory): + os.makedirs(directory) - moses_path = data_download(download_path, moses_url) - moses_data = os.path.join(moses_path, moses_source) + fn = os.path.join(directory, '%s.tar.gz' % category) - filepath = data_download(download_path, source_url) - data_path = os.path.join(filepath, file_source) + if os.path.exists(fn) and \ + calculate_md5(fn) == Categories.__md5__[category]: + # already download. + return fn - return data_path + logger.info("Downloading binary sentiment classification dataset for %s category" % category) + return download(BASE_URL % category, fn) From c6a260068df61679a9f588744f821b3a7f476492 Mon Sep 17 00:00:00 2001 From: baidu Date: Fri, 20 Jan 2017 11:13:47 +0800 Subject: [PATCH 16/18] add Data md --- python/paddle/data/DATA.md | 34 +++++++++++++++++ python/paddle/data/cifar10.py | 68 ---------------------------------- python/paddle/data/semantic.py | 2 +- 3 files changed, 35 insertions(+), 69 deletions(-) create mode 100644 python/paddle/data/DATA.md delete mode 100644 python/paddle/data/cifar10.py diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md new file mode 100644 index 00000000000000..1752d294b9e879 --- /dev/null +++ b/python/paddle/data/DATA.md @@ -0,0 +1,34 @@ +### 数据集 + +Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载其数据,并进行复杂的预处理过程,整个过程会耗费大量时间。同时为了方便大家用Paddle做实验的时候,可以直接访问这些预处理好的数据,我们提供一套Python库。采用import数据源的方式(如:paddle.data.amazon_product_reviews)来简化获取训练所需数据的时间;但是如果你习惯自己处理原生数据,我们依然提供原生数据接口来满足你的需求。 + +## 接口设计 +数据集的导入通过import paddle.data.amazon_product_reviews 来实现,你可以直接通过load_data(category=None, +directory=None)获取你所需的数据集。考虑到类似Amazon的数据类型不止一种,通过category你可以选择控制所需要的数据源;如果你不指定数据源,默认为"Electronics"。directory用来指定下载路径,如果你不指定下载路径,默认为"~/paddle_data/amazon"。通过load_data()导入的数据源data为object,他是我们预处理的numpy格式数据,直接通过data.train_data()获取训练数据或者通过data.test_data()获取测试数据。你还可以打印训练数据和测试数据的数据信息, +```python + for each_train_data in data.train_data(): + print each_train_data +``` +即可。 + +具体的demo使用情况如下: +```python +import paddle.data.amazon_product_reviews as raw + +data = raw.load_data() +train_data = data.train_data() +test_data = data.test_data() +``` +你也可以打印出各数据集的数据信息: +```python +for each_train_data in data.train_data(): + print each_train_data +``` +打印出来的数据信息都是预处理之后的numpy格式的数据: +```python +(array([ 730143, 452087, 369164, 1128311, 1451292, 294749, 1370072, + 1202482, 1522860, 1055269, 39557, 1579, 1184187, 1410234, + 362445, 1133007, 1400596, 216811, 540527, 489771, 208467, + 369164, 311153, 387289, 801432, 433138, 179848, 320757, + 1410234], dtype=int32), True) +``` diff --git a/python/paddle/data/cifar10.py b/python/paddle/data/cifar10.py deleted file mode 100644 index d6d893288a851c..00000000000000 --- 
a/python/paddle/data/cifar10.py +++ /dev/null @@ -1,68 +0,0 @@ -#/usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -######################################################################## -# -# Function for fetch the data untar directory for cifar10 training api. -# you can use this data for image classifation and gun traing. -# As the python can read the data in "cifar-10-python.tar.gz",herer is -# no need to untar the data. -# -# -# First,we let the data download path is "~/paddle_data_directory", -# when u no special the download path. -# -# -# Then,download the cifar10 dataset,and returns the data directory for -# training api. -# -######################################################################## - -import shutil -import os -import sys -import collections -import numpy as np -from six.moves import urllib -from http_download import data_download - -source_url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' -source_file = "cifar-10-batches-py" - - -def fetch(directory=None): - """ - According to the source name,set the download path for source, - download the data from the source url,and return the download path to fetch for training api. - - Args: - - Returns: - path to untar file. 
- """ - source_name = "cifar" - - if directory is None: - directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory')) - - download_path = os.path.join(directory, source_name) - if not os.path.exists(download_path): - os.makedirs(download_path) - filepath = data_download(download_path, source_url) - - return filepath diff --git a/python/paddle/data/semantic.py b/python/paddle/data/semantic.py index bf04c044c6322e..a5fb39ccdbffe4 100644 --- a/python/paddle/data/semantic.py +++ b/python/paddle/data/semantic.py @@ -71,7 +71,7 @@ def fetch(category=None, directory=None): if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data', 'amazon')) + os.path.join('~', 'paddle_data', 'semantic')) if not os.path.exists(directory): os.makedirs(directory) From 3fa7f21a9a2eee7a79d589706b1faf1a3ced8630 Mon Sep 17 00:00:00 2001 From: baidu Date: Mon, 6 Feb 2017 20:11:07 +0800 Subject: [PATCH 17/18] new --- python/paddle/data/DATA.md | 64 +++--- python/paddle/data/amazon_product_reviews.py | 196 ++++++++++++------- 2 files changed, 159 insertions(+), 101 deletions(-) diff --git a/python/paddle/data/DATA.md b/python/paddle/data/DATA.md index 1752d294b9e879..cf326406705fd9 100644 --- a/python/paddle/data/DATA.md +++ b/python/paddle/data/DATA.md @@ -5,30 +5,40 @@ Paddle目前提供了很多demo,且各demo运行时需要从原生网站下载 ## 接口设计 数据集的导入通过import paddle.data.amazon_product_reviews 来实现,你可以直接通过load_data(category=None, directory=None)获取你所需的数据集。考虑到类似Amazon的数据类型不止一种,通过category你可以选择控制所需要的数据源;如果你不指定数据源,默认为"Electronics"。directory用来指定下载路径,如果你不指定下载路径,默认为"~/paddle_data/amazon"。通过load_data()导入的数据源data为object,他是我们预处理的numpy格式数据,直接通过data.train_data()获取训练数据或者通过data.test_data()获取测试数据。你还可以打印训练数据和测试数据的数据信息, -```python - for each_train_data in data.train_data(): - print each_train_data -``` -即可。 - -具体的demo使用情况如下: -```python -import paddle.data.amazon_product_reviews as raw - -data = raw.load_data() -train_data = data.train_data() -test_data = data.test_data() -``` -你也可以打印出各数据集的数据信息: -```python -for each_train_data in data.train_data(): - print each_train_data -``` -打印出来的数据信息都是预处理之后的numpy格式的数据: -```python -(array([ 730143, 452087, 369164, 1128311, 1451292, 294749, 1370072, - 1202482, 1522860, 1055269, 39557, 1579, 1184187, 1410234, - 362445, 1133007, 1400596, 216811, 540527, 489771, 208467, - 369164, 311153, 387289, 801432, 433138, 179848, 320757, - 1410234], dtype=int32), True) -``` + + ```python + for each_train_data in data.train_data(): + print each_train_data + ``` + 即可。 + + 具体的demo使用情况如下: + ```python + import paddle.data.amazon_product_reviews as raw + + raw.data(batch_size=10) + ``` + 你也可以打印出各数据集的数据信息: + 如果是测试集或者训练数据集,可以这么打印 + ```python + import paddle.data.amazon_product_reviews as raw + + raw.test_data(batch_size=10) + raw.train_data(batch_size=10) + + ``` + + 打印出来的数据信息都是预处理之后的numpy格式的数据: + ```python + (array([1370072, 884914, 1658622, 1562803, 1579, 369164, 1129091, + 1073545, 1410234, 857854, 672274, 884920, 1078270, 1410234, + 777903, 1352600, 497103, 132906, 239745, 65294, 1502324, + 1165610, 204273, 1610806, 942942, 709056, 452087, 118093, + 1410234], dtype=int32), array([ True], dtype=bool)) + (array([ 777903, 713632, 452087, 1647686, 877980, 294749, 1575945, + 662947, 1431519, 462950, 452087, 902916, 479242, 294749, + 1278816, 672274, 1579, 394865, 1129091, 1352600, 294749, + 1073545], dtype=int32), array([ True], dtype=bool)) + + ``` + diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index bce74cb1a8bd8e..ce6b52071d2fec 100644 --- 
a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -23,7 +23,6 @@ import os from http_download import download from logger import logger -from base import BaseDataSet import gzip import json import hashlib @@ -31,6 +30,8 @@ import collections import h5py import numpy +import random + BASE_URL = 'http://snap.stanford.edu/data/' \ 'amazon/productGraph/categoryFiles/reviews_%s_5.json.gz' @@ -38,6 +39,10 @@ DATASET_LABEL = 'label' DATASET_SENTENCE = 'sentence' +positive_threshold = 5 +negative_threshold = 2 + + class Categories(object): Books = "Books" @@ -93,7 +98,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories', 'preprocess', 'dataset'] +__all__ = ['fetch', 'Categories', 'preprocess', 'dataset', 'load_data'] def calculate_md5(fn): @@ -109,11 +114,9 @@ def fetch(category=None, directory=None): According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. - - Args: - - Returns: - path for the data untar. + :param category: + :param directory: + :return: """ if category is None: category = Categories.Electronics @@ -213,77 +216,122 @@ def preprocess(category=None, directory=None): return preprocess_fn -class AmazonProductReviewsDataSet(BaseDataSet): - def __init__(self, - category=None, - directory=None, - test_ratio=0.1, - positive_threshold=5, - negative_threshold=2, - random_seed=0): - super(AmazonProductReviewsDataSet, self).__init__( - random_seed=random_seed) - - fn = preprocess(category=category, directory=directory) - - self.__h5file__ = h5py.File(fn, 'r') - - self.__label__ = self.__h5file__[DATASET_LABEL] - self.__sentence__ = self.__h5file__[DATASET_SENTENCE] - - positive_idx = [] - negative_idx = [] - for i, lbl in enumerate(self.__label__): - if lbl >= positive_threshold: - positive_idx.append(i) - elif lbl <= negative_threshold: - negative_idx.append(i) - - positive_len = int(test_ratio * len(positive_idx)) - negative_len = int(test_ratio * len(negative_idx)) - - self.__train_set__ = positive_idx[positive_len:] + negative_idx[ - negative_len:] - self.__test_set__ = positive_idx[: - positive_len] + negative_idx[: - negative_len] - self.__test_set__.sort() - self.__positive_threshold__ = positive_threshold - self.__negative_threshold__ = negative_threshold - self.__is_reading_train_data__ = False - - def __read_data__(self, idx): - return self.__sentence__[ - idx], self.__label__ >= self.__positive_threshold__ - - def train_data(self): - if self.__is_reading_train_data__: - raise RuntimeError("Should not get multiple train_data generators") - - self.__is_reading_train_data__ = True - try: - self.__random__.shuffle(self.__train_set__) - for each_id in self.__train_set__: - yield self.__read_data__(each_id) - finally: - self.__is_reading_train_data__ = False +def data(batch_size, category=None, directory=None): + """ - def test_data(self): - for each_id in self.__test_set__: - yield self.__read_data__(each_id) + :param batch_size: + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Electronics - def __del__(self): - self.__h5file__.close() + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + fn = preprocess(category=category, directory=directory) + datasets = h5py.File(fn, 'r') -dataset = AmazonProductReviewsDataSet + label = datasets[DATASET_LABEL] + sentence = 
datasets[DATASET_SENTENCE] -if __name__ == '__main__': - ds = dataset(category=Categories.AmazonInstantVideo) + if label.shape[0] <= batch_size: + lens = label.shape[0] + else: + lens = batch_size + + for index in range(lens): + if label[index] >= positive_threshold: + print (numpy.array(sentence[index]), label[index] >= positive_threshold) + elif label[index] <= negative_threshold: + print (numpy.array(sentence[index]), label[index] <= negative_threshold) + + +def test_data(batch_size, category=None, directory=None): + """ + + :param batch_size: + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) - for each_train_data in ds.train_data(): - # print each_train_data - pass + fn = preprocess(category=category, directory=directory) + datasets = h5py.File(fn, 'r') + + label = datasets[DATASET_LABEL] + sentence = datasets[DATASET_SENTENCE] + + if label.shape[0] <= batch_size: + lens = label.shape[0] + else: + lens = batch_size + + positive_idx = [] + negative_idx = [] + for i, lbl in enumerate(label): + if label[i] >= positive_threshold: + positive_idx.append(i) + elif lbl <= negative_threshold: + negative_idx.append(i) + + __test_set__ = positive_idx[:lens] + negative_idx[:lens] + + random.shuffle(__test_set__) + + for index in range(lens): + print (numpy.array(sentence[index]), label[index] >= positive_threshold) + + +def train_data(batch_size, category=None, directory=None): + """ + + :param batch_size: + :param category: + :param directory: + :return: + """ + if category is None: + category = Categories.Electronics + + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'amazon')) + + fn = preprocess(category=category, directory=directory) + datasets = h5py.File(fn, 'r') + + label = datasets[DATASET_LABEL] + sentence = datasets[DATASET_SENTENCE] + + if label.shape[0] <= batch_size: + lens = label.shape[0] + else: + lens = batch_size + + positive_idx = [] + negative_idx = [] + for i, lbl in enumerate(label): + if label[i] >= positive_threshold: + positive_idx.append(i) + elif lbl <= negative_threshold: + negative_idx.append(i) + __train_set__ = positive_idx[lens:] + negative_idx[lens:] + + random.shuffle(__train_set__) + + for index in range(lens): + print (numpy.array(sentence[index]), label[index] >= positive_threshold) + + +if __name__ == '__main__': + data(10) - for each_test_data in ds.test_data(): - pass From bee88c9f24c1f738d676a3dba58ceb4fdcf672fc Mon Sep 17 00:00:00 2001 From: baidu Date: Thu, 9 Feb 2017 15:42:56 +0800 Subject: [PATCH 18/18] updata amazon & cifar & mnist --- python/paddle/data/amazon_product_reviews.py | 4 +- python/paddle/data/cifar.py | 101 +++++++++++- python/paddle/data/mnist.py | 153 +++++++++++++------ 3 files changed, 203 insertions(+), 55 deletions(-) diff --git a/python/paddle/data/amazon_product_reviews.py b/python/paddle/data/amazon_product_reviews.py index ce6b52071d2fec..fba9db6ee9f588 100644 --- a/python/paddle/data/amazon_product_reviews.py +++ b/python/paddle/data/amazon_product_reviews.py @@ -98,7 +98,7 @@ class Categories(object): __md5__[VideoGames] = '730612da2d6a93ed19f39a808b63993e' -__all__ = ['fetch', 'Categories', 'preprocess', 'dataset', 'load_data'] +__all__ = ['fetch', 'data', 'train_data', 'test_data'] def calculate_md5(fn): @@ -112,7 +112,7 @@ def calculate_md5(fn): def fetch(category=None, directory=None): """ According to the 
source name,set the download path for source, - download the data from the source url,and return the download path to fetch + download the data from the source url, and return the download path to fetch for training api. :param category: :param directory: diff --git a/python/paddle/data/cifar.py b/python/paddle/data/cifar.py index e038ebd76b0fd4..8baf073709e3ef 100644 --- a/python/paddle/data/cifar.py +++ b/python/paddle/data/cifar.py @@ -22,23 +22,25 @@ """ import os +import cPickle from http_download import download from logger import logger import hashlib +import tarfile +import numpy BASE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-%s-python.tar.gz' - +DATA = "cifar-10-batches-py" class Categories(object): - Ten = 10 - Hundred = 100 + Ten = '10' + Hundred = '100' __md5__ = dict() - __md5__[Ten] = 'c58f30108f718f92721af3b95e74349a' __md5__[Hundred] = 'eb9058c3a382ffc7106e4002c42a8d85' -__all__ = ['fetch', 'Categories'] +__all__ = ['fetch', 'Categories', 'train_data', 'test_data'] def calculate_md5(fn): @@ -59,9 +61,11 @@ def fetch(category=None, directory=None): Returns: path to untar file. """ + if category is None: + category = Categories.Ten if directory is None: directory = os.path.expanduser( - os.path.join('~', 'paddle_data_directory', 'cifar')) + os.path.join('~', 'paddle_data', 'cifar')) if not os.path.exists(directory): os.makedirs(directory) @@ -69,9 +73,92 @@ def fetch(category=None, directory=None): cn = 'cifar' + category fn = os.path.join(directory, '%s.tar.gz' % cn) - if os.path.exists(fn) and calculate_md5(fn) == Categories.__md5__[category]: + if os.path.exists(fn) and calculate_md5(fn) == \ + Categories.__md5__[category]: return fn logger.info("Downloading cifar dataset for %s category" % cn) return download(BASE_URL % category, os.path.join(directory, '%s.tar.gz' % cn)) + + +def untar(category=None, directory=None): + """ + + :param category: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'cifar')) + raw_file_fn = fetch(category, directory) + #raw_file_fn = os.path.join(directory, 'cifar10.tar.gz') + tar = tarfile.open(raw_file_fn, "r:gz") + names = tar.getnames() + for file in names: + tar.extract(file, directory) + tar.close() + + +def create_mean(dataset, directory=None): + """ + + :param dataset: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser( + os.path.join('~', 'paddle_data', 'cifar')) + + if not os.path.isfile("mean.meta"): + mean = numpy.zeros(3 * 32 * 3) + num = 0 + for f in dataset: + batch = numpy.load(f) + mean += batch['data'].sum(0) + num += len(batch['data']) + mean /= num + print mean.size + data = {"mean": mean, "size": mean.size} + cPickle.dump( + data, open("mean.meta", 'w'), protocol=cPickle.HIGHEST_PROTOCOL) + + +def train_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'cifar')) + + untar() + datatset = [DATA + "/data_batch_%d" % (i + 1) for i in xrange(0, 5)] + for f in datatset: + train_set = os.path.join(directory, f) + fo = open(train_set, 'rb') + dict = cPickle.load(fo) + fo.close() + print dict + + +def test_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'cifar')) + untar() + test_set = os.path.join(directory, DATA + "/test_batch") + fo = open(test_set, 'rb') + dict = cPickle.load(fo) + fo.close() + 
print dict + + +if __name__ == '__main__': + train_data() + #test_data() \ No newline at end of file diff --git a/python/paddle/data/mnist.py b/python/paddle/data/mnist.py index 23a7ed46b633e8..8308ae16eb9e2e 100644 --- a/python/paddle/data/mnist.py +++ b/python/paddle/data/mnist.py @@ -23,35 +23,22 @@ import os from http_download import download from logger import logger -from base import BaseDataSet import gzip -import json import hashlib -import nltk -import collections -import h5py import numpy +import struct -BASE_URL = 'http://yann.lecun.com/exdb/mnist/%s-ubyte.gz' +BASE_URL = 'http://yann.lecun.com/exdb/mnist/%s.gz' +FILE_NAME = { + 'train-images-idx3-ubyte': 'f68b3c2dcbeaaa9fbdd348bbdeb94873', + 'train-labels-idx1-ubyte': 'd53e105ee54ea40749a09fcbcd1e9432', + 't10k-images-idx3-ubyte': '9fb629c4189551a2d022fa330f9573f3', + 't10k-labels-idx1-ubyte': 'ec29112dd5afa0611ce80d1b7f02629c' +} -class Categories(object): - TrainImage = 'train-images-idx3' - TrainLabels = 'train-labels-idx1' - TestImage = 't10k-images-idx3' - TestLabels = 't10k-labels-idx1' - All = [TrainImage, TrainLabels, TestImage, TestLabels] - - __md5__ = dict() - - __md5__[TrainImage] = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' - __md5__[TrainLabels] = 'd53e105ee54ea40749a09fcbcd1e9432' - __md5__[TestImage] = '9fb629c4189551a2d022fa330f9573f3' - __md5__[TestLabels] = 'ec29112dd5afa0611ce80d1b7f02629c' - - -__all__ = ['fetch', 'Categories'] +__all__ = ['train_data', 'test_data', 'fetch'] def calculate_md5(fn): @@ -62,28 +49,11 @@ def calculate_md5(fn): return h.hexdigest() -def fetch_data(category=None, directory=None): - """ - Calculate each md5 value. - :param category: - :param directory: - :return: - """ - cn = category + '-ubyte' - fn = os.path.join(directory, '%s.gz' % cn) - if os.path.exists(fn) and \ - calculate_md5(fn) == Categories.__md5__[category]: - return fn - logger.info("Downloading mnist handwritten digit dataset for %s category" % cn) - return download(BASE_URL % category, fn) - - -def fetch(category=None, directory=None): +def fetch(directory=None): """ According to the source name,set the download path for source, download the data from the source url,and return the download path to fetch for training api. 
- :param category: :param directory: :return: """ @@ -94,9 +64,100 @@ def fetch(category=None, directory=None): if not os.path.exists(directory): os.makedirs(directory) - if category is None: - category = [category for category in Categories.All] - fl = [] # download file list - for index, line in range(len(category)): - fl.append(fetch_data(line, directory)) - return fl + fl = [] + for index in range(len(FILE_NAME.keys())): + fn = os.path.join(directory, '%s.gz' % FILE_NAME.keys()[index]) + if os.path.exists(fn) and calculate_md5(fn) == FILE_NAME.keys()[0]: + return fn + logger.info("Downloading digital handwritten digit dataset for %s " % FILE_NAME.keys()[index]) + fl.append(download(BASE_URL % FILE_NAME.keys()[index], fn)) + + return fl + + +def preprocess(directory=None): + """ + :param category: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + raw_file_list = fetch(directory) + print raw_file_list + + for cn in raw_file_list: + sz = cn.split('.')[0] + print sz + g = gzip.GzipFile(fileobj=open(cn, 'rb')) + open(sz, 'wb').write(g.read()) + + +def data(filename, directory=None): + """ + :param filename: + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + image = '-images-idx3-ubyte' + label = '-labels-idx1-ubyte' + + if filename is 'train': + image_file = os.path.join(directory, filename + image) + label_file = os.path.join(directory, filename + label) + else: + image_file = os.path.join(directory, 't10' + image) + label_file = os.path.join(directory, 't10' + label) + + if os.path.exists(image_file) and os.path.exists(label_file): + print "File is exists!" + else: + preprocess() + + print image_file + print label_file + + with open(image_file, "rb") as f: + num_magic, n, num_row, num_col = struct.unpack(">IIII", f.read(16)) + images = numpy.fromfile(f, 'ubyte', count=n * num_row * num_col).\ + reshape(n, num_row, num_col).astype('float32') + images = images / 255.0 * 2.0 - 1.0 + + with open(label_file, "rb") as fn: + num_magic, num_label = struct.unpack(">II", fn.read(8)) + labels = numpy.fromfile(fn, 'ubyte', count=num_label).astype('int') + + return images, labels + + +def train_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + train_images, train_labels = data('train') + print train_images, train_labels + + +def test_data(directory=None): + """ + :param directory: + :return: + """ + if directory is None: + directory = os.path.expanduser(os.path.join('~', 'paddle_data', 'mnist')) + + test_images, test_labels = data('test') + print test_images, test_labels + + +if __name__ == '__main__': + train_data() + #test_data() \ No newline at end of file
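
A hedged usage sketch of the readers added in this last patch, assuming the paddle.data package is installed as declared in setup.py (both calls download into ~/paddle_data/<dataset> on first use and, as currently implemented, simply print the decoded arrays):

    from paddle.data import cifar, mnist

    # Fetches the CIFAR-10 python archive (md5-checked), untars it under
    # ~/paddle_data/cifar and prints each unpickled data_batch_<n> dict.
    cifar.train_data()

    # Downloads the four MNIST archives, gunzips them and prints the
    # [-1, 1]-normalized image array together with the label vector.
    mnist.train_data()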