Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEA: Auto-downloading atomic files #851

Merged
merged 5 commits into from
Jun 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions recbole/data/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import copy
import pickle
import os
import yaml
from collections import Counter
from logging import getLogger

Expand All @@ -27,6 +28,7 @@
from recbole.data.interaction import Interaction
from recbole.utils import FeatureSource, FeatureType, get_local_time
from recbole.utils.utils import set_color
from recbole.utils.url import decide_download, download_url, extract_zip, makedirs, rename_atomic_files


class Dataset(object):
Expand Down Expand Up @@ -195,6 +197,38 @@ def _build_feat_name_list(self):
feat_name_list.append(f'{suf}_feat')
return feat_name_list

def _get_download_url(self, url_file, allow_none=False):
    """Look up the download URL of the current dataset.

    Args:
        url_file (str): Stem of the YAML file under ``recbole/properties/dataset/``
            that maps dataset names to URLs (e.g. ``'url'`` or ``'kg_url'``).
        allow_none (bool): If ``True``, return ``None`` for an unknown dataset
            instead of raising.

    Returns:
        str or None: The download URL, or ``None`` if unknown and ``allow_none``.

    Raises:
        ValueError: If the dataset name is unknown and ``allow_none`` is ``False``.
    """
    current_path = os.path.dirname(os.path.realpath(__file__))
    url_path = os.path.join(current_path, f'../../properties/dataset/{url_file}.yaml')
    with open(url_path) as f:
        # Pass the file object directly; yaml.load accepts streams.
        dataset2url = yaml.load(f, Loader=self.config.yaml_loader)

    if self.dataset_name in dataset2url:
        return dataset2url[self.dataset_name]
    elif allow_none:
        return None
    else:
        # Original message was missing the space between the two f-string
        # fragments, producing "...devicenor...".
        raise ValueError(f'Neither [{self.dataset_path}] exists in the device '
                         f'nor [{self.dataset_name}] a known dataset name.')

def _download(self):
    """Download and unpack the atomic files of the current dataset.

    Aborts the process (``exit(-1)``) when the user declines the download.
    """
    url = self._get_download_url('url')
    self.logger.info(f'Prepare to download dataset [{self.dataset_name}] from [{url}].')

    # Guard clause: bail out early when the user declines.
    if not decide_download(url):
        self.logger.info('Stop download.')
        exit(-1)

    makedirs(self.dataset_path)
    archive_path = download_url(url, self.dataset_path)
    extract_zip(archive_path, self.dataset_path)
    os.unlink(archive_path)

    # The archive stem is the name the atomic files were packaged under;
    # rename them to match the configured dataset name.
    archive_stem = os.path.splitext(os.path.basename(archive_path))[0]
    rename_atomic_files(self.dataset_path, archive_stem, self.dataset_name)

    self.logger.info('Downloading done.')

def _load_data(self, token, dataset_path):
"""Load features.

Expand Down Expand Up @@ -224,6 +258,9 @@ def _load_inter_feat(self, token, dataset_path):
dataset_path (str): path of dataset dir.
"""
if self.benchmark_filename_list is None:
if not os.path.exists(dataset_path):
self._download()

inter_feat_path = os.path.join(dataset_path, f'{token}.inter')
if not os.path.isfile(inter_feat_path):
raise ValueError(f'File {inter_feat_path} not exist.')
Expand Down
26 changes: 26 additions & 0 deletions recbole/data/dataset/kg_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from recbole.data.dataset import Dataset
from recbole.utils import FeatureSource, FeatureType
from recbole.utils.utils import set_color
from recbole.utils.url import decide_download, download_url, extract_zip


class KnowledgeBasedDataset(Dataset):
Expand Down Expand Up @@ -110,6 +111,31 @@ def _filter_link(self):
for ent in illegal_ent:
del self.entity2item[ent]

def _download(self):
    """Download the base dataset, then its linked knowledge graph (if any).

    If a KG archive exists for this dataset it is downloaded and extracted,
    after which the process exits so the user can run the external conversion
    tool; declining the download exits with ``-1``.
    """
    super()._download()

    url = self._get_download_url('kg_url', allow_none=True)
    if url is None:
        return
    self.logger.info(f'Prepare to download linked knowledge graph from [{url}].')

    # Guard clause: bail out early when the user declines.
    if not decide_download(url):
        self.logger.info('Stop download.')
        exit(-1)

    # No need to create dir, as `super()._download()` has created one.
    archive_path = download_url(url, self.dataset_path)
    extract_zip(archive_path, self.dataset_path)
    os.unlink(archive_path)
    self.logger.info(
        f'\nLinked KG for [{self.dataset_name}] requires additional conversion '
        f'to atomic files (.kg and .link).\n'
        f'Please refer to https://github.com/RUCAIBox/RecSysDatasets/conversion_tools#knowledge-aware-datasets '
        f'for detailed instructions.\n'
        f'You can run RecBole after the conversion, see you soon.'
    )
    exit(0)

def _load_data(self, token, dataset_path):
super()._load_data(token, dataset_path)
self.kg_feat = self._load_kg(self.dataset_name, self.dataset_path)
Expand Down
11 changes: 11 additions & 0 deletions recbole/properties/dataset/kg_url.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
amazon-books: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/Amazon-book-KG.zip
lfm1b-albums-merged: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/LFM-1b-KG.zip
lfm1b-artists-merged: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/LFM-1b-KG.zip
lfm1b-tracks-merged: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/LFM-1b-KG.zip
lfm1b-albums-not-merged: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/LFM-1b-KG.zip
lfm1b-artists-not-merged: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/LFM-1b-KG.zip
lfm1b-tracks-not-merged: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/LFM-1b-KG.zip
ml-100k: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/MovieLens-KG.zip
ml-1m: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/MovieLens-KG.zip
ml-10m: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/MovieLens-KG.zip
ml-20m: https://recbole.s3-accelerate.amazonaws.com/KGDatasets/MovieLens-KG.zip
84 changes: 84 additions & 0 deletions recbole/properties/dataset/url.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
adult: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Adult/adult.zip
amazon-apps-for-android: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Apps_for_Android.zip
amazon-automotive: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Automotive.zip
amazon-baby: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Baby.zip
amazon-beauty: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Beauty.zip
amazon-books: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Books.zip
amazon-cell-phones-accessories: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Cell_Phones_and_Accessories.zip
amazon-clothing-shoes-jewelry: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Clothing_Shoes_and_Jewelry.zip
amazon-digital-music: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Digital_Music.zip
amazon-electronics: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Electronics.zip
amazon-grocery-gourmet-food: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Grocery_and_Gourmet_Food.zip
amazon-health-personal-care: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Health_and_Personal_Care.zip
amazon-home-kitchen: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Home_and_Kitchen.zip
amazon-instant-video: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Instant_Video.zip
amazon-kindle-store: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Kindle_Store.zip
amazon-musical-instruments: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Musical_Instruments.zip
amazon-movies-tv: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Movies_and_TV.zip
amazon-office-products: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Office_Products.zip
amazon-patio-lawn-garden: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Patio_Lawn_and_Garden.zip
amazon-pet-supplies: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Pet_Supplies.zip
amazon-sports-outdoors: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Sports_and_Outdoors.zip
amazon-tools-home-improvement: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Tools_and_Home_Improvement.zip
amazon-toys-games: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Toys_and_Games.zip
amazon-video-games: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Amazon_ratings/Amazon_Video_Games.zip
anime: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Anime/anime.zip
avazu: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Avazu/avazu.zip
book-crossing: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Book-Crossing/book-crossing.zip
criteo: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Criteo/criteo.zip
diginetica-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/DIGINETICA/merged/diginetica.zip
diginetica-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/DIGINETICA/not_merged/diginetica.zip
douban: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Douban/douban.zip
epinions: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Epinions/epinions.zip
foursquare-nyc-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Foursquare/merged/foursquare_NYC.zip
foursquare-tky-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Foursquare/merged/foursquare_TKY.zip
foursquare-nyc-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Foursquare/not_merged/foursquare_NYC.zip
foursquare-tky-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Foursquare/not_merged/foursquare_TKY.zip
gowalla-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Gowalla/merged/gowalla.zip
gowalla-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Gowalla/not_merged/gowalla.zip
ipinyou-click-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/iPinYou/merged/ipinyou-click.zip
ipinyou-view-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/iPinYou/merged/ipinyou-view.zip
ipinyou-click-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/iPinYou/not_merged/ipinyou-click.zip
ipinyou-view-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/iPinYou/not_merged/ipinyou-view.zip
jester: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Jester/jester.zip
kdd2010-algebra2006-2007: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/KDD2010/KDD2010-algebra2006_2007.zip
kdd2010-algebra2008-2009: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/KDD2010/KDD2010-algebra2008_2009.zip
kdd2010-bridge-to-algebra2006-2007: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/KDD2010/KDD2010-bridge-to-algebra2006_2007.zip
lastfm: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/LastFM/lastfm.zip
lfm1b-albums-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/LFM-1b/merged/lfm1b-albums.zip
lfm1b-artists-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/LFM-1b/merged/lfm1b-artists.zip
lfm1b-tracks-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/LFM-1b/merged/lfm1b-tracks.zip
lfm1b-albums-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/LFM-1b/not_merged/lfm1b-albums.zip
lfm1b-artists-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/LFM-1b/not_merged/lfm1b-artists.zip
lfm1b-tracks-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/LFM-1b/not_merged/lfm1b-tracks.zip
mind-large-dev: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MIND/mind_large_dev.zip
mind-large-train: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MIND/mind_large_train.zip
mind-small-dev: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MIND/mind_small_dev.zip
mind-small-train: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MIND/mind_small_train.zip
ml-100k: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MovieLens/ml-100k.zip
ml-1m: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MovieLens/ml-1m.zip
ml-10m: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MovieLens/ml-10m.zip
ml-20m: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MovieLens/ml-20m.zip
netflix: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Netflix/netflix.zip
phishing-website: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Phishing-websites/phishing-website.zip
pinterest: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Pinterest/pinterest.zip
retailrocket-addtocart-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Retailrocket/merged/retailrocket-addtocart.zip
retailrocket-transaction-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Retailrocket/merged/retailrocket-transaction.zip
retailrocket-view-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Retailrocket/merged/retailrocket-view.zip
retailrocket-addtocart-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Retailrocket/not_merged/retailrocket-addtocart.zip
retailrocket-transaction-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Retailrocket/not_merged/retailrocket-transaction.zip
retailrocket-view-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Retailrocket/not_merged/retailrocket-view.zip
steam-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Steam/merged/steam.zip
steam-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Steam/not_merged/steam.zip
ta-feng-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Ta-Feng/merged/ta-feng.zip
ta-feng-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Ta-Feng/not_merged/ta-feng.zip
tmall-buy-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Tmall/merged/tmall-buy.zip
tmall-click-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Tmall/merged/tmall-click.zip
tmall-buy-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Tmall/not_merged/tmall-buy.zip
tmall-click-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Tmall/not_merged/tmall-click.zip
yahoo-music: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Yahoo-Music/yahoo-music.zip
yelp: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/Yelp/yelp.zip
yoochoose-buys-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/YOOCHOOSE/merged/yoochoose-buys.zip
yoochoose-clicks-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/YOOCHOOSE/merged/yoochoose-clicks.zip
yoochoose-buys-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/YOOCHOOSE/not_merged/yoochoose-buys.zip
yoochoose-clicks-not-merged: https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/YOOCHOOSE/not_merged/yoochoose-clicks.zip
114 changes: 114 additions & 0 deletions recbole/utils/url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
'''
recbole.utils.url
################################
Reference code:
https://github.com/snap-stanford/ogb/blob/master/ogb/utils/url.py
'''

import urllib.request as ur
import zipfile
import os
import os.path as osp
import errno
from logging import getLogger

from tqdm import tqdm


GBFACTOR = float(1 << 30)


def decide_download(url):
    '''Decide whether to download the resource at ``url``.

    Probes the ``Content-Length`` header and asks the user for confirmation
    when the payload exceeds 1 GB; smaller downloads proceed silently.

    Args:
        url (string): The url.

    Returns:
        bool: ``True`` if the download should proceed.
    '''
    # Close the response instead of leaking the connection.
    with ur.urlopen(url) as d:
        size = int(d.info()['Content-Length']) / GBFACTOR

    ### confirm if larger than 1GB
    if size > 1:
        return input('This will download %.2fGB. Will you proceed? (y/N)\n' % (size)).lower() == 'y'
    else:
        return True


def makedirs(path):
    '''Create ``path`` (including parents), tolerating an existing directory.

    The original ``try/except`` condition (`e.errno != errno.EEXIST and
    osp.isdir(path)`) silently swallowed real failures such as permission
    errors or a regular file occupying ``path``. ``exist_ok=True`` ignores
    only the already-exists-as-directory case and lets everything else raise.

    Args:
        path (string): Directory path; ``~`` is expanded.
    '''
    os.makedirs(osp.expanduser(osp.normpath(path)), exist_ok=True)


def download_url(url, folder):
    '''Downloads the content of an URL to a specific folder.

    Args:
        url (string): The url.
        folder (string): The folder.

    Returns:
        string: Path of the downloaded (or pre-existing) file.

    Raises:
        RuntimeError: If the download is interrupted; the partial file is removed.
    '''

    filename = url.rpartition('/')[2]
    path = osp.join(folder, filename)
    logger = getLogger()

    if osp.exists(path) and osp.getsize(path) > 0:  # pragma: no cover
        # Lazy %-style args: the original call passed `filename` without a
        # placeholder, which makes logging raise an internal formatting error
        # and drop the argument.
        logger.info('Using exist file %s', filename)
        return path

    logger.info('Downloading %s', url)

    makedirs(folder)
    data = ur.urlopen(url)

    size = int(data.info()['Content-Length'])

    chunk_size = 1024 * 1024  # 1 MB per read
    # +2 guarantees a final empty read even when size is a multiple of chunk_size.
    num_iter = int(size / chunk_size) + 2

    downloaded_size = 0

    try:
        with open(path, 'wb') as f:
            pbar = tqdm(range(num_iter))
            for _ in pbar:
                chunk = data.read(chunk_size)
                downloaded_size += len(chunk)
                pbar.set_description('Downloaded {:.2f} GB'.format(float(downloaded_size) / GBFACTOR))
                f.write(chunk)
    except BaseException as e:
        # BaseException (not bare `except:`) keeps the original intent of
        # catching KeyboardInterrupt during long downloads, while chaining
        # preserves the real cause for debugging.
        if os.path.exists(path):
            # Remove the partial file so a retry starts clean.
            os.remove(path)
        raise RuntimeError('Stopped downloading due to interruption.') from e

    return path


def extract_zip(path, folder):
    '''Extracts a zip archive to a specific folder.

    Args:
        path (string): The path to the zip archive.
        folder (string): The folder.
    '''
    logger = getLogger()
    # Lazy %-style args: the original call passed `path` without a
    # placeholder, which makes logging raise an internal formatting error.
    logger.info('Extracting %s', path)
    with zipfile.ZipFile(path, 'r') as f:
        f.extractall(folder)


def rename_atomic_files(folder, old_name, new_name):
    '''Rename all atomic files in a given folder.

    Args:
        folder (string): The folder.
        old_name (string): Old name for atomic files.
        new_name (string): New name for atomic files.

    Raises:
        ValueError: If the folder contains a file that is not an atomic file
            (``.inter``/``.user``/``.item``) named ``old_name``. (The original
            used ``assert``, which is stripped under ``python -O``.)
    '''
    for f in os.listdir(folder):
        base, suf = os.path.splitext(f)
        if base != old_name or suf not in {'.inter', '.user', '.item'}:
            raise ValueError(f'Unexpected file [{f}] in [{folder}].')
        os.rename(os.path.join(folder, f), os.path.join(folder, new_name + suf))

if __name__ == '__main__':
    # Intentionally empty: this module is import-only and exposes no CLI.
    pass