upload the original yelp dataset
ShadowTinker committed Oct 12, 2024
1 parent 23af6d8 commit 67aa7ad
Showing 3 changed files with 316,453 additions and 72 deletions.
170 changes: 98 additions & 72 deletions dataset/preprocess_yelp.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -38,12 +38,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"root = '/root/dataset/yelp/'\n",
"output_path = './yelp-50/'\n",
"root = '/data/mjyin/DR4SR/dataset/yelp/yelp/'\n",
"output_path = './yelp/'\n",
"dataset_name_list = [\n",
" \"yelp\",\n",
"]\n",
@@ -64,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -78,15 +78,6 @@
" dataset_list.append(dataset)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "len(dataset_list[0])"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -103,9 +94,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"done!\n"
]
}
],
"source": [
"filtered_dataset_list = []\n",
"for dataset in dataset_list:\n",
@@ -129,7 +128,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -140,9 +139,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(30431,)\n",
"(20033,)\n"
]
}
],
"source": [
"user_id, user_token = pd.factorize(all_user)\n",
"item_id, item_token = pd.factorize(all_item)\n",
@@ -156,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
@@ -166,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -189,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@@ -205,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@@ -219,7 +227,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -228,65 +236,83 @@
"dataset = mapped_dataset_list[0]\n",
"dataset = dataset.sort_values(by=['user_id', 'timestamp'])\n",
"user_group = dataset.groupby('user_id')['item_id'].apply(to_list)\n",
"torch.save(user_group.tolist(), 'pattern-yelp-50.pth')"
"domain_name = output_dataset_name_list[0]\n",
"pattern_out_path = os.path.join(output_path, domain_name, f'seq2pat_data.pth')\n",
"torch.save(user_group.tolist(), pattern_out_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"yelp done!\n"
]
}
],
"source": [
"\n",
"for idx, dataset in enumerate(mapped_dataset_list):\n",
" dataset = dataset.sort_values(by=['user_id', 'timestamp'])\n",
" user_group = dataset.groupby('user_id')['item_id'].apply(list)\n",
" train, val, test = [], [], []\n",
" for user_id, user_seq in list(zip(user_group.index, user_group.tolist())):\n",
" # user_seq = user_seq[-max_seq_len:]\n",
" # iterate on each user sequence\n",
" # ------ test sample ------------\n",
" history, seq_len = truncate_or_pad(user_seq[:-1])\n",
" target_data = user_seq[-1]\n",
" label = 1\n",
" domain_id = [idx] * max_seq_len\n",
" user_hist = user_seq[:-1]\n",
" test.append([user_id, history, target_data, seq_len, label, domain_id, user_hist])\n",
" # ------ val sample -------------\n",
" history, seq_len = truncate_or_pad(user_seq[:-2])\n",
" target_data = user_seq[-2]\n",
" label = 1\n",
" domain_id = [idx] * max_seq_len\n",
" user_hist = user_seq[:-2]\n",
" val.append([user_id, history, target_data, seq_len, label, domain_id, user_hist])\n",
" # ------ train sample -----------\n",
" train_seq = user_seq[:-2]\n",
" assert len(train_seq) >=2\n",
" s, seq_len = 0, 1\n",
" while(True):\n",
" if s + seq_len == len(train_seq):\n",
" break\n",
" if seq_len > max_seq_len:\n",
" s += 1\n",
" seq_len = max_seq_len\n",
" history, _ = truncate_or_pad(train_seq[s:s + seq_len])\n",
" target_data = train_seq[s + seq_len]\n",
" label = 1\n",
" domain_id = idx\n",
" train.append([user_id, history, target_data, seq_len, label, domain_id])\n",
"# for idx, dataset in enumerate(mapped_dataset_list):\n",
"# dataset = dataset.sort_values(by=['user_id', 'timestamp'])\n",
"# user_group = dataset.groupby('user_id')['item_id'].apply(list)\n",
"# train, val, test = [], [], []\n",
"# for user_id, user_seq in list(zip(user_group.index, user_group.tolist())):\n",
"# # user_seq = user_seq[-max_seq_len:]\n",
"# # iterate on each user sequence\n",
"# # ------ test sample ------------\n",
"# history, seq_len = truncate_or_pad(user_seq[:-1])\n",
"# target_data = user_seq[-1]\n",
"# label = 1\n",
"# domain_id = [idx] * max_seq_len\n",
"# user_hist = user_seq[:-1]\n",
"# test.append([user_id, history, target_data, seq_len, label, domain_id, user_hist])\n",
"# # ------ val sample -------------\n",
"# history, seq_len = truncate_or_pad(user_seq[:-2])\n",
"# target_data = user_seq[-2]\n",
"# label = 1\n",
"# domain_id = [idx] * max_seq_len\n",
"# user_hist = user_seq[:-2]\n",
"# val.append([user_id, history, target_data, seq_len, label, domain_id, user_hist])\n",
"# # ------ train sample -----------\n",
"# train_seq = user_seq[:-2]\n",
"# assert len(train_seq) >=2\n",
"# s, seq_len = 0, 1\n",
"# while(True):\n",
"# if s + seq_len == len(train_seq):\n",
"# break\n",
"# if seq_len > max_seq_len:\n",
"# s += 1\n",
"# seq_len = max_seq_len\n",
"# history, _ = truncate_or_pad(train_seq[s:s + seq_len])\n",
"# target_data = train_seq[s + seq_len]\n",
"# label = 1\n",
"# domain_id = idx\n",
"# train.append([user_id, history, target_data, seq_len, label, domain_id])\n",
"\n",
" seq_len += 1\n",
" torch.save(train, os.path.join(output_path, output_dataset_name_list[idx], 'train.pth'))\n",
" torch.save(val, os.path.join(output_path, output_dataset_name_list[idx], 'val.pth'))\n",
" torch.save(test, os.path.join(output_path, output_dataset_name_list[idx], 'test.pth'))\n",
" print('{} done!'.format(output_dataset_name_list[idx]))"
"# seq_len += 1\n",
"# torch.save(train, os.path.join(output_path, output_dataset_name_list[idx], 'train.pth'))\n",
"# torch.save(val, os.path.join(output_path, output_dataset_name_list[idx], 'val.pth'))\n",
"# torch.save(test, os.path.join(output_path, output_dataset_name_list[idx], 'test.pth'))\n",
"# print('{} done!'.format(output_dataset_name_list[idx]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 32,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"yelp done!\n"
]
}
],
"source": [
"\n",
"\n",
@@ -379,7 +405,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
"version": "3.9.19"
}
},
"nbformat": 4,
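Note on the change above: the commit switches the notebook from writing train/val/test splits to writing only the raw per-user sequences, commenting out the split-construction cell. That cell implemented the standard leave-one-out protocol: each user's last item becomes the test target, the second-to-last the validation target, and sliding windows over the remaining prefix become training samples. Below is a minimal sketch of that logic; `max_seq_len = 50`, the padding id `0`, and the helper name `leave_one_out` are assumptions (the notebook defines its constants in cells not shown in this diff).

```python
max_seq_len = 50  # assumed; defined elsewhere in the notebook
PAD = 0           # assumed padding id

def truncate_or_pad(seq):
    """Keep the last max_seq_len items and right-pad to a fixed length."""
    seq = seq[-max_seq_len:]
    seq_len = len(seq)
    return seq + [PAD] * (max_seq_len - seq_len), seq_len

def leave_one_out(user_seqs):
    """user_seqs: dict of user_id -> time-ordered item-id list (len >= 3)."""
    train, val, test = [], [], []
    for user_id, seq in user_seqs.items():
        # Test sample: predict the last item from everything before it.
        hist, n = truncate_or_pad(seq[:-1])
        test.append([user_id, hist, seq[-1], n])
        # Validation sample: predict the second-to-last item.
        hist, n = truncate_or_pad(seq[:-2])
        val.append([user_id, hist, seq[-2], n])
        # Train samples: grow a window over the remaining prefix, slide it
        # once it reaches max_seq_len, predicting the next item each time.
        prefix = seq[:-2]
        s, seq_len = 0, 1
        while s + seq_len < len(prefix):
            if seq_len > max_seq_len:
                s += 1
                seq_len = max_seq_len
            hist, _ = truncate_or_pad(prefix[s:s + seq_len])
            train.append([user_id, hist, prefix[s + seq_len], seq_len])
            seq_len += 1
    return train, val, test
```

For a five-item sequence `[a, b, c, d, e]`, this yields training targets `b` and `c` (from the growing windows), validation target `d`, and test target `e`.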
Binary file added dataset/yelp/yelp/seq2pat_data.pth
Binary file not shown.
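The new `seq2pat_data.pth` binary is produced by the `torch.save(user_group.tolist(), pattern_out_path)` call added above, so it should deserialize to a plain Python list in which element `u` is user `u`'s time-ordered item-id sequence. A quick inspection sketch (the relative path and user count are taken from this diff; running from the repository root is an assumption):

```python
import torch

# The file was written with torch.save on a list of lists, so torch.load
# returns that list directly: element u is user u's item-id sequence.
sequences = torch.load('dataset/yelp/yelp/seq2pat_data.pth')

print(len(sequences))     # 30431 users, per the pd.factorize output above
print(sequences[0][:10])  # the first user's earliest interactions
```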
(Diff for the third changed file, the raw Yelp dataset upload, is not rendered.)
