From aec761bfaffd70dd385c43edfa67563163a1ff3f Mon Sep 17 00:00:00 2001
From: Joppe Geluykens
Date: Fri, 4 Nov 2022 18:44:48 +0000
Subject: [PATCH] Revert "Add H&M fashion recommendation dataset (#2708)"

This reverts commit abfdc05018cc4dec5a2fed20ad09e94f1749fca9.
---
 .../configs/hm_fashion_recommendations.yaml   |  17 ---
 ludwig/datasets/kaggle.py                     |  11 +-
 ludwig/datasets/loaders/dataset_loader.py     |   3 +-
 .../loaders/hm_fashion_recommendations.py     | 128 ------------------
 .../datasets/titanic/test_titanic_workflow.py |   5 +-
 5 files changed, 6 insertions(+), 158 deletions(-)
 delete mode 100644 ludwig/datasets/configs/hm_fashion_recommendations.yaml
 delete mode 100644 ludwig/datasets/loaders/hm_fashion_recommendations.py

diff --git a/ludwig/datasets/configs/hm_fashion_recommendations.yaml b/ludwig/datasets/configs/hm_fashion_recommendations.yaml
deleted file mode 100644
index 2274ababaf8..00000000000
--- a/ludwig/datasets/configs/hm_fashion_recommendations.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-version: 1.0
-name: hm_fashion_recommendations
-kaggle_competition: h-and-m-personalized-fashion-recommendations
-archive_filenames:
-  - articles.csv.zip
-  - customers.csv.zip
-  - transactions_train.csv.zip
-sha256:
-  articles.csv.zip: 1c62791bac6a3db3df56e78c1509f45350ea98f083be8380c1d27a6f02bd014c
-  customers.csv.zip: 52b11c76e72dfe315c8a2e0a55d292e9f233a4fb0f5c69bf665be4b54cca9f23
-  transactions_train.csv.zip: 5fcc90e80355e35aa04c04aa5c47587f4c6817eb3ba4ed3dc685b89a591559e3
-loader: hm_fashion_recommendations.HMLoader
-description: |
-  H&M Group data for recommendations based on data from previous transactions, as well as from customer and
-  product meta data. The available meta data spans from simple data, such as garment type and customer age,
-  to text data from product descriptions, to image data from garment images.
-  https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations
diff --git a/ludwig/datasets/kaggle.py b/ludwig/datasets/kaggle.py
index e81ca0ad939..24723cbcfa5 100644
--- a/ludwig/datasets/kaggle.py
+++ b/ludwig/datasets/kaggle.py
@@ -29,7 +29,6 @@ def download_kaggle_dataset(
     kaggle_competition: Optional[str] = None,
     kaggle_username: Optional[str] = None,
     kaggle_key: Optional[str] = None,
-    filenames: Optional[list] = None,
 ):
     """Download all files in a kaggle dataset. One of kaggle_dataset_id,
     kaggle_competition must be provided.
@@ -41,12 +40,8 @@ def download_kaggle_dataset(
         api = create_kaggle_client()
         api.authenticate()
     with upload_output_directory(download_directory) as (tmpdir, _):
-        dataset_or_competition = kaggle_competition or kaggle_dataset_id
-        if filenames:
-            download_fn = api.competition_download_file if kaggle_competition else api.dataset_download_file
-            for filename in filenames:
-                download_fn(dataset_or_competition, filename, path=tmpdir)
+        if kaggle_competition:
+            api.competition_download_files(kaggle_competition, path=tmpdir)
         else:
-            download_fn = api.competition_download_files if kaggle_competition else api.dataset_download_files
-            download_fn(dataset_or_competition, path=tmpdir)
+            api.dataset_download_files(kaggle_dataset_id, path=tmpdir)
     return [os.path.join(download_directory, f) for f in os.listdir(download_directory)]
diff --git a/ludwig/datasets/loaders/dataset_loader.py b/ludwig/datasets/loaders/dataset_loader.py
index 831f7aac82d..75950db4248 100644
--- a/ludwig/datasets/loaders/dataset_loader.py
+++ b/ludwig/datasets/loaders/dataset_loader.py
@@ -279,7 +279,7 @@ def load(self, split=False, kaggle_username=None, kaggle_key=None) -> pd.DataFra
         :param split: (bool) splits dataset along 'split' column if present. The split column should always have
             values 0: train, 1: validation, 2: test.
         """
-        self._download_and_process(kaggle_username=kaggle_username, kaggle_key=kaggle_key)
+        self._download_and_process()
         if self.state == DatasetState.TRANSFORMED:
             dataset_df = self.load_transformed_dataset()
             if split:
@@ -297,7 +297,6 @@ def download(self, kaggle_username=None, kaggle_key=None):
                 kaggle_competition=self.config.kaggle_competition,
                 kaggle_username=kaggle_username,
                 kaggle_key=kaggle_key,
-                filenames=self.download_filenames,
             )
         else:
             for url, filename in zip(self.download_urls, self.download_filenames):
diff --git a/ludwig/datasets/loaders/hm_fashion_recommendations.py b/ludwig/datasets/loaders/hm_fashion_recommendations.py
deleted file mode 100644
index 2722d27ce41..00000000000
--- a/ludwig/datasets/loaders/hm_fashion_recommendations.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2022 Predibase, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-from typing import List
-
-import numpy as np
-import pandas as pd
-
-from ludwig.backend.base import LocalBackend
-from ludwig.constants import SPLIT
-from ludwig.data.split import get_splitter
-from ludwig.datasets.loaders.dataset_loader import DatasetLoader
-
-
-def _merge_dataframes(transactions_df, articles_df, customers_df):
-    """Merge the transactions, articles, and customers dataframes into a single dataframe."""
-    # Merge the transactions and articles dataframes
-    transactions_df = pd.merge(
-        transactions_df,
-        articles_df,
-        how="left",
-        left_on="article_id",
-        right_on="article_id",
-    )
-
-    # Merge the transactions and customers dataframes
-    transactions_df = pd.merge(
-        transactions_df,
-        customers_df,
-        how="left",
-        left_on="customer_id",
-        right_on="customer_id",
-    )
-
-    return transactions_df
-
-
-def _split(df):
-    """Split the dataframe into train, validation, and test dataframes.
-
-    The split is done in a chronological manner based on the year_month column. The split is done by customer_id,
-    so that interactions for a given customer are present in all splits.
-
-    Params:
-        df: The dataframe to split.
-
-    Returns:
-        A tuple of (train_df, validation_df, test_df).
-    """
-    splitter = get_splitter("datetime", column="year_month", probabilities=(0.7, 0.2, 0.1))
-
-    if not isinstance(df, pd.DataFrame):
-        df = df.compute()
-
-    train_dfs, val_dfs, test_dfs = [], [], []
-    for customer_id in df["customer_id"].unique():
-        # Split per customer_id to ensure that interactions for a customer are across all splits
-        train_df, val_df, test_df = splitter.split(df[df["customer_id"] == customer_id], backend=LocalBackend())
-
-        train_dfs.append(train_df)
-        val_dfs.append(val_df)
-        test_dfs.append(test_df)
-
-    return pd.concat(train_dfs), pd.concat(val_dfs), pd.concat(test_dfs)
-
-
-class HMLoader(DatasetLoader):
-    def load_unprocessed_dataframe(self, file_paths: List[str], sample=True) -> pd.DataFrame:
-        """Load the dataframes from the given file paths.
-
-        Params:
-            file_paths: A list of file paths to load the dataframes from.
-            sample: Whether to sample the dataframes. Since the dataset is quite large (31M transactions), this defaults
-                to True, which takes data after August 21, 2020 for a sample of 100 customers.
-
-        Returns:
-            A single dataframe containing transactions, articles, and customers data.
-        """
-        # Load transactions
-        df = pd.read_csv(file_paths[2])
-        df["t_dat"] = pd.to_datetime(df.t_dat)
-        df["year_month"] = df.t_dat.dt.to_period("M").dt.strftime("%Y-%m")
-
-        if sample:
-            df = df[df.t_dat > "2020-08-21"]
-            customer_ids = np.random.choice(df.customer_id, 100, replace=False)
-            df = df[df.customer_id.isin(customer_ids)]
-
-        # 1. Set label to 1 for all known transactions, since the customer bought the article
-        df["label"] = 1
-
-        # 2. Split the data into train, validation and test sets. We split per customer_id to ensure that interactions
-        # for a customer are across all splits
-        train_df, val_df, test_df = _split(df)
-
-        train_df[SPLIT] = 0
-        val_df[SPLIT] = 1
-        test_df[SPLIT] = 2
-        df = pd.concat([train_df, val_df, test_df])
-
-        # 3. Add customer and article features
-        articles_df = pd.read_csv(file_paths[0])
-        customers_df = pd.read_csv(file_paths[1])
-        df = _merge_dataframes(df, articles_df, customers_df)
-
-        # TODO(joppe): add image url once all images are available in a public bucket
-        # # Add image url
-        # def img_url_or_none(article_id):
-        #     url = f"https://h-and-m-kaggle-images.s3.us-west-2.amazonaws.com/{article_id}.jpg"
-        #     try:
-        #         status_code = requests.head(url, headers={"Access-Control-Request-Method": "GET"}).status_code
-        #         return url if status_code == 200 else None
-        #     except:
-        #         return None
-        # df["img_url"] = df["article_id"].apply(img_url_or_none, meta=("img_url", "object"))
-
-        return df
diff --git a/tests/ludwig/datasets/titanic/test_titanic_workflow.py b/tests/ludwig/datasets/titanic/test_titanic_workflow.py
index 5e3eb482472..e1b1580b3e4 100644
--- a/tests/ludwig/datasets/titanic/test_titanic_workflow.py
+++ b/tests/ludwig/datasets/titanic/test_titanic_workflow.py
@@ -77,16 +77,15 @@ def test_download_titanic_dataset(tmpdir):
         test_filenames="test.csv",
     )
 
-    def download_file(competition_name, filename, path):
+    def download_files(competition_name, path):
         assert competition_name == "titanic"
-        assert filename == "titanic.zip"
         copy(archive_filename, path)
 
     ludwig.datasets._get_dataset_configs.cache_clear()
     with mock.patch("ludwig.datasets._load_dataset_config", return_value=config):
         with mock.patch("ludwig.datasets.kaggle.create_kaggle_client") as mock_kaggle_cls:
             mock_kaggle_api = mock.MagicMock()
-            mock_kaggle_api.competition_download_file = download_file
+            mock_kaggle_api.competition_download_files = download_files
             mock_kaggle_cls.return_value = mock_kaggle_api
             dataset = ludwig.datasets.get_dataset("titanic", cache_dir=tmpdir)
 