From aec761bfaffd70dd385c43edfa67563163a1ff3f Mon Sep 17 00:00:00 2001
From: Joppe Geluykens
Date: Fri, 4 Nov 2022 18:44:48 +0000
Subject: [PATCH] Revert "Add H&M fashion recommendation dataset (#2708)"

This reverts commit abfdc05018cc4dec5a2fed20ad09e94f1749fca9.
---
 .../configs/hm_fashion_recommendations.yaml   |  17 ---
 ludwig/datasets/kaggle.py                     |  11 +-
 ludwig/datasets/loaders/dataset_loader.py     |   3 +-
 .../loaders/hm_fashion_recommendations.py     | 128 ------------------
 .../datasets/titanic/test_titanic_workflow.py |   5 +-
 5 files changed, 6 insertions(+), 158 deletions(-)
 delete mode 100644 ludwig/datasets/configs/hm_fashion_recommendations.yaml
 delete mode 100644 ludwig/datasets/loaders/hm_fashion_recommendations.py

diff --git a/ludwig/datasets/configs/hm_fashion_recommendations.yaml b/ludwig/datasets/configs/hm_fashion_recommendations.yaml
deleted file mode 100644
index 2274ababaf8..00000000000
--- a/ludwig/datasets/configs/hm_fashion_recommendations.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-version: 1.0
-name: hm_fashion_recommendations
-kaggle_competition: h-and-m-personalized-fashion-recommendations
-archive_filenames:
-  - articles.csv.zip
-  - customers.csv.zip
-  - transactions_train.csv.zip
-sha256:
-  articles.csv.zip: 1c62791bac6a3db3df56e78c1509f45350ea98f083be8380c1d27a6f02bd014c
-  customers.csv.zip: 52b11c76e72dfe315c8a2e0a55d292e9f233a4fb0f5c69bf665be4b54cca9f23
-  transactions_train.csv.zip: 5fcc90e80355e35aa04c04aa5c47587f4c6817eb3ba4ed3dc685b89a591559e3
-loader: hm_fashion_recommendations.HMLoader
-description: |
-  H&M Group data for recommendations based on data from previous transactions, as well as from customer and
-  product meta data. The available meta data spans from simple data, such as garment type and customer age,
-  to text data from product descriptions, to image data from garment images.
-  https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations
diff --git a/ludwig/datasets/kaggle.py b/ludwig/datasets/kaggle.py
index e81ca0ad939..24723cbcfa5 100644
--- a/ludwig/datasets/kaggle.py
+++ b/ludwig/datasets/kaggle.py
@@ -29,7 +29,6 @@ def download_kaggle_dataset(
     kaggle_competition: Optional[str] = None,
     kaggle_username: Optional[str] = None,
     kaggle_key: Optional[str] = None,
-    filenames: Optional[list] = None,
 ):
     """Download all files in a kaggle dataset. One of kaggle_dataset_id,
     kaggle_competition must be provided.
@@ -41,12 +40,8 @@ def download_kaggle_dataset(
         api = create_kaggle_client()
         api.authenticate()
     with upload_output_directory(download_directory) as (tmpdir, _):
-        dataset_or_competition = kaggle_competition or kaggle_dataset_id
-        if filenames:
-            download_fn = api.competition_download_file if kaggle_competition else api.dataset_download_file
-            for filename in filenames:
-                download_fn(dataset_or_competition, filename, path=tmpdir)
+        if kaggle_competition:
+            api.competition_download_files(kaggle_competition, path=tmpdir)
         else:
-            download_fn = api.competition_download_files if kaggle_competition else api.dataset_download_files
-            download_fn(dataset_or_competition, path=tmpdir)
+            api.dataset_download_files(kaggle_dataset_id, path=tmpdir)
     return [os.path.join(download_directory, f) for f in os.listdir(download_directory)]
diff --git a/ludwig/datasets/loaders/dataset_loader.py b/ludwig/datasets/loaders/dataset_loader.py
index 831f7aac82d..75950db4248 100644
--- a/ludwig/datasets/loaders/dataset_loader.py
+++ b/ludwig/datasets/loaders/dataset_loader.py
@@ -279,7 +279,7 @@ def load(self, split=False, kaggle_username=None, kaggle_key=None) -> pd.DataFra
         :param split: (bool) splits dataset along 'split' column if present. The split column should always have
             values 0: train, 1: validation, 2: test.
         """
-        self._download_and_process(kaggle_username=kaggle_username, kaggle_key=kaggle_key)
+        self._download_and_process()
         if self.state == DatasetState.TRANSFORMED:
             dataset_df = self.load_transformed_dataset()
             if split:
@@ -297,7 +297,6 @@ def download(self, kaggle_username=None, kaggle_key=None):
                 kaggle_competition=self.config.kaggle_competition,
                 kaggle_username=kaggle_username,
                 kaggle_key=kaggle_key,
-                filenames=self.download_filenames,
             )
         else:
             for url, filename in zip(self.download_urls, self.download_filenames):
diff --git a/ludwig/datasets/loaders/hm_fashion_recommendations.py b/ludwig/datasets/loaders/hm_fashion_recommendations.py
deleted file mode 100644
index 2722d27ce41..00000000000
--- a/ludwig/datasets/loaders/hm_fashion_recommendations.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2022 Predibase, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-from typing import List
-
-import numpy as np
-import pandas as pd
-
-from ludwig.backend.base import LocalBackend
-from ludwig.constants import SPLIT
-from ludwig.data.split import get_splitter
-from ludwig.datasets.loaders.dataset_loader import DatasetLoader
-
-
-def _merge_dataframes(transactions_df, articles_df, customers_df):
-    """Merge the transactions, articles, and customers dataframes into a single dataframe."""
-    # Merge the transactions and articles dataframes
-    transactions_df = pd.merge(
-        transactions_df,
-        articles_df,
-        how="left",
-        left_on="article_id",
-        right_on="article_id",
-    )
-
-    # Merge the transactions and customers dataframes
-    transactions_df = pd.merge(
-        transactions_df,
-        customers_df,
-        how="left",
-        left_on="customer_id",
-        right_on="customer_id",
-    )
-
-    return transactions_df
-
-
-def _split(df):
-    """Split the dataframe into train, validation, and test dataframes.
-
-    The split is done in a chronological manner based on the year_month column. The split is done by customer_id,
-    so that interactions for a given customer are present in all splits.
-
-    Params:
-        df: The dataframe to split.
-
-    Returns:
-        A tuple of (train_df, validation_df, test_df).
-    """
-    splitter = get_splitter("datetime", column="year_month", probabilities=(0.7, 0.2, 0.1))
-
-    if not isinstance(df, pd.DataFrame):
-        df = df.compute()
-
-    train_dfs, val_dfs, test_dfs = [], [], []
-    for customer_id in df["customer_id"].unique():
-        # Split per customer_id to ensure that interactions for a customer are across all splits
-        train_df, val_df, test_df = splitter.split(df[df["customer_id"] == customer_id], backend=LocalBackend())
-
-        train_dfs.append(train_df)
-        val_dfs.append(val_df)
-        test_dfs.append(test_df)
-
-    return pd.concat(train_dfs), pd.concat(val_dfs), pd.concat(test_dfs)
-
-
-class HMLoader(DatasetLoader):
-    def load_unprocessed_dataframe(self, file_paths: List[str], sample=True) -> pd.DataFrame:
-        """Load the dataframes from the given file paths.
-
-        Params:
-            file_paths: A list of file paths to load the dataframes from.
-            sample: Whether to sample the dataframes. Since the dataset is quite large (31M transactions), this defaults
-                to True, which takes data after August 21, 2020 for a sample of 100 customers.
-
-        Returns:
-            A single dataframe containing transactions, articles, and customers data.
-        """
-        # Load transactions
-        df = pd.read_csv(file_paths[2])
-        df["t_dat"] = pd.to_datetime(df.t_dat)
-        df["year_month"] = df.t_dat.dt.to_period("M").dt.strftime("%Y-%m")
-
-        if sample:
-            df = df[df.t_dat > "2020-08-21"]
-            customer_ids = np.random.choice(df.customer_id, 100, replace=False)
-            df = df[df.customer_id.isin(customer_ids)]
-
-        # 1. Set label to 1 for all known transactions, since the customer bought the article
-        df["label"] = 1
-
-        # 2. Split the data into train, validation and test sets. We split per customer_id to ensure that interactions
-        # for a customer are across all splits
-        train_df, val_df, test_df = _split(df)
-
-        train_df[SPLIT] = 0
-        val_df[SPLIT] = 1
-        test_df[SPLIT] = 2
-        df = pd.concat([train_df, val_df, test_df])
-
-        # 3. Add customer and article features
-        articles_df = pd.read_csv(file_paths[0])
-        customers_df = pd.read_csv(file_paths[1])
-        df = _merge_dataframes(df, articles_df, customers_df)
-
-        # TODO(joppe): add image url once all images are available in a public bucket
-        # # Add image url
-        # def img_url_or_none(article_id):
-        #     url = f"https://h-and-m-kaggle-images.s3.us-west-2.amazonaws.com/{article_id}.jpg"
-        #     try:
-        #         status_code = requests.head(url, headers={"Access-Control-Request-Method": "GET"}).status_code
-        #         return url if status_code == 200 else None
-        #     except:
-        #         return None
-        # df["img_url"] = df["article_id"].apply(img_url_or_none, meta=("img_url", "object"))
-
-        return df
diff --git a/tests/ludwig/datasets/titanic/test_titanic_workflow.py b/tests/ludwig/datasets/titanic/test_titanic_workflow.py
index 5e3eb482472..e1b1580b3e4 100644
--- a/tests/ludwig/datasets/titanic/test_titanic_workflow.py
+++ b/tests/ludwig/datasets/titanic/test_titanic_workflow.py
@@ -77,16 +77,15 @@ def test_download_titanic_dataset(tmpdir):
         test_filenames="test.csv",
     )
 
-    def download_file(competition_name, filename, path):
+    def download_files(competition_name, path):
         assert competition_name == "titanic"
-        assert filename == "titanic.zip"
         copy(archive_filename, path)
 
     ludwig.datasets._get_dataset_configs.cache_clear()
     with mock.patch("ludwig.datasets._load_dataset_config", return_value=config):
         with mock.patch("ludwig.datasets.kaggle.create_kaggle_client") as mock_kaggle_cls:
             mock_kaggle_api = mock.MagicMock()
-            mock_kaggle_api.competition_download_file = download_file
+            mock_kaggle_api.competition_download_files = download_files
             mock_kaggle_cls.return_value = mock_kaggle_api
             dataset = ludwig.datasets.get_dataset("titanic", cache_dir=tmpdir)
 