diff --git a/CHANGELOG.md b/CHANGELOG.md index cf09541..3d83c72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ **FuxiCTR v2.2.0, 2024-02-17** + [Feature] Add support of npz format for pretrained_emb ++ [Refactor] Change data format from h5 to npz ------------------------------- @@ -13,14 +14,14 @@ **FuxiCTR v2.1.3, 2024-02-17** + [Feature] Add GDCN model -+ [Edit] Rename FINAL model to FinalNet -+ [Edit] Update RecZoo URLs ++ [Refactor] Rename FINAL model to FinalNet ++ [Refactor] Update RecZoo URLs + [Fix] Fix bug #75 + [Fix] Fix h5 file extenstion issue + [Fix] Fix typo in FinalNet **FuxiCTR v2.1.2, 2023-11-01** -+ [Edit] Update H5DataBlockLoader to support dataloader with multiprocessing ++ [Refactor] Update H5DataBlockLoader to support dataloader with multiprocessing **FuxiCTR v2.1.1, 2023-10-26** + [Feature] Update to allow loading pretrained h5 directly in PretrainedEmbedding (skip key mapping in preprocess) @@ -45,7 +46,7 @@ **FuxiCTR v2.0.2, 2023-05-14** + [Feature] Update FINAL, DIEN -+ [Edit] Update ordered_features to use_features ++ [Refactor] Update ordered_features to use_features **FuxiCTR v2.0.1, 2023-02-15** + [Doc] Add fuxictr tutorials @@ -55,7 +56,7 @@ **FuxiCTR v2.0.0, 2023-01-19** + [Feature] Add more models of year 2021-2022. 
+ [Feature] Add tensorflow backbone support -+ [Edit] Refine code structure to support model development with minimal code ++ [Refactor] Refine code structure to support model development with minimal code ------------------------------- @@ -85,7 +86,7 @@ + [Feature] Add new feature for loading blocks of h5 data + [Feature] Add tests for DIN, FmFM + [Feature] Add support for multiple fields concat for DIN -+ [Edit] Remove the unnecessary config of embedding_dropout because it does not help after some attempts ++ [Refactor] Remove the unnecessary config of embedding_dropout because it does not help after some attempts + [Feature] Add embedding_hooks of dense layers on pretrained embeddings + [Fix] Fix the bug in padding_idx (have no effect on Criteo/Avazu results) + [Fix] Fix the bug in loading pretrained embeddings (have no effect on Criteo/Avazu results) @@ -96,7 +97,7 @@ ### FuxiCTR v1.0 **FuxiCTR v1.0.2, 2021-12-01** -+ [Edit] Refactor the code and documentation to support reproducing the BARS-CTR benchmark. ++ [Refactor] Refactor the code and documentation to support reproducing the BARS-CTR benchmark. **FuxiCTR v1.0.1, 2021-10-01** + [Feature] The first release of FuxiCTR, including 28 models. This version was used for the CIKM'21 paper. 
diff --git a/data/tiny_csv/userid_emb_dim8.h5 b/data/tiny_csv/userid_emb_dim8.h5 deleted file mode 100644 index 65e3f5b..0000000 Binary files a/data/tiny_csv/userid_emb_dim8.h5 and /dev/null differ diff --git a/data/tiny_csv/userid_emb_dim8.npz b/data/tiny_csv/userid_emb_dim8.npz new file mode 100644 index 0000000..ee0c16c Binary files /dev/null and b/data/tiny_csv/userid_emb_dim8.npz differ diff --git a/data/tiny_h5/test.h5 b/data/tiny_h5/test.h5 deleted file mode 100644 index 1d9bdc7..0000000 Binary files a/data/tiny_h5/test.h5 and /dev/null differ diff --git a/data/tiny_h5/train.h5 b/data/tiny_h5/train.h5 deleted file mode 100644 index 7e576c1..0000000 Binary files a/data/tiny_h5/train.h5 and /dev/null differ diff --git a/data/tiny_h5/valid.h5 b/data/tiny_h5/valid.h5 deleted file mode 100644 index 1d9bdc7..0000000 Binary files a/data/tiny_h5/valid.h5 and /dev/null differ diff --git a/data/tiny_h5/feature_map.json b/data/tiny_npz/feature_map.json similarity index 87% rename from data/tiny_h5/feature_map.json rename to data/tiny_npz/feature_map.json index c6e8320..cc914f7 100644 --- a/data/tiny_h5/feature_map.json +++ b/data/tiny_npz/feature_map.json @@ -12,7 +12,6 @@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 25, "vocab_size": 26 } }, @@ -21,7 +20,6 @@ "source": "item", "type": "categorical", "padding_idx": 0, - "oov_idx": 95, "vocab_size": 96 } }, @@ -30,7 +28,6 @@ "source": "context", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } }, @@ -39,7 +36,6 @@ "source": "item", "type": "categorical", "padding_idx": 0, - "oov_idx": 48, "vocab_size": 49 } }, @@ -48,7 +44,6 @@ "source": "item", "type": "categorical", "padding_idx": 0, - "oov_idx": 98, "vocab_size": 99 } }, @@ -57,7 +52,6 @@ "source": "item", "type": "categorical", "padding_idx": 0, - "oov_idx": 97, "vocab_size": 98 } }, @@ -66,7 +60,6 @@ "source": "item", "type": "categorical", "padding_idx": 0, - "oov_idx": 66, "vocab_size": 67 } }, @@ -75,7 +68,6 
@@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 10, "vocab_size": 11 } }, @@ -84,7 +76,6 @@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 10, "vocab_size": 11 } }, @@ -93,7 +84,6 @@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } }, @@ -102,7 +92,6 @@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 6, "vocab_size": 7 } }, @@ -111,7 +100,6 @@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } }, @@ -120,7 +108,6 @@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 4, "vocab_size": 5 } }, @@ -129,7 +116,6 @@ "source": "user", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } } diff --git a/data/tiny_npz/test.npz b/data/tiny_npz/test.npz new file mode 100644 index 0000000..c4bf5a6 Binary files /dev/null and b/data/tiny_npz/test.npz differ diff --git a/data/tiny_npz/train.npz b/data/tiny_npz/train.npz new file mode 100644 index 0000000..6bdb328 Binary files /dev/null and b/data/tiny_npz/train.npz differ diff --git a/data/tiny_npz/valid.npz b/data/tiny_npz/valid.npz new file mode 100644 index 0000000..c4bf5a6 Binary files /dev/null and b/data/tiny_npz/valid.npz differ diff --git a/data/tiny_seq/feature_map.json b/data/tiny_seq/feature_map.json index db9a2f1..ee08df1 100644 --- a/data/tiny_seq/feature_map.json +++ b/data/tiny_seq/feature_map.json @@ -12,7 +12,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 25, "vocab_size": 26 } }, @@ -21,7 +20,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 95, "vocab_size": 96 } }, @@ -30,7 +28,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } }, @@ -39,7 +36,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 48, "vocab_size": 49 } }, @@ -48,7 +44,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - 
"oov_idx": 98, "vocab_size": 99 } }, @@ -57,7 +52,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 97, "vocab_size": 98 } }, @@ -66,7 +60,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 66, "vocab_size": 67 } }, @@ -75,7 +68,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 10, "vocab_size": 11 } }, @@ -84,7 +76,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 10, "vocab_size": 11 } }, @@ -93,7 +84,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } }, @@ -102,7 +92,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 6, "vocab_size": 7 } }, @@ -111,7 +100,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } }, @@ -120,7 +108,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 4, "vocab_size": 5 } }, @@ -129,7 +116,6 @@ "source": "", "type": "categorical", "padding_idx": 0, - "oov_idx": 3, "vocab_size": 4 } }, @@ -139,10 +125,9 @@ "type": "sequence", "share_embedding": "adgroup_id", "padding_idx": 0, - "oov_idx": 95, "vocab_size": 96, "max_len": 5 } } ] -} \ No newline at end of file +} diff --git a/data/tiny_seq/test.h5 b/data/tiny_seq/test.h5 deleted file mode 100644 index 4199a5e..0000000 Binary files a/data/tiny_seq/test.h5 and /dev/null differ diff --git a/data/tiny_seq/test.npz b/data/tiny_seq/test.npz new file mode 100644 index 0000000..cb479f2 Binary files /dev/null and b/data/tiny_seq/test.npz differ diff --git a/data/tiny_seq/train.h5 b/data/tiny_seq/train.h5 deleted file mode 100644 index e2ed507..0000000 Binary files a/data/tiny_seq/train.h5 and /dev/null differ diff --git a/data/tiny_seq/train.npz b/data/tiny_seq/train.npz new file mode 100644 index 0000000..5ef8c43 Binary files /dev/null and b/data/tiny_seq/train.npz differ diff --git a/data/tiny_seq/valid.h5 b/data/tiny_seq/valid.h5 deleted file mode 100644 index 4199a5e..0000000 
Binary files a/data/tiny_seq/valid.h5 and /dev/null differ diff --git a/data/tiny_seq/valid.npz b/data/tiny_seq/valid.npz new file mode 100644 index 0000000..cb479f2 Binary files /dev/null and b/data/tiny_seq/valid.npz differ diff --git a/demo/config/example2_config/dataset_config.yaml b/demo/config/example2_config/dataset_config.yaml index 9335f21..d856d2a 100644 --- a/demo/config/example2_config/dataset_config.yaml +++ b/demo/config/example2_config/dataset_config.yaml @@ -1,8 +1,7 @@ ### Tiny data for demo only -tiny_h5: +tiny_npz: data_root: ../data/ - data_format: h5 - train_data: ../data/tiny_h5/train.h5 - valid_data: ../data/tiny_h5/valid.h5 - test_data: ../data/tiny_h5/test.h5 - + data_format: npz + train_data: ../data/tiny_npz/train.npz + valid_data: ../data/tiny_npz/valid.npz + test_data: ../data/tiny_npz/test.npz diff --git a/demo/config/example4_config/dataset_config.yaml b/demo/config/example4_config/dataset_config.yaml index bad4293..28dc2e8 100644 --- a/demo/config/example4_config/dataset_config.yaml +++ b/demo/config/example4_config/dataset_config.yaml @@ -7,7 +7,7 @@ tiny_example4: test_data: ../data/tiny_csv/test_sample.csv min_categr_count: 1 feature_cols: - [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.h5", + [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz", embedding_dim: 8, freeze_emb: True}, {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], diff --git a/demo/config/example5_config/dataset_config.yaml b/demo/config/example5_config/dataset_config.yaml index 822ac75..38e48bf 100644 --- a/demo/config/example5_config/dataset_config.yaml +++ b/demo/config/example5_config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for demo only tiny_seq: data_root: ../data/ - data_format: h5 - 
train_data: ../data/tiny_seq/train.h5 - valid_data: ../data/tiny_seq/valid.h5 - test_data: ../data/tiny_seq/test.h5 + data_format: npz + train_data: ../data/tiny_seq/train.npz + valid_data: ../data/tiny_seq/valid.npz + test_data: ../data/tiny_seq/test.npz diff --git a/demo/example1_build_dataset_to_h5.py b/demo/example1_build_dataset_to_npz.py similarity index 96% rename from demo/example1_build_dataset_to_h5.py rename to demo/example1_build_dataset_to_npz.py index e0d95f8..9b50e58 100644 --- a/demo/example1_build_dataset_to_h5.py +++ b/demo/example1_build_dataset_to_npz.py @@ -22,9 +22,8 @@ dataset_id=dataset_id, data_root=params["data_root"]) - # Build dataset from csv to h5 + # Build dataset build_dataset(feature_encoder, train_data=params["train_data"], valid_data=params["valid_data"], test_data=params["test_data"]) - diff --git a/demo/example2_DeepFM_with_h5_input.py b/demo/example2_DeepFM_with_npz_input.py similarity index 59% rename from demo/example2_DeepFM_with_h5_input.py rename to demo/example2_DeepFM_with_npz_input.py index eac7075..e7d6f14 100644 --- a/demo/example2_DeepFM_with_h5_input.py +++ b/demo/example2_DeepFM_with_npz_input.py @@ -7,14 +7,14 @@ from fuxictr.utils import load_config, set_logger, print_to_json from fuxictr.features import FeatureMap from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader +from fuxictr.pytorch.dataloaders import RankDataLoader from model_zoo import DeepFM if __name__ == '__main__': # Load params from config files config_dir = './config/example2_config' - experiment_id = 'DeepFM_test_h5' # corresponds to h5 input `data/tiny_h5` + experiment_id = 'DeepFM_test_npz' # corresponds to input `data/tiny_npz` params = load_config(config_dir, experiment_id) # set up logger and random seed @@ -29,13 +29,13 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - # Get train and validation data generators from h5 
- train_gen, valid_gen = H5DataLoader(feature_map, - stage='train', - train_data=params['train_data'], - valid_data=params['valid_data'], - batch_size=params['batch_size'], - shuffle=params['shuffle']).make_iterator() + # Get train and validation data generators + train_gen, valid_gen = RankDataLoader(feature_map, + stage='train', + train_data=params['train_data'], + valid_data=params['valid_data'], + batch_size=params['batch_size'], + shuffle=params['shuffle']).make_iterator() # Model initialization and fitting model = DeepFM(feature_map, **params) @@ -45,10 +45,9 @@ model.evaluate(valid_gen) logging.info('***** Test evaluation *****') - test_gen = H5DataLoader(feature_map, - stage='test', - test_data=params['test_data'], - batch_size=params['batch_size'], - shuffle=False).make_iterator() + test_gen = RankDataLoader(feature_map, + stage='test', + test_data=params['test_data'], + batch_size=params['batch_size'], + shuffle=False).make_iterator() model.evaluate(test_gen) - \ No newline at end of file diff --git a/demo/example3_DeepFM_with_csv_input.py b/demo/example3_DeepFM_with_csv_input.py index 3bb473c..26fd71f 100644 --- a/demo/example3_DeepFM_with_csv_input.py +++ b/demo/example3_DeepFM_with_csv_input.py @@ -7,7 +7,7 @@ from fuxictr.utils import load_config, set_logger, print_to_json from fuxictr.features import FeatureMap from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset from model_zoo import DeepFM @@ -15,7 +15,7 @@ if __name__ == '__main__': # Load params from config files config_dir = './config/example3_config' - experiment_id = 'DeepFM_test_csv' # corresponds to h5 input `data/tiny_h5` + experiment_id = 'DeepFM_test_csv' # corresponds to input `data/tiny_npz` params = load_config(config_dir, experiment_id) # set up logger and random seed @@ -29,7 +29,7 @@ 
dataset_id=params["dataset_id"], data_root=params["data_root"]) - # Build dataset from csv to h5, and remap data paths to h5 files + # Build dataset from csv to npz, and remap data paths to npz files params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, train_data=params["train_data"], @@ -42,13 +42,13 @@ feature_map.load(os.path.join(data_dir, "feature_map.json"), params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - # Get train and validation data generators from h5 - train_gen, valid_gen = H5DataLoader(feature_map, - stage='train', - train_data=params['train_data'], - valid_data=params['valid_data'], - batch_size=params['batch_size'], - shuffle=params['shuffle']).make_iterator() + # Get train and validation data generators + train_gen, valid_gen = RankDataLoader(feature_map, + stage='train', + train_data=params['train_data'], + valid_data=params['valid_data'], + batch_size=params['batch_size'], + shuffle=params['shuffle']).make_iterator() # Model initialization and fitting model = DeepFM(feature_map, **params) @@ -58,11 +58,11 @@ model.evaluate(valid_gen) logging.info('***** Test evaluation *****') - test_gen = H5DataLoader(feature_map, - stage='test', - test_data=params['test_data'], - batch_size=params['batch_size'], - shuffle=False).make_iterator() + test_gen = RankDataLoader(feature_map, + stage='test', + test_data=params['test_data'], + batch_size=params['batch_size'], + shuffle=False).make_iterator() model.evaluate(test_gen) diff --git a/demo/example4_DeepFM_with_pretrained_emb.py b/demo/example4_DeepFM_with_pretrained_emb.py index 9efa9f4..e8c2926 100644 --- a/demo/example4_DeepFM_with_pretrained_emb.py +++ b/demo/example4_DeepFM_with_pretrained_emb.py @@ -7,7 +7,7 @@ from fuxictr.utils import load_config, set_logger, print_to_json from fuxictr.features import FeatureMap from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader +from 
fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset from model_zoo import DeepFM @@ -29,7 +29,7 @@ dataset_id=params["dataset_id"], data_root=params["data_root"]) - # Build dataset from csv to h5, and remap data paths to h5 files + # Build dataset from csv to npz, and remap data paths to npz files params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, train_data=params["train_data"], @@ -42,13 +42,13 @@ feature_map.load(os.path.join(data_dir, "feature_map.json"), params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - # Get train and validation data generators from h5 - train_gen, valid_gen = H5DataLoader(feature_map, - stage='train', - train_data=params['train_data'], - valid_data=params['valid_data'], - batch_size=params['batch_size'], - shuffle=params['shuffle']).make_iterator() + # Get train and validation data generators + train_gen, valid_gen = RankDataLoader(feature_map, + stage='train', + train_data=params['train_data'], + valid_data=params['valid_data'], + batch_size=params['batch_size'], + shuffle=params['shuffle']).make_iterator() # Model initialization and fitting model = DeepFM(feature_map, **params) @@ -58,11 +58,11 @@ model.evaluate(valid_gen) logging.info('***** Test evaluation *****') - test_gen = H5DataLoader(feature_map, - stage='test', - test_data=params['test_data'], - batch_size=params['batch_size'], - shuffle=False).make_iterator() + test_gen = RankDataLoader(feature_map, + stage='test', + test_data=params['test_data'], + batch_size=params['batch_size'], + shuffle=False).make_iterator() model.evaluate(test_gen) diff --git a/demo/example5_DIN_with_sequence_feature.py b/demo/example5_DIN_with_sequence_feature.py index 8dda198..3a0704f 100644 --- a/demo/example5_DIN_with_sequence_feature.py +++ b/demo/example5_DIN_with_sequence_feature.py @@ -7,7 +7,7 @@ from fuxictr.utils import load_config, set_logger, print_to_json 
from fuxictr.features import FeatureMap from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader +from fuxictr.pytorch.dataloaders import RankDataLoader from model_zoo import DIN @@ -28,13 +28,13 @@ feature_map.load(os.path.join(data_dir, "feature_map.json"), params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - # Get train and validation data generators from h5 - train_gen, valid_gen = H5DataLoader(feature_map, - stage='train', - train_data=params['train_data'], - valid_data=params['valid_data'], - batch_size=params['batch_size'], - shuffle=params['shuffle']).make_iterator() + # Get train and validation data generators + train_gen, valid_gen = RankDataLoader(feature_map, + stage='train', + train_data=params['train_data'], + valid_data=params['valid_data'], + batch_size=params['batch_size'], + shuffle=params['shuffle']).make_iterator() # Model initialization and fitting model = DIN(feature_map, **params) @@ -44,10 +44,9 @@ model.evaluate(valid_gen) logging.info('***** Test evaluation *****') - test_gen = H5DataLoader(feature_map, - stage='test', - test_data=params['test_data'], - batch_size=params['batch_size'], - shuffle=False).make_iterator() + test_gen = RankDataLoader(feature_map, + stage='test', + test_data=params['test_data'], + batch_size=params['batch_size'], + shuffle=False).make_iterator() model.evaluate(test_gen) - diff --git a/experiment/config/DCN_tiny_h5_tuner_config.yaml b/experiment/config/DCN_tiny_npz_tuner_config.yaml similarity index 92% rename from experiment/config/DCN_tiny_h5_tuner_config.yaml rename to experiment/config/DCN_tiny_npz_tuner_config.yaml index f5a50a5..9797271 100644 --- a/experiment/config/DCN_tiny_h5_tuner_config.yaml +++ b/experiment/config/DCN_tiny_npz_tuner_config.yaml @@ -1,6 +1,6 @@ base_config: ../model_zoo/DCN/DCN_torch/config/ base_expid: DCN_default -dataset_id: tiny_h5 +dataset_id: tiny_npz tuner_space: model_root: './checkpoints/' diff 
--git a/experiment/fuxictr_version.py b/experiment/fuxictr_version.py index 5cfc920..1d402de 100644 --- a/experiment/fuxictr_version.py +++ b/experiment/fuxictr_version.py @@ -2,4 +2,4 @@ import sys sys.path.append("../") import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/experiment/run_expid.py b/experiment/run_expid.py index 722dd99..7acff75 100644 --- a/experiment/run_expid.py +++ b/experiment/run_expid.py @@ -24,7 +24,7 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap -from fuxictr.pytorch.dataloaders import H5DataLoader +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything from fuxictr.preprocess import FeatureProcessor, build_dataset import model_zoo @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -65,7 +65,7 @@ model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -76,7 +76,7 @@ test_result = {} if params["test_data"]: logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() 
test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' diff --git a/fuxictr/autotuner.py b/fuxictr/autotuner.py index 8ab13d8..5e4ba85 100644 --- a/fuxictr/autotuner.py +++ b/fuxictr/autotuner.py @@ -69,7 +69,7 @@ def enumerate_params(config_file, exclude_expid=[]): dataset_para_combs = dict() for idx, values in enumerate(itertools.product(*map(dataset_dict.get, dataset_para_keys))): dataset_params = dict(zip(dataset_para_keys, values)) - if dataset_params["data_format"] == "h5": + if dataset_params["data_format"] == "npz": dataset_para_combs[dataset_id] = dataset_params else: hash_id = hashlib.md5("".join(sorted(print_to_json(dataset_params))).encode("utf-8")).hexdigest()[0:8] diff --git a/fuxictr/datasets/avazu.py b/fuxictr/datasets/avazu.py index 4f65577..7853131 100644 --- a/fuxictr/datasets/avazu.py +++ b/fuxictr/datasets/avazu.py @@ -34,5 +34,3 @@ def _convert_weekend(timestamp): def convert_hour(self, df, col_name): return df['hour'].apply(lambda x: int(x[6:8])) - - diff --git a/fuxictr/datasets/criteo.py b/fuxictr/datasets/criteo.py index bd314e2..a3daedc 100644 --- a/fuxictr/datasets/criteo.py +++ b/fuxictr/datasets/criteo.py @@ -28,8 +28,3 @@ def _convert_to_bucket(value): value = int(value) return value return df[col_name].map(_convert_to_bucket).astype(int) - - - - - diff --git a/fuxictr/datasets/kkbox.py b/fuxictr/datasets/kkbox.py index 1fa1bbe..d50728b 100644 --- a/fuxictr/datasets/kkbox.py +++ b/fuxictr/datasets/kkbox.py @@ -45,5 +45,3 @@ def _bucketize(age): else: return "7" return df[col_name].apply(_bucketize) - - diff --git a/fuxictr/preprocess/build_dataset.py b/fuxictr/preprocess/build_dataset.py index 3e0941c..7f528fa 100644 --- a/fuxictr/preprocess/build_dataset.py +++ b/fuxictr/preprocess/build_dataset.py @@ -1,4 +1,5 @@ # ========================================================================= +# Copyright (C) 2024. FuxiCTR Authors. All rights reserved. # Copyright (C) 2022. 
Huawei Technologies Co., Ltd. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,7 +16,6 @@ # ========================================================================= -import h5py import os import logging import numpy as np @@ -23,14 +23,6 @@ import multiprocessing as mp -def save_h5(darray_dict, data_path): - logging.info("Saving data to h5: " + data_path) - os.makedirs(os.path.dirname(data_path), exist_ok=True) - with h5py.File(data_path, 'w') as hf: - for key, arr in darray_dict.items(): - hf.create_dataset(key, data=arr) - - def split_train_test(train_ddf=None, valid_ddf=None, test_ddf=None, valid_size=0, test_size=0, split_type="sequential"): num_samples = len(train_ddf) @@ -55,14 +47,20 @@ def split_train_test(train_ddf=None, valid_ddf=None, test_ddf=None, valid_size=0 return train_ddf, valid_ddf, test_ddf +def save_npz(darray_dict, data_path): + logging.info("Saving data to npz: " + data_path) + os.makedirs(os.path.dirname(data_path), exist_ok=True) + np.savez(data_path, **darray_dict) + + def transform_block(feature_encoder, df_block, filename, preprocess=False): if preprocess: df_block = feature_encoder.preprocess(df_block) darray_dict = feature_encoder.transform(df_block) - save_h5(darray_dict, os.path.join(feature_encoder.data_dir, filename)) + save_npz(darray_dict, os.path.join(feature_encoder.data_dir, filename)) -def transform_h5(feature_encoder, ddf, filename, preprocess=False, block_size=0): +def transform(feature_encoder, ddf, filename, preprocess=False, block_size=0): if block_size > 0: pool = mp.Pool(mp.cpu_count() // 2) block_id = 0 @@ -70,18 +68,18 @@ def transform_h5(feature_encoder, ddf, filename, preprocess=False, block_size=0) df_block = ddf[idx: (idx + block_size)] pool.apply_async(transform_block, args=(feature_encoder, df_block, - '{}/part_{:05d}.h5'.format(filename, block_id), + '{}/part_{:05d}.npz'.format(filename, block_id), preprocess)) block_id += 1 pool.close() pool.join() else: - 
transform_block(feature_encoder, ddf, filename + ".h5", preprocess) + transform_block(feature_encoder, ddf, filename, preprocess) def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=None, valid_size=0, test_size=0, split_type="sequential", data_block_size=0, **kwargs): - """ Build feature_map and transform h5 data """ + """ Build feature_map and transform data """ feature_map_json = os.path.join(feature_encoder.data_dir, "feature_map.json") if os.path.exists(feature_map_json): @@ -103,7 +101,7 @@ def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=N # fit and transform train_ddf train_ddf = feature_encoder.preprocess(train_ddf) feature_encoder.fit(train_ddf, **kwargs) - transform_h5(feature_encoder, train_ddf, 'train', preprocess=False, block_size=data_block_size) + transform(feature_encoder, train_ddf, 'train', preprocess=False, block_size=data_block_size) del train_ddf gc.collect() @@ -111,7 +109,7 @@ def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=N if valid_ddf is None and (valid_data is not None): valid_ddf = feature_encoder.read_csv(valid_data, **kwargs) if valid_ddf is not None: - transform_h5(feature_encoder, valid_ddf, 'valid', preprocess=True, block_size=data_block_size) + transform(feature_encoder, valid_ddf, 'valid', preprocess=True, block_size=data_block_size) del valid_ddf gc.collect() @@ -119,10 +117,10 @@ def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=N if test_ddf is None and (test_data is not None): test_ddf = feature_encoder.read_csv(test_data, **kwargs) if test_ddf is not None: - transform_h5(feature_encoder, test_ddf, 'test', preprocess=True, block_size=data_block_size) + transform(feature_encoder, test_ddf, 'test', preprocess=True, block_size=data_block_size) del test_ddf gc.collect() - logging.info("Transform csv data to h5 done.") + logging.info("Transform csv data to npz done.") # Return processed data splits return 
os.path.join(feature_encoder.data_dir, "train"), \ diff --git a/fuxictr/preprocess/feature_processor.py b/fuxictr/preprocess/feature_processor.py index 22cd347..7f6d147 100644 --- a/fuxictr/preprocess/feature_processor.py +++ b/fuxictr/preprocess/feature_processor.py @@ -156,6 +156,9 @@ def fit(self, train_ddf, min_categr_count=1, num_buckets=10, **kwargs): "vocab_size": tokenizer.vocab_size()}) else: self.feature_map.total_features += self.feature_map.features[name]["vocab_size"] + if "pretrained_emb" not in spec: # "oov_idx" not used without pretrained_emb + del self.feature_map.features[name]["oov_idx"] + self.feature_map.num_fields = self.feature_map.get_num_fields() self.feature_map.set_column_index() self.save_pickle(self.pickle_file) diff --git a/fuxictr/pytorch/dataloaders/__init__.py b/fuxictr/pytorch/dataloaders/__init__.py index bf50e94..4f56583 100644 --- a/fuxictr/pytorch/dataloaders/__init__.py +++ b/fuxictr/pytorch/dataloaders/__init__.py @@ -1,3 +1 @@ -from .h5_block_dataloader import H5BlockDataLoader -from .h5_dataloader import H5DataLoader -from .dataloader import DataLoader +from .rank_dataloader import RankDataLoader diff --git a/fuxictr/pytorch/dataloaders/dataloader.py b/fuxictr/pytorch/dataloaders/dataloader.py deleted file mode 100644 index 9101fd7..0000000 --- a/fuxictr/pytorch/dataloaders/dataloader.py +++ /dev/null @@ -1,30 +0,0 @@ -# ========================================================================= -# Copyright (C) 2024, FuxiCTR Authors. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ========================================================================= - - -from .h5_block_dataloader import H5BlockDataLoader -from .h5_dataloader import H5DataLoader - - -class DataLoader(H5DataLoader, H5BlockDataLoader): - def __init__(self, feature_map, stage="both", train_data=None, valid_data=None, test_data=None, - batch_size=32, shuffle=True, verbose=0, streaming=False, **kwargs): - if streaming: - H5BlockDataLoader.__init__(self, feature_map, stage, train_data, valid_data, test_data, - batch_size, shuffle, verbose, **kwargs) - else: - H5DataLoader.__init__(self, feature_map, stage, train_data, valid_data, test_data, - batch_size, shuffle, **kwargs) diff --git a/fuxictr/pytorch/dataloaders/h5_block_dataloader.py b/fuxictr/pytorch/dataloaders/npz_block_dataloader.py similarity index 51% rename from fuxictr/pytorch/dataloaders/h5_block_dataloader.py rename to fuxictr/pytorch/dataloaders/npz_block_dataloader.py index 01cbd3f..58ac5f6 100644 --- a/fuxictr/pytorch/dataloaders/h5_block_dataloader.py +++ b/fuxictr/pytorch/dataloaders/npz_block_dataloader.py @@ -1,5 +1,5 @@ # ========================================================================= -# Copyright (C) 2023. FuxiCTR Authors. All rights reserved. +# Copyright (C) 2023-2024. FuxiCTR Authors. All rights reserved. # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,23 +17,19 @@ import numpy as np -from fuxictr.utils import load_h5 -import h5py from itertools import chain import torch from torch.utils import data -import logging import glob -class BlockIterDataPipe(data.IterDataPipe): - def __init__(self, block_datapipe, feature_map, verbose=0): +class BlockDataPipe(data.IterDataPipe): + def __init__(self, block_datapipe, feature_map): self.feature_map = feature_map self.block_datapipe = block_datapipe - self.verbose = verbose def load_data(self, data_path): - data_dict = load_h5(data_path, verbose=self.verbose) + data_dict = np.load(data_path) data_arrays = [] all_cols = list(self.feature_map.features.keys()) + self.feature_map.labels for col in all_cols: @@ -63,10 +59,10 @@ def __iter__(self): return chain.from_iterable(map(self.read_block, block_list)) -class DataLoader(data.DataLoader): +class NpzBlockDataLoader(data.DataLoader): def __init__(self, feature_map, data_path, batch_size=32, shuffle=False, - num_workers=1, verbose=0, buffer_size=100000, **kwargs): - data_blocks = glob.glob(data_path + "/*.h5") + num_workers=1, buffer_size=100000, **kwargs): + data_blocks = glob.glob(data_path + "/*.npz") assert len(data_blocks) > 0, f"invalid data_path: {data_path}" if len(data_blocks) > 1: data_blocks.sort() # sort by part name @@ -75,10 +71,11 @@ def __init__(self, feature_map, data_path, batch_size=32, shuffle=False, self.feature_map = feature_map self.batch_size = batch_size self.num_batches, self.num_samples = self.count_batches_and_samples() - datapipe = BlockIterDataPipe(data_blocks, feature_map, verbose) + datapipe = BlockDataPipe(data_blocks, feature_map) if shuffle: datapipe = datapipe.shuffle(buffer_size=buffer_size) - super(DataLoader, self).__init__(dataset=datapipe, batch_size=batch_size, num_workers=num_workers) + super(NpzBlockDataLoader, self).__init__(dataset=datapipe, batch_size=batch_size, + num_workers=num_workers) def __len__(self): 
return self.num_batches @@ -87,41 +84,7 @@ def count_batches_and_samples(self): num_samples = 0 num_batches = 0 for block_path in self.data_blocks: - with h5py.File(block_path, 'r') as hf: - y = hf[self.feature_map.labels[0]][:] - num_samples += len(y) - num_batches += int(np.ceil(len(y) * 1.0 / self.batch_size)) + block_size = np.load(block_path)[self.feature_map.labels[0]].shape[0] + num_samples += block_size + num_batches += int(np.ceil(block_size * 1.0 / self.batch_size)) return num_batches, num_samples - - -class H5BlockDataLoader(object): - def __init__(self, feature_map, stage="both", train_data=None, valid_data=None, test_data=None, - batch_size=32, shuffle=True, verbose=0, **kwargs): - logging.info("Loading data...") - train_gen = None - valid_gen = None - test_gen = None - self.stage = stage - if stage in ["both", "train"]: - train_gen = DataLoader(feature_map, train_data, batch_size=batch_size, shuffle=shuffle, verbose=verbose, **kwargs) - logging.info("Train samples: total/{:d}, blocks/{:d}".format(train_gen.num_samples, train_gen.num_blocks)) - if valid_data: - valid_gen = DataLoader(feature_map, valid_data, batch_size=batch_size, shuffle=False, verbose=verbose, **kwargs) - logging.info("Validation samples: total/{:d}, blocks/{:d}".format(valid_gen.num_samples, valid_gen.num_blocks)) - - if stage in ["both", "test"]: - if test_data: - test_gen = DataLoader(feature_map, test_data, batch_size=batch_size, shuffle=False, verbose=verbose, **kwargs) - logging.info("Test samples: total/{:d}, blocks/{:d}".format(test_gen.num_samples, test_gen.num_blocks)) - self.train_gen, self.valid_gen, self.test_gen = train_gen, valid_gen, test_gen - - def make_iterator(self): - if self.stage == "train": - logging.info("Loading train and validation data done.") - return self.train_gen, self.valid_gen - elif self.stage == "test": - logging.info("Loading test data done.") - return self.test_gen - else: - logging.info("Loading data done.") - return self.train_gen, 
self.valid_gen, self.test_gen diff --git a/fuxictr/pytorch/dataloaders/npz_dataloader.py b/fuxictr/pytorch/dataloaders/npz_dataloader.py new file mode 100644 index 0000000..e5740a1 --- /dev/null +++ b/fuxictr/pytorch/dataloaders/npz_dataloader.py @@ -0,0 +1,60 @@ +# ========================================================================= +# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= + + +import numpy as np +from torch.utils import data +import torch + + +class Dataset(data.Dataset): + def __init__(self, feature_map, data_path): + self.feature_map = feature_map + self.darray = self.load_data(data_path) + + def __getitem__(self, index): + return self.darray[index, :] + + def __len__(self): + return self.darray.shape[0] + + def load_data(self, data_path): + data_dict = np.load(data_path) # dict of arrays + data_arrays = [] + all_cols = list(self.feature_map.features.keys()) + self.feature_map.labels + for col in all_cols: + array = data_dict[col] + if array.ndim == 1: + data_arrays.append(array.reshape(-1, 1)) + else: + data_arrays.append(array) + data_tensor = torch.from_numpy(np.hstack(data_arrays)) + return data_tensor + + +class NpzDataLoader(data.DataLoader): + def __init__(self, feature_map, data_path, batch_size=32, shuffle=False, num_workers=1, **kwargs): + if not data_path.endswith(".npz"): + data_path += ".npz" 
+ self.dataset = Dataset(feature_map, data_path) + super(NpzDataLoader, self).__init__(dataset=self.dataset, batch_size=batch_size, + shuffle=shuffle, num_workers=num_workers) + self.num_samples = len(self.dataset) + self.num_blocks = 1 + self.num_batches = int(np.ceil(self.num_samples * 1.0 / self.batch_size)) + + def __len__(self): + return self.num_batches diff --git a/fuxictr/pytorch/dataloaders/h5_dataloader.py b/fuxictr/pytorch/dataloaders/rank_dataloader.py similarity index 50% rename from fuxictr/pytorch/dataloaders/h5_dataloader.py rename to fuxictr/pytorch/dataloaders/rank_dataloader.py index d398ba3..1ca830f 100644 --- a/fuxictr/pytorch/dataloaders/h5_dataloader.py +++ b/fuxictr/pytorch/dataloaders/rank_dataloader.py @@ -1,5 +1,5 @@ # ========================================================================= -# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2024, FuxiCTR Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,70 +15,31 @@ # ========================================================================= -import numpy as np -from torch.utils import data -from fuxictr.utils import load_h5 -import torch +from .npz_block_dataloader import NpzBlockDataLoader +from .npz_dataloader import NpzDataLoader import logging -class Dataset(data.Dataset): - def __init__(self, feature_map, data_path): - self.feature_map = feature_map - self.darray = self.load_data_array(data_path) - - def __getitem__(self, index): - return self.darray[index, :] - - def __len__(self): - return self.darray.shape[0] - - def load_data_array(self, data_path): - data_dict = load_h5(data_path) # dict of arrays from h5 - data_arrays = [] - all_cols = list(self.feature_map.features.keys()) + self.feature_map.labels - for col in all_cols: - array = data_dict[col] - if array.ndim == 1: - data_arrays.append(array.reshape(-1, 1)) - else: - data_arrays.append(array) - data_tensor = torch.from_numpy(np.hstack(data_arrays)) - return data_tensor - - -class DataLoader(data.DataLoader): - def __init__(self, feature_map, data_path, batch_size=32, shuffle=False, num_workers=1, **kwargs): - if not data_path.endswith(".h5"): - data_path += ".h5" - self.dataset = Dataset(feature_map, data_path) - super(DataLoader, self).__init__(dataset=self.dataset, batch_size=batch_size, - shuffle=shuffle, num_workers=num_workers) - self.num_samples = len(self.dataset) - self.num_batches = int(np.ceil(self.num_samples * 1.0 / self.batch_size)) - - def __len__(self): - return self.num_batches - - -class H5DataLoader(object): +class RankDataLoader(object): def __init__(self, feature_map, stage="both", train_data=None, valid_data=None, test_data=None, - batch_size=32, shuffle=True, **kwargs): - logging.info("Loading data...") + batch_size=32, shuffle=True, streaming=False, **kwargs): + logging.info("Loading datasets...") train_gen = None valid_gen = None test_gen = None + DataLoader = NpzBlockDataLoader if streaming else NpzDataLoader 
self.stage = stage if stage in ["both", "train"]: train_gen = DataLoader(feature_map, train_data, batch_size=batch_size, shuffle=shuffle, **kwargs) - logging.info("Train samples: total/{:d}, blocks/{:d}".format(train_gen.num_samples, 1)) - if valid_data: + logging.info("Train samples: total/{:d}, blocks/{:d}".format(train_gen.num_samples, train_gen.num_blocks)) + if valid_data: valid_gen = DataLoader(feature_map, valid_data, batch_size=batch_size, shuffle=False, **kwargs) - logging.info("Validation samples: total/{:d}, blocks/{:d}".format(valid_gen.num_samples, 1)) + logging.info("Validation samples: total/{:d}, blocks/{:d}".format(valid_gen.num_samples, valid_gen.num_blocks)) + if stage in ["both", "test"]: if test_data: test_gen = DataLoader(feature_map, test_data, batch_size=batch_size, shuffle=False, **kwargs) - logging.info("Test samples: total/{:d}, blocks/{:d}".format(test_gen.num_samples, 1)) + logging.info("Test samples: total/{:d}, blocks/{:d}".format(test_gen.num_samples, test_gen.num_blocks)) self.train_gen, self.valid_gen, self.test_gen = train_gen, valid_gen, test_gen def make_iterator(self): diff --git a/fuxictr/pytorch/layers/embeddings/pretrained_embedding.py b/fuxictr/pytorch/layers/embeddings/pretrained_embedding.py index 194aab1..488333f 100644 --- a/fuxictr/pytorch/layers/embeddings/pretrained_embedding.py +++ b/fuxictr/pytorch/layers/embeddings/pretrained_embedding.py @@ -67,6 +67,7 @@ def reset_parameters(self, embedding_initializer): embedding_initializer(self.id_embedding.weight[1:self.oov_idx, :]) def get_pretrained_embedding(self, pretrain_path): + logging.info("Loading pretrained_emb: {}".format(pretrain_path)) if pretrain_path.endswith("h5"): with h5py.File(pretrain_path, 'r') as hf: keys = hf["key"][:] @@ -74,7 +75,6 @@ def get_pretrained_embedding(self, pretrain_path): elif pretrain_path.endswith("npz"): npz = np.load(pretrain_path) keys, embeddings = npz["key"], npz["value"] - logging.info("Loading pretrained_emb: 
{}".format(pretrain_path)) return keys, embeddings def load_feature_vocab(self, vocab_path, feature_name): diff --git a/fuxictr/pytorch/torch_utils.py b/fuxictr/pytorch/torch_utils.py index 2fd1c1a..10a6bf0 100644 --- a/fuxictr/pytorch/torch_utils.py +++ b/fuxictr/pytorch/torch_utils.py @@ -22,7 +22,6 @@ from torch import nn import random from functools import partial -import h5py import re @@ -117,27 +116,3 @@ def get_initializer(initializer): raise ValueError("initializer={} is not supported."\ .format(initializer)) return initializer - -def save_init_embs(model, data_path="init_embs.h5"): - emb_dict = dict() - for k, v in model.state_dict().items(): - if "embedding_layers" in k: - if v.size(-1) > 1: - f_name = re.findall(r"embedding_layers.(.*).weight", k)[0] - emb_dict[f_name] = v.cpu().numpy() - with h5py.File(data_path, 'w') as hf: - for key, arr in emb_dict.items(): - hf.create_dataset(key, data=arr) - -def load_init_embs(model, data_path="init_embs.h5"): - state_dict = model.state_dict() - f_name_dict = dict() - for k in state_dict.keys(): - if "embedding_layers" in k and state_dict[k].size(-1) > 1: - f_name = re.findall(r"embedding_layers.(.*).weight", k)[0] - f_name_dict[f_name] = k - with h5py.File(data_path, 'r') as hf: - for key in hf.keys(): - if key in f_name_dict: - state_dict[f_name_dict[key]] = torch.from_numpy(hf[key][:]) - model.load_state_dict(state_dict) \ No newline at end of file diff --git a/fuxictr/utils.py b/fuxictr/utils.py index 722be2b..0011ff6 100644 --- a/fuxictr/utils.py +++ b/fuxictr/utils.py @@ -21,7 +21,6 @@ import glob import json from collections import OrderedDict -import h5py def load_config(config_dir, experiment_id): @@ -105,12 +104,3 @@ def get_value(self, logs): def get_metrics(self): return list(self.kv_pairs.keys()) - -def load_h5(data_path, verbose=0): - if verbose == 0: - logging.info('Loading data from h5: ' + data_path) - data_dict = dict() - with h5py.File(data_path, 'r') as hf: - for key in hf.keys(): - 
data_dict[key] = hf[key][:] - return data_dict diff --git a/model_zoo/AFM/config/dataset_config.yaml b/model_zoo/AFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/AFM/config/dataset_config.yaml +++ b/model_zoo/AFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/AFM/config/model_config.yaml b/model_zoo/AFM/config/model_config.yaml index 821b192..5678983 100644 --- a/model_zoo/AFM/config/model_config.yaml +++ b/model_zoo/AFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: AFM_test: model: AFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/AFM/fuxictr_version.py b/model_zoo/AFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/AFM/fuxictr_version.py +++ b/model_zoo/AFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/AFM/run_expid.py b/model_zoo/AFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/AFM/run_expid.py +++ b/model_zoo/AFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ 
-53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/AFN/config/dataset_config.yaml b/model_zoo/AFN/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/AFN/config/dataset_config.yaml 
+++ b/model_zoo/AFN/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/AFN/config/model_config.yaml b/model_zoo/AFN/config/model_config.yaml index f4f59a2..f6c06f9 100644 --- a/model_zoo/AFN/config/model_config.yaml +++ b/model_zoo/AFN/config/model_config.yaml @@ -14,7 +14,7 @@ Base: AFN_test: model: AFN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/AFN/fuxictr_version.py b/model_zoo/AFN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/AFN/fuxictr_version.py +++ b/model_zoo/AFN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/AFN/run_expid.py b/model_zoo/AFN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/AFN/run_expid.py +++ b/model_zoo/AFN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + 
# Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/AOANet/config/dataset_config.yaml b/model_zoo/AOANet/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/AOANet/config/dataset_config.yaml +++ b/model_zoo/AOANet/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: 
../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/AOANet/config/model_config.yaml b/model_zoo/AOANet/config/model_config.yaml index eba26a5..a93e070 100644 --- a/model_zoo/AOANet/config/model_config.yaml +++ b/model_zoo/AOANet/config/model_config.yaml @@ -14,7 +14,7 @@ Base: AOANet_test: model: AOANet - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/AOANet/fuxictr_version.py b/model_zoo/AOANet/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/AOANet/fuxictr_version.py +++ b/model_zoo/AOANet/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/AOANet/run_expid.py b/model_zoo/AOANet/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/AOANet/run_expid.py +++ b/model_zoo/AOANet/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ 
build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/APG/config/dataset_config.yaml b/model_zoo/APG/config/dataset_config.yaml index 29d5abd..8496b4d 100644 --- a/model_zoo/APG/config/dataset_config.yaml +++ b/model_zoo/APG/config/dataset_config.yaml @@ -1,20 +1,7 @@ ### Tiny data for tests only -tiny_h5: - data_root: ../../../data/ - data_format: h5 - train_data: ../../../data/tiny_h5/train.h5 - valid_data: ../../../data/tiny_h5/valid.h5 - test_data: ../../../data/tiny_h5/test.h5 - -tiny_csv: - data_root: ../../../data/ - data_format: csv - train_data: 
../../../data/tiny_csv/train_sample.csv - valid_data: ../../../data/tiny_csv/valid_sample.csv - test_data: ../../../data/tiny_csv/test_sample.csv - min_categr_count: 1 - feature_cols: - [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", - "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], - active: True, dtype: str, type: categorical}] - label_col: {name: clk, dtype: float} +tiny_npz: + data_root: ../../data/ + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/APG/config/model_config.yaml b/model_zoo/APG/config/model_config.yaml index 5371bad..120fbac 100644 --- a/model_zoo/APG/config/model_config.yaml +++ b/model_zoo/APG/config/model_config.yaml @@ -14,7 +14,7 @@ Base: APG_DeepFM_test: model: APG_DeepFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification @@ -78,7 +78,7 @@ APG_DeepFM: # This is a config template APG_DCNv2_test: model: APG_DCNv2 - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/APG/fuxictr_version.py b/model_zoo/APG/fuxictr_version.py index 4a05f26..be6a36a 100644 --- a/model_zoo/APG/fuxictr_version.py +++ b/model_zoo/APG/fuxictr_version.py @@ -1,3 +1,3 @@ -# pip install fuxictr +# pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.2.post" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/APG/run_expid.py b/model_zoo/APG/run_expid.py index 2e5647b..2f099f4 100644 --- a/model_zoo/APG/run_expid.py +++ b/model_zoo/APG/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import 
RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import model +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, 
params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/AutoInt/config/dataset_config.yaml b/model_zoo/AutoInt/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/AutoInt/config/dataset_config.yaml +++ b/model_zoo/AutoInt/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/AutoInt/config/model_config.yaml b/model_zoo/AutoInt/config/model_config.yaml index 8f462fb..0ce228c 100644 --- a/model_zoo/AutoInt/config/model_config.yaml +++ b/model_zoo/AutoInt/config/model_config.yaml @@ -14,7 +14,7 @@ Base: AutoInt_test: model: AutoInt - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/AutoInt/fuxictr_version.py b/model_zoo/AutoInt/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/AutoInt/fuxictr_version.py +++ b/model_zoo/AutoInt/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/AutoInt/run_expid.py b/model_zoo/AutoInt/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/AutoInt/run_expid.py +++ b/model_zoo/AutoInt/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess 
import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/BST/config/dataset_config.yaml 
b/model_zoo/BST/config/dataset_config.yaml index fe4d11f..4bed1c3 100644 --- a/model_zoo/BST/config/dataset_config.yaml +++ b/model_zoo/BST/config/dataset_config.yaml @@ -1,10 +1,9 @@ ### Tiny data for tests only tiny_seq: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_seq/train.h5 - valid_data: ../../data/tiny_seq/valid.h5 - test_data: ../../data/tiny_seq/test.h5 - + data_format: npz + train_data: ../../data/tiny_seq/train.npz + valid_data: ../../data/tiny_seq/valid.npz + test_data: ../../data/tiny_seq/test.npz diff --git a/model_zoo/BST/fuxictr_version.py b/model_zoo/BST/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/BST/fuxictr_version.py +++ b/model_zoo/BST/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/BST/run_expid.py b/model_zoo/BST/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/BST/run_expid.py +++ b/model_zoo/BST/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) 
logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/CCPM/config/dataset_config.yaml b/model_zoo/CCPM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/CCPM/config/dataset_config.yaml +++ b/model_zoo/CCPM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git 
a/model_zoo/CCPM/config/model_config.yaml b/model_zoo/CCPM/config/model_config.yaml index 9acd8cd..4ebadce 100644 --- a/model_zoo/CCPM/config/model_config.yaml +++ b/model_zoo/CCPM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: CCPM_test: model: CCPM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/CCPM/fuxictr_version.py b/model_zoo/CCPM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/CCPM/fuxictr_version.py +++ b/model_zoo/CCPM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/CCPM/run_expid.py b/model_zoo/CCPM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/CCPM/run_expid.py +++ b/model_zoo/CCPM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, 
params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DCN/DCN_torch/config/dataset_config.yaml b/model_zoo/DCN/DCN_torch/config/dataset_config.yaml index df35c38..2d773e5 100644 --- a/model_zoo/DCN/DCN_torch/config/dataset_config.yaml +++ b/model_zoo/DCN/DCN_torch/config/dataset_config.yaml @@ -1,8 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../../data/ - data_format: h5 - train_data: ../../../data/tiny_h5/train.h5 - valid_data: ../../../data/tiny_h5/valid.h5 - test_data: ../../../data/tiny_h5/test.h5 - + data_format: npz + train_data: ../../../data/tiny_npz/train.npz + valid_data: ../../../data/tiny_npz/valid.npz + test_data: ../../../data/tiny_npz/test.npz diff --git a/model_zoo/DCN/DCN_torch/config/model_config.yaml b/model_zoo/DCN/DCN_torch/config/model_config.yaml index 492280c..a53318b 100644 
--- a/model_zoo/DCN/DCN_torch/config/model_config.yaml +++ b/model_zoo/DCN/DCN_torch/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DCN_test: model: DCN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DCN/DCN_torch/fuxictr_version.py b/model_zoo/DCN/DCN_torch/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DCN/DCN_torch/fuxictr_version.py +++ b/model_zoo/DCN/DCN_torch/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DCN/DCN_torch/run_expid.py b/model_zoo/DCN/DCN_torch/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DCN/DCN_torch/run_expid.py +++ b/model_zoo/DCN/DCN_torch/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = 
model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DCNv2/config/dataset_config.yaml b/model_zoo/DCNv2/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/DCNv2/config/dataset_config.yaml +++ b/model_zoo/DCNv2/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/DCNv2/config/model_config.yaml b/model_zoo/DCNv2/config/model_config.yaml index 8ed5bbc..4ce668b 100644 --- a/model_zoo/DCNv2/config/model_config.yaml +++ b/model_zoo/DCNv2/config/model_config.yaml 
@@ -14,7 +14,7 @@ Base: DCNv2_test: model: DCNv2 - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DCNv2/run_expid.py b/model_zoo/DCNv2/run_expid.py index b32ae2f..2f099f4 100644 --- a/model_zoo/DCNv2/run_expid.py +++ b/model_zoo/DCNv2/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen 
= DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: diff --git a/model_zoo/DESTINE/config/dataset_config.yaml b/model_zoo/DESTINE/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/DESTINE/config/dataset_config.yaml +++ b/model_zoo/DESTINE/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/DESTINE/config/model_config.yaml b/model_zoo/DESTINE/config/model_config.yaml index 9c7cec6..8d67230 100644 --- a/model_zoo/DESTINE/config/model_config.yaml +++ b/model_zoo/DESTINE/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DESTINE_test: model: DESTINE - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DESTINE/fuxictr_version.py b/model_zoo/DESTINE/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DESTINE/fuxictr_version.py +++ b/model_zoo/DESTINE/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DESTINE/run_expid.py b/model_zoo/DESTINE/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DESTINE/run_expid.py +++ 
b/model_zoo/DESTINE/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result 
= model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DIEN/config/dataset_config.yaml b/model_zoo/DIEN/config/dataset_config.yaml index 59e4725..4bed1c3 100644 --- a/model_zoo/DIEN/config/dataset_config.yaml +++ b/model_zoo/DIEN/config/dataset_config.yaml @@ -1,9 +1,9 @@ ### Tiny data for tests only tiny_seq: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_seq/train.h5 - valid_data: ../../data/tiny_seq/valid.h5 - test_data: ../../data/tiny_seq/test.h5 + data_format: npz + train_data: ../../data/tiny_seq/train.npz + valid_data: ../../data/tiny_seq/valid.npz + test_data: ../../data/tiny_seq/test.npz diff --git a/model_zoo/DIEN/fuxictr_version.py b/model_zoo/DIEN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DIEN/fuxictr_version.py +++ b/model_zoo/DIEN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DIEN/run_expid.py b/model_zoo/DIEN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DIEN/run_expid.py +++ b/model_zoo/DIEN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) 
feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DIN/config/dataset_config.yaml b/model_zoo/DIN/config/dataset_config.yaml index e570661..4bed1c3 100644 --- a/model_zoo/DIN/config/dataset_config.yaml +++ b/model_zoo/DIN/config/dataset_config.yaml @@ -1,23 +1,9 @@ ### Tiny data for 
tests only tiny_seq: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_seq/train.h5 - valid_data: ../../data/tiny_seq/valid.h5 - test_data: ../../data/tiny_seq/test.h5 + data_format: npz + train_data: ../../data/tiny_seq/train.npz + valid_data: ../../data/tiny_seq/valid.npz + test_data: ../../data/tiny_seq/test.npz -tiny_seq2: - data_root: ../../data/ - data_format: csv - train_data: ../../data/tiny_csv/train_sample.csv - valid_data: ../../data/tiny_csv/valid_sample.csv - test_data: ../../data/tiny_csv/test_sample.csv - min_categr_count: 1 - feature_cols: - [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", - "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], - active: True, dtype: str, type: categorical}, - {name: click_sequence, active: True, dtype: str, type: sequence, splitter: ^, max_len: 5, - share_embedding: adgroup_id, feature_encoder: null}] - label_col: {name: clk, dtype: float} diff --git a/model_zoo/DIN/fuxictr_version.py b/model_zoo/DIN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DIN/fuxictr_version.py +++ b/model_zoo/DIN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DIN/run_expid.py b/model_zoo/DIN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DIN/run_expid.py +++ b/model_zoo/DIN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = 
os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DLRM/config/dataset_config.yaml b/model_zoo/DLRM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/DLRM/config/dataset_config.yaml +++ 
b/model_zoo/DLRM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/DLRM/config/model_config.yaml b/model_zoo/DLRM/config/model_config.yaml index 0cbac02..ae811f3 100644 --- a/model_zoo/DLRM/config/model_config.yaml +++ b/model_zoo/DLRM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DLRM_test: model: DLRM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DLRM/fuxictr_version.py b/model_zoo/DLRM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DLRM/fuxictr_version.py +++ b/model_zoo/DLRM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DLRM/run_expid.py b/model_zoo/DLRM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DLRM/run_expid.py +++ b/model_zoo/DLRM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and 
transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DMIN/config/dataset_config.yaml b/model_zoo/DMIN/config/dataset_config.yaml index 59e4725..4bed1c3 100644 --- a/model_zoo/DMIN/config/dataset_config.yaml +++ b/model_zoo/DMIN/config/dataset_config.yaml @@ -1,9 +1,9 @@ ### Tiny data for tests only tiny_seq: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_seq/train.h5 - valid_data: 
../../data/tiny_seq/valid.h5 - test_data: ../../data/tiny_seq/test.h5 + data_format: npz + train_data: ../../data/tiny_seq/train.npz + valid_data: ../../data/tiny_seq/valid.npz + test_data: ../../data/tiny_seq/test.npz diff --git a/model_zoo/DMIN/fuxictr_version.py b/model_zoo/DMIN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DMIN/fuxictr_version.py +++ b/model_zoo/DMIN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DMIN/run_expid.py b/model_zoo/DMIN/run_expid.py index 2e5647b..2f099f4 100644 --- a/model_zoo/DMIN/run_expid.py +++ b/model_zoo/DMIN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import model +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, 
stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DMR/config/dataset_config.yaml b/model_zoo/DMR/config/dataset_config.yaml index 59e4725..4bed1c3 100644 --- a/model_zoo/DMR/config/dataset_config.yaml +++ b/model_zoo/DMR/config/dataset_config.yaml @@ -1,9 +1,9 @@ ### Tiny data for tests only tiny_seq: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_seq/train.h5 - valid_data: ../../data/tiny_seq/valid.h5 - test_data: ../../data/tiny_seq/test.h5 + data_format: npz + train_data: ../../data/tiny_seq/train.npz + valid_data: ../../data/tiny_seq/valid.npz + test_data: ../../data/tiny_seq/test.npz diff --git a/model_zoo/DMR/fuxictr_version.py b/model_zoo/DMR/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DMR/fuxictr_version.py +++ b/model_zoo/DMR/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DMR/run_expid.py b/model_zoo/DMR/run_expid.py 
index 2e5647b..2f099f4 100644 --- a/model_zoo/DMR/run_expid.py +++ b/model_zoo/DMR/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import model +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, 
stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DNN/DNN_torch/config/dataset_config.yaml b/model_zoo/DNN/DNN_torch/config/dataset_config.yaml index df35c38..19f977c 100644 --- a/model_zoo/DNN/DNN_torch/config/dataset_config.yaml +++ b/model_zoo/DNN/DNN_torch/config/dataset_config.yaml @@ -1,8 +1,8 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../../data/ - data_format: h5 - train_data: ../../../data/tiny_h5/train.h5 - valid_data: ../../../data/tiny_h5/valid.h5 - test_data: ../../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../../data/tiny_npz/train.npz + valid_data: ../../../data/tiny_npz/valid.npz + test_data: ../../../data/tiny_npz/test.npz diff --git a/model_zoo/DNN/DNN_torch/config/model_config.yaml b/model_zoo/DNN/DNN_torch/config/model_config.yaml index 2c19c52..aacc581 100644 --- a/model_zoo/DNN/DNN_torch/config/model_config.yaml +++ b/model_zoo/DNN/DNN_torch/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DNN_test: model: DNN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DNN/DNN_torch/fuxictr_version.py b/model_zoo/DNN/DNN_torch/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DNN/DNN_torch/fuxictr_version.py +++ b/model_zoo/DNN/DNN_torch/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DNN/DNN_torch/run_expid.py b/model_zoo/DNN/DNN_torch/run_expid.py index a1864f8..2f099f4 100644 --- 
a/model_zoo/DNN/DNN_torch/run_expid.py +++ b/model_zoo/DNN/DNN_torch/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, 
stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DSSM/config/dataset_config.yaml b/model_zoo/DSSM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/DSSM/config/dataset_config.yaml +++ b/model_zoo/DSSM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/DSSM/config/model_config.yaml b/model_zoo/DSSM/config/model_config.yaml index 5926889..2037b5f 100644 --- a/model_zoo/DSSM/config/model_config.yaml +++ b/model_zoo/DSSM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DSSM_test: model: DSSM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DSSM/fuxictr_version.py b/model_zoo/DSSM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DSSM/fuxictr_version.py +++ b/model_zoo/DSSM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DSSM/run_expid.py b/model_zoo/DSSM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DSSM/run_expid.py +++ b/model_zoo/DSSM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, 
print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 
+85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DeepCrossing/config/dataset_config.yaml b/model_zoo/DeepCrossing/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/DeepCrossing/config/dataset_config.yaml +++ b/model_zoo/DeepCrossing/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/DeepCrossing/config/model_config.yaml b/model_zoo/DeepCrossing/config/model_config.yaml index 9a4a20e..ddd336b 100644 --- a/model_zoo/DeepCrossing/config/model_config.yaml +++ b/model_zoo/DeepCrossing/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DeepCrossing_test: model: DeepCrossing - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DeepCrossing/fuxictr_version.py b/model_zoo/DeepCrossing/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DeepCrossing/fuxictr_version.py +++ b/model_zoo/DeepCrossing/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DeepCrossing/run_expid.py b/model_zoo/DeepCrossing/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DeepCrossing/run_expid.py +++ b/model_zoo/DeepCrossing/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from 
fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), 
' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DeepFM/DeepFM_torch/config/dataset_config.yaml b/model_zoo/DeepFM/DeepFM_torch/config/dataset_config.yaml index 29d5abd..8496b4d 100644 --- a/model_zoo/DeepFM/DeepFM_torch/config/dataset_config.yaml +++ b/model_zoo/DeepFM/DeepFM_torch/config/dataset_config.yaml @@ -1,20 +1,7 @@ ### Tiny data for tests only -tiny_h5: - data_root: ../../../data/ - data_format: h5 - train_data: ../../../data/tiny_h5/train.h5 - valid_data: ../../../data/tiny_h5/valid.h5 - test_data: ../../../data/tiny_h5/test.h5 - -tiny_csv: - data_root: ../../../data/ - data_format: csv - train_data: ../../../data/tiny_csv/train_sample.csv - valid_data: ../../../data/tiny_csv/valid_sample.csv - test_data: ../../../data/tiny_csv/test_sample.csv - min_categr_count: 1 - feature_cols: - [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], active: True, dtype: str, type: categorical}] - label_col: {name: clk, dtype: float} +tiny_npz: + data_root: ../../../data/ + data_format: npz + train_data: ../../../data/tiny_npz/train.npz + valid_data: ../../../data/tiny_npz/valid.npz + test_data: ../../../data/tiny_npz/test.npz diff --git a/model_zoo/DeepFM/DeepFM_torch/config/model_config.yaml b/model_zoo/DeepFM/DeepFM_torch/config/model_config.yaml index 41382b8..281b645 100644 --- a/model_zoo/DeepFM/DeepFM_torch/config/model_config.yaml +++ b/model_zoo/DeepFM/DeepFM_torch/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DeepFM_test: model: DeepFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DeepFM/DeepFM_torch/fuxictr_version.py b/model_zoo/DeepFM/DeepFM_torch/fuxictr_version.py index 82c2895..be6a36a 100644 ---
a/model_zoo/DeepFM/DeepFM_torch/fuxictr_version.py +++ b/model_zoo/DeepFM/DeepFM_torch/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DeepFM/DeepFM_torch/run_expid.py b/model_zoo/DeepFM/DeepFM_torch/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DeepFM/DeepFM_torch/run_expid.py +++ b/model_zoo/DeepFM/DeepFM_torch/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') 
@@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/DeepIM/config/dataset_config.yaml b/model_zoo/DeepIM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/DeepIM/config/dataset_config.yaml +++ b/model_zoo/DeepIM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/DeepIM/config/model_config.yaml b/model_zoo/DeepIM/config/model_config.yaml index 666330f..ecbf376 100644 --- a/model_zoo/DeepIM/config/model_config.yaml +++ b/model_zoo/DeepIM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DeepIM_test: model: DeepIM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/DeepIM/fuxictr_version.py b/model_zoo/DeepIM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/DeepIM/fuxictr_version.py +++ b/model_zoo/DeepIM/fuxictr_version.py @@ -1,3 
+1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/DeepIM/run_expid.py b/model_zoo/DeepIM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/DeepIM/run_expid.py +++ b/model_zoo/DeepIM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', 
**params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/EDCN/config/dataset_config.yaml b/model_zoo/EDCN/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/EDCN/config/dataset_config.yaml +++ b/model_zoo/EDCN/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/EDCN/config/model_config.yaml b/model_zoo/EDCN/config/model_config.yaml index b7151b5..b51a729 100644 --- a/model_zoo/EDCN/config/model_config.yaml +++ b/model_zoo/EDCN/config/model_config.yaml @@ -14,7 +14,7 @@ Base: EDCN_test: model: EDCN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/EDCN/fuxictr_version.py b/model_zoo/EDCN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/EDCN/fuxictr_version.py +++ b/model_zoo/EDCN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/EDCN/run_expid.py 
b/model_zoo/EDCN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/EDCN/run_expid.py +++ b/model_zoo/EDCN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') 
+ test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/ETA/config/dataset_config.yaml b/model_zoo/ETA/config/dataset_config.yaml index cecbd92..16c1bc3 100644 --- a/model_zoo/ETA/config/dataset_config.yaml +++ b/model_zoo/ETA/config/dataset_config.yaml @@ -1,8 +1,8 @@ ### Tiny data for tests only tiny_seq: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_seq/train.h5 - valid_data: ../../data/tiny_seq/valid.h5 - test_data: ../../data/tiny_seq/test.h5 + data_format: npz + train_data: ../../data/tiny_seq/train.npz + valid_data: ../../data/tiny_seq/valid.npz + test_data: ../../data/tiny_seq/test.npz diff --git a/model_zoo/ETA/fuxictr_version.py b/model_zoo/ETA/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/ETA/fuxictr_version.py +++ b/model_zoo/ETA/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/ETA/run_expid.py b/model_zoo/ETA/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/ETA/run_expid.py +++ b/model_zoo/ETA/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ 
data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FFM/config/dataset_config.yaml b/model_zoo/FFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FFM/config/dataset_config.yaml +++ 
b/model_zoo/FFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FFM/config/model_config.yaml b/model_zoo/FFM/config/model_config.yaml index b34cde9..8f1ea46 100644 --- a/model_zoo/FFM/config/model_config.yaml +++ b/model_zoo/FFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FFM_test: model: FFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification @@ -48,7 +48,7 @@ FFM_default: # This is a config template FFMv2_test: model: FFMv2 - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FFM/fuxictr_version.py b/model_zoo/FFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FFM/fuxictr_version.py +++ b/model_zoo/FFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FFM/run_expid.py b/model_zoo/FFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FFM/run_expid.py +++ b/model_zoo/FFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ 
data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FGCNN/config/dataset_config.yaml b/model_zoo/FGCNN/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FGCNN/config/dataset_config.yaml +++ 
b/model_zoo/FGCNN/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FGCNN/config/model_config.yaml b/model_zoo/FGCNN/config/model_config.yaml index b2223ff..066462b 100644 --- a/model_zoo/FGCNN/config/model_config.yaml +++ b/model_zoo/FGCNN/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FGCNN_test: model: FGCNN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FGCNN/fuxictr_version.py b/model_zoo/FGCNN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FGCNN/fuxictr_version.py +++ b/model_zoo/FGCNN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FGCNN/run_expid.py b/model_zoo/FGCNN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FGCNN/run_expid.py +++ b/model_zoo/FGCNN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build 
feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FLEN/config/dataset_config.yaml b/model_zoo/FLEN/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FLEN/config/dataset_config.yaml +++ b/model_zoo/FLEN/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: 
../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FLEN/config/model_config.yaml b/model_zoo/FLEN/config/model_config.yaml index 210dad2..84de0d2 100644 --- a/model_zoo/FLEN/config/model_config.yaml +++ b/model_zoo/FLEN/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FLEN_test: model: FLEN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FLEN/fuxictr_version.py b/model_zoo/FLEN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FLEN/fuxictr_version.py +++ b/model_zoo/FLEN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FLEN/run_expid.py b/model_zoo/FLEN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FLEN/run_expid.py +++ b/model_zoo/FLEN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ 
build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FM/config/dataset_config.yaml b/model_zoo/FM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FM/config/dataset_config.yaml +++ b/model_zoo/FM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: 
../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FM/config/model_config.yaml b/model_zoo/FM/config/model_config.yaml index b9408e2..e86532d 100644 --- a/model_zoo/FM/config/model_config.yaml +++ b/model_zoo/FM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FM_test: model: FM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FM/fuxictr_version.py b/model_zoo/FM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FM/fuxictr_version.py +++ b/model_zoo/FM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FM/run_expid.py b/model_zoo/FM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FM/run_expid.py +++ b/model_zoo/FM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, 
params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FiBiNET/config/dataset_config.yaml b/model_zoo/FiBiNET/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FiBiNET/config/dataset_config.yaml +++ b/model_zoo/FiBiNET/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FiBiNET/config/model_config.yaml b/model_zoo/FiBiNET/config/model_config.yaml index 187618a..1cc5500 100644 --- 
a/model_zoo/FiBiNET/config/model_config.yaml +++ b/model_zoo/FiBiNET/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FiBiNET_test: model: FiBiNET - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FiBiNET/fuxictr_version.py b/model_zoo/FiBiNET/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FiBiNET/fuxictr_version.py +++ b/model_zoo/FiBiNET/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FiBiNET/run_expid.py b/model_zoo/FiBiNET/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FiBiNET/run_expid.py +++ b/model_zoo/FiBiNET/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print 
number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FiGNN/config/dataset_config.yaml b/model_zoo/FiGNN/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FiGNN/config/dataset_config.yaml +++ b/model_zoo/FiGNN/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FiGNN/config/model_config.yaml b/model_zoo/FiGNN/config/model_config.yaml index 2990c32..88ea128 100644 --- a/model_zoo/FiGNN/config/model_config.yaml +++ b/model_zoo/FiGNN/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FiGNN_test: model: FiGNN - dataset_id: 
tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FiGNN/fuxictr_version.py b/model_zoo/FiGNN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FiGNN/fuxictr_version.py +++ b/model_zoo/FiGNN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FiGNN/run_expid.py b/model_zoo/FiGNN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FiGNN/run_expid.py +++ b/model_zoo/FiGNN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, 
stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FinalMLP/config/dataset_config.yaml b/model_zoo/FinalMLP/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FinalMLP/config/dataset_config.yaml +++ b/model_zoo/FinalMLP/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FinalMLP/config/model_config.yaml b/model_zoo/FinalMLP/config/model_config.yaml index a9294d2..4ab3d26 100644 --- a/model_zoo/FinalMLP/config/model_config.yaml +++ b/model_zoo/FinalMLP/config/model_config.yaml @@ -14,7 +14,7 @@ Base: DualMLP_test: model: DualMLP - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification @@ -66,7 +66,7 @@ DualMLP_default: # This is 
a config template FinalMLP_test: model: FinalMLP - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FinalMLP/run_expid.py b/model_zoo/FinalMLP/run_expid.py index 81ef34f..2f099f4 100644 --- a/model_zoo/FinalMLP/run_expid.py +++ b/model_zoo/FinalMLP/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - 
test_gen = DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FinalNet/config/dataset_config.yaml b/model_zoo/FinalNet/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FinalNet/config/dataset_config.yaml +++ b/model_zoo/FinalNet/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FinalNet/config/model_config.yaml b/model_zoo/FinalNet/config/model_config.yaml index 9c15c7b..76941b9 100644 --- a/model_zoo/FinalNet/config/model_config.yaml +++ b/model_zoo/FinalNet/config/model_config.yaml @@ -13,7 +13,7 @@ Base: FinalNet_test: model: FinalNet - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FinalNet/run_expid.py b/model_zoo/FinalNet/run_expid.py index f67c409..2f099f4 100644 --- a/model_zoo/FinalNet/run_expid.py +++ b/model_zoo/FinalNet/run_expid.py @@ -24,8 +24,8 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, 
print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset import src import gc @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -65,7 +65,7 @@ model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FmFM/config/dataset_config.yaml 
b/model_zoo/FmFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FmFM/config/dataset_config.yaml +++ b/model_zoo/FmFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FmFM/config/model_config.yaml b/model_zoo/FmFM/config/model_config.yaml index 0230a1f..bd460a4 100644 --- a/model_zoo/FmFM/config/model_config.yaml +++ b/model_zoo/FmFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FmFM_test: model: FmFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FmFM/fuxictr_version.py b/model_zoo/FmFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FmFM/fuxictr_version.py +++ b/model_zoo/FmFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FmFM/run_expid.py b/model_zoo/FmFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FmFM/run_expid.py +++ b/model_zoo/FmFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) 
feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/FwFM/config/dataset_config.yaml b/model_zoo/FwFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/FwFM/config/dataset_config.yaml +++ b/model_zoo/FwFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data 
for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/FwFM/config/model_config.yaml b/model_zoo/FwFM/config/model_config.yaml index b8c6a4a..8be7a7a 100644 --- a/model_zoo/FwFM/config/model_config.yaml +++ b/model_zoo/FwFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: FwFM_test: model: FwFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/FwFM/fuxictr_version.py b/model_zoo/FwFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/FwFM/fuxictr_version.py +++ b/model_zoo/FwFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/FwFM/run_expid.py b/model_zoo/FwFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/FwFM/run_expid.py +++ b/model_zoo/FwFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = 
FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/GDCN/config/dataset_config.yaml b/model_zoo/GDCN/config/dataset_config.yaml index c45414b..1a6ea41 100644 --- a/model_zoo/GDCN/config/dataset_config.yaml +++ b/model_zoo/GDCN/config/dataset_config.yaml @@ -1,8 +1,8 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 
+ data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/GDCN/config/model_config.yaml b/model_zoo/GDCN/config/model_config.yaml index f2cc6ef..dd89ad0 100644 --- a/model_zoo/GDCN/config/model_config.yaml +++ b/model_zoo/GDCN/config/model_config.yaml @@ -14,7 +14,7 @@ Base: GDCNP_test: model: GDCNP - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification @@ -37,7 +37,7 @@ GDCNP_test: GDCNS_test: model: GDCNS - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/GDCN/fuxictr_version.py b/model_zoo/GDCN/fuxictr_version.py index 666e5f0..be6a36a 100644 --- a/model_zoo/GDCN/fuxictr_version.py +++ b/model_zoo/GDCN/fuxictr_version.py @@ -1,3 +1,3 @@ -# pip install -U fuxictr -import fuxictr -assert fuxictr.__version__ >= "2.0.0" +# pip install -U fuxictr +import fuxictr +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/GDCN/run_expid.py b/model_zoo/GDCN/run_expid.py index 2d44f3b..2f099f4 100644 --- a/model_zoo/GDCN/run_expid.py +++ b/model_zoo/GDCN/run_expid.py @@ -1,88 +1,87 @@ -# ========================================================================= -# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ========================================================================= - - -import os -os.chdir(os.path.dirname(os.path.realpath(__file__))) -import sys -import logging -import fuxictr_version -from fuxictr import datasets -from datetime import datetime -from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list -from fuxictr.features import FeatureMap -from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader -from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo -import gc -import argparse -import os -from pathlib import Path - - -if __name__ == '__main__': - ''' Usage: python run_expid.py --config {config_dir} --expid {experiment_id} --gpu {gpu_device_id} - ''' - parser = argparse.ArgumentParser() - parser.add_argument('--config', type=str, default='./config/', help='The config directory.') - parser.add_argument('--expid', type=str, default='DeepFM_test', help='The experiment id to run.') - parser.add_argument('--gpu', type=int, default=-1, help='The gpu index, -1 for cpu') - args = vars(parser.parse_args()) - - experiment_id = args['expid'] - params = load_config(args['config'], experiment_id) - params['gpu'] = args['gpu'] - set_logger(params) - logging.info("Params: " + print_to_json(params)) - seed_everything(seed=params['seed']) - - data_dir = os.path.join(params['data_root'], params['dataset_id']) - feature_map_json = os.path.join(data_dir, "feature_map.json") - if params["data_format"] == "csv": - # Build feature_map and transform h5 data - feature_encoder = FeatureProcessor(**params) - params["train_data"], params["valid_data"], params["test_data"] = \ - build_dataset(feature_encoder, **params) - feature_map = FeatureMap(params['dataset_id'], data_dir) - feature_map.load(feature_map_json, params) - logging.info("Feature specs: " + print_to_json(feature_map.features)) - - model_class = getattr(model_zoo, params['model']) - model = 
model_class(feature_map, **params) - model.count_parameters() # print number of parameters used in model - - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() - model.fit(train_gen, validation_data=valid_gen, **params) - - logging.info('****** Validation evaluation ******') - valid_result = model.evaluate(valid_gen) - del train_gen, valid_gen - gc.collect() - - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() - test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) - - result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' - with open(result_filename, 'a+') as fw: - fw.write(' {},[command] python {},[exp_id] {},[dataset_id] {},[train] {},[val] {},[test] {}\n' \ - .format(datetime.now().strftime('%Y%m%d-%H%M%S'), - ' '.join(sys.argv), experiment_id, params['dataset_id'], - "N.A.", print_to_list(valid_result), print_to_list(test_result))) - +# ========================================================================= +# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ========================================================================= + + +import os +os.chdir(os.path.dirname(os.path.realpath(__file__))) +import sys +import logging +import fuxictr_version +from fuxictr import datasets +from datetime import datetime +from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list +from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader +from fuxictr.pytorch.torch_utils import seed_everything +from fuxictr.preprocess import FeatureProcessor, build_dataset +import src +import gc +import argparse +import os +from pathlib import Path + + +if __name__ == '__main__': + ''' Usage: python run_expid.py --config {config_dir} --expid {experiment_id} --gpu {gpu_device_id} + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, default='./config/', help='The config directory.') + parser.add_argument('--expid', type=str, default='DeepFM_test', help='The experiment id to run.') + parser.add_argument('--gpu', type=int, default=-1, help='The gpu index, -1 for cpu') + args = vars(parser.parse_args()) + + experiment_id = args['expid'] + params = load_config(args['config'], experiment_id) + params['gpu'] = args['gpu'] + set_logger(params) + logging.info("Params: " + print_to_json(params)) + seed_everything(seed=params['seed']) + + data_dir = os.path.join(params['data_root'], params['dataset_id']) + feature_map_json = os.path.join(data_dir, "feature_map.json") + if params["data_format"] == "csv": + # Build feature_map and transform data + feature_encoder = FeatureProcessor(**params) + params["train_data"], params["valid_data"], params["test_data"] = \ + build_dataset(feature_encoder, **params) + feature_map = FeatureMap(params['dataset_id'], data_dir) + feature_map.load(feature_map_json, params) + logging.info("Feature specs: " + print_to_json(feature_map.features)) + + model_class = getattr(src, params['model']) + model = model_class(feature_map, **params) 
+ model.count_parameters() # print number of parameters used in model + + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() + model.fit(train_gen, validation_data=valid_gen, **params) + + logging.info('****** Validation evaluation ******') + valid_result = model.evaluate(valid_gen) + del train_gen, valid_gen + gc.collect() + + test_result = {} + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) + + result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' + with open(result_filename, 'a+') as fw: + fw.write(' {},[command] python {},[exp_id] {},[dataset_id] {},[train] {},[val] {},[test] {}\n' \ + .format(datetime.now().strftime('%Y%m%d-%H%M%S'), + ' '.join(sys.argv), experiment_id, params['dataset_id'], + "N.A.", print_to_list(valid_result), print_to_list(test_result))) diff --git a/model_zoo/HFM/config/dataset_config.yaml b/model_zoo/HFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/HFM/config/dataset_config.yaml +++ b/model_zoo/HFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/HFM/config/model_config.yaml b/model_zoo/HFM/config/model_config.yaml index 25d533b..30708f8 100644 --- a/model_zoo/HFM/config/model_config.yaml +++ b/model_zoo/HFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: HFM_test: model: HFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git 
a/model_zoo/HFM/fuxictr_version.py b/model_zoo/HFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/HFM/fuxictr_version.py +++ b/model_zoo/HFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/HFM/run_expid.py b/model_zoo/HFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/HFM/run_expid.py +++ b/model_zoo/HFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation 
******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/HOFM/config/dataset_config.yaml b/model_zoo/HOFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/HOFM/config/dataset_config.yaml +++ b/model_zoo/HOFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/HOFM/config/model_config.yaml b/model_zoo/HOFM/config/model_config.yaml index 71fe38a..1a6a5ce 100644 --- a/model_zoo/HOFM/config/model_config.yaml +++ b/model_zoo/HOFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: HOFM_test: model: HOFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/HOFM/fuxictr_version.py b/model_zoo/HOFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/HOFM/fuxictr_version.py +++ b/model_zoo/HOFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U 
fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/HOFM/run_expid.py b/model_zoo/HOFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/HOFM/run_expid.py +++ b/model_zoo/HOFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() 
test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/InterHAt/config/dataset_config.yaml b/model_zoo/InterHAt/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/InterHAt/config/dataset_config.yaml +++ b/model_zoo/InterHAt/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/InterHAt/config/model_config.yaml b/model_zoo/InterHAt/config/model_config.yaml index 1fd37df..bfabc36 100644 --- a/model_zoo/InterHAt/config/model_config.yaml +++ b/model_zoo/InterHAt/config/model_config.yaml @@ -14,7 +14,7 @@ Base: InterHAt_test: model: InterHAt - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/InterHAt/fuxictr_version.py b/model_zoo/InterHAt/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/InterHAt/fuxictr_version.py +++ b/model_zoo/InterHAt/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git 
a/model_zoo/InterHAt/run_expid.py b/model_zoo/InterHAt/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/InterHAt/run_expid.py +++ b/model_zoo/InterHAt/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + 
logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/LR/config/dataset_config.yaml b/model_zoo/LR/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/LR/config/dataset_config.yaml +++ b/model_zoo/LR/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/LR/config/model_config.yaml b/model_zoo/LR/config/model_config.yaml index 822e70a..953917a 100644 --- a/model_zoo/LR/config/model_config.yaml +++ b/model_zoo/LR/config/model_config.yaml @@ -14,7 +14,7 @@ Base: LR_test: model: LR - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/LR/fuxictr_version.py b/model_zoo/LR/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/LR/fuxictr_version.py +++ b/model_zoo/LR/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/LR/run_expid.py b/model_zoo/LR/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/LR/run_expid.py +++ b/model_zoo/LR/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from 
fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", 
"") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/LorentzFM/config/dataset_config.yaml b/model_zoo/LorentzFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/LorentzFM/config/dataset_config.yaml +++ b/model_zoo/LorentzFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/LorentzFM/config/model_config.yaml b/model_zoo/LorentzFM/config/model_config.yaml index 528b9e6..1a4dd33 100644 --- a/model_zoo/LorentzFM/config/model_config.yaml +++ b/model_zoo/LorentzFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: LorentzFM_test: model: LorentzFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/LorentzFM/fuxictr_version.py b/model_zoo/LorentzFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/LorentzFM/fuxictr_version.py +++ b/model_zoo/LorentzFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/LorentzFM/run_expid.py b/model_zoo/LorentzFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/LorentzFM/run_expid.py +++ b/model_zoo/LorentzFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap 
+from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ 
.format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/MaskNet/config/dataset_config.yaml b/model_zoo/MaskNet/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/MaskNet/config/dataset_config.yaml +++ b/model_zoo/MaskNet/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/MaskNet/config/model_config.yaml b/model_zoo/MaskNet/config/model_config.yaml index 5283ed4..d6183a7 100644 --- a/model_zoo/MaskNet/config/model_config.yaml +++ b/model_zoo/MaskNet/config/model_config.yaml @@ -14,7 +14,7 @@ Base: MaskNet_test: model: MaskNet - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/MaskNet/fuxictr_version.py b/model_zoo/MaskNet/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/MaskNet/fuxictr_version.py +++ b/model_zoo/MaskNet/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/MaskNet/run_expid.py b/model_zoo/MaskNet/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/MaskNet/run_expid.py +++ b/model_zoo/MaskNet/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import 
seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", 
print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/NFM/config/dataset_config.yaml b/model_zoo/NFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/NFM/config/dataset_config.yaml +++ b/model_zoo/NFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/NFM/config/model_config.yaml b/model_zoo/NFM/config/model_config.yaml index caccb4b..bc6a936 100644 --- a/model_zoo/NFM/config/model_config.yaml +++ b/model_zoo/NFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: NFM_test: model: NFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/NFM/fuxictr_version.py b/model_zoo/NFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/NFM/fuxictr_version.py +++ b/model_zoo/NFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/NFM/run_expid.py b/model_zoo/NFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/NFM/run_expid.py +++ b/model_zoo/NFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse 
import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/ONN/ONN_torch/config/dataset_config.yaml b/model_zoo/ONN/ONN_torch/config/dataset_config.yaml index 4ad2136..2d773e5 100644 --- 
a/model_zoo/ONN/ONN_torch/config/dataset_config.yaml +++ b/model_zoo/ONN/ONN_torch/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: - data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 +tiny_npz: + data_root: ../../../data/ + data_format: npz + train_data: ../../../data/tiny_npz/train.npz + valid_data: ../../../data/tiny_npz/valid.npz + test_data: ../../../data/tiny_npz/test.npz diff --git a/model_zoo/ONN/ONN_torch/config/model_config.yaml b/model_zoo/ONN/ONN_torch/config/model_config.yaml index b9a2ad4..a5b909b 100644 --- a/model_zoo/ONN/ONN_torch/config/model_config.yaml +++ b/model_zoo/ONN/ONN_torch/config/model_config.yaml @@ -14,7 +14,7 @@ Base: ONN_test: model: ONN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification @@ -58,7 +58,7 @@ ONN_default: # This is a config template ONNv2_test: model: ONNv2 - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: binary_crossentropy metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/ONN/ONN_torch/fuxictr_version.py b/model_zoo/ONN/ONN_torch/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/ONN/ONN_torch/fuxictr_version.py +++ b/model_zoo/ONN/ONN_torch/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/ONN/ONN_torch/run_expid.py b/model_zoo/ONN/ONN_torch/run_expid.py index 2e5647b..2f099f4 100644 --- a/model_zoo/ONN/ONN_torch/run_expid.py +++ b/model_zoo/ONN/ONN_torch/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import 
seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import model +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), 
print_to_list(test_result))) - diff --git a/model_zoo/PEPNet/config/dataset_config.yaml b/model_zoo/PEPNet/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/PEPNet/config/dataset_config.yaml +++ b/model_zoo/PEPNet/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/PEPNet/config/model_config.yaml b/model_zoo/PEPNet/config/model_config.yaml index 82ffc08..b8fa7b6 100644 --- a/model_zoo/PEPNet/config/model_config.yaml +++ b/model_zoo/PEPNet/config/model_config.yaml @@ -13,7 +13,7 @@ Base: PPNet_test: model: PPNet - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/PEPNet/fuxictr_version.py b/model_zoo/PEPNet/fuxictr_version.py index 7760495..be6a36a 100644 --- a/model_zoo/PEPNet/fuxictr_version.py +++ b/model_zoo/PEPNet/fuxictr_version.py @@ -1,3 +1,3 @@ -# pip install fuxictr +# pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.1" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/PEPNet/run_expid.py b/model_zoo/PEPNet/run_expid.py index 2e5647b..2f099f4 100644 --- a/model_zoo/PEPNet/run_expid.py +++ b/model_zoo/PEPNet/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import model 
+import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/PNN/config/dataset_config.yaml b/model_zoo/PNN/config/dataset_config.yaml index 4ad2136..8496b4d 100644 
--- a/model_zoo/PNN/config/dataset_config.yaml +++ b/model_zoo/PNN/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/PNN/config/model_config.yaml b/model_zoo/PNN/config/model_config.yaml index 013e703..1852a1f 100644 --- a/model_zoo/PNN/config/model_config.yaml +++ b/model_zoo/PNN/config/model_config.yaml @@ -14,7 +14,7 @@ Base: PNN_test: model: PNN - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/PNN/fuxictr_version.py b/model_zoo/PNN/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/PNN/fuxictr_version.py +++ b/model_zoo/PNN/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/PNN/run_expid.py b/model_zoo/PNN/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/PNN/run_expid.py +++ b/model_zoo/PNN/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == 
"csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/SAM/config/dataset_config.yaml b/model_zoo/SAM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/SAM/config/dataset_config.yaml +++ b/model_zoo/SAM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: 
../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/SAM/config/model_config.yaml b/model_zoo/SAM/config/model_config.yaml index 052dcb6..58de3e8 100644 --- a/model_zoo/SAM/config/model_config.yaml +++ b/model_zoo/SAM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: SAM_test: model: SAM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/SAM/fuxictr_version.py b/model_zoo/SAM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/SAM/fuxictr_version.py +++ b/model_zoo/SAM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/SAM/run_expid.py b/model_zoo/SAM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/SAM/run_expid.py +++ b/model_zoo/SAM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ 
build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/SDIM/config/dataset_config.yaml b/model_zoo/SDIM/config/dataset_config.yaml index cecbd92..16c1bc3 100644 --- a/model_zoo/SDIM/config/dataset_config.yaml +++ b/model_zoo/SDIM/config/dataset_config.yaml @@ -1,8 +1,8 @@ ### Tiny data for tests only tiny_seq: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_seq/train.h5 - valid_data: ../../data/tiny_seq/valid.h5 - test_data: ../../data/tiny_seq/test.h5 + data_format: npz + train_data: ../../data/tiny_seq/train.npz + valid_data: 
../../data/tiny_seq/valid.npz + test_data: ../../data/tiny_seq/test.npz diff --git a/model_zoo/SDIM/fuxictr_version.py b/model_zoo/SDIM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/SDIM/fuxictr_version.py +++ b/model_zoo/SDIM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/SDIM/run_expid.py b/model_zoo/SDIM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/SDIM/run_expid.py +++ b/model_zoo/SDIM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() 
model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/WideDeep/WideDeep_torch/config/dataset_config.yaml b/model_zoo/WideDeep/WideDeep_torch/config/dataset_config.yaml index 5681a58..2d773e5 100644 --- a/model_zoo/WideDeep/WideDeep_torch/config/dataset_config.yaml +++ b/model_zoo/WideDeep/WideDeep_torch/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../../data/ - data_format: h5 - train_data: ../../../data/tiny_h5/train.h5 - valid_data: ../../../data/tiny_h5/valid.h5 - test_data: ../../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../../data/tiny_npz/train.npz + valid_data: ../../../data/tiny_npz/valid.npz + test_data: ../../../data/tiny_npz/test.npz diff --git a/model_zoo/WideDeep/WideDeep_torch/config/model_config.yaml b/model_zoo/WideDeep/WideDeep_torch/config/model_config.yaml index 50f8a7a..70f400d 100644 --- a/model_zoo/WideDeep/WideDeep_torch/config/model_config.yaml +++ b/model_zoo/WideDeep/WideDeep_torch/config/model_config.yaml @@ -14,7 +14,7 @@ Base: WideDeep_test: model: WideDeep - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 
'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/WideDeep/WideDeep_torch/fuxictr_version.py b/model_zoo/WideDeep/WideDeep_torch/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/WideDeep/WideDeep_torch/fuxictr_version.py +++ b/model_zoo/WideDeep/WideDeep_torch/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/WideDeep/WideDeep_torch/run_expid.py b/model_zoo/WideDeep/WideDeep_torch/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/WideDeep/WideDeep_torch/run_expid.py +++ b/model_zoo/WideDeep/WideDeep_torch/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = 
H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) - diff --git a/model_zoo/__init__.py b/model_zoo/__init__.py index 36cd377..cded9c0 100644 --- a/model_zoo/__init__.py +++ b/model_zoo/__init__.py @@ -24,6 +24,7 @@ from .FiBiNET.src import FiBiNET from .FiGNN.src import FiGNN from .FinalMLP.src import FinalMLP +from .FinalNet.src import FinalNet from .FLEN.src import FLEN from .FM.src import FM from .FmFM.src import FmFM diff --git a/model_zoo/multitask/MMoE/fuxictr_version.py b/model_zoo/multitask/MMoE/fuxictr_version.py index 9a0fbb2..be6a36a 100644 --- a/model_zoo/multitask/MMoE/fuxictr_version.py +++ b/model_zoo/multitask/MMoE/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.1" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/multitask/MMoE/run_expid.py b/model_zoo/multitask/MMoE/run_expid.py index a1864f8..f706b8c 100644 --- a/model_zoo/multitask/MMoE/run_expid.py +++ b/model_zoo/multitask/MMoE/run_expid.py @@ -25,7 
+25,7 @@ from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset import src as model_zoo import gc @@ -65,7 +65,7 @@ model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -74,7 +74,7 @@ gc.collect() logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} if test_gen: test_result = model.evaluate(test_gen) diff --git a/model_zoo/multitask/PLE/fuxictr_version.py b/model_zoo/multitask/PLE/fuxictr_version.py index bb05cc9..be6a36a 100644 --- a/model_zoo/multitask/PLE/fuxictr_version.py +++ b/model_zoo/multitask/PLE/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.3" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/multitask/PLE/run_expid.py b/model_zoo/multitask/PLE/run_expid.py index a1864f8..f706b8c 100644 --- a/model_zoo/multitask/PLE/run_expid.py +++ b/model_zoo/multitask/PLE/run_expid.py @@ -25,7 +25,7 @@ from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader +from fuxictr.pytorch.dataloaders import RankDataLoader 
from fuxictr.preprocess import FeatureProcessor, build_dataset import src as model_zoo import gc @@ -65,7 +65,7 @@ model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -74,7 +74,7 @@ gc.collect() logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} if test_gen: test_result = model.evaluate(test_gen) diff --git a/model_zoo/multitask/SharedBottom/fuxictr_version.py b/model_zoo/multitask/SharedBottom/fuxictr_version.py index 9a0fbb2..be6a36a 100644 --- a/model_zoo/multitask/SharedBottom/fuxictr_version.py +++ b/model_zoo/multitask/SharedBottom/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.1" +assert fuxictr.__version__ >= "2.2.0" diff --git a/model_zoo/multitask/SharedBottom/run_expid.py b/model_zoo/multitask/SharedBottom/run_expid.py index c63d74e..85aa41f 100644 --- a/model_zoo/multitask/SharedBottom/run_expid.py +++ b/model_zoo/multitask/SharedBottom/run_expid.py @@ -25,7 +25,7 @@ from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset import src as model_zoo import gc @@ -65,7 +65,7 @@ model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in 
model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -74,7 +74,7 @@ gc.collect() logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} if test_gen: test_result = model.evaluate(test_gen) diff --git a/model_zoo/xDeepFM/config/dataset_config.yaml b/model_zoo/xDeepFM/config/dataset_config.yaml index 4ad2136..8496b4d 100644 --- a/model_zoo/xDeepFM/config/dataset_config.yaml +++ b/model_zoo/xDeepFM/config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for tests only -tiny_h5: +tiny_npz: data_root: ../../data/ - data_format: h5 - train_data: ../../data/tiny_h5/train.h5 - valid_data: ../../data/tiny_h5/valid.h5 - test_data: ../../data/tiny_h5/test.h5 + data_format: npz + train_data: ../../data/tiny_npz/train.npz + valid_data: ../../data/tiny_npz/valid.npz + test_data: ../../data/tiny_npz/test.npz diff --git a/model_zoo/xDeepFM/config/model_config.yaml b/model_zoo/xDeepFM/config/model_config.yaml index 9f03d5a..bb6ce69 100644 --- a/model_zoo/xDeepFM/config/model_config.yaml +++ b/model_zoo/xDeepFM/config/model_config.yaml @@ -14,7 +14,7 @@ Base: xDeepFM_test: model: xDeepFM - dataset_id: tiny_h5 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/model_zoo/xDeepFM/fuxictr_version.py b/model_zoo/xDeepFM/fuxictr_version.py index 82c2895..be6a36a 100644 --- a/model_zoo/xDeepFM/fuxictr_version.py +++ b/model_zoo/xDeepFM/fuxictr_version.py @@ -1,3 +1,3 @@ # pip install -U fuxictr import fuxictr -assert fuxictr.__version__ >= "2.0.0" +assert fuxictr.__version__ >= "2.2.0" diff --git 
a/model_zoo/xDeepFM/run_expid.py b/model_zoo/xDeepFM/run_expid.py index a1864f8..2f099f4 100644 --- a/model_zoo/xDeepFM/run_expid.py +++ b/model_zoo/xDeepFM/run_expid.py @@ -24,10 +24,10 @@ from datetime import datetime from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list from fuxictr.features import FeatureMap +from fuxictr.pytorch.dataloaders import RankDataLoader from fuxictr.pytorch.torch_utils import seed_everything -from fuxictr.pytorch.dataloaders import H5DataLoader from fuxictr.preprocess import FeatureProcessor, build_dataset -import src as model_zoo +import src import gc import argparse import os @@ -53,7 +53,7 @@ data_dir = os.path.join(params['data_root'], params['dataset_id']) feature_map_json = os.path.join(data_dir, "feature_map.json") if params["data_format"] == "csv": - # Build feature_map and transform h5 data + # Build feature_map and transform data feature_encoder = FeatureProcessor(**params) params["train_data"], params["valid_data"], params["test_data"] = \ build_dataset(feature_encoder, **params) @@ -61,11 +61,11 @@ feature_map.load(feature_map_json, params) logging.info("Feature specs: " + print_to_json(feature_map.features)) - model_class = getattr(model_zoo, params['model']) + model_class = getattr(src, params['model']) model = model_class(feature_map, **params) model.count_parameters() # print number of parameters used in model - train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() + train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator() model.fit(train_gen, validation_data=valid_gen, **params) logging.info('****** Validation evaluation ******') @@ -73,11 +73,11 @@ del train_gen, valid_gen gc.collect() - logging.info('******** Test evaluation ********') - test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() test_result = {} - if test_gen: - test_result = model.evaluate(test_gen) + if params["test_data"]: + 
logging.info('******** Test evaluation ********') + test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator() + test_result = model.evaluate(test_gen) result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv' with open(result_filename, 'a+') as fw: @@ -85,4 +85,3 @@ .format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result), print_to_list(test_result))) -