Change data format from h5 to npz

reczoo · Feb 19, 2024 · df04ae1 · df04ae1
1 parent f10aadc
commit df04ae1
Show file tree

Hide file tree

Showing 227 changed files with 1,025 additions and 1,235 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,21 +6,22 @@
 
 **FuxiCTR v2.2.0, 2024-02-17**
 + [Feature] Add support of npz format for pretrained_emb
++ [Refactor] Change data format from h5 to npz
 
 -------------------------------
 
 ### FuxiCTR v2.1
 
 **FuxiCTR v2.1.3, 2024-02-17**
 + [Feature] Add GDCN model
-+ [Edit] Rename FINAL model to FinalNet
-+ [Edit] Update RecZoo URLs
++ [Refactor] Rename FINAL model to FinalNet
++ [Refactor] Update RecZoo URLs
 + [Fix] Fix bug #75
 + [Fix] Fix h5 file extension issue
 + [Fix] Fix typo in FinalNet
 
 **FuxiCTR v2.1.2, 2023-11-01**
-+ [Edit] Update H5DataBlockLoader to support dataloader with multiprocessing
++ [Refactor] Update H5DataBlockLoader to support dataloader with multiprocessing
 
 **FuxiCTR v2.1.1, 2023-10-26**
 + [Feature] Update to allow loading pretrained h5 directly in PretrainedEmbedding (skip key mapping in preprocess)
@@ -45,7 +46,7 @@
 
 **FuxiCTR v2.0.2, 2023-05-14**
 + [Feature] Update FINAL, DIEN
-+ [Edit] Update ordered_features to use_features
++ [Refactor] Update ordered_features to use_features
 
 **FuxiCTR v2.0.1, 2023-02-15**
 + [Doc] Add fuxictr tutorials
@@ -55,7 +56,7 @@
 **FuxiCTR v2.0.0, 2023-01-19**
 + [Feature] Add more models of year 2021-2022.
 + [Feature] Add tensorflow backbone support
-+ [Edit] Refine code structure to support model development with minimal code
++ [Refactor] Refine code structure to support model development with minimal code
 
 -------------------------------
 
@@ -85,7 +86,7 @@
 + [Feature] Add new feature for loading blocks of h5 data
 + [Feature] Add tests for DIN, FmFM
 + [Feature] Add support for multiple fields concat for DIN
-+ [Edit] Remove the unnecessary config of embedding_dropout because it does not help after some attempts
++ [Refactor] Remove the unnecessary config of embedding_dropout because it does not help after some attempts
 + [Feature] Add embedding_hooks of dense layers on pretrained embeddings
 + [Fix] Fix the bug in padding_idx (has no effect on Criteo/Avazu results)
 + [Fix] Fix the bug in loading pretrained embeddings (has no effect on Criteo/Avazu results)
@@ -96,7 +97,7 @@
 ### FuxiCTR v1.0
 
 **FuxiCTR v1.0.2, 2021-12-01**
-+ [Edit] Refactor the code and documentation to support reproducing the BARS-CTR benchmark.
++ [Refactor] Refactor the code and documentation to support reproducing the BARS-CTR benchmark.
 
 **FuxiCTR v1.0.1, 2021-10-01**
 + [Feature] The first release of FuxiCTR, including 28 models. This version was used for the CIKM'21 paper.
diff --git a/data/tiny_csv/userid_emb_dim8.h5 b/data/tiny_csv/userid_emb_dim8.h5
diff --git a/data/tiny_csv/userid_emb_dim8.npz b/data/tiny_csv/userid_emb_dim8.npz
diff --git a/data/tiny_h5/test.h5 b/data/tiny_h5/test.h5
diff --git a/data/tiny_h5/train.h5 b/data/tiny_h5/train.h5
diff --git a/data/tiny_h5/valid.h5 b/data/tiny_h5/valid.h5
diff --git a/data/tiny_h5/feature_map.json → data/tiny_npz/feature_map.json b/data/tiny_h5/feature_map.json → data/tiny_npz/feature_map.json
@@ -12,7 +12,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 25,
  "vocab_size": 26
  }
  },
@@ -21,7 +20,6 @@
  "source": "item",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 95,
  "vocab_size": 96
  }
  },
@@ -30,7 +28,6 @@
  "source": "context",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  },
@@ -39,7 +36,6 @@
  "source": "item",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 48,
  "vocab_size": 49
  }
  },
@@ -48,7 +44,6 @@
  "source": "item",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 98,
  "vocab_size": 99
  }
  },
@@ -57,7 +52,6 @@
  "source": "item",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 97,
  "vocab_size": 98
  }
  },
@@ -66,7 +60,6 @@
  "source": "item",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 66,
  "vocab_size": 67
  }
  },
@@ -75,7 +68,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 10,
  "vocab_size": 11
  }
  },
@@ -84,7 +76,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 10,
  "vocab_size": 11
  }
  },
@@ -93,7 +84,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  },
@@ -102,7 +92,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 6,
  "vocab_size": 7
  }
  },
@@ -111,7 +100,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  },
@@ -120,7 +108,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 4,
  "vocab_size": 5
  }
  },
@@ -129,7 +116,6 @@
  "source": "user",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  }

diff --git a/data/tiny_npz/test.npz b/data/tiny_npz/test.npz
diff --git a/data/tiny_npz/train.npz b/data/tiny_npz/train.npz
diff --git a/data/tiny_npz/valid.npz b/data/tiny_npz/valid.npz
diff --git a/data/tiny_seq/feature_map.json b/data/tiny_seq/feature_map.json
@@ -12,7 +12,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 25,
  "vocab_size": 26
  }
  },
@@ -21,7 +20,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 95,
  "vocab_size": 96
  }
  },
@@ -30,7 +28,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  },
@@ -39,7 +36,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 48,
  "vocab_size": 49
  }
  },
@@ -48,7 +44,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 98,
  "vocab_size": 99
  }
  },
@@ -57,7 +52,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 97,
  "vocab_size": 98
  }
  },
@@ -66,7 +60,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 66,
  "vocab_size": 67
  }
  },
@@ -75,7 +68,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 10,
  "vocab_size": 11
  }
  },
@@ -84,7 +76,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 10,
  "vocab_size": 11
  }
  },
@@ -93,7 +84,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  },
@@ -102,7 +92,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 6,
  "vocab_size": 7
  }
  },
@@ -111,7 +100,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  },
@@ -120,7 +108,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 4,
  "vocab_size": 5
  }
  },
@@ -129,7 +116,6 @@
  "source": "",
  "type": "categorical",
  "padding_idx": 0,
- "oov_idx": 3,
  "vocab_size": 4
  }
  },
@@ -139,10 +125,9 @@
  "type": "sequence",
  "share_embedding": "adgroup_id",
  "padding_idx": 0,
- "oov_idx": 95,
  "vocab_size": 96,
  "max_len": 5
  }
  }
  ]
-}
+}
diff --git a/data/tiny_seq/test.h5 b/data/tiny_seq/test.h5
diff --git a/data/tiny_seq/test.npz b/data/tiny_seq/test.npz
diff --git a/data/tiny_seq/train.h5 b/data/tiny_seq/train.h5
diff --git a/data/tiny_seq/train.npz b/data/tiny_seq/train.npz
diff --git a/data/tiny_seq/valid.h5 b/data/tiny_seq/valid.h5
diff --git a/data/tiny_seq/valid.npz b/data/tiny_seq/valid.npz
diff --git a/demo/config/example2_config/dataset_config.yaml b/demo/config/example2_config/dataset_config.yaml
@@ -1,8 +1,7 @@
 ### Tiny data for demo only
-tiny_h5:
+tiny_npz:
  data_root: ../data/
- data_format: h5
- train_data: ../data/tiny_h5/train.h5
- valid_data: ../data/tiny_h5/valid.h5
- test_data: ../data/tiny_h5/test.h5
-
+ data_format: npz
+ train_data: ../data/tiny_npz/train.npz
+ valid_data: ../data/tiny_npz/valid.npz
+ test_data: ../data/tiny_npz/test.npz
diff --git a/demo/config/example4_config/dataset_config.yaml b/demo/config/example4_config/dataset_config.yaml
@@ -7,7 +7,7 @@ tiny_example4:
  test_data: ../data/tiny_csv/test_sample.csv
  min_categr_count: 1
  feature_cols:
- [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.h5",
+ [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz",
  embedding_dim: 8, freeze_emb: True},
  {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
  "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 

diff --git a/demo/config/example5_config/dataset_config.yaml b/demo/config/example5_config/dataset_config.yaml
@@ -1,7 +1,7 @@
 ### Tiny data for demo only
 tiny_seq:
  data_root: ../data/
- data_format: h5
- train_data: ../data/tiny_seq/train.h5
- valid_data: ../data/tiny_seq/valid.h5
- test_data: ../data/tiny_seq/test.h5
+ data_format: npz
+ train_data: ../data/tiny_seq/train.npz
+ valid_data: ../data/tiny_seq/valid.npz
+ test_data: ../data/tiny_seq/test.npz
diff --git a/demo/example1_build_dataset_to_h5.py → demo/example1_build_dataset_to_npz.py b/demo/example1_build_dataset_to_h5.py → demo/example1_build_dataset_to_npz.py
@@ -22,9 +22,8 @@
  dataset_id=dataset_id, 
  data_root=params["data_root"])
 
- # Build dataset from csv to h5
+ # Build dataset
  build_dataset(feature_encoder, 
  train_data=params["train_data"],
  valid_data=params["valid_data"],
  test_data=params["test_data"])
-
diff --git a/demo/example2_DeepFM_with_h5_input.py → demo/example2_DeepFM_with_npz_input.py b/demo/example2_DeepFM_with_h5_input.py → demo/example2_DeepFM_with_npz_input.py
@@ -7,14 +7,14 @@
 from fuxictr.utils import load_config, set_logger, print_to_json
 from fuxictr.features import FeatureMap
 from fuxictr.pytorch.torch_utils import seed_everything
-from fuxictr.pytorch.dataloaders import H5DataLoader
+from fuxictr.pytorch.dataloaders import RankDataLoader
 from model_zoo import DeepFM
 
 
 if __name__ == '__main__':
  # Load params from config files
  config_dir = './config/example2_config'
- experiment_id = 'DeepFM_test_h5' # corresponds to h5 input `data/tiny_h5`
+ experiment_id = 'DeepFM_test_npz' # corresponds to input `data/tiny_npz`
  params = load_config(config_dir, experiment_id)
 
  # set up logger and random seed
@@ -29,13 +29,13 @@
  feature_map.load(feature_map_json, params)
  logging.info("Feature specs: " + print_to_json(feature_map.features))
 
- # Get train and validation data generators from h5
- train_gen, valid_gen = H5DataLoader(feature_map, 
- stage='train', 
- train_data=params['train_data'],
- valid_data=params['valid_data'],
- batch_size=params['batch_size'],
- shuffle=params['shuffle']).make_iterator()
+ # Get train and validation data generators
+ train_gen, valid_gen = RankDataLoader(feature_map,
+  stage='train',
+  train_data=params['train_data'],
+  valid_data=params['valid_data'],
+  batch_size=params['batch_size'],
+  shuffle=params['shuffle']).make_iterator()
 
  # Model initialization and fitting
  model = DeepFM(feature_map, **params)
@@ -45,10 +45,9 @@
  model.evaluate(valid_gen)
 
  logging.info('***** Test evaluation *****')
- test_gen = H5DataLoader(feature_map, 
- stage='test',
- test_data=params['test_data'],
- batch_size=params['batch_size'],
- shuffle=False).make_iterator()
+ test_gen = RankDataLoader(feature_map, 
+  stage='test',
+  test_data=params['test_data'],
+  batch_size=params['batch_size'],
+  shuffle=False).make_iterator()
  model.evaluate(test_gen)
-