From e6c0a53cca0634ea5fec2e699dd9702248099597 Mon Sep 17 00:00:00 2001 From: "yancheng.lgq" Date: Thu, 4 Aug 2022 17:42:41 +0800 Subject: [PATCH 1/3] fix neg sampler bug for sequence feature --- docs/source/feature/feature.rst | 3 + easy_rec/python/layers/input_layer.py | 15 +- easy_rec/python/protos/feature_config.proto | 1 + easy_rec/python/test/train_eval_test.py | 18 + ...undary_only_sequence_feature_taobao.config | 347 ++++++++++++++++++ ...m_neg_sampler_only_sequence_feature.config | 302 +++++++++++++++ .../dssm_neg_sampler_sequence_feature.config | 302 +++++++++++++++ 7 files changed, 986 insertions(+), 2 deletions(-) create mode 100644 samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config create mode 100644 samples/model_config/dssm_neg_sampler_only_sequence_feature.config create mode 100644 samples/model_config/dssm_neg_sampler_sequence_feature.config diff --git a/docs/source/feature/feature.rst b/docs/source/feature/feature.rst index 517bb0345..45d0ec6a4 100644 --- a/docs/source/feature/feature.rst +++ b/docs/source/feature/feature.rst @@ -270,6 +270,7 @@ Sequense类特征格式一般为“XX\|XX\|XX”,如用户行为序列特征 sequence_features: { group_name: "seq_fea" allow_key_search: true + only_sequence_feature:false seq_att_map: { key: "brand" key: "cate_id" @@ -281,6 +282,8 @@ Sequense类特征格式一般为“XX\|XX\|XX”,如用户行为序列特征 - sequence_features: 序列特征组的名称 - allow_key_search: 当 key 对应的特征没有在 feature_groups 里面时,需要设置为 true, 将会复用对应特征的 embedding. +- only_sequence_feature : 默认为 false, 指过完 target attention 之后的特征会和 key 对应的特征 concat 之后返回。 + 设置为 true 时,将会只返回过完 target attention 之后的特征。 - seq_att_map: 具体细节可以参考排序里的 DIN 模型。 - NOTE:SequenceFeature一般放在 user 组里面。 diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py index 33fc9e959..d2000c039 100644 --- a/easy_rec/python/layers/input_layer.py +++ b/easy_rec/python/layers/input_layer.py @@ -68,13 +68,18 @@ def __init__(self, def has_group(self, group_name): return group_name in self._feature_groups - def target_attention(self, dnn_config, deep_fea, name): + def target_attention(self, + dnn_config, + deep_fea, + name, + only_sequence_feature=False): cur_id, hist_id_col, seq_len = deep_fea['key'], deep_fea[ 'hist_seq_emb'], deep_fea['hist_seq_len'] seq_max_len = tf.shape(hist_id_col)[1] emb_dim = hist_id_col.shape[2] + cur_id = cur_id[:tf.shape(hist_id_col)[0], ...] # for negative sampler cur_ids = tf.tile(cur_id, [1, seq_max_len]) cur_ids = tf.reshape(cur_ids, tf.shape(hist_id_col)) # (B, seq_max_len, emb_dim) @@ -96,6 +101,8 @@ def target_attention(self, dnn_config, deep_fea, name): scores = tf.nn.softmax(scores) # (B, 1, seq_max_len) hist_din_emb = tf.matmul(scores, hist_id_col) # [B, 1, emb_dim] hist_din_emb = tf.reshape(hist_din_emb, [-1, emb_dim]) # [B, emb_dim] + if only_sequence_feature: + return hist_din_emb din_output = tf.concat([hist_din_emb, cur_id], axis=1) return din_output @@ -108,6 +115,7 @@ def call_seq_input_layer(self, for seq_att_map_config in all_seq_att_map_config: group_name = seq_att_map_config.group_name allow_key_search = seq_att_map_config.allow_key_search + only_sequence_feature = seq_att_map_config.only_sequence_feature seq_features = self._seq_input_layer(features, group_name, feature_name_to_output_tensors, allow_key_search) @@ -128,7 +136,10 @@ def call_seq_input_layer(self, seq_dnn_config.hidden_units.extend([128, 64, 32, 1]) cur_target_attention_name = 'seq_dnn' + group_name seq_fea = self.target_attention( - seq_dnn_config, seq_features, name=cur_target_attention_name) + seq_dnn_config, + seq_features, + name=cur_target_attention_name, + only_sequence_feature=only_sequence_feature) all_seq_fea.append(seq_fea) # concat all seq_fea all_seq_fea = tf.concat(all_seq_fea, axis=1) diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto index 349735015..ba06e25e7 100644 --- a/easy_rec/python/protos/feature_config.proto +++ b/easy_rec/python/protos/feature_config.proto @@ -148,4 +148,5 @@ message SeqAttGroupConfig { optional bool tf_summary = 3 [default = false]; optional DNN seq_dnn = 4; optional bool allow_key_search = 5 [default = false]; + optional bool only_sequence_feature = 6 [default = false]; } diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py index 4ec1e8ec4..801ad3e95 100644 --- a/easy_rec/python/test/train_eval_test.py +++ b/easy_rec/python/test/train_eval_test.py @@ -845,6 +845,24 @@ def test_distribute_eval_esmm(self): cur_eval_path, self._test_dir) self.assertTrue(self._success) + def test_dssm_neg_sampler_sequence_feature(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dssm_neg_sampler_sequence_feature.config', + self._test_dir) + self.assertTrue(self._success) + + def test_dssm_neg_sampler_only_sequence_feature(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dssm_neg_sampler_only_sequence_feature.config', + self._test_dir) + self.assertTrue(self._success) + + def test_dbmtl_on_multi_numeric_boundary_only_sequence_feature(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + if __name__ == '__main__': tf.test.main() diff --git a/samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config new file mode 100644 index 000000000..f1b645555 --- /dev/null +++ b/samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config @@ -0,0 +1,347 @@ +train_input_path: "data/test/tb_data/taobao_multi_seq_train_data" +eval_input_path: "data/test/tb_data/taobao_multi_seq_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + boundaries: 15.0 + boundaries: 20.0 + boundaries: 21.0 + boundaries: 23.0 + boundaries: 30.0 + boundaries: 32.0 + boundaries: 40.0 + boundaries: 47.0 + boundaries: 66.0 + boundaries: 70.0 + boundaries: 77.0 + boundaries: 87.0 + boundaries: 99.0 + boundaries: 120.0 + boundaries: 148.0 + boundaries: 188.0 + boundaries: 199.0 + boundaries: 235.0 + boundaries: 301.0 + boundaries: 443.0 + boundaries: 597.0 + boundaries: 1314.0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + boundaries: 15.0 + boundaries: 20.0 + boundaries: 21.0 + boundaries: 23.0 + boundaries: 30.0 + boundaries: 32.0 + boundaries: 40.0 + boundaries: 47.0 + boundaries: 66.0 + boundaries: 70.0 + boundaries: 77.0 + boundaries: 87.0 + boundaries: 99.0 + boundaries: 120.0 + boundaries: 148.0 + boundaries: 188.0 + boundaries: 199.0 + boundaries: 235.0 + boundaries: 301.0 + boundaries: 443.0 + boundaries: 597.0 + boundaries: 1314.0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + only_sequence_feature:true + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dssm_neg_sampler_only_sequence_feature.config b/samples/model_config/dssm_neg_sampler_only_sequence_feature.config new file mode 100644 index 000000000..eb41ce7d6 --- /dev/null +++ b/samples/model_config/dssm_neg_sampler_only_sequence_feature.config @@ -0,0 +1,302 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dssm_neg_sampler_sequence_feature" + +train_config { + optimizer_config: { + adam_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 1000 + sync_replicas: false + save_checkpoints_steps: 100 + log_step_count_steps: 10 +} + +eval_config { + metrics_set: { + auc { + } + } + metrics_set: { + gauc { + uid_field: "user_id" + } + } +} + +data_config { + batch_size: 1024 + input_fields { + input_name:'clk' + input_type: INT32 + } + input_fields { + input_name:'buy' + input_type: INT32 + } + input_fields { + input_name: 'pid' + input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: STRING + } + input_fields { + input_name: 'pvalue_level' + input_type: STRING + } + input_fields { + input_name: 'shopping_level' + input_type: STRING + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 'new_user_class_level' + input_type: STRING + } + input_fields { + input_name: 'tag_category_list' + input_type: STRING + } + input_fields { + input_name: 'tag_brand_list' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: INT32 + } + + label_fields: 'clk' + num_epochs: 5 + prefetch_size: 4 + input_type: CSVInput + + negative_sampler { + input_path: 'data/test/tb_data/taobao_ad_feature_gl' + num_sample: 256 + num_eval_sample: 4096 + attr_fields: 'adgroup_id' + attr_fields: 'cate_id' + attr_fields: 'campaign_id' + attr_fields: 'customer' + attr_fields: 'brand' + item_id_field: 'adgroup_id' + } +} + +feature_configs : { + input_names: 'pid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs : { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs : { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs : { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'age_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'pvalue_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'shopping_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'new_user_class_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" +} +feature_configs : { + input_names: 'price' + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config:{ + model_class: "DSSM" + feature_groups: { + group_name: 'user' + feature_names: 'user_id' + feature_names: 'cms_segid' + feature_names: 'cms_group_id' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'occupation' + feature_names: 'new_user_class_level' + wide_deep:DEEP + sequence_features: { + group_name: "seq_fea" + allow_key_search: true + only_sequence_feature:true + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + feature_groups: { + group_name: "item" + feature_names: 'adgroup_id' + feature_names: 'cate_id' + feature_names: 'campaign_id' + feature_names: 'customer' + feature_names: 'brand' + wide_deep:DEEP + } + dssm { + user_tower { + id: "user_id" + dnn { + hidden_units: [256, 128, 64, 32] + # dropout_ratio : [0.1, 0.1, 0.1, 0.1] + } + } + item_tower { + id: "adgroup_id" + dnn { + hidden_units: [256, 128, 64, 32] + } + } + l2_regularization: 1e-6 + } + loss_type: SOFTMAX_CROSS_ENTROPY + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dssm_neg_sampler_sequence_feature.config b/samples/model_config/dssm_neg_sampler_sequence_feature.config new file mode 100644 index 000000000..462504090 --- /dev/null +++ b/samples/model_config/dssm_neg_sampler_sequence_feature.config @@ -0,0 +1,302 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dssm_neg_sampler_sequence_feature" + +train_config { + optimizer_config: { + adam_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 1000 + sync_replicas: false + save_checkpoints_steps: 100 + log_step_count_steps: 10 +} + +eval_config { + metrics_set: { + auc { + } + } + metrics_set: { + gauc { + uid_field: "user_id" + } + } +} + +data_config { + batch_size: 1024 + input_fields { + input_name:'clk' + input_type: INT32 + } + input_fields { + input_name:'buy' + input_type: INT32 + } + input_fields { + input_name: 'pid' + input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: STRING + } + input_fields { + input_name: 'pvalue_level' + input_type: STRING + } + input_fields { + input_name: 'shopping_level' + input_type: STRING + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 'new_user_class_level' + input_type: STRING + } + input_fields { + input_name: 'tag_category_list' + input_type: STRING + } + input_fields { + input_name: 'tag_brand_list' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: INT32 + } + + label_fields: 'clk' + num_epochs: 5 + prefetch_size: 4 + input_type: CSVInput + + negative_sampler { + input_path: 'data/test/tb_data/taobao_ad_feature_gl' + num_sample: 256 + num_eval_sample: 4096 + attr_fields: 'adgroup_id' + attr_fields: 'cate_id' + attr_fields: 'campaign_id' + attr_fields: 'customer' + attr_fields: 'brand' + item_id_field: 'adgroup_id' + } +} + +feature_configs : { + input_names: 'pid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs : { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs : { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs : { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs : { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'age_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'pvalue_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'shopping_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs : { + input_names: 'new_user_class_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" +} +feature_configs : { + input_names: 'price' + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config:{ + model_class: "DSSM" + feature_groups: { + group_name: 'user' + feature_names: 'user_id' + feature_names: 'cms_segid' + feature_names: 'cms_group_id' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'occupation' + feature_names: 'new_user_class_level' + wide_deep:DEEP + sequence_features: { + group_name: "seq_fea" + allow_key_search: true + only_sequence_feature:false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + feature_groups: { + group_name: "item" + feature_names: 'adgroup_id' + feature_names: 'cate_id' + feature_names: 'campaign_id' + feature_names: 'customer' + feature_names: 'brand' + wide_deep:DEEP + } + dssm { + user_tower { + id: "user_id" + dnn { + hidden_units: [256, 128, 64, 32] + # dropout_ratio : [0.1, 0.1, 0.1, 0.1] + } + } + item_tower { + id: "adgroup_id" + dnn { + hidden_units: [256, 128, 64, 32] + } + } + l2_regularization: 1e-6 + } + loss_type: SOFTMAX_CROSS_ENTROPY + embedding_regularization: 5e-6 +} From ca7e1a730699caf111a9674a07869e354e95901d Mon Sep 17 00:00:00 2001 From: "yancheng.lgq" Date: Thu, 4 Aug 2022 19:41:22 +0800 Subject: [PATCH 2/3] fix bug for tf2 --- easy_rec/python/test/train_eval_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py index 801ad3e95..81e96625e 100644 --- a/easy_rec/python/test/train_eval_test.py +++ b/easy_rec/python/test/train_eval_test.py @@ -845,18 +845,21 @@ def test_distribute_eval_esmm(self): cur_eval_path, self._test_dir) self.assertTrue(self._success) + @unittest.skipIf(gl is None, 'graphlearn is not installed') def test_dssm_neg_sampler_sequence_feature(self): self._success = test_utils.test_single_train_eval( 'samples/model_config/dssm_neg_sampler_sequence_feature.config', self._test_dir) self.assertTrue(self._success) + @unittest.skipIf(gl is None, 'graphlearn is not installed') def test_dssm_neg_sampler_only_sequence_feature(self): self._success = test_utils.test_single_train_eval( 'samples/model_config/dssm_neg_sampler_only_sequence_feature.config', self._test_dir) self.assertTrue(self._success) + @unittest.skipIf(gl is None, 'graphlearn is not installed') def test_dbmtl_on_multi_numeric_boundary_only_sequence_feature(self): self._success = test_utils.test_single_train_eval( 'samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config', From 36e506b58ed9fe6a0fd8b7fe500a49b3f457831d Mon Sep 17 00:00:00 2001 From: "yancheng.lgq" Date: Fri, 5 Aug 2022 11:45:10 +0800 Subject: [PATCH 3/3] add need_key_feature --- docs/source/feature/feature.rst | 6 +++--- easy_rec/python/layers/input_layer.py | 12 ++++-------- easy_rec/python/protos/feature_config.proto | 2 +- easy_rec/python/test/train_eval_test.py | 9 ++++----- ..._numeric_boundary_need_key_feature_taobao.config} | 2 +- ...nfig => dssm_neg_sampler_need_key_feature.config} | 2 +- .../dssm_neg_sampler_sequence_feature.config | 2 +- 7 files changed, 15 insertions(+), 20 deletions(-) rename samples/model_config/{dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config => dbmtl_on_multi_numeric_boundary_need_key_feature_taobao.config} (99%) rename samples/model_config/{dssm_neg_sampler_only_sequence_feature.config => dssm_neg_sampler_need_key_feature.config} (99%) diff --git a/docs/source/feature/feature.rst b/docs/source/feature/feature.rst index 45d0ec6a4..73ec2b57c 100644 --- a/docs/source/feature/feature.rst +++ b/docs/source/feature/feature.rst @@ -270,7 +270,7 @@ Sequense类特征格式一般为“XX\|XX\|XX”,如用户行为序列特征 sequence_features: { group_name: "seq_fea" allow_key_search: true - only_sequence_feature:false + need_key_feature:true seq_att_map: { key: "brand" key: "cate_id" @@ -282,8 +282,8 @@ Sequense类特征格式一般为“XX\|XX\|XX”,如用户行为序列特征 - sequence_features: 序列特征组的名称 - allow_key_search: 当 key 对应的特征没有在 feature_groups 里面时,需要设置为 true, 将会复用对应特征的 embedding. -- only_sequence_feature : 默认为 false, 指过完 target attention 之后的特征会和 key 对应的特征 concat 之后返回。 - 设置为 true 时,将会只返回过完 target attention 之后的特征。 +- need_key_feature : 默认为 true, 指过完 target attention 之后的特征会和 key 对应的特征 concat 之后返回。 + 设置为 false 时,将会只返回过完 target attention 之后的特征。 - seq_att_map: 具体细节可以参考排序里的 DIN 模型。 - NOTE:SequenceFeature一般放在 user 组里面。 diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py index d2000c039..cd2aa909d 100644 --- a/easy_rec/python/layers/input_layer.py +++ b/easy_rec/python/layers/input_layer.py @@ -68,11 +68,7 @@ def __init__(self, def has_group(self, group_name): return group_name in self._feature_groups - def target_attention(self, - dnn_config, - deep_fea, - name, - only_sequence_feature=False): + def target_attention(self, dnn_config, deep_fea, name, need_key_feature=True): cur_id, hist_id_col, seq_len = deep_fea['key'], deep_fea[ 'hist_seq_emb'], deep_fea['hist_seq_len'] @@ -101,7 +97,7 @@ def target_attention(self, scores = tf.nn.softmax(scores) # (B, 1, seq_max_len) hist_din_emb = tf.matmul(scores, hist_id_col) # [B, 1, emb_dim] hist_din_emb = tf.reshape(hist_din_emb, [-1, emb_dim]) # [B, emb_dim] - if only_sequence_feature: + if not need_key_feature: return hist_din_emb din_output = tf.concat([hist_din_emb, cur_id], axis=1) return din_output @@ -115,7 +111,7 @@ def call_seq_input_layer(self, for seq_att_map_config in all_seq_att_map_config: group_name = seq_att_map_config.group_name allow_key_search = seq_att_map_config.allow_key_search - only_sequence_feature = seq_att_map_config.only_sequence_feature + need_key_feature = seq_att_map_config.need_key_feature seq_features = self._seq_input_layer(features, group_name, feature_name_to_output_tensors, allow_key_search) @@ -139,7 +135,7 @@ def call_seq_input_layer(self, seq_dnn_config, seq_features, name=cur_target_attention_name, - only_sequence_feature=only_sequence_feature) + need_key_feature=need_key_feature) all_seq_fea.append(seq_fea) # concat all seq_fea all_seq_fea = tf.concat(all_seq_fea, axis=1) diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto index ba06e25e7..aad2b3b85 100644 --- a/easy_rec/python/protos/feature_config.proto +++ b/easy_rec/python/protos/feature_config.proto @@ -148,5 +148,5 @@ message SeqAttGroupConfig { optional bool tf_summary = 3 [default = false]; optional DNN seq_dnn = 4; optional bool allow_key_search = 5 [default = false]; - optional bool only_sequence_feature = 6 [default = false]; + optional bool need_key_feature = 6 [default = true]; } diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py index 81e96625e..96a96387b 100644 --- a/easy_rec/python/test/train_eval_test.py +++ b/easy_rec/python/test/train_eval_test.py @@ -853,16 +853,15 @@ def test_dssm_neg_sampler_sequence_feature(self): self.assertTrue(self._success) @unittest.skipIf(gl is None, 'graphlearn is not installed') - def test_dssm_neg_sampler_only_sequence_feature(self): + def test_dssm_neg_sampler_need_key_feature(self): self._success = test_utils.test_single_train_eval( - 'samples/model_config/dssm_neg_sampler_only_sequence_feature.config', + 'samples/model_config/dssm_neg_sampler_need_key_feature.config', self._test_dir) self.assertTrue(self._success) - @unittest.skipIf(gl is None, 'graphlearn is not installed') - def test_dbmtl_on_multi_numeric_boundary_only_sequence_feature(self): + def test_dbmtl_on_multi_numeric_boundary_need_key_feature(self): self._success = test_utils.test_single_train_eval( - 'samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config', + 'samples/model_config/dbmtl_on_multi_numeric_boundary_need_key_feature_taobao.config', self._test_dir) self.assertTrue(self._success) diff --git a/samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_multi_numeric_boundary_need_key_feature_taobao.config similarity index 99% rename from samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config rename to samples/model_config/dbmtl_on_multi_numeric_boundary_need_key_feature_taobao.config index f1b645555..6584391eb 100644 --- a/samples/model_config/dbmtl_on_multi_numeric_boundary_only_sequence_feature_taobao.config +++ b/samples/model_config/dbmtl_on_multi_numeric_boundary_need_key_feature_taobao.config @@ -297,7 +297,7 @@ model_config { sequence_features: { group_name: "seq_fea" tf_summary: false - only_sequence_feature:true + need_key_feature:false seq_att_map: { key: "brand" key: "cate_id" diff --git a/samples/model_config/dssm_neg_sampler_only_sequence_feature.config b/samples/model_config/dssm_neg_sampler_need_key_feature.config similarity index 99% rename from samples/model_config/dssm_neg_sampler_only_sequence_feature.config rename to samples/model_config/dssm_neg_sampler_need_key_feature.config index eb41ce7d6..12a8621cc 100644 --- a/samples/model_config/dssm_neg_sampler_only_sequence_feature.config +++ b/samples/model_config/dssm_neg_sampler_need_key_feature.config @@ -263,7 +263,7 @@ model_config:{ sequence_features: { group_name: "seq_fea" allow_key_search: true - only_sequence_feature:true + need_key_feature:false seq_att_map: { key: "brand" key: "cate_id" diff --git a/samples/model_config/dssm_neg_sampler_sequence_feature.config b/samples/model_config/dssm_neg_sampler_sequence_feature.config index 462504090..df6896408 100644 --- a/samples/model_config/dssm_neg_sampler_sequence_feature.config +++ b/samples/model_config/dssm_neg_sampler_sequence_feature.config @@ -263,7 +263,7 @@ model_config:{ sequence_features: { group_name: "seq_fea" allow_key_search: true - only_sequence_feature:false + need_key_feature:true seq_att_map: { key: "brand" key: "cate_id"