[feats]: Add CMBF layer for DBMTL model (#223)

* [feats]: Add CMBF layer for DBMTL model
alibaba · Jun 28, 2022 · a0fd524 · a0fd524
1 parent dcec6f1
commit a0fd524
Show file tree

Hide file tree

Showing 31 changed files with 630 additions and 100 deletions.
diff --git a/.git_bin_url b/.git_bin_url
@@ -24,7 +24,7 @@
 {"leaf_path": "data/test/inference/tb_multitower_rtp_export/assets", "sig": "ae1cc9ec956fb900e5df45c4ec255c4b", "remote_path": "data/git_oss_sample_data/data_test_inference_tb_multitower_rtp_export_assets_ae1cc9ec956fb900e5df45c4ec255c4b"}
 {"leaf_path": "data/test/inference/tb_multitower_rtp_export/variables", "sig": "efe52ef308fd6452f3b67fd04cdd22bd", "remote_path": "data/git_oss_sample_data/data_test_inference_tb_multitower_rtp_export_variables_efe52ef308fd6452f3b67fd04cdd22bd"}
 {"leaf_path": "data/test/latest_ckpt_test", "sig": "d41d8cd98f00b204e9800998ecf8427e", "remote_path": "data/git_oss_sample_data/data_test_latest_ckpt_test_d41d8cd98f00b204e9800998ecf8427e"}
-{"leaf_path": "data/test/movielens_1m", "sig": "56b6486acda000cd49a06b422ed09166", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_56b6486acda000cd49a06b422ed09166"}
+{"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"}
 {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"}
 {"leaf_path": "data/test/tb_data", "sig": "126c375d6aa666633fb3084aa27ff9f7", "remote_path": "data/git_oss_sample_data/data_test_tb_data_126c375d6aa666633fb3084aa27ff9f7"}
 {"leaf_path": "data/test/tb_data_with_time", "sig": "1a7648f4ae55faf37855762bccbb70cc", "remote_path": "data/git_oss_sample_data/data_test_tb_data_with_time_1a7648f4ae55faf37855762bccbb70cc"}
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -53,6 +53,7 @@ Welcome to easy_rec's documentation!
    predict/Local 离线预测
    predict/在线预测
    feature/rtp_native
+   vector_retrieve
 
 .. toctree::
    :maxdepth: 2

diff --git a/docs/source/loss.md b/docs/source/loss.md
@@ -28,16 +28,16 @@ EasyRec支持两种损失函数配置方式：1）使用单个损失函数；2
 
 - F1_REWEIGHTED_LOSS 的参数配置
 
-  可以调节二分类模型recall/precision相对权重的损失函数
+  可以调节二分类模型recall/precision相对权重的损失函数，配置如下：
 
-```
+  ```
   {
     loss_type: F1_REWEIGHTED_LOSS
     f1_reweight_loss {
       f1_beta_square: 0.5625
     }
   }
-```
+  ```
 
 - f1_beta_square: 大于1的值会导致模型更关注recall，小于1的值会导致模型更关注precision
 
@@ -76,3 +76,6 @@ EasyRec支持两种损失函数配置方式：1）使用单个损失函数；2
     f1_beta_square: 0.5625
   }
 ```
+
+排序模型同时使用多个损失函数的完整示例：
+[cmbf_with_multi_loss.config](https://github.com/alibaba/EasyRec/blob/master/samples/model_config/cmbf_with_multi_loss.config)
diff --git a/docs/source/models/cmbf.md b/docs/source/models/cmbf.md
@@ -19,9 +19,9 @@ CMBF主要有4个模块（如上图）：
 视觉特征提取模块通常是一个CNN-based的模型，用来提取图像或视频特征，以便后续接入transformer模块。
 视觉特征的输入（对应配置名为`image`的`feature group`）可以是以下三种情况之一:
 
-1. multiple image embeddings, each corresponding to video frames or ROIs(region of interest)
-1. one conventional image embedding extracted by an image model
-1. one big image embedding composed by multiple results of spatial convolutions(feature maps before CNN pooling layer)
+1. 多个图像特征向量，每个特征向量对应原始图像的一个分片（patch）或一个兴趣区域 (region of interest) ，或者对应视频的某一帧；
+2. 一个大的复合特征向量，即上述多个图像特征向量铺平（flat）之后的结果，这时需要知道`image_feature_patch_num`参数；
+3. 一个常规的由某个图像模型提取的图像特征。
 
 文本型特征包括两部分：
 
@@ -69,6 +69,8 @@ model_config: {
   }
   cmbf {
     multi_head_num: 2
+    image_multi_head_num: 2
+    text_multi_head_num: 2
     image_head_size: 8
     text_head_size: 8
     image_feature_dim: 64
@@ -101,10 +103,12 @@ model_config: {
 - cmbf: CMBF 模型相关的参数
 
   - image_feature_dim: 在单模态学习模块之前做图像特征维度调整，调整到该参数指定的维度
-  - multi_head_num: 单模态学习模块和跨模态融合模块中的 head 数量，默认为1
+  - multi_head_num: 跨模态融合模块中的 head 数量，默认为1
+  - image_multi_head_num: 图像单模态学习模块中的 head 数量，默认为1
+  - text_multi_head_num: 文本单模态学习模块中的 head 数量，默认为1
   - image_head_size: 单模态学习模块中的图像tower，multi-headed self-attention的每个head的size
   - text_head_size: 单模态学习模块中的文本tower，multi-headed self-attention的每个head的size
-  - image_feature_slice_num: \[可选，默认值为1\] 表示CNN的filter个数。当只有一个image feature时生效，表示该图像特征是一个复合embedding，维度为`image_feature_slice_num * embedding_size`。
+  - image_feature_patch_num: \[可选，默认值为1\] 当只有一个image feature时生效，表示该图像特征是一个复合embedding，维度为`image_feature_patch_num * embedding_size`。
   - image_self_attention_layer_num: 单模态学习模块中的图像tower，multi-headed self-attention的层数
   - text_self_attention_layer_num: 单模态学习模块中的文本tower，multi-headed self-attention的层数
   - cross_modal_layer_num: 跨模态融合模块的层数，建议设在1到5之间，默认为1
@@ -122,7 +126,7 @@ model_config: {
 
 ### 示例Config
 
-[CMBF_demo.config](https://easyrec.oss-cn-beijing.aliyuncs.com/config/cmbf.config)
+[CMBF_demo.config](https://github.com/alibaba/EasyRec/blob/master/samples/model_config/cmbf_on_movielens.config)
 
 ### 参考论文
 

diff --git a/docs/source/models/dbmtl.md b/docs/source/models/dbmtl.md
@@ -9,6 +9,8 @@ DBMTL构建了多个目标之间的贝叶斯网络，显式建模了多个目标
 
 ![dbmtl_mmoe.png](../../images/models/dbmtl_mmoe.png)
 
+在多模态（图像、视频、文本）推荐场景，DBMTL支持使用[CMBF模型](cmbf.md)作为底层的`shared layer`，以便充分利用多模态特征，取得更好的推荐效果。
+
 ### 配置说明
 
 #### DBMTL
@@ -148,10 +150,122 @@ DBMTL模型每个塔的输出名为："logits\_" / "probs\_" / "y\_" + tower_nam
 其中，logits/probs/y对应: sigmoid之前的值/概率/回归模型的预测值
 DBMTL模型每个塔的指标为：指标名+ "\_" + tower_name
 
+#### DBMTL+CMBF
+
+多模态、多目标推荐模型
+
+```protobuf
+model_config: {
+  model_class: 'DBMTL'
+  feature_groups: {
+    group_name: 'image'
+    feature_names: 'embedding'
+    wide_deep: DEEP
+  }
+  feature_groups: {
+    group_name: 'general'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'gender'
+    feature_names: 'age'
+    feature_names: 'occupation'
+    feature_names: 'zip_id'
+    feature_names: 'movie_year_bin'
+    feature_names: 'score_year_diff'
+    feature_names: 'score_time'
+    wide_deep: DEEP
+  }
+  feature_groups: {
+    group_name: 'text'
+    feature_names: 'title'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  dbmtl {
+    bottom_cmbf {
+      multi_head_num: 2
+      image_multi_head_num: 2
+      text_multi_head_num: 2
+      image_feature_patch_num: 8
+      image_head_size: 32
+      text_head_size: 8
+      image_self_attention_layer_num: 2
+      text_self_attention_layer_num: 2
+      cross_modal_layer_num: 3
+      image_cross_head_size: 8
+      text_cross_head_size: 16
+      max_position_embeddings: 16
+      use_token_type: true
+    }
+    task_towers {
+      tower_name: "classify"
+      label_name: "label"
+      loss_type: CLASSIFICATION
+      metrics_set: {
+        auc {}
+      }
+      metrics_set: {
+        gauc {
+          uid_field: 'user_id'
+        }
+      }
+      dnn {
+        hidden_units: [256, 128, 64]
+      }
+      relation_dnn {
+        hidden_units: [32]
+      }
+      weight: 1.0
+    }
+    task_towers {
+      tower_name: "rating"
+      label_name: "rating"
+      loss_type: L2_LOSS
+      metrics_set: {
+        mean_squared_error {}
+      }
+      dnn {
+        hidden_units: [256, 128, 64]
+      }
+      relation_tower_names: ["classify"]
+      relation_dnn {
+        hidden_units: [32]
+      }
+      weight: 1.0
+    }
+    l2_regularization: 1e-6
+  }
+  embedding_regularization: 1e-6
+}
+```
+
+- dbmtl
+  - bottom_cmbf: 跨模态融合模型CMBF的相关配置，请参考[CMBF模型](cmbf.md)配置。
+    - image_feature_dim: 在单模态学习模块之前做图像特征维度调整，调整到该参数指定的维度
+    - multi_head_num: 跨模态融合模块中的 head 数量，默认为1
+    - image_multi_head_num: 图像单模态学习模块中的 head 数量，默认为1
+    - text_multi_head_num: 文本单模态学习模块中的 head 数量，默认为1
+    - image_head_size: 单模态学习模块中的图像tower，multi-headed self-attention的每个head的size
+    - text_head_size: 单模态学习模块中的文本tower，multi-headed self-attention的每个head的size
+    - image_feature_patch_num: \[可选，默认值为1\] 当只有一个image feature时生效，表示该图像特征是一个复合embedding，维度为`image_feature_patch_num * embedding_size`。
+    - image_self_attention_layer_num: 单模态学习模块中的图像tower，multi-headed self-attention的层数
+    - text_self_attention_layer_num: 单模态学习模块中的文本tower，multi-headed self-attention的层数
+    - cross_modal_layer_num: 跨模态融合模块的层数，建议设在1到5之间，默认为1
+    - image_cross_head_size: 跨模态融合模块中的图像tower，multi-headed attention的每个head的size
+    - text_cross_head_size: 跨模态融合模块中的文本tower，multi-headed attention的每个head的size
+    - attention_probs_dropout_prob: self/cross attention模块attention权重的dropout概率
+    - hidden_dropout_prob: multi-headed attention模块中FC layer的dropout概率
+    - use_token_type: bool，default is false；是否使用token type embedding区分不同的text sequence feature
+    - use_position_embeddings: bool, default is true；是否为文本序列添加位置编码
+    - max_position_embeddings: 文本序列的最大位置，当`use_position_embeddings`为true时，必须配置；并且必须大于或等于所有特征配置`max_seq_len`的最大值
+    - text_seq_emb_dropout_prob: 文本序列embedding的dropout概率
+  - 其余与dbmtl一致
+
 ### 示例Config
 
 - [DBMTL_demo.config](https://easyrec.oss-cn-beijing.aliyuncs.com/config/dbmtl.config)
 - [DBMTL_MMOE_demo.config](https://easyrec.oss-cn-beijing.aliyuncs.com/config/dbmtl_mmoe.config)
+- [DBMTL_CMBF_demo.config](https://github.com/alibaba/EasyRec/blob/master/samples/model_config/dbmtl_cmbf_on_movielens.config)
 
 ### 参考论文
 

diff --git a/docs/source/vector_retrieve.md b/docs/source/vector_retrieve.md
@@ -3,7 +3,7 @@
 ## Pai 命令
 
 ```sql
-pai -name easy_rec_ext -project algo_public
+pai -name easy_rec_ext -project algo_public_dev
 -Dcmd=vector_retrieve
 -Dquery_table=odps://pai_online_project/tables/query_vector_table
 -Ddoc_table=odps://pai_online_project/tables/doc_vector_table
@@ -74,7 +74,6 @@ VALUES
 ```sql
 pai -name easy_rec_ext -project algo_public_dev
 -Dcmd='vector_retrieve'
--DentryFile='run.py'
 -Dquery_table='odps://${project}/tables/query_table/pt=20190410'
 -Ddoc_table='odps://${project}/tables/doc_table/pt=20190410'
 -Doutput_table='odps://${project}/tables/knn_result_table/pt=20190410'
@@ -92,7 +91,25 @@ pai -name easy_rec_ext -project algo_public_dev
         \"cpu\" : 600
     }
 }';
-;
+```
+
+FAQ: 遇到以下错误怎么办？
+
+```
+File "run.py", line 517, in main
+  raise ValueError('cmd should be one of train/evaluate/export/predict')
+ValueError: cmd should be one of train/evaluate/export/predict
+```
+
+这个错误是因为包含`向量近邻检索`的最新的EasyRec版本暂时还没有正式发布。
+
+解决方案：从 [Github](https://github.com/alibaba/EasyRec)
+的master分支拉取最新代码，使用`bash pai_jobs/deploy_ext.sh -V ${version}`命令打一个最新的资源包`easy_rec_ext_${version}_res.tar.gz`，
+上传到MaxCompute作为Archive资源，最后，在上述命令中加两个如下的参数即可解决。
+
+```
+-Dversion='${version}'
+-Dres_project=${maxcompute_project}
 ```
 
 ### 4. 查看结果

diff --git a/easy_rec/python/feature_column/feature_column.py b/easy_rec/python/feature_column/feature_column.py
@@ -1,6 +1,5 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import collections
 import logging
 
 import tensorflow as tf

diff --git a/easy_rec/python/inference/predictor.py b/easy_rec/python/inference/predictor.py
@@ -477,10 +477,10 @@ def _parse_value(all_vals):
                 split_vals[k] = []
             else:
               assert self._all_input_names, 'must set fg_json_path when use fg input'
-              assert fg_input_size == len(self._all_input_names), \
-                'The size of features in fg_json != the size of fg input. ' \
-                'The size of features in fg_json is: %s; The size of fg input is: %s' % \
-                (fg_input_size, len(self._all_input_names))
+              assert fg_input_size == len(self._all_input_names), (
+                  'The size of features in fg_json != the size of fg input. '
+                  'The size of features in fg_json is: %s; The size of fg input is: %s'
+                  % (fg_input_size, len(self._all_input_names)))
               for i, k in enumerate(self._all_input_names):
                 split_index.append(k)
                 split_vals[k] = []
@@ -674,9 +674,9 @@ def _get_num_cols(self, file_paths):
       for line_str in fin:
         line_tok = line_str.strip().split(self._input_sep)
         if num_cols != -1:
-          assert num_cols == len(line_tok), \
-            'num selected cols is %d, not equal to %d, current line is: %s, please check input_sep and data.' % \
-            (num_cols, len(line_tok), line_str)
+          assert num_cols == len(line_tok), (
+              'num selected cols is %d, not equal to %d, current line is: %s, please check input_sep and data.'
+              % (num_cols, len(line_tok), line_str))
         num_cols = len(line_tok)
         num_lines += 1
         if num_lines > 10:
@@ -797,7 +797,7 @@ def out_of_range_exception(self):
 
   def _get_reserve_vals(self, reserved_cols, output_cols, all_vals, outputs):
     reserve_vals = [all_vals[k] for k in reserved_cols] + \
-                 [outputs[x] for x in output_cols]
+                   [outputs[x] for x in output_cols]
     return reserve_vals
 
 

diff --git a/easy_rec/python/input/criteo_input.py b/easy_rec/python/input/criteo_input.py
@@ -2,7 +2,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import logging
 
-import numpy as np
 import tensorflow as tf
 from tensorflow.python.platform import gfile
 

diff --git a/easy_rec/python/input/odps_rtp_input_v2.py b/easy_rec/python/input/odps_rtp_input_v2.py
@@ -3,7 +3,6 @@
 import json
 import logging
 
-import numpy as np
 import tensorflow as tf
 
 from easy_rec.python.input.odps_rtp_input import OdpsRTPInput
@@ -54,13 +53,6 @@ def _parse_table(self, *fields):
     fields = list(fields)
     labels = fields[:-1]
 
-    # only for features, labels excluded
-    record_defaults = [
-        self.get_type_defaults(t, v)
-        for x, t, v in zip(self._input_fields, self._input_field_types,
-                           self._input_field_defaults)
-        if x not in self._label_fields
-    ]
     # assume that the last field is the generated feature column
     features = rtp_fg.parse_genreated_fg(self._fg_config, fields[-1])