diff --git a/ltr/README.md b/ltr/README.md
index 0ee2dadbd9..3cc84494f7 100644
--- a/ltr/README.md
+++ b/ltr/README.md
@@ -96,7 +96,7 @@ $$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}} = \frac{1}{2}(1-S_{i,j})-\fra
 To train the `RankNet` model, run the following command:
 
 ```bash
-python ranknet.py
+python train.py --model_type ranknet
 ```
 
 The first run automatically downloads the data, trains the RankNet model, and saves the model parameters of every pass.
@@ -104,9 +104,7 @@ python ranknet.py
 To predict with a trained `RankNet` model, run:
 
 ```bash
-python ranknet.py \
-    --run_type infer \
-    --test_model_path models/ranknet_params_0.tar.gz
+python infer.py --model_type ranknet --test_model_path models/ranknet_params_0.tar.gz
 ```
 
 This example provides both training and prediction for the RankNet model. A trained model consists of two parts: the network topology (note that `rank_cost` is not part of the prediction topology) and the parameter files. Prediction reuses the `half_ranknet` topology used during training and loads the parameters from disk. The prediction input is the feature vector of a single document, for which the model outputs a relevance score; sorting the documents by these scores gives the final relevance ranking.
@@ -193,7 +191,7 @@ $$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}}=-\frac{\sigma }{1+e^{\sigma (
 To train the `LambdaRank` model, run the following command:
 
 ```bash
-python lambda_rank.py
+python train.py --model_type lambdarank
 ```
 
 The first run automatically downloads the data, trains the LambdaRank model, and saves the model of every pass.
@@ -203,9 +201,7 @@ LambdaRank模型预测过程和RankNet相同。预测时的模型拓扑结构复
 To predict with a trained `LambdaRank` model, run:
 
 ```bash
-python lambda_rank.py \
-    --run_type infer \
-    --test_model_path models/lambda_rank_params_0.tar.gz
+python infer.py --model_type lambdarank --test_model_path models/lambda_rank_params_0.tar.gz
 ```
 
 ## Custom LambdaRank data
diff --git a/ltr/infer.py b/ltr/infer.py
new file mode 100644
index 0000000000..3ec1842d72
--- /dev/null
+++ b/ltr/infer.py
@@ -0,0 +1,115 @@
+import os
+import gzip
+import functools
+import argparse
+
+import paddle.v2 as paddle
+
+from ranknet import half_ranknet
+from lambda_rank import lambda_rank
+
+
+def ranknet_infer(input_dim, model_path):
+    """
+    RankNet model inference interface.
+    """
+    # We only need half_ranknet to predict a rank score,
+    # which is then used to sort the documents.
+    output = half_ranknet("right", input_dim)
+    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
+
+    # Load the documents of each query together with their relevance labels;
+    # RankNet ranks these candidates.
+    infer_query_id = []
+    infer_data = []
+    infer_doc_index = []
+
+    # Convert to the mq2007 built-in data format
+    # (each record: query_id, relevance_score, feature_vector).
+    plain_txt_test = functools.partial(
+        paddle.dataset.mq2007.test, format="plain_txt")
+
+    for query_id, relevance_score, feature_vector in plain_txt_test():
+        infer_query_id.append(query_id)
+        infer_data.append([feature_vector])
+
+    # Predict a score for every document in infer_data.
+    # Sorting the documents by the predicted scores in descending
+    # order gives the final ranking.
+    scores = paddle.infer(
+        output_layer=output, parameters=parameters, input=infer_data)
+    for query_id, score in zip(infer_query_id, scores):
+        print "query_id : ", query_id, " score : ", score
+
+
+def lambda_rank_infer(input_dim, model_path):
+    """
+    LambdaRank model inference interface.
+    """
+    output = lambda_rank(input_dim, is_infer=True)
+    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
+
+    infer_query_id = None
+    infer_data = []
+    infer_data_num = 1
+
+    fill_default_test = functools.partial(
+        paddle.dataset.mq2007.test, format="listwise")
+    for label, querylist in fill_default_test():
+        infer_data.append([querylist])
+        if len(infer_data) == infer_data_num:
+            break
+
+    # Predict a score for every document of the query.
+    # Sorting the documents by the predicted scores in descending
+    # order gives the final ranking.
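+    # Note (illustrative assumption): in the "listwise" format each element of
+    # infer_data wraps one query's full document list, so paddle.infer is
+    # expected to return one score per document of that query.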
+    predictions = paddle.infer(
+        output_layer=output, parameters=parameters, input=infer_data)
+    for i, score in enumerate(predictions):
+        print i, score
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle learning to rank example.")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        help=("The model to run. "
+              "Available options are: ranknet or lambdarank."),
+        default="ranknet")
+    parser.add_argument(
+        "--use_gpu",
+        type=bool,
+        help="A flag indicating whether to use the GPU device in training.",
+        default=False)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        help="The number of threads used in training.",
+        default=1)
+    parser.add_argument(
+        "--test_model_path",
+        type=str,
+        required=True,
+        help=("The path of a trained model."))
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    assert os.path.exists(args.test_model_path), (
+        "The trained model does not exist. Please set a correct path.")
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    # Training dataset: mq2007, input_dim = 46, dense format.
+    input_dim = 46
+
+    if args.model_type == "ranknet":
+        ranknet_infer(input_dim, args.test_model_path)
+    elif args.model_type == "lambdarank":
+        lambda_rank_infer(input_dim, args.test_model_path)
+    else:
+        raise ValueError(("Invalid value for parameter --model_type. "
+                          "Available options are: ranknet or lambdarank."))
diff --git a/ltr/lambda_rank.py b/ltr/lambda_rank.py
index 95bc41566a..aae008ba81 100644
--- a/ltr/lambda_rank.py
+++ b/ltr/lambda_rank.py
@@ -1,32 +1,20 @@
-import os
-import sys
-import gzip
-import functools
-import argparse
-import logging
-import numpy as np
-
+"""
+LambdaRank is a listwise rank model.
+https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
+"""
 import paddle.v2 as paddle
 
-logger = logging.getLogger("paddle")
-logger.setLevel(logging.INFO)
-
 
-def lambda_rank(input_dim, is_infer):
+def lambda_rank(input_dim, is_infer=False):
     """
-    LambdaRank is a listwise rank model, the input data and label
-    must be sequences.
+    The input data and label for LambdaRank must be sequences.
 
-    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
     parameters :
       input_dim, one document's dense feature vector dimension
 
     The format of the dense_vector_sequence is as follows:
     [[f, ...], [f, ...], ...], f is a float or an int number
     """
-    if not is_infer:
-        label = paddle.layer.data("label",
-                                  paddle.data_type.dense_vector_sequence(1))
     data = paddle.layer.data("data",
                              paddle.data_type.dense_vector_sequence(input_dim))
@@ -49,134 +37,11 @@ def lambda_rank(input_dim, is_infer):
     if not is_infer:
-        # Define the cost layer.
+        label = paddle.layer.data("label",
+                                  paddle.data_type.dense_vector_sequence(1))
+
         cost = paddle.layer.lambda_cost(
             input=output, score=label, NDCG_num=6, max_sort_size=-1)
-        return cost, output
-    return output
-
-
-def lambda_rank_train(num_passes, model_save_dir):
-    # The input for LambdaRank must be a sequence.
-    fill_default_train = functools.partial(
-        paddle.dataset.mq2007.train, format="listwise")
-    fill_default_test = functools.partial(
-        paddle.dataset.mq2007.test, format="listwise")
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
-    test_reader = paddle.batch(fill_default_test, batch_size=32)
-
-    # Training dataset: mq2007, input_dim = 46, dense format.
- input_dim = 46 - cost, output = lambda_rank(input_dim, is_infer=False) - parameters = paddle.parameters.create(cost) - - trainer = paddle.trainer.SGD( - cost=cost, - parameters=parameters, - update_equation=paddle.optimizer.Adam(learning_rate=1e-4)) - - # Define end batch and end pass event handler. - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - logger.info("Pass %d Batch %d Cost %.9f" % - (event.pass_id, event.batch_id, event.cost)) - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader, feeding=feeding) - logger.info("\nTest with Pass %d, %s" % - (event.pass_id, result.metrics)) - with gzip.open( - os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz" - % (event.pass_id)), "w") as f: - trainer.save_parameter_to_tar(f) - - feeding = {"label": 0, "data": 1} - trainer.train( - reader=train_reader, - event_handler=event_handler, - feeding=feeding, - num_passes=num_passes) - - -def lambda_rank_infer(test_model_path): - """LambdaRank model inference interface. - - Parameters: - test_model_path : The path of the trained model. - """ - logger.info("Begin to Infer...") - input_dim = 46 - output = lambda_rank(input_dim, is_infer=True) - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(test_model_path)) - - infer_query_id = None - infer_data = [] - infer_data_num = 1 - - fill_default_test = functools.partial( - paddle.dataset.mq2007.test, format="listwise") - for label, querylist in fill_default_test(): - infer_data.append([querylist]) - if len(infer_data) == infer_data_num: - break - - # Predict score of infer_data document. - # Re-sort the document base on predict score. - # In descending order. then we build the ranking documents. - predicitons = paddle.infer( - output_layer=output, parameters=parameters, input=infer_data) - for i, score in enumerate(predicitons): - print i, score - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="PaddlePaddle LambdaRank example.") - parser.add_argument( - "--run_type", - type=str, - help=("A flag indicating to run the training or the inferring task. " - "Available options are: train or infer."), - default="train") - parser.add_argument( - "--num_passes", - type=int, - help="The number of passes to train the model.", - default=10) - parser.add_argument( - "--use_gpu", - type=bool, - help="A flag indicating whether to use the GPU device in training.", - default=False) - parser.add_argument( - "--trainer_count", - type=int, - help="The thread number used in training.", - default=1) - parser.add_argument( - "--model_save_dir", - type=str, - required=False, - help=("The path to save the trained models."), - default="models") - parser.add_argument( - "--test_model_path", - type=str, - required=False, - help=("This parameter works only in inferring task to " - "specify path of a trained model."), - default="") - - args = parser.parse_args() - paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) - if args.run_type == "train": - lambda_rank_train(args.num_passes, args.model_save_dir) - elif args.run_type == "infer": - assert os.path.exists(args.test_model_path), ( - "The trained model does not exit. Please set a correct path.") - lambda_rank_infer(args.test_model_path) + return cost else: - logger.fatal(("A wrong value for parameter run type. 
" - "Available options are: train or infer.")) + return output diff --git a/ltr/metrics.py b/ltr/metrics.py deleted file mode 100644 index be1cc70839..0000000000 --- a/ltr/metrics.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np -import unittest - - -def ndcg(score_list): - """ - measure the ndcg score of order list - https://en.wikipedia.org/wiki/Discounted_cumulative_gain - parameter: - score_list: np.array, shape=(sample_num,1) - - e.g. predict rank score list : - >>> scores = [3, 2, 3, 0, 1, 2] - >>> ndcg_score = ndcg(scores) - """ - - def dcg(score_list): - n = len(score_list) - cost = .0 - for i in range(n): - cost += float(np.power(2, score_list[i])) / np.log((i + 1) + 1) - return cost - - dcg_cost = dcg(score_list) - score_ranking = sorted(score_list, reverse=True) - ideal_cost = dcg(score_ranking) - return dcg_cost / ideal_cost - - -class TestNDCG(unittest.TestCase): - def test_array(self): - a = [3, 2, 3, 0, 1, 2] - value = ndcg(a) - self.assertAlmostEqual(0.9583, value, places=3) - - -if __name__ == '__main__': - unittest.main() diff --git a/ltr/ranknet.py b/ltr/ranknet.py index 477dd3db4f..4f39ca931a 100644 --- a/ltr/ranknet.py +++ b/ltr/ranknet.py @@ -1,23 +1,9 @@ -import os -import sys -import gzip -import functools -import argparse -import logging -import numpy as np - +""" +ranknet is the classic pairwise learning to rank algorithm +http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf +""" import paddle.v2 as paddle -logger = logging.getLogger("paddle") -logger.setLevel(logging.INFO) - -# ranknet is the classic pairwise learning to rank algorithm -# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf - - -def score_diff(right_score, left_score): - return np.average(np.abs(right_score - left_score)) - def half_ranknet(name_prefix, input_dim): """ @@ -60,142 +46,3 @@ def ranknet(input_dim): cost = paddle.layer.rank_cost( name="cost", left=output_left, right=output_right, label=label) return cost - - -def ranknet_train(num_passes, model_save_dir): - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100), - batch_size=100) - test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100) - - # mq2007 feature_dim = 46, dense format - # fc hidden_dim = 128 - feature_dim = 46 - cost = ranknet(feature_dim) - parameters = paddle.parameters.create(cost) - - trainer = paddle.trainer.SGD( - cost=cost, - parameters=parameters, - update_equation=paddle.optimizer.Adam(learning_rate=2e-4)) - - # Define the input data order - feeding = {"label": 0, "left_data": 1, "right_data": 2} - - # Define end batch and end pass event handler - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 25 == 0: - diff = score_diff( - event.gm.getLayerOutputs("left_score")["left_score"][ - "value"], - event.gm.getLayerOutputs("right_score")["right_score"][ - "value"]) - logger.info(("Pass %d Batch %d : Cost %.6f, " - "average absolute diff scores: %.6f") % - (event.pass_id, event.batch_id, event.cost, diff)) - - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader, feeding=feeding) - logger.info("\nTest with Pass %d, %s" % - (event.pass_id, result.metrics)) - with gzip.open( - os.path.join(model_save_dir, "ranknet_params_%d.tar.gz" % - (event.pass_id)), "w") as f: - trainer.save_parameter_to_tar(f) - - trainer.train( - reader=train_reader, - event_handler=event_handler, - feeding=feeding, - num_passes=num_passes) - - -def ranknet_infer(model_path): - """ - 
load the trained model. And predict with plain txt input - """ - logger.info("Begin to Infer...") - feature_dim = 46 - - # we just need half_ranknet to predict a rank score, - # which can be used in sort documents - output = half_ranknet("right", feature_dim) - parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path)) - - # load data of same query and relevance documents, - # need ranknet to rank these candidates - infer_query_id = [] - infer_data = [] - infer_doc_index = [] - - # convert to mq2007 built-in data format - # - plain_txt_test = functools.partial( - paddle.dataset.mq2007.test, format="plain_txt") - - for query_id, relevance_score, feature_vector in plain_txt_test(): - infer_query_id.append(query_id) - infer_data.append([feature_vector]) - - # predict score of infer_data document. - # Re-sort the document base on predict score - # in descending order. then we build the ranking documents - scores = paddle.infer( - output_layer=output, parameters=parameters, input=infer_data) - for query_id, score in zip(infer_query_id, scores): - print "query_id : ", query_id, " score : ", score - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="PaddlePaddle RankNet example.") - parser.add_argument( - "--run_type", - type=str, - help=("A flag indicating to run the training or the inferring task. " - "Available options are: train or infer."), - default="train") - parser.add_argument( - "--num_passes", - type=int, - help="The number of passes to train the model.", - default=10) - parser.add_argument( - "--use_gpu", - type=bool, - help="A flag indicating whether to use the GPU device in training.", - default=False) - parser.add_argument( - "--trainer_count", - type=int, - help="The thread number used in training.", - default=1) - parser.add_argument( - "--model_save_dir", - type=str, - required=False, - help=("The path to save the trained models."), - default="models") - parser.add_argument( - "--test_model_path", - type=str, - required=False, - help=("This parameter works only in inferring task to " - "specify path of a trained model."), - default="") - - args = parser.parse_args() - if not os.path.exists(args.model_save_dir): os.mkdir(args.model_save_dir) - - paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) - - if args.run_type == "train": - ranknet_train(args.num_passes, args.model_save_dir) - elif args.run_type == "infer": - assert os.path.exists( - args.test_model_path), "The trained model does not exit." - ranknet_infer(args.test_model_path) - else: - logger.fatal(("A wrong value for parameter run type. 
" - "Available options are: train or infer.")) diff --git a/ltr/train.py b/ltr/train.py new file mode 100644 index 0000000000..1820ce1213 --- /dev/null +++ b/ltr/train.py @@ -0,0 +1,155 @@ +import os +import gzip +import functools +import argparse +import logging +import numpy as np + +import paddle.v2 as paddle + +from ranknet import ranknet +from lambda_rank import lambda_rank + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +def ranknet_train(input_dim, num_passes, model_save_dir): + train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100), + batch_size=100) + test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100) + + cost = ranknet(input_dim) + parameters = paddle.parameters.create(cost) + + trainer = paddle.trainer.SGD( + cost=cost, + parameters=parameters, + update_equation=paddle.optimizer.Adam(learning_rate=2e-4)) + + feeding = {"label": 0, "left_data": 1, "right_data": 2} + + def score_diff(right_score, left_score): + return np.average(np.abs(right_score - left_score)) + + # Define end batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 25 == 0: + diff = score_diff( + event.gm.getLayerOutputs("left_score")["left_score"][ + "value"], + event.gm.getLayerOutputs("right_score")["right_score"][ + "value"]) + logger.info(("Pass %d Batch %d : Cost %.6f, " + "average absolute diff scores: %.6f") % + (event.pass_id, event.batch_id, event.cost, diff)) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test(reader=test_reader, feeding=feeding) + logger.info("\nTest with Pass %d, %s" % + (event.pass_id, result.metrics)) + with gzip.open( + os.path.join(model_save_dir, "ranknet_params_%d.tar.gz" % + (event.pass_id)), "w") as f: + trainer.save_parameter_to_tar(f) + + trainer.train( + reader=train_reader, + event_handler=event_handler, + feeding=feeding, + num_passes=num_passes) + + +def lambda_rank_train(input_dim, num_passes, model_save_dir): + # The input for LambdaRank must be a sequence. + fill_default_train = functools.partial( + paddle.dataset.mq2007.train, format="listwise") + fill_default_test = functools.partial( + paddle.dataset.mq2007.test, format="listwise") + + train_reader = paddle.batch( + paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32) + test_reader = paddle.batch(fill_default_test, batch_size=32) + + cost = lambda_rank(input_dim) + parameters = paddle.parameters.create(cost) + + trainer = paddle.trainer.SGD( + cost=cost, + parameters=parameters, + update_equation=paddle.optimizer.Adam(learning_rate=1e-4)) + + feeding = {"label": 0, "data": 1} + + # Define end batch and end pass event handler. 
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            logger.info("Pass %d Batch %d Cost %.9f" %
+                        (event.pass_id, event.batch_id, event.cost))
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=test_reader, feeding=feeding)
+            logger.info("\nTest with Pass %d, %s" %
+                        (event.pass_id, result.metrics))
+            with gzip.open(
+                    os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz"
+                                 % (event.pass_id)), "w") as f:
+                trainer.save_parameter_to_tar(f)
+
+    trainer.train(
+        reader=train_reader,
+        event_handler=event_handler,
+        feeding=feeding,
+        num_passes=num_passes)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle learning to rank example.")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        help=("The model to run. "
+              "Available options are: ranknet or lambdarank."),
+        default="ranknet")
+    parser.add_argument(
+        "--num_passes",
+        type=int,
+        help="The number of passes to train the model.",
+        default=10)
+    parser.add_argument(
+        "--use_gpu",
+        type=bool,
+        help="A flag indicating whether to use the GPU device in training.",
+        default=False)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        help="The number of threads used in training.",
+        default=1)
+    parser.add_argument(
+        "--model_save_dir",
+        type=str,
+        required=False,
+        help=("The directory in which to save the trained models."),
+        default="models")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    if not os.path.exists(args.model_save_dir): os.mkdir(args.model_save_dir)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    # Training dataset: mq2007, input_dim = 46, dense format.
+    input_dim = 46
+
+    if args.model_type == "ranknet":
+        ranknet_train(input_dim, args.num_passes, args.model_save_dir)
+    elif args.model_type == "lambdarank":
+        lambda_rank_train(input_dim, args.num_passes, args.model_save_dir)
+    else:
+        logger.fatal(("Invalid value for parameter --model_type. "
+                      "Available options are: ranknet or lambdarank."))
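For reference, a minimal sketch of the post-processing step the README describes: grouping the per-document scores printed by `ranknet_infer` by query id and sorting each group in descending order. The helper `rank_documents` and the sample query ids and scores below are illustrative assumptions, not code from this repository or part of the diff above.

```python
# A minimal sketch, assuming the scores printed by ranknet_infer are collected
# into two parallel lists (query_ids and scores); both names are illustrative.
from collections import defaultdict


def rank_documents(query_ids, scores):
    """Group documents by query id and sort each group by score, descending."""
    per_query = defaultdict(list)
    for doc_idx, (query_id, score) in enumerate(zip(query_ids, scores)):
        per_query[query_id].append((doc_idx, float(score)))
    return {
        query_id: sorted(docs, key=lambda d: d[1], reverse=True)
        for query_id, docs in per_query.items()
    }


if __name__ == "__main__":
    # Made-up example: two queries, five documents in total.
    query_ids = [10, 10, 10, 12, 12]
    scores = [0.3, 0.9, 0.1, 0.5, 0.7]
    for query_id, ranked in rank_documents(query_ids, scores).items():
        print query_id, ranked
```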