forked from github/CodeSearchNet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrelevanceeval.py
129 lines (102 loc) · 5.84 KB
/
relevanceeval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
"""
Usage:
relevanceeval.py [options] RELEVANCE_ANNOTATIONS_CSV_PATH MODEL_PREDICTIONS_CSV
Standalone relevance evaluation script that outputs evaluation statistics for a set of predictions of a given model.
The input formats of the files are described below.
The model predictions MODEL_PREDICTIONS_CSV file has the following format:
A comma-separated file with (at least) the fields and headers "language", "query", "url". Each row represents
a single result for a given query and a given programming language.
* language: the programming language for the given query, e.g. "python"
* query: the textual representation of the query, e.g. "int to string"
* url: the unique GitHub URL to the returned results, e.g. "https://github.com/JamesClonk/vultr/blob/fed59ad207c9bda0a5dfe4d18de53ccbb3d80c91/cmd/commands.go#L12-L190"
The order of the rows implies the ranking of the results in the search task. For example, if in row 5 there is
an entry for the Python query "read properties file" and then in row 60 appears another result for the
Python query "read properties file", then the URL in row 5 is considered to be ranked higher than the
URL in row 60 for that query and language.
Options:
--debug Run in debug mode, falling into pdb on exceptions.
-h --help Show this screen.
"""
from collections import defaultdict
from typing import Dict, List
import numpy as np
import pandas as pd
from docopt import docopt
from dpu_utils.utils import run_and_debug
def load_relevances(filepath: str) -> Dict[str, Dict[str, Dict[str, float]]]:
    """Load relevance annotations and average them per (language, query, URL).

    The CSV at ``filepath`` must provide the columns ``Query``, ``Language``,
    ``GitHubUrl`` and ``Relevance``. Language and query are lower-cased *before*
    aggregation so that rows differing only by letter case contribute to one
    shared mean instead of overwriting each other.

    Args:
        filepath: Path of the relevance annotation CSV file.

    Returns:
        A nested mapping ``language -> query -> url -> mean relevance``. The two
        outer levels are ``defaultdict``s, so looking up a language or query
        without annotations yields an empty mapping.
    """
    relevance_annotations = pd.read_csv(filepath)
    # Normalise case first; doing it after averaging would let "Python" and
    # "python" produce two separate means, the last of which silently wins.
    relevance_annotations = relevance_annotations.assign(
        Query=relevance_annotations['Query'].str.lower(),
        Language=relevance_annotations['Language'].str.lower())
    # The string 'mean' selects pandas' own group-wise mean; passing the
    # np.mean callable as aggfunc is deprecated in recent pandas releases.
    per_query_language = relevance_annotations.pivot_table(
        index=['Query', 'Language', 'GitHubUrl'], values='Relevance', aggfunc='mean')

    # Map language -> query -> url -> float
    relevances = defaultdict(lambda: defaultdict(dict))  # type: Dict[str, Dict[str, Dict[str, float]]]
    for (query, language, url), relevance in per_query_language['Relevance'].items():
        relevances[language][query][url] = relevance
    return relevances
def load_predictions(filepath: str, max_urls_per_language: int=300) -> Dict[str, Dict[str, List[str]]]:
    """Load a model's ranked predictions from a CSV file.

    The CSV must provide the columns ``language``, ``query`` and ``url``. Row
    order defines the ranking: for a given language and query, a URL in an
    earlier row is ranked above a URL in a later row.

    Args:
        filepath: Path of the predictions CSV file.
        max_urls_per_language: Maximum number of URLs kept for each
            (language, query) pair. Despite the parameter name, the cap is
            applied per query; lower-ranked URLs beyond it are dropped.

    Returns:
        A nested mapping ``language -> query -> ranked list of URLs`` with
        lower-cased language and query keys. Both levels are ``defaultdict``s.
    """
    prediction_data = pd.read_csv(filepath)

    # Map language -> query -> Ranked List of URL
    predictions = defaultdict(lambda: defaultdict(list))
    # Zip the three columns rather than using iterrows(), which builds a
    # Series for every row only to read three fields from it.
    rows = zip(prediction_data['language'], prediction_data['query'], prediction_data['url'])
    for language, query, url in rows:
        predictions[language.lower()][query.lower()].append(url)

    # Keep only the top-ranked URLs of every query.
    for query_data in predictions.values():
        for query, ranked_urls in query_data.items():
            query_data[query] = ranked_urls[:max_urls_per_language]

    return predictions
def coverage_per_language(predictions: Dict[str, List[str]],
                          relevance_scores: Dict[str, Dict[str, float]], with_positive_relevance: bool=False) -> float:
    """
    Compute the fraction of annotated URLs that appear in the algorithm's predictions.

    Args:
        predictions: Mapping ``query -> ranked list of predicted URLs``.
        relevance_scores: Mapping ``query -> url -> relevance``.
        with_positive_relevance: If True, only annotations whose relevance is
            strictly greater than 0 are counted.

    Returns:
        A value in ``[0, 1]`` (callers multiply by 100 for a percentage), or
        NaN when there is no annotation to cover, in which case the ratio is
        undefined.
    """
    num_annotations = 0
    num_covered = 0
    for query, url_data in relevance_scores.items():
        # .get() treats a query without predictions as "nothing covered"; plain
        # indexing would raise KeyError on a regular dict and would insert an
        # empty entry as a side effect on a defaultdict.
        urls_in_predictions = set(predictions.get(query, ()))
        for url, relevance in url_data.items():
            if not with_positive_relevance or relevance > 0:
                num_annotations += 1
                if url in urls_in_predictions:
                    num_covered += 1

    if num_annotations == 0:
        # Nothing to cover (e.g. a predicted language without annotations):
        # report "undefined" instead of failing with ZeroDivisionError.
        return float('nan')
    return num_covered / num_annotations
def ndcg(predictions: Dict[str, List[str]], relevance_scores: Dict[str, Dict[str, float]],
         ignore_rank_of_non_annotated_urls: bool=True) -> float:
    """
    Compute the mean normalized discounted cumulative gain (NDCG) over all annotated queries.

    Every annotated URL contributes a gain of ``2**relevance - 1`` discounted by
    ``log2(rank + 1)``. The ideal DCG of a query ranks all of its annotated URLs
    by decreasing relevance.

    Args:
        predictions: Mapping ``query -> ranked list of predicted URLs``.
        relevance_scores: Mapping ``query -> url -> relevance``.
        ignore_rank_of_non_annotated_urls: If True, predicted URLs without an
            annotation do not consume a rank, i.e. annotated URLs are ranked
            only relative to each other. If False, every predicted URL
            advances the rank.

    Returns:
        The NDCG averaged over the queries that have a non-zero ideal DCG, or
        NaN when there is no such query.
    """
    num_results = 0
    ndcg_sum = 0

    for query, query_relevance_annotations in relevance_scores.items():
        current_rank = 1
        query_dcg = 0
        # .get() scores a query without predictions as DCG 0; plain indexing
        # would raise KeyError on a regular dict and would insert an empty
        # entry as a side effect on a defaultdict.
        for url in predictions.get(query, ()):
            if url in query_relevance_annotations:
                query_dcg += (2**query_relevance_annotations[url] - 1) / np.log2(current_rank + 1)
                current_rank += 1
            elif not ignore_rank_of_non_annotated_urls:
                current_rank += 1

        query_idcg = 0
        for i, ideal_relevance in enumerate(sorted(query_relevance_annotations.values(), reverse=True), start=1):
            query_idcg += (2 ** ideal_relevance - 1) / np.log2(i + 1)
        if query_idcg == 0:
            # We have no positive annotations for the given query, so we should probably not penalize anyone about this.
            continue
        num_results += 1
        ndcg_sum += query_dcg / query_idcg

    if num_results == 0:
        # No query could be scored: the mean is undefined, so return NaN
        # rather than failing with ZeroDivisionError.
        return float('nan')
    return ndcg_sum / num_results
def run(arguments):
    """Load annotations and predictions, then print per-language evaluation statistics.

    Args:
        arguments: Parsed command-line arguments; the keys
            ``RELEVANCE_ANNOTATIONS_CSV_PATH`` and ``MODEL_PREDICTIONS_CSV`` are read.
    """
    relevance_scores = load_relevances(arguments['RELEVANCE_ANNOTATIONS_CSV_PATH'])
    predictions = load_predictions(arguments['MODEL_PREDICTIONS_CSV'])
    languages_predicted = sorted(predictions)

    # Each report section is a title plus a function rendering the metric of one
    # language from its (predictions, relevance annotations) pair.
    report_sections = (
        ('% of URLs in predictions that exist in the annotation dataset:',
         lambda preds, scores: f'{coverage_per_language(preds, scores)*100:.2f}%'),
        ('% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):',
         lambda preds, scores: f'{coverage_per_language(preds, scores, with_positive_relevance=True) * 100:.2f}%'),
        ('NDCG:',
         lambda preds, scores: f'{ndcg(preds, scores):.3f}'),
        ('NDCG (full ranking):',
         lambda preds, scores: f'{ndcg(preds, scores, ignore_rank_of_non_annotated_urls=False):.3f}'),
    )

    # Now Compute the various evaluation results
    for title, render_metric in report_sections:
        print(title)
        for language in languages_predicted:
            rendered = render_metric(predictions[language], relevance_scores[language])
            print(f'\t{language}: {rendered}')
if __name__ == '__main__':
    # Parse the command line against the module docstring, then hand the
    # evaluation to run_and_debug together with the value of the --debug flag.
    args = docopt(__doc__)
    debug_enabled = args['--debug']
    run_and_debug(lambda: run(args), debug_enabled)