-
Notifications
You must be signed in to change notification settings - Fork 3
/
update_info.py
134 lines (115 loc) · 4.65 KB
/
update_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@File : update_info.py
@Time : 2023/03/29 13:27:28
@Author : Weihao Xia (xiawh3@outlook)
@Version : 2.0
@Desc : This script is used to automatically update paper information in a paper list.
'''
import re
import feedparser
import argparse
from urllib import request
def get_arxiv_info(query_id):
    '''
    fetch the current metadata of a paper from the arXiv export API
    return:
        title: the title of the first entry in the returned feed
        authors: a list with the name of every author of that entry
        comment: the arXiv comment of that entry, or the placeholder
                 'hello world!' when the entry carries no comment
    param:
        query_id: the id of the paper
    raise:
        IndexError: if the returned feed contains no entry
    '''
    query_url = f'http://export.arxiv.org/api/query?id_list={query_id}'
    # use a context manager so the HTTP response is always closed
    with request.urlopen(query_url) as response:
        data = response.read().decode('utf-8')
    feed = feedparser.parse(data)
    if not feed.entries:
        # same exception type as indexing an empty list, but with a useful message
        raise IndexError(f'no arXiv entry returned for id {query_id!r}')
    entry = feed.entries[0]
    # sometimes the title and authors are changed
    title = entry.title
    authors = [author.name for author in entry.authors]
    try:
        comment = entry.arxiv_comment
    except AttributeError:
        # the entry has no comment field; fall back to the placeholder text
        comment = 'hello world!'
    return title, authors, comment
def clean_title(title):
    '''
    normalize the whitespace and capitalization of a title
    - runs of whitespace (including line breaks) are collapsed to one space
    - every all-lowercase word is capitalized, except for the listed
      prepositions/conjunctions; the first word is always capitalized
    - words that already contain an uppercase letter (e.g. acronyms) are kept as is
    note: no punctuation is removed
    return:
        title: the normalized title
    param:
        title: the raw title string
    '''
    prepositions = {'about', 'and', 'as', 'at', 'but', 'by', 'for', 'from',
                    'in', 'nor', 'of', 'on', 'or', 'to', 'with'}
    words = []
    for index, word in enumerate(title.split()):
        # only touch all-lowercase words so acronyms and mixed-case names survive;
        # a leading preposition is still capitalized because it starts the title
        if word.islower() and (index == 0 or word not in prepositions):
            word = word.capitalize()
        words.append(word)
    return ' '.join(words)
def parse_paper_info(paper_info_str):
    '''
    parse the paper information
    expected layout (one paper per string):
        line 1: title
        line 2: authors
        remaining lines: "<venue and year>. <link> <link> ..."
    return:
        paper_info: a dictionary of the paper information with the keys
                    'id', 'title', 'authors', 'venue_and_year' and 'urls'
    param:
        paper_info_str: a string of the paper information
    raise:
        ValueError: if the string does not follow the expected layout or
                    none of the links contains an arXiv-style id
    '''
    lines = paper_info_str.strip().split('\n')
    if len(lines) < 2:
        raise ValueError(f'expected title and authors lines, got: {paper_info_str!r}')
    title, authors, *pubinfo_and_url = lines
    pubinfo_and_url = ' '.join(pubinfo_and_url)

    # split on the first '. ' only, so nothing after a later '. ' is dropped
    venue_and_year, separator, urls = pubinfo_and_url.partition('. ')
    urls = urls.split()  # multiple spaces
    if not separator or not urls:
        raise ValueError(f"expected '<venue and year>. <urls>', got: {pubinfo_and_url!r}")

    # the id is normally in the first link; fall back to the later ones
    arxiv_id = None
    for url in urls:
        match = re.search(r"\d+\.\d+", url)
        if match:
            arxiv_id = match.group(0)
            break
    if arxiv_id is None:
        raise ValueError(f'no arXiv id found in the links: {urls!r}')

    return {
        'id': arxiv_id,
        'title': title,
        'authors': authors,
        'venue_and_year': venue_and_year,
        'urls': urls
    }
def update_info(paper_info, publish_regex):
    '''
    update the title, authors and venue_and_year information
    only papers whose venue_and_year starts with 'arxiv' (case-insensitive)
    are refreshed; every other paper is returned untouched
    return:
        paper_info: the same dictionary that was passed in (modified in place)
    param:
        paper_info: a dictionary of the paper information
        publish_regex: a regex of the publish information; when it contains
                       capture groups the first group is used as the new
                       venue, otherwise the whole match is used
    '''
    if paper_info['venue_and_year'].split(' ')[0].lower() != 'arxiv':
        return paper_info

    query_id = paper_info['id']
    title, authors, comment = get_arxiv_info(query_id)
    print(f"Update the paper {paper_info['id']}...")

    authors_str = ', '.join(authors)
    paper_info['authors'] = f'*{authors_str}.*<br>'

    title = clean_title(title)
    paper_info['title'] = f'**{title}.**<br>'

    # extract publish information (e.g. cvpr 2021) from the comment
    match = re.search(publish_regex, comment)
    if match:
        # pick the group explicitly: indexing a findall() result with [0][0]
        # only works when the pattern has at least two capture groups
        publish = match.group(1) if match.re.groups else match.group(0)
        if publish:
            # never overwrite the venue with an empty capture
            paper_info['venue_and_year'] = publish
    return paper_info
if __name__ == '__main__':
    # Command-line entry point: read the paper list, refresh every paper and
    # write the result to a new markdown file.
    parser = argparse.ArgumentParser(description='Update the publication information of the papers.')
    parser.add_argument('--read_path', type=str, default='papers.md', help='the path of source file')
    parser.add_argument('--save_path', type=str, default='papers_updated.md',
                        help='the path of target file')
    args = parser.parse_args()

    # read conference names from a text file; blank lines are skipped because
    # an empty alternative in the regex would match with any four-digit number
    with open('conf_list.txt', encoding='utf-8') as f:
        conf_list = [line.strip() for line in f if line.strip()]
    conf_regex = '|'.join(conf_list)
    publish_regex = fr'({conf_regex}).*?\d{{4}}'
    # raw f-string: '\s' and '\S' are invalid escape sequences in a plain string
    publish_regex = fr'[\s\S]*({publish_regex})[\s\S]*'

    # read markdown and convert each paper information into a dictionary
    # Open the file and read the contents into one string
    # (explicit utf-8 so non-ASCII text does not depend on the platform default)
    with open(args.read_path, 'r', encoding='utf-8') as f:
        paper_info_strs = f.read().strip().split('\n\n')

    # Parse the string into a list of paper info
    updated_papers_info = []
    for paper_info_str in paper_info_strs:
        # extra blank lines (or an empty file) produce empty chunks; skip them
        if not paper_info_str.strip():
            continue
        paper_info = parse_paper_info(paper_info_str)
        # Update the paper info based on the publish regex
        updated_paper_info = update_info(paper_info, publish_regex)
        updated_papers_info.append(updated_paper_info)

    # Save the updated publication information to a file
    with open(args.save_path, 'w', encoding='utf-8') as f:
        for paper_info in updated_papers_info:
            note = (f"{paper_info['title']}\n{paper_info['authors']}"
                    f"\n{paper_info['venue_and_year']}. {' '.join(paper_info['urls'])}\n\n")
            f.write(note)
    print("Update the publication information successfully!")