Skip to content

Commit bf0f49b

Browse files
author
Chenna Keshava B S
committed
add arxiv querying functionality
1 parent 5b8794a commit bf0f49b

File tree

12 files changed

+62
-56
lines changed

12 files changed

+62
-56
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
db.sqlite3
22

33
*.pyc
4+
*.txt
45

56
user/files/

arxiv_querier/fetch_papers.py

+14-45
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,4 @@
1-
"""
2-
Queries arxiv API and downloads papers (the query is a parameter).
3-
The script is intended to enrich an existing database pickle (by default db.p),
4-
so this file will be loaded first, and then new results will be added to it.
51

6-
Usage: get_relevant_papers(keywords separated by spaces)
7-
"""
8-
9-
import os
102
import time
113
import pickle
124
import random
@@ -16,51 +8,28 @@
168
import http
179

1810

19-
20-
21-
# parse input arguments
2211
def get_relevant_papers(keyword_string):
23-
parser = argparse.ArgumentParser()
24-
parser.add_argument('--search-query', type=str,
25-
default='cat:cs.CV+OR+cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.NE+OR+cat:stat.ML',
26-
help='query used for arxiv API. See http://arxiv.org/help/api/user-manual#detailed_examples')
27-
parser.add_argument('--start-index', type=int, default=0, help='0 = most recent API result')
28-
parser.add_argument('--max-index', type=int, default=1, help='upper bound on paper index we will fetch')
29-
parser.add_argument('--results-per-iteration', type=int, default=1, help='passed to arxiv API')
30-
parser.add_argument('--wait-time', type=float, default=5.0, help='lets be gentle to arxiv API (in number of seconds)')
31-
parser.add_argument('--break-on-no-added', type=int, default=1, help='break out early if all returned query papers are already in db? 1=yes, 0=no')
32-
args = parser.parse_args()
33-
3412

3513
base_url = 'http://export.arxiv.org/api/query?' # base api query url
3614
print('Searching arXiv for %s' % (keyword_string, ))
3715

38-
for i in range(args.start_index, args.max_index, args.results_per_iteration):
39-
print("Results %i - %i" % (i,i+args.results_per_iteration))
40-
41-
query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (keyword_string,
42-
i, args.results_per_iteration)
43-
44-
with request.urlopen(base_url+query) as answer:
45-
parse = feedparser.parse(answer)
46-
47-
48-
ans = {}
16+
query = 'search_query=%s&sortBy=lastUpdatedDate' % (keyword_string)
4917

50-
for e in parse.entries:
51-
ans[e.title] = e.id
18+
with request.urlopen(base_url+query) as answer:
19+
parse = feedparser.parse(answer)
5220

53-
# print('\Link to the research-paper: {}'.format(e.id))
54-
# print('\n\nAbstract of the research-paper: {}'.format(e.summary))
55-
# print(e)
56-
# print('\n{} \t {}'.format(e.title, e.id))
57-
# print('Authors of the research-paper: {}'.format(e.authors))
21+
ans = {}
22+
23+
for e in parse.entries:
24+
ans[e.title] = e.id
5825

5926

27+
if len(parse.entries) == 0:
28+
print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
29+
# print(answer)
30+
6031

61-
if len(parse.entries) == 0:
62-
print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')
63-
print(answer)
64-
break
32+
return ans
6533

66-
return ans
34+
ans = get_relevant_papers('quantum computer networks')
35+
print(ans)

arxiv_querier/fp.py

-10
This file was deleted.
Binary file not shown.
16 Bytes
Binary file not shown.
16 Bytes
Binary file not shown.
16 Bytes
Binary file not shown.
16 Bytes
Binary file not shown.
16 Bytes
Binary file not shown.
Binary file not shown.

templates/summary.html

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
<html>
  <head>
    <title>Generate Summary</title>
  </head>
  <body>
    <!-- NOTE(review): form has no fields or action yet; placeholder page -->
    <form>
    </form>
  </body>
</html>

user/views.py

+41-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from django.http import HttpResponseRedirect
99
from django.urls import reverse
1010
from urllib.parse import urlencode
11-
from .views import *
11+
# from .views import *
1212
from django.contrib.auth.decorators import login_required
1313
from django.utils import timezone
1414
from gensim.summarization import keywords
@@ -25,6 +25,16 @@
2525
import nltk
2626
import language_check
2727

28+
# for querying arxiv
29+
import time
30+
import pickle
31+
import random
32+
import argparse
33+
import urllib.request as request
34+
import feedparser
35+
import http
36+
37+
2838
# Create your views here.
2939
@login_required(redirect_field_name='login')
3040
def userHomePage(request):
@@ -117,6 +127,7 @@ def grammarChecker(filepath):
117127
new_f.write(s)
118128
new_f.close()
119129

130+
120131
@login_required(redirect_field_name='login')
121132
def displaySummaryView(request):
122133
if request.method == 'GET':
@@ -167,3 +178,32 @@ def get_keyword_info(FILE_PATH):
167178
# ==============================================================
168179

169180

181+
# --------------------------------------------------------------------------
# arXiv querying
# --------------------------------------------------------------------------
def get_relevant_papers(keyword_string):
    """Search the arXiv API for *keyword_string*.

    Returns a dict mapping each result's title to its arXiv abstract URL;
    the dict is empty when the API returns no entries (e.g. rate limiting).
    Relies on this module's imports: ``urlencode``, ``request``, ``feedparser``.
    """
    base_url = 'http://export.arxiv.org/api/query?'  # base api query url
    print('Searching arXiv for %s' % (keyword_string,))

    # urlencode() percent-encodes the keywords (spaces -> '+'); the raw
    # string used previously produced an invalid request URL for any
    # multi-word keyword string.
    query = urlencode({'search_query': keyword_string,
                       'sortBy': 'lastUpdatedDate'})

    with request.urlopen(base_url + query) as answer:
        parse = feedparser.parse(answer)

    ans = {}
    for e in parse.entries:
        ans[e.title] = e.id

    if len(parse.entries) == 0:
        print('Received no results from arxiv. Rate limiting? Exiting. Restart later maybe.')

    return ans

0 commit comments

Comments
 (0)