-
Notifications
You must be signed in to change notification settings - Fork 0
/
manage.py
146 lines (118 loc) · 4.53 KB
/
manage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf8 -*-
"""
manage.py tool for the webcrawler project.
Allows to manage database schema and run the crawler
Type `python manage.py -h` in a terminal for more information
"""
from __future__ import print_function
import argparse
import peewee
import inspect
import sys
from robot import models
from robot.crawler import Crawler
from robot.pagerank import compute_pagerank
from robot.tfidf import compute_tfidf, tfdidf_query, boolean_query
import settings
def get_model_classes():
    """Return (name, class) pairs for every model exported by ``robot.models``.

    Only classes listed in ``models.__all__`` are kept, so any class that
    is not part of the module's declared public API is filtered out.
    """
    exported = set(models.__all__)
    members = inspect.getmembers(models, inspect.isclass)
    return [(name, cls) for name, cls in members if name in exported]
# Commands definition
def syncdb(data):
    """Synchronize the database by creating the model tables.

    Parameters
    ----------
    data : dict
        Parsed command-line options (unused by this command).
    """
    print("Creating tables...")
    try:
        # Creation order: referenced tables first, then the tables that
        # hold foreign keys to them.
        models.Page.create_table()
        models.Word.create_table()
        models.Link.create_table()
        models.WordPage.create_table()
        models.User.create_table()
        models.UserQuery.create_table()
    except peewee.OperationalError as e:
        # Typically raised when a table already exists or the database
        # is unreachable; report it and continue.
        print("An error occurred: ", end='')
        print(e)
    print("Done!")
def cleandb(data):
    """Delete every row from all tables, leaving the schema in place.

    Parameters
    ----------
    data : dict
        Parsed command-line options (unused by this command).
    """
    print("Cleaning old database...")
    try:
        # Deletion order: child tables (foreign-key holders) first so
        # referential constraints are not violated.
        models.WordPage.delete().execute()
        models.Link.delete().execute()
        models.Word.delete().execute()
        models.Page.delete().execute()
        models.UserQuery.delete().execute()
        models.User.delete().execute()
    except peewee.OperationalError as e:
        # Typically raised when a table is missing or the database is
        # unreachable; report it and continue.
        print("An error occurred: ", end='')
        print(e)
    print("Done!")
def fill_user_database(data):
    """Populate the database with a few demo users and their query preferences.

    Parameters
    ----------
    data : dict
        Parsed command-line options (unused by this command).
    """
    # Demo accounts; get_or_create keeps re-runs from duplicating users.
    # NOTE(review): on peewee >= 3, get_or_create returns an
    # (instance, created) tuple rather than the instance — confirm the
    # installed peewee version returns the instance directly.
    maxime = models.User.get_or_create(name='maxime')
    xavier = models.User.get_or_create(name='xavier')
    marc = models.User.get_or_create(name='marc')
    # Per-user word preferences as (user, word, frequency) rows.
    preferences = [
        (maxime, 'python', 5),
        (maxime, 'php', 3),
        (xavier, 'tarantino', 4),
        (xavier, 'creative', 4),
        (xavier, 'commons', 4),
        (marc, 'paris', 3),
        (marc, 'c++', 6),
        (marc, 'reinhardt', 4),
    ]
    for user, word, frequency in preferences:
        models.UserQuery.create(user=user, word=word, frequency=frequency)
def run(data):
    """Open the database connection, reset the log file when requested,
    and run the crawler until it finishes or the user interrupts it.

    Parameters
    ----------
    data : dict
        Parsed command-line options; uses 'keeplogs' and 'logfile'.
    """
    settings.DATABASE.connect()
    if not data['keeplogs']:
        # Truncate the previous log file by opening it for writing.
        with open(data['logfile'], 'w'):
            pass
    crawler = Crawler()
    crawler.set_logfile(data['logfile'])
    try:
        crawler.start()
    except KeyboardInterrupt:
        # Ask the crawler to wind down instead of killing it abruptly.
        print("Stopping the crawler in few seconds...")
        crawler.stop = True
        sys.exit(0)
def tfidf(data):
    """Compute TF/IDF scores for the crawled corpus.

    Parameters
    ----------
    data : dict
        Parsed command-line options (unused by this command).
    """
    print("Start computing TF/IDF...")
    compute_tfidf()
    print("Done!")
def query(data):
    """Run a search with the selected method and print every result.

    Parameters
    ----------
    data : dict
        Parsed command-line options; uses 'method' and 'words'.
    """
    words = data['words']
    if data['method'] == 'tfidf':
        # Each positional argument may contain several space-separated
        # words; flatten them into a single token list.
        tokens = [token for chunk in words for token in chunk.split(' ')]
        sim = tfdidf_query(tokens)
    else:
        sim = boolean_query(' '.join(words))
    for item in sim:
        print(item)
def pagerank(data):
    """Compute the PageRank of every crawled page.

    Parameters
    ----------
    data : dict
        Parsed command-line options (unused by this command).
    """
    print("Start computing Pagerank...")
    compute_pagerank()
    print("Done! Results available in logs/")
# Map each CLI command name to its handler function.
commands = {'syncdb': syncdb, 'cleandb': cleandb, 'run': run,
            'tfidf': tfidf, 'query': query, 'pagerank': pagerank,
            'filluser': fill_user_database}


def _build_parser():
    """Build and return the argument parser for the management tool."""
    parser = argparse.ArgumentParser(
        description='Allows to manage database schema and run the crawler')
    parser.add_argument('command', type=str, choices=commands.keys(),
                        help='Command to execute')
    parser.add_argument('words', type=str, nargs='*',
                        help='words')
    parser.add_argument('--method', dest='method', default="tfidf",
                        help='Query method', choices=['tfidf', 'boolean'])
    parser.add_argument('--keep-logs', dest='keeplogs', action='store_true',
                        help='Keep previous logs of the crawler')
    parser.add_argument('--logfile', dest='logfile', default=settings.LOGFILE,
                        help='Log file destination')
    return parser


if __name__ == '__main__':
    # Guard the side effects: previously parsing and command execution ran
    # at import time, so merely importing this module would consume
    # sys.argv and run a command (or sys.exit on bad arguments).
    args = _build_parser().parse_args()
    data = {'logfile': args.logfile, 'keeplogs': args.keeplogs,
            'method': args.method, 'words': args.words}
    commands[args.command](data)