-
Notifications
You must be signed in to change notification settings - Fork 0
/
setup.py
144 lines (137 loc) · 4.22 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import sys
import re
import os.path
from optparse import OptionParser, OptionGroup
from work import process
class Config(object):
    """Run configuration built from an argv-style argument list.

    The constructor parses the arguments with ``optparse``, validates the
    positional URL and the work-file path, and stores the results as
    instance attributes (``url``, ``workFile``, ``SIM_THRESHOLD``,
    ``MIN_CHILDREN_COUNT``, ``MIN_DEEP``, ``MIN_SIMILAR_COUNT``,
    ``webdriver``, ``chromeDriverPath``).

    On any validation failure the help text is printed and
    ``OptionParser.error`` terminates the process (``SystemExit``).
    """

    # settings (placeholders; the real values are filled in by verify())
    SIM_THRESHOLD = -1
    MIN_CHILDREN_COUNT = -1
    MIN_DEEP = -1
    MIN_SIMILAR_COUNT = -1
    url = ''
    workFile = ''
    webdriver = ''
    chromeDriverPath = ''
    parser = None

    # options
    usage = 'Usage: %prog <URL> [-w workfile] [options]'

    # Options registered in the "Other options" group.  Each entry's
    # "short" (and optional "long") value is passed to add_option() as an
    # option string; the remaining keys are passed as keyword arguments.
    parser_system_options = [
        # system
        {
            "short": "-w",
            "long": "--work-file",
            "action": "store",
            "dest": "workFile",
            "default": "./work.py",
            "help": "Work file path",
            "type": "string"
        },
        {
            "short": "--sim-threshold",
            "action": "store",
            "dest": "threshold",
            "default": 0.75,
            "help": "Similarity threshold",
            "type": "float"
        },
        {
            "short": "--min-children-count",
            "action": "store",
            "dest": "minChildrenCount",
            "default": 4,
            "help": "Min children count of a DOM",
            "type": "int"
        },
        {
            "short": "--min-children-deep",
            "action": "store",
            "dest": "minDeep",
            "default": 2,
            "help": "Minimum deep of children of a DOM",
            "type": "int"
        },
        {
            "short": "--min-similar-count",
            "action": "store",
            "dest": "minSimilar",
            "default": 1,
            "help": "Minimum count of a set of similar DOMs",
            "type": "int"
        },
    ]

    # Options registered directly on the parser (outside any group).
    parser_options = [
        {
            "short": "-d",
            "long": "--webdriver",
            "action": "store",
            "dest": "webdriver",
            "default": "phantomjs",
            "help": "Web Driver",
            "type": "string"
        },
        {
            "short": "--chrome-driver-path",
            "action": "store",
            "dest": "chromeDriverPath",
            "default": "./chromedriver",
            "help": "Chromedriver path",
            "type": "string"
        },
    ]

    # Pattern the positional URL must match (anchored at the start only,
    # because it is applied with match()).
    _URL_PATTERN = re.compile(
        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)' +
        r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s(' +
        r')<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
    )

    def __init__(self, args):
        """Parse and validate ``args`` (see :meth:`verify`)."""
        self.verify(args)

    @staticmethod
    def _add_options(target, specs):
        """Register every option description in ``specs`` on ``target``.

        ``target`` is an ``OptionParser`` or ``OptionGroup``; ``specs`` is a
        list of dicts shaped like the entries of ``parser_options``.
        """
        for spec in specs:
            target.add_option(
                spec['short'],
                # A missing "long" name is passed as None, which optparse
                # filters out of the option strings.
                spec.get('long', None),
                action=spec['action'],
                dest=spec['dest'],
                default=spec['default'],
                help=spec['help'],
                type=spec['type'])

    def verify(self, args):
        """Parse ``args`` and populate this instance's settings.

        ``args`` is a full argv-style list: the program name is expected at
        index 0, so the URL is the positional argument at index 1.

        Exits through ``OptionParser.error`` (raising ``SystemExit``) when
        the URL is missing, the URL does not match the URL pattern, or the
        work file is not an existing regular file.
        """
        parser = OptionParser(usage=self.usage)
        self.parser = parser

        self._add_options(parser, self.parser_options)
        group = OptionGroup(
            parser, "Other options",
            "Caution: These options usually use default values.")
        self._add_options(group, self.parser_system_options)
        parser.add_option_group(group)

        (options, positional) = parser.parse_args(args)

        # url
        if len(positional) < 2:
            parser.print_help()
            parser.error("Need URL")
        self.url = positional[1]
        if self._URL_PATTERN.match(self.url) is None:
            parser.print_help()
            parser.error("Invalid URL")

        # work file
        self.workFile = options.workFile
        if not os.path.isfile(self.workFile):
            parser.print_help()
            parser.error("Work file " + self.workFile + " does not exist")

        # other
        self.SIM_THRESHOLD = options.threshold
        self.MIN_CHILDREN_COUNT = options.minChildrenCount
        self.MIN_DEEP = options.minDeep
        self.MIN_SIMILAR_COUNT = options.minSimilar
        self.webdriver = options.webdriver
        self.chromeDriverPath = options.chromeDriverPath
if __name__ == "__main__":
    # Script entry point: build the configuration from the full argument
    # vector (program name included) and hand it to the work routine.
    process(Config(sys.argv))