-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmotherbot.py
executable file
·393 lines (359 loc) · 16.7 KB
/
motherbot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
#!/usr/bin/env python
""""
[MotherBot] Syntax/docbot by /u/num8lock
Desc: Model (logic module)
version: v.0.3
git:
CHANGES:
2016/12/02 Moved from SQLite3 to PostgreSQL, only version 3 data due to number
of columns limitation on free tier Heroku
Acknowledgement: Thanks to Alireza Savand for keeping html2text alive after
Aaron Swartz passed away.
"""
import os
import re
import ast
import logging, logging.config
from datetime import date, datetime
import copy as cp # to copy section
from random import randrange
from bs4 import BeautifulSoup as BS
import html2text # MAKE SURE html2text BODY_WIDTH config
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from docdb import Library
log = logging.getLogger(__name__)
# Load the logging config from the LOG_CFG env variable. Guard against the
# variable being unset: ast.literal_eval(None) raises TypeError, which would
# make a plain `import motherbot` crash. Fall back to the default logging
# configuration (lastResort handler) when no config is provided.
_log_cfg = os.getenv('LOG_CFG')
if _log_cfg:
    logging.config.dictConfig(ast.literal_eval(_log_cfg))
''' LOCAL TEST CONFIGS '''
# Python docs version being scraped; also used in output file names.
docvers = '3.5.2'
# Root directory of the local copy of the Python HTML docs to walk.
path = os.path.expanduser('~/Google Drive/docs/python-docs/{}/library/'.format(
    docvers))
def json_builder(data):
    """Serialize one definition payload to JSON and write it to ./json/.

    Args:
        data: definition dict; must contain a non-empty 'keywords' list —
            its first entry names the output file.

    Returns:
        bytes: the UTF-8 encoded JSON document that was written.

    Note: original author reported sqlalchemy rejected this as JSON; kept
    as a development-time artifact dump.
    """
    try:
        import simplejson as json
    except ImportError:
        # Expected, handled fallback — log as a warning, not an error.
        import json
        log.warning('Cannot import simplejson, use builtin json module instead.')
    serialized = json.dumps(data,
        sort_keys=True, ensure_ascii=False).encode('utf-8')
    # development: save json files.  Create the output directory on demand so
    # the open() below cannot fail with FileNotFoundError on a fresh checkout.
    os.makedirs('./json', exist_ok=True)
    filepath = './json/{0}.{1}.json'.format(data['keywords'][0], docvers)
    with open(filepath, 'wb') as jsonfile:
        jsonfile.write(serialized)
    log.info('%s', serialized)
    return serialized
def build_definitions(fullpath):
    """ This is the main function to initialize the definitions data
    Arguments: ==> TODO
        URL path
        datastore: database URL
        function name and query (optional for scraping new docs)
    Process: start by getting the doc file
        - get the chuncks of sections of definition
        - process markup conversion on each of section into parts,
            - keyword : exact syntax
            - header: function syntax & arguments
            - body: function definition
        - gather metadata into a dict: section link, h1 header
        - join all parts to a single section & store to database

    Reads module globals: h/html2text config, log, session (DB), docvers.
    Side effects: adds one Library row per definition and commits at the end.
    """
    # Initialize data from html source
    with open(fullpath, 'r', encoding='utf-8') as doc:
        _doc = BS(doc, 'lxml')
    # NOTE(review): datadump is never used anywhere below — dead variable.
    datadump = {}
    h = html2text.HTML2Text()
    h.ignore_links = False
    # Every <dl> is a candidate definition section (dt = signature, dd = body).
    _sections = _doc.find_all('dl')
    # in... <ul class="this-page-menu"> <...>
    # <li><a href="../_sources/library/functions.txt"
    _src = _doc.body.find(attrs={'class':'this-page-menu'})
    # Inline <script> in <head> carries var DOCUMENTATION_OPTIONS (version etc).
    _var_opt = ''.join(_doc.head.script.contents)
    ''' [ Metadata ] for naming & hyperlinks
    - Get Python version from the html page header, set by javascript in
    var DOCUMENTATIONS_OPTIONS
    - Get other info from url included to use for metadata, db & other variables
    '''
    _version = re.search(r'VERSION.*\'([\d\.]+)\'', _var_opt)
    _suffix = re.search(r'SUFFIX.*\'(\.\w+)\'', _var_opt)
    _part = re.search(r'\.\./_sources/([\w]+)/([\w.]+)\.*', str(_src))
    # Initialize metadata variables
    DOC_ROOT = 'https://docs.python.org'
    DOC_LONGVERSION = _version.group(1)  # i.e. 3.5.2
    DOC_VERSION = DOC_LONGVERSION[0]  # i.e. 3
    DOC_TOPIC = _part.group(1)  # i.e. library, references, etc
    DOC_MODULE = os.path.splitext(_part.group(2))[0]  # i.e, functions, etc
    DOC_SUFFIX = _suffix.group(1)
    DOC_VER_URL = '{}/{}/'.format(DOC_ROOT, DOC_VERSION)
    DOC_MODULE_URL = '{}/{}/{}/'.format(DOC_ROOT, DOC_VERSION, DOC_TOPIC)
    DOC_FULL_URL = '{}/{}/{}/{}{}'.format(DOC_ROOT, DOC_VERSION, DOC_TOPIC,
        DOC_MODULE, DOC_SUFFIX)
    # NOTE(review): `(DOC_ROOT or fullpath)` always evaluates to DOC_ROOT
    # (truthy string), so only `_part.group(1) == DOC_ROOT` is ever tested —
    # likely not what was intended; confirm against a glossary page.
    if _part.group(1) == (DOC_ROOT or fullpath) and DOC_MODULE == 'glossary':
        DOC_TOPIC = 'glossary'
    ''' Set database variables, TODO for analytic & logging purposes
    See DocBot_Schema.sql
    '''
    db_table = DOC_TOPIC.capitalize()
    # '3.5.2' -> version_id '352', major '3', minor '5', micro '2'
    version_id = ''.join(DOC_LONGVERSION.split('.'))
    major = version_id[0]
    minor = version_id[1]
    micro = version_id[2:]
    def transform_relative_links(arg):
        """ Replaces internal anchor refs and relative urls with absolute path
        group 1: same page href anchor,
        group 2: links to different page in the same Python doc section
        group 3: links to different sections
        Returns the rewritten URL string, or False on a regex error.
        """
        try:
            def subpattern(match):
                # Dispatch on whichever alternation group matched.
                if match.group(1):
                    return r'{}{}'.format(DOC_FULL_URL, match.group(1))
                elif match.group(2):
                    return r'{}{}'.format(DOC_MODULE_URL, match.group(2))
                elif match.group(3):
                    # Strip the leading '../' from the cross-section path.
                    strings = match.group(3)[ (match.start(3)+3) : match.end() ]
                    ''' This match.span example basically just for reminder '''
                    # debug(['<re matched group 3> : {} : {}{}'
                    #     .format(match.span(3), DOC_VER_URL, strings])
                    # )
                    return r'{}{}'.format(DOC_VER_URL, strings)
                else:
                    print('{} not found :(( ', match)
            urlsub = re.sub(
                r'^(#[\w\.]+)|^([\w]+\.htm.[#\w.]+)|^(\.{2}/[\w+|/+]*\.htm.#.*)',
                subpattern, arg)
            return urlsub
        # This might be the wrong way to catch exception
        except re.error as err:
            log.error('URL cannot be replace with regex: {}'.format(err),
                exc_info=True)
            return False
    def markdown_header(arg):
        """ HTML to Markdown headers: map tag name ('h1'..'h6') to prefix. """
        header = {'h1': '# ', 'h2': '## ', 'h3': '### ', 'h4': '#### ',
            'h5': '##### ', 'h6': '###### '}
        return header[arg]
    def markdown_special(tag):
        """ Some css class that used to mark important notes.

        Rewraps the tag in an emphasis/blockquote/bold tag when its css class
        is in _class; otherwise unwraps it.  Returns True when rewrapped,
        False otherwise.
        """
        _class = {'versionchanged': 'em', 'versionadded': 'blockquote',
            'versionmodified': 'em', 'admonition-title': 'b',
            'first': 'blockquote', 'last': 'blockquote',}
        # log.debug('Finding span class %s', arg)
        # no css class found, just go ahead unwrap
        if not 'class' in tag.attrs:
            tag.unwrap()
            return False
        for css_name in tag.attrs['class']:
            if css_name in _class:
                log.debug('Found class: %s in %s', tag.attrs['class'], tag)
                noted_tag = _doc.new_tag(_class[css_name])
                if tag.string is not None:
                    tag.string.strip()
                    # Wrap the inner text, then pad with a space so markdown
                    # emphasis does not glue onto the following word.
                    tag.string.wrap(noted_tag)
                    noted_tag.insert_after(_doc.new_string(' '))
                else:
                    # No single string child: wrap the whole tag, then drop
                    # the original tag, keeping its children inside noted_tag.
                    tag.wrap(noted_tag)
                    tag.unwrap()
                log.debug('Changed %s. Unwrapped? %s', noted_tag, tag)
                return True
            else:
                tag.unwrap()
                return False
    def valid(section):
        """[ Keywords ] Evaluate if section contains valid definition"""
        # log.debug('Evaluating section %s', section)
        try:
            if 'id' in section.dt.attrs:
                return True
            elif section.dt.code.next_sibling is not None:
                return True
            else:
                log.warn("Skipping: can't find keyword in %s", section.dt.code)
                return False
        except AttributeError as err:
            # Section has no <dt> (or no <code> inside it) — not a definition.
            log.error('Error, no keyword identifiers found.\n %s', err)
            return False
    def create_keywords(section):
        """ [ Keywords ] Extract definition type (class, method, etc) and
        keyword strings.

        Returns (keytype, keyclass, keyword_list), or False on error.
        NOTE(review): if neither the 'id' nor the 'descname' branch matches,
        _keyword is unbound and the NameError is NOT caught below (only
        AttributeError is) — confirm all doc pages hit one of the branches.
        """
        log.debug('Creating keyword')
        try:
            if 'id' in section.dt.attrs:
                keytype = section['class'][0]
                _keyword = section.dt['id']
                log.debug('creating... [id]: (%s) %s', keytype, _keyword)
            elif section.dt.find(class_='descname'):
                # for readability
                keytype = section['class'][0]
                if section.dt.find(class_='descclassname') is None:
                    log.debug('`Descclassname` not found')
                    _keyword = section.dt.find(class_='descname').string
                    log.debug('creating... [css] _keyword: %s', _keyword)
                elif section.dt.find(class_='descclassname') is not None:
                    log.debug('Found css `descclassname`')
                    descclass = section.dt.find(class_='descclassname').string
                    log.debug('Found css `descname`')
                    descname = section.dt.find(class_='descname').string
                    _keyword = '{}{}'.format(descclass, descname)
                    log.debug('creating... [css] _keyword: %s', _keyword)
            '''separate class from methods & functions based on the length of
            syntax we have so far. note: this is not in above loop
            `descclass name not found`, because [id]'''
            if len(_keyword.split('.')) == 1:
                log.debug('creating... One word syntax %s', _keyword)
                keyclass = keytype
                log.info('Created keywords: %s', _keyword)
                return keytype, keyclass, [_keyword]
            elif len(_keyword.split('.')) > 1:
                log.debug('creating... More than one word syntax %s', _keyword)
                _keys = _keyword.split('.', maxsplit=1)
                keyclass = _keys[0]
                # Build every dotted suffix, e.g. 'a.b.c' -> ['a.b.c', 'b.c', 'c']
                splitkeys = _keyword.split('.')
                keyword = []
                for i, val in enumerate(splitkeys):
                    keyword.append('.'.join(splitkeys[i : ]))
                log.info('Created keywords: %s', keyword)
                return keytype, keyclass, keyword
        except AttributeError as error:
            log.error('Keywords error: %s', (error))
            log.warn("Skipping: can't find keyword in %s", section.dt.code)
            return False
    def create_header(section):
        """ [ Header ] Convert anchors/relative URLs to absolute paths if exists
        then convert html markups to markdown.

        Returns (markdown_header_string, permalink_url_or_None).
        Mutates `section` in place (link rewriting, em/span/div rewraps).
        """
        log.debug('Creating header')
        if section.a is not None:
            internal_link = transform_relative_links(section.a['href'])
            section.a['href'] = internal_link
            del section.a['title']
            url = section.a['href']
        else:
            log.warn('No internal anchors/links found %s', section.string)
            url = None
        # < Hack > fix annoying trailing space in <em>class </em> to avoid
        # incorrect markdown formatting
        for em in section.find_all('em'):
            if em.string is not None:
                em.string = '_{0}_ '.format(em.string.rstrip())
                em.unwrap()
        # Change css styled spans & divs for important notes to respective tags
        for span in section.find_all('span'):
            markdown_special(span)
        for div in section.find_all('div'):
            markdown_special(div)
        # < Hack > around BS because making it output simple strings is like
        # getting your money back from asshole you misjudged a long time ago
        transform_header = []
        for content in section.dt.contents:
            transform_header.append(str(content).replace('\n', ''))
        # Add horizontal rule below header
        # Format header section to markdown, add opening link tag.
        tmp_header = '{0}[{1}'.format( markdown_header('h4'),
            h.handle(''.join(transform_header).strip()),
            )
        # remove double markdown link tag
        newtmp = tmp_header.replace(".``", ".")
        # string is immutable
        header = '{0}{1}'.format(newtmp.replace('[¶', ''), '-----')
        log.info('Constructed header: \n%s', header)
        return header, url
    def create_body(section):
        """ [ Body ] Convert anchors/relative URLs to absolute paths if exists
        Format the definition in Header.

        Returns the <dd> content converted to markdown (stripped string).
        """
        # log.debug('Creating body')
        for link in section.dd.select('a[href]'):
            if link is not None:
                transform_link = transform_relative_links(link.attrs['href'])
                link.attrs['href'] = transform_link
                del link['title']
            else:
                log.debug('Link? %s', link)
        transform_body =[]
        for content in section.dd.contents:
            transform_body.append(str(content))
        body = str(h.handle(''.join(transform_body))).strip()
        log.debug('Constructed body: \n%s', body)
        return body
    def create_footer(keyword):
        """ [ Footer ] Include infos in docbots replies.

        Picks a random keyword variant for the example PM link.
        """
        # NOTE(review): randrange(0, length) never yields the last index when
        # len(keyword) > 1 — looks off-by-one; confirm intent.
        length = len(keyword) - 1 if len(keyword) > 1 else 1
        randsyn = keyword[randrange(0, length)]
        msg_template = 'SyntaxBot --find {0} --version 3'.format(randsyn)
        # stolen from RemindMeBot
        pm_link = \
            'https://np.reddit.com/message/compose/?to={0}&subject={1}&message={2}' \
            ''.format('SyntaxBot', randsyn, msg_template)
        readme_link = 'https://www.reddit.com/r/SyntaxBot/'
        footer = '-----\n`>>>` [README]({0}) | `>>>` ' \
            '[Try get it from PM!]({1})'.format(readme_link, pm_link)
        log.debug('Created footer')
        return footer
    ''' Extract data from page '''
    for section in _sections:
        if 'docutils' in section['class']:
            log.warn('class docutils, Skipping, not a definition. %s', section)
            continue
        elif valid(section):
            # NOTE(review): create_keywords can return False on error, which
            # would raise TypeError on this unpack — verify upstream pages.
            keytype, keyclass, keyword = create_keywords(section)
            header, url = create_header(section)
            body = create_body(section)
            footer = create_footer(keyword)
            data = '{0} \n {1} \n {2}'.format(header, body, footer)
            # unpack keyword list
            keywords = ', '.join(keyword)
            log.info('(%s) keytype: %s, keyclass: %s, keyword: %s',
                version_id, keytype, keyclass, keyword)
            log.info('`%s` section done.\n',
                section.dt.find(class_='descname').string)
            ''' Store all the data
            Arguments:
            Database columns:
                table: Database table name
                version_id: Int, main variable for user <version> query
                major: Int(major).x.x, i.e. 3.5.2, 2.7.12
                minor: x.int(minor).x
                micro: x.x.int(micro)
                topic: Library (Python Doc **Section**)
                module: Python doc page name, based on module name
                keytype: Class, method, function or attribute
                keywords: Strings, main variable for user <syntax> query
                header: Syntax & argument part
                body: Definition part
                footer: Related document URLs & docbot information
                links
                url: Permalink to Python doc syntax definition
            # json structure
            json_data = {'meta': [{ 'version_id': version_id, 'major': major,
                            'micro': micro, 'minor': minor, },
                        {'module': DOC_MODULE, 'keyclass': keyclass,
                            'keytype': keytype, }],
                        'keywords': keyword, 'url': url,
                        'data': [ header, body, footer],
                        }
            json_builder(json_data)
            '''
            doc = Library(
                version_id=version_id, major=major,
                minor=minor, micro=micro,
                topic=DOC_TOPIC, module=DOC_MODULE, keytype=keytype,
                keyclass=keyclass, keywords=keywords, header=header,
                body=body, footer=footer, url=url
            )
            # commit only when all definitions added to session
            session.add(doc)
            session.flush()
    # commit the record the database and close connection
    session.commit()
# --- Script entry: connect to the DB and ingest every doc page under `path`.
db_config = os.getenv('DATABASE_URL')
engine = create_engine(db_config, echo=True, isolation_level="READ COMMITTED")
Session = sessionmaker(bind=engine)
session = Session()
try:
    # Walk the local doc tree; every file found is parsed as a doc page.
    for root, dirs, filenames in os.walk(path):
        for fname in filenames:
            fullpath = os.path.join(root, fname)
            # exc_info=True removed: outside an `except` block there is no
            # active exception, so it only appended "NoneType: None" noise.
            log.info('Start building definitions %s', fullpath)
            build_definitions(fullpath)
    # build_definitions(path)
finally:
    # Always release the DB connection, even when a page fails to parse.
    session.close()