forked from bcorfman/raven-checkers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rules.py
143 lines (131 loc) · 4.85 KB
/
rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import re
import sys
class LinkRules(object):
"""Rules for recognizing external links."""
# For the link targets:
proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
extern = r'(?P<extern_addr>(?P<extern_proto>%s):.*)' % proto
interwiki = r'''
(?P<inter_wiki> [A-Z][a-zA-Z]+ ) :
(?P<inter_page> .* )
'''
def __init__(self):
self.addr_re = re.compile('|'.join([
self.extern,
self.interwiki,
]), re.X | re.U) # for addresses
class Rules(object):
"""Hold all the rules for generating regular expressions."""
# For the inline elements:
proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
link = r'''(?P<link>
\[\[
(?P<link_target>.+?) \s*
([|] \s* (?P<link_text>.+?) \s*)?
]]
)'''
image = r'''(?P<image>
{{
(?P<image_target>.+?) \s*
([|] \s* (?P<image_text>.+?) \s*)?
}}
)'''
macro = r'''(?P<macro>
<<
(?P<macro_name> \w+)
(\( (?P<macro_args> .*?) \))? \s*
([|] \s* (?P<macro_text> .+?) \s* )?
>>
)'''
code = r'(?P<code> {{{ (?P<code_text>.*?) }}} )'
emph = r'(?P<emph> (?<!:)// )' # there must be no : in front of the //
# avoids italic rendering in urls with
# unknown protocols
strong = r'(?P<strong> \*\* )'
linebreak = r'(?P<break> \\\\ )'
escape = r'(?P<escape> ~ (?P<escaped_char>\S) )'
char = r'(?P<char> . )'
# For the block elements:
separator = r'(?P<separator> ^ \s* ---- \s* $ )' # horizontal line
line = r'(?P<line> ^ \s* $ )' # empty line that separates paragraphs
head = r'''(?P<head>
^ \s*
(?P<head_head>=+) \s*
(?P<head_text> .*? ) \s*
(?P<head_tail>=*) \s*
$
)'''
text = r'(?P<text> .+ )'
list = r'''(?P<list>
^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $
( \n[ \t]* [*\#]+.* $ )*
)''' # Matches the whole list, separate items are parsed later. The
# list *must* start with a single bullet.
item = r'''(?P<item>
^ \s*
(?P<item_head> [\#*]+) \s*
(?P<item_text> .*?)
$
)''' # Matches single list items
pre = r'''(?P<pre>
^{{{ \s* $
(\n)?
(?P<pre_text>
([\#]!(?P<pre_kind>\w*?)(\s+.*)?$)?
(.|\n)+?
)
(\n)?
^}}} \s*$
)'''
pre_escape = r' ^(?P<indent>\s*) ~ (?P<rest> \}\}\} \s*) $'
table = r'''(?P<table>
^ \s*
[|].*? \s*
[|]? \s*
$
)'''
# For splitting table cells:
cell = r'''
\| \s*
(
(?P<head> [=][^|]+ ) |
(?P<cell> ( %s | [^|])+ )
) \s*
''' % '|'.join([link, macro, image, code])
def __init__(self, bloglike_lines=False, url_protocols=None,
wiki_words=False):
c = re.compile
# For pre escaping, in creole 1.0 done with ~:
self.pre_escape_re = c(self.pre_escape, re.M | re.X)
# for link descriptions
self.link_re = c('|'.join([self.image, self.linebreak,
self.char]), re.X | re.U)
# for list items
self.item_re = c(self.item, re.X | re.U | re.M)
# for table cells
self.cell_re = c(self.cell, re.X | re.U)
# For block elements:
if bloglike_lines:
self.text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?'
self.block_re = c('|'.join([self.line, self.head, self.separator,
self.pre, self.list, self.table,
self.text]), re.X | re.U | re.M)
# For inline elements:
if url_protocols is not None:
self.proto = '|'.join(re.escape(p) for p in url_protocols)
self.url = r'''(?P<url>
(^ | (?<=\s | [.,:;!?()/=]))
(?P<escaped_url>~)?
(?P<url_target> (?P<url_proto> %s ):\S+? )
($ | (?=\s | [,.:;!?()] (\s | $))))''' % self.proto
inline_elements = [self.link, self.url, self.macro,
self.code, self.image, self.strong,
self.emph, self.linebreak,
self.escape, self.char]
if wiki_words:
import unicodedata
up_case = u''.join(unichr(i) for i in xrange(sys.maxunicode)
if unicodedata.category(unichr(i))=='Lu')
self.wiki = ur'''(?P<wiki>[%s]\w+[%s]\w+)''' % (up_case, up_case)
inline_elements.insert(3, self.wiki)
self.inline_re = c('|'.join(inline_elements), re.X | re.U)