-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
133 lines (103 loc) · 3.82 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
Script to parse the contents of a codelab into JSON.
"""
import logging
import os
import re
import yaml
import requests
logging.basicConfig(level=logging.INFO)
# Root directory under which codelab repos are checked out.
base_dir_path = 'repos'
# Per-repo markdown file holding the codelab content.
doc_file = 'doc.md'
# Per-repo YAML file holding the codelab metadata.
metadata_file = 'metadata.yaml'
# Accumulator for all parsed repo data; printed at the end of generate_data().
data = {base_dir_path: {}}
# Matches IMPORT('<path>', '<tag>') directives embedded in the markdown.
IMPORT_REGEXP = re.compile(r"IMPORT\('(\S+?)',\s*'(\w+?)'\)")
# Maps a source-file extension to its comment prefix, used to locate the
# BEGIN()/END() markers inside imported code files.
comment_chars_map = {
    '.dart': '//',
    '.py': '#',
    '.html': '<!--'
}
"""Reads the contents of the markdown file."""
def read_md_file(dirpath, filename):
file_path = os.path.join(dirpath, filename)
data[base_dir_path][os.path.basename(dirpath)] = {}
with open(file_path) as fh:
md = fh.read()
return md
"""Returns a list of IMPORTs in a markdown file. Each entry in the list is
itself a two-item list, where the first item is the import path and second item
is the name of the import tag."""
def extract_import_tags(text):
tags = []
matches = re.finditer(IMPORT_REGEXP, text)
for match in matches:
tags.append([match.group(1), match.group(2)])
return tags
"""Makes a request to GitHub to get the file used in an IMPORT."""
def fetch_import_code(path):
# TODO(shailen): add some error handling code.
return requests.get(path).content
"""Generates the string used in the regexp for getting the contents between
BEGIN() and END() tags."""
def get_begin_end_regexp_string(path, tag):
file_suffix = os.path.splitext(path)[1]
comment_chars = comment_chars_map[file_suffix]
# TODO(shailen): de-uglify this. Seriously.
return r"%s\s*?BEGIN\(%s\)([\s\S]+?)%s\s*?END\(%s\)" % (
comment_chars, tag, comment_chars, tag)
"""Gets code snippet defined by an IMPORT and ensures four spaces of leading
whitespace required by markdown."""
def get_code_snippet(match):
snippet = match.group(1)
snippet_lines = snippet.split('\n')
for line in snippet_lines:
if line.strip():
# Get leading whitspace.
lspace_len = len(line) - len(line.lstrip())
break
space = ' '
return '\n'.join(
["%s%s" % (space * 4, line[lspace_len:]) for line in snippet_lines]
)
"""Reads content of a markdown file, substituting IMPORTS with code
snippets, and returns the converted markdown."""
def process_md_file_with_imports(dirpath, filename):
md = read_md_file(dirpath, filename)
# Assuming for now that all IMPORT tags use urls.
imports = extract_import_tags(md)
for _import in imports:
path, tag = _import
code = fetch_import_code(path)
matches = re.finditer(re.compile(
get_begin_end_regexp_string(path, tag),
re.MULTILINE), code)
for match in matches:
snippet = get_code_snippet(match)
md = re.sub(re.compile(r"IMPORT\('%s', '%s'\)" %
(path, tag)), snippet, md)
return md
"""Generates the key, comprising of the org name and the repo name, used for
storing repo data as JSON."""
def get_org_and_repo(dirpath):
repo = os.path.basename(dirpath)
org = os.path.split(os.path.dirname(dirpath))[1]
return os.path.join(org, repo)
"""Generates the metadata and code for a repo as JSON."""
def generate_data(base_dir_path):
for dirpath, subdirs, filenames in os.walk(base_dir_path):
for filename in filenames:
if filename != doc_file:
continue
md = process_md_file_with_imports(dirpath, filename)
for filename in os.listdir(dirpath):
if filename != metadata_file:
continue
with open(os.path.join(dirpath, metadata_file),
'r') as fh:
metadata = yaml.load(fh)
key = get_org_and_repo(dirpath)
data[base_dir_path][key] = metadata
data[base_dir_path][key]['md'] = md
print(data)
if __name__ == "__main__":
generate_data(base_dir_path)