"""
Analyze all joplin notes.
Requirements: pip install joppy markdown beautifulsoup4 nltk
Usage: API_TOKEN=XYZ python note_stats.py
"""
import os
import re
import string

import nltk
from bs4 import BeautifulSoup
from joppy.client_api import ClientApi
from markdown import Markdown
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize


def markdown_to_text(markdown_string: str) -> str:
    """Convert a Markdown string to plain text, excluding code, formulas and links."""
# convert markdown to html
md = Markdown(extensions=["nl2br", "sane_lists", "tables"])
html = md.convert(markdown_string)
exclude_patterns_html = (
r"<pre>.*?<\/pre>", # code
r"<code>.*?<\/code>", # code
r"\$.*?\$", # formulas (https://meta.stackexchange.com/a/263344)
)
for pattern in exclude_patterns_html:
html = re.sub(pattern, " ", html, flags=re.DOTALL)
# convert html to text
text = BeautifulSoup(html, "html.parser").get_text()
exclude_patterns_text = (r"http[A-Za-z0-9-._~:/?#\[\]@!$&'\(\)\*+,;=]*",) # links
for pattern in exclude_patterns_text:
text = re.sub(pattern, " ", text, flags=re.DOTALL)
return text
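
# Illustrative sanity check for markdown_to_text (a sketch; the exact whitespace
# of the result depends on the installed markdown and beautifulsoup4 versions):
#
#   markdown_to_text("# Title\n\nSome `inline code` and a [link](https://example.com)")
#   # -> roughly "Title\nSome and a link": the code span and the URL are stripped,
#   # the heading and the link text are kept.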


def analyze_text(text: str):
    """Print the total word count and the ten most common words of the text."""
tokens = word_tokenize(text)
    tokens = [
        # normalize to lower case
        word.lower()
        for word in tokens
        # exclude tokenizer artifacts
        if word not in ("...", "''", "``", "--", "++")
        # exclude punctuation
        and word not in string.punctuation
        # exclude single character words
        and len(word) > 1
        # exclude words containing at least one digit
        and not any(character.isdigit() for character in word)
    ]
print("Words:", len(tokens))
    # exclude the most common words (stop words); build the set once, not per token
    stop_words = set(stopwords.words("english") + stopwords.words("german"))
    tokens = [word for word in tokens if word not in stop_words]
fdist = FreqDist(tokens)
    # fdist.plot(50)  # uncomment to plot the distribution (requires matplotlib)
print("Most common words:")
for word, count in fdist.most_common(10):
print(f"- {word}: {count}")
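
# analyze_text prints output of the following shape (the numbers and words
# below are illustrative, not real results):
#
#   Words: 12345
#   Most common words:
#   - note: 321
#   - joplin: 210
#   ...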


def main():
    # Download the required nltk data at the first start:
    # the "punkt" tokenizer and the "stopwords" corpus are needed.
    if False:
        nltk.download("punkt")
        nltk.download("stopwords")
# get all notes from joplin
api = ClientApi(token=os.getenv("API_TOKEN"))
notes = api.get_all_notes(fields="id,title,body")
print("Notes:", len(notes))
# concatenate and convert them to text
text = markdown_to_text("\n".join(note.body for note in notes))
# analyze them
analyze_text(text)


if __name__ == "__main__":
main()