get_wiki_words.py
"""Scrape the НКРЯ frequency lists from ru.wiktionary.org into ru_words.txt."""
import requests
from bs4 import BeautifulSoup

from utils import HEADERS

OUT_FILENAME = 'ru_words.txt'

# Frequency-list pages on ru.wiktionary.org (top 10 000 words by the Russian
# National Corpus, НКРЯ). The readable page titles below correspond to the
# percent-encoded URLs that are actually requested.
wiki_link = "https://ru.wiktionary.org/wiki/"
top1_100 = "Приложение:Список_частотности_по_НКРЯ"
top101_1000 = "Приложение:Список_частотности_по_НКРЯ/101—1000"
top1001_10000 = "Приложение:Список_частотности_по_НКРЯ/1001—10_000"

top1_100_wiki_link = "https://ru.wiktionary.org/wiki/%D0%9F%D1%80%D0%B8%D0%BB%D0%BE%D0%B6%D0%B5%D0%BD%D0%B8%D0%B5:%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%87%D0%B0%D1%81%D1%82%D0%BE%D1%82%D0%BD%D0%BE%D1%81%D1%82%D0%B8_%D0%BF%D0%BE_%D0%9D%D0%9A%D0%A0%D0%AF"
top101_1000_wiki_link = "https://ru.wiktionary.org/wiki/%D0%9F%D1%80%D0%B8%D0%BB%D0%BE%D0%B6%D0%B5%D0%BD%D0%B8%D0%B5:%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%87%D0%B0%D1%81%D1%82%D0%BE%D1%82%D0%BD%D0%BE%D1%81%D1%82%D0%B8_%D0%BF%D0%BE_%D0%9D%D0%9A%D0%A0%D0%AF/101%E2%80%941000"
top1001_10000_wiki_link = "https://ru.wiktionary.org/wiki/%D0%9F%D1%80%D0%B8%D0%BB%D0%BE%D0%B6%D0%B5%D0%BD%D0%B8%D0%B5:%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%87%D0%B0%D1%81%D1%82%D0%BE%D1%82%D0%BD%D0%BE%D1%81%D1%82%D0%B8_%D0%BF%D0%BE_%D0%9D%D0%9A%D0%A0%D0%AF/1001%E2%80%9410_000"

LINKS = (top1_100_wiki_link, top101_1000_wiki_link, top1001_10000_wiki_link)


def get_words_page(link):
    """Download one frequency-list page and return its words in page order."""
    session = requests.Session()
    response = session.get(link, headers=HEADERS)
    if response.status_code == 200:
        print(f"Successfully loaded `{requests.utils.unquote(link)}`")
    else:
        raise ValueError(
            f"Request to `{requests.utils.unquote(link)}` failed "
            f"with status {response.status_code}")

    # Each page keeps its word list in an ordered list (<ol>) inside the main
    # content div; entries without a plain string (e.g. nested markup) are skipped.
    words = []
    soup = BeautifulSoup(response.text, 'html.parser')
    for list_element in soup.find(
            'div', id="mw-content-text").find('ol').find_all('li'):
        word = list_element.string
        if word is not None:
            words.append(word)
    print(f'Successfully parsed `{requests.utils.unquote(link)}`')
    return words


def get_words():
    """Collect words from all three frequency-list pages."""
    words = []
    for link in LINKS:
        words.extend(get_words_page(link))
    return words


def delete_duplicates(words):
    """Drop repeated words while preserving their first-seen order."""
    return list(dict.fromkeys(words))


with open(OUT_FILENAME, 'w', encoding='utf-8') as out_f:
    words = get_words()
    print(f"Before removing duplicates: {len(words)}")
    words = delete_duplicates(words)
    print(f"After removing duplicates: {len(words)}")
    out_f.write('\n'.join(words))
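
The script imports HEADERS from a local utils module that is not shown on this page. Below is a minimal sketch of what that module might contain, assuming HEADERS is nothing more than a User-Agent dictionary used to identify the scraper to ru.wiktionary.org; the client name and contact address are placeholders, not taken from the original repository.

# utils.py — hypothetical companion module, sketched here for completeness.
# Assumption: HEADERS only needs to carry a descriptive User-Agent so that
# requests to ru.wiktionary.org are not served a generic or blocked response.
HEADERS = {
    "User-Agent": "get_wiki_words/0.1 (frequency-list scraper; contact: you@example.com)",
}

With a utils.py along these lines next to the script, running python get_wiki_words.py downloads the three frequency-list pages, removes duplicate entries, and writes the result to ru_words.txt, one word per line.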