indexer.py
import math
import os
import re
import string

import nltk
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from nltk.stem import WordNetLemmatizer
from pymongo import MongoClient

# for debugging
from pprint import pprint

load_dotenv()

## client connection: local MongoDB in dev, Atlas otherwise
if os.environ.get("ENVIRONMENT") == "dev":
    client = MongoClient("mongodb://localhost:27017/")
else:
    client = MongoClient(os.environ.get("ATLAS_URI"))

## DB handle and a unique index so each word gets exactly one document
db = client["web-map"]
db.tags.create_index("word", unique=True)


def tagparse(data, priority):
    """Score the words inside a list of tags: each occurrence of a word
    adds the tag's priority to that word's running score."""
    lst_tags = {}
    try:
        for tag in data:
            for word in tag.text.strip().split(" "):
                word = re.sub("[%s]" % re.escape(string.punctuation), "", word).lower()
                if 2 <= len(word) <= 45:
                    # The original (score / priority + 1) * priority reduces
                    # to score + priority: each occurrence adds the priority.
                    lst_tags[word] = lst_tags.get(word, 0) + priority
        return lst_tags
    except AttributeError:
        # A non-tag element slipped through; return an empty dict so the
        # caller can still iterate over .items() (returning None here,
        # as the original did, would crash the caller).
        return {}
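

# A quick sketch of the scoring (the example HTML is an assumption, not from
# the repo): for "<h1>web crawler</h1><h1>crawler</h1>",
# tagparse(soup.findAll("h1"), 10) yields {"web": 10, "crawler": 20}.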


def bodyClean(body):
    """Strip markup from the <body>, then lowercase, de-punctuate,
    drop English stopwords, and lemmatize what remains."""
    body = str(body)
    CLEANR = re.compile(r"<[^>]*>")
    cleantext = CLEANR.sub("", body)
    pprint(cleantext)
    ## uncomment these when running for the first time
    # nltk.download('omw-1.4')
    # nltk.download('wordnet')
    # nltk.download('stopwords')
    stopwords = set(nltk.corpus.stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    doc = re.sub("[^a-zA-Z]", " ", cleantext).lower().split()
    doc = " ".join(lemmatizer.lemmatize(word) for word in doc if word not in stopwords)
    pprint(doc)
    return doc
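

# Example output (a sketch; the input string is an assumption): with NLTK's
# English stopword list and the default noun lemmatizer,
# bodyClean("<p>The crawlers are crawling</p>") returns "crawler crawling":
# "the" and "are" are stopwords, and "crawlers" lemmatizes to "crawler".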


def htmlparser(soup, url):
    print(soup.findAll("title"))
    print(soup.findAll("meta"))
    # NOTE: the cleaned body text is printed but not yet indexed.
    bodyClean(soup.find("body"))
    # Tag types weighted by how strongly their words describe the page.
    priorities = [
        tagparse(soup.findAll("h1"), 10),
        tagparse(soup.findAll("h2"), 9),
        tagparse(soup.findAll("h3"), 9),
        tagparse(soup.findAll("p"), 8),
        tagparse(soup.findAll("b"), 8),
        tagparse(soup.findAll("li"), 7),
    ]
    # For each word, file the URL under a bucket keyed by ceil(score / 10).
    for scores in priorities:
        for word, score in scores.items():
            db.tags.update_one(
                filter={"word": word},
                update={"$addToSet": {str(math.ceil(score / 10)): url}},
                upsert=True,
            )
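

# Minimal usage sketch (an assumption: the crawler that normally feeds
# htmlparser is not part of this file): fetch one page with requests, parse
# it with BeautifulSoup, and index it. The URL is a placeholder.
if __name__ == "__main__":
    url = "https://example.com/"
    resp = requests.get(url, timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")
    htmlparser(soup, url)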