feat: add query param for choosing the language (ar & en)
anqorithm committed Nov 15, 2023
1 parent 83a61f0 commit 5140808
Showing 4 changed files with 135 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
crawle_alerts_ar
.env
13 changes: 9 additions & 4 deletions functions/alerts/get_alerts.py
@@ -8,11 +8,13 @@
def lambda_handler(event, context):
page = 1
limit = 10
+    lang = 'en'

query_params = event.get('queryStringParameters', {})
if query_params:
page = int(query_params.get('page', 1))
limit = int(query_params.get('limit', 10))
+        lang = query_params.get('lang', 'en')

if page < 1 or limit < 1:
return {
@@ -22,13 +24,15 @@ def lambda_handler(event, context):
}

try:
-        alerts, total_alerts = get_alerts(page, limit)
+        alerts, total_alerts = get_alerts(
+            page, limit, lang)
next_page_url = None

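        # ceiling division: the smallest number of pages that covers total_alerts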
total_pages = (total_alerts + limit - 1) // limit

if page < total_pages:
-            next_page_params = urlencode({'page': page + 1, 'limit': limit})
+            next_page_params = urlencode(
+                {'page': page + 1, 'limit': limit, 'lang': lang})
next_page_url = "https://1tozt5y6hl.execute-api.us-east-1.amazonaws.com/default/get_alerts?" + next_page_params

return {
@@ -50,14 +54,15 @@ def lambda_handler(event, context):
}


-def get_alerts(page, limit):
+def get_alerts(page, limit, lang):
try:
mongodb_uri = os.getenv('MONGO_URI')
client = MongoClient(mongodb_uri, serverSelectionTimeoutMS=5000)
client.server_info()

db = client['alerts_database']
-        collection = db['alerts']
+        collection_name = 'alerts_ar' if lang == 'ar' else 'alerts'
+        collection = db[collection_name]

skip = (page - 1) * limit

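With the new parameter wired through, the endpoint can be exercised as below (a minimal sketch: the URL is the one hard-coded in the handler above, and the exact response body shape is not shown in this diff):

import requests

# Endpoint hard-coded in lambda_handler above.
BASE_URL = "https://1tozt5y6hl.execute-api.us-east-1.amazonaws.com/default/get_alerts"

# lang=ar reads from the alerts_ar collection; any other value (or omitting
# the parameter entirely) falls back to the English 'alerts' collection.
resp = requests.get(BASE_URL, params={'page': 1, 'limit': 10, 'lang': 'ar'})
resp.raise_for_status()
print(resp.json())  # the next-page link should carry lang=ar forward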
124 changes: 124 additions & 0 deletions functions/crewler/crawle_alerts_ar.py
@@ -0,0 +1,124 @@
import os
import json
import requests
from pymongo import MongoClient
from bs4 import BeautifulSoup


def lambda_handler(event, context):
try:
from_page = int(event.get("from_page", 1))
to_page = int(event.get("to_page", 1)) + 1
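        # the +1 makes the caller's "to_page" inclusive in the range() below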

all_alerts = []
for i in range(from_page, to_page):
data = scrape_page(i)
all_alerts.extend(data)
store_in_mongodb(all_alerts)
return {
'statusCode': 200,
'body': json.dumps(f'Successfully processed pages {from_page} to {to_page - 1}')
}
except Exception as e:
return {
'statusCode': 500,
'body': json.dumps(f'Error: {str(e)}')
}


def store_in_mongodb(data):
mongodb_uri = os.getenv('MONGO_URI')
client = MongoClient(mongodb_uri)
db = client['alerts_database']
collection = db['alerts_ar']

for alert in data:
warning_number = alert['details'].get('warning_number')
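        # insert only if this warning number has not been stored before (dedupe)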
if not collection.find_one({'details.warning_number': warning_number}):
collection.insert_one(alert)
else:
print(
f"Alert with warning number {warning_number} already exists, skipping.")


def scrape_alert_details(alert_url):
response = requests.get(alert_url)
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
alert_details_div = soup.find(
'div', class_='cert-body cert-gray-70 m-3')

details = {}

if alert_details_div:
columns = alert_details_div.find('div', class_='row pb-5')
if columns:
left_col = columns.find(
'div', class_='col-5 col-md-auto cert-gray-50').find_all('p')
right_col = columns.find(
'div', class_='col-7 col-md-9 vertical-line pl-4').find_all('p')

keys = ["warning_date", "severity_level",
"warning_number", "target_sector"]
for key, value in zip(keys, right_col):
details[key] = value.get_text(strip=True)

paragraph_count = 1
list_item_count = 1
for child in alert_details_div.find_all(['p', 'li', 'strong']):
if child.name == 'p' and child.find('a'):
link_text = child.get_text(
strip=True).split('click')[0].strip()
details[f"link_{paragraph_count}"] = child.find(
'a').get('href', '')
paragraph_count += 1
elif child.name == 'li':
details[f"i_{list_item_count}"] = child.get_text(
strip=True)
list_item_count += 1
elif child.name == 'p':
details[f"p_{paragraph_count}"] = child.get_text(
strip=True)
paragraph_count += 1
elif child.name == 'strong':
strong_text = child.get_text(strip=True)
if strong_text:
details[f"strong_{paragraph_count}"] = strong_text
paragraph_count += 1

return details
else:
return f"Failed to retrieve alert details. Status code: {response.status_code}"


def scrape_page(page_number):
url = f"https://cert.gov.sa/ar/security-warnings/?page={page_number}"
response = requests.get(url)

if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')

alerts_severity = soup.find_all('div', class_='card-header')
alerts_title = soup.find_all('p', class_='cert-card-body-warning')
alert_images = soup.find_all(
'img', class_=['card-img-top', 'security-alerts-cover-image'])
alert_cards = soup.find_all(
'div', class_='card mb-4 light-gray-border')
alerts_data = []

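        # zip aligns severity, title, image and card positionally and stops at the shortest list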
for severity, title, image, card in zip(alerts_severity, alerts_title, alert_images, alert_cards):
alert_url = "https://cert.gov.sa" + card.find('a').get('href')
alert_details = scrape_alert_details(alert_url)

alert_info = {
"title": title.text.strip(),
"severity": severity.text.strip(),
"logo": "https://cert.gov.sa" + image.get('src'),
"alert_url": alert_url,
"details": alert_details
}
alerts_data.append(alert_info)

return alerts_data
else:
return f"Failed to retrieve data from page {page_number}. Status code: {response.status_code}"
File renamed without changes.
