webScraper.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


def scrape_website(url, visited=None):
    # Default to None rather than a mutable set() so each top-level call
    # starts with a fresh visited set (a mutable default argument is shared
    # across calls in Python).
    if visited is None:
        visited = set()
    print(f"Scraping: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an HTTPError for 4xx/5xx responses
    except requests.exceptions.RequestException as err:
        # Catches HTTP errors as well as connection and timeout failures
        print(f"Request failed: {err} - URL: {url}")
        return  # Skip this page and continue the crawl
    soup = BeautifulSoup(response.content, 'html.parser')
    visited.add(url)
    # Save the text of every <p> element to file
    text_content = '\n'.join(element.text for element in soup.find_all('p'))
    with open('dailyIllini2.txt', 'a', encoding='utf-8') as file:
        file.write(f'\n\n--- Content from {url} ---\n\n{text_content}\n')
    # Follow every link on the page
    for a_tag in soup.find_all('a', href=True):
        link = urljoin(url, a_tag['href'])
        # Only visit links on the same domain that have not been seen yet
        if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
            scrape_website(link, visited)
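
# Note: a deeply nested site can overflow Python's default recursion limit
# (about 1000 frames) with the recursive crawl above. Below is a minimal
# iterative sketch that uses an explicit queue instead of recursion.
# scrape_website_iterative is a hypothetical name, not part of the original
# script, and for brevity it only walks links (it skips the file-writing step).
def scrape_website_iterative(start_url):
    from collections import deque
    visited = set()
    queue = deque([start_url])
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        print(f"Scraping: {url}")
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as err:
            print(f"Request failed: {err} - URL: {url}")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        # Enqueue same-domain links for later processing
        for a_tag in soup.find_all('a', href=True):
            link = urljoin(url, a_tag['href'])
            if urlparse(link).netloc == urlparse(start_url).netloc:
                queue.append(link)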

# Usage:
if __name__ == '__main__':
    url = 'https://dailyillini.com/'  # Replace with the URL of the website you want to scrape
    scrape_website(url)