Commit
add twitter crawler script
Showing 6 changed files with 883 additions and 1 deletion.
@@ -5,4 +5,6 @@ start:
.PHONY: format
format:
	black .
	isort .
dataset_vector_swft:
	python3 ingest_swft.py
Large diffs are not rendered by default.
@@ -0,0 +1,37 @@
"""Load html from files, clean up, split, and ingest into a FAISS vectorstore."""
import pickle

from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

from dotenv import load_dotenv
from pathlib import Path
import sys

# Resolve the script directory (works for both frozen executables and plain scripts)
# and load credentials such as OPENAI_API_KEY from a .env file next to it.
if getattr(sys, 'frozen', False):
    script_location = Path(sys.executable).parent.resolve()
else:
    script_location = Path(__file__).parent.resolve()
load_dotenv(dotenv_path=script_location / '.env')


def ingest_docs():
    """Load the crawled pages, split them into chunks, and embed them."""
    loader = ReadTheDocsLoader("dataset/")
    raw_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(raw_documents)
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save vectorstore
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vectorstore, f)


if __name__ == "__main__":
    ingest_docs()
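For reference, a minimal sketch of how the pickled index could be queried afterwards, assuming the same LangChain version as above and an OPENAI_API_KEY available in the environment (the query string and k are illustrative):

import pickle

# Load the FAISS index written by ingest_docs() above.
with open("vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

# Fetch the chunks most similar to an illustrative query.
docs = vectorstore.similarity_search("How do I start a swap?", k=4)
for doc in docs:
    print(doc.page_content[:200])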
@@ -0,0 +1,66 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed


def get_links(url):
    domain = urlparse(url).netloc

    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # Resolve relative hrefs against the current page so they can be fetched later.
            link_url = urljoin(url, link_url)
            link_domain = urlparse(link_url).netloc
            if not link_domain or link_domain == domain:
                urls.append(link_url)

    return urls


def save_page(url, content):
    domain = urlparse(url).netloc
    # Mirror the URL path under a folder named after the domain; default to index.html.
    url_path = urlparse(url).path.lstrip('/') or 'index.html'
    if url_path.endswith('/'):
        url_path += 'index.html'
    path = os.path.join(domain, url_path)
    folder = os.path.dirname(path)

    if folder and not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)


def process_url(current_url, visited_urls):
    print(f"Visiting {current_url}")

    try:
        response = requests.get(current_url)
        save_page(current_url, response.text)
        new_links = get_links(current_url)

        return new_links, visited_urls | {current_url}

    except Exception as e:
        print(f"Error processing {current_url}: {e}")
        return [], visited_urls


def crawl_website(start_url, max_threads=10):
    visited_urls = set()
    urls_to_visit = [start_url]

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        while urls_to_visit:
            futures = {executor.submit(process_url, url, visited_urls): url for url in urls_to_visit}

            urls_to_visit = []
            for future in as_completed(futures):
                new_links, visited = future.result()
                visited_urls |= visited  # union visited sets
                urls_to_visit.extend(link for link in new_links if link not in visited_urls)

    print("Crawl complete.")


if __name__ == "__main__":
    start_url = 'https://www.swft.pro/index.html'
    crawl_website(start_url)
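As a quick offline check of the same-domain filter used in get_links, the following sketch applies the same logic to an inline HTML snippet instead of a fetched page (the snippet and URLs are made up for illustration):

from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Illustrative HTML; in the crawler this text comes from requests.get(url).text.
html = '<a href="/about.html"></a><a href="https://twitter.com/example"></a><a href="faq.html"></a>'
base_url = 'https://www.swft.pro/index.html'
domain = urlparse(base_url).netloc

soup = BeautifulSoup(html, 'html.parser')
same_domain = []
for link in soup.find_all('a'):
    href = urljoin(base_url, link.get('href'))
    if urlparse(href).netloc == domain:
        same_domain.append(href)

print(same_domain)  # only the www.swft.pro links survive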
@@ -0,0 +1,118 @@
import os
import requests
import tweepy
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from getpass import getpass
from concurrent.futures import ThreadPoolExecutor, as_completed

# Enter your Twitter API credentials
consumer_key = getpass("Enter your consumer key: ")
consumer_secret = getpass("Enter your consumer secret: ")


def get_links(url):
    domain = urlparse(url).netloc

    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # Resolve relative hrefs against the current page.
            link_url = urljoin(url, link_url)
            link_domain = urlparse(link_url).netloc
            # Keep same-site links for crawling and twitter.com links for tweet collection.
            if not link_domain or link_domain == domain or "twitter.com" in link_domain:
                urls.append(link_url)

    return urls


def get_twitter_api():
    # Tweepy 3.x interface: AppAuthHandler and wait_on_rate_limit_notify were removed in Tweepy 4.
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    return api


def get_twitter_username_from_url(url):
    domain = urlparse(url).netloc
    if "twitter.com" in domain:
        return urlparse(url).path.split('/')[1]
    return None


def get_tweets(twitter_username, api, tweet_count=10):
    try:
        tweets = api.user_timeline(screen_name=twitter_username, count=tweet_count, tweet_mode='extended')
        return [tweet.full_text for tweet in tweets]
    except tweepy.TweepError as e:
        print(f"Error retrieving tweets for {twitter_username}: {e.reason}")
        return []


def save_tweets_to_file(username, tweets):
    folder = "twitter_data"

    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)

    with open(os.path.join(folder, f"{username}.txt"), 'w', encoding='utf-8') as f:
        for tweet in tweets:
            f.write(f"{tweet}\n\n")


def crawl_and_find_twitter_links(start_url):
    # Single-threaded variant; not used by crawl_twitter_data below.
    visited_urls = set()
    urls_to_visit = [start_url]
    twitter_links = []

    while urls_to_visit:
        current_url = urls_to_visit.pop()
        if current_url not in visited_urls:
            print(f"Visiting {current_url}")
            visited_urls.add(current_url)

            try:
                new_links = get_links(current_url)
                twitter_links.extend(link for link in new_links if "twitter.com" in urlparse(link).netloc)
                urls_to_visit.extend(link for link in new_links if link not in visited_urls)
            except Exception as e:
                print(f"Error processing {current_url}: {e}")

    return twitter_links


def process_link(current_url, twitter_api):
    print(f"Visiting {current_url}")

    try:
        new_links = get_links(current_url)
        twitter_links = [link for link in new_links if "twitter.com" in urlparse(link).netloc]

        for twitter_link in set(twitter_links):
            twitter_username = get_twitter_username_from_url(twitter_link)
            if twitter_username:
                tweets = get_tweets(twitter_username, twitter_api)
                if tweets:
                    save_tweets_to_file(twitter_username, tweets)
                    print(f"Saved tweets for {twitter_username}")

        # Continue crawling only the non-Twitter links.
        return [link for link in new_links if "twitter.com" not in urlparse(link).netloc]

    except Exception as e:
        print(f"Error processing {current_url}: {e}")
        return []


def crawl_twitter_data(start_url, max_threads=10):
    twitter_api = get_twitter_api()

    visited_urls = set()
    urls_to_visit = [start_url]

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        while urls_to_visit:
            futures = {executor.submit(process_link, url, twitter_api): url for url in urls_to_visit}
            visited_urls |= set(urls_to_visit)  # Mark the submitted batch as visited

            urls_to_visit = []
            for future in as_completed(futures):
                new_links = future.result()
                urls_to_visit.extend(
                    link for link in new_links if link not in visited_urls
                )


if __name__ == "__main__":
    start_url = 'https://www.swft.pro/'
    crawl_twitter_data(start_url)
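The script above targets the Tweepy 3.x interface (AppAuthHandler, TweepError, wait_on_rate_limit_notify), which was removed in Tweepy 4. As a hedged sketch, the same per-user lookup on Tweepy 4.x would go through tweepy.Client with a bearer token; the handle and token below are placeholders, not values from this repository:

import tweepy

def get_tweets_v2(username, bearer_token, tweet_count=10):
    """Tweepy 4.x / Twitter API v2 variant of get_tweets()."""
    client = tweepy.Client(bearer_token=bearer_token)
    try:
        user = client.get_user(username=username)
        # max_results must be between 5 and 100 in the v2 API.
        resp = client.get_users_tweets(user.data.id, max_results=tweet_count)
        return [tweet.text for tweet in (resp.data or [])]
    except tweepy.TweepyException as e:
        print(f"Error retrieving tweets for {username}: {e}")
        return []

# Example (placeholder handle and token):
# print(get_tweets_v2("example_handle", "YOUR_BEARER_TOKEN"))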
Large diffs are not rendered by default.