Commit

add swft dataset
add twitter crawler script
holynull committed Apr 26, 2023
1 parent eba5d15 commit 5c50b56
Showing 6 changed files with 883 additions and 1 deletion.
4 changes: 3 additions & 1 deletion Makefile
@@ -5,4 +5,6 @@ start:
.PHONY: format
format:
black .
isort .
dataset_vector_swft:
python3 ingest_swft.py
658 changes: 658 additions & 0 deletions dataset/dataset 20230426-121810.csv

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions ingest_swft.py
@@ -0,0 +1,37 @@
"""Load html from files, clean up, split, ingest into Weaviate."""
import pickle

from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

from dotenv import load_dotenv
from pathlib import Path
import sys

# Resolve the script's own directory (both for a normal run and a frozen, e.g. PyInstaller,
# build) and load environment variables such as OPENAI_API_KEY from the .env file next to it.
if getattr(sys, 'frozen', False):
    script_location = Path(sys.executable).parent.resolve()
else:
    script_location = Path(__file__).parent.resolve()
load_dotenv(dotenv_path=script_location / '.env')

def ingest_docs():
    """Load the local dataset, split it into chunks, and persist a FAISS vectorstore."""
    # ReadTheDocsLoader parses HTML files under the given directory, so this step expects
    # crawled HTML pages (e.g. the output of swft_crawler.py) rather than raw CSV rows.
    loader = ReadTheDocsLoader("dataset/")
    raw_documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
)
documents = text_splitter.split_documents(raw_documents)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Save vectorstore
with open("vectorstore.pkl", "wb") as f:
pickle.dump(vectorstore, f)


if __name__ == "__main__":
ingest_docs()
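
A minimal sketch of how the pickled index produced by ingest_docs() might be loaded and queried later; the query string is only an illustration, and it assumes vectorstore.pkl exists and OPENAI_API_KEY is available for embedding the query:

import pickle

# Load the FAISS vectorstore written by ingest_docs().
with open("vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

# Embed the query with the same OpenAI embeddings and return the closest chunks.
docs = vectorstore.similarity_search("How does a SWFT cross-chain swap work?", k=4)
for doc in docs:
    print(doc.page_content[:200])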
66 changes: 66 additions & 0 deletions swft_crawler.py
@@ -0,0 +1,66 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_links(url):
    """Return absolute URLs of same-site links found on the page."""
    domain = urlparse(url).netloc

    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # Resolve relative hrefs against the current page so they can be fetched later.
            absolute_url = urljoin(url, link_url)
            link_domain = urlparse(absolute_url).netloc
            if link_domain == domain:
                urls.append(absolute_url)

    return urls

def save_page(url, content):
    parsed = urlparse(url)
    # Keep the path relative to a folder named after the domain, and fall back to
    # index.html for directory-style URLs so the result is always a writable file path.
    page_path = parsed.path.lstrip('/') or 'index.html'
    if page_path.endswith('/'):
        page_path += 'index.html'
    path = os.path.join(parsed.netloc, page_path)
    folder = os.path.dirname(path)

    if folder:
        # exist_ok avoids a race when several threads create the same folder.
        os.makedirs(folder, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

def process_url(current_url, visited_urls):
print(f"Visiting {current_url}")

try:
response = requests.get(current_url)
save_page(current_url, response.text)
new_links = get_links(current_url)

return new_links, visited_urls | {current_url}

except Exception as e:
print(f"Error processing {current_url}: {e}")
return [], visited_urls

def crawl_website(start_url, max_threads=10):
visited_urls = set()
urls_to_visit = [start_url]

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        while urls_to_visit:
            # Dedupe the batch and skip anything already crawled before submitting.
            batch = set(urls_to_visit) - visited_urls
            futures = {executor.submit(process_url, url, visited_urls): url for url in batch}

            urls_to_visit = []
            for future in as_completed(futures):
                new_links, visited = future.result()
                visited_urls |= visited  # union visited sets
                urls_to_visit.extend(link for link in new_links if link not in visited_urls)

print("Crawl complete.")

if __name__ == "__main__":
start_url = 'https://www.swft.pro/index.html'
crawl_website(start_url)
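
The crawler writes each page under a folder named after the site's domain (hence the committed www.swft.pro/index.html). A rough sketch, under the assumption that this crawled folder is what ingest_swft.py is ultimately meant to index, of pointing the same loader at that output instead of dataset/:

from langchain.document_loaders import ReadTheDocsLoader

# "www.swft.pro" is assumed here because save_page() names the folder after the domain.
loader = ReadTheDocsLoader("www.swft.pro/")
crawled_docs = loader.load()
print(f"Loaded {len(crawled_docs)} crawled pages")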
118 changes: 118 additions & 0 deletions swft_twitter_crawler.py
@@ -0,0 +1,118 @@
import os
import requests
import tweepy
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from getpass import getpass
from concurrent.futures import ThreadPoolExecutor, as_completed

# Enter your Twitter API credentials
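# App-only authentication: only the consumer key and secret are requested; no user
# access token is needed to read public timelines through the v1.1 API.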
consumer_key = getpass("Enter your consumer key: ")
consumer_secret = getpass("Enter your consumer secret: ")
def get_links(url):
    """Return absolute URLs of same-site links plus any twitter.com links on the page."""
    domain = urlparse(url).netloc

    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # Resolve relative hrefs so they can be fetched later.
            absolute_url = urljoin(url, link_url)
            link_domain = urlparse(absolute_url).netloc
            # Keep same-site links for crawling and twitter.com links so the callers
            # below can actually find Twitter profiles to scrape.
            if link_domain == domain or "twitter.com" in link_domain:
                urls.append(absolute_url)

    return urls

def get_twitter_api():
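    # Targets tweepy 3.x: wait_on_rate_limit_notify here and tweepy.TweepError in
    # get_tweets() were both removed in tweepy 4.x.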
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
return api

def get_twitter_username_from_url(url):
    parsed = urlparse(url)
    if "twitter.com" in parsed.netloc:
        # The username is the first non-empty path segment, e.g. twitter.com/<username>.
        path_parts = [part for part in parsed.path.split('/') if part]
        if path_parts:
            return path_parts[0]
    return None

def get_tweets(twitter_username, api, tweet_count=10):
try:
tweets = api.user_timeline(screen_name=twitter_username, count=tweet_count, tweet_mode='extended')
return [tweet.full_text for tweet in tweets]
except tweepy.TweepError as e:
print(f"Error retrieving tweets for {twitter_username}: {e.reason}")
return []

def save_tweets_to_file(username, tweets):
folder = "twitter_data"

if not os.path.exists(folder):
os.makedirs(folder)

with open(os.path.join(folder, f"{username}.txt"), 'w', encoding='utf-8') as f:
for tweet in tweets:
f.write(f"{tweet}\n\n")

def crawl_and_find_twitter_links(start_url):
    # Single-threaded helper that only collects twitter.com links; note that it is not
    # invoked by this script's __main__ entry point, which uses crawl_twitter_data below.
    visited_urls = set()
urls_to_visit = [start_url]
twitter_links = []

while urls_to_visit:
current_url = urls_to_visit.pop()
if current_url not in visited_urls:
print(f"Visiting {current_url}")
visited_urls.add(current_url)

try:
new_links = get_links(current_url)
twitter_links.extend(link for link in new_links if "twitter.com" in urlparse(link).netloc)
                # Keep twitter.com links out of the crawl frontier; they are collected above.
                urls_to_visit.extend(
                    link for link in new_links
                    if link not in visited_urls and "twitter.com" not in urlparse(link).netloc
                )
except Exception as e:
print(f"Error processing {current_url}: {e}")

return twitter_links

def process_link(current_url, twitter_api):
print(f"Visiting {current_url}")

try:
new_links = get_links(current_url)
twitter_links = [link for link in new_links if "twitter.com" in urlparse(link).netloc]

for twitter_link in set(twitter_links):
twitter_username = get_twitter_username_from_url(twitter_link)
if twitter_username:
tweets = get_tweets(twitter_username, twitter_api)
if tweets:
save_tweets_to_file(twitter_username, tweets)
print(f"Saved tweets for {twitter_username}")

return new_links

except Exception as e:
print(f"Error processing {current_url}: {e}")
return []

def crawl_twitter_data(start_url, max_threads=10):
twitter_api = get_twitter_api()

visited_urls = set()
urls_to_visit = [start_url]

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        while urls_to_visit:
            # Dedupe the batch and skip anything already crawled before submitting.
            batch = set(urls_to_visit) - visited_urls
            futures = {executor.submit(process_link, url, twitter_api): url for url in batch}
            visited_urls |= batch  # Mark the whole batch as visited

            urls_to_visit = []
            for future in as_completed(futures):
                new_links = future.result()
                urls_to_visit.extend(
                    link for link in new_links
                    # Skip visited pages and keep twitter.com profiles out of the crawl queue;
                    # their tweets are already fetched inside process_link().
                    if link not in visited_urls and "twitter.com" not in urlparse(link).netloc
                )

if __name__ == "__main__":
start_url = 'https://www.swft.pro/'
crawl_twitter_data(start_url)
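
The tweets saved under twitter_data/ could be folded into a vectorstore in the same way as the main dataset. A hedged sketch using langchain's plain-text loaders; the output file name and chunking settings simply mirror ingest_swft.py and are not part of this commit:

import pickle

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Load every "<username>.txt" file written by save_tweets_to_file().
loader = DirectoryLoader("twitter_data/", glob="*.txt", loader_cls=TextLoader)
raw_documents = loader.load()

# Split and embed with the same settings as ingest_swft.py.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = splitter.split_documents(raw_documents)
vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())

with open("vectorstore_tweets.pkl", "wb") as f:
    pickle.dump(vectorstore, f)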
1 change: 1 addition & 0 deletions www.swft.pro/index.html

Large diffs are not rendered by default.
