Commit

add swft dataset
add twitter crawler script
holynull committed Apr 26, 2023
1 parent eba5d15 commit 5c50b56
Showing 6 changed files with 883 additions and 1 deletion.
4 changes: 3 additions & 1 deletion Makefile
@@ -5,4 +5,6 @@ start:
.PHONY: format
format:
black .
isort .
dataset_vector_swft:
python3 ingest_swft.py
658 changes: 658 additions & 0 deletions dataset/dataset 20230426-121810.csv

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions ingest_swft.py
@@ -0,0 +1,37 @@
"""Load html from files, clean up, split, ingest into Weaviate."""
import pickle

from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

from dotenv import load_dotenv
from pathlib import Path
import sys

# Resolve the script's own directory (both for a normal run and a frozen, e.g. PyInstaller,
# build) and load environment variables such as OPENAI_API_KEY from the .env file next to it.
if getattr(sys, 'frozen', False):
    script_location = Path(sys.executable).parent.resolve()
else:
    script_location = Path(__file__).parent.resolve()
load_dotenv(dotenv_path=script_location / '.env')

def ingest_docs():
    """Load the local dataset, split it into chunks, and persist a FAISS vectorstore."""
    # ReadTheDocsLoader parses HTML files under the given directory, so this step expects
    # crawled HTML pages (e.g. the output of swft_crawler.py) rather than raw CSV rows.
    loader = ReadTheDocsLoader("dataset/")
    raw_documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
)
documents = text_splitter.split_documents(raw_documents)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Save vectorstore
with open("vectorstore.pkl", "wb") as f:
pickle.dump(vectorstore, f)


if __name__ == "__main__":
ingest_docs()
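
A minimal sketch of how the pickled index produced by ingest_docs() might be loaded and queried later; the query string is only an illustration, and it assumes vectorstore.pkl exists and OPENAI_API_KEY is available for embedding the query:

import pickle

# Load the FAISS vectorstore written by ingest_docs().
with open("vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

# Embed the query with the same OpenAI embeddings and return the closest chunks.
docs = vectorstore.similarity_search("How does a SWFT cross-chain swap work?", k=4)
for doc in docs:
    print(doc.page_content[:200])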
66 changes: 66 additions & 0 deletions swft_crawler.py
@@ -0,0 +1,66 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_links(url):
    """Return absolute URLs of same-site links found on the page."""
    domain = urlparse(url).netloc

    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # Resolve relative hrefs against the current page so they can be fetched later.
            absolute_url = urljoin(url, link_url)
            link_domain = urlparse(absolute_url).netloc
            if link_domain == domain:
                urls.append(absolute_url)

    return urls

def save_page(url, content):
    parsed = urlparse(url)
    # Keep the path relative to a folder named after the domain, and fall back to
    # index.html for directory-style URLs so the result is always a writable file path.
    page_path = parsed.path.lstrip('/') or 'index.html'
    if page_path.endswith('/'):
        page_path += 'index.html'
    path = os.path.join(parsed.netloc, page_path)
    folder = os.path.dirname(path)

    if folder:
        # exist_ok avoids a race when several threads create the same folder.
        os.makedirs(folder, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

def process_url(current_url, visited_urls):
print(f"Visiting {current_url}")

try:
response = requests.get(current_url)
save_page(current_url, response.text)
new_links = get_links(current_url)

return new_links, visited_urls | {current_url}

except Exception as e:
print(f"Error processing {current_url}: {e}")
return [], visited_urls

def crawl_website(start_url, max_threads=10):
visited_urls = set()
urls_to_visit = [start_url]

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        while urls_to_visit:
            # Dedupe the batch and skip anything already crawled before submitting.
            batch = set(urls_to_visit) - visited_urls
            futures = {executor.submit(process_url, url, visited_urls): url for url in batch}

            urls_to_visit = []
            for future in as_completed(futures):
                new_links, visited = future.result()
                visited_urls |= visited  # union visited sets
                urls_to_visit.extend(link for link in new_links if link not in visited_urls)

print("Crawl complete.")

if __name__ == "__main__":
start_url = 'https://www.swft.pro/index.html'
crawl_website(start_url)
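
The crawler writes each page under a folder named after the site's domain (hence the committed www.swft.pro/index.html). A rough sketch, under the assumption that this crawled folder is what ingest_swft.py is ultimately meant to index, of pointing the same loader at that output instead of dataset/:

from langchain.document_loaders import ReadTheDocsLoader

# "www.swft.pro" is assumed here because save_page() names the folder after the domain.
loader = ReadTheDocsLoader("www.swft.pro/")
crawled_docs = loader.load()
print(f"Loaded {len(crawled_docs)} crawled pages")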
118 changes: 118 additions & 0 deletions swft_twitter_crawler.py
@@ -0,0 +1,118 @@
import os
import requests
import tweepy
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from getpass import getpass
from concurrent.futures import ThreadPoolExecutor, as_completed

# Enter your Twitter API credentials
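# App-only authentication: only the consumer key and secret are requested; no user
# access token is needed to read public timelines through the v1.1 API.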
consumer_key = getpass("Enter your consumer key: ")
consumer_secret = getpass("Enter your consumer secret: ")
def get_links(url):
    """Return absolute URLs of same-site links plus any twitter.com links on the page."""
    domain = urlparse(url).netloc

    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # Resolve relative hrefs so they can be fetched later.
            absolute_url = urljoin(url, link_url)
            link_domain = urlparse(absolute_url).netloc
            # Keep same-site links for crawling and twitter.com links so the callers
            # below can actually find Twitter profiles to scrape.
            if link_domain == domain or "twitter.com" in link_domain:
                urls.append(absolute_url)

    return urls

def get_twitter_api():
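    # Targets tweepy 3.x: wait_on_rate_limit_notify here and tweepy.TweepError in
    # get_tweets() were both removed in tweepy 4.x.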
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
return api

def get_twitter_username_from_url(url):
    parsed = urlparse(url)
    if "twitter.com" in parsed.netloc:
        # The username is the first non-empty path segment, e.g. twitter.com/<username>.
        path_parts = [part for part in parsed.path.split('/') if part]
        if path_parts:
            return path_parts[0]
    return None

def get_tweets(twitter_username, api, tweet_count=10):
try:
tweets = api.user_timeline(screen_name=twitter_username, count=tweet_count, tweet_mode='extended')
return [tweet.full_text for tweet in tweets]
except tweepy.TweepError as e:
print(f"Error retrieving tweets for {twitter_username}: {e.reason}")
return []

def save_tweets_to_file(username, tweets):
folder = "twitter_data"

if not os.path.exists(folder):
os.makedirs(folder)

with open(os.path.join(folder, f"{username}.txt"), 'w', encoding='utf-8') as f:
for tweet in tweets:
f.write(f"{tweet}\n\n")

def crawl_and_find_twitter_links(start_url):
    # Single-threaded helper that only collects twitter.com links; note that it is not
    # invoked by this script's __main__ entry point, which uses crawl_twitter_data below.
    visited_urls = set()
urls_to_visit = [start_url]
twitter_links = []

while urls_to_visit:
current_url = urls_to_visit.pop()
if current_url not in visited_urls:
print(f"Visiting {current_url}")
visited_urls.add(current_url)

try:
new_links = get_links(current_url)
twitter_links.extend(link for link in new_links if "twitter.com" in urlparse(link).netloc)
                # Keep twitter.com links out of the crawl frontier; they are collected above.
                urls_to_visit.extend(
                    link for link in new_links
                    if link not in visited_urls and "twitter.com" not in urlparse(link).netloc
                )
except Exception as e:
print(f"Error processing {current_url}: {e}")

return twitter_links

def process_link(current_url, twitter_api):
print(f"Visiting {current_url}")

try:
new_links = get_links(current_url)
twitter_links = [link for link in new_links if "twitter.com" in urlparse(link).netloc]

for twitter_link in set(twitter_links):
twitter_username = get_twitter_username_from_url(twitter_link)
if twitter_username:
tweets = get_tweets(twitter_username, twitter_api)
if tweets:
save_tweets_to_file(twitter_username, tweets)
print(f"Saved tweets for {twitter_username}")

return new_links

except Exception as e:
print(f"Error processing {current_url}: {e}")
return []

def crawl_twitter_data(start_url, max_threads=10):
twitter_api = get_twitter_api()

visited_urls = set()
urls_to_visit = [start_url]

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        while urls_to_visit:
            # Dedupe the batch and skip anything already crawled before submitting.
            batch = set(urls_to_visit) - visited_urls
            futures = {executor.submit(process_link, url, twitter_api): url for url in batch}
            visited_urls |= batch  # Mark the whole batch as visited

            urls_to_visit = []
            for future in as_completed(futures):
                new_links = future.result()
                urls_to_visit.extend(
                    link for link in new_links
                    # Skip visited pages and keep twitter.com profiles out of the crawl queue;
                    # their tweets are already fetched inside process_link().
                    if link not in visited_urls and "twitter.com" not in urlparse(link).netloc
                )

if __name__ == "__main__":
start_url = 'https://www.swft.pro/'
crawl_twitter_data(start_url)
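
The tweets saved under twitter_data/ could be folded into a vectorstore in the same way as the main dataset. A hedged sketch using langchain's plain-text loaders; the output file name and chunking settings simply mirror ingest_swft.py and are not part of this commit:

import pickle

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Load every "<username>.txt" file written by save_tweets_to_file().
loader = DirectoryLoader("twitter_data/", glob="*.txt", loader_cls=TextLoader)
raw_documents = loader.load()

# Split and embed with the same settings as ingest_swft.py.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = splitter.split_documents(raw_documents)
vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())

with open("vectorstore_tweets.pkl", "wb") as f:
    pickle.dump(vectorstore, f)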
1 change: 1 addition & 0 deletions www.swft.pro/index.html

Large diffs are not rendered by default.
