From 978d4ece6f76fa6c11abe1b09d5e65db4c516816 Mon Sep 17 00:00:00 2001
From: Robert
Date: Mon, 21 Oct 2024 20:05:47 -0700
Subject: [PATCH] Delete Extract_Bookmark_URLs.py

---
 .../Web_Scraping/Extract_Bookmark_URLs.py     | 171 ------------------
 1 file changed, 171 deletions(-)
 delete mode 100644 App_Function_Libraries/Web_Scraping/Extract_Bookmark_URLs.py

diff --git a/App_Function_Libraries/Web_Scraping/Extract_Bookmark_URLs.py b/App_Function_Libraries/Web_Scraping/Extract_Bookmark_URLs.py
deleted file mode 100644
index 86b597ddd..000000000
--- a/App_Function_Libraries/Web_Scraping/Extract_Bookmark_URLs.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Extract_Bookmark_URLs.py
-# Description: Extract URLs from Chrome/Edge and Firefox bookmarks files, as well as HTML bookmark exports.
-#
-# Imports
-import json
-import sqlite3
-from pathlib import Path
-from typing import Set, Union
-from urllib.parse import urlparse
-import logging
-#
-# External Libraries
-from bs4 import BeautifulSoup
-#
-####################################################################################################
-#
-# Logging
-logger = logging.getLogger(__name__)
-#
-####################################################################################################
-#
-# Functions
-
-def is_valid_url(url: str) -> bool:
-    """Validate URL using urlparse"""
-    try:
-        result = urlparse(url)
-        return all([result.scheme, result.netloc])
-    except Exception:
-        return False
-
-
-def clean_url(url: str) -> str:
-    """Clean and normalize URL"""
-    url = url.strip()
-    if not url.startswith(('http://', 'https://', 'ftp://')):
-        url = 'https://' + url
-    return url
-
-
-def extract_from_chrome_like(file_path: Path) -> Set[str]:
-    """Extract URLs from Chrome/Edge JSON bookmarks file"""
-    urls = set()
-
-    with open(file_path, 'r', encoding='utf-8') as f:
-        data = json.load(f)
-
-    def process_node(node):
-        if isinstance(node, dict):
-            if node.get('type') == 'url':
-                url = node.get('url', '').strip()
-                if url and is_valid_url(url):
-                    urls.add(clean_url(url))
-            for child in node.get('children', []):
-                process_node(child)
-
-    if 'roots' in data:
-        for root in data['roots'].values():
-            process_node(root)
-
-    return urls
-
-
-def extract_from_firefox_sqlite(file_path: Path) -> Set[str]:
-    """Extract URLs from Firefox places.sqlite database"""
-    urls = set()
-
-    conn = sqlite3.connect(f'file:{file_path}?mode=ro', uri=True)
-    try:
-        cursor = conn.cursor()
-        cursor.execute("""
-            SELECT url FROM moz_places
-            JOIN moz_bookmarks ON moz_bookmarks.fk = moz_places.id
-            WHERE url IS NOT NULL
-        """)
-
-        for (url,) in cursor.fetchall():
-            if url and is_valid_url(url):
-                urls.add(clean_url(url))
-    finally:
-        conn.close()
-
-    return urls
-
-
-def extract_from_html_bookmarks(file_path: Path) -> Set[str]:
-    """Extract URLs from HTML bookmarks export file"""
-    urls = set()
-
-    encodings = ['utf-8', 'utf-8-sig', 'latin1']
-    content = None
-
-    for encoding in encodings:
-        try:
-            with open(file_path, 'r', encoding=encoding) as f:
-                content = f.read()
-            break
-        except UnicodeDecodeError:
-            continue
-
-    if not content:
-        raise ValueError("Could not decode file with supported encodings")
-
-    soup = BeautifulSoup(content, 'html.parser')
-    for link in soup.find_all('a'):
-        url = link.get('href', '').strip()
-        if url and not url.startswith('#') and is_valid_url(url):
-            urls.add(clean_url(url))
-
-    return urls
-
-
-def extract_urls(file_path: Union[str, Path]) -> Set[str]:
-    """
-    Extract URLs from browser bookmarks file.
-    Supports Chrome/Edge JSON, Firefox SQLite, and HTML exports.
-
-    Args:
-        file_path: Path to bookmarks file
-
-    Returns:
-        Set of unique URLs
-
-    Raises:
-        FileNotFoundError: If file doesn't exist
-        ValueError: If file format is not supported
-    """
-    file_path = Path(file_path)
-
-    if not file_path.exists():
-        raise FileNotFoundError(f"File not found: {file_path}")
-
-    suffix = file_path.suffix.lower()
-
-    try:
-        if suffix == '.json':
-            return extract_from_chrome_like(file_path)
-        elif suffix == '.sqlite':
-            return extract_from_firefox_sqlite(file_path)
-        elif suffix in ['.htm', '.html']:
-            return extract_from_html_bookmarks(file_path)
-        else:
-            raise ValueError(f"Unsupported file format: {suffix}")
-    except Exception as e:
-        raise ValueError(f"Failed to process {file_path}: {str(e)}")
-
-
-def save_urls(urls: Set[str], output_file: Union[str, Path]) -> None:
-    """Save URLs to a text file"""
-    with open(output_file, 'w', encoding='utf-8') as f:
-        for url in sorted(urls):
-            f.write(f"{url}\n")
-
-
-if __name__ == "__main__":
-    # Simple command line interface
-    logging.basicConfig(level=logging.INFO)
-
-    file_path = input("Enter the path to your bookmarks file: ")
-    try:
-        urls = extract_urls(file_path)
-        output_file = Path(file_path).with_suffix('.urls.txt')
-        save_urls(urls, output_file)
-        print(f"Found {len(urls)} URLs")
-        print(f"Saved to {output_file}")
-    except Exception as e:
-        print(f"Error: {e}")
-
-#
-# End of Extract_Bookmark_URLs.py
-####################################################################################################