Web scraper with Selenium, BeautifulSoup, and standalone config_manager for dynamic config and executable readiness.

HamidByte · HamidByte · commit f90a2f05ab70 · 2025-01-10T03:05:46.000+01:00
diff --git a/README.md b/README.md
@@ -0,0 +1,44 @@
+# Twitter Trending Topics/Hashtags Scraper
+
+This Python script scrapes the trending topics from the website trends24.in and generates hashtags from the trends. It supports filtering for English-only trends and ensures that the generated hashtags fit within Twitter's character limits.
+
+## Requirements
+
+- Python 3.x
+- Selenium
+- BeautifulSoup
+- WebDriver Manager
+- Regular expressions (Python's built-in `re` module, used for filtering; nothing to install)
+
+You can install the required packages using pip:
+```bash
+pip install selenium beautifulsoup4 webdriver-manager
+```
+
+## Usage
+
+1. Run the script:
+
+```bash
+python T3_Scraper.py
+```
+
+2. The script will:
+
+- Open the trends24.in website.
+- Accept the cookie consent (if prompted).
+- Navigate to the "Table" section to gather trending topics.
+- Extract the trending topics along with additional information such as rank, position, count, and duration.
+- Optionally keep only English topics (if `ENGLISH_ONLY_REGEX` is set to `true` in `config.txt`).
+- Create and print hashtags based on the most popular trends while adhering to Twitter's 280-character limit.
+
+## Configuration
+
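+Settings are read from `config.txt`, a JSON file located next to the script (or executable). It is created with default values on first run if it does not exist.
+
+- `HEADLESS_MODE`: Set to `true` to run in headless mode (without opening a browser window).
+- `ENGLISH_ONLY_REGEX`: Set to `true` to filter for English-only trends based on regex patterns.
+- `TWEET_MAX_CHARS`: The maximum total length, in characters, of the generated hashtag string (default is 280).
+- `SLEEP_TIME_PAGE_LOAD`, `SLEEP_TIME_AFTER_COOKIE_CONSENT`, `SLEEP_TIME_AFTER_TAB_CLICK`: Seconds to wait after loading the page, after the cookie-consent step, and after clicking the "Table" tab (defaults are 3, 0, and 1).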
+
+## Notes
+
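+- Ensure Google Chrome is installed. The matching ChromeDriver is downloaded automatically by WebDriver Manager when the script starts, so no manual driver setup is needed.
+- Adjust the sleep times (the `SLEEP_TIME_*` values in `config.txt`) if necessary based on your internet speed or website load time.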
diff --git a/T3_Scraper.py b/T3_Scraper.py
@@ -0,0 +1,159 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.action_chains import ActionChains
+from bs4 import BeautifulSoup
+import time
+import re
+from collections import defaultdict
+
+# Import configuration manager
+from config_manager import load_config
+
+# Load configuration
+config_data = load_config()
+
+# Access configuration constants
+TWEET_MAX_CHARS = config_data['TWEET_MAX_CHARS']  # Maximum characters allowed in a tweet
+HEADLESS_MODE = config_data['HEADLESS_MODE']  # Set to True to run in headless mode
+ENGLISH_ONLY_REGEX = config_data['ENGLISH_ONLY_REGEX']  # Set to True to filter only English topics
+SLEEP_TIME_PAGE_LOAD = config_data['SLEEP_TIME_PAGE_LOAD']  # Seconds to wait after opening the page
+SLEEP_TIME_AFTER_COOKIE_CONSENT = config_data['SLEEP_TIME_AFTER_COOKIE_CONSENT']  # Seconds to wait after the consent step
+SLEEP_TIME_AFTER_TAB_CLICK = config_data['SLEEP_TIME_AFTER_TAB_CLICK']  # Seconds to wait after clicking the "Table" tab
+
+# Set up Chrome options; only the --headless flag itself depends on HEADLESS_MODE
+options = Options()
+if HEADLESS_MODE:
+    options.add_argument("--headless")  # Run in headless mode
+options.add_argument("--disable-gpu")  # Disable GPU (helps in some environments)
+options.add_argument("--no-sandbox")  # Bypass sandbox for CI environments / Bypass OS security model (use cautiously in CI)
+options.add_argument("--window-size=1920,1080")  # Set a window size for rendering
+options.add_argument("--disable-dev-shm-usage")  # Address resource limits in containers
+
+# Set up Selenium WebDriver with Service
+driver_path = ChromeDriverManager().install()
+service = Service(driver_path)
+
+# NOTE(review): the options built above are only passed to Chrome in headless
+# mode; a headful run starts with Chrome's defaults. Confirm this is intended.
+if HEADLESS_MODE:
+    driver = webdriver.Chrome(service=service, options=options)  # Headless mode
+else:
+    driver = webdriver.Chrome(service=service)  # Headful mode
+
+# Every step that talks to the browser runs inside try/finally so that Chrome
+# is shut down even when a step raises (e.g. the "Table" tab is not found).
+try:
+    # Load the page
+    url = "https://trends24.in/"
+    driver.get(url)
+
+    # Wait for the page to load fully (you might adjust the time)
+    time.sleep(SLEEP_TIME_PAGE_LOAD)
+
+    # Try to click on the cookie consent button if it's present. This step is
+    # best-effort: any ordinary failure just means there is nothing to accept.
+    # Catching Exception (not a bare except) lets Ctrl-C and SystemExit through.
+    try:
+        cookie_button = driver.find_element(By.CSS_SELECTOR, 'body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-consent.fc-primary-button')
+        cookie_button.click()
+        print("Cookie consent clicked.")
+    except Exception:
+        print("Cookie consent already given or not present.")
+
+    # Wait for the page to load after the consent is given
+    time.sleep(SLEEP_TIME_AFTER_COOKIE_CONSENT)
+
+    # Click on the "Table" tab
+    table_button = driver.find_element(By.ID, 'tab-link-table')
+    table_button.click()
+
+    # Wait for the table to load (adjust the sleep time if necessary)
+    time.sleep(SLEEP_TIME_AFTER_TAB_CLICK)
+
+    # Get the page source after the tab click
+    soup = BeautifulSoup(driver.page_source, 'html.parser')
+finally:
+    # Close the browser window
+    driver.quit()
+
+# Find the table section
+table_section = soup.select_one('section#table .table-container-4 table.the-table tbody.list')
+
+# Always define the result list so the filtering/hashtag steps further down
+# still work (with no topics) when the table cannot be found.
+trending_topics = []  # List to hold extracted information
+
+
+def _cell_text(row, css_class):
+    """Return the stripped text of the row's <td> with the given class, or None if absent."""
+    cell = row.find('td', class_=css_class)
+    return cell.text.strip() if cell else None
+
+
+# Extract the trending topics with detailed information
+if table_section:
+    for row in table_section.find_all('tr'):
+        rank = _cell_text(row, 'rank')
+
+        # The topic text lives in a link inside the topic cell; either may be missing
+        topic_cell = row.find('td', class_='topic')
+        topic_link = topic_cell.find('a', class_='trend-link') if topic_cell else None
+        topic = topic_link.text.strip() if topic_link else None
+
+        position = _cell_text(row, 'position')
+
+        # .get() yields None instead of raising when the attribute is absent
+        count_cell = row.find('td', class_='count')
+        count = count_cell.get('data-count') if count_cell else None
+
+        duration = _cell_text(row, 'duration')
+
+        if rank and topic:  # Ensure required fields are present
+            trending_topics.append({
+                "rank": rank,
+                "topic": topic,
+                "position": position,
+                "count": count,
+                "duration": duration,
+            })
+else:
+    print("Trending topics table not found.")
+
+# Print the extracted information
+# print("Trending Topics with Details:")
+# for topic in trending_topics:
+#     print(f"Rank: {topic['rank']}, Topic: {topic['topic']}, Position: {topic['position']}, Count: {topic['count']}, Duration: {topic['duration']}")
+
+# Keep only the trends whose topic is written with plain ASCII characters
+def filter_english_trends(trends):
+    """Return the trend dicts whose 'topic' consists solely of ASCII letters,
+    digits, '#' and whitespace, in their original order."""
+    ascii_topic = re.compile(r'^[a-zA-Z0-9#\s]+$')
+    return [entry for entry in trends if ascii_topic.match(entry['topic'])]
+
+# Function to create hashtags within a character limit
+def create_hashtags(trends, max_chars=TWEET_MAX_CHARS):
+    """Build a space-separated hashtag string from the most popular trends.
+
+    Args:
+        trends: Iterable of dicts with a 'topic' string and a 'count' value
+            (a number or numeric string; may be None, missing, or malformed).
+        max_chars: Maximum length of the returned string, counting the
+            spaces between hashtags.
+
+    Returns:
+        The hashtags joined by single spaces, most popular first, or an
+        empty string when nothing fits.
+    """
+    def popularity(trend):
+        # Rows scraped without a usable count sort last instead of crashing int()
+        try:
+            return int(trend.get('count'))
+        except (TypeError, ValueError):
+            return -1
+
+    # Sort trends by their popularity ('count') in descending order
+    sorted_trends = sorted(trends, key=popularity, reverse=True)
+
+    hashtags = []  # List to store hashtags
+    total_chars = 0  # Length used so far, including one separator per stored hashtag
+    for trend in sorted_trends:
+        # Drop spaces and every character that is not a word character
+        clean_trend = re.sub(r'[^\w\u4e00-\u9fff\u0600-\u06ff]+', '', trend['topic'].replace(' ', ''))
+        if not clean_trend:
+            continue  # Nothing left to tag; a bare '#' is not a hashtag
+        hashtag = '#' + clean_trend
+        hashtag_length = len(hashtag)
+
+        # Ensure adding this hashtag stays within the character limit
+        if total_chars + hashtag_length <= max_chars:
+            hashtags.append(hashtag)
+            total_chars += hashtag_length + 1  # Add 1 for the space
+        else:
+            break  # Stop if the character limit is reached
+
+    # Join hashtags with spaces for Twitter compatibility
+    return ' '.join(hashtags)
+
+# Filter English topics
+# english_trends = filter_english_trends(trending_topics)
+# print(english_trends)
+# print([trend['topic'] for trend in english_trends])
+
+# Apply the English-only filter when it is enabled; otherwise keep every scraped topic
+filtered_trends = filter_english_trends(trending_topics) if ENGLISH_ONLY_REGEX else trending_topics
+
+# Build the hashtag string within the configured character limit and print it
+hashtags = create_hashtags(filtered_trends, max_chars=TWEET_MAX_CHARS)
+print(hashtags)
diff --git a/config.txt b/config.txt
@@ -0,0 +1,8 @@
+{
+    "TWEET_MAX_CHARS": 280,
+    "HEADLESS_MODE": true,
+    "ENGLISH_ONLY_REGEX": true,
+    "SLEEP_TIME_PAGE_LOAD": 3,
+    "SLEEP_TIME_AFTER_COOKIE_CONSENT": 0,
+    "SLEEP_TIME_AFTER_TAB_CLICK": 1
+}
diff --git a/config_manager.py b/config_manager.py
@@ -0,0 +1,49 @@
+import json
+import os
+import sys
+
+# Built-in settings, written to the config file when it has to be created
+default_config = dict(
+    TWEET_MAX_CHARS=280,
+    HEADLESS_MODE=True,
+    ENGLISH_ONLY_REGEX=True,
+    SLEEP_TIME_PAGE_LOAD=3,
+    SLEEP_TIME_AFTER_COOKIE_CONSENT=0,
+    SLEEP_TIME_AFTER_TAB_CLICK=1,
+)
+
+# Resolve the location of config.txt next to the running program
+def get_config_file_path():
+    """Return the path of 'config.txt' in the directory of the executable
+    (when sys.frozen is set, as PyInstaller does for bundled apps) or of the
+    script named by sys.argv[0] otherwise."""
+    is_bundled = getattr(sys, 'frozen', False)
+    anchor = sys.executable if is_bundled else os.path.abspath(sys.argv[0])
+    return os.path.join(os.path.dirname(anchor), 'config.txt')
+
+# Write the built-in defaults to a fresh config file
+def create_default_config(config_file_path):
+    """Create (or overwrite) config_file_path with default_config serialized
+    as 4-space-indented JSON, then report where it was written."""
+    serialized = json.dumps(default_config, indent=4)
+    with open(config_file_path, 'w') as handle:
+        handle.write(serialized)
+    print(f"Config file created with default values at {config_file_path}")
+
+# Load configuration from file
+def load_config():
+    """Load the settings from the config file, creating it with defaults if absent.
+
+    Returns:
+        dict: default_config overlaid with the values read from the file, so
+        a key missing from an older or hand-edited file falls back to its
+        default instead of raising KeyError in the caller.
+
+    Exits the process with status 1 when the file does not contain a valid
+    JSON object.
+    """
+    config_file_path = get_config_file_path()
+
+    # If config doesn't exist, create it with default values
+    if not os.path.exists(config_file_path):
+        create_default_config(config_file_path)
+
+    # Now try reading the config file
+    try:
+        with open(config_file_path, 'r') as file:
+            config_data = json.load(file)
+        if not isinstance(config_data, dict):
+            # Valid JSON, but a list/number/string cannot hold named settings
+            raise ValueError("top-level JSON value is not an object")
+
+    # json.JSONDecodeError is a subclass of ValueError, so this covers both cases
+    except ValueError:
+        print(f"Error: The configuration file '{config_file_path}' is corrupted.")
+        print("Please delete the file, and a new default config file will be generated next time.")
+        # sys.exit rather than the site-provided exit(), which is not
+        # guaranteed to exist when running as a frozen executable
+        sys.exit(1)  # Exit with an error code
+
+    return {**default_config, **config_data}