Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit f90a2f0

Browse files
committedJan 10, 2025
Web scraper with Selenium, BeautifulSoup, and standalone config_manager for dynamic config and executable readiness.
0 parents  commit f90a2f0

File tree

4 files changed

+260
-0
lines changed

4 files changed

+260
-0
lines changed
 

‎README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Twitter Trending Topics/Hashtags Scraper
2+
3+
This Python script scrapes the trending topics from the website trends24.in and generates hashtags from the trends. It supports filtering for English-only trends and ensures that the generated hashtags fit within Twitter's character limits.
4+
5+
## Requirements
6+
7+
- Python 3.x
8+
- Selenium
9+
- BeautifulSoup
10+
- WebDriver Manager
11+
- Regular expressions (for filtering; provided by Python's built-in `re` module, so nothing extra needs to be installed)
12+
13+
You can install the required packages using pip:
14+
```bash
15+
pip install selenium beautifulsoup4 webdriver-manager
16+
```
17+
18+
## Usage
19+
20+
1. Run the script:
21+
22+
```bash
23+
python T3_Scraper.py
24+
```
25+
26+
2. The script will:
27+
28+
- Open the trends24.in website.
29+
- Accept the cookie consent (if prompted).
30+
- Navigate to the "Table" section to gather trending topics.
31+
- Extract the trending topics along with additional information such as rank, position, count, and duration.
32+
- Optionally filter only English topics (if `ENGLISH_ONLY_REGEX` is set to `True`).
33+
- Create and print hashtags based on the most popular trends while adhering to Twitter's 280-character limit.
34+
35+
## Configuration
36+
37+
- `HEADLESS_MODE`: Set to `true` in `config.txt` to run in headless mode (without opening a browser window).
38+
- `ENGLISH_ONLY_REGEX`: Set to `true` in `config.txt` to keep only trends made up of English letters, digits, spaces, and `#` (regex-based filter).
39+
- `TWEET_MAX_CHARS`: The maximum total length, in characters, of the generated hashtag string, including the spaces between hashtags (default is 280).
40+
41+
## Notes
42+
43+
- A Chrome WebDriver is required. The script downloads a matching driver automatically through WebDriver Manager, so no manual driver installation is needed; the Chrome browser itself must be installed.
44+
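- Adjust the sleep times (`SLEEP_TIME_PAGE_LOAD`, `SLEEP_TIME_AFTER_COOKIE_CONSENT`, and `SLEEP_TIME_AFTER_TAB_CLICK` in `config.txt`) if necessary, based on your internet speed or the website's load time.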

‎T3_Scraper.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
from selenium import webdriver
2+
from selenium.webdriver.common.by import By
3+
from selenium.webdriver.chrome.service import Service
4+
from webdriver_manager.chrome import ChromeDriverManager
5+
from selenium.webdriver.chrome.options import Options
6+
from selenium.webdriver.common.action_chains import ActionChains
7+
from bs4 import BeautifulSoup
8+
import time
9+
import re
10+
from collections import defaultdict
11+
12+
# Import configuration manager
13+
from config_manager import load_config
14+
15+
# Load configuration
16+
config_data = load_config()

# Settings read from the loaded configuration
TWEET_MAX_CHARS = config_data['TWEET_MAX_CHARS']  # Maximum characters allowed in a tweet
HEADLESS_MODE = config_data['HEADLESS_MODE']  # True runs Chrome in headless mode
ENGLISH_ONLY_REGEX = config_data['ENGLISH_ONLY_REGEX']  # True keeps only English topics
SLEEP_TIME_PAGE_LOAD = config_data['SLEEP_TIME_PAGE_LOAD']  # Pause after opening the page
SLEEP_TIME_AFTER_COOKIE_CONSENT = config_data['SLEEP_TIME_AFTER_COOKIE_CONSENT']  # Pause after the consent step
SLEEP_TIME_AFTER_TAB_CLICK = config_data['SLEEP_TIME_AFTER_TAB_CLICK']  # Pause after clicking the "Table" tab

# Chrome options; the extra flags are only added for headless runs, so in
# headful mode this stays a default Options() object.
options = Options()
if HEADLESS_MODE:
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")  # Disable GPU (helps in some environments)
    options.add_argument("--no-sandbox")  # Bypass OS security model (use cautiously in CI)
    options.add_argument("--window-size=1920,1080")  # Set a window size for rendering
    options.add_argument("--disable-dev-shm-usage")  # Address resource limits in containers

# Set up Selenium WebDriver with a Service pointing at the managed driver binary
driver_path = ChromeDriverManager().install()
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=options)

url = "https://trends24.in/"

# Everything that talks to the browser runs inside try/finally so the Chrome
# process is always shut down, even when an element lookup or click fails.
try:
    driver.get(url)

    # Wait for the page to load fully (adjust the time in the config if needed)
    time.sleep(SLEEP_TIME_PAGE_LOAD)

    # Best effort: click the cookie consent button if it is present.
    # `except Exception` keeps this tolerant of any Selenium failure without
    # swallowing KeyboardInterrupt/SystemExit the way a bare `except:` would.
    try:
        cookie_button = driver.find_element(By.CSS_SELECTOR, 'body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-consent.fc-primary-button')
        cookie_button.click()
        print("Cookie consent clicked.")
    except Exception:
        print("Cookie consent already given or not present.")

    # Wait for the page to settle after the consent step
    time.sleep(SLEEP_TIME_AFTER_COOKIE_CONSENT)

    # Switch to the "Table" tab
    table_button = driver.find_element(By.ID, 'tab-link-table')
    table_button.click()

    # Wait for the table to load (adjust the time in the config if needed)
    time.sleep(SLEEP_TIME_AFTER_TAB_CLICK)

    # Parse the page source as it is after the tab click
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser window and end the driver session
    driver.quit()
74+
75+
# Find the table section
76+
# Body of the trends table on the "Table" tab; None when the markup is not found.
table_section = soup.select_one('section#table .table-container-4 table.the-table tbody.list')


def _cell_text(row, css_class):
    """Return the stripped text of the row's <td> with the given class, or None if absent."""
    cell = row.find('td', class_=css_class)
    return cell.text.strip() if cell else None


# Defined up front so the later filtering/hashtag steps still work (with an
# empty list) when the table is missing, instead of failing with a NameError.
trending_topics = []

if table_section:
    for row in table_section.find_all('tr'):
        rank = _cell_text(row, 'rank')

        # The topic text lives in a link inside the topic cell; either may be missing.
        topic_cell = row.find('td', class_='topic')
        topic_link = topic_cell.find('a', class_='trend-link') if topic_cell else None
        topic = topic_link.text.strip() if topic_link else None

        position = _cell_text(row, 'position')

        # .get() yields None rather than raising when the attribute is absent.
        count_cell = row.find('td', class_='count')
        count = count_cell.get('data-count') if count_cell else None

        duration = _cell_text(row, 'duration')

        if rank and topic:  # Ensure required fields are present
            trending_topics.append({
                "rank": rank,
                "topic": topic,
                "position": position,
                "count": count,
                "duration": duration,
            })
else:
    print("Trending topics table not found.")
101+
102+
# Print the extracted information
103+
# print("Trending Topics with Details:")
104+
# for topic in trending_topics:
105+
# print(f"Rank: {topic['rank']}, Topic: {topic['topic']}, Position: {topic['position']}, Count: {topic['count']}, Duration: {topic['duration']}")
106+
107+
# Function to filter English trending topics
108+
def filter_english_trends(trends):
    """Return the trends whose topic uses only ASCII letters, digits, '#' and whitespace.

    Args:
        trends: Iterable of dicts, each with a 'topic' string.

    Returns:
        A new list holding the matching trend dicts in their original order.
    """
    return [
        entry
        for entry in trends
        if re.match(r'^[a-zA-Z0-9#\s]+$', entry['topic'])
    ]
117+
118+
# Function to create hashtags within a character limit
119+
def create_hashtags(trends, max_chars=TWEET_MAX_CHARS):
    """Build a space-separated hashtag string from the most popular trends.

    Trends are processed in descending order of their 'count' value. Each
    topic is turned into a hashtag by dropping spaces and every character
    that is not a word character, then prefixing '#'. Hashtags are added
    until the next one would push the result past ``max_chars``.

    Args:
        trends: Iterable of dicts with a 'topic' string and a 'count' value.
            A missing, None, or non-numeric count is ranked as 0.
        max_chars: Maximum length of the returned string, counting the
            single spaces between hashtags.

    Returns:
        The hashtags joined by single spaces ('' when nothing fits).
    """
    def popularity(trend):
        # Rows without a usable count must not abort the whole run.
        try:
            return int(trend['count'])
        except (KeyError, TypeError, ValueError):
            return 0

    hashtags = []  # Hashtags in output order
    seen = set()  # Hashtags already emitted, to avoid repeating one
    total_chars = 0  # Length used so far, including one separator per hashtag

    for trend in sorted(trends, key=popularity, reverse=True):
        # Keep letters, numbers, underscores, and Unicode letters only
        clean_trend = re.sub(r'[^\w\u4e00-\u9fff\u0600-\u06ff]+', '', trend['topic'].replace(' ', ''))
        if not clean_trend:
            continue  # Nothing left after cleaning; a bare '#' is not a hashtag

        hashtag = '#' + clean_trend
        if hashtag in seen:
            continue  # e.g. "AI" and "#AI" clean to the same hashtag

        # Stop at the first hashtag that would exceed the character limit
        if total_chars + len(hashtag) > max_chars:
            break

        hashtags.append(hashtag)
        seen.add(hashtag)
        total_chars += len(hashtag) + 1  # Add 1 for the separating space

    # Join hashtags with spaces for Twitter compatibility
    return ' '.join(hashtags)
145+
146+
# Filter English topics
147+
# english_trends = filter_english_trends(trending_topics)
148+
# print(english_trends)
149+
# print([trend['topic'] for trend in english_trends])
150+
151+
# Check if only English trends should be allowed
152+
# Apply the English-only filter when it is enabled; otherwise keep every topic.
filtered_trends = filter_english_trends(trending_topics) if ENGLISH_ONLY_REGEX else trending_topics

# Build the hashtag string within the configured character limit and print it.
hashtags = create_hashtags(filtered_trends, max_chars=TWEET_MAX_CHARS)
print(hashtags)

‎config.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"TWEET_MAX_CHARS": 280,
3+
"HEADLESS_MODE": true,
4+
"ENGLISH_ONLY_REGEX": true,
5+
"SLEEP_TIME_PAGE_LOAD": 3,
6+
"SLEEP_TIME_AFTER_COOKIE_CONSENT": 0,
7+
"SLEEP_TIME_AFTER_TAB_CLICK": 1
8+
}

‎config_manager.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import json
2+
import os
3+
import sys
4+
5+
# Default configuration values
6+
default_config = {
    # Character limit applied to the generated hashtag string
    "TWEET_MAX_CHARS": 280,
    # Run the browser in headless mode
    "HEADLESS_MODE": True,
    # Keep only English topics
    "ENGLISH_ONLY_REGEX": True,
    # Pauses used by the scraper after each page interaction
    "SLEEP_TIME_PAGE_LOAD": 3,
    "SLEEP_TIME_AFTER_COOKIE_CONSENT": 0,
    "SLEEP_TIME_AFTER_TAB_CLICK": 1,
}
14+
15+
# Get the config file path (in the same folder as the executable)
16+
def get_config_file_path():
    """Return the path of 'config.txt' located next to the running program.

    When the program runs as a bundled executable (``sys.frozen`` is set,
    as PyInstaller does), the directory of ``sys.executable`` is used;
    otherwise the directory of the script named in ``sys.argv[0]``.
    """
    is_bundled = getattr(sys, 'frozen', False)
    if is_bundled:
        base_dir = os.path.dirname(sys.executable)
    else:
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    return os.path.join(base_dir, 'config.txt')
23+
24+
# Function to create default config file
25+
def create_default_config(config_file_path):
    """Write the default configuration to ``config_file_path`` as indented JSON.

    Args:
        config_file_path: Destination path; the file is opened in write mode.
    """
    serialized = json.dumps(default_config, indent=4)
    with open(config_file_path, 'w') as handle:
        handle.write(serialized)
    print(f"Config file created with default values at {config_file_path}")
29+
30+
# Load configuration from file
31+
def load_config():
    """Load the configuration, creating the config file with defaults if it is missing.

    Settings absent from the file are filled in from ``default_config``, so a
    config file written before a setting existed (or edited by hand) still
    yields every known key. Extra keys in the file are kept.

    Returns:
        dict: The default settings overlaid with the values read from the file.

    Raises:
        SystemExit: With status 1 when the file does not contain a valid
            JSON object.
    """
    config_file_path = get_config_file_path()

    # If the config file doesn't exist yet, create it with default values
    if not os.path.exists(config_file_path):
        create_default_config(config_file_path)

    try:
        with open(config_file_path, 'r') as file:
            config_data = json.load(file)
    except json.JSONDecodeError:
        config_data = None

    # Valid JSON that is not an object (e.g. a list or null) is unusable too.
    if not isinstance(config_data, dict):
        print(f"Error: The configuration file '{config_file_path}' is corrupted.")
        print("Please delete the file, and a new default config file will be generated next time.")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist in bundled executables.
        sys.exit(1)

    return {**default_config, **config_data}

0 commit comments

Comments
 (0)
Please sign in to comment.