|
| 1 | +from selenium import webdriver |
| 2 | +from selenium.webdriver.common.by import By |
| 3 | +from selenium.webdriver.chrome.service import Service |
| 4 | +from webdriver_manager.chrome import ChromeDriverManager |
| 5 | +from selenium.webdriver.chrome.options import Options |
| 6 | +from selenium.webdriver.common.action_chains import ActionChains |
| 7 | +from bs4 import BeautifulSoup |
| 8 | +import time |
| 9 | +import re |
| 10 | +from collections import defaultdict |
| 11 | + |
| 12 | +# Import configuration manager |
| 13 | +from config_manager import load_config |
| 14 | + |
# Read the project configuration once, when the module is imported.
config_data = load_config()

# Unpack the settings this script relies on. The lookups run left to right,
# so a missing key raises KeyError for the first absent entry.
(
    TWEET_MAX_CHARS,                  # Maximum characters allowed in a tweet
    HEADLESS_MODE,                    # Truthy -> run Chrome in headless mode
    ENGLISH_ONLY_REGEX,               # Truthy -> keep only English-looking topics
    SLEEP_TIME_PAGE_LOAD,             # Pause after the initial page load
    SLEEP_TIME_AFTER_COOKIE_CONSENT,  # Pause after the cookie-consent step
    SLEEP_TIME_AFTER_TAB_CLICK,       # Pause after switching to the table tab
) = (
    config_data['TWEET_MAX_CHARS'],
    config_data['HEADLESS_MODE'],
    config_data['ENGLISH_ONLY_REGEX'],
    config_data['SLEEP_TIME_PAGE_LOAD'],
    config_data['SLEEP_TIME_AFTER_COOKIE_CONSENT'],
    config_data['SLEEP_TIME_AFTER_TAB_CLICK'],
)
| 25 | + |
# Assemble the Chrome command-line switches. "--headless" is only included
# when headless mode is requested and, as before, is added first.
options = Options()
chrome_switches = ["--headless"] if HEADLESS_MODE else []
chrome_switches += [
    "--disable-gpu",             # Disable GPU (helps in some environments)
    "--no-sandbox",              # Bypass the sandbox for CI environments (use cautiously)
    "--window-size=1920,1080",   # Fixed window size for rendering
    "--disable-dev-shm-usage",   # Address resource limits in containers
]
for switch in chrome_switches:
    options.add_argument(switch)

# Resolve the chromedriver binary and wrap it in a Selenium Service.
driver_path = ChromeDriverManager().install()
service = Service(driver_path)

# NOTE(review): the options above are only handed to Chrome in headless mode;
# a headful run starts Chrome with its defaults. Confirm this is intentional.
driver = (
    webdriver.Chrome(service=service, options=options)  # Headless mode
    if HEADLESS_MODE
    else webdriver.Chrome(service=service)              # Headful mode
)
| 43 | + |
# Load the page, switch to the "Table" tab and capture the rendered HTML.
# Everything runs inside try/finally so the browser is always shut down,
# even when a step fails (e.g. the tab link cannot be found).
url = "https://trends24.in/"
try:
    driver.get(url)

    # Fixed pause to let the page finish loading (tune via configuration).
    time.sleep(SLEEP_TIME_PAGE_LOAD)

    # Best effort: dismiss the cookie-consent dialog when it is shown.
    try:
        cookie_button = driver.find_element(By.CSS_SELECTOR, 'body > div.fc-consent-root > div.fc-dialog-container > div.fc-dialog.fc-choice-dialog > div.fc-footer-buttons-container > div.fc-footer-buttons > button.fc-button.fc-cta-consent.fc-primary-button')
        cookie_button.click()
        print("Cookie consent clicked.")
    except Exception:
        # Deliberately broad: a missing or unclickable dialog must not abort
        # the scrape. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        print("Cookie consent already given or not present.")

    # Pause again so the page can settle after the consent step.
    time.sleep(SLEEP_TIME_AFTER_COOKIE_CONSENT)

    # Switch to the "Table" tab.
    table_button = driver.find_element(By.ID, 'tab-link-table')
    table_button.click()

    # Give the table time to render (tune via configuration).
    time.sleep(SLEEP_TIME_AFTER_TAB_CLICK)

    # Parse the page source as it stands after the tab click.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser window and end the WebDriver session.
    driver.quit()
| 74 | + |
def _cell_text(row, css_class):
    """Return the stripped text of the row's <td> with ``css_class``, or None if absent."""
    cell = row.find('td', class_=css_class)
    return cell.text.strip() if cell is not None else None


# Find the table section
table_section = soup.select_one('section#table .table-container-4 table.the-table tbody.list')

# Always defined, so the later filtering/hashtag steps work (on an empty
# list) even when the table cannot be located.
trending_topics = []

# Extract the trending topics with detailed information
if table_section:
    for row in table_section.find_all('tr'):
        rank = _cell_text(row, 'rank')

        # The topic text lives in a link inside the topic cell; either the
        # cell or the link may be missing.
        topic_cell = row.find('td', class_='topic')
        topic_link = topic_cell.find('a', class_='trend-link') if topic_cell is not None else None
        topic = topic_link.text.strip() if topic_link is not None else None

        position = _cell_text(row, 'position')

        # The count is read from the cell's data-count attribute; .get()
        # yields None instead of raising when the attribute is absent.
        count_cell = row.find('td', class_='count')
        count = count_cell.get('data-count') if count_cell is not None else None

        duration = _cell_text(row, 'duration')

        if rank and topic:  # Ensure required fields are present
            trending_topics.append({
                "rank": rank,
                "topic": topic,
                "position": position,
                "count": count,
                "duration": duration,
            })
else:
    print("Trending topics table not found.")
| 101 | + |
| 102 | +# Print the extracted information |
| 103 | +# print("Trending Topics with Details:") |
| 104 | +# for topic in trending_topics: |
| 105 | +# print(f"Rank: {topic['rank']}, Topic: {topic['topic']}, Position: {topic['position']}, Count: {topic['count']}, Duration: {topic['duration']}") |
| 106 | + |
| 107 | +# Function to filter English trending topics |
def filter_english_trends(trends):
    """Return the trends whose topic uses only ASCII letters, digits, '#' and whitespace.

    Args:
        trends: Iterable of dicts, each with a 'topic' string.

    Returns:
        A new list holding the matching dicts (the same objects, in their
        original order). Trends with an empty topic are excluded.
    """
    ascii_topic = re.compile(r'^[a-zA-Z0-9#\s]+$')
    return [entry for entry in trends if ascii_topic.match(entry['topic'])]
| 117 | + |
| 118 | +# Function to create hashtags within a character limit |
def create_hashtags(trends, max_chars=TWEET_MAX_CHARS):
    """Build a space-separated hashtag string from the most popular trends.

    Trends are ordered by their 'count' value, highest first. Each topic is
    turned into a hashtag by dropping spaces and every character that is not
    a word character. Hashtags are added in that order until the next one
    would push the result past ``max_chars``, at which point building stops.

    Args:
        trends: Iterable of dicts with a 'topic' string and a 'count' value.
            A missing, None or non-numeric count is treated as 0 rather than
            raising.
        max_chars: Maximum length of the returned string, separators included.

    Returns:
        The hashtags joined by single spaces; an empty string when ``trends``
        is empty or not even the first hashtag fits.
    """
    def _popularity(trend):
        # Counts come from scraped data and may be None or non-numeric.
        try:
            return int(trend.get('count'))
        except (TypeError, ValueError):
            return 0

    # sorted() is stable, so trends with equal counts keep their input order.
    sorted_trends = sorted(trends, key=_popularity, reverse=True)

    hashtags = []    # Hashtags accepted so far
    total_chars = 0  # Length used so far, including one separator per hashtag
    for trend in sorted_trends:
        trend_text = trend['topic']
        # Keep word characters only. For str patterns \w is already
        # Unicode-aware; the explicit CJK/Arabic ranges are kept as written.
        clean_trend = re.sub(r'[^\w\u4e00-\u9fff\u0600-\u06ff]+', '', trend_text.replace(' ', ''))
        if not clean_trend:
            # Nothing usable left (e.g. punctuation-only topic); a bare '#'
            # is not a hashtag, so skip it.
            continue

        hashtag = '#' + clean_trend
        hashtag_length = len(hashtag)

        # total_chars already counts the separator that would precede this
        # hashtag, so this compares the exact joined length with the limit.
        if total_chars + hashtag_length > max_chars:
            break  # Stop as soon as the next hashtag does not fit

        hashtags.append(hashtag)
        total_chars += hashtag_length + 1  # +1 for the separating space

    # Join hashtags with spaces for Twitter compatibility
    return ' '.join(hashtags)
| 145 | + |
| 146 | +# Filter English topics |
| 147 | +# english_trends = filter_english_trends(trending_topics) |
| 148 | +# print(english_trends) |
| 149 | +# print([trend['topic'] for trend in english_trends]) |
| 150 | + |
# Restrict the topics to English-looking ones when the configuration asks for
# it; otherwise pass every scraped topic through unchanged.
filtered_trends = (
    filter_english_trends(trending_topics)
    if ENGLISH_ONLY_REGEX
    else trending_topics
)

# Turn the selected topics into a hashtag line that fits the tweet limit and
# write it to standard output.
hashtags = create_hashtags(filtered_trends, max_chars=TWEET_MAX_CHARS)
print(hashtags)
0 commit comments