Optimize scraping with threads #192

Merged
36 changes: 24 additions & 12 deletions main.py
@@ -1,3 +1,4 @@
from typing import List
import threading
import logging.config
import logging
@@ -59,25 +60,36 @@ def scrape_with_threads() -> None:
print("Scraping with threads...")

request_delay = scraper.Config.get_request_delay()

products_df = scraper.Filemanager.get_products_data()
domain_grouped_products_df = scraper.get_products_df_grouped_by_domains(products_df)
grouped_products = scraper.get_products_grouped_by_domain(domain_grouped_products_df)

# Create instances of class "Scraper"
products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
grouped_scraper_threads: List[List[threading.Thread]] = []

# Create threads
threads = [threading.Thread(target=product.scrape_info) for product in products]
# Create scraper threads and group by domain
for products in grouped_products.values():
scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
grouped_scraper_threads.append(scraper_threads)

# Start scraping on all threads
for thread in threads:
time.sleep(request_delay)
thread.start()
# Create master threads to manage scraper threads sequentially for each domain
master_threads = [
threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
for scraper_threads in grouped_scraper_threads
]

# Wait for all threads to finish
for thread in threads:
thread.join()
# Start all master threads
for master_thread in master_threads:
master_thread.start()

# Wait for all master threads to finish
for master_thread in master_threads:
master_thread.join()

products_flatten = [product for products in grouped_products.values() for product in products]

# Save scraped data for each product (sequentially)
for product in products:
for product in products_flatten:
product.save_info()


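For readers skimming the diff: the sketch below is a minimal, self-contained distillation of the new threading model in scrape_with_threads. Products are grouped by domain, each domain gets one "master" thread, and that master runs its domain's scraper threads one at a time with a pause between requests, so different domains are scraped in parallel while each individual domain stays rate-limited. The worker function, URLs, and delay value are placeholders, not code from this PR.

import threading
import time

REQUEST_DELAY = 1  # placeholder; the real value comes from scraper.Config.get_request_delay()


def fake_scrape(url: str) -> None:
    # Placeholder for Scraper.scrape_info
    print(f"scraping {url}")


def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
    # Same pattern as the helper added in scraper/scrape.py: each thread finishes
    # before the next one starts, then we wait request_delay seconds
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)


# Placeholder grouping; the PR builds this from the products DataFrame
grouped_urls = {
    "example.com": ["https://example.com/p/1", "https://example.com/p/2"],
    "example.org": ["https://example.org/p/9"],
}

# One list of scraper threads per domain
grouped_scraper_threads = [
    [threading.Thread(target=fake_scrape, args=(url,)) for url in urls] for urls in grouped_urls.values()
]

# One master thread per domain: domains run in parallel, requests within a domain run sequentially
master_threads = [
    threading.Thread(target=start_threads_sequentially, args=[threads, REQUEST_DELAY])
    for threads in grouped_scraper_threads
]

for master_thread in master_threads:
    master_thread.start()
for master_thread in master_threads:
    master_thread.join()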
5 changes: 3 additions & 2 deletions scraper/__init__.py
@@ -1,4 +1,4 @@
from .scrape import Scraper
from .scrape import Scraper, start_threads_sequentially
from .arguments import argparse_setup
from .add_product import add_products
from .filemanager import Filemanager, Config
@@ -7,7 +7,8 @@
from .delete_data import delete
from .reset_data import reset
from .search_data import search
from .misc import print_latest_datapoints, print_all_products
from .print_products import print_latest_datapoints, print_all_products
from .misc import get_products_df_grouped_by_domains, get_products_grouped_by_domain


__author__ = "Crinibus"
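These re-exports make the new helpers importable from the package root, which is how the updated main.py calls them. A short usage sketch (mirroring the calls shown in the main.py diff above, not new code from this PR):

import scraper

products_df = scraper.Filemanager.get_products_data()
domain_grouped_products_df = scraper.get_products_df_grouped_by_domains(products_df)
grouped_products = scraper.get_products_grouped_by_domain(domain_grouped_products_df)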
77 changes: 23 additions & 54 deletions scraper/misc.py
@@ -1,64 +1,33 @@
from typing import Iterator, List, Tuple
import pandas as pd
from pandas.core.groupby.generic import DataFrameGroupBy

from scraper.filemanager import Filemanager
from scraper.scrape import Scraper
from scraper.domains import get_website_name


def print_latest_datapoints(names: List[str], ids: List[str]) -> None:
    records_data = Filemanager.get_record_data()
def add_dataframe_column(df: pd.DataFrame, column_name: str, column_data: list[any]) -> pd.DataFrame:
    df[column_name] = column_data
    return df

    if names:
        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
        for name in names:
            print(name.upper())
            # iterate the different websites the product with the specified name is scraped from
            for website_name, website_dict in get_product_info_with_name(name, records_data):
                print_latest_datapoint(website_name, website_dict)
            print()

    if ids:
        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
        for id in ids:
            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
            print(product_name.upper())
            print_latest_datapoint(website_name, website_dict)
            print()
def group_df(df: pd.DataFrame, column_name: str, group_keys: bool) -> DataFrameGroupBy:
    grouped_df = df.groupby(column_name, group_keys=group_keys)
    return grouped_df


def get_product_info_with_name(name: str, records_data: dict) -> Iterator[Tuple[str, str, dict]]:
    for category_dict in records_data.values():
        for product_name, product_dict in category_dict.items():
            if not product_name.lower() == name.lower():
                continue
            for website_name, website_dict in product_dict.items():
                yield website_name, website_dict
def get_products_df_grouped_by_domains(products_df: pd.DataFrame) -> DataFrameGroupBy:
    domain_names = [get_website_name(url) for url in products_df["url"]]
    df = add_dataframe_column(products_df, "domain", domain_names)
    grouped_df = group_df(df, "domain", True)
    return grouped_df


def get_product_info_with_id(id: str, records_data: dict) -> Tuple[str, str, dict]:
    for category_dict in records_data.values():
        for product_name, product_dict in category_dict.items():
            for website_name, website_dict in product_dict.items():
                if website_dict["info"]["id"] == id:
                    return product_name, website_name, website_dict
def get_products_grouped_by_domain(grouped_products_df: DataFrameGroupBy) -> dict[str, list[Scraper]]:
    domains_dict: dict[str, list[Scraper]] = {}


def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
    id = website_dict["info"]["id"]
    currency = website_dict["info"]["currency"]
    latest_datapoint = website_dict["datapoints"][-1]
    date = latest_datapoint["date"]
    price = latest_datapoint["price"]
    print(f"> {website_name.capitalize()} - {id}\n - {currency} {price}\n - {date}")


def print_all_products() -> None:
    records_data = Filemanager.get_record_data()

    print("\n----- SHOWING ALL PRODUCTS -----")
    for category_name, category_dict in records_data.items():
        print(category_name.upper())
        for product_name, product_dict in category_dict.items():
            print(f" > {product_name.upper()}")
            for website_name, website_dict in product_dict.items():
                product_id = website_dict["info"]["id"]
                print(f" - {website_name.upper()} - {product_id}")
        print()
    for domain_name in grouped_products_df.groups:
        group_products = grouped_products_df.get_group(domain_name)
        domains_dict[domain_name] = [
            Scraper(category, url) for category, url in zip(group_products["category"], group_products["url"])
        ]
    return domains_dict
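A self-contained sketch of the grouping flow these new helpers implement: derive a "domain" column from each product URL, group the DataFrame by that column, then build one list of objects per domain. get_domain() and the sample rows below are stand-ins; the real code uses scraper.domains.get_website_name and creates Scraper instances.

from urllib.parse import urlparse

import pandas as pd


def get_domain(url: str) -> str:
    # Stand-in for scraper.domains.get_website_name
    return urlparse(url).netloc


products_df = pd.DataFrame(
    {
        "category": ["gpu", "cpu", "gpu"],
        "url": [
            "https://example.com/p/1",
            "https://example.com/p/2",
            "https://example.org/p/3",
        ],
    }
)

# Same steps as get_products_df_grouped_by_domains: add the column, then group by it
products_df["domain"] = [get_domain(url) for url in products_df["url"]]
grouped_df = products_df.groupby("domain", group_keys=True)

# Same shape of result as get_products_grouped_by_domain: {domain: [product objects]}
grouped_products = {}
for domain_name in grouped_df.groups:
    group = grouped_df.get_group(domain_name)
    # The real code creates Scraper(category, url) objects here
    grouped_products[domain_name] = list(zip(group["category"], group["url"]))

print(grouped_products)
# {'example.com': [('gpu', 'https://example.com/p/1'), ('cpu', 'https://example.com/p/2')],
#  'example.org': [('gpu', 'https://example.org/p/3')]}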
64 changes: 64 additions & 0 deletions scraper/print_products.py
@@ -0,0 +1,64 @@
from typing import Iterator

from scraper.filemanager import Filemanager


def print_latest_datapoints(names: list[str], ids: list[str]) -> None:
    records_data = Filemanager.get_record_data()

    if names:
        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
        for name in names:
            print(name.upper())
            # iterate the different websites the product with the specified name is scraped from
            for website_name, website_dict in get_product_info_with_name(name, records_data):
                print_latest_datapoint(website_name, website_dict)
            print()

    if ids:
        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
        for id in ids:
            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
            print(product_name.upper())
            print_latest_datapoint(website_name, website_dict)
            print()


def get_product_info_with_name(name: str, records_data: dict) -> Iterator[tuple[str, str, dict]]:
    for category_dict in records_data.values():
        for product_name, product_dict in category_dict.items():
            if not product_name.lower() == name.lower():
                continue
            for website_name, website_dict in product_dict.items():
                yield website_name, website_dict


def get_product_info_with_id(id: str, records_data: dict) -> tuple[str, str, dict]:
    for category_dict in records_data.values():
        for product_name, product_dict in category_dict.items():
            for website_name, website_dict in product_dict.items():
                if website_dict["info"]["id"] == id:
                    return product_name, website_name, website_dict


def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
    id = website_dict["info"]["id"]
    currency = website_dict["info"]["currency"]
    latest_datapoint = website_dict["datapoints"][-1]
    date = latest_datapoint["date"]
    price = latest_datapoint["price"]
    print(f"> {website_name.capitalize()} - {id}\n - {currency} {price}\n - {date}")


def print_all_products() -> None:
    records_data = Filemanager.get_record_data()

    print("\n----- SHOWING ALL PRODUCTS -----")
    for category_name, category_dict in records_data.items():
        print(category_name.upper())
        for product_name, product_dict in category_dict.items():
            print(f" > {product_name.upper()}")
            for website_name, website_dict in product_dict.items():
                product_id = website_dict["info"]["id"]
                print(f" - {website_name.upper()} - {product_id}")
        print()
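All of the functions moved into this file walk the same record structure returned by Filemanager.get_record_data(). Inferred purely from the dictionary accesses above (category → product name → website name → "info"/"datapoints"), a record looks roughly like this hypothetical example; the values are made up for illustration.

# Hypothetical shape of the data these print helpers expect, not taken from the repository
records_data = {
    "gpu": {
        "example product": {
            "example.com": {
                "info": {"id": "123456", "currency": "USD"},
                "datapoints": [
                    {"date": "2022-01-01", "price": 499.0},
                    {"date": "2022-01-02", "price": 479.0},
                ],
            }
        }
    }
}

With that shape, print_latest_datapoint("example.com", records_data["gpu"]["example product"]["example.com"]) would print the product id, the latest price with its currency, and the date of that datapoint.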
9 changes: 9 additions & 0 deletions scraper/scrape.py
@@ -1,3 +1,5 @@
import time
import threading
import logging
from datetime import datetime
from scraper.domains import BaseWebsiteHandler, get_website_handler
@@ -70,3 +72,10 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
        latest_datapoint["price"] = price
    else:
        product_datapoints.append(new_datapoint)


def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)
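A minimal usage sketch of the new helper with a placeholder worker (assuming the scraper package is importable): because each thread is joined before the next one is started, a domain only ever sees one in-flight request, and time.sleep(request_delay) spaces the requests out. Note that the sleep also runs after the last thread.

import threading

from scraper.scrape import start_threads_sequentially


def worker(n: int) -> None:
    # Placeholder for Scraper.scrape_info
    print(f"request {n}")


threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
start_threads_sequentially(threads, request_delay=1)  # one request at a time, 1 second apart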