Commit

refactor: instantiate classes with attr in main
alimghmi committed Oct 22, 2023
1 parent 76c8432 commit 59fdced
Showing 4 changed files with 30 additions and 23 deletions.
7 changes: 4 additions & 3 deletions database/mssql.py
@@ -1,4 +1,5 @@
 import time
+
 import pandas as pd
 from sqlalchemy import exc

@@ -45,7 +46,7 @@ def insert(self, df: pd.DataFrame, table_name: str):
                 logger.error(
                     f"Failed to insert data. Attempt {i + 1} of {self.max_retries}. Error: {e}"  # noqa: E501
                 )
-                time.sleep(i + 1)
+                if (i + 1) == self.max_retries:
+                    raise

-        logger.error("Max retries reached. Data insertion failed.")
-        raise exc.SQLAlchemyError("Data insertion failed.")
+                time.sleep(i + 1)
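
For context, a sketch of how the retry loop in PandasSQLDataInserter.insert might read after this change. The except-block logic comes from the hunk above; the class skeleton, the surrounding loop, and the to_sql call are assumptions about code not shown in this diff.

import logging
import time

import pandas as pd
from sqlalchemy import exc

logger = logging.getLogger(__name__)  # stand-in for the project's logger


class PandasSQLDataInserter:
    def __init__(self, db_connection, max_retries: int = 3) -> None:
        self.db_connection = db_connection
        self.max_retries = max_retries

    def insert(self, df: pd.DataFrame, table_name: str):
        for i in range(self.max_retries):
            try:
                # Hypothetical write; the actual call sits outside this hunk.
                df.to_sql(table_name, self.db_connection.engine, if_exists="append", index=False)
                return
            except exc.SQLAlchemyError as e:
                logger.error(
                    f"Failed to insert data. Attempt {i + 1} of {self.max_retries}. Error: {e}"  # noqa: E501
                )
                # New behaviour: re-raise the original error on the final attempt
                # instead of raising a generic SQLAlchemyError after the loop.
                if (i + 1) == self.max_retries:
                    raise
                # Otherwise back off linearly before retrying.
                time.sleep(i + 1)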
13 changes: 11 additions & 2 deletions main.py
@@ -6,12 +6,21 @@

 def main():
     logger.info("Initializing Scraper Engine")
-    engine = Engine()
+    engine = Engine(
+        url=settings.URL,
+        max_retries=settings.REQUEST_MAX_RETRIES,
+        backoff_factor=settings.REQUEST_BACKOFF_FACTOR,
+    )
     df = engine.fetch()
     logger.info("Transforming Data")
     df_transformed = Agent(df).transform()
     logger.info("Preparing Database Inserter")
-    inserter = create_inserter_objects()
+    inserter = create_inserter_objects(
+        server=settings.MSSQL_SERVER,
+        database=settings.MSSQL_DATABASE,
+        username=settings.MSSQL_USERNAME,
+        password=settings.MSSQL_PASSWORD,
+    )
     logger.info(f"Inserting Data into {settings.OUTPUT_TABLE}")
     inserter.insert(df_transformed, settings.OUTPUT_TABLE)
     logger.info("Application completed successfully")
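
main.py now pulls every constructor argument from config.settings. The config module is untouched by this commit; the sketch below is only a hypothetical illustration of the attributes main.py relies on, assuming they are read from environment variables. The attribute names come from the diff, the loading mechanism does not.

import os


class Settings:
    # Attribute names as used in main.py; values and defaults are illustrative only.
    URL = os.environ.get("URL", "")
    REQUEST_MAX_RETRIES = int(os.environ.get("REQUEST_MAX_RETRIES", "3"))
    REQUEST_BACKOFF_FACTOR = float(os.environ.get("REQUEST_BACKOFF_FACTOR", "0.5"))
    MSSQL_SERVER = os.environ.get("MSSQL_SERVER", "")
    MSSQL_DATABASE = os.environ.get("MSSQL_DATABASE", "")
    MSSQL_USERNAME = os.environ.get("MSSQL_USERNAME", "")
    MSSQL_PASSWORD = os.environ.get("MSSQL_PASSWORD", "")
    OUTPUT_TABLE = os.environ.get("OUTPUT_TABLE", "")


settings = Settings()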
23 changes: 13 additions & 10 deletions scraper/engine.py
@@ -1,31 +1,34 @@
 import pandas as pd
 import requests

-from config import logger, settings
+from config import logger
+from utils import Request


 class Engine:
-    URL = settings.URL
+    def __init__(self, url, max_retries, backoff_factor) -> None:
+        self.url = url
+        self.request = Request(max_retries=max_retries, backoff_factor=backoff_factor)

     def fetch(self) -> pd.DataFrame:
-        logger.debug(f"Attempting to fetch content from {self.URL}.")
+        logger.debug(f"Attempting to fetch content from {self.url}.")
         content = self.get_content()
         logger.debug(
-            f"Successfully fetched content from {self.URL}. Now parsing the content."
+            f"Successfully fetched content from {self.url}. Now parsing the content."
         )
         df = self.parse_html(content)
-        logger.info(f"Parsed content from {self.URL}. Extracted {len(df)} rows.")
+        logger.info(f"Parsed content from {self.url}. Extracted {len(df)} rows.")
         logger.debug(f"\n{df}")
         return df

     def get_content(self):
         try:
-            r = requests.get(self.URL)
+            r = self.request.request("GET", self.url)
             r.raise_for_status()
             return r.text
         except requests.RequestException as e:
-            logger.error(f"Error fetching content from {self.URL}. Error: {e}")
-            raise ConnectionError(f"Failed to connect to {self.URL}.") from e
+            logger.error(f"Error fetching content from {self.url}. Error: {e}")
+            raise ConnectionError(f"Failed to connect to {self.url}.") from e

     def parse_html(self, content: str) -> pd.DataFrame:
         try:
@@ -36,8 +39,8 @@ def parse_html(self, content: str) -> pd.DataFrame:

         if len(dfs):
             logger.debug(
-                f"Successfully parsed content from {self.URL}. Extracted {len(dfs[0])} rows."  # noqa: E501
+                f"Successfully parsed content from {self.url}. Extracted {len(dfs[0])} rows."  # noqa: E501
             )
             return dfs[0]
         else:
-            raise ValueError(f"No data found when parsing content from {self.URL}.")
+            raise ValueError(f"No data found when parsing content from {self.url}.")
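
The refactored Engine depends on utils.Request, which is not part of this commit. Below is a minimal sketch of one way such a wrapper could look, assuming it layers urllib3's Retry onto a requests.Session; only the class name, the constructor arguments, and the request(method, url) call are taken from the diff, everything else is an assumption.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class Request:
    """Hypothetical retrying HTTP client; the real utils.Request may differ."""

    def __init__(self, max_retries: int, backoff_factor: float) -> None:
        retry = Retry(
            total=max_retries,
            backoff_factor=backoff_factor,
            status_forcelist=(500, 502, 503, 504),
        )
        adapter = HTTPAdapter(max_retries=retry)
        self.session = requests.Session()
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

    def request(self, method: str, url: str, **kwargs) -> requests.Response:
        # Engine.get_content() calls self.request.request("GET", self.url)
        # and then raise_for_status() on the returned response.
        return self.session.request(method, url, **kwargs)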
10 changes: 2 additions & 8 deletions utils/db_helper.py
@@ -2,14 +2,8 @@
 from database.mssql import MSSQLDatabaseConnection, PandasSQLDataInserter


-def create_inserter_objects() -> PandasSQLDataInserter:
-    db_connection = MSSQLDatabaseConnection(
-        settings.MSSQL_SERVER,
-        settings.MSSQL_DATABASE,
-        settings.MSSQL_USERNAME,
-        settings.MSSQL_PASSWORD,
-    )
-
+def create_inserter_objects(*args, **kwargs) -> PandasSQLDataInserter:
+    db_connection = MSSQLDatabaseConnection(*args, **kwargs)
     data_inserter = PandasSQLDataInserter(
         db_connection, max_retries=settings.INSERTER_MAX_RETRIES
     )
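
Putting the context and added lines together, the helper after this commit reads roughly as follows; the settings import at the top and the final return are outside the visible hunk and are assumptions.

from config import settings
from database.mssql import MSSQLDatabaseConnection, PandasSQLDataInserter


def create_inserter_objects(*args, **kwargs) -> PandasSQLDataInserter:
    # Connection details (server, database, username, password) are now
    # forwarded from the caller (main.py) instead of being read from settings here.
    db_connection = MSSQLDatabaseConnection(*args, **kwargs)
    data_inserter = PandasSQLDataInserter(
        db_connection, max_retries=settings.INSERTER_MAX_RETRIES
    )
    return data_inserter

Forwarding *args/**kwargs keeps this helper free of connection-specific settings, so only the inserter retry limit is still resolved here.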
