structured with our predefined ETL
cbej-ea committed Jan 31, 2024
1 parent 6ef6f3c commit deec3ae
Showing 8 changed files with 182 additions and 272 deletions.
103 changes: 10 additions & 93 deletions requirements/requirements.in
@@ -1,93 +1,10 @@
attrs @ file:///C:/b/abs_35n0jusce8/croot/attrs_1695717880170/work
beautifulsoup4==4.12.3
boto3==1.34.27
botocore==1.34.27
Brotli @ file:///C:/Windows/Temp/abs_63l7912z0e/croots/recipe/brotli-split_1659616056886/work
bs4==0.0.2
cachetools==5.3.2
certifi @ file:///C:/b/abs_91u83siphd/croot/certifi_1700501720658/work/certifi
cffi @ file:///C:/b/abs_924gv1kxzj/croot/cffi_1700254355075/work
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
cryptography @ file:///C:/b/abs_e8cnom_zw_/croot/cryptography_1702071486468/work
cssselect==1.2.0
decorator==5.1.1
distro==1.9.0
Elixir==0.7.1
et-xmlfile==1.1.0
exceptiongroup @ file:///C:/b/abs_c5h1o1_b5b/croot/exceptiongroup_1706031441653/work
ghostscript==0.7
google-api-core==2.15.0
google-auth==2.27.0
google-cloud-core==2.4.1
google-cloud-storage==2.14.0
google-crc32c @ file:///C:/b/abs_f8g37ql__2/croot/google-crc32c_1667946622512/work
google-resumable-media==2.7.0
googleapis-common-protos==1.62.0
greenlet @ file:///C:/b/abs_a6c75ie0bc/croot/greenlet_1702060012174/work
h11==0.14.0
helper-functions-ea @ git+https://git@github.com/energyaspects/helper_functions.git@31531d83a80f324453173e05807145af4a583a2a
idna @ file:///C:/b/abs_bdhbebrioa/croot/idna_1666125572046/work
Jinja2==3.1.3
jmespath @ file:///C:/b/abs_59jpuaows7/croot/jmespath_1700144635019/work
JPype1==1.5.0
lxml==5.1.0
MarkupSafe==2.1.4
mkl-fft @ file:///C:/b/abs_19i1y8ykas/croot/mkl_fft_1695058226480/work
mkl-random @ file:///C:/b/abs_edwkj1_o69/croot/mkl_random_1695059866750/work
mkl-service==2.4.0
numpy @ file:///C:/b/abs_16b2j7ad8n/croot/numpy_and_numpy_base_1704311752418/work/dist/numpy-1.26.3-cp39-cp39-win_amd64.whl#sha256=02e606e23ca31bb00a40d147fd1ce4dd7d241395346a4196592d5abe54a333bc
opencv-python==4.9.0.80
openpyxl==3.1.2
outcome @ file:///tmp/build/80754af9/outcome_1609338780791/work
packaging==23.2
pandas==2.2.0
pbr==6.0.0
pdfminer.six==20231228
pdftopng==0.2.3
protobuf==4.25.2
psycopg2-binary==2.9.9
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
PyMuPDFb==1.23.9
PyMySQL==1.1.0
pyOpenSSL @ file:///C:/b/abs_08f38zyck4/croot/pyopenssl_1690225407403/work
pypdf==4.0.0
PyPDF2==3.0.1
pyquery==2.0.0
PySocks @ file:///C:/ci/pysocks_1605307512533/work
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
python-dotenv==1.0.1
python-http-client==3.3.7
pytz @ file:///C:/b/abs_19q3ljkez4/croot/pytz_1695131651401/work
requests @ file:///C:/b/abs_316c2inijk/croot/requests_1690400295842/work
roman==4.1
rsa==4.9
s3transfer==0.10.0
selenium==4.17.2
sendgrid==6.11.0
shooju==3.8.13
six @ file:///tmp/build/80754af9/six_1644875935023/work
sniffio @ file:///C:/b/abs_3akdewudo_/croot/sniffio_1705431337396/work
sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1623949099177/work
soupsieve @ file:///C:/b/abs_bbsvy9t4pl/croot/soupsieve_1696347611357/work
SQLAlchemy==0.7.10
sqlalchemy-migrate==0.11.0
sqlparse==0.4.4
starkbank-ecdsa==2.2.0
tabula-py==2.9.0
tabulate==0.9.0
Tempita==0.5.2
trio @ file:///C:/b/abs_3bsokxbl8q/croot/trio_1705518572139/work
trio-websocket==0.11.1
typing_extensions @ file:///C:/b/abs_72cdotwc_6/croot/typing_extensions_1705599364138/work
tzdata==2023.4
urllib3 @ file:///C:/b/abs_9cmlsrm3ys/croot/urllib3_1698257595508/work
webdriver-manager==4.0.1
win-inet-pton @ file:///C:/ci/win_inet_pton_1605306162074/work
wsproto==1.2.0
xlrd==0.7.1
xlwt==0.7.2
# EDIT THIS WITH THE PACKAGES YOU HAVE INSTALLED!
# DO NOT INSTALL requirements.txt blindly :)
pandas
numpy
shooju
pylint
black
invoke
python-dotenv
tabula-py
69 changes: 69 additions & 0 deletions src/india_mopng_etl/india_mopng_scrapper/india_mopng_scrape_june2023_to_latest.py
@@ -0,0 +1,69 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import tabula
import pandas as pd

from helper_functions_ea import Logger
from india_mopng_etl.utils.base_classes import DataExtractor
from india_mopng_etl.utils.helper_functions import get_pdf_lists, get_pdf_info


class June2023ToLatest(DataExtractor):
    """Scrape monthly crude and petroleum production data from MoPNG PDFs (June 2023 layout to latest)."""

    logger = Logger("India Mopng Scrape => ").logger
WEBSITE_URL = "https://mopng.gov.in/en/petroleum-statistics/monthly-production"

def __init__(self):
self.list_of_pdfs = []
self.crude_df = pd.DataFrame()
self.petroleum_df = pd.DataFrame()

def get_list_of_pdfs(self):
"""
Gets the PDF links from the website
"""
self.logger.info("Getting List of PDFs")
try:
self.list_of_pdfs = get_pdf_lists(self.WEBSITE_URL)
        except Exception as e:
            self.logger.error(f"India Mopng scraper failed at getting list of PDFs, error was {e}")
            raise

def extract(self):
self.get_list_of_pdfs()

dataframes = {'crude_dataframe': [], 'petroleum_dataframe': []}
self.logger.info("Extracting PDF data")
for pdf_path in self.list_of_pdfs:
dataframes = get_pdf_info(pdf_path, dataframes)
self.crude_df = pd.concat(dataframes['crude_dataframe'], ignore_index=True)
self.petroleum_df = pd.concat(dataframes['petroleum_dataframe'], ignore_index=True)
self.logger.info("Extracted PDF data")

def transform(self):
self.logger.info("Transforming data")
# Example Crude Data positions and new names
crude_column_positions_to_rename = {0: 'Name of Undertaking/Unit/State',
1: 'Production during the Preceding month of current year'}

# Rename Crude Data columns based on positions
for position, new_name in crude_column_positions_to_rename.items():
# Ensure the position is within the range of existing columns
if position < len(self.crude_df.columns):
self.crude_df.rename(columns={self.crude_df.columns[position]: new_name}, inplace=True)

# Example Petroleum Data positions and new names
petroleum_column_positions_to_rename = {0: 'Name of Undertaking/Unit/State',
1: 'Production during the Preceding month of current year'}

# Rename Petroleum Data columns based on positions
for position, new_name in petroleum_column_positions_to_rename.items():
# Ensure the position is within the range of existing columns
if position < len(self.petroleum_df.columns):
self.petroleum_df.rename(columns={self.petroleum_df.columns[position]: new_name}, inplace=True)
        self.logger.debug(self.crude_df)
        self.logger.debug(self.petroleum_df)
        self.logger.info("Transformed data")
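The two helpers imported at the top of this file, get_pdf_lists and get_pdf_info, come from india_mopng_etl.utils.helper_functions, which this diff does not show. Below is a minimal sketch of the shape they would need in order to satisfy the calls above, assuming get_pdf_lists collects absolute PDF URLs from the listing page with requests and BeautifulSoup, and get_pdf_info reads each PDF's tables with tabula and appends them to the matching bucket; the keyword-based routing is illustrative, not taken from the source.

# --- Assumed shape of the helpers (not shown in this diff) ---
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import tabula


def get_pdf_lists(website_url):
    """Return absolute URLs for every PDF linked from the listing page."""
    response = requests.get(website_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [
        urljoin(website_url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].lower().endswith(".pdf")
    ]


def get_pdf_info(pdf_path, dataframes):
    """Parse the tables of one PDF and append each to the matching bucket."""
    tables = tabula.read_pdf(pdf_path, pages="all", lattice=True)
    for table in tables:
        header = " ".join(str(col) for col in table.columns).lower()
        # Keyword routing is a placeholder; the real helper may rely on
        # page numbers or table captions instead.
        if "crude" in header:
            dataframes["crude_dataframe"].append(table)
        else:
            dataframes["petroleum_dataframe"].append(table)
    return dataframes

Under those assumptions, extract() works as written: get_pdf_lists seeds the loop, and get_pdf_info grows the two per-category lists that pd.concat later flattens into crude_df and petroleum_df.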

This file was deleted.

31 changes: 6 additions & 25 deletions src/india_mopng_etl/main.py
@@ -1,26 +1,4 @@
import pandas as pd

from helper_functions_ea import Logger
from india_mopng_etl.metadata import metadata
from india_mopng_etl.utils.base_classes import DataExtractor


class __Class_Name__(DataExtractor): # make sure you rename the class to your preference
"""Make sure you implement all the methods required for your ETL"""

logger = Logger("__Class_Name__").logger # Creates a logger

def __init__(self, ):
"""Setting the metadata (if needed) and any other needed dependencies."""
self.metadata_df = metadata

def extract(self):
self.logger.info("Extracting data")
self.df = pd.DataFrame()

def transform(self, data):
self.logger.info("Transforming data")
self.df = self.df.merge(self.metadata_df)
from india_mopng_etl.india_mopng_scrapper.india_mopng_scrape_june2023_to_latest import June2023ToLatest


def main():
@@ -29,8 +7,11 @@ def main():
Returns:
None
"""
class_init = __Class_Name__()
class_init.etl()
try:
class_init = June2023ToLatest()
class_init.etl()
    except Exception as ex:
        raise RuntimeError(f"Scraper failed to process. Error was {ex}") from ex


if __name__ == "__main__": # pragma: no cover
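main() delegates everything to etl(), which is defined on the DataExtractor base class in india_mopng_etl.utils.base_classes and is also absent from this diff. A plausible minimal shape, assuming etl() is a template method that runs the pipeline steps in a fixed order (the load() hook is an assumption; only extract and transform appear in the scraper above):

# --- Assumed shape of the base class (not shown in this diff) ---
from abc import ABC, abstractmethod


class DataExtractor(ABC):
    """Template for ETL jobs: subclasses implement the individual steps."""

    @abstractmethod
    def extract(self):
        ...

    @abstractmethod
    def transform(self):
        ...

    def load(self):
        """Optional hook; subclasses override it to persist results."""

    def etl(self):
        # Run the pipeline steps in a fixed order.
        self.extract()
        self.transform()
        self.load()

A template method of this shape would also explain why the deleted template's transform(self, data) signature was dropped: etl() calls each step without arguments, and state moves between steps via instance attributes such as crude_df and petroleum_df.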
