structured with our predefined ETL
cbej-ea committed Jan 31, 2024
1 parent 6ef6f3c commit deec3ae
Showing 8 changed files with 182 additions and 272 deletions.
103 changes: 10 additions & 93 deletions requirements/requirements.in
@@ -1,93 +1,10 @@
attrs @ file:///C:/b/abs_35n0jusce8/croot/attrs_1695717880170/work
beautifulsoup4==4.12.3
boto3==1.34.27
botocore==1.34.27
Brotli @ file:///C:/Windows/Temp/abs_63l7912z0e/croots/recipe/brotli-split_1659616056886/work
bs4==0.0.2
cachetools==5.3.2
certifi @ file:///C:/b/abs_91u83siphd/croot/certifi_1700501720658/work/certifi
cffi @ file:///C:/b/abs_924gv1kxzj/croot/cffi_1700254355075/work
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
cryptography @ file:///C:/b/abs_e8cnom_zw_/croot/cryptography_1702071486468/work
cssselect==1.2.0
decorator==5.1.1
distro==1.9.0
Elixir==0.7.1
et-xmlfile==1.1.0
exceptiongroup @ file:///C:/b/abs_c5h1o1_b5b/croot/exceptiongroup_1706031441653/work
ghostscript==0.7
google-api-core==2.15.0
google-auth==2.27.0
google-cloud-core==2.4.1
google-cloud-storage==2.14.0
google-crc32c @ file:///C:/b/abs_f8g37ql__2/croot/google-crc32c_1667946622512/work
google-resumable-media==2.7.0
googleapis-common-protos==1.62.0
greenlet @ file:///C:/b/abs_a6c75ie0bc/croot/greenlet_1702060012174/work
h11==0.14.0
helper-functions-ea @ git+https://git@github.com/energyaspects/helper_functions.git@31531d83a80f324453173e05807145af4a583a2a
idna @ file:///C:/b/abs_bdhbebrioa/croot/idna_1666125572046/work
Jinja2==3.1.3
jmespath @ file:///C:/b/abs_59jpuaows7/croot/jmespath_1700144635019/work
JPype1==1.5.0
lxml==5.1.0
MarkupSafe==2.1.4
mkl-fft @ file:///C:/b/abs_19i1y8ykas/croot/mkl_fft_1695058226480/work
mkl-random @ file:///C:/b/abs_edwkj1_o69/croot/mkl_random_1695059866750/work
mkl-service==2.4.0
numpy @ file:///C:/b/abs_16b2j7ad8n/croot/numpy_and_numpy_base_1704311752418/work/dist/numpy-1.26.3-cp39-cp39-win_amd64.whl#sha256=02e606e23ca31bb00a40d147fd1ce4dd7d241395346a4196592d5abe54a333bc
opencv-python==4.9.0.80
openpyxl==3.1.2
outcome @ file:///tmp/build/80754af9/outcome_1609338780791/work
packaging==23.2
pandas==2.2.0
pbr==6.0.0
pdfminer.six==20231228
pdftopng==0.2.3
protobuf==4.25.2
psycopg2-binary==2.9.9
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
PyMuPDFb==1.23.9
PyMySQL==1.1.0
pyOpenSSL @ file:///C:/b/abs_08f38zyck4/croot/pyopenssl_1690225407403/work
pypdf==4.0.0
PyPDF2==3.0.1
pyquery==2.0.0
PySocks @ file:///C:/ci/pysocks_1605307512533/work
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
python-dotenv==1.0.1
python-http-client==3.3.7
pytz @ file:///C:/b/abs_19q3ljkez4/croot/pytz_1695131651401/work
requests @ file:///C:/b/abs_316c2inijk/croot/requests_1690400295842/work
roman==4.1
rsa==4.9
s3transfer==0.10.0
selenium==4.17.2
sendgrid==6.11.0
shooju==3.8.13
six @ file:///tmp/build/80754af9/six_1644875935023/work
sniffio @ file:///C:/b/abs_3akdewudo_/croot/sniffio_1705431337396/work
sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1623949099177/work
soupsieve @ file:///C:/b/abs_bbsvy9t4pl/croot/soupsieve_1696347611357/work
SQLAlchemy==0.7.10
sqlalchemy-migrate==0.11.0
sqlparse==0.4.4
starkbank-ecdsa==2.2.0
tabula-py==2.9.0
tabulate==0.9.0
Tempita==0.5.2
trio @ file:///C:/b/abs_3bsokxbl8q/croot/trio_1705518572139/work
trio-websocket==0.11.1
typing_extensions @ file:///C:/b/abs_72cdotwc_6/croot/typing_extensions_1705599364138/work
tzdata==2023.4
urllib3 @ file:///C:/b/abs_9cmlsrm3ys/croot/urllib3_1698257595508/work
webdriver-manager==4.0.1
win-inet-pton @ file:///C:/ci/win_inet_pton_1605306162074/work
wsproto==1.2.0
xlrd==0.7.1
xlwt==0.7.2
# EDIT THIS WITH THE PACKAGES YOU HAVE INSTALLED!
# DO NOT INSTALL requirements.txt blindly :)
pandas
numpy
shooju
pylint
black
invoke
python-dotenv
tabula-py
69 changes: 69 additions & 0 deletions src/india_mopng_etl/india_mopng_scrapper/india_mopng_scrape_june2023_to_latest.py
@@ -0,0 +1,69 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import tabula
import pandas as pd

from helper_functions_ea import Logger
from india_mopng_etl.utils.base_classes import DataExtractor
from india_mopng_etl.utils.helper_functions import get_pdf_lists, get_pdf_info


class June2023ToLatest(DataExtractor):
    """Scrape monthly crude and petroleum production data from MoPNG PDFs (June 2023 layout to latest)."""

    logger = Logger("India Mopng Scrape => ").logger
WEBSITE_URL = "https://mopng.gov.in/en/petroleum-statistics/monthly-production"

def __init__(self):
self.list_of_pdfs = []
self.crude_df = pd.DataFrame()
self.petroleum_df = pd.DataFrame()

def get_list_of_pdfs(self):
"""
Gets the PDF links from the website
"""
self.logger.info("Getting List of PDFs")
try:
self.list_of_pdfs = get_pdf_lists(self.WEBSITE_URL)
        except Exception as e:
            self.logger.error(f"India Mopng scraper failed at getting list of PDFs, error was {e}")
            raise

def extract(self):
self.get_list_of_pdfs()

dataframes = {'crude_dataframe': [], 'petroleum_dataframe': []}
self.logger.info("Extracting PDF data")
for pdf_path in self.list_of_pdfs:
dataframes = get_pdf_info(pdf_path, dataframes)
self.crude_df = pd.concat(dataframes['crude_dataframe'], ignore_index=True)
self.petroleum_df = pd.concat(dataframes['petroleum_dataframe'], ignore_index=True)
self.logger.info("Extracted PDF data")

def transform(self):
self.logger.info("Transforming data")
# Example Crude Data positions and new names
crude_column_positions_to_rename = {0: 'Name of Undertaking/Unit/State',
1: 'Production during the Preceding month of current year'}

# Rename Crude Data columns based on positions
for position, new_name in crude_column_positions_to_rename.items():
# Ensure the position is within the range of existing columns
if position < len(self.crude_df.columns):
self.crude_df.rename(columns={self.crude_df.columns[position]: new_name}, inplace=True)

# Example Petroleum Data positions and new names
petroleum_column_positions_to_rename = {0: 'Name of Undertaking/Unit/State',
1: 'Production during the Preceding month of current year'}

# Rename Petroleum Data columns based on positions
for position, new_name in petroleum_column_positions_to_rename.items():
# Ensure the position is within the range of existing columns
if position < len(self.petroleum_df.columns):
self.petroleum_df.rename(columns={self.petroleum_df.columns[position]: new_name}, inplace=True)
        self.logger.debug(self.crude_df)
        self.logger.debug(self.petroleum_df)
        self.logger.info("Transformed data")
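The two helpers imported at the top of this file, get_pdf_lists and get_pdf_info, come from india_mopng_etl.utils.helper_functions, which this diff does not show. Below is a minimal sketch of the shape they would need in order to satisfy the calls above, assuming get_pdf_lists collects absolute PDF URLs from the listing page with requests and BeautifulSoup, and get_pdf_info reads each PDF's tables with tabula and appends them to the matching bucket; the keyword-based routing is illustrative, not taken from the source.

# --- Assumed shape of the helpers (not shown in this diff) ---
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import tabula


def get_pdf_lists(website_url):
    """Return absolute URLs for every PDF linked from the listing page."""
    response = requests.get(website_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [
        urljoin(website_url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].lower().endswith(".pdf")
    ]


def get_pdf_info(pdf_path, dataframes):
    """Parse the tables of one PDF and append each to the matching bucket."""
    tables = tabula.read_pdf(pdf_path, pages="all", lattice=True)
    for table in tables:
        header = " ".join(str(col) for col in table.columns).lower()
        # Keyword routing is a placeholder; the real helper may rely on
        # page numbers or table captions instead.
        if "crude" in header:
            dataframes["crude_dataframe"].append(table)
        else:
            dataframes["petroleum_dataframe"].append(table)
    return dataframes

Under those assumptions, extract() works as written: get_pdf_lists seeds the loop, and get_pdf_info grows the two per-category lists that pd.concat later flattens into crude_df and petroleum_df.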

This file was deleted.

31 changes: 6 additions & 25 deletions src/india_mopng_etl/main.py
@@ -1,26 +1,4 @@
import pandas as pd

from helper_functions_ea import Logger
from india_mopng_etl.metadata import metadata
from india_mopng_etl.utils.base_classes import DataExtractor


class __Class_Name__(DataExtractor): # make sure you rename the class to your preference
"""Make sure you implement all the methods required for your ETL"""

logger = Logger("__Class_Name__").logger # Creates a logger

def __init__(self, ):
"""Setting the metadata (if needed) and any other needed dependencies."""
self.metadata_df = metadata

def extract(self):
self.logger.info("Extracting data")
self.df = pd.DataFrame()

def transform(self, data):
self.logger.info("Transforming data")
self.df = self.df.merge(self.metadata_df)
from india_mopng_etl.india_mopng_scrapper.india_mopng_scrape_june2023_to_latest import June2023ToLatest


def main():
@@ -29,8 +7,11 @@ def main():
Returns:
None
"""
class_init = __Class_Name__()
class_init.etl()
try:
class_init = June2023ToLatest()
class_init.etl()
    except Exception as ex:
        raise RuntimeError(f"Scraper failed to process. Error was {ex}") from ex


if __name__ == "__main__": # pragma: no cover
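main() delegates everything to etl(), which is defined on the DataExtractor base class in india_mopng_etl.utils.base_classes and is also absent from this diff. A plausible minimal shape, assuming etl() is a template method that runs the pipeline steps in a fixed order (the load() hook is an assumption; only extract and transform appear in the scraper above):

# --- Assumed shape of the base class (not shown in this diff) ---
from abc import ABC, abstractmethod


class DataExtractor(ABC):
    """Template for ETL jobs: subclasses implement the individual steps."""

    @abstractmethod
    def extract(self):
        ...

    @abstractmethod
    def transform(self):
        ...

    def load(self):
        """Optional hook; subclasses override it to persist results."""

    def etl(self):
        # Run the pipeline steps in a fixed order.
        self.extract()
        self.transform()
        self.load()

A template method of this shape would also explain why the deleted template's transform(self, data) signature was dropped: etl() calls each step without arguments, and state moves between steps via instance attributes such as crude_df and petroleum_df.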
