-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
182 additions
and
272 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,93 +1,10 @@ | ||
attrs @ file:///C:/b/abs_35n0jusce8/croot/attrs_1695717880170/work | ||
beautifulsoup4==4.12.3 | ||
boto3==1.34.27 | ||
botocore==1.34.27 | ||
Brotli @ file:///C:/Windows/Temp/abs_63l7912z0e/croots/recipe/brotli-split_1659616056886/work | ||
bs4==0.0.2 | ||
cachetools==5.3.2 | ||
certifi @ file:///C:/b/abs_91u83siphd/croot/certifi_1700501720658/work/certifi | ||
cffi @ file:///C:/b/abs_924gv1kxzj/croot/cffi_1700254355075/work | ||
chardet==5.2.0 | ||
charset-normalizer==3.3.2 | ||
click==8.1.7 | ||
colorama==0.4.6 | ||
cryptography @ file:///C:/b/abs_e8cnom_zw_/croot/cryptography_1702071486468/work | ||
cssselect==1.2.0 | ||
decorator==5.1.1 | ||
distro==1.9.0 | ||
Elixir==0.7.1 | ||
et-xmlfile==1.1.0 | ||
exceptiongroup @ file:///C:/b/abs_c5h1o1_b5b/croot/exceptiongroup_1706031441653/work | ||
ghostscript==0.7 | ||
google-api-core==2.15.0 | ||
google-auth==2.27.0 | ||
google-cloud-core==2.4.1 | ||
google-cloud-storage==2.14.0 | ||
google-crc32c @ file:///C:/b/abs_f8g37ql__2/croot/google-crc32c_1667946622512/work | ||
google-resumable-media==2.7.0 | ||
googleapis-common-protos==1.62.0 | ||
greenlet @ file:///C:/b/abs_a6c75ie0bc/croot/greenlet_1702060012174/work | ||
h11==0.14.0 | ||
helper-functions-ea @ git+https://git@github.com/energyaspects/helper_functions.git@31531d83a80f324453173e05807145af4a583a2a | ||
idna @ file:///C:/b/abs_bdhbebrioa/croot/idna_1666125572046/work | ||
Jinja2==3.1.3 | ||
jmespath @ file:///C:/b/abs_59jpuaows7/croot/jmespath_1700144635019/work | ||
JPype1==1.5.0 | ||
lxml==5.1.0 | ||
MarkupSafe==2.1.4 | ||
mkl-fft @ file:///C:/b/abs_19i1y8ykas/croot/mkl_fft_1695058226480/work | ||
mkl-random @ file:///C:/b/abs_edwkj1_o69/croot/mkl_random_1695059866750/work | ||
mkl-service==2.4.0 | ||
numpy @ file:///C:/b/abs_16b2j7ad8n/croot/numpy_and_numpy_base_1704311752418/work/dist/numpy-1.26.3-cp39-cp39-win_amd64.whl#sha256=02e606e23ca31bb00a40d147fd1ce4dd7d241395346a4196592d5abe54a333bc | ||
opencv-python==4.9.0.80 | ||
openpyxl==3.1.2 | ||
outcome @ file:///tmp/build/80754af9/outcome_1609338780791/work | ||
packaging==23.2 | ||
pandas==2.2.0 | ||
pbr==6.0.0 | ||
pdfminer.six==20231228 | ||
pdftopng==0.2.3 | ||
protobuf==4.25.2 | ||
psycopg2-binary==2.9.9 | ||
pyasn1==0.5.1 | ||
pyasn1-modules==0.3.0 | ||
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work | ||
PyMuPDFb==1.23.9 | ||
PyMySQL==1.1.0 | ||
pyOpenSSL @ file:///C:/b/abs_08f38zyck4/croot/pyopenssl_1690225407403/work | ||
pypdf==4.0.0 | ||
PyPDF2==3.0.1 | ||
pyquery==2.0.0 | ||
PySocks @ file:///C:/ci/pysocks_1605307512533/work | ||
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work | ||
python-dotenv==1.0.1 | ||
python-http-client==3.3.7 | ||
pytz @ file:///C:/b/abs_19q3ljkez4/croot/pytz_1695131651401/work | ||
requests @ file:///C:/b/abs_316c2inijk/croot/requests_1690400295842/work | ||
roman==4.1 | ||
rsa==4.9 | ||
s3transfer==0.10.0 | ||
selenium==4.17.2 | ||
sendgrid==6.11.0 | ||
shooju==3.8.13 | ||
six @ file:///tmp/build/80754af9/six_1644875935023/work | ||
sniffio @ file:///C:/b/abs_3akdewudo_/croot/sniffio_1705431337396/work | ||
sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1623949099177/work | ||
soupsieve @ file:///C:/b/abs_bbsvy9t4pl/croot/soupsieve_1696347611357/work | ||
SQLAlchemy==0.7.10 | ||
sqlalchemy-migrate==0.11.0 | ||
sqlparse==0.4.4 | ||
starkbank-ecdsa==2.2.0 | ||
tabula-py==2.9.0 | ||
tabulate==0.9.0 | ||
Tempita==0.5.2 | ||
trio @ file:///C:/b/abs_3bsokxbl8q/croot/trio_1705518572139/work | ||
trio-websocket==0.11.1 | ||
typing_extensions @ file:///C:/b/abs_72cdotwc_6/croot/typing_extensions_1705599364138/work | ||
tzdata==2023.4 | ||
urllib3 @ file:///C:/b/abs_9cmlsrm3ys/croot/urllib3_1698257595508/work | ||
webdriver-manager==4.0.1 | ||
win-inet-pton @ file:///C:/ci/win_inet_pton_1605306162074/work | ||
wsproto==1.2.0 | ||
xlrd==0.7.1 | ||
xlwt==0.7.2 | ||
# Edit this list so it matches the packages you actually have installed. | ||
# Do not install this requirements file blindly :) | ||
pandas | ||
numpy | ||
shooju | ||
pylint | ||
black | ||
invoke | ||
python-dotenv | ||
tabula-py |
File renamed without changes.
69 changes: 69 additions & 0 deletions
69
src/india_mopng_etl/india_mopng_scrapper/india_mopng_scrape_june2023_to_latest.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from urllib.parse import urljoin | ||
import tabula | ||
import pandas as pd | ||
|
||
from helper_functions_ea import Logger | ||
from india_mopng_etl.utils.base_classes import DataExtractor | ||
from india_mopng_etl.utils.helper_functions import get_pdf_lists, get_pdf_info | ||
|
||
|
||
class June2023ToLatest(DataExtractor):
    """Extract and tidy the tables found in the PDFs linked from ``WEBSITE_URL``.

    ``extract`` gathers the PDF links and accumulates the crude and petroleum
    tables returned by ``get_pdf_info``; ``transform`` gives the leading
    columns of both tables descriptive names.
    """

    # NOTE(review): assumes ``Logger(...).logger`` is a standard
    # ``logging.Logger`` (``warning``/``debug`` are used below) - confirm.
    logger = Logger("India Mopng Scrape => ").logger
    # name = "India Mopng Scrape ETL"
    WEBSITE_URL = "https://mopng.gov.in/en/petroleum-statistics/monthly-production"

    def __init__(self):
        # Filled in by get_list_of_pdfs().
        self.list_of_pdfs = []
        # Filled in by extract(), then renamed in place by transform().
        self.crude_df = pd.DataFrame()
        self.petroleum_df = pd.DataFrame()

    def get_list_of_pdfs(self):
        """Store the PDF links found on ``WEBSITE_URL`` in ``self.list_of_pdfs``.

        Raises:
            Exception: whatever ``get_pdf_lists`` raised, re-raised unchanged
                after being logged.
        """
        self.logger.info("Getting List of PDFs")
        try:
            self.list_of_pdfs = get_pdf_lists(self.WEBSITE_URL)
        except Exception as e:
            self.logger.error(f"India Mopng scrapper failed at getting list of PDFs, error was {e}")
            # A bare ``raise`` keeps the original exception type and traceback;
            # wrapping it in ``Exception(e)`` would discard both.
            raise

    def extract(self):
        """Collect the PDF links and build the crude and petroleum tables.

        Every PDF path is handed to ``get_pdf_info`` together with the running
        accumulator; the frames collected under each key are then concatenated
        into ``self.crude_df`` and ``self.petroleum_df``.
        """
        self.get_list_of_pdfs()

        dataframes = {'crude_dataframe': [], 'petroleum_dataframe': []}
        self.logger.info("Extracting PDF data")
        for pdf_path in self.list_of_pdfs:
            dataframes = get_pdf_info(pdf_path, dataframes)
        self.crude_df = self._concat_or_empty(dataframes['crude_dataframe'], "crude")
        self.petroleum_df = self._concat_or_empty(dataframes['petroleum_dataframe'], "petroleum")
        self.logger.info("Extracted PDF data")

    def transform(self):
        """Rename the first two columns of the crude and petroleum tables."""
        self.logger.info("Transforming data")
        # Example Crude Data positions and new names
        crude_column_positions_to_rename = {
            0: 'Name of Undertaking/Unit/State',
            1: 'Production during the Preceding month of current year',
        }
        # Example Petroleum Data positions and new names
        petroleum_column_positions_to_rename = {
            0: 'Name of Undertaking/Unit/State',
            1: 'Production during the Preceding month of current year',
        }

        self._rename_columns_by_position(self.crude_df, crude_column_positions_to_rename)
        self._rename_columns_by_position(self.petroleum_df, petroleum_column_positions_to_rename)

        # Log the frames instead of printing them to stdout.
        self.logger.debug("Crude data:\n%s", self.crude_df)
        self.logger.debug("Petroleum data:\n%s", self.petroleum_df)
        self.logger.info("Transformed Data")

    def _concat_or_empty(self, frames, label):
        """Concatenate *frames*, or return an empty DataFrame when there are none.

        ``pd.concat`` raises ``ValueError`` on an empty sequence, which would
        otherwise abort the run whenever no PDF produced a table of this kind.
        """
        if not frames:
            self.logger.warning(f"No {label} tables were extracted from the PDFs")
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    @staticmethod
    def _rename_columns_by_position(frame, names_by_position):
        """Rename columns of *frame* in place, addressing them by position.

        Positions beyond the last column are ignored. The new labels are
        assigned positionally rather than through a label-keyed ``rename``, so
        only the targeted column changes even when labels are duplicated.
        """
        columns = list(frame.columns)
        for position, new_name in names_by_position.items():
            # Ensure the position is within the range of existing columns
            if position < len(columns):
                columns[position] = new_name
        frame.columns = columns
133 changes: 0 additions & 133 deletions
133
src/india_mopng_etl/june2023_to_latest/india_mopng_scrape_june2023_to_latest.py
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.