From a54f1cae40d493ac673de09b08cd3da831bb062f Mon Sep 17 00:00:00 2001 From: Sachith R Date: Thu, 15 Apr 2021 16:37:04 -0400 Subject: [PATCH 1/3] CreationalPattern --- src/covidify/data_prep.py | 9 +- src/covidify/sources/github.py | 8 +- src/covidify/sources/githubSourceBuilder.py | 24 +++ src/covidify/sources/github_GHU_Builder.py | 170 ++++++++++++++++++++ 4 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 src/covidify/sources/githubSourceBuilder.py create mode 100644 src/covidify/sources/github_GHU_Builder.py diff --git a/src/covidify/data_prep.py b/src/covidify/data_prep.py index 6153d44..8202b8f 100644 --- a/src/covidify/data_prep.py +++ b/src/covidify/data_prep.py @@ -21,7 +21,7 @@ from difflib import get_close_matches from datetime import datetime, date, time -from covidify.sources import github, wiki +from covidify.sources import gitHub_GHU_Builder from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA from covidify.utils.utils import replace_arg_score @@ -227,4 +227,9 @@ def get_day_counts(d, country): log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name)) print('...', log_file_name) -print('Done!') \ No newline at end of file +print('Done!') + + + +Class IBuilder(metaclass = ABCMeta) + df = github.get() \ No newline at end of file diff --git a/src/covidify/sources/github.py b/src/covidify/sources/github.py index 38c7a71..482576f 100644 --- a/src/covidify/sources/github.py +++ b/src/covidify/sources/github.py @@ -157,7 +157,6 @@ def get(): sys.exit(1) sheets = os.listdir(DATA) - # Clean the result to the sheet tabs we want print('Getting sheets...') cleaned_sheets = clean_sheet_names(sheets) @@ -166,4 +165,9 @@ def get(): df = get_data(cleaned_sheets) #Clean the column names - return df \ No newline at end of file + return df + +Director() +{ + #fetches the builder +} \ No newline at end of file diff --git a/src/covidify/sources/githubSourceBuilder.py b/src/covidify/sources/githubSourceBuilder.py new file mode 100644 index 0000000..63f4b0d --- /dev/null +++ b/src/covidify/sources/githubSourceBuilder.py @@ -0,0 +1,24 @@ +class dataSourceBuilder(metaclass = ABCMeta) + def clean_sheet_names(new_ranges): + # Remove all sheets that dont have a numeric header + + def clone_repo(TMP_FOLDER, REPO): + #clone data repositary + + def get_date(last_update): + #date of the last update + + def get_csv_date(f): + #gets data of the csv file + + def fix_country_names(tmp_df) + #country names in appropriate order to have function properly + + def clean_data(df): + #have all the data we need to clean, clean the data + + def get_data(cleaned_sheets) + #function get data from a clean sheet + + def get(): + # use this function to fetch the date \ No newline at end of file diff --git a/src/covidify/sources/github_GHU_Builder.py b/src/covidify/sources/github_GHU_Builder.py new file mode 100644 index 0000000..2215070 --- /dev/null +++ b/src/covidify/sources/github_GHU_Builder.py @@ -0,0 +1,170 @@ +from __future__ import print_function +import pandas as pd +import re +import os +import sys +import git +import numpy as np +from tqdm import tqdm +from time import strftime +from dateutil.parser import parse +from datetime import datetime, date, time +from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA, KEEP_COLS, NUMERIC_COLS + +class gitHub_GHU_Builder(githubSourceBuilder) +{ + def clean_sheet_names(new_ranges): + # Remove all sheets that dont have a numeric header + return [x for x in new_ranges if re.search(r'\d', x)] + +def clone_repo(TMP_FOLDER, REPO): + print('Cloning Data Repo...') + git.Git(TMP_FOLDER).clone(REPO) + +def get_date(last_update): + return parse(str(last_update).split(' ')[0]).strftime("%Y-%m-%d") + +def get_csv_date(f): + return get_date(f.split('.')[0] + ' ') + + +def fix_country_names(tmp_df): + ''' + Cleaning up after JHU's bullshit data management + ''' + # Asian Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'Mainland China'),'China', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Korea, South'),'South Korea', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Korea'),'South Korea', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Hong Kong SAR'),'Hong Kong', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Taipei and environs'),'Taiwan', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Taiwan*'),'Taiwan', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Macao SAR'),'Macau', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Iran (Islamic Republic of)'),'Iran', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Viet Nam'),'Vietnam', tmp_df['country']) + + #European Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'UK'),'United Kingdom', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == ' Azerbaijan'),'Azerbaijan', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Bosnia and Herzegovina'),'Bosnia', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Czech Republic'),'Czechia', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Ireland'),'Ireland', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'North Ireland'),'Ireland', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Moldova'),'Moldova', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Russian Federation'),'Russia', tmp_df['country']) + + #African Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'Congo (Brazzaville)'),'Congo', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Congo (Kinshasa)'),'Congo', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of the Congo'),'Congo', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Gambia, The'),'Gambia', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'The Gambia'),'Gambia', tmp_df['country']) + + # Western Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'USA'),'America', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'US'),'America', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Bahamas, The'),'The Bahamas', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Bahamas'),'The Bahamas', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'st. Martin'),'Saint Martin', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'St. Martin'),'Saint Martin', tmp_df['country']) + + + # Others + tmp_df['country'] = np.where((tmp_df['country'] == 'Cruise Ship'),'Others', tmp_df['country']) + + return tmp_df + +# Now that we have all the data we now need to clean it +# - Fill null values +# - remore suspected values +# - change column names +def clean_data(df): + tmp_df = df.copy() + + if 'Demised' in tmp_df.columns: + tmp_df.rename(columns={'Demised':'deaths'}, inplace=True) + + if 'Country/Region' in tmp_df.columns: + tmp_df.rename(columns={'Country/Region':'country'}, inplace=True) + + if 'Country_Region' in tmp_df.columns: + tmp_df.rename(columns={'Country_Region':'country'}, inplace=True) + + if 'Province/State' in tmp_df.columns: + tmp_df.rename(columns={'Province/State':'province'}, inplace=True) + + if 'Province_State' in tmp_df.columns: + tmp_df.rename(columns={'Province_State':'province'}, inplace=True) + + if 'Last Update' in tmp_df.columns: + tmp_df.rename(columns={'Last Update':'datetime'}, inplace=True) + + if 'Last_Update' in tmp_df.columns: + tmp_df.rename(columns={'Last_Update':'datetime'}, inplace=True) + + #Lower case all col names + tmp_df.columns = map(str.lower, tmp_df.columns) + + for col in tmp_df[NUMERIC_COLS]: + tmp_df[col] = tmp_df[col].fillna(0) + tmp_df[col] = tmp_df[col].astype(int) + + return tmp_df + +def get_data(cleaned_sheets): + all_csv = [] + # Import all CSV's + for f in tqdm(sorted(cleaned_sheets), desc='... loading data: '): + if 'csv' in f: + try: + tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,header=0, parse_dates=['Last Update']) + except: + # Temporary fix for JHU's bullshit data management + tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,header=0, parse_dates=['Last_Update']) + + tmp_df = clean_data(tmp_df) + tmp_df['date'] = tmp_df['datetime'].apply(get_date) # remove time to get date + tmp_df['file_date'] = get_csv_date(f) #Get date of csv from file name + tmp_df = tmp_df[KEEP_COLS] + tmp_df['province'].fillna(tmp_df['country'], inplace=True) #If no region given, fill it with country + all_csv.append(tmp_df) + + df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True) # concatenate all csv's into one df + df_raw = fix_country_names(df_raw) # Fix mispelled country names + df_raw = df_raw.sort_values(by=['datetime']) + return df_raw + + +# use this function to fetch the data +def get(): + + # Create Tmp Folder + if not os.path.isdir(TMP_FOLDER): + print('Creating folder...') + print('...', TMP_FOLDER) + os.mkdir(TMP_FOLDER) + + #Check if repo exists + #git pull if it does + if not os.path.isdir(TMP_GIT): + clone_repo(TMP_FOLDER, REPO) + else: + try: + print('git pull from', REPO) + rep = git.Repo(TMP_GIT) + rep.remotes.origin.pull() + except: + print('Could not pull from', REPO) + sys.exit(1) + + sheets = os.listdir(DATA) + # Clean the result to the sheet tabs we want + print('Getting sheets...') + cleaned_sheets = clean_sheet_names(sheets) + + # Aggregate all the data from sheets + df = get_data(cleaned_sheets) + + #Clean the column names + return df +} \ No newline at end of file From d1bf1ce4a5b58ce78ef62db7e1263af41847e494 Mon Sep 17 00:00:00 2001 From: Sachith R Date: Thu, 15 Apr 2021 18:42:34 -0400 Subject: [PATCH 2/3] CreationalP --- src/covidify/sources/Director.py | 12 ++++++++++++ src/covidify/sources/github.py | 7 +------ .../{github_GHU_Builder.py => github_JHU_Builder.py} | 0 3 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 src/covidify/sources/Director.py rename src/covidify/sources/{github_GHU_Builder.py => github_JHU_Builder.py} (100%) diff --git a/src/covidify/sources/Director.py b/src/covidify/sources/Director.py new file mode 100644 index 0000000..34ec6c8 --- /dev/null +++ b/src/covidify/sources/Director.py @@ -0,0 +1,12 @@ +class Director: +{ + #fetches the builder + def __init__(self) -> None; + self._builder = None + + def builder(self) -> githubSourceBuilder + return self._builder + + def builder(self, builder:githubSourceBuilder) -> None: + self._builder = builder +} \ No newline at end of file diff --git a/src/covidify/sources/github.py b/src/covidify/sources/github.py index 482576f..fc1ec9d 100644 --- a/src/covidify/sources/github.py +++ b/src/covidify/sources/github.py @@ -165,9 +165,4 @@ def get(): df = get_data(cleaned_sheets) #Clean the column names - return df - -Director() -{ - #fetches the builder -} \ No newline at end of file + return df \ No newline at end of file diff --git a/src/covidify/sources/github_GHU_Builder.py b/src/covidify/sources/github_JHU_Builder.py similarity index 100% rename from src/covidify/sources/github_GHU_Builder.py rename to src/covidify/sources/github_JHU_Builder.py From df1a69113fc0f4db59a026db0b34f0993599781a Mon Sep 17 00:00:00 2001 From: Sachith R Date: Thu, 15 Apr 2021 21:49:23 -0400 Subject: [PATCH 3/3] Creational Pattern --- src/covidify/covidify_data.py | 11 ++ src/covidify/data_prep.py | 235 ---------------------------------- 2 files changed, 11 insertions(+), 235 deletions(-) create mode 100644 src/covidify/covidify_data.py delete mode 100644 src/covidify/data_prep.py diff --git a/src/covidify/covidify_data.py b/src/covidify/covidify_data.py new file mode 100644 index 0000000..c475723 --- /dev/null +++ b/src/covidify/covidify_data.py @@ -0,0 +1,11 @@ +from convidify.sources import github, wiki + +""" +global values +""" +class DataStore(): + def jhu_sources(self): + return github.get() + + def wiki_sources(self): + return github.get() \ No newline at end of file diff --git a/src/covidify/data_prep.py b/src/covidify/data_prep.py deleted file mode 100644 index 8202b8f..0000000 --- a/src/covidify/data_prep.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -data_prep.py - Extract data from date range and create models -Usage: - data_prep.py [options] - data_prep.py -h | --help - -Options: - -h --help Show this message. - --output_folder=OUT Output folder for the data and reports to be saved. - --source=SRC Datasource for where the data will be downloaded from. - --country=CNT Arg for filtering by a specific country - --top=top Top number of countries in the log plot -""" -from __future__ import print_function -import os -import sys -import docopt -import numpy as np -import pandas as pd -from string import capwords -from difflib import get_close_matches -from datetime import datetime, date, time - -from covidify.sources import gitHub_GHU_Builder -from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA -from covidify.utils.utils import replace_arg_score - - -args = docopt.docopt(__doc__) -out = args['--output_folder'] -country = args['--country'] -source = args['--source'] -top = int(args['--top']) - - -############ DATA SELECTION ############ - -if '_' in country: - country = replace_arg_score(country) - -if country == 'Global': - country = None - -if source == 'JHU': - df = github.get() - -elif source == 'wiki': - print('Apologies, the wikipedia source is not ready yet - getting github data') - df = github.get() - - - -############ COUNTRY SELECTION ############ - -def get_similar_countries(c, country_list): - pos_countries = get_close_matches(c, country_list) - - if len(pos_countries) > 0: - print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m') - - #Only delete if its a covidify generated folder - if 'Desktop/covidify-output-' in out: - os.system('rm -rf ' + out) - sys.exit(1) - else: - print('\033[1;31m'+c, 'was not listed.\033[0;0m') - if 'Desktop/covidify-output-' in out: - os.system('rm -rf ' + out) - sys.exit(1) - -def check_specified_country(df, country): - ''' - let user filter reports by country, if not found - then give a option if the string is similar - ''' - - # Get all unique countries in the data - country_list = list(map(lambda x:x.lower().strip(), set(df.country.values))) - - if country: - print('Country specified!') - if country.lower() == 'Mainland China': #Mainland china and china doesn't come up as similar - print(country, 'was not listed. did you mean China?') - sys.exit(1) - # give similar option if similarity found - if country.lower() not in country_list: - get_similar_countries(country, country_list) - - else: - #Return filtered dataframe - print('... filtering data for', country) - if len(country) == 2: - df = df[df.country == country.upper()] - else: - df = df[df.country == capwords(country)] - return df - else: - print('... No specific country specified') - return df - -df = check_specified_country(df, country) - -############ DAILY CASES ############ - -# sheets need to be sorted by date value -# print('Sorting by datetime...') -df = df.sort_values('datetime') - -current_date = str(datetime.date(datetime.now())) - -''' -Get the difference of the sum totals for each -date and plot them on a trendline graph -''' -def get_new_cases(tmp, col): - diff_list = [] - tmp_df_list = [] - df = tmp.copy() - - for i, day in enumerate(df.sort_values('file_date').file_date.unique()): - tmp_df = df[df.file_date == day] - tmp_df_list.append(tmp_df[col].sum()) - - if i == 0: - diff_list.append(tmp_df[col].sum()) - else: - diff_list.append(tmp_df[col].sum() - tmp_df_list[i-1]) - - return diff_list - -def get_moving_average(tmp, col): - df = tmp.copy() - return df[col].rolling(window=2).mean() - -def get_exp_moving_average(tmp, col): - df = tmp.copy() - return df[col].ewm(span=2, adjust=True).mean() - - -print('... Calculating dataframe for new cases') -daily_cases_df = pd.DataFrame([]) -daily_cases_df['date'] = df.file_date.unique() -daily_cases_df = daily_cases_df.sort_values('date') -daily_cases_df['new_confirmed_cases'] = get_new_cases(df, 'confirmed') -daily_cases_df['new_deaths'] = get_new_cases(df, 'deaths') -daily_cases_df['new_recoveries'] = get_new_cases(df, 'recovered') -daily_cases_df['cumulative_cases'] = daily_cases_df.new_confirmed_cases.cumsum() -daily_cases_df.insert(loc=0, column='day', value=np.arange(0, len(daily_cases_df))) - -''' -Calculate the number of people that are ACTUALLY infected on a given day -currently infected = sum of people date - (recovored + died) -ex: 5 = 10 - (4 - 1) - -''' -current_infected = pd.DataFrame([]) -current_infected['currently_infected'] = (df.groupby('file_date').confirmed.sum() - (df.groupby('file_date').deaths.sum() + df.groupby('file_date').recovered.sum())) -current_infected['delta'] = (current_infected['currently_infected'] - df.groupby('file_date').confirmed.sum()) -current_infected.index.rename('date', inplace=True) - -daily_cases_df = pd.merge(daily_cases_df, current_infected, how='outer', on='date') - -############ LOG DATA ############ - -print('Calculating data for logarithmic plotting...') -if not country: - print('... top infected countries: {}'.format(top)) - -def get_top_countries(data): - # Get top N infected countries - tmp_df = data.copy() - tmp_df = tmp_df[tmp_df.file_date == df.file_date.max()] - return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index - -TOP_N_COUNTRIES = get_top_countries(df) - -tmp_df = df[df.country.isin(TOP_N_COUNTRIES)].copy() - -def get_day_counts(d, country): - ''' - For each country, get the days of the spread since 500 - cases - ''' - data = d.copy() - result_df = pd.DataFrame([]) - result_df = data.groupby(['file_date']).agg({'confirmed': 'sum', - 'recovered': 'sum', - 'deaths': 'sum'}) - result_df['date'] = data['file_date'].unique() - result_df['country'] = country - - result_df = result_df[result_df.confirmed >= 500] - result_df.insert(loc=0, column='day', value=np.arange(len(result_df))) - return result_df - -df_list = [] - -for country in TOP_N_COUNTRIES: - print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) & - (tmp_df.country == country)].confirmed.sum())) - df_list.append(get_day_counts(tmp_df[tmp_df.country == country], country)) - -log_df = pd.concat(df_list, axis=0, ignore_index=True) - - -############ SAVE DATA ############ -#Create date of extraction folder -data_folder = os.path.join('data', str(datetime.date(datetime.now()))) -save_dir = os.path.join(out, data_folder) - -if not os.path.exists(save_dir): - os.system('mkdir -p ' + save_dir) - -print('Creating subdirectory for data...') -print('...', save_dir) - -print('Saving...') -csv_file_name = 'agg_data_{}.csv'.format(datetime.date(datetime.now())) -df.astype(str).to_csv(os.path.join(save_dir, csv_file_name)) -print('...', csv_file_name) - -daily_cases_file_name = 'trend_{}.csv'.format(datetime.date(datetime.now())) -daily_cases_df.astype(str).to_csv(os.path.join(save_dir, daily_cases_file_name)) -print('...', daily_cases_file_name) - -log_file_name = 'log_{}.csv'.format(datetime.date(datetime.now())) -log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name)) -print('...', log_file_name) - -print('Done!') - - - -Class IBuilder(metaclass = ABCMeta) - df = github.get() \ No newline at end of file