From a54f1cae40d493ac673de09b08cd3da831bb062f Mon Sep 17 00:00:00 2001
From: Sachith R <bearsolitary@gmail.com>
Date: Thu, 15 Apr 2021 16:37:04 -0400
Subject: [PATCH 1/3] CreationalPattern

---
 src/covidify/data_prep.py                   |   9 +-
 src/covidify/sources/github.py              |   8 +-
 src/covidify/sources/githubSourceBuilder.py |  24 +++
 src/covidify/sources/github_GHU_Builder.py  | 170 ++++++++++++++++++++
 4 files changed, 207 insertions(+), 4 deletions(-)
 create mode 100644 src/covidify/sources/githubSourceBuilder.py
 create mode 100644 src/covidify/sources/github_GHU_Builder.py

diff --git a/src/covidify/data_prep.py b/src/covidify/data_prep.py
index 6153d44..8202b8f 100644
--- a/src/covidify/data_prep.py
+++ b/src/covidify/data_prep.py
@@ -21,7 +21,7 @@
 from difflib import get_close_matches
 from datetime import datetime, date, time 
 
-from covidify.sources import github, wiki
+from covidify.sources import gitHub_GHU_Builder
 from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA
 from covidify.utils.utils import replace_arg_score
 
@@ -227,4 +227,9 @@ def get_day_counts(d, country):
 log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name))
 print('...', log_file_name)
 
-print('Done!')
\ No newline at end of file
+print('Done!')
+
+
+
+Class IBuilder(metaclass = ABCMeta)
+    df = github.get()
\ No newline at end of file
diff --git a/src/covidify/sources/github.py b/src/covidify/sources/github.py
index 38c7a71..482576f 100644
--- a/src/covidify/sources/github.py
+++ b/src/covidify/sources/github.py
@@ -157,7 +157,6 @@ def get():
             sys.exit(1)
 
     sheets = os.listdir(DATA)
-    
     # Clean the result to the sheet tabs we want
     print('Getting sheets...')
     cleaned_sheets = clean_sheet_names(sheets)
@@ -166,4 +165,9 @@ def get():
     df = get_data(cleaned_sheets)
     
     #Clean the column names
-    return df
\ No newline at end of file
+    return df
+
+Director()
+{
+    #fetches the builder
+}
\ No newline at end of file
diff --git a/src/covidify/sources/githubSourceBuilder.py b/src/covidify/sources/githubSourceBuilder.py
new file mode 100644
index 0000000..63f4b0d
--- /dev/null
+++ b/src/covidify/sources/githubSourceBuilder.py
@@ -0,0 +1,45 @@
+from abc import ABCMeta, abstractmethod
+
+
+class dataSourceBuilder(metaclass=ABCMeta):
+    """Interface that every data-source builder has to implement."""
+
+    @staticmethod
+    @abstractmethod
+    def clean_sheet_names(new_ranges):
+        """Remove all sheets that don't have a numeric header."""
+
+    @staticmethod
+    @abstractmethod
+    def clone_repo(TMP_FOLDER, REPO):
+        """Clone the data repository."""
+
+    @staticmethod
+    @abstractmethod
+    def get_date(last_update):
+        """Return the date of the last update."""
+
+    @staticmethod
+    @abstractmethod
+    def get_csv_date(f):
+        """Return the date of the CSV file."""
+
+    @staticmethod
+    @abstractmethod
+    def fix_country_names(tmp_df):
+        """Normalise country names so the rest of the pipeline works properly."""
+
+    @staticmethod
+    @abstractmethod
+    def clean_data(df):
+        """Clean the data once all of it has been collected."""
+
+    @staticmethod
+    @abstractmethod
+    def get_data(cleaned_sheets):
+        """Get the data from the cleaned sheets."""
+
+    @staticmethod
+    @abstractmethod
+    def get():
+        """Fetch the data; this is the entry point callers should use."""
\ No newline at end of file
diff --git a/src/covidify/sources/github_GHU_Builder.py b/src/covidify/sources/github_GHU_Builder.py
new file mode 100644
index 0000000..2215070
--- /dev/null
+++ b/src/covidify/sources/github_GHU_Builder.py
@@ -0,0 +1,162 @@
+from __future__ import print_function
+import pandas as pd
+import re
+import os
+import sys
+import git
+import numpy as np
+from tqdm import tqdm
+from time import strftime
+from dateutil.parser import parse
+from datetime import datetime, date, time 
+from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA, KEEP_COLS, NUMERIC_COLS
+from covidify.sources.githubSourceBuilder import dataSourceBuilder as githubSourceBuilder
+
+# Raw report headers -> column names used after cleaning.
+COLUMN_RENAMES = {
+    'Demised': 'deaths',
+    'Country/Region': 'country',
+    'Country_Region': 'country',
+    'Province/State': 'province',
+    'Province_State': 'province',
+    'Last Update': 'datetime',
+    'Last_Update': 'datetime',
+}
+
+# Variant country spellings -> the single name used after cleaning.
+COUNTRY_NAME_FIXES = {
+    # Asian Countries
+    'Mainland China': 'China',
+    'Korea, South': 'South Korea',
+    'Republic of Korea': 'South Korea',
+    'Hong Kong SAR': 'Hong Kong',
+    'Taipei and environs': 'Taiwan',
+    'Taiwan*': 'Taiwan',
+    'Macao SAR': 'Macau',
+    'Iran (Islamic Republic of)': 'Iran',
+    'Viet Nam': 'Vietnam',
+    # European Countries
+    'UK': 'United Kingdom',
+    ' Azerbaijan': 'Azerbaijan',
+    'Bosnia and Herzegovina': 'Bosnia',
+    'Czech Republic': 'Czechia',
+    'Republic of Ireland': 'Ireland',
+    'North Ireland': 'Ireland',
+    'Republic of Moldova': 'Moldova',
+    'Russian Federation': 'Russia',
+    # African Countries
+    'Congo (Brazzaville)': 'Congo',
+    'Congo (Kinshasa)': 'Congo',
+    'Republic of the Congo': 'Congo',
+    'Gambia, The': 'Gambia',
+    'The Gambia': 'Gambia',
+    # Western Countries
+    'USA': 'America',
+    'US': 'America',
+    'Bahamas, The': 'The Bahamas',
+    'Bahamas': 'The Bahamas',
+    'st. Martin': 'Saint Martin',
+    'St. Martin': 'Saint Martin',
+    # Others
+    'Cruise Ship': 'Others',
+}
+
+
+class gitHub_GHU_Builder(githubSourceBuilder):
+    """Builder that loads the JHU report CSVs from the cloned data repository."""
+
+    @staticmethod
+    def clean_sheet_names(new_ranges):
+        """Return the entries of ``new_ranges`` whose name contains a digit."""
+        return [x for x in new_ranges if re.search(r'\d', x)]
+
+    @staticmethod
+    def clone_repo(TMP_FOLDER, REPO):
+        """Clone ``REPO`` from inside the ``TMP_FOLDER`` directory."""
+        print('Cloning Data Repo...')
+        git.Git(TMP_FOLDER).clone(REPO)
+
+    @staticmethod
+    def get_date(last_update):
+        """Return the text before the first space of ``last_update`` as 'YYYY-MM-DD'."""
+        return parse(str(last_update).split(' ')[0]).strftime("%Y-%m-%d")
+
+    @staticmethod
+    def get_csv_date(f):
+        """Return the date parsed from the part of file name ``f`` before the first '.'."""
+        return gitHub_GHU_Builder.get_date(f.split('.')[0] + ' ')
+
+    @staticmethod
+    def fix_country_names(tmp_df):
+        """Rewrite known variant spellings in ``tmp_df['country']`` in place and return it."""
+        tmp_df['country'] = tmp_df['country'].replace(COUNTRY_NAME_FIXES)
+        return tmp_df
+
+    @staticmethod
+    def clean_data(df):
+        """Return a copy of ``df`` with renamed, lower-cased column names.
+
+        Null values in the ``NUMERIC_COLS`` columns become 0 and the columns are cast to int.
+        """
+        tmp_df = df.rename(columns=COLUMN_RENAMES)
+        # Lower case all col names
+        tmp_df.columns = map(str.lower, tmp_df.columns)
+        for col in tmp_df[NUMERIC_COLS]:
+            tmp_df[col] = tmp_df[col].fillna(0).astype(int)
+        return tmp_df
+
+    @staticmethod
+    def get_data(cleaned_sheets):
+        """Load every CSV named in ``cleaned_sheets`` from ``DATA`` into one cleaned dataframe."""
+        all_csv = []
+        for f in tqdm(sorted(cleaned_sheets), desc='... loading data: '):
+            if 'csv' not in f:
+                continue
+            path = os.path.join(DATA, f)
+            try:
+                tmp_df = pd.read_csv(path, index_col=None, header=0, parse_dates=['Last Update'])
+            except ValueError:
+                # No 'Last Update' column in this file: retry with the 'Last_Update' spelling
+                tmp_df = pd.read_csv(path, index_col=None, header=0, parse_dates=['Last_Update'])
+
+            tmp_df = gitHub_GHU_Builder.clean_data(tmp_df)
+            tmp_df['date'] = tmp_df['datetime'].apply(gitHub_GHU_Builder.get_date)  # remove time to get date
+            tmp_df['file_date'] = gitHub_GHU_Builder.get_csv_date(f)  # Get date of csv from file name
+            # Copy so the fill below writes to an independent frame instead of a slice
+            tmp_df = tmp_df[KEEP_COLS].copy()
+            # If no region given, fill it with country
+            tmp_df['province'] = tmp_df['province'].fillna(tmp_df['country'])
+            all_csv.append(tmp_df)
+
+        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)  # concatenate all csv's into one df
+        df_raw = gitHub_GHU_Builder.fix_country_names(df_raw)  # Fix misspelled country names
+        return df_raw.sort_values(by=['datetime'])
+
+    @staticmethod
+    def get():
+        """Clone or update the data repository and return the aggregated dataframe."""
+        # Create Tmp Folder
+        if not os.path.isdir(TMP_FOLDER):
+            print('Creating folder...')
+            print('...', TMP_FOLDER)
+            os.mkdir(TMP_FOLDER)
+
+        # Clone the repo if it is missing, otherwise git pull; exit(1) when the pull fails
+        if not os.path.isdir(TMP_GIT):
+            gitHub_GHU_Builder.clone_repo(TMP_FOLDER, REPO)
+        else:
+            try:
+                print('git pull from', REPO)
+                rep = git.Repo(TMP_GIT)
+                rep.remotes.origin.pull()
+            except Exception:
+                print('Could not pull from', REPO)
+                sys.exit(1)
+
+        sheets = os.listdir(DATA)
+        # Clean the result to the sheet tabs we want
+        print('Getting sheets...')
+        cleaned_sheets = gitHub_GHU_Builder.clean_sheet_names(sheets)
+
+        # Aggregate all the data from sheets
+        return gitHub_GHU_Builder.get_data(cleaned_sheets)
\ No newline at end of file

From d1bf1ce4a5b58ce78ef62db7e1263af41847e494 Mon Sep 17 00:00:00 2001
From: Sachith R <bearsolitary@gmail.com>
Date: Thu, 15 Apr 2021 18:42:34 -0400
Subject: [PATCH 2/3] CreationalP

---
 src/covidify/sources/Director.py                     | 12 ++++++++++++
 src/covidify/sources/github.py                       |  7 +------
 .../{github_GHU_Builder.py => github_JHU_Builder.py} |  0
 3 files changed, 13 insertions(+), 6 deletions(-)
 create mode 100644 src/covidify/sources/Director.py
 rename src/covidify/sources/{github_GHU_Builder.py => github_JHU_Builder.py} (100%)

diff --git a/src/covidify/sources/Director.py b/src/covidify/sources/Director.py
new file mode 100644
index 0000000..34ec6c8
--- /dev/null
+++ b/src/covidify/sources/Director.py
@@ -0,0 +1,16 @@
+class Director:
+    """Keeps a reference to the builder it works with."""
+
+    def __init__(self) -> None:
+        # No builder is attached until the ``builder`` setter is used.
+        self._builder = None
+
+    @property
+    def builder(self) -> "githubSourceBuilder":
+        """Return the attached builder, or None when none has been set."""
+        return self._builder
+
+    @builder.setter
+    def builder(self, builder: "githubSourceBuilder") -> None:
+        """Replace the attached builder with ``builder``."""
+        self._builder = builder
\ No newline at end of file
diff --git a/src/covidify/sources/github.py b/src/covidify/sources/github.py
index 482576f..fc1ec9d 100644
--- a/src/covidify/sources/github.py
+++ b/src/covidify/sources/github.py
@@ -165,9 +165,4 @@ def get():
     df = get_data(cleaned_sheets)
     
     #Clean the column names
-    return df
-
-Director()
-{
-    #fetches the builder
-}
\ No newline at end of file
+    return df
\ No newline at end of file
diff --git a/src/covidify/sources/github_GHU_Builder.py b/src/covidify/sources/github_JHU_Builder.py
similarity index 100%
rename from src/covidify/sources/github_GHU_Builder.py
rename to src/covidify/sources/github_JHU_Builder.py

From df1a69113fc0f4db59a026db0b34f0993599781a Mon Sep 17 00:00:00 2001
From: Sachith R <bearsolitary@gmail.com>
Date: Thu, 15 Apr 2021 21:49:23 -0400
Subject: [PATCH 3/3] Creational Pattern

---
 src/covidify/covidify_data.py |  11 ++
 src/covidify/data_prep.py     | 235 ----------------------------------
 2 files changed, 11 insertions(+), 235 deletions(-)
 create mode 100644 src/covidify/covidify_data.py
 delete mode 100644 src/covidify/data_prep.py

diff --git a/src/covidify/covidify_data.py b/src/covidify/covidify_data.py
new file mode 100644
index 0000000..c475723
--- /dev/null
+++ b/src/covidify/covidify_data.py
@@ -0,0 +1,20 @@
+from covidify.sources import github, wiki
+
+"""
+global values
+"""
+class DataStore():
+    """Single access point for the dataframes of each supported source."""
+
+    def jhu_sources(self):
+        """Return the dataframe built by ``github.get()``."""
+        return github.get()
+
+    def wiki_sources(self):
+        """Return the dataframe for the wiki source.
+
+        NOTE(review): this calls ``github.get()`` and leaves the imported
+        ``wiki`` module unused -- presumably a placeholder until a wiki
+        loader exists; confirm before relying on it.
+        """
+        return github.get()
\ No newline at end of file
diff --git a/src/covidify/data_prep.py b/src/covidify/data_prep.py
deleted file mode 100644
index 8202b8f..0000000
--- a/src/covidify/data_prep.py
+++ /dev/null
@@ -1,235 +0,0 @@
-"""
-data_prep.py - Extract data from date range and create models
-Usage:
-    data_prep.py [options]
-    data_prep.py -h | --help
-
-Options:
-    -h --help             Show this message.
-    --output_folder=OUT   Output folder for the data and reports to be saved.
-    --source=SRC          Datasource for where the data will be downloaded from.
-    --country=CNT         Arg for filtering by a specific country
-    --top=top             Top number of countries in the log plot
-"""
-from __future__ import print_function
-import os
-import sys
-import docopt
-import numpy as np
-import pandas as pd
-from string import capwords
-from difflib import get_close_matches
-from datetime import datetime, date, time 
-
-from covidify.sources import gitHub_GHU_Builder
-from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA
-from covidify.utils.utils import replace_arg_score
-
-
-args = docopt.docopt(__doc__)
-out = args['--output_folder']
-country = args['--country']
-source = args['--source']
-top = int(args['--top'])
-
-
-############ DATA SELECTION ############
-
-if '_' in country:
-    country = replace_arg_score(country)
-
-if country == 'Global':
-    country = None
-
-if source == 'JHU':
-    df = github.get()
-    
-elif source == 'wiki':
-    print('Apologies, the wikipedia source is not ready yet - getting github data')
-    df = github.get()
-    
-
-
-############ COUNTRY SELECTION ############
-
-def get_similar_countries(c, country_list):
-    pos_countries = get_close_matches(c, country_list)
-    
-    if len(pos_countries) > 0:
-        print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m')
-        
-        #Only delete if its a covidify generated folder
-        if 'Desktop/covidify-output-' in out:
-            os.system('rm -rf ' + out)
-        sys.exit(1)
-    else:
-        print('\033[1;31m'+c, 'was not listed.\033[0;0m')
-        if 'Desktop/covidify-output-' in out:
-            os.system('rm -rf ' + out)
-        sys.exit(1)
-        
-def check_specified_country(df, country):
-    '''
-    let user filter reports by country, if not found
-    then give a option if the string is similar
-    '''
-    
-    # Get all unique countries in the data
-    country_list = list(map(lambda x:x.lower().strip(), set(df.country.values)))
-
-    if country:
-        print('Country specified!')
-        if country.lower() == 'Mainland China': #Mainland china and china doesn't come up as similar
-            print(country, 'was not listed. did you mean China?')
-            sys.exit(1)
-        # give similar option if similarity found
-        if country.lower() not in country_list:
-            get_similar_countries(country, country_list)
-            
-        else:
-            #Return filtered dataframe
-            print('... filtering data for', country)
-            if len(country) == 2:
-                df = df[df.country == country.upper()]
-            else:
-                df = df[df.country == capwords(country)]
-            return df
-    else:
-        print('... No specific country specified')
-        return df
-
-df = check_specified_country(df, country)
-
-############ DAILY CASES ############
-
-# sheets need to be sorted by date value
-# print('Sorting by datetime...')
-df = df.sort_values('datetime')
-
-current_date = str(datetime.date(datetime.now()))
-
-'''
-Get the difference of the sum totals for each
-date and plot them on a trendline graph
-'''
-def get_new_cases(tmp, col):
-    diff_list = []
-    tmp_df_list = []
-    df = tmp.copy()
-
-    for i, day in enumerate(df.sort_values('file_date').file_date.unique()):
-        tmp_df = df[df.file_date == day]
-        tmp_df_list.append(tmp_df[col].sum())
-
-        if i == 0:
-            diff_list.append(tmp_df[col].sum())
-        else:
-            diff_list.append(tmp_df[col].sum() - tmp_df_list[i-1])
-
-    return diff_list
-
-def get_moving_average(tmp, col):
-    df = tmp.copy()
-    return df[col].rolling(window=2).mean()
-
-def get_exp_moving_average(tmp, col):
-    df = tmp.copy()
-    return df[col].ewm(span=2, adjust=True).mean()
-
-
-print('... Calculating dataframe for new cases')
-daily_cases_df = pd.DataFrame([])
-daily_cases_df['date'] = df.file_date.unique()
-daily_cases_df = daily_cases_df.sort_values('date')
-daily_cases_df['new_confirmed_cases'] = get_new_cases(df, 'confirmed')
-daily_cases_df['new_deaths'] = get_new_cases(df, 'deaths')
-daily_cases_df['new_recoveries'] = get_new_cases(df, 'recovered')
-daily_cases_df['cumulative_cases'] = daily_cases_df.new_confirmed_cases.cumsum()
-daily_cases_df.insert(loc=0, column='day', value=np.arange(0, len(daily_cases_df)))
-
-'''
-Calculate the number of people that are ACTUALLY infected on a given day
-currently infected = sum of people date - (recovored + died)
-ex: 5 = 10 - (4 - 1)
-
-'''
-current_infected = pd.DataFrame([])
-current_infected['currently_infected'] = (df.groupby('file_date').confirmed.sum() - (df.groupby('file_date').deaths.sum() + df.groupby('file_date').recovered.sum()))
-current_infected['delta'] = (current_infected['currently_infected'] - df.groupby('file_date').confirmed.sum())
-current_infected.index.rename('date', inplace=True)
-
-daily_cases_df = pd.merge(daily_cases_df, current_infected, how='outer', on='date')
-
-############ LOG DATA ############
-
-print('Calculating data for logarithmic plotting...')
-if not country:
-    print('... top infected countries: {}'.format(top))
-
-def get_top_countries(data):
-    # Get top N infected countries
-    tmp_df = data.copy()
-    tmp_df = tmp_df[tmp_df.file_date == df.file_date.max()]
-    return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index 
-        
-TOP_N_COUNTRIES = get_top_countries(df)    
-
-tmp_df = df[df.country.isin(TOP_N_COUNTRIES)].copy()
-
-def get_day_counts(d, country):
-    '''
-    For each country, get the days of the spread since 500
-    cases
-    '''
-    data = d.copy()
-    result_df = pd.DataFrame([])
-    result_df = data.groupby(['file_date']).agg({'confirmed': 'sum',
-                                                'recovered': 'sum',
-                                                'deaths': 'sum'})
-    result_df['date'] = data['file_date'].unique()
-    result_df['country'] = country
-        
-    result_df = result_df[result_df.confirmed >= 500]
-    result_df.insert(loc=0, column='day', value=np.arange(len(result_df)))
-    return result_df
-
-df_list = []
-
-for country in TOP_N_COUNTRIES:
-    print('   ...', country + ': ' +  str(tmp_df[(tmp_df.file_date == df.file_date.max()) & 
-                                                 (tmp_df.country == country)].confirmed.sum()))
-    df_list.append(get_day_counts(tmp_df[tmp_df.country == country], country))
-    
-log_df = pd.concat(df_list, axis=0, ignore_index=True)
-
-
-############ SAVE DATA ############
-#Create date of extraction folder
-data_folder = os.path.join('data', str(datetime.date(datetime.now())))
-save_dir = os.path.join(out, data_folder)
-
-if not os.path.exists(save_dir):
-    os.system('mkdir -p ' + save_dir)
-
-print('Creating subdirectory for data...')
-print('...', save_dir)
-
-print('Saving...')
-csv_file_name = 'agg_data_{}.csv'.format(datetime.date(datetime.now()))
-df.astype(str).to_csv(os.path.join(save_dir, csv_file_name))
-print('...', csv_file_name)
-
-daily_cases_file_name = 'trend_{}.csv'.format(datetime.date(datetime.now()))
-daily_cases_df.astype(str).to_csv(os.path.join(save_dir, daily_cases_file_name))
-print('...', daily_cases_file_name)
-
-log_file_name = 'log_{}.csv'.format(datetime.date(datetime.now()))
-log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name))
-print('...', log_file_name)
-
-print('Done!')
-
-
-
-Class IBuilder(metaclass = ABCMeta)
-    df = github.get()
\ No newline at end of file