
Commit

Merge pull request #136 from hodcroftlab/usa_page
First Attempt at USA 'Per Country' Page
emmahodcroft authored Apr 23, 2021
2 parents 60a8d1a + 799cd5a commit 6df185c
Showing 24 changed files with 39,997 additions and 16,714 deletions.
1 change: 1 addition & 0 deletions cluster_tables/USAClusters_data.json

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions content/PerAreaIntro.md
@@ -0,0 +1,5 @@
import RegionBreakdown from 'content/RegionBreakdownInfo.md'

Graphs show for each region, the proportion of total number of **sequences** (*not cases*), over time, that fall into defined variant groups. Regions are displayed if they have at least 20 sequences in any variant being tracked. They are ordered by total number of sequences in tracked variants.

<RegionBreakdown/>
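
For illustration, a minimal sketch of how such per-region proportions could be computed from a sequence metadata table. The column names (`region`, `date`, `variant_group`) and the data are hypothetical, and this is not the script the site itself uses:

```python
import pandas as pd

# Hypothetical metadata: one row per sequence with a sampling date, a region,
# and the tracked variant group it falls into (None if not currently tracked).
meta = pd.DataFrame({
    "region": ["North", "North", "South", "South", "South"],
    "date": pd.to_datetime(["2021-03-01", "2021-03-03", "2021-03-02",
                            "2021-03-08", "2021-03-09"]),
    "variant_group": ["20I/501Y.V1", None, "20I/501Y.V1", "20H/501Y.V2", None],
})

# Bin by ISO (year, week) so that day-to-day fluctuations are smoothed out.
iso = meta["date"].dt.isocalendar()
meta["year_week"] = list(zip(iso["year"], iso["week"]))

# Per region and week: counts in each tracked group divided by total sequences.
tracked = (meta.dropna(subset=["variant_group"])
               .groupby(["region", "year_week", "variant_group"]).size()
               .unstack("variant_group", fill_value=0))
totals = meta.groupby(["region", "year_week"]).size()
proportions = tracked.div(totals, axis=0)  # proportions of sequences, not cases
print(proportions)
```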
16 changes: 3 additions & 13 deletions content/PerCountryIntro.md
@@ -1,15 +1,5 @@
Graphs show, for each country, the proportion of the total number of **sequences** (*not cases*), over time, that fall into defined variant groups. Countries are displayed if they have at least 70 sequences in any variant being tracked, and are ordered by the total number of sequences in tracked variants.

The grey colour between the top of the coloured curve and '1' on the Y-axis is composed of variants, lineages, and mutations that we don't currently track.

Since mutations could appear in multiple variants, only variants are plotted below.
import RegionBreakdown from 'content/RegionBreakdownInfo.md'

<!-- Sequences with the 69 deletion or a mutation at <AaMut mut={'S:E484'}/> in Spike are not shown on these plots as they are commonly found in other variants (<Var name="20A/S:439K"/>, <Var name="S:Y453F"/>, and <Var name="S:N501"/> for <AaMut mut={'S:H69-'}/>; <Var name="S:N501"/> for <AaMut mut={'S:E484'}/>), so they would be 'double-plotted'. -->

**Particularly from early 2021** many countries have started preferentially sequencing samples to detect the main variants of concern (see <Var name="20I/501Y.V1" prefix=""/>, <Var name="20H/501Y.V2" prefix=""/>, <Var name="20J/501Y.V3" prefix=""/>). Often this is through sequencing samples that have an 'S-drop-out,' which in particular biases the frequencies of <Var name="20A/S:439K"/> and 501Y.V1 (shown here as a bias in the 3 variants of concern). Alternatively, this can be through preferentially sequencing cases with particular travel histories, or connections to known cases of the variants of concern.

It is worth interpreting with caution:
- The last data point - this often has incomplete data and may change as more sequences come in
- Frequencies that are very 'jagged' - this often indicates low sequencing numbers and so may not be truly representative of the country
- In many countries, sampling may not be equal across the country: samples may only cover one area or certain areas. It's important not to assume frequencies shown are necessarily representative of the country.
Graphs show, for each country, the proportion of the total number of **sequences** (*not cases*), over time, that fall into defined variant groups. Countries are displayed if they have at least 70 sequences in any variant being tracked, and are ordered by the total number of sequences in tracked variants.

<RegionBreakdown/>
11 changes: 11 additions & 0 deletions content/RegionBreakdownInfo.md
@@ -0,0 +1,11 @@
The grey colour between the top of the coloured curve and '1' on the Y-axis is composed of variants, lineages, and mutations that we don't currently track.

Since mutations could appear in multiple variants, only variants are plotted below.

**In early 2021**, many places started preferentially sequencing samples to detect the main variants of concern (see <Var name="20I/501Y.V1" prefix=""/>, <Var name="20H/501Y.V2" prefix=""/>, <Var name="20J/501Y.V3" prefix=""/>). Often this is done by sequencing samples that have an 'S-drop-out', which particularly biases the frequencies of <Var name="20A/S:439K"/> and 501Y.V1 (shown here as a bias in the 3 variants of concern). Alternatively, it can be done by preferentially sequencing cases with particular travel histories or connections to known cases of the variants of concern.

It is worth interpreting with caution:
- The last data point - this often has incomplete data and may change as more sequences come in
- Frequencies that are very 'jagged' - this often indicates low sequencing numbers and so may not be truly representative of the country
- In many places, sampling may not be equal across the region: samples may only cover one area or certain areas. It's important not to assume frequencies shown are necessarily representative.

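For illustration, a minimal sketch of how the grey 'untracked' share corresponds to one minus the sum of the tracked proportions. The `tracked_props` values are made up, and this is separate from the repository's own plotting code:

```python
import pandas as pd

# Made-up per-month proportions of sequences falling into the tracked variants.
tracked_props = pd.DataFrame(
    {"20I/501Y.V1": [0.10, 0.25, 0.40], "20H/501Y.V2": [0.05, 0.10, 0.15]},
    index=pd.PeriodIndex(["2021-02", "2021-03", "2021-04"], freq="M"),
)

# The grey band is everything between the top of the stacked curves and 1:
# the share of sequences in variants, lineages, and mutations not tracked here.
untracked = (1.0 - tracked_props.sum(axis=1)).clip(lower=0.0)
print(untracked)

# The most recent bin is usually still filling in, so it is often safer to
# treat it separately (or drop it) when reading trends.
stable = tracked_props.iloc[:-1]
```
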
Binary file modified figures/EUClusters_compare.pdf
Binary file not shown.
Binary file added figures/USAClusters_compare.pdf
Binary file not shown.
Binary file added overall_trends_figures/USAClusters_compare.png
Binary file modified overall_trends_figures/overall_trends_S.P681.png
Binary file modified overall_trends_figures/overall_trends_S.S477.png
170 changes: 140 additions & 30 deletions scripts/allClusterDynamics_faster.py
@@ -180,6 +180,13 @@ def marker_size(n):
print("These clusters will be run: ", clus_to_run)



division = False
division_answer = input("\nDo division for USA? (y/n) (Enter is no): ")
if division_answer in ["y", "Y", "yes", "YES", "Yes"]:
division = True
selected_country = "USA"

##################################
##################################
#### Read in the starting files
@@ -572,7 +579,6 @@ def marker_size(n):
#### PREPARING FOR PLOTTING



for clus in clus_to_run:
print(f"\nPreparing to plot cluster {clus}\n")

@@ -586,13 +592,25 @@ def marker_size(n):

clus_data['cluster_data'] = []
clus_data['total_data'] = []
clus_data[selected_country] = {}
clus_data[selected_country]['cluster_data_div'] = []
clus_data[selected_country]['total_data_div'] = []

#divide out data for divisions
cluster_meta_div = cluster_meta[cluster_meta['country'].apply(lambda x: x == selected_country)]
observed_divisions = [x for x in cluster_meta_div['division'].unique()]

division_info, division_dates = get_summary(cluster_meta_div, observed_divisions, division=True)
clus_data[selected_country]['division_info'] = division_info
clus_data[selected_country]['division_dates'] = division_dates

# We want to look at % of samples from a country that are in this cluster
# To avoid the up-and-down of dates, bin samples into weeks
countries_to_plot = country_list
acknowledgement_table = []
clus_data['acknowledgements'] = acknowledgement_table

#COUNTRY LEVEL
# Get counts per week for sequences in the cluster
clus_week_counts = {}
clus_2week_counts = {}
@@ -625,18 +643,6 @@ def marker_size(n):
#temp_meta = meta[meta['country'].isin([coun])]
temp_meta = meta[meta['country'].apply(lambda x: x == coun)]

# temp_meta = temp_meta[temp_meta['date'].apply(lambda x: len(x) == 10)]
# temp_meta = temp_meta[temp_meta['date'].apply(lambda x: 'XX' not in x)]
#
# temp_meta['date_formatted'] = temp_meta['date'].apply(lambda x:datetime.datetime.strptime(x, "%Y-%m-%d"))
# #warn of any in the future
# future_meta = temp_meta[temp_meta['date_formatted'].apply(lambda x: x > datetime.date.today())]
# if not future_meta.empty:
# print("WARNING! Data from the future!")
# print(future_meta)
# #get rid of any with dates in the future.....
# temp_meta = temp_meta[temp_meta['date_formatted'].apply(lambda x: x <= datetime.date.today())]

#temp_meta['calendar_week'] = temp_meta['date_formatted'].apply(lambda x: x.isocalendar()[1])
temp_meta['calendar_week'] = temp_meta['date_formatted'].apply(lambda x: (x.isocalendar()[0], x.isocalendar()[1]))
temp_meta = temp_meta[temp_meta['calendar_week']>=min_data_week]
@@ -664,10 +670,62 @@ def marker_size(n):
total_2week_counts[coun] = counts_by_2week


# DIVISION LEVEL
# Get counts per week for sequences in the cluster
clus_week_counts_div = {}
clus_2week_counts_div = {}
for div in observed_divisions:
counts_by_week = defaultdict(int)
counts_by_2week = defaultdict(int)
for dat in division_dates[div]:
#counts_by_week[dat.timetuple().tm_yday//7]+=1 # old method
#counts_by_week[dat.isocalendar()[1]]+=1 #returns ISO calendar week
wk = dat.isocalendar()[1]#returns ISO calendar week
wk2 = (dat.isocalendar()[1]//2*2) #returns ISO calendar week -every 2 weeks
yr = dat.isocalendar()[0]
yr_wk = (yr, wk)
yr_2wk = (yr, wk2)
counts_by_week[yr_wk]+=1
counts_by_2week[yr_2wk]+=1
clus_week_counts_div[div] = counts_by_week
clus_2week_counts_div[div] = counts_by_2week

# Get counts per week for sequences regardless of whether in the cluster or not - from week 20 only.
total_week_counts_div = {}
total_2week_counts_div = {}
for div in observed_divisions:
counts_by_week = defaultdict(int)
counts_by_2week = defaultdict(int)
if div in uk_countries or division:
#temp_meta = meta[meta['division'].isin([coun])]
temp_meta = meta[meta['division'].apply(lambda x: x == div)]
else:
#temp_meta = meta[meta['country'].isin([coun])]
temp_meta = meta[meta['country'].apply(lambda x: x == div)]

#temp_meta['calendar_week'] = temp_meta['date_formatted'].apply(lambda x: x.isocalendar()[1])
temp_meta['calendar_week'] = temp_meta['date_formatted'].apply(lambda x: (x.isocalendar()[0], x.isocalendar()[1]))
temp_meta = temp_meta[temp_meta['calendar_week']>=min_data_week]
#If no samples are after min_data_week, will be empty!!
if temp_meta.empty:
continue

week_counts = temp_meta['calendar_week'].value_counts().sort_index()
counts_by_week = week_counts.to_dict()
total_week_counts_div[div] = counts_by_week

# TWO WEEKS
temp_meta['calendar_2week'] = temp_meta['date_formatted'].apply(lambda x: (x.isocalendar()[0], x.isocalendar()[1]//2*2))
temp_meta = temp_meta[temp_meta['calendar_2week']>=min_data_week]
week2_counts = temp_meta['calendar_2week'].value_counts().sort_index()
counts_by_2week = week2_counts.to_dict()
total_2week_counts_div[div] = counts_by_2week


# if print_acks:
# acknowledgement_table.to_csv(f'{acknowledgement_folder}{clus}_acknowledgement_table.tsv', sep="\t")


#COUNTRY LEVEL
# Convert into dataframe
cluster_data = pd.DataFrame(data=clus_week_counts)
total_data = pd.DataFrame(data=total_week_counts)
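
As a standalone illustration (separate from the diff itself), the `(year, ISO week)` and two-week keys that this binning produces can be seen with a few example dates:

```python
import datetime
from collections import defaultdict

counts_by_week = defaultdict(int)
counts_by_2week = defaultdict(int)

for dat in [datetime.date(2021, 4, 5), datetime.date(2021, 4, 13),
            datetime.date(2021, 4, 20)]:
    iso_year, iso_week, _ = dat.isocalendar()
    yr_wk = (iso_year, iso_week)            # e.g. (2021, 14)
    yr_2wk = (iso_year, iso_week // 2 * 2)  # rounds down to an even week number
    counts_by_week[yr_wk] += 1
    counts_by_2week[yr_2wk] += 1

print(dict(counts_by_week))   # {(2021, 14): 1, (2021, 15): 1, (2021, 16): 1}
print(dict(counts_by_2week))  # {(2021, 14): 2, (2021, 16): 1}
```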
@@ -688,11 +746,32 @@ def marker_size(n):
clus_data['cluster_data_2wk'] = cluster_data_2wk
clus_data['total_data_2wk'] = total_data_2wk

#DIVISION LEVEL
# Convert into dataframe
cluster_data_div = pd.DataFrame(data=clus_week_counts_div)
total_data_div = pd.DataFrame(data=total_week_counts_div)
# sort
total_data_div=total_data_div.sort_index()
cluster_data_div=cluster_data_div.sort_index()
#store
clus_data[selected_country]['cluster_data_div'] = cluster_data_div
clus_data[selected_country]['total_data_div'] = total_data_div

# Convert into dataframe -- 2 weeks
cluster_data_2wk_div = pd.DataFrame(data=clus_2week_counts_div)
total_data_2wk_div = pd.DataFrame(data=total_2week_counts_div)
# sort
total_data_2wk_div=total_data_2wk_div.sort_index()
cluster_data_2wk_div=cluster_data_2wk_div.sort_index()
#store
clus_data[selected_country]['cluster_data_2wk_div'] = cluster_data_2wk_div
clus_data[selected_country]['total_data_2wk_div'] = total_data_2wk_div

######################################################################################################
##################################
#### FIGURE OUT WHAT TO PLOT

cutoff_num_seqs = 100
cutoff_num_seqs = 130


clusters_tww = []
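
For orientation, a sketch of the nesting these additions give each cluster's `clus_data` entry; the string values are placeholders standing in for the DataFrames and summaries built above:

```python
# Rough shape of one cluster's entry after the division-level additions;
# the string values stand in for the DataFrames and summaries built above.
clus_data = {
    "cluster_data_2wk": "DataFrame: countries x (year, 2-week) bins",
    "total_data_2wk": "DataFrame: countries x (year, 2-week) bins",
    "acknowledgements": [],
    "USA": {  # selected_country
        "division_info": "per-division summary from get_summary(...)",
        "division_dates": "dict: division -> list of sample dates",
        "cluster_data_2wk_div": "DataFrame: divisions x (year, 2-week) bins",
        "total_data_2wk_div": "DataFrame: divisions x (year, 2-week) bins",
    },
}
```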
@@ -931,35 +1010,50 @@ def marker_size(n):
if do_country == False:
print("You can alway run this step by calling `plot_country_data(clusters, proposed_coun_to_plot, print_files)`")

def get_ordered_clusters_to_plot(clusters):
def get_ordered_clusters_to_plot(clusters, division=False, selected_country=None):
#fix cluster order in a list so it's reliable
clus_keys = [x for x in clusters.keys()] #if x in clusters_tww]

# DO NOT PLOT 69 AS IT OVERLAPS WITH 439 AND 501!!!!
# Do not plot 484 as it overlaps with 501Y.V2, possibly others
# Do not plot DanishCluster as also overlaps
#clus_keys = [x for x in clus_keys if x not in ["S69","S484", "DanishCluster"]]
clus_keys = [x for x in clus_keys if clusters[x]["graphing"] is True]
if division:
clus_keys = [x for x in clus_keys if clusters[x]["type"] == "variant" or ("usa_graph" in clusters[x] and clusters[x]["usa_graph"] is True)]
cluster_data_key = "cluster_data_2wk_div"
min_to_plot = 20
else:
clus_keys = [x for x in clus_keys if clusters[x]["graphing"] is True]
cluster_data_key = "cluster_data_2wk"
min_to_plot = 70

countries_all = defaultdict(dict)
for clus in clus_keys:
clus_dat = clusters[clus]["cluster_data_2wk"]
if division:
clus_dat = clusters[clus][selected_country][cluster_data_key]
else:
clus_dat = clusters[clus][cluster_data_key]
for coun in clus_dat.columns:
countries_all[coun][clus] = clus_dat[coun]

# how to decide what to plot?
min_to_plot = 70
proposed_coun_to_plot = []
for clus in clus_keys:
country_inf = clusters[clus]["country_info_df"]
if division:
country_inf = clusters[clus][selected_country]["division_info"]
else:
country_inf = clusters[clus]["country_info_df"]
proposed_coun_to_plot.extend(country_inf[country_inf.num_seqs > min_to_plot].index)
proposed_coun_to_plot = set(proposed_coun_to_plot)
print(f"At min plot {min_to_plot}, there are {len(proposed_coun_to_plot)} countries")
print(f"At min plot {min_to_plot}, there are {len(proposed_coun_to_plot)} entries")

total_coun_counts = {}
#decide order
for clus in clus_keys:
country_inf = clusters[clus]["country_info_df"]
if division:
country_inf = clusters[clus][selected_country]["division_info"]
else:
country_inf = clusters[clus]["country_info_df"]
for coun in proposed_coun_to_plot:
if coun not in total_coun_counts:
total_coun_counts[coun] = 0
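
For illustration, a condensed sketch of the selection-and-ordering rule this function applies, using made-up counts; the thresholds mirror the `min_to_plot` values above (70 per country, 20 per US division):

```python
# Made-up per-cluster sequence counts per country (or per US division).
num_seqs = {
    "20I/501Y.V1": {"Texas": 250, "Vermont": 15, "Ohio": 60},
    "20H/501Y.V2": {"Texas": 40, "Vermont": 8, "Ohio": 30},
}
min_to_plot = 20  # 70 at the country level, 20 at the US-division level

# Keep an entry if any single tracked cluster exceeds the threshold there...
proposed = {place for counts in num_seqs.values()
            for place, n in counts.items() if n > min_to_plot}

# ...then order by the total number of sequences across tracked clusters.
totals = {place: sum(counts.get(place, 0) for counts in num_seqs.values())
          for place in proposed}
ordered = sorted(totals, key=totals.get, reverse=True)
print(ordered)  # ['Texas', 'Ohio']
```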
@@ -971,7 +1065,7 @@ def get_ordered_clusters_to_plot(clusters):

return proposed_coun_to_plot, clus_keys

def plot_country_data(clusters, proposed_coun_to_plot, print_files, clus_keys):
def plot_country_data(clusters, proposed_coun_to_plot, print_files, clus_keys, file_prefix, division=False, selected_country=None):
country_week = {clus: {} for clus in clusters}

fs = 14
@@ -993,9 +1087,15 @@ def plot_country_data(clusters, proposed_coun_to_plot, print_files, clus_keys):
json_output['countries'][coun] = {'week': {}, 'total_sequences': {} }

for clus in clus_keys:
cluster_data = clusters[clus]['cluster_data_2wk']
if division:
cluster_data = clusters[clus][selected_country]['cluster_data_2wk_div']
else:
cluster_data = clusters[clus]['cluster_data_2wk']
cluster_data=cluster_data.sort_index()
total_data = clusters[clus]['total_data_2wk']
if division:
total_data = clusters[clus][selected_country]['total_data_2wk_div']
else:
total_data = clusters[clus]['total_data_2wk']
if coun not in cluster_data:
if clus == clus_keys[-1]:
ax.fill_between(week_as_date, cluster_count/total_count, 1, facecolor=grey_color)
@@ -1061,20 +1161,30 @@ def plot_country_data(clusters, proposed_coun_to_plot, print_files, clus_keys):

if print_files:
fmt = "pdf"
with open(tables_path+f'EUClusters_data.json', 'w') as fh:
with open(tables_path+f'{file_prefix}_data.json', 'w') as fh:
json.dump(json_output, fh)

plt.savefig(figure_path+f"EUClusters_compare.png")
plt.savefig(figure_path+f"{file_prefix}_compare.png")

plt.savefig(figure_only_path+f"EUClusters_compare.{fmt}")
trends_path = figure_only_path+f"EUClusters_compare.{fmt}"
plt.savefig(figure_only_path+f"{file_prefix}_compare.{fmt}")
trends_path = figure_only_path+f"{file_prefix}_compare.{fmt}"
copypath = trends_path.replace("compare", "compare-{}".format(datetime.date.today().strftime("%Y-%m-%d")))
copyfile(trends_path, copypath)


if do_country:
proposed_coun_to_plot, clus_keys = get_ordered_clusters_to_plot(clusters)
plot_country_data(clusters, proposed_coun_to_plot, print_files, clus_keys)
plot_country_data(clusters, proposed_coun_to_plot, print_files, clus_keys, "EUClusters")

do_usa_country = False
if "all" in clus_answer:
print_answer = input("\nContinue to USA-specific country plotting? (y/n) (Enter is no): ")
if print_answer in ["y", "Y", "yes", "YES", "Yes"]:
do_usa_country = True

if do_usa_country:
proposed_coun_to_plot, clus_keys = get_ordered_clusters_to_plot(clusters, True, "USA")
plot_country_data(clusters, proposed_coun_to_plot, print_files, clus_keys, "USAClusters", True, "USA")

#if all went well (script got to this point), and did an 'all' run, then print out an update!
#from datetime import datetime
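
For illustration, a small sketch of how the new `file_prefix` argument shapes the output file names. The directory defaults here are placeholders for the script's own `tables_path`, `figure_path`, and `figure_only_path` settings:

```python
import datetime

def output_paths(file_prefix, tables_path="cluster_tables/",
                 figure_path="figures/", fmt="pdf"):
    """Show how the prefix is spliced into the output file names."""
    trends = f"{figure_path}{file_prefix}_compare.{fmt}"
    dated = trends.replace(
        "compare",
        "compare-{}".format(datetime.date.today().strftime("%Y-%m-%d")))
    return {
        "json": f"{tables_path}{file_prefix}_data.json",
        "png": f"{figure_path}{file_prefix}_compare.png",
        "pdf": trends,
        "dated_copy": dated,
    }

print(output_paths("EUClusters"))   # the existing Europe-wide outputs
print(output_paths("USAClusters"))  # the new per-US-division outputs
```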