Skip to content

Commit

Permalink
Ruff format
Browse files Browse the repository at this point in the history
  • Loading branch information
FannyPicou committed Jan 18, 2025
1 parent a3f9b55 commit 0796ac3
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 57 deletions.
9 changes: 8 additions & 1 deletion .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ dependencies = [
]

authors = [
{ name="Fanny Picourlat", email="fanny.picourlat@gmail.com" },
{ name = "Fanny Picourlat", email = "fanny.picourlat@gmail.com" },
]
description = "Cleaner and filler of groundwater level time series, using correlated time series from neighboring wells"
readme = "README.md"
Expand All @@ -24,8 +24,8 @@ classifiers = [
]

[project.optional-dependencies]
dev = ["pytest","ruff","black","mkdocs","mkdocstrings[python]", "mkdocs-material"]
test= ["pytest"]
dev = ["pytest", "ruff", "black", "mkdocs", "mkdocstrings[python]", "mkdocs-material"]
test = ["pytest"]

[project.urls]
"Homepage" = "https://github.com/FannyPicou/wt_ts_filler"
Expand Down
41 changes: 7 additions & 34 deletions scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
import pandas as pd

# Import dataframe
dataframe = pd.read_csv('C:/Users/picourlat/Documents/040724_Data_recap/DATA/Hydrologic_data/Groundwater_lvls/Analyse_data_drought/Data/wt_ts.csv')
dataframe.iloc[:,0] = pd.to_datetime(dataframe.iloc[:,0], format='%Y-%m-%d') # set dates in datetime format
dataframe = pd.read_csv(
'C:/Users/picourlat/Documents/040724_Data_recap/DATA/Hydrologic_data/Groundwater_lvls/Analyse_data_drought/Data/wt_ts.csv')
dataframe.iloc[:, 0] = pd.to_datetime(dataframe.iloc[:, 0], format='%Y-%m-%d') # set dates in datetime format

# Split the dataframe into data series
data_series = []
for i in range(1, len(dataframe.columns)):
data = pd.Series(dataframe.iloc[:,i].values, index=dataframe.iloc[:,0], name="data"+str(i))
data = pd.Series(dataframe.iloc[:, i].values, index=dataframe.iloc[:, 0], name="data" + str(i))
data_series.append(data)

# Clean
Expand All @@ -20,45 +21,17 @@
FlatPeriodCleaner(flat_period=10)
]

for data in data_series :
for data in data_series:
data_original = data.copy()
for cleaner in cleaners:
data = cleaner.clean(data)
# plot_timeseries(data_original, data)
cleaned_dataframe = pd.concat(data_series, axis=1)

# Fill gaps
estimated_dataframe = GapsFiller(max_gap_lin_interp=5,Corr_min=0.75).fill(cleaned_dataframe)
estimated_dataframe = GapsFiller(max_gap_lin_interp=5, Corr_min=0.75).fill(cleaned_dataframe)
estimated_dataframe.columns = dataframe.columns[1:]
plot_dataframes(cleaned_dataframe,estimated_dataframe)




























plot_dataframes(cleaned_dataframe, estimated_dataframe)

# from scipy import stats
# corr_matrix = cleaned_dataframe.corr()
Expand Down
22 changes: 12 additions & 10 deletions tsfiller/Filling_gaps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from scipy import stats
import numpy as np


class GapsFiller:
def __init__(self, max_gap_lin_interp, Corr_min):
self.max_gap_lin_interp = max_gap_lin_interp
Expand All @@ -16,13 +17,16 @@ def fill(self, dataframe):
estimated_df_interpolated = estimated_dataframe.interpolate()
for c in estimated_dataframe:
mask = estimated_dataframe[c].isna()
x = (mask.groupby((mask != mask.shift()).cumsum()).transform(lambda x: len(x) > self.max_gap_lin_interp) * mask)
x = (mask.groupby((mask != mask.shift()).cumsum()).transform(
lambda x: len(x) > self.max_gap_lin_interp) * mask)
estimated_df_interpolated[c] = estimated_df_interpolated.loc[~x, c]
estimated_dataframe = estimated_df_interpolated

# Step 2 : Search the more correlated and apply linear regression + compute epsilon left
# print("Apply linear regression and compute epsilon left")
print("Estimation of missing data from a data set with a correlation coefficient greater than or equal to " + str(self.Corr_min)+ '.')
print(
"Estimation of missing data from a data set with a correlation coefficient greater than or equal to " + str(
self.Corr_min) + '.')
estimated_dataframe_w_lin_reg = estimated_dataframe.copy()
df_epsilon = estimated_dataframe.copy()
correlation_df = estimated_dataframe.copy()
Expand All @@ -43,7 +47,7 @@ def fill(self, dataframe):
n = 0
while np.isnan(dataframe.iloc[j - 1, int(col_max_corr[4:]) - 1]) and np.isnan(
dataframe.iloc[j, int(col_max_corr[
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
col_corr_matrix = col_corr_matrix.drop(
labels=[col_max_corr]) # on supprime la ligne de la colonne de correlation
col_max_corr = col_corr_matrix.idxmax() # on recherche le dataset le plus corrélé
Expand Down Expand Up @@ -121,7 +125,7 @@ def fill(self, dataframe):
n = 0
while np.isnan(dataframe.iloc[j - 1, int(col_max_corr[4:]) - 1]) and np.isnan(
dataframe.iloc[j, int(col_max_corr[
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
col_corr_matrix = col_corr_matrix.drop(
labels=[col_max_corr]) # on supprime la ligne de la colonne de correlation
col_max_corr = col_corr_matrix.idxmax() # on recherche le dataset le plus corrélé
Expand Down Expand Up @@ -156,7 +160,7 @@ def fill(self, dataframe):
n = 0
while np.isnan(dataframe.iloc[j + 1, int(col_max_corr[4:]) - 1]) and np.isnan(
dataframe.iloc[j, int(col_max_corr[
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
col_corr_matrix = col_corr_matrix.drop(
labels=[col_max_corr]) # on supprime la ligne de la colonne de correlation
col_max_corr = col_corr_matrix.idxmax() # on recherche le dataset le plus corrélé
Expand All @@ -182,13 +186,13 @@ def fill(self, dataframe):
k = j
while np.isnan(estimated_dataframe.iloc[k, i]) and ~np.isnan(
df_predict_lengths.iloc[k, i]) and ~np.isnan(df_epsilon.iloc[k, i]) and ~np.isnan(
estimated_dataframe_w_lin_reg.iloc[k, i]):
estimated_dataframe_w_lin_reg.iloc[k, i]):
L = int(df_predict_lengths.iloc[k, i])
epsilon_left = df_epsilon.iloc[j, i]
if ~np.isnan(df_epsilon.iloc[j + L - 1, i]):
epsilon_right = df_epsilon.iloc[j + L - 1, i]
estimated_dataframe.iloc[j:j + L, i] = [estimated_dataframe_w_lin_reg.iloc[j + l, i] + (
l * epsilon_right + (L - l) * epsilon_left) / L for l in range(L)]
l * epsilon_right + (L - l) * epsilon_left) / L for l in range(L)]
else:
estimated_dataframe.iloc[j:j + L, i] = [
estimated_dataframe_w_lin_reg.iloc[j + l, i] + epsilon_left for l in range(L)]
Expand All @@ -197,8 +201,6 @@ def fill(self, dataframe):

return estimated_dataframe



# # Other technique : apply same variation
#
# estimated_dataframe = cleaned_dataframe.copy()
Expand Down Expand Up @@ -292,4 +294,4 @@ def fill(self, dataframe):
# L = int(df_predict_lengths.iloc[j, i])
# print([(estimated_dataframe_backward.iloc[j, i]*l + estimated_dataframe_forward.iloc[j, i]+(L-l))/L for l in range(L)])
# print(estimated_dataframe.iloc[j:j+L, i])
# estimated_dataframe.iloc[j:j+L, i] = [(estimated_dataframe_backward.iloc[j+l, i]*l + estimated_dataframe_forward.iloc[j+l, i]*(L-l))/L for l in range(int(L))]
# estimated_dataframe.iloc[j:j+L, i] = [(estimated_dataframe_backward.iloc[j+l, i]*l + estimated_dataframe_forward.iloc[j+l, i]*(L-l))/L for l in range(int(L))]
8 changes: 5 additions & 3 deletions tsfiller/cleaning.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np


class SpikeCleaner:
def __init__(self, max_jump):
self.max_jump = max_jump
Expand All @@ -23,6 +24,7 @@ def clean(self, data):
data[t] = np.nan
return data


class FlatPeriodCleaner:
def __init__(self, flat_period):
self.flat_period = flat_period
Expand All @@ -36,11 +38,11 @@ def clean(self, data):
i = 0
while i < len(data) - self.flat_period:
count = 0
while data[i+count+1] == data[i+count] :
while data[i + count + 1] == data[i + count]:
count += 1
if count >= self.flat_period :
if count >= self.flat_period:
data[i: i + count + 1] = np.nan
i = i + 1 + count
else:
i += 1
return data
return data
12 changes: 6 additions & 6 deletions tsfiller/plotting.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
import matplotlib.pyplot as plt


def plot_timeseries(data_original, data):
    """Plot a cleaned time series on top of its original to visualize removed points.

    Parameters
    ----------
    data_original : pandas.Series
        Series before cleaning. It is drawn first (red), so only the points
        removed by cleaning remain visible underneath.
    data : pandas.Series
        Series after cleaning (green); its ``name`` becomes the plot title.

    Notes
    -----
    Diff-merge artifact fixed here: each ``plt.plot`` call appeared twice
    (old and ruff-formatted variants), plotting every series twice and
    duplicating legend entries. Each call is now issued once.
    """
    plt.figure(figsize=(15, 5))
    # Original first, cleaned second: retained (green) points overplot
    # the red ones, leaving only deleted points visible in red.
    plt.plot(data_original, ".", color="red", label='Deleted data')
    plt.plot(data, ".", color="green", label='Retained data')
    plt.legend()
    plt.title(f"{data.name}")
    plt.ylabel('Groundwater level (mNGF)')
    plt.show()


def plot_dataframes(cleaned_dataframe,estimated_dataframe):
def plot_dataframes(cleaned_dataframe, estimated_dataframe):
for i in range(len(estimated_dataframe.columns)):
plt.figure(figsize=(15, 5))
plt.plot(estimated_dataframe.index, estimated_dataframe.iloc[:, i], lw=0, marker='.', color='orchid',label='Missing data estimation')
plt.plot(estimated_dataframe.index, estimated_dataframe.iloc[:, i], lw=0, marker='.', color='orchid',
label='Missing data estimation')
plt.plot(cleaned_dataframe.index, cleaned_dataframe.iloc[:, i], lw=0, marker='.',
label='Measurement', color='darkblue')
plt.ylabel('Groundwater level (mNGF)')
Expand All @@ -22,5 +24,3 @@ def plot_dataframes(cleaned_dataframe,estimated_dataframe):
plt.grid(True)
plt.xticks(rotation=45)
plt.show()


0 comments on commit 0796ac3

Please sign in to comment.