Skip to content

Commit

Permalink
Ruff format
Browse files Browse the repository at this point in the history
  • Loading branch information
FannyPicou committed Jan 18, 2025
1 parent a3f9b55 commit 0796ac3
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 57 deletions.
9 changes: 8 additions & 1 deletion .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ dependencies = [
]

authors = [
{ name="Fanny Picourlat", email="fanny.picourlat@gmail.com" },
{ name = "Fanny Picourlat", email = "fanny.picourlat@gmail.com" },
]
description = "Cleaner and filler of groundwater level time series, using correlated time series from neighboring wells"
readme = "README.md"
Expand All @@ -24,8 +24,8 @@ classifiers = [
]

[project.optional-dependencies]
dev = ["pytest","ruff","black","mkdocs","mkdocstrings[python]", "mkdocs-material"]
test= ["pytest"]
dev = ["pytest", "ruff", "black", "mkdocs", "mkdocstrings[python]", "mkdocs-material"]
test = ["pytest"]

[project.urls]
"Homepage" = "https://github.com/FannyPicou/wt_ts_filler"
Expand Down
41 changes: 7 additions & 34 deletions scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
import pandas as pd

# Import dataframe
dataframe = pd.read_csv('C:/Users/picourlat/Documents/040724_Data_recap/DATA/Hydrologic_data/Groundwater_lvls/Analyse_data_drought/Data/wt_ts.csv')
dataframe.iloc[:,0] = pd.to_datetime(dataframe.iloc[:,0], format='%Y-%m-%d') # set dates in datetime format
dataframe = pd.read_csv(
'C:/Users/picourlat/Documents/040724_Data_recap/DATA/Hydrologic_data/Groundwater_lvls/Analyse_data_drought/Data/wt_ts.csv')
dataframe.iloc[:, 0] = pd.to_datetime(dataframe.iloc[:, 0], format='%Y-%m-%d') # set dates in datetime format

# Split the dataframe into data series
data_series = []
for i in range(1, len(dataframe.columns)):
data = pd.Series(dataframe.iloc[:,i].values, index=dataframe.iloc[:,0], name="data"+str(i))
data = pd.Series(dataframe.iloc[:, i].values, index=dataframe.iloc[:, 0], name="data" + str(i))
data_series.append(data)

# Clean
Expand All @@ -20,45 +21,17 @@
FlatPeriodCleaner(flat_period=10)
]

for data in data_series :
for data in data_series:
data_original = data.copy()
for cleaner in cleaners:
data = cleaner.clean(data)
# plot_timeseries(data_original, data)
cleaned_dataframe = pd.concat(data_series, axis=1)

# Fill gaps
estimated_dataframe = GapsFiller(max_gap_lin_interp=5,Corr_min=0.75).fill(cleaned_dataframe)
estimated_dataframe = GapsFiller(max_gap_lin_interp=5, Corr_min=0.75).fill(cleaned_dataframe)
estimated_dataframe.columns = dataframe.columns[1:]
plot_dataframes(cleaned_dataframe,estimated_dataframe)




























plot_dataframes(cleaned_dataframe, estimated_dataframe)

# from scipy import stats
# corr_matrix = cleaned_dataframe.corr()
Expand Down
22 changes: 12 additions & 10 deletions tsfiller/Filling_gaps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from scipy import stats
import numpy as np


class GapsFiller:
def __init__(self, max_gap_lin_interp, Corr_min):
self.max_gap_lin_interp = max_gap_lin_interp
Expand All @@ -16,13 +17,16 @@ def fill(self, dataframe):
estimated_df_interpolated = estimated_dataframe.interpolate()
for c in estimated_dataframe:
mask = estimated_dataframe[c].isna()
x = (mask.groupby((mask != mask.shift()).cumsum()).transform(lambda x: len(x) > self.max_gap_lin_interp) * mask)
x = (mask.groupby((mask != mask.shift()).cumsum()).transform(
lambda x: len(x) > self.max_gap_lin_interp) * mask)
estimated_df_interpolated[c] = estimated_df_interpolated.loc[~x, c]
estimated_dataframe = estimated_df_interpolated

# Step 2 : Search the more correlated and apply linear regression + compute epsilon left
# print("Apply linear regression and compute epsilon left")
print("Estimation of missing data from a data set with a correlation coefficient greater than or equal to " + str(self.Corr_min)+ '.')
print(
"Estimation of missing data from a data set with a correlation coefficient greater than or equal to " + str(
self.Corr_min) + '.')
estimated_dataframe_w_lin_reg = estimated_dataframe.copy()
df_epsilon = estimated_dataframe.copy()
correlation_df = estimated_dataframe.copy()
Expand All @@ -43,7 +47,7 @@ def fill(self, dataframe):
n = 0
while np.isnan(dataframe.iloc[j - 1, int(col_max_corr[4:]) - 1]) and np.isnan(
dataframe.iloc[j, int(col_max_corr[
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
col_corr_matrix = col_corr_matrix.drop(
labels=[col_max_corr]) # on supprime la ligne de la colonne de correlation
col_max_corr = col_corr_matrix.idxmax() # on recherche le dataset le plus corrélé
Expand Down Expand Up @@ -121,7 +125,7 @@ def fill(self, dataframe):
n = 0
while np.isnan(dataframe.iloc[j - 1, int(col_max_corr[4:]) - 1]) and np.isnan(
dataframe.iloc[j, int(col_max_corr[
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
col_corr_matrix = col_corr_matrix.drop(
labels=[col_max_corr]) # on supprime la ligne de la colonne de correlation
col_max_corr = col_corr_matrix.idxmax() # on recherche le dataset le plus corrélé
Expand Down Expand Up @@ -156,7 +160,7 @@ def fill(self, dataframe):
n = 0
while np.isnan(dataframe.iloc[j + 1, int(col_max_corr[4:]) - 1]) and np.isnan(
dataframe.iloc[j, int(col_max_corr[
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
4:]) - 1]) and n < Nb_datasets_corr - 1: # tant que la valeur aux temps j-1 et j du dataset le + corrélé est nan, et que n<31
col_corr_matrix = col_corr_matrix.drop(
labels=[col_max_corr]) # on supprime la ligne de la colonne de correlation
col_max_corr = col_corr_matrix.idxmax() # on recherche le dataset le plus corrélé
Expand All @@ -182,13 +186,13 @@ def fill(self, dataframe):
k = j
while np.isnan(estimated_dataframe.iloc[k, i]) and ~np.isnan(
df_predict_lengths.iloc[k, i]) and ~np.isnan(df_epsilon.iloc[k, i]) and ~np.isnan(
estimated_dataframe_w_lin_reg.iloc[k, i]):
estimated_dataframe_w_lin_reg.iloc[k, i]):
L = int(df_predict_lengths.iloc[k, i])
epsilon_left = df_epsilon.iloc[j, i]
if ~np.isnan(df_epsilon.iloc[j + L - 1, i]):
epsilon_right = df_epsilon.iloc[j + L - 1, i]
estimated_dataframe.iloc[j:j + L, i] = [estimated_dataframe_w_lin_reg.iloc[j + l, i] + (
l * epsilon_right + (L - l) * epsilon_left) / L for l in range(L)]
l * epsilon_right + (L - l) * epsilon_left) / L for l in range(L)]
else:
estimated_dataframe.iloc[j:j + L, i] = [
estimated_dataframe_w_lin_reg.iloc[j + l, i] + epsilon_left for l in range(L)]
Expand All @@ -197,8 +201,6 @@ def fill(self, dataframe):

return estimated_dataframe



# # Other technique : apply same variation
#
# estimated_dataframe = cleaned_dataframe.copy()
Expand Down Expand Up @@ -292,4 +294,4 @@ def fill(self, dataframe):
# L = int(df_predict_lengths.iloc[j, i])
# print([(estimated_dataframe_backward.iloc[j, i]*l + estimated_dataframe_forward.iloc[j, i]+(L-l))/L for l in range(L)])
# print(estimated_dataframe.iloc[j:j+L, i])
# estimated_dataframe.iloc[j:j+L, i] = [(estimated_dataframe_backward.iloc[j+l, i]*l + estimated_dataframe_forward.iloc[j+l, i]*(L-l))/L for l in range(int(L))]
# estimated_dataframe.iloc[j:j+L, i] = [(estimated_dataframe_backward.iloc[j+l, i]*l + estimated_dataframe_forward.iloc[j+l, i]*(L-l))/L for l in range(int(L))]
8 changes: 5 additions & 3 deletions tsfiller/cleaning.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np


class SpikeCleaner:
def __init__(self, max_jump):
self.max_jump = max_jump
Expand All @@ -23,6 +24,7 @@ def clean(self, data):
data[t] = np.nan
return data


class FlatPeriodCleaner:
def __init__(self, flat_period):
self.flat_period = flat_period
Expand All @@ -36,11 +38,11 @@ def clean(self, data):
i = 0
while i < len(data) - self.flat_period:
count = 0
while data[i+count+1] == data[i+count] :
while data[i + count + 1] == data[i + count]:
count += 1
if count >= self.flat_period :
if count >= self.flat_period:
data[i: i + count + 1] = np.nan
i = i + 1 + count
else:
i += 1
return data
return data
12 changes: 6 additions & 6 deletions tsfiller/plotting.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
import matplotlib.pyplot as plt


def plot_timeseries(data_original, data):
    """Plot a cleaned time series on top of its original to visualize removed points.

    Parameters
    ----------
    data_original : pandas.Series
        Series before cleaning. It is drawn first (red), so only the points
        removed by cleaning remain visible underneath.
    data : pandas.Series
        Series after cleaning (green); its ``name`` becomes the plot title.

    Notes
    -----
    Diff-merge artifact fixed here: each ``plt.plot`` call appeared twice
    (old and ruff-formatted variants), plotting every series twice and
    duplicating legend entries. Each call is now issued once.
    """
    plt.figure(figsize=(15, 5))
    # Original first, cleaned second: retained (green) points overplot
    # the red ones, leaving only deleted points visible in red.
    plt.plot(data_original, ".", color="red", label='Deleted data')
    plt.plot(data, ".", color="green", label='Retained data')
    plt.legend()
    plt.title(f"{data.name}")
    plt.ylabel('Groundwater level (mNGF)')
    plt.show()


def plot_dataframes(cleaned_dataframe,estimated_dataframe):
def plot_dataframes(cleaned_dataframe, estimated_dataframe):
for i in range(len(estimated_dataframe.columns)):
plt.figure(figsize=(15, 5))
plt.plot(estimated_dataframe.index, estimated_dataframe.iloc[:, i], lw=0, marker='.', color='orchid',label='Missing data estimation')
plt.plot(estimated_dataframe.index, estimated_dataframe.iloc[:, i], lw=0, marker='.', color='orchid',
label='Missing data estimation')
plt.plot(cleaned_dataframe.index, cleaned_dataframe.iloc[:, i], lw=0, marker='.',
label='Measurement', color='darkblue')
plt.ylabel('Groundwater level (mNGF)')
Expand All @@ -22,5 +24,3 @@ def plot_dataframes(cleaned_dataframe,estimated_dataframe):
plt.grid(True)
plt.xticks(rotation=45)
plt.show()


0 comments on commit 0796ac3

Please sign in to comment.