###########################################################################################
#Copyright 2023 Google LLC
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
###########################################################################################
##### pandas_dq is a simple transformer library for checking and cleaning pandas dataframes. ######
###########################################################################################
# This library has two major modules.
###########################################################################################
# The first module dq_report finds all the problems:
# It detects missing values and suggests imputing them with the mean, median, mode, or a constant value.
# It identifies rare categories and suggests grouping them into a single category or dropping them.
# It finds infinite values and suggests replacing them with NaN or a large value.
# It detects mixed data types and suggests converting them to a single type or splitting them into multiple columns.
# It detects outliers and suggests removing them or using robust statistics.
# It detects high cardinality features and suggests reducing them using encoding techniques or feature selection methods.
# It detects highly correlated features and suggests dropping one of them or using dimensionality reduction techniques.
# It detects duplicate rows and columns and suggests dropping them or keeping only one copy.
# It detects skewed distributions and suggests applying transformations or scaling techniques.
# It detects imbalanced classes and suggests using resampling techniques or class weights.
# It detects feature leakage and suggests avoiding features that are not available at prediction time.
############################################################################################
# The second module, Fix_DQ, fixes all the data quality problems that dq_report finds.
############################################################################################
####### This Transformer was inspired by ChatGPT's and Bard's answers when I was searching for
####### a quick and dirty data cleaning library. Since they couldn't find me any good ones,
####### I decided to create a simple, quick and dirty data cleaning library using ChatGPT and Bard.
####### I dedicate this library to the thousands of researchers who worked to create LLMs.
############################################################################################
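############################################################################################
# Illustrative usage of the reporting module (a minimal sketch; "train.csv" and "target" are
# placeholder names, and the import path assumes this file is installed as the pandas_dq package):
#     from pandas_dq import dq_report
#     report = dq_report("train.csv", target="target", verbose=1)
# A usage sketch for the Fix_DQ transformer appears below its __init__ method further down.
############################################################################################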
# Define a function to print data cleaning suggestions
# Import pandas and numpy libraries
import pandas as pd
import numpy as np
import copy
import os
import sys
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('colheader_justify', 'center')
# Define a function to print a data quality report and suggestions to clean data
def dq_report(data, target=None, html=False, csv_engine="pandas", verbose=0):
"""
This is a data quality reporting tool that accepts either a filename (CSV, parquet, feather or arrow)
or a pandas dataframe as input and returns a report highlighting potential data quality issues in it.
The function performs the following data quality checks. More will be added periodically.
# It detects missing values and suggests imputing them with the mean, median, mode, or a constant value.
# It identifies rare categories and suggests grouping them into a single category or dropping them.
# It finds infinite values and suggests replacing them with NaN or a large value.
# It detects mixed data types and suggests converting them to a single type or splitting them into multiple columns.
# It detects outliers and suggests removing them or using robust statistics.
# It detects high cardinality features and suggests reducing them using encoding techniques or feature selection methods.
# It detects highly correlated features and suggests dropping one of them or using dimensionality reduction techniques.
# It detects duplicate rows and columns and suggests dropping them or keeping only one copy.
# It detects skewed distributions and suggests applying transformations or scaling techniques.
# It detects imbalanced classes and suggests using resampling techniques or class weights.
# It detects feature leakage and suggests avoiding features that are not available at prediction time.
Finally, the function identifies whether the dataset is a classification problem or
a regression problem and checks if there is class imbalance or target leakage in the dataset.
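Example (illustrative only; "train.csv" and "target" below are placeholder names):
dq_report("train.csv", target="target", html=False, csv_engine="pandas", verbose=1)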
"""
correlation_threshold = 0.8 # Define a threshold for high correlation between variables
leakage_threshold = 0.8 # Define a threshold for feature leakage
if not verbose:
print("This is a summary report. Change verbose to 1 to see more details on each DQ issue.")
#### Sometimes the target is given as an empty string; change it to None
if isinstance(target, str):
if target == '':
target = None
# Check if the input is a string or a dataframe
if isinstance(data, str):
# Get the file extension
ext = os.path.splitext(data)[-1]
# Load the file into a pandas dataframe based on the extension
if ext == ".csv":
print(" If this is a large dataset, we will randomly sample 100K rows to speed up processing...")
if csv_engine == 'pandas':
# Upload the data file into Pandas
df = pd.read_csv(data)
elif csv_engine == 'polars':
# Upload the data file into Polars
import polars as pl
df = pl.read_csv(data)
elif csv_engine == 'parquet':
# Read the CSV file with the pyarrow CSV reader and convert it to pandas
# (pyarrow has no top-level read_table function for CSV files)
from pyarrow import csv as pa_csv
df = pa_csv.read_csv(data).to_pandas()
else :
# print the pandas version
if str(pd.__version__)[0] == '2':
print(f" pandas version={pd.__version__}. Hence using pyarrow backend.")
df = pd.read_csv(data, engine='pyarrow', dtype_backend='pyarrow')
else:
print(f" pandas version={pd.__version__}. Hence using pandas backend.")
df = pd.read_csv(data)
elif ext == ".parquet":
df = pd.read_parquet(data)
elif ext in [".feather", ".arrow", ".ftr"]:
df = pd.read_feather(data)
else:
print(" Unsupported file format. Please use CSV, parquet, feather or arrow.")
return data
######## This is to sample the data if it is too large ###
if df.shape[0] >= 1000000:
df = df.sample(100000)
elif isinstance(data, pd.DataFrame):
df = copy.deepcopy(data)
else:
print(" Unrecognized input. Please provide a filename or a pandas dataframe. Returning...")
return data
# Drop duplicate rows
dup_rows = df.duplicated().sum()
if dup_rows > 0:
print(f'There are {dup_rows} duplicate rows in your dataset')
print(f' Alert: Dropping duplicate rows can sometimes cause your column data types to change to object!')
df = df.drop_duplicates()
# Drop duplicate columns
dup_cols = df.columns[df.columns.duplicated()]
if len(dup_cols) > 0:
print(f' Alert: Dropping {len(dup_cols)} duplicate cols')
### DO NOT MODIFY THIS LINE. TOOK A LONG TIME TO MAKE IT WORK!!!
### This is the only way that dropping duplicate columns works. This is not found anywhere!
### Note the negation (~): it keeps the first occurrence and drops the duplicate copies.
df = df.T[~df.T.index.duplicated(keep='first')].T
### This is the column that lists our data quality issues
new_col = 'DQ Issue'
good_col = "The Good News"
bad_col = "The Bad News"
# Create an empty dataframe to store the data quality issues
dq_df1 = pd.DataFrame(columns=[good_col, bad_col])
dq_df1 = dq_df1.T
dq_df1["first_comma"] = ""
dq_df1[new_col] = ""
# Create an empty dataframe to store the data quality issues
data_types = pd.DataFrame(
df.dtypes,
columns=['Data Type']
)
missing_values = df.isnull().sum()
missing_values_pct = ((df.isnull().sum()/df.shape[0])*100)
missing_cols = missing_values[missing_values > 0].index.tolist()
##### Classify Columns ################
if not target is None:
var_df = classify_columns(df.drop(target, axis=1), verbose=0)
else:
var_df = classify_columns(df, verbose=0)
##### ClassifyColumns ################
IDcols = var_df['id_vars']
nlp_vars = var_df['nlp_vars']
discrete_string_vars = var_df['discrete_string_vars']
cols_delete = var_df['cols_delete']
bool_vars = var_df['string_bool_vars'] + var_df['num_bool_vars']
int_vars = var_df['int_vars']
categorical_vars = var_df['cat_vars'] + var_df['factor_vars']
date_vars = var_df['date_vars']
if target is None:
preds = [x for x in list(df) if x not in IDcols+cols_delete]
else:
if isinstance(target, str):
### target is a string #####
preds = [x for x in list(df) if x not in IDcols+cols_delete+[target]]
else:
### target is a multi-label list ####
preds = [x for x in list(df) if x not in IDcols+cols_delete+target]
#####################################################################################################
float_cols = var_df['continuous_vars'] # Get float columns
id_cols = list(set(IDcols[:]))
zero_var_cols = list(set(cols_delete[:]))
number_cols = list(set(var_df['continuous_vars'] + var_df['int_vars']))
text_vars = list(set(discrete_string_vars + nlp_vars))
cat_cols = categorical_vars[:] # Get categorical columns
date_cols = date_vars[:]
######################### These are needed for further processing ##########
missing_data = pd.DataFrame(
missing_values_pct,
columns=['Missing Values%']
)
unique_values = pd.DataFrame(
columns=['Unique Values%']
)
#### For every column except float columns find the number of unique values % ##
for col in list(df.columns.values):
if col in float_cols:
unique_values.loc[col] = ["NA"]
else:
unique_values.loc[col] = [int(100*df[col].nunique()/df.shape[0])]
#### Find the max and min of every column except missing cols ##
maximum_values = pd.DataFrame(
columns=['Maximum Value']
)
minimum_values = pd.DataFrame(
columns=['Minimum Value']
)
### for every column except missing cols, find the max and min ######
for col in list(df.columns.values):
if col not in missing_cols:
if pd.api.types.is_numeric_dtype(df[col]):
maximum_values.loc[col] = [df[col].max()]
elif col in number_cols:
maximum_values.loc[col] = [df[col].max()]
### for every column except missing cols, find the max and min ######
for col in list(df.columns.values):
if col not in missing_cols:
if pd.api.types.is_numeric_dtype(df[col]):
minimum_values.loc[col] = [df[col].min()]
elif col in number_cols:
minimum_values.loc[col] = [df[col].min()]
### now generate the data quality starter dataframe
dq_df2 = data_types.join(missing_data).join(unique_values).join(minimum_values).join(maximum_values)
dq_df2['Minimum Value'] = dq_df2[['Minimum Value']].fillna('') ## these need to be filled with empty strings
dq_df2['Maximum Value'] = dq_df2[['Maximum Value']].fillna('') ### these need to be filled with empty strings
### set up additional columns
dq_df2["first_comma"] = ""
dq_df2[new_col] = f""
#### This is the first thing you need to do ###############
if dup_rows > 0:
new_string = f"There are {dup_rows} duplicate rows in the dataset. De-Dup these rows using Fix_DQ."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
else:
new_string = f"There are no duplicate rows in this dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
### DO NOT CHANGE THE NEXT LINE. The logic for columns is different.
if len(dup_cols) > 0:
new_string = f"There are {len(dup_cols)} duplicate columns in the dataset. De-Dup {dup_cols} using Fix_DQ."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
else:
new_string = f"There are no duplicate columns in this dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect ID columns in dataset and recommend removing them
if len(id_cols) > 0:
new_string = f"There are ID columns in the dataset. Remove them before modeling using Fix_DQ."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in id_cols:
# Append a row to the dq_df1 with the column name and the issue only if the column has a missing value
new_string = f"Possible ID column: drop before modeling step."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no ID columns in the dataset."
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect zero variance columns in dataset and recommend removing them
if len(zero_var_cols) > 0:
new_string = f"These are zero-variance or low information columns in the dataset. Remove them before modeling."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in zero_var_cols:
# Append a row to the dq_df1 with the column name and the issue only if the column has a missing value
new_string = f"Possible zero-variance or low-information column: drop before modeling step."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no zero-variance or low information columns in the dataset."
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect date-time related columns in dataset
if len(date_cols) > 0:
new_string = f"There are {len(date_vars)} date-time vars in the dataset. Make sure you transform them before modeling."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in date_cols:
# Append a row to the dq_df1 with the column name and the issue only if the column has a missing value
new_string = f"Possible date-time column: transform before modeling step."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no date-time vars in this dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect missing values and suggest imputing them with the mean, median, mode, or a constant value
#missing_values = df.isnull().sum()
#missing_cols = missing_values[missing_values > 0].index.tolist()
if len(missing_cols) > 0:
for col in missing_cols:
# Append a row to the dq_df1 with the column name and the issue only if the column has a missing value
if missing_values[col] > 0:
new_string = f"{missing_values[col]} missing values. Impute them with mean, median, mode, or a constant value such as 123."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no columns with missing values in the dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Identify rare categories and suggest grouping them into a single category or dropping them
rare_threshold = 0.01 # Define a 1% threshold for rare categories
rare_cat_cols = []
if len(cat_cols) > 0:
for col in cat_cols:
value_counts = df[col].value_counts(normalize=True)
rare_values = value_counts[value_counts < rare_threshold].index.tolist()
if len(rare_values) > 0:
rare_cat_cols.append(col)
# Append a row to the dq_df2 with the column name and the issue
if len(rare_values) <= 10:
new_string = f"{len(rare_values)} rare categories: {rare_values}. Group them into a single category or drop the categories."
else:
new_string = f"{len(rare_values)} rare categories: Too many to list. Group them into a single category or drop the categories."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no categorical columns with rare categories (< {100*rare_threshold:.0f} percent) in this dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Find infinite values and suggest replacing them with NaN or a large value
inf_values = df.replace([np.inf, -np.inf], np.nan).isnull().sum() - missing_values
inf_cols = inf_values[inf_values > 0].index.tolist()
if len(inf_cols) > 0:
new_string = f"There are {len(inf_cols)} columns with infinite values in the dataset. Replace them with NaN or a finite value."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in inf_cols:
if inf_values[col] > 0:
new_string = f"{inf_values[col]} infinite values. Replace them with a finite value."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no columns with infinite values in this dataset "
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect mixed data types and suggest converting them to a single type or splitting them into multiple columns
mixed_types = df[preds].applymap(type).nunique() # Get the number of unique types in each column
mixed_cols = mixed_types[mixed_types > 1].index.tolist() # Get the columns with more than one type
if len(mixed_cols) > 0:
new_string = f"There are {len(mixed_cols)} columns with mixed data types in the dataset. Convert them to a single type or split them into multiple columns."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in mixed_cols:
if mixed_types[col] > 1:
new_string = f"Mixed dtypes: has {mixed_types[col]} different data types: "
for each_class in df[col].apply(type).unique():
if each_class == str:
new_string += f" object,"
elif each_class == int:
new_string += f" integer,"
elif each_class == float:
new_string += f" float,"
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no columns with mixed (more than one) datatypes in this dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect outliers in numeric cols
num_cols = var_df['continuous_vars'] + var_df['int_vars'] # Get numerical columns
if len(num_cols) > 0:
first_time = True
outlier_cols = []
for col in num_cols:
q1 = df[col].quantile(0.25) # Get the first quartile
q3 = df[col].quantile(0.75) # Get the third quartile
iqr = q3 - q1 # Get the interquartile range
lower_bound = q1 - 1.5 * iqr # Get the lower bound
upper_bound = q3 + 1.5 * iqr # Get the upper bound
outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col] # Get the outliers
if len(outliers) > 0:
outlier_cols.append(col)
if first_time:
new_string = f"There are {len(num_cols)} numerical columns, some with outliers. Remove them or use robust statistics."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
first_time =False
### check if there are outlier columns and print them ##
new_string = f"Column has {len(outliers)} outliers greater than the upper bound ({upper_bound:.2f}) or lower than the lower bound ({lower_bound:.2f}). Cap them or remove them."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
if len(outlier_cols) < 1:
new_string = f"There are no numeric columns with outliers in this dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect high cardinality features only in categorical columns
cardinality = df[discrete_string_vars].nunique() # Get the number of unique values in each categorical column
cardinality_threshold = min(30, cardinality.min()) # Define a threshold for high cardinality
high_card_cols = discrete_string_vars[:]
# Get the columns with high cardinality
## high_card_cols = cardinality[cardinality > cardinality_threshold].index.tolist()
if len(high_card_cols) > 0:
new_string = f"There are {len(high_card_cols)} columns with high cardinality (>{cardinality_threshold} categories) in the dataset. Reduce them using encoding techniques or feature selection methods."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in high_card_cols:
new_string = f"Possible high cardinality column with {cardinality[col]} unique values: Use hash encoding or text embedding to reduce dimension."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no high cardinality columns in this dataset"
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# Detect highly correlated features
correlation_matrix = df[num_cols].corr().abs() # Get the absolute correlation matrix of numerical columns
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)) # Get the upper triangle of the matrix
high_corr_cols = [column for column in upper_triangle.columns if any(upper_triangle[column] > correlation_threshold)] # Get the columns with high correlation
if len(high_corr_cols) > 0:
new_string = f"There are {len(high_corr_cols)} columns with >= {correlation_threshold} correlation in the dataset. Drop one of them or use dimensionality reduction techniques."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in high_corr_cols:
new_string = f"Column has a high correlation with {upper_triangle[col][upper_triangle[col] > correlation_threshold].index.tolist()}. Consider dropping one of them."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f"There are no highly correlated columns in the dataset."
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
# First see if this is a classification problem
if target is not None:
if isinstance(target, str):
target_col = [target]
else:
target_col = copy.deepcopy(target) # Define the target column name
cat_cols = df[target_col].select_dtypes(include=["object", "category"]).columns.tolist()
### Check if it is a categorical var, then it is classification problem ###
model_type = 'Regression'
if len(cat_cols) > 0:
model_type = "Classification"
else:
int_cols = df[target_col].select_dtypes(include=["integer"]).columns.tolist()
copy_target_col = copy.deepcopy(target_col)
for each_target_col in copy_target_col:
if len(df[each_target_col].value_counts()) <= 30:
model_type = "Classification"
### Then check for imbalanced classes in each target column
if model_type == 'Classification':
for each_target_col in target_col:
y = df[each_target_col]
# Get the value counts of each class
value_counts = y.value_counts(normalize=True)
# Get the minimum and maximum class frequencies
min_freq = value_counts.min()
max_freq = value_counts.max()
# Define a threshold for imbalance
imbalance_threshold = 0.1
# Check if the class frequencies are imbalanced
if min_freq < imbalance_threshold or max_freq > 1 - imbalance_threshold:
# Print a message to suggest resampling techniques or class weights
new_string = f"Imbalanced classes in target variable ({each_target_col}). Use resampling or class weights to address."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
dq_df2.loc[each_target_col, new_col] += "Target column. Appears to have Imbalanced classes. Try balancing classes."
# Detect target leakage in each numeric feature: corrwith() against a whole DataFrame aligns on
# column labels (which never match here), so correlate against each numeric target Series instead
numeric_preds = [x for x in preds if pd.api.types.is_numeric_dtype(df[x])]
numeric_targets = [t for t in target_col if pd.api.types.is_numeric_dtype(df[t])]
if numeric_preds and numeric_targets:
leakage_matrix = pd.concat([df[numeric_preds].corrwith(df[t]).abs() for t in numeric_targets], axis=1).max(axis=1) # Get the max absolute correlation of each column with the target column(s)
else:
leakage_matrix = pd.Series(dtype=float)
leakage_cols = leakage_matrix[leakage_matrix > leakage_threshold].index.tolist() # Get the columns with feature leakage
if len(leakage_cols) > 0:
new_string = f"There are {len(leakage_cols)} columns with data leakage. Double check whether you should use this variable."
dq_df1.loc[bad_col,new_col] += dq_df1.loc[bad_col,'first_comma'] + new_string
dq_df1.loc[bad_col,'first_comma'] = ', '
for col in leakage_cols:
new_string = f" {col} has a correlation >= {leakage_threshold} with {target_col}. Possible data leakage. Double check this variable."
dq_df2.loc[col,new_col] += dq_df2.loc[col,'first_comma'] + new_string
dq_df2.loc[col,'first_comma'] = ', '
else:
new_string = f'There are no target leakage columns in the dataset'
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
else:
new_string = f'There is no target given. Hence no target leakage columns detected in the dataset'
dq_df1.loc[good_col,new_col] += dq_df1.loc[good_col,'first_comma'] + new_string
dq_df1.loc[good_col,'first_comma'] = ', '
target_col = []
dq_df1.drop('first_comma', axis=1, inplace=True)
dq_df2.drop('first_comma', axis=1, inplace=True)
for col in list(df):
if dq_df2.loc[col, new_col] == "":
if col in target_col:
#### This is to make sure target column is properly labeled.
if df[col].nunique() == 1:
dq_df2.loc[col,new_col] += "Target column. Appears to have zero variance. Double-check it."
else:
dq_df2.loc[col,new_col] += "Target column"
else:
dq_df2.loc[col,new_col] += "No issue"
if html:
if verbose == 0:
write_to_html(dq_df1, "dq_report.html")
else:
write_to_html(dq_df2, "dq_report.html")
else:
try:
from IPython.display import display
except Exception as e:
print('Error due to %s. Please install IPython and try again...' % e)
return dq_df2
if verbose < 0:
pass
elif verbose == 0:
all_rows = dq_df1.shape[0]
ax = dq_df1.head(all_rows).style.background_gradient(cmap='Reds').set_properties(**{'font-family': 'Segoe UI'})
display(ax);
else:
all_rows = dq_df2.shape[0]
ax = dq_df2.head(all_rows).style.background_gradient(cmap='Reds').set_properties(**{'font-family': 'Segoe UI'})
display(ax);
# Return the dq_df1 as a table
return dq_df2
##################################################################################################
import re
# Import the webbrowser module
import webbrowser
def write_to_html(dqr, filename="dq_report.html"):
"""
Write a data quality report to an HTML file and open it in a browser.
Parameters
----------
dqr : pandas.DataFrame
A data quality report generated by the dq_report function.
filename : str, default "dq_report.html"
The name of the HTML file to write.
Returns
-------
None
Notes
-----
This function will create an HTML file (named "dq_report.html" by default) in the current working
directory and open it in a new tab of the default browser. The HTML file will contain a table with
the data quality report, formatted with colors, fonts, and styles. The table will have
alternating row colors using the CSS style sheet embedded in strings. The function requires
the webbrowser module to be imported.
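Example (illustrative; dqr is assumed to come from dq_report and the file name is a placeholder):
write_to_html(dqr, filename="my_dq_report.html")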
"""
df_html = dqr.to_html(classes="table table-striped table-bordered table-hover",
border=0, na_rep="", index=True).replace('<th>',
'<th style="background-color: lightgreen">').replace('<td>',
'<td style="color: blue">')
df_html = f""" <style> /* Import Roboto from Google Fonts */ @import url('https://fonts.googleapis.com/css?family=Roboto&display=swap');
/* Set the font family and size for the whole table */ table {{ font-family: Roboto; font-size: 12px; }}
/* Set the background color and text alignment for the header cells */ th {{ background-color: orange; font-size: 14px; text-align: center; }}
/* Set the color and font style for the data cells */ td {{ color: blue; font-style: italic; text-align: left; }}
/* Set the background color for every odd row */ tr:nth-child(odd) {{ background-color: lightyellow; }}
/* Set the background color for every even row */ tr:nth-child(even) {{ background-color: lightgrey; }} </style> {df_html} """
# Write the HTML code of the report to a file
with open(filename, "w") as f:
f.write(df_html)
# Open the file in a new tab of the default browser
webbrowser.open_new_tab(filename)
##################################################################################################
# Import pandas and numpy libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer
import pdb
# Import BaseEstimator and TransformerMixin from sklearn
from sklearn.base import BaseEstimator, TransformerMixin
##################################################################################
def left_subtract(l1,l2):
lst = []
for i in l1:
if i not in l2:
lst.append(i)
return lst
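# Quick illustration: left_subtract returns the elements of the first list that are absent from the second,
# e.g. left_subtract(['a', 'b', 'c'], ['b']) returns ['a', 'c'].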
################################################################################
def compare_unique(df1, df2, column):
"""
Compare the difference between the unique values in a single column of two dataframes.
This function takes two dataframes and a column name as inputs and returns a dictionary
with the count and the actual differences of the unique values in that column between
the two dataframes.
Parameters
----------
df1 : pandas.DataFrame
The first dataframe to compare.
df2 : pandas.DataFrame
The second dataframe to compare.
column : str
The name of the column to compare.
Returns
-------
result : dict
A dictionary with four keys: 'unique_count_in_df1', 'unique_count_in_df2', 'diff_between_df1_df2', and 'diff_between_df2_df1'.
'unique_count_in_df1' is the number of unique values in the column of df1.
'unique_count_in_df2' is the number of unique values in the column of df2.
'diff_between_df1_df2' is a list of unique values in the column of df1 that are not in the column of df2.
'diff_between_df2_df1' is a list of unique values in the column of df2 that are not in the column of df1.
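Example (illustrative, using two small throwaway dataframes):
compare_unique(pd.DataFrame({'col': [1, 2]}), pd.DataFrame({'col': [2, 3]}), 'col')
returns {'unique_count_in_df1': 2, 'unique_count_in_df2': 2,
'diff_between_df1_df2': [1], 'diff_between_df2_df1': [3]}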
"""
# Get the unique values in column of each dataframe as sets
set1 = set(df1[column].unique())
set2 = set(df2[column].unique())
# Calculate the count and the differences using set operations
count_1 = len(set1)
count_2 = len(set2)
diff_1_2 = list(set1 - set2)
diff_2_1 = list(set2 - set1)
# Store the results in a dictionary
result = {
"unique_count_in_df1": count_1,
"unique_count_in_df2": count_2,
"diff_between_df1_df2": diff_1_2,
"diff_between_df2_df1": diff_2_1,
}
# Return the result
return result
########################################################################################
# Define a custom transformer class for fixing data quality issues
class Fix_DQ(BaseEstimator, TransformerMixin):
# Initialize the class with optional parameters for the quantile, cat_fill_value and num_fill_value
def __init__(self, quantile=0.87, cat_fill_value="missing", num_fill_value=9999,
rare_threshold=0.01, correlation_threshold=0.9):
self.quantile = quantile # Define a threshold for IQR for outlier detection
self.cat_fill_value = cat_fill_value ## Define a fill value for missing categories
self.num_fill_value = num_fill_value # Define a fill value for missing numbers
self.rare_threshold = rare_threshold # Define a threshold for rare categories
self.correlation_threshold = correlation_threshold ## Above this limit, variables will be dropped
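# Illustrative usage (a minimal sketch; X_train and X_test are placeholder dataframes, and
# transform() is assumed to be the companion method defined further below in this class):
#     fdq = Fix_DQ(quantile=0.87, cat_fill_value="missing", num_fill_value=9999,
#                  rare_threshold=0.01, correlation_threshold=0.9)
#     X_train_fixed = fdq.fit_transform(X_train)
#     X_test_fixed = fdq.transform(X_test)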
# Define a function to cap the outliers in numerical columns using the upper bounds
def cap_outliers(self, X):
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
X = copy.deepcopy(X)
# Get the numerical columns
num_cols = X.select_dtypes(include=[ "float"]).columns.tolist()
# Loop through each float column
for col in num_cols:
# Check if the column has an upper bound calculated in the fit method
if col in self.upper_bounds_:
# Cap the outliers using the upper bound
X[col] = np.where(X[col] > self.upper_bounds_[col], self.upper_bounds_[col], X[col])
else:
# Just continue and don't cap the outliers in that column
continue
# Return the DataFrame with capped outliers
return X
# Define a function to impute the missing values in categorical and numerical columns using the constant values
def impute_missing(self, X):
"""
### impute_missing can fill missing value using a global default value or a
### dictionary of fill values for each column and apply that fill value to each column.
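### Example (illustrative; "gender" and "age" are placeholder column names):
###     Fix_DQ(cat_fill_value={"gender": "unknown"}, num_fill_value={"age": 0})
### Categorical columns not listed in the dictionary fall back to the string "missing";
### numeric columns not listed fall back to -999.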
"""
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
X = copy.deepcopy(X)
# Get the categorical columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
# Get the numerical columns
num_cols = X.select_dtypes(include=["int", "float"]).columns.tolist()
missing_values = X.isnull().sum()
missing_cols = missing_values[missing_values > 0].index.tolist()
#### Sometimes missing values are found in test but not train. This to catch those!
for col in missing_cols:
if not col in self.missing_cols_:
self.missing_cols_.append(col)
# Loop through the columns of cat_cols
for col in self.missing_cols_:
if col in cat_cols:
# Check if the column is in the fill_values dictionary
if isinstance(self.cat_fill_value, dict):
if col in self.cat_fill_value:
# Impute the missing values in the column with the corresponding fill value
X[col] = X[[col]].fillna(self.cat_fill_value[col]).values
else:
### use a default value for that column since it is not specified
X[col] = X[[col]].fillna("missing").values
else:
### use a global default value for all columns
X[col] = X[[col]].fillna(self.cat_fill_value).values
# Impute the missing values in numerical columns with the num_fill_value
# Loop through the columns of num_cols
for col in self.missing_cols_:
if col in num_cols:
# Check if the column is in the fill_values dictionary
if isinstance(self.num_fill_value, dict):
if col in self.num_fill_value:
# Impute the missing values in the column with the corresponding fill value
X[col] = X[[col]].fillna(self.num_fill_value[col]).values
else:
### use a default value for that column since it is not specified
X[col] = X[[col]].fillna(-999).values
else:
X[col] = X[[col]].fillna(self.num_fill_value).values
# Return the DataFrame with imputed missing values
return X
# Define a function to identify rare categories and group them into a single category
def group_rare_categories(self, X):
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
# Get the categorical columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
# Loop through each categorical column
for col in cat_cols:
# Get the value counts of each category
value_counts = X[col].value_counts(normalize=True)
# Get the rare categories that have a frequency below the threshold
rare_values = value_counts[value_counts < self.rare_threshold].index.tolist()
# Check if there are any rare categories
if len(rare_values) > 0:
# Group the rare categories into a single category called "Rare"
X[col] = X[col].replace(rare_values, "Rare")
# Return the DataFrame with grouped rare categories
return X
# Define a function to find infinite values and replace them with the upper bounds from that numeric column
def replace_infinite(self, X):
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
# Get the numerical columns
num_cols = X.select_dtypes(include=["int", "float"]).columns.tolist()
# Loop through each numerical column
for col in num_cols:
# Check if the column has an upper bound calculated in the fit method
if col in self.upper_bounds_:
# Replace the infinite values with the upper bound
X[col] = X[col].replace([np.inf, -np.inf], self.upper_bounds_[col])
# Return the DataFrame with replaced infinite values
return X
def detect_duplicates(self, X):
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
# Drop duplicate rows
dup_rows = X.duplicated().sum()
if dup_rows > 0:
print(f'Alert: Detecting {dup_rows} duplicate rows...')
# Drop duplicate columns
dup_cols = X.columns[X.columns.duplicated()]
### Remember that the logic for columns is different. Don't change this line!
if len(dup_cols) > 0:
print(f'Alert: Detecting {len(dup_cols)} duplicate cols...')
### DO NOT TOUCH THIS LINE!! IT TOOK A LONG TIME TO MAKE IT WORK!!
### Also if you don't delete these columns, then nothing after this line will work!
X = X.T[~X.T.index.duplicated(keep='first')].T
# Return the DataFrame with no duplicates
return X
# Define a function to detect duplicate rows and columns and keep only one copy
def drop_duplicated(self, X):
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
# Drop duplicate rows
dup_rows = X.duplicated().sum()
if dup_rows > 0:
print(f'Alert: Dropping {dup_rows} duplicate rows can sometimes cause column data types to change to object. Double-check!')
X = X.drop_duplicates(keep='first')
# Drop duplicate columns
dup_cols = X.columns[X.columns.duplicated()]
### Remember that the logic for columns is different. Don't change this line!
if len(dup_cols) > 0:
print(f'Alert: Dropping {len(dup_cols)} duplicate cols: {dup_cols}!')
### DO NOT TOUCH THIS LINE!! IT TOOK A LONG TIME TO MAKE IT WORK!!
X = X.T[~X.T.index.duplicated(keep='first')].T
# Return the DataFrame with no duplicates
return X
# Define a function to detect skewed distributions and apply a proper transformation to the column
def transform_skewed(self, X):
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
# Get the numerical columns
num_cols = X.select_dtypes(include=["float"]).columns.tolist()
# Loop through each numerical column
for col in num_cols:
# Find if a column transformer exists for this column
if col in self.col_transformers_:
# Cap the outliers using the upper bound
if str(self.col_transformers_[col]).split("(")[0] == "PowerTransformer":
### power transformer expects Pandas DataFrame
pt = self.col_transformers_[col]
X[col] = pt.transform(X[[col]])
else:
### function transformer expects pandas series
ft = self.col_transformers_[col]
X[col] = ft.transform(X[col])
# Return the DataFrame with transformed skewed columns
return X
# Define the fit method that calculates the upper bound for each numerical column
def fit(self, X, y=None):
self.drop_cols_ = []
self.missing_cols_ = []
# Check if X is a pandas DataFrame
if not isinstance(X, pd.DataFrame):
# Convert X to a pandas DataFrame
X = pd.DataFrame(X)
# Get the numerical columns
num_cols = X.select_dtypes(include=["int", "float"]).columns.tolist()
float_cols = X.select_dtypes(include=["float"]).columns.tolist()
non_float_cols = left_subtract(X.columns, float_cols)
# Find percent of missing values in all columns
missing_values = X.isnull().sum()
self.missing_cols_ = missing_values[missing_values > 0].index.tolist()
drop_missing = []
for each in self.missing_cols_:
if X[each].isna().sum()/len(X) >= 0.80 :
### drop the column if it has 80% or more missing values
drop_missing.append(each)
print(f" Dropping {each} since it has >= 80% missing values")
### First and foremost you must drop duplicate columns and rows
X = self.detect_duplicates(X)
# Detect ID columns
self.id_cols_ = [column for column in non_float_cols if X[column].nunique() == X.shape[0]]
if len(self.id_cols_) > 0:
print(f" Dropping {len(self.id_cols_)} ID column(s): {self.id_cols_}")
# Detect zero-variance columns
self.zero_var_cols_ = [column for column in non_float_cols if X[column].nunique() == 1]
if len(self.zero_var_cols_) > 0:
print(f" Dropping {len(self.zero_var_cols_)} zero-variance cols: {self.zero_var_cols_}")
# Detect highly correlated features
self.drop_corr_cols_ = []
correlation_matrix = X[num_cols].corr().abs() # Get the absolute correlation matrix of numerical columns
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)) # Get the upper triangle of the matrix
high_corr_cols = [column for column in upper_triangle.columns if any(upper_triangle[column] > self.correlation_threshold)] # Get the columns with high correlation
if len(high_corr_cols) > 0:
self.drop_corr_cols_ = high_corr_cols
for col in high_corr_cols:
print(f" Dropping {col} which has a high correlation with {upper_triangle[col][upper_triangle[col] > self.correlation_threshold].index.tolist()}")
# Initialize an empty dictionary to store the upper bounds
self.upper_bounds_ = {}
# Loop through each numerical column
#### processing of quantiles is only for float columns or those in dict ###
if self.quantile is None:
### still calculate the upper bounds needed for capping infinite values ##
base_quantile = 0.99
for col in float_cols:
# Get the upper quantile value (not the third quartile, since base_quantile is 0.99)
q3 = X[col].quantile(base_quantile)
# Get the inter-quantile range between the upper and lower quantiles
iqr = X[col].quantile(base_quantile) - X[col].quantile(1 - base_quantile)
# Calculate the upper bound
upper_bound = q3 + 1.5 * iqr
# Store the upper bound in the dictionary
self.upper_bounds_[col] = upper_bound
else:
### calculate upper bounds to cap outliers using quantile given ##
for col in float_cols:
# Get the upper quantile value at the given quantile
q3 = X[col].quantile(self.quantile)
# Get the inter-quantile range between the given quantile and its complement
iqr = X[col].quantile(self.quantile) - X[col].quantile(1 - self.quantile)
# Calculate the upper bound
upper_bound = q3 + 1.5 * iqr
# Store the upper bound in the dictionary
self.upper_bounds_[col] = upper_bound
# Initialize an empty dictionary to store the column transformers
self.col_transformers_ = {}
# Define a threshold for skewness
skew_threshold = 1.0
# Loop through each float column
for col in float_cols:
# Calculate the skewness of the column
skewness = X[col].skew()
# Check if the skewness is above the threshold
if abs(skewness) > skew_threshold:
# Apply a log transformation if the column has positive values only
if X[col].min() > 0:
### function transformer expects pandas series
ft = FunctionTransformer(np.log1p)
ft.fit(X[col])
self.col_transformers_[col] = ft
# Apply a box-cox transformation if the column has positive values only and scipy is installed
elif X[col].min() > 0 and "scipy" in sys.modules:
### power transformer expects Pandas DataFrame
pt = PowerTransformer(method="box-cox")
pt.fit(X[[col]])
self.col_transformers_[col] = pt
# Apply a yeo-johnson transformation if the column has any values and sklearn is installed
else:
### power transformer expects Pandas DataFrame
pt = PowerTransformer(method="yeo-johnson")
pt.fit(X[[col]])
self.col_transformers_[col] = pt
# Get the number of unique types in each column
self.mixed_type_cols_ = []
mixed_types = X.applymap(type).nunique()
# Get the columns with more than one type
self.mixed_type_cols_ = mixed_types[mixed_types > 1].index.tolist()
if len(self.mixed_type_cols_) > 0:
extra_mixed = left_subtract(self.mixed_type_cols_, self.missing_cols_)
if len(extra_mixed) > 0:
print(f" Dropping {len(extra_mixed)} columns due to mixed data types")
for each in extra_mixed:
print(f" {each} has mixed dtypes: {X[each].apply(type).unique()}")
### drop ID columns from further processing ##
if len(self.id_cols_) > 0:
self.drop_cols_ += self.id_cols_
### drop Zero Variance columns from further processing ##
if len(self.zero_var_cols_) > 0:
self.drop_cols_ += self.zero_var_cols_