-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen.py
133 lines (110 loc) · 4.38 KB
/
gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#%%
import random
import string
import time
import pandas as pd
import numpy as np
import os
# Resources
data = 'source.csv'
compressed_data = 'source.csv.gz'
class Decorators:
def __init__(self):
pass
def run_timer(func):
def wrapper():
t1 = time.time()
func()
t2 = time.time() - t1
print(f"{func.__name__} : ran in {t2} seconds")
return wrapper
class FileGenerator:
def __init__(self):
pass
def gen_random_string(length):
return ''.join(random.choice(string.ascii_letters) for _ in range(length))
def gen_file():
with open('source.csv', 'w') as file:
file.write("id,integer1,string1,string2\n")
target_size = 1024 * 1024 * 1024 # sets target fileseize (1GB)
current_size = file.tell() # initialise file size
id_counter = 1
while current_size < target_size:
integer1 = random.randint(1,10) # auto-increment IDs
string1 = FileGenerator.gen_random_string(random.randint(1, 32)) # random char of len(1 to 32)
string2 = FileGenerator.gen_random_string(random.randint(1, 32)) # random char of len(1 to 32)
row = f"{id_counter},{integer1},{string1},{string2}\n" # concat values to csv
file.write(row) # write rows to csv
current_size = file.tell() # updates file size
id_counter += 1 # increment ID counter
print("file generation complete!")
class GenUtils:
def __init__(self):
pass
@Decorators.run_timer
def gen_file():
gen_file = FileGenerator.gen_file()
return gen_file
def gen_csv_file(data):
with open(data, 'r') as file:
df = pd.read_csv(file)
return df
class GenFileAnalysis:
def __init__(self):
pass
def gen_file_row_count():
df = GenUtils.gen_csv_file(data)
row_count = df.shape[0]
return print(f"Total number of rows generated : {row_count}")
def gen_file_last_row_contents():
df = GenUtils.gen_csv_file(data)
last_row = df.iloc[-1]
return last_row
def gen_file_distribution_of_integers():
df = GenUtils.gen_csv_file(data)
distribution = df['integer1'].value_counts()
most_common_item = distribution.idxmax()
print(f"Most occured Integer : {most_common_item}")
return distribution
def gen_file_vowel_analyser():
df = GenUtils.gen_csv_file(data)
def count_vowels(string):
vowels = 'aeiou'
return sum(char in vowels for char in string)
df['v_count'] = df.apply(lambda row: sum(count_vowels(str(cell)) for cell in row), axis=1)
max_vowel_row_index = df['v_count'].idxmax()
max_vowel_row = df.loc[max_vowel_row_index]
return max_vowel_row
class GenFileCompression:
def __init__(self):
pass
@Decorators.run_timer
def gen_file_gzip_compression():
df = GenUtils.gen_csv_file(data)
gzip_file = df.to_csv('source.csv.gz', compression='gzip', index=False)
class GenFileSizeAnalysis:
def __init__(self):
pass
def gen_file_size_checker():
comp_file_size = os.path.getsize(compressed_data)
comp_file_size_in_mb = comp_file_size / 1024
return print(f"Compressed File Size : {comp_file_size_in_mb}")
# Calls to generate data files
generate_csv_file = GenUtils.gen_file
generate_pandas_dataframe_from_csv_file = GenUtils.gen_csv_file
generate_gzip_compressed_file = GenFileCompression.gen_file_gzip_compression
# Calls to analyse contents of data file
get_df_row_count = GenFileAnalysis.gen_file_row_count
get_df_last_row_contents = GenFileAnalysis.gen_file_last_row_contents
get_distribution_of_integers = GenFileAnalysis.gen_file_distribution_of_integers
get_row_with_most_vowels = GenFileAnalysis.gen_file_vowel_analyser
get_gzip_compressed_file_size = GenFileSizeAnalysis.gen_file_size_checker
# Unhash to run
# generate_csv_file()
# generate_pandas_dataframe_from_csv_file()
# generate_gzip_compressed_file()
# get_df_row_count()
# get_df_last_row_contents()
# get_distribution_of_integers()
# get_row_with_most_vowels()
# get_gzip_compressed_file_size()