forked from MeItsLars/thabloid-sticker-generator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
167 lines (142 loc) · 7.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Main file, entrypoint for the application.
"""
import sys
import os
import unicodedata
import pandas as pd
from util import query_yes_no
from checker import correct_entries
from pdf import generate_pdf
# People credited in the start-up banner; joined with ", " when the banner is printed.
AUTHORS = ['Lars Jeurissen']
# Version string shown in the start-up banner.
VERSION = '1.0.5'
# ASCII-art banner printed once when the program starts. The text ends with two
# positional `{}` placeholders: the first receives VERSION and the second the
# joined AUTHORS (filled in via `str.format` in the `__main__` section).
# NOTE(review): the horizontal spacing of the art looks collapsed in this copy of
# the file -- confirm against the upstream repository before relying on it.
PROGRAM_ART = (' _______ _ _ _ _ _ \n'
' |__ __| | | | | | (_) | | \n'
' | | | |__ __ _| |__ | | ___ _ __| | \n'
' | | | \'_ \\ / _` | \'_ \\| |/ _ \\| |/ _` | \n'
' | | | | | | (_| | |_) | | (_) | | (_| | \n'
' _|_|_ |_| |_|\\__,_|_.__/|_|\\___/|_|\\__,_| \n'
' / ____| | (_) | | \n'
' | (___ | |_ _ ___| | _____ _ __ \n'
' \\___ \\| __| |/ __| |/ / _ \\ \'__| \n'
' ____) | |_| | (__| < __/ | \n'
' |_____/ \\__|_|\\___|_|\\_\\___|_| _ \n'
' / ____| | | \n'
' | | __ ___ _ __ ___ _ __ __ _| |_ ___ _ __ \n'
' | | |_ |/ _ \\ \'_ \\ / _ \\ \'__/ _` | __/ _ \\| \'__|\n'
' | |__| | __/ | | | __/ | | (_| | || (_) | | \n'
' \\_____|\\___|_| |_|\\___|_| \\__,_|\\__\\___/|_|\n'
'\n'
'Running Thabloid Sticker Generator v{} by {}\n'
'-------------------------------------------------------------------------------')
def _list_csv_files(input_directory: str) -> list:
    """Return the sorted names of all '.csv' files directly inside the given directory."""
    # Sorting makes the processing order (and thus the output) reproducible,
    # because os.listdir returns names in arbitrary order.
    return sorted(name for name in os.listdir(input_directory) if name.endswith('.csv'))


def read_input(input_directory: str) -> pd.DataFrame:
    """
    Reads input CSV files from the given input directory and converts them to a DataFrame.

    The user is prompted (via ``input``) until at least one '.csv' file is present.
    The first line of every file is treated as a header and replaced by the fixed
    column names used throughout the application. All values are read as strings,
    so purely numeric columns (e.g. postal codes such as '01067') keep their
    leading zeros and can be processed with string methods later on.
    Lines that pandas reports as bad are collected. If any are encountered, they are
    printed and the user is asked whether to exit; the program exits with status 0
    when ``query_yes_no`` returns a truthy value.

    Parameters
    ----------
    input_directory: str
        The input directory where the CSV files are stored

    Returns
    -------
    pd.DataFrame:
        A DataFrame containing all correctly parsed CSV entries, with exact duplicate
        rows removed and a fresh 0..n-1 index
    """
    # Wait for the user to provide at least one CSV file
    input("Put all address files (.csv) that you want to process in the 'input' folder. "
          "Press enter when done.")
    csvs = _list_csv_files(input_directory)
    while not csvs:
        input("No .csv files detected. Press enter when you have added them.")
        csvs = _list_csv_files(input_directory)

    # Parse the CSV files into a pandas dataframe
    print(f"Parsing {len(csvs)} input file(s)..")
    column_names = ['first_name', 'last_name', 'address',
                    'address_2', 'postal_code', 'city', 'country']
    erroneous_lines = []

    def collect_bad_line(fields):
        # Returning None tells pandas to skip the offending line
        erroneous_lines.append(','.join(fields))

    # dtype=str prevents pandas from inferring numeric types, which would strip
    # leading zeros from postal codes and break the string handling downstream.
    # A callable for on_bad_lines is only supported by the python engine.
    frames = [pd.read_csv(os.path.join(input_directory, csv), header=0,
                          names=column_names, engine='python',
                          on_bad_lines=collect_bad_line, dtype=str,
                          keep_default_na=False)
              for csv in csvs]
    input_data = pd.concat(frames).drop_duplicates()

    # Re-index the dataframe so that we don't have double indices
    input_data.reset_index(drop=True, inplace=True)

    # If there are erroneous lines, prompt the user to ask if they want to continue
    if erroneous_lines:
        print("----------")
        print(f"Encountered {len(erroneous_lines)} erroneous line(s) in the input csv file(s):")
        print(*erroneous_lines, sep="\n")
        print("----------")
        if query_yes_no("We will continue with the remaining entries if you don't exit. "
                        "Do you want to exit?", "yes"):
            sys.exit(0)

    print(f"Read {len(input_data)} data entries")
    return input_data
def format_entries(input_entries: pd.DataFrame):
    """
    Formats input entries in-place so that the rest of the application can use correct data.

    For the 'address', 'address_2' and 'city' columns, German sharp s'es are converted to
    two s'es and the text is reduced to ASCII (accents are stripped, characters without an
    ASCII equivalent are dropped). The 'postal_code' column is upper-cased.

    The columns are rewritten as a whole instead of assigning to the rows yielded by
    ``DataFrame.iterrows()``: those rows may be copies, in which case the changes would
    silently never reach the DataFrame.

    Parameters
    ----------
    input_entries: pd.DataFrame
        The input entries to be formatted, stored in a dataframe. The formatted columns
        must contain strings.
    """
    def to_ascii(text: str) -> str:
        # The sharp s has no NFKD decomposition, so it must be replaced explicitly;
        # otherwise the ASCII encoding step below would simply drop it.
        text = text.replace('\xdf', 'ss')
        # NFKD splits accented characters into base character + combining mark,
        # after which encoding with 'ignore' removes everything that is not ASCII.
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    for column in ('address', 'address_2', 'city'):
        input_entries[column] = input_entries[column].map(to_ascii)

    input_entries['postal_code'] = input_entries['postal_code'].map(lambda code: code.upper())
def post_process_entries(input_entries: pd.DataFrame):
    """
    Post-processes the entries in-place, so that they are converted to a format that PostNL accepts.
    See: https://www.postnl.nl/versturen/brief-of-kaart-versturen/hoe-verstuur-ik-een-brief-of-kaart/brief-adresseren/

    The following steps are applied:
    - six-character postal codes of entries whose country is exactly "Netherlands" get a
      space after the fourth character (e.g. "1234AB" becomes "1234 AB");
    - cities and countries are upper-cased;
    - German sharp s'es in 'address', 'address_2' and 'city' are replaced by two s'es.

    The columns are rewritten as a whole instead of assigning to the rows yielded by
    ``DataFrame.iterrows()``: those rows may be copies, in which case the changes would
    silently never reach the DataFrame.

    Parameters
    ----------
    input_entries: pd.DataFrame
        The input entries to be post-processed, stored in a dataframe. The processed
        columns must contain strings.
    """
    # Format Dutch postal codes. This must happen before the country is upper-cased,
    # because the comparison is against the mixed-case name "Netherlands".
    input_entries['postal_code'] = [
        f"{code[:4]} {code[4:]}" if country == "Netherlands" and len(code) == 6 else code
        for code, country in zip(input_entries['postal_code'], input_entries['country'])
    ]

    # Uppercase cities and countries; for the city, also replace any sharp s that is left
    input_entries['city'] = input_entries['city'].map(
        lambda city: city.upper().replace('\xdf', 'ss'))
    input_entries['country'] = input_entries['country'].map(lambda country: country.upper())

    # Format german s in the address lines
    for column in ('address', 'address_2'):
        input_entries[column] = input_entries[column].map(
            lambda text: text.replace('\xdf', 'ss'))
if __name__ == "__main__":
    # Show the banner, filled in with the version and the list of authors
    author_list = ", ".join(AUTHORS)
    print(PROGRAM_ART.format(VERSION, author_list))

    INPUT_DIR = 'input'
    OUTPUT_DIR = 'output'

    # The input directory is created on first use
    if not os.path.exists(INPUT_DIR):
        os.mkdir(INPUT_DIR)

    # The output directory must be empty before we start: create it when it is
    # missing, otherwise keep asking the user to clear it until nothing is left.
    while True:
        if not os.path.exists(OUTPUT_DIR):
            os.mkdir(OUTPUT_DIR)
            break
        if not os.listdir(OUTPUT_DIR):
            break
        input("The 'output' directory is not empty, please delete its contents. "
              "Press enter when done.")

    # Load the address entries and apply the basic formatting
    entries = read_input(INPUT_DIR)
    format_entries(entries)

    # Optional step: let the user decide whether the entries are checked and corrected
    wants_correction = query_yes_no("Entries can be checked and corrected using various APIs. "
                                    "Do you want to do this?", "yes")
    if wants_correction:
        no_invalid, no_changed = correct_entries(entries, OUTPUT_DIR)
        print(f"Number of invalid addresses: {no_invalid}")
        print(f"Number of changed addresses: {no_changed}")

    # Convert the entries to their final form and render the sticker sheet
    post_process_entries(entries)
    sticker_sheet_path = os.path.join(OUTPUT_DIR, "sticker_sheet.pdf")
    generate_pdf(entries, sticker_sheet_path)

    print("Generation complete! Thank you for using the amazing Thabloid Sticker Generator, "
          "see you in a few months!")