-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
92 lines (73 loc) · 3.58 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# app.py
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def main():
st.title("DataMapper")
st.write("""
Upload two Excel files containing the metadata of your databases. Each file should have the following columns:
- **Table Name**
- **Column Name**
- **Description**
""")
# Upload Excel files
uploaded_file1 = st.file_uploader("Upload First Excel File", type=["xlsx", "xls"], key="file1")
uploaded_file2 = st.file_uploader("Upload Second Excel File", type=["xlsx", "xls"], key="file2")
if uploaded_file1 and uploaded_file2:
# Read Excel files
df1 = pd.read_excel(uploaded_file1)
df2 = pd.read_excel(uploaded_file2)
# Ensure required columns are present
required_columns = {'Table Name', 'Column Name', 'Description'}
if not required_columns.issubset(df1.columns) or not required_columns.issubset(df2.columns):
st.error("Both files must contain 'Table Name', 'Column Name', and 'Description' columns.")
return
# Perform column mapping
column_mapping, similarity_matrix = map_columns(df1, df2)
st.subheader("Column Mapping Based on Description Similarity")
mapping_df = pd.DataFrame(list(column_mapping.items()), columns=['Column from File 1', 'Mapped to Column in File 2'])
st.dataframe(mapping_df)
# Display similarity matrix as heatmap
st.subheader("Similarity Matrix Heatmap")
display_heatmap(similarity_matrix, df1['Column Name'], df2['Column Name'])
# Generate mapping function
st.subheader("Generated Mapping Function")
st.code(generate_mapping_function(column_mapping), language='python')
def map_columns(df1, df2):
# Get descriptions
descriptions_1 = df1['Description'].fillna('').tolist()
descriptions_2 = df2['Description'].fillna('').tolist()
# Vectorize descriptions using TF-IDF
vectorizer = TfidfVectorizer().fit(descriptions_1 + descriptions_2)
tfidf_matrix_1 = vectorizer.transform(descriptions_1)
tfidf_matrix_2 = vectorizer.transform(descriptions_2)
# Compute similarity matrix
similarity_matrix = cosine_similarity(tfidf_matrix_1, tfidf_matrix_2)
# Create mapping based on highest similarity
column_mapping = {}
for i, row in enumerate(similarity_matrix):
best_match_index = row.argmax()
col_1 = df1['Column Name'].iloc[i]
col_2 = df2['Column Name'].iloc[best_match_index]
column_mapping[col_1] = col_2
return column_mapping, similarity_matrix
def generate_mapping_function(column_mapping):
mapping_lines = ["def map_columns(df_source, df_target):", " # Column mapping"]
mapping_lines.append(f" column_mapping = {column_mapping}")
mapping_lines.append(" for src_col, tgt_col in column_mapping.items():")
mapping_lines.append(" if src_col in df_source.columns and tgt_col in df_target.columns:")
mapping_lines.append(" df_target[tgt_col] = df_source[src_col]")
mapping_lines.append(" return df_target")
return '\n'.join(mapping_lines)
def display_heatmap(similarity_matrix, columns_1, columns_2):
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap="YlGnBu",
xticklabels=columns_2, yticklabels=columns_1, ax=ax)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
st.pyplot(fig)
if __name__ == '__main__':
main()