Skip to content

Commit

Permalink
Updated Schema Extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
Cybonto committed Dec 10, 2024
1 parent 39224c9 commit 78c4666
Show file tree
Hide file tree
Showing 13 changed files with 1,057 additions and 827 deletions.
58 changes: 58 additions & 0 deletions streamlit_app/app/pages/Schema_Extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Schema_Extractor.py

app_version = "0.1"
app_title = "OllaLab - Schema Extractor"
app_description = "Schema and Structured Data Extractor from any data source."
app_icon = ":arrow_upper_right:"

import streamlit as st
from schema_extractor import file_uploader
from schema_extractor import sanitizer
from schema_extractor import tabular_data_processor
from schema_extractor import serialized_data_processor
from schema_extractor import unstructured_data_processor

def run_schema_extractor():
"""
Runs the Schema Extractor application page.
Handles user interactions, file uploads, processing options, and displays results.
"""
st.title("Data Schema Extractor")
st.write("Extract data schemas from your datasets easily.")

# Step 1: File Upload and Type Detection
# Use the file_uploader module to handle file uploads and detect file type categories
uploaded_files, file_type_category = file_uploader.upload_files()

# Check if files are uploaded and a file type category is detected
if uploaded_files and file_type_category:
# Step 2: Sanitize Files
# Use the sanitizer module to sanitize uploaded files
sanitized_data = sanitizer.sanitize_files(uploaded_files)

# Step 3: Inform User of Detected File Type Category
st.write(f"Detected file type category: **{file_type_category.capitalize()}**")

# Step 4: Process Data Based on File Type Category
if file_type_category == 'serialized':
st.header("Processing Serialized Data")
# Process serialized data using the serialized_data_processor module
serialized_data_processor.process_serialized_data(sanitized_data)
elif file_type_category == 'tabular':
st.header("Processing Tabular Data")
# Process tabular data using the tabular_data_processor module
tabular_data_processor.process_tabular_data(sanitized_data)
elif file_type_category == 'unstructured':
st.header("Processing Unstructured Data")
# Process unstructured data using the unstructured_data_processor module
unstructured_data_processor.process_unstructured_data(sanitized_data)
else:
# If file type category is unknown or unsupported, display an error
st.error(f"Unsupported file type category: {file_type_category}")
else:
# If no files are uploaded, prompt the user to upload files
st.info("Please upload files to proceed.")

if __name__ == "__main__":
run_schema_extractor()
4 changes: 2 additions & 2 deletions streamlit_app/app/schema_extractor/file_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import streamlit as st
import os
from schema_extractor.utils import detect_file_type
from schema_extractor.utils import detect_file_category

def upload_files():
"""
Expand Down Expand Up @@ -44,7 +44,7 @@ def upload_files():
for uploaded_file in uploaded_files:
try:
# Detect the file type category using the utility function
file_type_category = detect_file_type(uploaded_file)
file_type_category = detect_file_category(uploaded_file)
if file_type_category == 'unknown':
st.warning(f"File '{uploaded_file.name}' has an unsupported file type or cannot be processed.")
return None, None
Expand Down
1 change: 1 addition & 0 deletions streamlit_app/app/schema_extractor/sanitizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# schema_extractor/sanitizer.py

import os
import io
import pandas as pd
import numpy as np
import streamlit as st
Expand Down
21 changes: 20 additions & 1 deletion streamlit_app/app/schema_extractor/schema_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,4 +366,23 @@ def extract_data_from_text(text_data, schema):
if field in schema.get('required', []):
return None

return extracted_data
return extracted_data

def get_pandera_dtype(dtype_str: str):
"""
Maps string representation of data types to Pandera types.
Args:
dtype_str (str): The data type as a string.
Returns:
pandera.Column: The corresponding Pandera data type.
"""
mapping = {
"Int": pa.Int,
"Float": pa.Float,
"String": pa.String,
"Bool": pa.Bool,
"DateTime": pa.DateTime
}
return mapping.get(dtype_str, pa.String)
Loading

0 comments on commit 78c4666

Please sign in to comment.