Skip to content

Commit

Permalink
Updated Schema Extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
Cybonto committed Dec 10, 2024
1 parent 39224c9 commit 78c4666
Show file tree
Hide file tree
Showing 13 changed files with 1,057 additions and 827 deletions.
58 changes: 58 additions & 0 deletions streamlit_app/app/pages/Schema_Extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Schema_Extractor.py

app_version = "0.1"
app_title = "OllaLab - Schema Extractor"
app_description = "Schema and Structured Data Extractor from any data source."
app_icon = ":arrow_upper_right:"

import streamlit as st
from schema_extractor import file_uploader
from schema_extractor import sanitizer
from schema_extractor import tabular_data_processor
from schema_extractor import serialized_data_processor
from schema_extractor import unstructured_data_processor

def run_schema_extractor():
"""
Runs the Schema Extractor application page.
Handles user interactions, file uploads, processing options, and displays results.
"""
st.title("Data Schema Extractor")
st.write("Extract data schemas from your datasets easily.")

# Step 1: File Upload and Type Detection
# Use the file_uploader module to handle file uploads and detect file type categories
uploaded_files, file_type_category = file_uploader.upload_files()

# Check if files are uploaded and a file type category is detected
if uploaded_files and file_type_category:
# Step 2: Sanitize Files
# Use the sanitizer module to sanitize uploaded files
sanitized_data = sanitizer.sanitize_files(uploaded_files)

# Step 3: Inform User of Detected File Type Category
st.write(f"Detected file type category: **{file_type_category.capitalize()}**")

# Step 4: Process Data Based on File Type Category
if file_type_category == 'serialized':
st.header("Processing Serialized Data")
# Process serialized data using the serialized_data_processor module
serialized_data_processor.process_serialized_data(sanitized_data)
elif file_type_category == 'tabular':
st.header("Processing Tabular Data")
# Process tabular data using the tabular_data_processor module
tabular_data_processor.process_tabular_data(sanitized_data)
elif file_type_category == 'unstructured':
st.header("Processing Unstructured Data")
# Process unstructured data using the unstructured_data_processor module
unstructured_data_processor.process_unstructured_data(sanitized_data)
else:
# If file type category is unknown or unsupported, display an error
st.error(f"Unsupported file type category: {file_type_category}")
else:
# If no files are uploaded, prompt the user to upload files
st.info("Please upload files to proceed.")

if __name__ == "__main__":
run_schema_extractor()
4 changes: 2 additions & 2 deletions streamlit_app/app/schema_extractor/file_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import streamlit as st
import os
from schema_extractor.utils import detect_file_type
from schema_extractor.utils import detect_file_category

def upload_files():
"""
Expand Down Expand Up @@ -44,7 +44,7 @@ def upload_files():
for uploaded_file in uploaded_files:
try:
# Detect the file type category using the utility function
file_type_category = detect_file_type(uploaded_file)
file_type_category = detect_file_category(uploaded_file)
if file_type_category == 'unknown':
st.warning(f"File '{uploaded_file.name}' has an unsupported file type or cannot be processed.")
return None, None
Expand Down
1 change: 1 addition & 0 deletions streamlit_app/app/schema_extractor/sanitizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# schema_extractor/sanitizer.py

import os
import io
import pandas as pd
import numpy as np
import streamlit as st
Expand Down
21 changes: 20 additions & 1 deletion streamlit_app/app/schema_extractor/schema_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,4 +366,23 @@ def extract_data_from_text(text_data, schema):
if field in schema.get('required', []):
return None

return extracted_data
return extracted_data

def get_pandera_dtype(dtype_str: str):
"""
Maps string representation of data types to Pandera types.
Args:
dtype_str (str): The data type as a string.
Returns:
pandera.Column: The corresponding Pandera data type.
"""
mapping = {
"Int": pa.Int,
"Float": pa.Float,
"String": pa.String,
"Bool": pa.Bool,
"DateTime": pa.DateTime
}
return mapping.get(dtype_str, pa.String)
Loading

0 comments on commit 78c4666

Please sign in to comment.