feat: streaming json document/array data #2494

Merged: 19 commits, Jan 30, 2024
29 changes: 15 additions & 14 deletions Cargo.lock


1 change: 1 addition & 0 deletions crates/datasources/Cargo.toml
@@ -73,6 +73,7 @@ lance = { git = "https://github.com/universalmind303/lance", rev = "ffd4ac6ee2c6
bson = "2.7.0"
scylla = { version = "0.11.1" }
glob = "0.3.1"
indexmap = "2.1.0"

# SSH tunnels
[target.'cfg(any(target_os = "linux", target_os = "macos"))'.dependencies]
33 changes: 33 additions & 0 deletions crates/datasources/src/json/errors.rs
@@ -0,0 +1,33 @@
use datafusion_ext::errors::ExtensionError;

use crate::object_store::errors::ObjectStoreSourceError;

#[derive(Debug, thiserror::Error)]
pub enum JsonError {
    #[error("Unsupported json type: {0}")]
    UnsupportedType(&'static str),

    #[error(transparent)]
    SerdeJson(#[from] serde_json::Error),

    #[error("no objects found {0}")]
    NotFound(String),

    #[error(transparent)]
    ObjectStoreSource(#[from] ObjectStoreSourceError),

    #[error(transparent)]
    ObjectStore(#[from] object_store::Error),

    #[error(transparent)]
    Arrow(#[from] datafusion::arrow::error::ArrowError),

    #[error(transparent)]
    Datafusion(#[from] datafusion::error::DataFusionError),
}

impl From<JsonError> for ExtensionError {
    fn from(e: JsonError) -> Self {
        ExtensionError::String(e.to_string())
    }
}
3 changes: 3 additions & 0 deletions crates/datasources/src/json/mod.rs
@@ -0,0 +1,3 @@
mod errors;
mod stream;
pub mod table;
83 changes: 83 additions & 0 deletions crates/datasources/src/json/stream.rs
@@ -0,0 +1,83 @@
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};

use datafusion::arrow::datatypes::{Schema, SchemaRef};
use datafusion::arrow::json::ReaderBuilder;
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::DataFusionError;
use datafusion::execution::TaskContext;
use datafusion::physical_plan::streaming::PartitionStream;
use datafusion::physical_plan::{RecordBatchStream, SendableRecordBatchStream};
use futures::{Stream, StreamExt};
use serde_json::{Map, Value};

type SendableCheckedRecordBatchStream =
    Pin<Box<dyn Stream<Item = Result<RecordBatch, DataFusionError>> + Send>>;

pub struct JsonStream {
    schema: Arc<Schema>,
    stream: SendableCheckedRecordBatchStream,
}

impl Stream for JsonStream {
    type Item = Result<RecordBatch, DataFusionError>;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        self.stream.poll_next_unpin(cx)
    }
}

impl RecordBatchStream for JsonStream {
    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }
}

pub struct JsonPartitionStream {
    schema: Arc<Schema>,
    stream: Mutex<Option<SendableCheckedRecordBatchStream>>,
}

impl PartitionStream for JsonPartitionStream {
    fn schema(&self) -> &SchemaRef {
        &self.schema
    }

    fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
        let partition = self
            .stream
            .lock()
            .unwrap()
            .take()
            .expect("stream to only be called once")
            .boxed();

        Box::pin(JsonStream {
            schema: self.schema.clone(),
            stream: partition,
        })
    }
}

impl JsonPartitionStream {
    pub fn new(schema: Arc<Schema>, chunk: Vec<Map<String, Value>>) -> Self {
        let stream_schema = schema.clone();
        let stream = futures::stream::iter(chunk)
            .chunks(25)
Contributor: Why 25?

Contributor Author: It's less than 100; other numbers could be good too.

Contributor Author: Do you have a suggestion?

Contributor: I guess I don't understand why we chunk it, then chunk it again?

Contributor Author: These streaming tables work by collecting a number of streams (from files, though in this case we will have read them all into memory, because we need/want to get the schema, and do that losslessly). Then the stream produces record batches. The first chunk gives us the partitions; the second gives us the size of each record batch. Following your earlier comment, I made the record batch size 1000 and the stream size 10,000.

            .map(move |objs| {
                let mut decoder =
                    ReaderBuilder::new(stream_schema.clone().to_owned()).build_decoder()?;
                decoder
                    .serialize(&objs)
                    .map_err(|e| DataFusionError::External(Box::new(e)))?;
                Ok(decoder.flush()?.unwrap())
            })
            .boxed();

        Self {
            schema: schema.clone(),
            stream: Mutex::new(Some(stream)),
        }
    }
}
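
For readers following the chunk-size thread above: the outer chunking (in table.rs, over the collected documents) decides how many JsonPartitionStream partitions the table exposes, while the inner .chunks(..) here decides how many documents go into each RecordBatch. The sketch below shows the decode step a single inner chunk goes through, using arrow's JSON Decoder the same way the .map closure does; the schema, field names, and sample documents are invented for illustration and are not part of the PR.

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::json::ReaderBuilder;
use serde_json::{json, Map, Value};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A schema like the one table.rs infers by unioning document keys.
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int64, true),
        Field::new("b", DataType::Utf8, true),
    ]));

    // One inner chunk of documents, i.e. the rows of a single record batch.
    let docs: Vec<Map<String, Value>> = vec![
        json!({"a": 1, "b": "x"}).as_object().unwrap().clone(),
        json!({"a": 2}).as_object().unwrap().clone(), // missing keys decode as nulls
    ];

    // Same decode path as the .map closure: serialize the serde_json maps,
    // then flush a RecordBatch out of the decoder.
    let mut decoder = ReaderBuilder::new(schema).build_decoder()?;
    decoder.serialize(&docs)?;
    let batch = decoder.flush()?.expect("serialized rows produce a batch");
    assert_eq!(batch.num_rows(), 2);
    Ok(())
}
```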
135 changes: 135 additions & 0 deletions crates/datasources/src/json/table.rs
@@ -0,0 +1,135 @@
use std::sync::Arc;
use std::vec::Vec;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::streaming::StreamingTable;
use datafusion::datasource::TableProvider;
use datafusion::physical_plan::streaming::PartitionStream;
use serde_json::{Map, Value};

use super::stream::JsonPartitionStream;
use crate::common::url::DatasourceUrl;
use crate::json::errors::JsonError;
use crate::object_store::generic::GenericStoreAccess;
use crate::object_store::ObjStoreAccess;

pub async fn json_streaming_table(
    store_access: GenericStoreAccess,
    source_url: DatasourceUrl,
) -> Result<Arc<dyn TableProvider>, JsonError> {
    let path = source_url.path();

    let store = store_access.create_store()?;

    // assume that the file type is a glob and see if there are
    // more files...
    let mut list = store_access.list_globbed(&store, path.as_ref()).await?;

    if list.is_empty() {
        return Err(JsonError::NotFound(path.into_owned()));
    }

    // for consistent results, particularly for the sample, always
    // sort by location
    list.sort_by(|a, b| a.location.cmp(&b.location));

    let mut data = Vec::new();
    for obj in list {
        let blob = store.get(&obj.location).await?.bytes().await?.to_vec();
        let dejson = serde_json::from_slice::<serde_json::Value>(blob.as_slice())?.to_owned();
        push_unwind_json_values(&mut data, dejson)?;
    }

    let mut field_set = indexmap::IndexMap::<String, DataType>::new();
    for obj in &data {
        for key in obj.keys() {
Contributor: Doesn't this iterate over all rows? We are already iterating over it when it's passed into arrow's decoder. It seems reasonable that we could deserialize the json in a single pass instead of two.

Contributor Author: The loop on 37 iterates over all files/urls (in the case of a glob) and in most cases will only run once.

push_unwind_... doesn't iterate over the document; it just builds the list of documents, erroring if we get something that isn't usable (a scalar). This is either a single document or an array of documents, and it unwinds the array when needed. We could extract the schema in this function, but:

  • I think the code would get more complicated, and this is pretty easy to understand.
  • In a future where you might specify a schema to the table provider, the code in its current form is easier to modify in that direction.
  • It would be easy to end up with code where you were iterating less but doing more allocations to temporary data structures to keep track as you go (particularly if you have multiple files), and I feel like two loops are likely to be faster than one loop with a lot of allocation pressure.
  • My baseline assumption is that it's always going to be relatively small amounts of data (a few megs tops).
  • In the error case you end up building a schema that you're never going to use, and errors might end up being pretty common just because of malformed REST APIs.

Contributor, quoting the above: "my baseline assumption is that it's always going to be relatively small amounts of data (a few megs tops)". I don't think we should be making any assumptions about the size of the data, especially considering the globbing support. I could see someone trying to read in a directory of hundreds or thousands of small/medium json files.

Contributor Author: There are a couple of assumptions here:

  • The primary use case will be pulling information off of HTTP endpoints; it's just convenient to use the object store stuff. We want to say "point glaredb at an endpoint and you get a SQL table." Files with arbitrary data are fine too, but that's sort of something that comes for free.
  • There are a lot of things that are kind of inefficient here because I want the experience in the above use case to not lose data (e.g. drop something from later documents because it's not in the schema). When we let folks pass schema clauses into functions and external tables, we can update these functions and also provide schema inference, and then we should change the features.

Unless we want to shelve this for a while while we figure out how to handle the schema, either all the data have to be in memory or we have to parse all the files twice. I opted for the former based on the assumptions about the use case. I'm not opposed to addressing the "many small json files" or "data size larger than (I dunno) a gig" use cases, but I'm also fine if we punt on those, as there are workarounds (use polars, or convert to ndjson), and it would not be difficult to add schema inference, an explicit schema, and/or lazy file reading to this implementation.

Contributor (@universalmind303), Jan 26, 2024: I mostly agree with these assumptions, but I do think there is another path forward (particularly for the multi-file case). I don't think the assumption that all the data have to be in memory is correct.

At any given time, we only need to hold a single file in memory. All of our file-based readers expect the same schema across multiple files/globs, so we can apply that same logic here and only use the first file to extract the schema. From there, we can defer fetching and parsing of the remaining files into the streams.

This would be especially useful for (eventual) limit pushdowns. If you only want limit 1000 out of thousands of files and each file has 1000 rows, you could stop after the first file, saving n-1 cycles (both IO and CPU).

Contributor Author: The streaming table handling basically pushes down limits implicitly, because the streams are iterated lazily.

I implemented the "all of the first file, nothing more" schema inference. I think this will be ok for now.

The globbing definitely doesn't work (or at least I couldn't provoke it to work); it's probably a regression, but the bson code is subject to the same flaw, and we're not doing anything except using the library functions, which at least for now renders the multi-file use case somewhat moot.

We can dig into this next week.

            if field_set.contains_key(key) {
                continue;
            }
            field_set.insert(key.to_string(), type_for_value(obj.get(key).unwrap()));
        }
    }

    let schema = Arc::new(Schema::new(
        field_set
            .iter()
            .map(|(k, v)| Field::new(k, v.to_owned(), true))
            .collect::<Vec<_>>(),
    ));

    let chunks = data
        .chunks(100)
        .map(|chunk| -> Arc<dyn PartitionStream> {
            Arc::new(JsonPartitionStream::new(
                schema.clone(),
                chunk.to_vec().to_owned(),
            ))
        })
        .collect::<Vec<_>>();

    Ok(Arc::new(StreamingTable::try_new(schema.clone(), chunks)?))
}

fn push_unwind_json_values(
    data: &mut Vec<Map<String, Value>>,
    val: Value,
) -> Result<(), JsonError> {
    match val {
        Value::Array(vals) => {
            for v in vals {
                match v {
                    Value::Object(doc) => data.push(doc),
                    Value::Null => data.push(Map::new()),
                    _ => {
                        return Err(JsonError::UnsupportedType(
                            "only objects and arrays of objects are supported",
                        ))
                    }
                }
            }
        }
        Value::Object(doc) => data.push(doc),
        Value::Null => data.push(Map::new()),
        _ => {
            return Err(JsonError::UnsupportedType(
                "only objects and arrays of objects are supported",
            ))
        }
    };
    Ok(())
}
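
To pin down the behavior discussed in the review thread above (arrays unwound into one entry per object, nulls mapped to empty documents, scalars rejected), a test along these lines could sit at the bottom of table.rs. This is a hedged sketch, not part of the PR; it assumes push_unwind_json_values stays private to this module so a #[cfg(test)] block can call it directly.

```rust
#[cfg(test)]
mod tests {
    use serde_json::{json, Map, Value};

    use super::push_unwind_json_values;

    #[test]
    fn unwinds_arrays_and_rejects_scalars() {
        let mut data: Vec<Map<String, Value>> = Vec::new();

        // An array of documents is flattened into one entry per document,
        // and a null element becomes an empty document.
        push_unwind_json_values(&mut data, json!([{"a": 1}, {"b": 2}, null])).unwrap();
        assert_eq!(data.len(), 3);
        assert!(data[2].is_empty());

        // A bare scalar is neither a document nor an array of documents.
        assert!(push_unwind_json_values(&mut data, json!(42)).is_err());
    }
}
```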

fn type_for_value(value: &Value) -> DataType {
    match value {
        Value::Array(v) => {
            if v.is_empty() {
                DataType::List(Arc::new(Field::new("", DataType::Null, true)))
            } else {
                DataType::List(Arc::new(Field::new(
                    "",
                    type_for_value(v.first().unwrap()),
Contributor (@universalmind303), Jan 26, 2024: This'll definitely cause some issues later on. I made this same mistake in the polars json parser, and we ended up getting a bunch of issues from parsing errors due to schema mismatches. We ended up needing to infer the super type for the values in the array.

For example, an array of [null, "some string", "some other string"] will result in List<Null> instead of List<Utf8>. Similarly, an array of [0, 1.0, 2.2] will get parsed as List<Int64> instead of List<Float64>.

Contributor: At a minimum, we should error out if the values in the array are of different types.

Contributor Author: Yeah, I thought of this and decided not to care initially. There are a lot of edge cases that we'll continue to not handle very well ([null, "foo", 1], plus arrays of objects, ...).

I can do something that replaces nulls with something that isn't a null (that'd cover a lot of cases); sort of everything else gets much hairier.
                    true,
                )))
            }
        }
        Value::String(_) => DataType::Utf8,
        Value::Bool(_) => DataType::Boolean,
        Value::Null => DataType::Null,
        Value::Number(n) => {
            if n.is_i64() {
                DataType::Int64
            } else if n.is_u64() {
                DataType::UInt64
            } else {
                DataType::Float64
            }
        }
        Value::Object(obj) => {
            let mut fields = Vec::with_capacity(obj.len());
            for (k, v) in obj.iter() {
                fields.push(Field::new(k, type_for_value(v), true))
            }
            DataType::Struct(fields.into())
        }
    }
}
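
On the List element-type thread above: one possible shape for the super-type inference the reviewer suggests is sketched below. It is not code from the PR; the supertype and element_type helpers are hypothetical, and the widening rules are deliberately partial, just enough to make [null, "some string"] infer Utf8 and [0, 1.0, 2.2] infer Float64.

```rust
use datafusion::arrow::datatypes::DataType;
use serde_json::Value;

// Hypothetical helper: widen two inferred types to a common super type.
fn supertype(a: DataType, b: DataType) -> DataType {
    use DataType::*;
    match (a, b) {
        (x, y) if x == y => x,
        // Nulls defer to the other side, so [null, "s"] infers Utf8 rather than Null.
        (Null, other) | (other, Null) => other,
        // Mixed integer/float arrays widen to Float64, e.g. [0, 1.0, 2.2].
        (Int64, Float64) | (Float64, Int64) | (UInt64, Float64) | (Float64, UInt64) => Float64,
        // Anything else falls back to strings rather than erroring.
        _ => Utf8,
    }
}

// Hypothetical replacement for the first-element lookup: fold the inferred
// type of every element instead of trusting only the first one.
fn element_type(values: &[Value], type_for_value: impl Fn(&Value) -> DataType) -> DataType {
    values
        .iter()
        .map(|v| type_for_value(v))
        .fold(DataType::Null, supertype)
}
```

With something like this, the Value::Array arm of type_for_value could build its Field from element_type(v, type_for_value) instead of type_for_value(v.first().unwrap()), which is exactly the List<Null> versus List<Utf8> case the reviewer calls out.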