detect datetime column on ingestion #975

Merged: 3 commits, merged Oct 28, 2024
102 changes: 94 additions & 8 deletions server/src/event/format.rs
@@ -102,7 +102,7 @@ pub trait EventFormat: Sized {
         if !Self::is_schema_matching(new_schema.clone(), storage_schema, static_schema_flag) {
             return Err(anyhow!("Schema mismatch"));
         }
-        new_schema = update_field_type_in_schema(new_schema, time_partition);
+        new_schema = update_field_type_in_schema(new_schema, None, time_partition, None);
         let rb = Self::decode(data, new_schema.clone())?;
         let tags_arr = StringArray::from_iter_values(std::iter::repeat(&tags).take(rb.num_rows()));
         let metadata_arr =
Expand Down Expand Up @@ -147,19 +147,101 @@ pub trait EventFormat: Sized {
}
}

pub fn get_existing_fields(
inferred_schema: Arc<Schema>,
existing_schema: Option<&HashMap<String, Arc<Field>>>,
) -> Vec<Arc<Field>> {
let mut existing_fields = Vec::new();

for field in inferred_schema.fields.iter() {
if existing_schema.map_or(false, |schema| schema.contains_key(field.name())) {
existing_fields.push(field.clone());
}
}

existing_fields
}

pub fn get_existing_timestamp_fields(
existing_schema: &HashMap<String, Arc<Field>>,
) -> Vec<Arc<Field>> {
let mut timestamp_fields = Vec::new();

for field in existing_schema.values() {
if let DataType::Timestamp(TimeUnit::Millisecond, None) = field.data_type() {
timestamp_fields.push(field.clone());
}
}

timestamp_fields
}

pub fn override_timestamp_fields(
inferred_schema: Arc<Schema>,
existing_timestamp_fields: &[Arc<Field>],
) -> Arc<Schema> {
let timestamp_field_names: Vec<&str> = existing_timestamp_fields
.iter()
.map(|field| field.name().as_str())
.collect();

let updated_fields: Vec<Arc<Field>> = inferred_schema
.fields()
.iter()
.map(|field| {
if timestamp_field_names.contains(&field.name().as_str()) {
Arc::new(Field::new(
field.name(),
DataType::Timestamp(TimeUnit::Millisecond, None),
field.is_nullable(),
))
} else {
field.clone()
}
})
.collect();

Arc::new(Schema::new(updated_fields))
}

pub fn update_field_type_in_schema(
schema: Arc<Schema>,
inferred_schema: Arc<Schema>,
existing_schema: Option<&HashMap<String, Arc<Field>>>,
time_partition: Option<String>,
log_records: Option<&Vec<Value>>,
) -> Arc<Schema> {
let mut updated_schema = inferred_schema.clone();

if let Some(existing_schema) = existing_schema {
let existing_fields = get_existing_fields(inferred_schema.clone(), Some(existing_schema));
let existing_timestamp_fields = get_existing_timestamp_fields(existing_schema);
// overriding known timestamp fields which were inferred as string fields
updated_schema = override_timestamp_fields(updated_schema, &existing_timestamp_fields);
let existing_field_names: Vec<String> = existing_fields
.iter()
.map(|field| field.name().clone())
.collect();

if let Some(log_records) = log_records {
for log_record in log_records {
updated_schema = Arc::new(update_data_type_to_datetime(
(*updated_schema).clone(),
log_record.clone(),
existing_field_names.clone(),
));
}
}
}

if time_partition.is_none() {
return schema;
return updated_schema;
}
let field_name = time_partition.unwrap();
let new_schema: Vec<Field> = schema
let time_partition_field_name = time_partition.unwrap();
let new_schema: Vec<Field> = updated_schema
.fields()
.iter()
.map(|field| {
if *field.name() == field_name {
if *field.name() == time_partition_field_name {
if field.data_type() == &DataType::Utf8 {
let new_data_type = DataType::Timestamp(TimeUnit::Millisecond, None);
Field::new(field.name().clone(), new_data_type, true)
@@ -174,12 +256,16 @@ pub fn update_field_type_in_schema(
     Arc::new(Schema::new(new_schema))
 }

-pub fn update_data_type_to_datetime(schema: Schema, value: Value) -> Schema {
+pub fn update_data_type_to_datetime(
+    schema: Schema,
+    value: Value,
+    ignore_field_names: Vec<String>,
+) -> Schema {
     let new_schema: Vec<Field> = schema
         .fields()
         .iter()
         .map(|field| {
-            if field.data_type() == &DataType::Utf8 {
+            if field.data_type() == &DataType::Utf8 && !ignore_field_names.contains(field.name()) {
                 if let Value::Object(map) = &value {
                     if let Some(Value::String(s)) = map.get(field.name()) {
                         if DateTime::parse_from_rfc3339(s).is_ok() {
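The promotion rule above boils down to: a column inferred as Utf8 becomes Timestamp(Millisecond, None) when a sampled value parses as RFC 3339, unless the field name is in the ignore list of columns the stream already knows. Below is a minimal standalone sketch of that rule — not code from this PR; promote_datetime_fields is an illustrative name, and arrow-schema, chrono, and serde_json are assumed as dependencies.

use arrow_schema::{DataType, Field, Schema, TimeUnit};
use chrono::DateTime;
use serde_json::{json, Value};

fn promote_datetime_fields(schema: Schema, record: &Value, ignore: &[String]) -> Schema {
    let fields: Vec<Field> = schema
        .fields()
        .iter()
        .map(|field| {
            // Only string columns outside the ignore list are probed.
            if field.data_type() == &DataType::Utf8 && !ignore.contains(field.name()) {
                if let Some(Value::String(s)) = record.get(field.name()) {
                    // RFC 3339 is the only format tried; anything else stays Utf8.
                    if DateTime::parse_from_rfc3339(s).is_ok() {
                        return Field::new(
                            field.name(),
                            DataType::Timestamp(TimeUnit::Millisecond, None),
                            true,
                        );
                    }
                }
            }
            field.as_ref().clone()
        })
        .collect();
    Schema::new(fields)
}

fn main() {
    let schema = Schema::new(vec![Field::new("created_at", DataType::Utf8, true)]);
    let record = json!({ "created_at": "2024-10-28T10:15:00Z" });
    let promoted = promote_datetime_fields(schema, &record, &[]);
    println!("{:?}", promoted.field(0).data_type()); // Timestamp(Millisecond, None)
}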
2 changes: 2 additions & 0 deletions server/src/event/format/json.rs
@@ -71,7 +71,9 @@ impl EventFormat for Event {
             Ok(mut infer_schema) => {
                 let new_infer_schema = super::super::format::update_field_type_in_schema(
                     Arc::new(infer_schema),
+                    Some(&stream_schema),
                     time_partition,
+                    Some(&value_arr),
                 );
                 infer_schema = Schema::new(new_infer_schema.fields().clone());
                 if let Err(err) = Schema::try_merge(vec![
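For context, this call site is where an existing stream schema pins known timestamp columns back to Timestamp even though JSON inference sees plain strings. A hypothetical usage sketch, assuming update_field_type_in_schema from format.rs above is in scope; the stream map and records here are made up for illustration.

use std::collections::HashMap;
use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema, TimeUnit};
use serde_json::json;

fn example() {
    // The stream already stores `created_at` as a millisecond timestamp.
    let stream_schema: HashMap<String, Arc<Field>> = HashMap::from([(
        "created_at".to_owned(),
        Arc::new(Field::new(
            "created_at",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            true,
        )),
    )]);

    // JSON inference sees the same column as a plain string.
    let inferred = Arc::new(Schema::new(vec![Field::new(
        "created_at",
        DataType::Utf8,
        true,
    )]));
    let records = vec![json!({ "created_at": "2024-10-28T10:15:00Z" })];

    // No time partition here; the override alone fixes the type.
    let updated = update_field_type_in_schema(inferred, Some(&stream_schema), None, Some(&records));
    assert_eq!(
        updated.field_with_name("created_at").unwrap().data_type(),
        &DataType::Timestamp(TimeUnit::Millisecond, None)
    );
}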
8 changes: 4 additions & 4 deletions server/src/handlers/http/logstream.rs
@@ -93,7 +93,7 @@ pub async fn list(_: HttpRequest) -> impl Responder {

 pub async fn detect_schema(body: Bytes) -> Result<impl Responder, StreamError> {
     let body_val: Value = serde_json::from_slice(&body)?;
-    let value_arr: Vec<Value> = match body_val {
+    let log_records: Vec<Value> = match body_val {
         Value::Array(arr) => arr,
         value @ Value::Object(_) => vec![value],
         _ => {
@@ -104,9 +104,9 @@ pub async fn detect_schema(body: Bytes) -> Result<impl Responder, StreamError> {
         }
     };

-    let mut schema = infer_json_schema_from_iterator(value_arr.iter().map(Ok)).unwrap();
-    for value in value_arr {
-        schema = update_data_type_to_datetime(schema, value);
+    let mut schema = infer_json_schema_from_iterator(log_records.iter().map(Ok)).unwrap();
+    for log_record in log_records {
+        schema = update_data_type_to_datetime(schema, log_record, Vec::new());
     }
     Ok((web::Json(schema), StatusCode::OK))
 }
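The handler's two steps can be reproduced in isolation: infer a schema from the raw records, then let each record promote any remaining string columns. A sketch assuming arrow-json's infer_json_schema_from_iterator and the updated update_data_type_to_datetime from this PR are in scope; the Vec::new() ignore list reflects that a standalone detection request has no existing stream schema to protect.

use arrow_json::reader::infer_json_schema_from_iterator;
use serde_json::json;

fn detect() -> Result<(), Box<dyn std::error::Error>> {
    let log_records = vec![
        json!({ "level": "info", "time": "2024-10-28T10:15:00Z" }),
        json!({ "level": "warn", "time": "2024-10-28T10:16:00Z" }),
    ];

    // Same two steps as the handler: infer, then promote per record.
    let mut schema = infer_json_schema_from_iterator(log_records.iter().map(Ok))?;
    for log_record in log_records {
        schema = update_data_type_to_datetime(schema, log_record, Vec::new());
    }

    // `time` ends up as Timestamp(Millisecond, None); `level` stays Utf8.
    println!("{:#?}", schema);
    Ok(())
}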