From 48464288cee73036df1e2b1ca7969ab8b99a6440 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Sat, 30 Dec 2023 21:33:40 -0500 Subject: [PATCH 01/10] fix: bson builder should handle schema flexibly --- crates/datasources/src/bson/builder.rs | 60 +++++++++++++++++++++++--- crates/datasources/src/bson/stream.rs | 3 +- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index cea755d08..bea6472fa 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -88,9 +88,7 @@ impl RecordStructBuilder { } // Add value to columns. - let typ = self.fields.get(idx).unwrap().data_type(); // Programmer error if data type doesn't exist. - let col = self.builders.get_mut(idx).unwrap(); // Programmer error if this doesn't exist. - append_value(val, typ, col.as_mut())?; + self.add_value_at_index(idx, Some(val))?; // Track which columns we've added values to. cols_set.set(idx, true); @@ -102,10 +100,48 @@ impl RecordStructBuilder { // Append nulls to all columns not included in the doc. for (idx, did_set) in cols_set.iter().enumerate() { if !did_set { - // Add nulls... - let typ = self.fields.get(idx).unwrap().data_type(); // Programmer error if data type doesn't exist. - let col = self.builders.get_mut(idx).unwrap(); // Programmer error if column doesn't exist. - append_null(typ, col.as_mut())?; + // Add null... + self.add_value_at_index(idx, None)?; + } + } + + Ok(()) + } + + pub fn project_and_append(&mut self, doc: &RawDocument) -> Result<()> { + let mut cols_set: BitVec = BitVec::repeat(false, self.fields.len()); + + for iter_result in doc { + match iter_result { + Ok((key, val)) => { + if let Some(idx) = self.field_index.get(key) { + let idx = idx.to_owned(); + + if cols_set.get(idx).is_some_and(|v| v == true) { + // TODO: if this happens it means that the bson document has a field + // name that appears more than once. This is legal and possible to build + // with some libraries but isn't forbidden, and (I think?) historically + // not (always?) rejected by MongoDB. Regardless "ignoring second + // appearances of the key" is a reasonable semantic. + continue; + } + + // Add value to columns. + self.add_value_at_index(idx, Some(val))?; + + // Track which columns we've added values to. + cols_set.set(idx, true); + }; + } + Err(_) => return Err(BsonError::FailedToReadRawBsonDocument), + } + } + + // Append nulls to all columns not included in the doc. + for (idx, did_set) in cols_set.iter().enumerate() { + if !did_set { + // Add null... + self.add_value_at_index(idx, None)?; } } @@ -115,6 +151,16 @@ impl RecordStructBuilder { pub fn into_fields_and_builders(self) -> (Fields, Vec>) { (self.fields, self.builders) } + + fn add_value_at_index(&mut self, idx: usize, val: Option) -> Result<()> { + let typ = self.fields.get(idx).unwrap().data_type(); // Programmer error if data type doesn't exist. + let col = self.builders.get_mut(idx).unwrap(); // Programmer error if column doesn't exist. + + match val { + Some(v) => append_value(v, typ, col.as_mut()), + None => append_null(typ, col.as_mut()), + } + } } impl ArrayBuilder for RecordStructBuilder { diff --git a/crates/datasources/src/bson/stream.rs b/crates/datasources/src/bson/stream.rs index dd272254c..7635ef3f1 100644 --- a/crates/datasources/src/bson/stream.rs +++ b/crates/datasources/src/bson/stream.rs @@ -58,8 +58,7 @@ impl BsonStream { let mut builder = RecordStructBuilder::new_with_capacity(schema.fields().to_owned(), 100)?; for result in results { - // TOOD: shouldn't convert here. - builder.append_record(&result?)?; + builder.project_and_append(&result?)?; } let (fields, builders) = builder.into_fields_and_builders(); From 931748feebb01ef07e143451bc99801650580ebd Mon Sep 17 00:00:00 2001 From: Sam Kleinman Date: Mon, 1 Jan 2024 20:34:12 -0500 Subject: [PATCH 02/10] Update crates/datasources/src/bson/builder.rs Co-authored-by: Sean Smith --- crates/datasources/src/bson/builder.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index bea6472fa..8ff2bda61 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -114,8 +114,7 @@ impl RecordStructBuilder { for iter_result in doc { match iter_result { Ok((key, val)) => { - if let Some(idx) = self.field_index.get(key) { - let idx = idx.to_owned(); + if let Some(&idx) = self.field_index.get(key) { if cols_set.get(idx).is_some_and(|v| v == true) { // TODO: if this happens it means that the bson document has a field From 5118e65ce41551b42147e26a297e338769f200cf Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 09:07:52 -0500 Subject: [PATCH 03/10] fixup --- crates/datasources/src/bson/builder.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index 601d3ed06..d0e58682a 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -84,7 +84,7 @@ impl RecordStructBuilder { .ok_or_else(|| BsonError::ColumnNotInInferredSchema(key.to_string()))?; if *cols_set.get(idx).unwrap() { - println!("DUPLICATE SET: {}, {:?}", key, doc); + continue; } // Add value to columns. @@ -115,9 +115,8 @@ impl RecordStructBuilder { match iter_result { Ok((key, val)) => { if let Some(&idx) = self.field_index.get(key) { - if cols_set.get(idx).is_some_and(|v| v == true) { - // TODO: if this happens it means that the bson document has a field + // If this happens it means that the bson document has a field // name that appears more than once. This is legal and possible to build // with some libraries but isn't forbidden, and (I think?) historically // not (always?) rejected by MongoDB. Regardless "ignoring second From 122d22594af7c1b4cc35354a54473e6dde759bc5 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 09:48:00 -0500 Subject: [PATCH 04/10] make bson test weird --- tests/tests/test_bson.py | 93 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/tests/test_bson.py b/tests/tests/test_bson.py index 700f73ffa..522849894 100644 --- a/tests/tests/test_bson.py +++ b/tests/tests/test_bson.py @@ -88,3 +88,96 @@ def test_read_bson( assert len(row) == 5 assert row["beatle_name"] in beatles assert beatles.index(row["beatle_name"]) == row["beatle_idx"] - 1 + + +def test_read_bson( + glaredb_connection: psycopg2.extensions.connection, + tmp_path_factory: pytest.TempPathFactory, +): + beatles = {"john": (0, 1940), "paul": (1, 1942), "george": (2, 1943), "ringo": (3, 1940)} + + tmp_dir = tmp_path_factory.mktemp(basename="read-bson-beatles-", numbered=True) + data_path = tmp_dir.joinpath("beatles.208.bson") + + counts = {"john": 0, "paul": 0, "george": 0, "ringo": 0} + + keys = [b for b in beatles.keys()] + # construct a + with open(data_path, "wb") as f: + for i in range(100): + beatle = random.choice(keys) + counts[beatle] += 1 + f.write( + bson.encode( + { + "_id": bson.objectid.ObjectId(), + "beatle_idx": beatles[beatle][0], + "beatle_name": beatle, + "case": i + 1, + } + ) + ) + + for beatle in counts: + f.write( + bson.encode( + { + "_id": bson.objectid.ObjectId(), + "beatle_name": beatle, + "count": counts[beatle], + } + ) + ) + + for i in range(100): + beatle = random.choice(keys) + counts[beatle] += 1 + f.write( + bson.encode( + { + "_id": bson.objectid.ObjectId(), + "beatle_name": beatle, + "birth_year": beatles[beatle][1], + "beatle_idx": beatles[beatle][0], + } + ) + ) + + for beatle in counts: + f.write( + bson.encode( + { + "_id": bson.objectid.ObjectId(), + "beatle_name": beatle, + "count": counts[beatle], + } + ) + ) + + with glaredb_connection.cursor() as curr: + curr.execute( + f"create external table bson_beatles from bson options ( location='{data_path}', file_type='bson')" + ) + + for from_clause in ["bson_beatles", f"read_bson('{data_path}')"]: + with glaredb_connection.cursor() as curr: + curr.execute(f"select count(*) from {from_clause}") + r = curr.fetchone() + assert r[0] == 208 + + with glaredb_connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as curr: + curr.execute(f"select * from {from_clause}") + rows = curr.fetchall() + assert len(rows) == 208 + meta_docs = 0 + for row in rows: + assert len(row) == 4 + assert row["beatle_name"] in beatles + assert "count" not in row, "count is not inferred in the schema" + + if row["beatle_idx"] is not None: + assert beatles[row["beatle_name"]][0] == row["beatle_idx"] + else: + meta_docs += 1 + + assert meta_docs == 8 From 446f54cb81c9593b96ab15cc0408b8071140cc46 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 10:59:41 -0500 Subject: [PATCH 05/10] better handling --- crates/datasources/src/bson/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index d0e58682a..3ec042eac 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -332,7 +332,7 @@ fn append_value(val: RawBsonRef, typ: &DataType, col: &mut dyn ArrayBuilder) -> .as_any_mut() .downcast_mut::() .unwrap(); - builder.append_record(nested)?; + builder.project_and_append(nested)?; } // Array From 64b02b1b8a7a423547363b24a14ed8ed3f3d2326 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 12:24:20 -0500 Subject: [PATCH 06/10] test example --- tests/tests/dupes.bson | Bin 0 -> 6400 bytes tests/tests/scripts/go.mod | 7 +++++++ tests/tests/scripts/go.sum | 6 ++++++ tests/tests/scripts/make_bad_bson.go | 27 +++++++++++++++++++++++++++ tests/tests/test_bson.py | 25 +++++++++++++++++++++++++ 5 files changed, 65 insertions(+) create mode 100644 tests/tests/dupes.bson create mode 100644 tests/tests/scripts/go.mod create mode 100644 tests/tests/scripts/go.sum create mode 100644 tests/tests/scripts/make_bad_bson.go diff --git a/tests/tests/dupes.bson b/tests/tests/dupes.bson new file mode 100644 index 0000000000000000000000000000000000000000..1e26e66ed0791a84e750027b17024f776a7471f2 GIT binary patch literal 6400 zcmbu$H&R;x07cOc1!}z!c0fTv%K|($QC}DVmINY*989oH&N=676HQzJtMC@Q0aQ@A zV@~((yg7wI5Jbx2VGx@s`uF#x;A4Jq~ z(Emg~jD7_DDEcw<QPokefKaG9{{Ve)9^z-N!&@ZB2LcffD1^p`eHT3J~H_&gQ z-$K8Qeh2+7`aSgf=nv2zqCY}^jQ#}uDf%Wv QJ^BaqkLaJ!KflAj0v9!7wEzGB literal 0 HcmV?d00001 diff --git a/tests/tests/scripts/go.mod b/tests/tests/scripts/go.mod new file mode 100644 index 000000000..02b1110ce --- /dev/null +++ b/tests/tests/scripts/go.mod @@ -0,0 +1,7 @@ +module github.com/glaredb/glaredb/tests + +go 1.21.5 + +require github.com/tychoish/fun v0.10.8 + +require github.com/tychoish/birch v0.2.3-0.20230825153023-79636ce03829 // indirect diff --git a/tests/tests/scripts/go.sum b/tests/tests/scripts/go.sum new file mode 100644 index 000000000..156292176 --- /dev/null +++ b/tests/tests/scripts/go.sum @@ -0,0 +1,6 @@ +github.com/tychoish/birch v0.2.2 h1:+qiWibwCa8UoJvRFlpUxF6rLlGfnbNWKcIL8dkvY4Lo= +github.com/tychoish/birch v0.2.2/go.mod h1:k/5kBT4z40uY6vqQPJ3IBOU6E3/YMyIlUkerx3qY/Vo= +github.com/tychoish/birch v0.2.3-0.20230825153023-79636ce03829 h1:Lt2IbZbnSk49LfYZTyfCz22Pd5q6siVhS7xj3ELzIco= +github.com/tychoish/birch v0.2.3-0.20230825153023-79636ce03829/go.mod h1:Gv3VW01RrjmgGWJgxpT+wh71IKFV2vJOe7M00i+TFgE= +github.com/tychoish/fun v0.10.8 h1:NBC8ug2cRYGKzlD70+r/tz4lkwfKAshFv9aaxHoYNwQ= +github.com/tychoish/fun v0.10.8/go.mod h1:ZZfrwtsnHHV81ecZCBPp57DjjYY9Io39JH2QSXNpKn4= diff --git a/tests/tests/scripts/make_bad_bson.go b/tests/tests/scripts/make_bad_bson.go new file mode 100644 index 000000000..e71d2233d --- /dev/null +++ b/tests/tests/scripts/make_bad_bson.go @@ -0,0 +1,27 @@ +package main + +import ( + "os" + + "github.com/tychoish/birch" + "github.com/tychoish/birch/types" + "github.com/tychoish/fun" + "github.com/tychoish/fun/ft" +) + +func main() { + fn, err := os.Create("dupes.bson") + fun.Invariant.Must(err) + defer func() { fun.Invariant.Must(fn.Close()) }() + + for i := 0; i < 100; i++ { + n := ft.Must( + birch.DC.Make(3).Append( + birch.EC.ObjectID("_id", types.NewObjectID()), + birch.EC.Int32("idx", 1), + birch.EC.String("dupe", "first"), + birch.EC.String("dupe", "second"), + ).WriteTo(fn)) + fun.Invariant.IsTrue(n == 64, "unexpected document length", n) + } +} diff --git a/tests/tests/test_bson.py b/tests/tests/test_bson.py index 522849894..6f4f87014 100644 --- a/tests/tests/test_bson.py +++ b/tests/tests/test_bson.py @@ -90,6 +90,31 @@ def test_read_bson( assert beatles.index(row["beatle_name"]) == row["beatle_idx"] - 1 +def test_read_bson_multiple_fields( + glaredb_connection: psycopg2.extensions.connection, + tmp_path_factory: pytest.TempPathFactory, +): + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "dupes.bson") + + with glaredb_connection.cursor() as curr: + curr.execute( + f"create external table dupes from bson options ( location='{data_path}', file_type='bson')" + ) + + for from_clause in ["dupes", f"read_bson('{data_path}')"]: + with glaredb_connection.cursor() as curr: + curr.execute(f"select count(*) from {from_clause}") + r = curr.fetchone() + assert r[0] == 100 + with glaredb_connection.cursor() as curr: + curr.execute(f"select * from {from_clause}") + rows = curr.fetchall() + assert len(rows) == 100 + for row in rows: + assert len(row) == 3 + assert row[2] == "first", row + + def test_read_bson( glaredb_connection: psycopg2.extensions.connection, tmp_path_factory: pytest.TempPathFactory, From 47b0542ba16c5f757acd5baeddde4b22e63d3d97 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 16:35:34 -0500 Subject: [PATCH 07/10] add it as a unit test --- crates/datasources/src/bson/builder.rs | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index 3ec042eac..1eff54966 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -454,3 +454,45 @@ fn column_builders_for_fields( Ok(cols) } + +#[cfg(test)] +mod test { + use bson::oid::ObjectId; + + use super::*; + + #[test] + fn test_duplicate_field_handling() { + let fields = Fields::from_iter(vec![ + Field::new("_id", DataType::Binary, true), + Field::new("idx", DataType::Int64, true), + Field::new("value", DataType::Utf8, true), + ]); + let mut rsb = RecordStructBuilder::new_with_capacity(fields, 100).unwrap(); + for idx in 0..100 { + let mut buf = bson::RawDocumentBuf::new(); + + buf.append("_id", ObjectId::new()); + buf.append("idx", idx as i64); + buf.append("value", "first"); + buf.append("value", "second"); + + rsb.append_record(RawDocument::from_bytes(&buf.into_bytes()).unwrap()) + .unwrap(); + } + assert_eq!(rsb.len(), 100); + for value in rsb + .builders + .get_mut(2) + .unwrap() + .as_any_mut() + .downcast_mut::() + .unwrap() + .finish_cloned() + .iter() + { + let v = value.unwrap(); + assert_eq!(v, "first"); + } + } +} From f07f2b16960ca5395c21a93bd9d77bf0e7c7a7c7 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 16:41:47 -0500 Subject: [PATCH 08/10] write it as a unittest --- crates/datasources/src/bson/builder.rs | 1 + tests/tests/scripts/go.mod | 7 ------- tests/tests/scripts/go.sum | 6 ------ tests/tests/scripts/make_bad_bson.go | 27 -------------------------- tests/tests/test_bson.py | 25 ------------------------ 5 files changed, 1 insertion(+), 65 deletions(-) delete mode 100644 tests/tests/scripts/go.mod delete mode 100644 tests/tests/scripts/go.sum delete mode 100644 tests/tests/scripts/make_bad_bson.go diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index 1eff54966..d40440bf8 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -476,6 +476,7 @@ mod test { buf.append("idx", idx as i64); buf.append("value", "first"); buf.append("value", "second"); + assert_eq!(buf.iter().count(), 4); rsb.append_record(RawDocument::from_bytes(&buf.into_bytes()).unwrap()) .unwrap(); diff --git a/tests/tests/scripts/go.mod b/tests/tests/scripts/go.mod deleted file mode 100644 index 02b1110ce..000000000 --- a/tests/tests/scripts/go.mod +++ /dev/null @@ -1,7 +0,0 @@ -module github.com/glaredb/glaredb/tests - -go 1.21.5 - -require github.com/tychoish/fun v0.10.8 - -require github.com/tychoish/birch v0.2.3-0.20230825153023-79636ce03829 // indirect diff --git a/tests/tests/scripts/go.sum b/tests/tests/scripts/go.sum deleted file mode 100644 index 156292176..000000000 --- a/tests/tests/scripts/go.sum +++ /dev/null @@ -1,6 +0,0 @@ -github.com/tychoish/birch v0.2.2 h1:+qiWibwCa8UoJvRFlpUxF6rLlGfnbNWKcIL8dkvY4Lo= -github.com/tychoish/birch v0.2.2/go.mod h1:k/5kBT4z40uY6vqQPJ3IBOU6E3/YMyIlUkerx3qY/Vo= -github.com/tychoish/birch v0.2.3-0.20230825153023-79636ce03829 h1:Lt2IbZbnSk49LfYZTyfCz22Pd5q6siVhS7xj3ELzIco= -github.com/tychoish/birch v0.2.3-0.20230825153023-79636ce03829/go.mod h1:Gv3VW01RrjmgGWJgxpT+wh71IKFV2vJOe7M00i+TFgE= -github.com/tychoish/fun v0.10.8 h1:NBC8ug2cRYGKzlD70+r/tz4lkwfKAshFv9aaxHoYNwQ= -github.com/tychoish/fun v0.10.8/go.mod h1:ZZfrwtsnHHV81ecZCBPp57DjjYY9Io39JH2QSXNpKn4= diff --git a/tests/tests/scripts/make_bad_bson.go b/tests/tests/scripts/make_bad_bson.go deleted file mode 100644 index e71d2233d..000000000 --- a/tests/tests/scripts/make_bad_bson.go +++ /dev/null @@ -1,27 +0,0 @@ -package main - -import ( - "os" - - "github.com/tychoish/birch" - "github.com/tychoish/birch/types" - "github.com/tychoish/fun" - "github.com/tychoish/fun/ft" -) - -func main() { - fn, err := os.Create("dupes.bson") - fun.Invariant.Must(err) - defer func() { fun.Invariant.Must(fn.Close()) }() - - for i := 0; i < 100; i++ { - n := ft.Must( - birch.DC.Make(3).Append( - birch.EC.ObjectID("_id", types.NewObjectID()), - birch.EC.Int32("idx", 1), - birch.EC.String("dupe", "first"), - birch.EC.String("dupe", "second"), - ).WriteTo(fn)) - fun.Invariant.IsTrue(n == 64, "unexpected document length", n) - } -} diff --git a/tests/tests/test_bson.py b/tests/tests/test_bson.py index 6f4f87014..522849894 100644 --- a/tests/tests/test_bson.py +++ b/tests/tests/test_bson.py @@ -90,31 +90,6 @@ def test_read_bson( assert beatles.index(row["beatle_name"]) == row["beatle_idx"] - 1 -def test_read_bson_multiple_fields( - glaredb_connection: psycopg2.extensions.connection, - tmp_path_factory: pytest.TempPathFactory, -): - data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "dupes.bson") - - with glaredb_connection.cursor() as curr: - curr.execute( - f"create external table dupes from bson options ( location='{data_path}', file_type='bson')" - ) - - for from_clause in ["dupes", f"read_bson('{data_path}')"]: - with glaredb_connection.cursor() as curr: - curr.execute(f"select count(*) from {from_clause}") - r = curr.fetchone() - assert r[0] == 100 - with glaredb_connection.cursor() as curr: - curr.execute(f"select * from {from_clause}") - rows = curr.fetchall() - assert len(rows) == 100 - for row in rows: - assert len(row) == 3 - assert row[2] == "first", row - - def test_read_bson( glaredb_connection: psycopg2.extensions.connection, tmp_path_factory: pytest.TempPathFactory, From 469b640787e2efa0290737d40eb8bda608d39d07 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 18:29:12 -0500 Subject: [PATCH 09/10] test buffer semantics --- crates/datasources/src/bson/builder.rs | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index d40440bf8..75a02c00c 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -82,6 +82,7 @@ impl RecordStructBuilder { .field_index .get(key) .ok_or_else(|| BsonError::ColumnNotInInferredSchema(key.to_string()))?; + println!("{}->{}", key, idx); if *cols_set.get(idx).unwrap() { continue; @@ -496,4 +497,48 @@ mod test { assert_eq!(v, "first"); } } + + #[test] + fn test_unexpected_schema_change() { + let fields = Fields::from_iter(vec![ + Field::new("_id", DataType::Binary, true), + Field::new("idx", DataType::Int64, true), + Field::new("value", DataType::Utf8, true), + ]); + let mut rsb = RecordStructBuilder::new_with_capacity(fields, 100).unwrap(); + let mut buf = bson::RawDocumentBuf::new(); + + buf.append("_id", ObjectId::new()); + buf.append("idx", 0); + buf.append("value", "first"); + assert_eq!(buf.iter().count(), 3); + + rsb.append_record(RawDocument::from_bytes(&buf.into_bytes()).unwrap()) + .expect("first record matchex expectations"); + assert_eq!(rsb.len(), 1); + + let mut buf = bson::RawDocumentBuf::new(); + buf.append("index", 1); + buf.append("values", 3); + assert_eq!(buf.iter().count(), 2); + rsb.append_record(RawDocument::from_bytes(&buf.clone().into_bytes()).unwrap()) + .expect_err("for append_record schema changes are an error"); + assert_eq!(rsb.len(), 1); + + let mut buf = bson::RawDocumentBuf::new(); + buf.append("_id", ObjectId::new()); + buf.append("index", 1); + buf.append("values", 3); + assert_eq!(buf.iter().count(), 3); + + assert_eq!(rsb.len(), 1); + rsb.append_record(RawDocument::from_bytes(&buf.clone().into_bytes()).unwrap()) + .expect_err("for append_record schema changes are an error"); + // the first value was added successfully to another buffer to the rsb grew + assert_eq!(rsb.len(), 2); + + rsb.project_and_append(RawDocument::from_bytes(&buf.clone().into_bytes()).unwrap()) + .expect("project and append should filter out unrequired fields"); + assert_eq!(rsb.len(), 3); + } } From 7b6177d27a29ceefc20dc3d650fa5abe19219db1 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Tue, 2 Jan 2024 18:33:56 -0500 Subject: [PATCH 10/10] bson tests covered by unit tests --- crates/datasources/src/bson/builder.rs | 8 ++- tests/tests/test_bson.py | 93 -------------------------- 2 files changed, 5 insertions(+), 96 deletions(-) diff --git a/crates/datasources/src/bson/builder.rs b/crates/datasources/src/bson/builder.rs index 75a02c00c..3138f04ab 100644 --- a/crates/datasources/src/bson/builder.rs +++ b/crates/datasources/src/bson/builder.rs @@ -524,6 +524,9 @@ mod test { rsb.append_record(RawDocument::from_bytes(&buf.clone().into_bytes()).unwrap()) .expect_err("for append_record schema changes are an error"); assert_eq!(rsb.len(), 1); + rsb.project_and_append(RawDocument::from_bytes(&buf.clone().into_bytes()).unwrap()) + .expect("project and append should filter out unrequired fields"); + assert_eq!(rsb.len(), 2); let mut buf = bson::RawDocumentBuf::new(); buf.append("_id", ObjectId::new()); @@ -531,14 +534,13 @@ mod test { buf.append("values", 3); assert_eq!(buf.iter().count(), 3); - assert_eq!(rsb.len(), 1); rsb.append_record(RawDocument::from_bytes(&buf.clone().into_bytes()).unwrap()) .expect_err("for append_record schema changes are an error"); // the first value was added successfully to another buffer to the rsb grew - assert_eq!(rsb.len(), 2); + assert_eq!(rsb.len(), 3); rsb.project_and_append(RawDocument::from_bytes(&buf.clone().into_bytes()).unwrap()) .expect("project and append should filter out unrequired fields"); - assert_eq!(rsb.len(), 3); + assert_eq!(rsb.len(), 4); } } diff --git a/tests/tests/test_bson.py b/tests/tests/test_bson.py index 522849894..700f73ffa 100644 --- a/tests/tests/test_bson.py +++ b/tests/tests/test_bson.py @@ -88,96 +88,3 @@ def test_read_bson( assert len(row) == 5 assert row["beatle_name"] in beatles assert beatles.index(row["beatle_name"]) == row["beatle_idx"] - 1 - - -def test_read_bson( - glaredb_connection: psycopg2.extensions.connection, - tmp_path_factory: pytest.TempPathFactory, -): - beatles = {"john": (0, 1940), "paul": (1, 1942), "george": (2, 1943), "ringo": (3, 1940)} - - tmp_dir = tmp_path_factory.mktemp(basename="read-bson-beatles-", numbered=True) - data_path = tmp_dir.joinpath("beatles.208.bson") - - counts = {"john": 0, "paul": 0, "george": 0, "ringo": 0} - - keys = [b for b in beatles.keys()] - # construct a - with open(data_path, "wb") as f: - for i in range(100): - beatle = random.choice(keys) - counts[beatle] += 1 - f.write( - bson.encode( - { - "_id": bson.objectid.ObjectId(), - "beatle_idx": beatles[beatle][0], - "beatle_name": beatle, - "case": i + 1, - } - ) - ) - - for beatle in counts: - f.write( - bson.encode( - { - "_id": bson.objectid.ObjectId(), - "beatle_name": beatle, - "count": counts[beatle], - } - ) - ) - - for i in range(100): - beatle = random.choice(keys) - counts[beatle] += 1 - f.write( - bson.encode( - { - "_id": bson.objectid.ObjectId(), - "beatle_name": beatle, - "birth_year": beatles[beatle][1], - "beatle_idx": beatles[beatle][0], - } - ) - ) - - for beatle in counts: - f.write( - bson.encode( - { - "_id": bson.objectid.ObjectId(), - "beatle_name": beatle, - "count": counts[beatle], - } - ) - ) - - with glaredb_connection.cursor() as curr: - curr.execute( - f"create external table bson_beatles from bson options ( location='{data_path}', file_type='bson')" - ) - - for from_clause in ["bson_beatles", f"read_bson('{data_path}')"]: - with glaredb_connection.cursor() as curr: - curr.execute(f"select count(*) from {from_clause}") - r = curr.fetchone() - assert r[0] == 208 - - with glaredb_connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as curr: - curr.execute(f"select * from {from_clause}") - rows = curr.fetchall() - assert len(rows) == 208 - meta_docs = 0 - for row in rows: - assert len(row) == 4 - assert row["beatle_name"] in beatles - assert "count" not in row, "count is not inferred in the schema" - - if row["beatle_idx"] is not None: - assert beatles[row["beatle_name"]][0] == row["beatle_idx"] - else: - meta_docs += 1 - - assert meta_docs == 8