Skip to content

Commit

Permalink
feat: alias read_parquet, read_ndjson, read_csv to their scan counterp…
Browse files Browse the repository at this point in the history
…art (#2185)

Added three new functions: `read_parquet`, `read_ndjson` and `read_csv`.
They do the same as their scan counterparts.
Closes: #2169

Co-authored-by: universalmind303 <cory.grinstead@gmail.com>
  • Loading branch information
Lilit0x and universalmind303 authored Dec 1, 2023
1 parent 035fe55 commit b34edb9
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 1 deletion.
5 changes: 4 additions & 1 deletion crates/sqlbuiltins/src/functions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ use self::iceberg::{IcebergDataFiles, IcebergScan, IcebergSnapshots};
use self::lance::LanceScan;
use self::mongo::ReadMongoDb;
use self::mysql::ReadMysql;
use self::object_store::{CSV_SCAN, JSON_SCAN, PARQUET_SCAN};
use self::object_store::{CSV_SCAN, JSON_SCAN, PARQUET_SCAN, READ_CSV, READ_JSON, READ_PARQUET};
use self::postgres::ReadPostgres;
use self::snowflake::ReadSnowflake;
use self::virtual_listing::{ListColumns, ListSchemas, ListTables};
Expand Down Expand Up @@ -109,8 +109,11 @@ impl BuiltinTableFuncs {
Arc::new(ReadSnowflake),
// Object store
Arc::new(PARQUET_SCAN),
Arc::new(READ_PARQUET),
Arc::new(CSV_SCAN),
Arc::new(READ_CSV),
Arc::new(JSON_SCAN),
Arc::new(READ_JSON),
// Data lakes
Arc::new(DeltaScan),
Arc::new(IcebergScan),
Expand Down
3 changes: 3 additions & 0 deletions crates/sqlbuiltins/src/functions/object_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,13 @@ use protogen::metastore::types::catalog::RuntimePreference;
use protogen::metastore::types::options::{CredentialsOptions, StorageOptions};

/// Scans Parquet files; registered in SQL as `parquet_scan`.
pub const PARQUET_SCAN: ObjScanTableFunc = ObjScanTableFunc(FileType::PARQUET, "parquet_scan");
/// Alias for [`PARQUET_SCAN`] under the more conventional `read_parquet` name.
pub const READ_PARQUET: ObjScanTableFunc = ObjScanTableFunc(FileType::PARQUET, "read_parquet");

/// Scans CSV files; registered in SQL as `csv_scan`.
pub const CSV_SCAN: ObjScanTableFunc = ObjScanTableFunc(FileType::CSV, "csv_scan");
/// Alias for [`CSV_SCAN`] under the `read_csv` name.
pub const READ_CSV: ObjScanTableFunc = ObjScanTableFunc(FileType::CSV, "read_csv");

/// Scans newline-delimited JSON files; registered in SQL as `ndjson_scan`.
pub const JSON_SCAN: ObjScanTableFunc = ObjScanTableFunc(FileType::JSON, "ndjson_scan");
/// Alias for [`JSON_SCAN`] under the `read_ndjson` name.
pub const READ_JSON: ObjScanTableFunc = ObjScanTableFunc(FileType::JSON, "read_ndjson");

/// Table function that scans files of a fixed [`FileType`].
/// The second field is the SQL-visible function name; presumably the scan
/// reads from object storage given this module's name — confirm in the impl.
#[derive(Debug, Clone)]
pub struct ObjScanTableFunc(FileType, &'static str);
Expand Down
64 changes: 64 additions & 0 deletions testdata/sqllogictests/functions/read_csv.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Tests `read_csv`

# Absolute path
query I
select count(*) from read_csv('file://${PWD}/testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv')
----
102

# Absolute path (compressed)

query I
select count(*) from read_csv('file://${PWD}/testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv.gz')
----
102

# Compressed (with function argument)

query I
select count(*) from read_csv(
'file://${PWD}/testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv.gz',
compression => 'gzip'
);
----
102

# To prove this actually picks up the compression from the argument, giving a
# wrong compression type should fail.
statement error stream/file format not recognized
select count(*) from read_csv(
'file://${PWD}/testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv.gz',
compression => 'xz'
);

# Relative path
query I
select count(*) from read_csv('../../testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv')
----
102

# Remote path
query I
select count(*) from read_csv('https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv');
----
102

# Multiple URLs

query I
select count(*) from read_csv([
'https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv',
'https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/bikeshare_stations.csv'
]);
----
204

statement error at least one url expected
select * from read_csv([]);

# Glob patterns not supported on HTTP

statement error Note that globbing is not supported for HTTP.
select * from read_csv(
'https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/*.csv'
);
52 changes: 52 additions & 0 deletions testdata/sqllogictests/functions/read_json.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Tests `read_ndjson`

# Absolute path
query I
select count(*) from read_ndjson('file://${PWD}/testdata/sqllogictests_datasources_common/data/bikeshare_stations.ndjson')
----
102

# Absolute path (compressed)
query I
select count(*) from read_ndjson('file://${PWD}/testdata/sqllogictests_datasources_common/data/bikeshare_stations.ndjson.gz')
----
102

# Relative path
query I
select count(*) from read_ndjson('../../testdata/sqllogictests_datasources_common/data/bikeshare_stations.ndjson')
----
102

# Remote path
query I
select count(*) from read_ndjson('https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/bikeshare_stations.ndjson');
----
102


# Multiple URLs

query I
select count(*) from read_ndjson([
'https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/bikeshare_stations.ndjson',
'https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/bikeshare_stations.ndjson'
]);
----
204

statement error at least one url expected
select * from read_ndjson([]);

# Glob patterns not supported on HTTP

statement error Unexpected status code '404 Not Found'
select * from read_ndjson(
'https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/*.ndjson'
);

statement error Note that globbing is not supported for HTTP.
select * from read_ndjson(
'https://raw.githubusercontent.com/GlareDB/glaredb/main/testdata/sqllogictests_datasources_common/data/*.ndjson'
);

58 changes: 58 additions & 0 deletions testdata/sqllogictests/functions/read_parquet.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Tests `read_parquet`

# Absolute path
query I
select count(*) from read_parquet('file://${PWD}/testdata/parquet/userdata1.parquet')
----
1000

# Relative path
query I
select count(*) from read_parquet('../../testdata/parquet/userdata1.parquet')
----
1000

# Remote path
query I
select count(*) from read_parquet('https://github.com/GlareDB/glaredb/raw/main/testdata/parquet/userdata1.parquet');
----
1000

# Huggingface (percent encoded paths)
#
# Note that this is a pretty big file, but the limit will be pushed down to the
# exec, ensuring we don't need to load the whole thing.
query T
select length(head) > 1 from read_parquet(
'https://huggingface.co/datasets/allenai/soda/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet'
) limit 1;
----
t

# Multiple URLs

query I
select count(*) from read_parquet([
'https://github.com/GlareDB/glaredb/raw/main/testdata/parquet/userdata1.parquet',
'https://github.com/GlareDB/glaredb/raw/main/testdata/parquet/userdata1.parquet'
]);
----
2000

statement error No such file or directory
select * from read_parquet('./testdata/parquet/userdata1.paruqet');

# Ambiguous name.
# query I
# select count(*)
# from read_parquet('../../testdata/parquet/userdata1.parquet') p
# inner join (values ('Sweden')) as c(country) on p.country = c.country
# ----
# 1000

# query I
# select count(*)
# from read_parquet('../../testdata/parquet/userdata1.parquet') p
# inner join (select 'Sweden') as c(country) on p.country = c.country
# ----
# 1000

0 comments on commit b34edb9

Please sign in to comment.