From 264e413e96357906de64214692abae783eb9d46b Mon Sep 17 00:00:00 2001 From: Ib Green Date: Thu, 17 Oct 2024 12:17:32 -0400 Subject: [PATCH] feat(csv): CSVArrowLoader (#3135) --- modules/csv/src/csv-arrow-loader.ts | 41 ++++++++ modules/csv/src/csv-loader.ts | 71 ++++++++++---- modules/csv/src/index.ts | 3 + ...arrow.spec.ts => csv-arrow-loader.spec.ts} | 38 +++++--- modules/csv/test/csv-loader.spec.ts | 4 + modules/csv/test/csv-writer-papaparse.spec.ts | 4 + modules/csv/test/index.ts | 4 +- modules/schema-utils/src/index.ts | 42 +++++--- ...able-batch-aggregator-builders.ts.disabled | 0 .../arrow-table-batch-aggregator.ts | 0 .../base-table-batch-aggregator.ts | 0 .../columnar-table-batch-aggregator.ts | 49 ++++------ .../row-table-batch-aggregator.ts | 4 +- .../table-batch-aggregator.ts | 0 .../table-batch-builder.ts | 0 .../src/lib/table/batches/convert-batches.ts | 83 ++++++++++++++++ .../batches/make-arrow-batch-iterator.ts | 75 ++++++++++++++ .../batches/make-table-batch-iterator.ts | 25 +++++ .../make-table-from-batches.ts | 21 +--- .../lib/table/tables/convert-arrow-table.ts | 97 ++++++++++--------- .../src/lib/table/tables/convert-table.ts | 40 ++++---- .../src/lib/table/tables/make-table.ts | 18 +++- .../src/lib/table/tables/table-types.ts | 37 +++++++ 23 files changed, 496 insertions(+), 160 deletions(-) create mode 100644 modules/csv/src/csv-arrow-loader.ts rename modules/csv/test/{csv-loader-arrow.spec.ts => csv-arrow-loader.spec.ts} (55%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/arrow-table-batch-aggregator-builders.ts.disabled (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/arrow-table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/base-table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/columnar-table-batch-aggregator.ts (64%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/row-table-batch-aggregator.ts (96%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/table-batch-builder.ts (100%) create mode 100644 modules/schema-utils/src/lib/table/batches/convert-batches.ts create mode 100644 modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts create mode 100644 modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts rename modules/schema-utils/src/lib/table/{tables => batches}/make-table-from-batches.ts (74%) create mode 100644 modules/schema-utils/src/lib/table/tables/table-types.ts diff --git a/modules/csv/src/csv-arrow-loader.ts b/modules/csv/src/csv-arrow-loader.ts new file mode 100644 index 0000000000..78d3ce62f5 --- /dev/null +++ b/modules/csv/src/csv-arrow-loader.ts @@ -0,0 +1,41 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type {LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils'; +import type {ArrowTable, ArrowTableBatch} from '@loaders.gl/schema'; +import {convertTable, convertBatches} from '@loaders.gl/schema-utils'; + +import type {CSVLoaderOptions} from './csv-loader'; +import {CSVLoader} from './csv-loader'; + +export type CSVArrowLoaderOptions = LoaderOptions & { + csv?: Omit; +}; + +export const CSVArrowLoader = { + ...CSVLoader, + + dataType: null as unknown as ArrowTable, + batchType: null as unknown as ArrowTableBatch, + + parse: async (arrayBuffer: ArrayBuffer, options?: CSVLoaderOptions) => + parseCSVToArrow(new TextDecoder().decode(arrayBuffer), options), + parseText: (text: string, options?: CSVLoaderOptions) => parseCSVToArrow(text, options), + parseInBatches: parseCSVToArrowBatches +} as const satisfies LoaderWithParser; + +async function parseCSVToArrow(csvText: string, options?: CSVLoaderOptions): Promise { + // Apps can call the parse method directly, we so apply default options here + // const csvOptions = {...CSVArrowLoader.options.csv, ...options?.csv}; + const table = await CSVLoader.parseText(csvText, options); + return convertTable(table, 'arrow-table'); +} + +function parseCSVToArrowBatches( + asyncIterator: AsyncIterable | Iterable, + options?: CSVArrowLoaderOptions +): AsyncIterable { + const tableIterator = CSVLoader.parseInBatches(asyncIterator, options); + return convertBatches(tableIterator, 'arrow-table'); +} diff --git a/modules/csv/src/csv-loader.ts b/modules/csv/src/csv-loader.ts index 7abafba5a1..7329b740c0 100644 --- a/modules/csv/src/csv-loader.ts +++ b/modules/csv/src/csv-loader.ts @@ -3,10 +3,12 @@ // Copyright (c) vis.gl contributors import type {LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils'; -import type {ArrayRowTable, ObjectRowTable, TableBatch} from '@loaders.gl/schema'; +import type {Schema, ArrayRowTable, ObjectRowTable, TableBatch} from '@loaders.gl/schema'; +import {log} from '@loaders.gl/loader-utils'; import { AsyncQueue, + deduceTableSchema, TableBatchBuilder, convertToArrayRow, convertToObjectRow @@ -14,9 +16,6 @@ import { import Papa from './papaparse/papaparse'; import AsyncIteratorStreamer from './papaparse/async-iterator-streamer'; -type ObjectField = {name: string; index: number; type: any}; -type ObjectSchema = {[key: string]: ObjectField} | ObjectField[]; - // __VERSION__ is injected by babel-plugin-version-inline // @ts-ignore TS2304: Cannot find name '__VERSION__'. const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest'; @@ -89,7 +88,7 @@ async function parseCSV( csvText: string, options?: CSVLoaderOptions ): Promise { - // Apps can call the parse method directly, we so apply default options here + // Apps can call the parse method directly, so we apply default options here const csvOptions = {...CSVLoader.options.csv, ...options?.csv}; const firstRow = readFirstRow(csvText); @@ -115,20 +114,25 @@ async function parseCSV( const headerRow = result.meta.fields || generateHeader(csvOptions.columnPrefix, firstRow.length); const shape = csvOptions.shape || DEFAULT_CSV_SHAPE; + let table: ArrayRowTable | ObjectRowTable; switch (shape) { case 'object-row-table': - return { + table = { shape: 'object-row-table', data: rows.map((row) => (Array.isArray(row) ? convertToObjectRow(row, headerRow) : row)) }; + break; case 'array-row-table': - return { + table = { shape: 'array-row-table', data: rows.map((row) => (Array.isArray(row) ? row : convertToArrayRow(row, headerRow))) }; + break; default: throw new Error(shape); } + table.schema = deduceTableSchema(table!); + return table; } // TODO - support batch size 0 = no batching/single batch? @@ -151,7 +155,7 @@ function parseCSVInBatches( let isFirstRow: boolean = true; let headerRow: string[] | null = null; let tableBatchBuilder: TableBatchBuilder | null = null; - let schema: ObjectSchema | null = null; + let schema: Schema | null = null; const config = { // dynamicTyping: true, // Convert numbers and boolean values in rows from strings, @@ -199,7 +203,7 @@ function parseCSVInBatches( if (!headerRow) { headerRow = generateHeader(csvOptions.columnPrefix, row.length); } - schema = deduceSchema(row, headerRow); + schema = deduceCSVSchema(row, headerRow); } if (csvOptions.optimizeMemoryUsage) { @@ -314,23 +318,56 @@ function generateHeader(columnPrefix: string, count: number = 0): string[] { return headers; } -function deduceSchema(row, headerRow): ObjectSchema { - const schema: ObjectSchema = headerRow ? {} : []; +function deduceCSVSchema(row, headerRow): Schema { + const fields: Schema['fields'] = []; for (let i = 0; i < row.length; i++) { const columnName = (headerRow && headerRow[i]) || i; const value = row[i]; switch (typeof value) { case 'number': + fields.push({name: String(columnName), type: 'float64', nullable: true}); + break; case 'boolean': - // TODO - booleans could be handled differently... - schema[columnName] = {name: String(columnName), index: i, type: Float32Array}; + fields.push({name: String(columnName), type: 'bool', nullable: true}); break; case 'string': + fields.push({name: String(columnName), type: 'utf8', nullable: true}); + break; default: - schema[columnName] = {name: String(columnName), index: i, type: Array}; - // We currently only handle numeric rows - // TODO we could offer a function to map strings to numbers? + log.warn(`CSV: Unknown column type: ${typeof value}`)(); + fields.push({name: String(columnName), type: 'utf8', nullable: true}); } } - return schema; + return { + fields, + metadata: { + 'loaders.gl#format': 'csv', + 'loaders.gl#loader': 'CSVLoader' + } + }; } + +// TODO - remove +// type ObjectField = {name: string; index: number; type: any}; +// type ObjectSchema = {[key: string]: ObjectField} | ObjectField[]; + +// function deduceObjectSchema(row, headerRow): ObjectSchema { +// const schema: ObjectSchema = headerRow ? {} : []; +// for (let i = 0; i < row.length; i++) { +// const columnName = (headerRow && headerRow[i]) || i; +// const value = row[i]; +// switch (typeof value) { +// case 'number': +// case 'boolean': +// // TODO - booleans could be handled differently... +// schema[columnName] = {name: String(columnName), index: i, type: Float32Array}; +// break; +// case 'string': +// default: +// schema[columnName] = {name: String(columnName), index: i, type: Array}; +// // We currently only handle numeric rows +// // TODO we could offer a function to map strings to numbers? +// } +// } +// return schema; +// } diff --git a/modules/csv/src/index.ts b/modules/csv/src/index.ts index d25134bc75..3e00dd719f 100644 --- a/modules/csv/src/index.ts +++ b/modules/csv/src/index.ts @@ -7,3 +7,6 @@ export {CSVLoader} from './csv-loader'; export type {CSVWriterOptions} from './csv-writer'; export {CSVWriter} from './csv-writer'; + +export type {CSVArrowLoaderOptions} from './csv-arrow-loader'; +export {CSVArrowLoader} from './csv-arrow-loader'; diff --git a/modules/csv/test/csv-loader-arrow.spec.ts b/modules/csv/test/csv-arrow-loader.spec.ts similarity index 55% rename from modules/csv/test/csv-loader-arrow.spec.ts rename to modules/csv/test/csv-arrow-loader.spec.ts index 2a63337629..2c1bdef38a 100644 --- a/modules/csv/test/csv-loader-arrow.spec.ts +++ b/modules/csv/test/csv-arrow-loader.spec.ts @@ -1,18 +1,19 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + import test from 'tape-promise/tape'; import {loadInBatches, isIterator, isAsyncIterable} from '@loaders.gl/core'; -import {CSVLoader} from '../src/csv-loader'; // from '@loaders.gl/csv'; +import {CSVArrowLoader} from '@loaders.gl/csv'; import * as arrow from 'apache-arrow'; // Small CSV Sample Files const CSV_NUMBERS_100_URL = '@loaders.gl/csv/test/data/numbers-100.csv'; const CSV_NUMBERS_10000_URL = '@loaders.gl/csv/test/data/numbers-10000.csv'; +const CSV_INCIDENTS_URL_QUOTES = '@loaders.gl/csv/test/data/sf_incidents-small.csv'; -// TODO -restore -test.skip('CSVLoader#loadInBatches(numbers-100.csv, arrow)', async (t) => { - const iterator = await loadInBatches(CSV_NUMBERS_100_URL, CSVLoader, { - csv: { - shape: 'arrow-table' - }, +test('CSVArrowLoader#loadInBatches(numbers-100.csv)', async (t) => { + const iterator = await loadInBatches(CSV_NUMBERS_100_URL, CSVArrowLoader, { batchSize: 40 }); @@ -29,12 +30,8 @@ test.skip('CSVLoader#loadInBatches(numbers-100.csv, arrow)', async (t) => { t.end(); }); -// TODO - restore -test.skip('CSVLoader#loadInBatches(numbers-10000.csv, arrow)', async (t) => { - const iterator = await loadInBatches(CSV_NUMBERS_10000_URL, CSVLoader, { - csv: { - shape: 'arrow-table' - }, +test('CSVArrowLoader#loadInBatches(numbers-10000.csv)', async (t) => { + const iterator = await loadInBatches(CSV_NUMBERS_10000_URL, CSVArrowLoader, { batchSize: 2000 }); t.ok(isIterator(iterator) || isAsyncIterable(iterator), 'loadInBatches returned iterator'); @@ -49,3 +46,18 @@ test.skip('CSVLoader#loadInBatches(numbers-10000.csv, arrow)', async (t) => { t.end(); }); + +test('CSVArrowLoader#loadInBatches(incidents.csv)', async (t) => { + const iterator = await loadInBatches(CSV_INCIDENTS_URL_QUOTES, CSVArrowLoader); + t.ok(isIterator(iterator) || isAsyncIterable(iterator), 'loadInBatches returned iterator'); + + let batchCount = 0; + for await (const batch of iterator) { + t.ok(batch.data instanceof arrow.Table, 'returns arrow RecordBatch'); + // t.comment(`BATCH: ${batch.length}`); + batchCount++; + } + t.equal(batchCount, 1, 'Correct number of batches received'); + + t.end(); +}); diff --git a/modules/csv/test/csv-loader.spec.ts b/modules/csv/test/csv-loader.spec.ts index aafe93f5ed..66b0c3c5bd 100644 --- a/modules/csv/test/csv-loader.spec.ts +++ b/modules/csv/test/csv-loader.spec.ts @@ -1,3 +1,7 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + import test from 'tape-promise/tape'; import {validateLoader} from 'test/common/conformance'; diff --git a/modules/csv/test/csv-writer-papaparse.spec.ts b/modules/csv/test/csv-writer-papaparse.spec.ts index f3690960c9..84d90a5b6c 100644 --- a/modules/csv/test/csv-writer-papaparse.spec.ts +++ b/modules/csv/test/csv-writer-papaparse.spec.ts @@ -1,3 +1,7 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + // This is a fork of papaparse under MIT License // https://github.com/mholt/PapaParse /* eslint-disable */ diff --git a/modules/csv/test/index.ts b/modules/csv/test/index.ts index d2fc0b8f48..6ffc16977e 100644 --- a/modules/csv/test/index.ts +++ b/modules/csv/test/index.ts @@ -3,6 +3,6 @@ import './papaparse/papaparse.spec'; // import './csv-writer-papaparse.spec'; import './csv-loader.spec'; -import './csv-loader-arrow.spec'; - import './csv-writer.spec'; + +import './csv-arrow-loader.spec'; diff --git a/modules/schema-utils/src/index.ts b/modules/schema-utils/src/index.ts index c4ade48bfe..7dd92ed0d1 100644 --- a/modules/schema-utils/src/index.ts +++ b/modules/schema-utils/src/index.ts @@ -21,14 +21,33 @@ export { } from './lib/schema/convert-arrow-schema'; export {getDataTypeFromArray} from './lib/schema/data-type'; -// Table utils +// TABLE CATEGORY UTILS + +export {deduceTableSchema} from './lib/schema/deduce-table-schema'; +export {makeTableFromData} from './lib/table/tables/make-table'; +export {makeTableFromBatches} from './lib/table/batches/make-table-from-batches'; + +export {convertTable} from './lib/table/tables/convert-table'; +export {convertToObjectRow, convertToArrayRow} from './lib/table/tables/row-utils'; export {convertArrowToTable, convertTableToArrow} from './lib/table/tables/convert-arrow-table'; -// TABLE CATEGORY UTILS -export {TableBatchBuilder} from './lib/table/batches/table-batch-builder'; -export type {TableBatchAggregator} from './lib/table/batches/table-batch-aggregator'; -export {RowTableBatchAggregator} from './lib/table/batches/row-table-batch-aggregator'; -export {ColumnarTableBatchAggregator} from './lib/table/batches/columnar-table-batch-aggregator'; +export { + makeTableBatchIterator, + makeBatchFromTable +} from './lib/table/batches/make-table-batch-iterator'; +export { + makeArrowTableBatchIterator, + makeArrowRecordBatchIterator +} from './lib/table/batches/make-arrow-batch-iterator'; +export {convertBatch, convertBatches} from './lib/table/batches/convert-batches'; + +export { + isArrayRowTable, + isObjectRowTable, + isColumnarTable, + isGeoJSONTable, + isArrowTable +} from './lib/table/tables/table-types'; export { isTable, @@ -46,11 +65,12 @@ export { makeObjectRowIterator } from './lib/table/tables/table-accessors'; -export {makeTableFromData} from './lib/table/tables/make-table'; -export {makeTableFromBatches, makeBatchFromTable} from './lib/table/tables/make-table-from-batches'; -export {convertTable} from './lib/table/tables/convert-table'; -export {deduceTableSchema} from './lib/schema/deduce-table-schema'; -export {convertToObjectRow, convertToArrayRow} from './lib/table/tables/row-utils'; +// Table batch builders + +export {TableBatchBuilder} from './lib/table/batch-builder/table-batch-builder'; +export type {TableBatchAggregator} from './lib/table/batch-builder/table-batch-aggregator'; +export {RowTableBatchAggregator} from './lib/table/batch-builder/row-table-batch-aggregator'; +export {ColumnarTableBatchAggregator} from './lib/table/batch-builder/columnar-table-batch-aggregator'; export {ArrowLikeTable} from './lib/table/arrow-api/arrow-like-table'; diff --git a/modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator-builders.ts.disabled b/modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator-builders.ts.disabled similarity index 100% rename from modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator-builders.ts.disabled rename to modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator-builders.ts.disabled diff --git a/modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/base-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/base-table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/base-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/base-table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/columnar-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts similarity index 64% rename from modules/schema-utils/src/lib/table/batches/columnar-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts index e919a6e99a..7ecad44927 100644 --- a/modules/schema-utils/src/lib/table/batches/columnar-table-batch-aggregator.ts +++ b/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts @@ -2,9 +2,9 @@ // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors -import type {Schema, ColumnarTableBatch, ArrowTableBatch} from '@loaders.gl/schema'; +import type {Schema, ColumnarTableBatch, ArrowTableBatch, TypedArray} from '@loaders.gl/schema'; +import {getArrayTypeFromDataType} from '../../schema/data-type'; import {TableBatchAggregator} from './table-batch-aggregator'; - type ColumnarTableBatchOptions = {}; const DEFAULT_ROW_COUNT = 100; @@ -13,7 +13,7 @@ export class ColumnarTableBatchAggregator implements TableBatchAggregator { schema: Schema; length: number = 0; allocated: number = 0; - columns: {[columnName: string]: any[]} = {}; + columns: Record> = {}; constructor(schema: Schema, options: ColumnarTableBatchOptions) { this.schema = schema; @@ -46,24 +46,11 @@ export class ColumnarTableBatchAggregator implements TableBatchAggregator { getBatch(): ColumnarTableBatch | ArrowTableBatch | null { this._pruneColumns(); - const columns = Array.isArray(this.schema) ? this.columns : {}; - - // schema is an array if there're no headers - // object if there are headers - // columns should match schema format - if (!Array.isArray(this.schema)) { - for (const fieldName in this.schema) { - const field = this.schema[fieldName]; - columns[field.name] = this.columns[field.index]; - } - } - - this.columns = {}; const batch: ColumnarTableBatch = { shape: 'columnar-table', batchType: 'data', - data: columns, + data: this.columns, schema: this.schema, length: this.length }; @@ -82,23 +69,23 @@ export class ColumnarTableBatchAggregator implements TableBatchAggregator { this.allocated = this.allocated > 0 ? (this.allocated *= 2) : DEFAULT_ROW_COUNT; this.columns = {}; - for (const fieldName in this.schema) { - const field = this.schema[fieldName]; - const ArrayType = field.type || Float32Array; - const oldColumn = this.columns[field.index]; + for (const field of this.schema.fields) { + const ArrayType = getArrayTypeFromDataType(field.type, field.nullable); + const oldColumn = this.columns[field.name]; - if (oldColumn && ArrayBuffer.isView(oldColumn)) { + if (!oldColumn) { + // Create new + this.columns[field.name] = new ArrayType(this.allocated); + } else if (Array.isArray(oldColumn)) { + // Plain array, just increase its size + oldColumn.length = this.allocated; + } else if (ArrayBuffer.isView(oldColumn)) { // Copy the old data to the new array const typedArray = new ArrayType(this.allocated); - typedArray.set(oldColumn); - this.columns[field.index] = typedArray; - } else if (oldColumn) { - // Plain array - oldColumn.length = this.allocated; - this.columns[field.index] = oldColumn; - } else { - // Create new - this.columns[field.index] = new ArrayType(this.allocated); + if (ArrayBuffer.isView(typedArray)) { + typedArray.set(oldColumn); + } + this.columns[field.name] = typedArray; } } } diff --git a/modules/schema-utils/src/lib/table/batches/row-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts similarity index 96% rename from modules/schema-utils/src/lib/table/batches/row-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts index 1b1ae0efdb..5e35e95c88 100644 --- a/modules/schema-utils/src/lib/table/batches/row-table-batch-aggregator.ts +++ b/modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts @@ -31,8 +31,8 @@ export class RowTableBatchAggregator implements TableBatchAggregator { // object if there are headers if (schema) { this._headers = []; - for (const key in schema) { - this._headers[schema[key].index] = schema[key].name; + for (let i = 0; i < schema.fields.length; i++) { + this._headers[i] = schema.fields[i].name; } } } diff --git a/modules/schema-utils/src/lib/table/batches/table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/table-batch-builder.ts b/modules/schema-utils/src/lib/table/batch-builder/table-batch-builder.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/table-batch-builder.ts rename to modules/schema-utils/src/lib/table/batch-builder/table-batch-builder.ts diff --git a/modules/schema-utils/src/lib/table/batches/convert-batches.ts b/modules/schema-utils/src/lib/table/batches/convert-batches.ts new file mode 100644 index 0000000000..82734a6771 --- /dev/null +++ b/modules/schema-utils/src/lib/table/batches/convert-batches.ts @@ -0,0 +1,83 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type { + TableBatch, + ArrayRowTableBatch, + ObjectRowTableBatch, + ColumnarTableBatch, + ArrowTableBatch +} from '@loaders.gl/schema'; +import {convertTable} from '../tables/convert-table'; + +export function convertBatch(batches: TableBatch, shape: 'object-row-table'): ObjectRowTableBatch; +export function convertBatch(batches: TableBatch, shape: 'array-row-table'): ArrayRowTableBatch; +export function convertBatch(batches: TableBatch, shape: 'columnar-table'): ColumnarTableBatch; +export function convertBatch(batches: TableBatch, shape: 'arrow-table'): ArrowTableBatch; + +/** Convert a table batch to a different shape */ +export function convertBatch( + batch: TableBatch, + shape: 'object-row-table' | 'array-row-table' | 'columnar-table' | 'arrow-table' +): TableBatch { + switch (shape) { + case 'object-row-table': + return {...batch, ...convertTable(batch, 'object-row-table')}; + case 'array-row-table': + return {...batch, ...convertTable(batch, 'array-row-table')}; + case 'columnar-table': + return {...batch, ...convertTable(batch, 'columnar-table')}; + case 'arrow-table': + return {...batch, ...convertTable(batch, 'arrow-table')}; + default: + throw new Error(shape); + } +} + +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'object-row-table' +): AsyncIterableIterator; +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'array-row-table' +): AsyncIterableIterator; +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'columnar-table' +): AsyncIterableIterator; +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'arrow-table' +): AsyncIterableIterator; + +/** + * Convert batches to a different shape + * @param table + * @param shape + * @returns + */ +export async function* convertBatches( + batches: Iterable | AsyncIterable, + shape: 'object-row-table' | 'array-row-table' | 'columnar-table' | 'arrow-table' +): AsyncIterableIterator { + for await (const batch of batches) { + switch (shape) { + case 'object-row-table': + yield convertBatch(batch, 'object-row-table'); + break; + case 'array-row-table': + yield convertBatch(batch, 'array-row-table'); + break; + case 'columnar-table': + yield convertBatch(batch, 'columnar-table'); + break; + case 'arrow-table': + yield convertBatch(batch, 'arrow-table'); + break; + default: + throw new Error(shape); + } + } +} diff --git a/modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts b/modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts new file mode 100644 index 0000000000..8a7fc1756a --- /dev/null +++ b/modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts @@ -0,0 +1,75 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import * as arrow from 'apache-arrow'; +import type {Table, ArrowTableBatch} from '@loaders.gl/schema'; + +import {convertSchemaToArrow} from '../../schema/convert-arrow-schema'; +import {getTableLength, getTableNumCols, getTableCellAt} from '../tables/table-accessors'; + +/** + * Returns an iterator that yields a single table as a sequence of ArrowTable batches. + * @note All batches will have the same shape and schema as the original table. + */ +export function* makeArrowTableBatchIterator( + table: Table, + options?: {batchSize?: number} +): IterableIterator { + for (const batch of makeArrowRecordBatchIterator(table, options)) { + const arrowTable = new arrow.Table([batch]); + yield { + ...batch, + shape: 'arrow-table', + schema: table.schema, + batchType: 'data', + length: arrowTable.numRows, + data: arrowTable + }; + } +} + +/** + * Returns an iterator that yields a single table as a sequence of arrow.RecordBatch batches. + * @note All batches will have the same shape and schema as the original table. + */ +export function* makeArrowRecordBatchIterator( + table: Table, + options?: {batchSize?: number} +): IterableIterator { + const arrowSchema = convertSchemaToArrow(table.schema!); + + const length = getTableLength(table); + const numColumns = getTableNumCols(table); + const batchSize = options?.batchSize || length; + + const builders = arrowSchema?.fields.map((arrowField) => arrow.makeBuilder(arrowField)); + const structField = new arrow.Struct(arrowSchema.fields); + + let batchLength = 0; + for (let rowIndex = 0; rowIndex < length; rowIndex++) { + for (let columnIndex = 0; columnIndex < numColumns; ++columnIndex) { + const value = getTableCellAt(table, rowIndex, columnIndex); + + const builder = builders[columnIndex]; + builder.append(value); + batchLength++; + + if (batchLength >= batchSize) { + const datas = builders.map((builder) => builder.flush()); + const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); + yield new arrow.RecordBatch(arrowSchema, structData); + batchLength = 0; + } + } + } + + if (batchLength > 0) { + const datas = builders.map((builder) => builder.flush()); + const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); + yield new arrow.RecordBatch(arrowSchema, structData); + batchLength = 0; + } + + builders.map((builder) => builder.finish()); +} diff --git a/modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts b/modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts new file mode 100644 index 0000000000..7d1a23e3f6 --- /dev/null +++ b/modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts @@ -0,0 +1,25 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type {TableBatch, Table} from '@loaders.gl/schema'; +import {getTableLength} from '../tables/table-accessors'; + +/** + * Returns an iterator that yields the contents of a table as a sequence of batches. + * @todo Currently only a single batch is yielded. + * @note All batches will have the same shape and schema as the original table. + * @returns + */ +export function* makeTableBatchIterator(table: Table): IterableIterator { + yield makeBatchFromTable(table); +} + +/** + * Returns a table packaged as a single table batch + * @note The batch will have the same shape and schema as the original table. + * @returns `null` if no batches are yielded by the async iterator + */ +export function makeBatchFromTable(table: Table): TableBatch { + return {...table, length: getTableLength(table), batchType: 'data'}; +} diff --git a/modules/schema-utils/src/lib/table/tables/make-table-from-batches.ts b/modules/schema-utils/src/lib/table/batches/make-table-from-batches.ts similarity index 74% rename from modules/schema-utils/src/lib/table/tables/make-table-from-batches.ts rename to modules/schema-utils/src/lib/table/batches/make-table-from-batches.ts index edf4ce4aaa..0739240430 100644 --- a/modules/schema-utils/src/lib/table/tables/make-table-from-batches.ts +++ b/modules/schema-utils/src/lib/table/batches/make-table-from-batches.ts @@ -10,26 +10,7 @@ import type { ArrayRowTable, Feature } from '@loaders.gl/schema'; -import {getTableLength} from './table-accessors'; - -/** - * Returns an iterator that yields a single table as a sequence of batches. - * @note Currently only a single batch is yielded. - * @note All batches will have the same shape and schema as the original table. - * @returns - */ -export function* makeBatchesFromTable(table: Table): IterableIterator { - yield makeBatchFromTable(table); -} - -/** - * Returns a table packaged as a single table batch - * @note The batch will have the same shape and schema as the original table. - * @returns `null` if no batches are yielded by the async iterator - */ -export function makeBatchFromTable(table: Table): TableBatch { - return {...table, length: getTableLength(table), batchType: 'data'}; -} +import {getTableLength} from '../tables/table-accessors'; /** * Assembles all batches from an async iterator into a single table. diff --git a/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts b/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts index 63a27d602d..3a35541fe3 100644 --- a/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts +++ b/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts @@ -14,8 +14,8 @@ import type { } from '@loaders.gl/schema'; import {convertTable} from './convert-table'; -import {convertArrowToSchema, convertSchemaToArrow} from '../../schema/convert-arrow-schema'; -import {getTableLength, getTableNumCols, getTableCellAt} from './table-accessors'; +import {convertArrowToSchema} from '../../schema/convert-arrow-schema'; +import {makeArrowRecordBatchIterator} from '../batches/make-arrow-batch-iterator'; /** * * Convert a loaders.gl Table to an Apache Arrow Table @@ -36,52 +36,11 @@ export function convertTableToArrow(table: Table, options?: {batchSize?: number} // fall through default: - const arrowBatchIterator = makeTableToArrowBatchesIterator(table, options); + const arrowBatchIterator = makeArrowRecordBatchIterator(table, options); return new arrow.Table(arrowBatchIterator); } } -export function* makeTableToArrowBatchesIterator( - table: Table, - options?: {batchSize?: number} -): IterableIterator { - const arrowSchema = convertSchemaToArrow(table.schema!); - - const length = getTableLength(table); - const numColumns = getTableNumCols(table); - const batchSize = options?.batchSize || length; - - const builders = arrowSchema?.fields.map((arrowField) => arrow.makeBuilder(arrowField)); - const structField = new arrow.Struct(arrowSchema.fields); - - let batchLength = 0; - for (let rowIndex = 0; rowIndex < length; rowIndex++) { - for (let columnIndex = 0; columnIndex < numColumns; ++columnIndex) { - const value = getTableCellAt(table, rowIndex, columnIndex); - - const builder = builders[columnIndex]; - builder.append(value); - batchLength++; - - if (batchLength >= batchSize) { - const datas = builders.map((builder) => builder.flush()); - const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); - yield new arrow.RecordBatch(arrowSchema, structData); - batchLength = 0; - } - } - } - - if (batchLength > 0) { - const datas = builders.map((builder) => builder.flush()); - const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); - yield new arrow.RecordBatch(arrowSchema, structData); - batchLength = 0; - } - - builders.map((builder) => builder.finish()); -} - /** * Convert an Apache Arrow table to a loaders.gl Table * @note Currently does not convert schema @@ -187,3 +146,53 @@ function convertArrowToGeoJSONTable(arrowTable: arrow.Table): GeoJSONTable { features }; } + +// /** +// * Wrap an apache arrow table in a loaders.gl table wrapper. +// * From this additional conversions are available. +// * @param arrowTable +// * @returns +// */ +// function convertArrowToArrowTable(arrowTable: arrow.Table): ArrowTable { +// return { +// shape: 'arrow-table', +// schema: convertArrowToSchema(arrowTable.schema), +// data: arrowTable +// }; +// } + +// function convertArrowToArrayRowTable(arrowTable: arrow.Table): Table { +// const columnarTable = convertArrowToColumnarTable(arrowTable); +// return convertTable(columnarTable, 'array-row-table'); +// } + +// function convertArrowToObjectRowTable(arrowTable: arrow.Table): Table { +// const columnarTable = convertArrowToColumnarTable(arrowTable); +// return convertTable(columnarTable, 'object-row-table'); +// } + +// /** +// * Convert an Apache Arrow table to a ColumnarTable +// * @note Currently does not convert schema +// */ +// function convertArrowToColumnarTable(arrowTable: arrow.Table): ColumnarTable { +// // TODO - avoid calling `getColumn` on columns we are not interested in? +// // Add options object? + +// const columns: ColumnarTable['data'] = {}; + +// for (const field of arrowTable.schema.fields) { +// // This (is intended to) coalesce all record batches into a single typed array +// const arrowColumn = arrowTable.getChild(field.name); +// const values = arrowColumn?.toArray(); +// columns[field.name] = values; +// } + +// const schema = convertArrowToSchema(arrowTable.schema); + +// return { +// shape: 'columnar-table', +// schema, +// data: columns +// }; +// } diff --git a/modules/schema-utils/src/lib/table/tables/convert-table.ts b/modules/schema-utils/src/lib/table/tables/convert-table.ts index caa6efcf5f..27fb89e4d7 100644 --- a/modules/schema-utils/src/lib/table/tables/convert-table.ts +++ b/modules/schema-utils/src/lib/table/tables/convert-table.ts @@ -17,6 +17,7 @@ import { } from './table-accessors'; import {deduceTableSchema} from '../../schema/deduce-table-schema'; import {makeColumnFromField} from './table-column'; +import {convertTableToArrow} from './convert-arrow-table'; export function convertTable(table: Table, shape: 'object-row-table'): ObjectRowTable; export function convertTable(table: Table, shape: 'array-row-table'): ArrayRowTable; @@ -35,32 +36,20 @@ export function convertTable( ) { switch (shape) { case 'object-row-table': - return makeObjectRowTable(table); + return convertToObjectRowTable(table); case 'array-row-table': - return makeArrayRowTable(table); + return convertToArrayRowTable(table); case 'columnar-table': - return makeColumnarTable(table); + return convertToColumnarTable(table); case 'arrow-table': - return makeArrowTable(table); + return convertToArrowTable(table); default: throw new Error(shape); } } -/** - * Convert a table to apache arrow format - * @note this depends on the `@loaders.gl/arrow module being imported - */ -export function makeArrowTable(table: Table): Table { - const _makeArrowTable = globalThis.__loaders?._makeArrowTable; - if (!_makeArrowTable) { - throw new Error(''); - } - return _makeArrowTable(table); -} - /** Convert any simple table into columnar format */ -export function makeColumnarTable(table: Table): ColumnarTable { +export function convertToColumnarTable(table: Table): ColumnarTable { // TODO - should schema really be optional? const schema = table.schema || deduceTableSchema(table); const fields = table.schema?.fields || []; @@ -88,7 +77,7 @@ export function makeColumnarTable(table: Table): ColumnarTable { } /** Convert any table into array row format */ -export function makeArrayRowTable(table: Table): ArrayRowTable { +export function convertToArrayRowTable(table: Table): ArrayRowTable { if (table.shape === 'array-row-table') { return table; } @@ -105,7 +94,7 @@ export function makeArrayRowTable(table: Table): ArrayRowTable { } /** Convert any table into object row format */ -export function makeObjectRowTable(table: Table): ObjectRowTable { +export function convertToObjectRowTable(table: Table): ObjectRowTable { if (table.shape === 'object-row-table') { return table; } @@ -121,6 +110,19 @@ export function makeObjectRowTable(table: Table): ObjectRowTable { }; } +/** + * Convert a table to apache arrow format + * @note this depends on the `@loaders.gl/arrow module being imported + */ +export function convertToArrowTable(table: Table): ArrowTable { + const arrowTable = convertTableToArrow(table); + return { + shape: 'arrow-table', + schema: table.schema, + data: arrowTable + }; +} + /** * * @note - should be part of schema module diff --git a/modules/schema-utils/src/lib/table/tables/make-table.ts b/modules/schema-utils/src/lib/table/tables/make-table.ts index 3ccb1f3879..e50ffc450d 100644 --- a/modules/schema-utils/src/lib/table/tables/make-table.ts +++ b/modules/schema-utils/src/lib/table/tables/make-table.ts @@ -2,7 +2,14 @@ // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors -import type {Table, ArrayRowTable, ObjectRowTable, ColumnarTable} from '@loaders.gl/schema'; +import * as arrow from 'apache-arrow'; +import type { + Table, + ArrayRowTable, + ObjectRowTable, + ColumnarTable, + ArrowTable +} from '@loaders.gl/schema'; import {deduceTableSchema} from '../../schema/deduce-table-schema'; /** @@ -12,6 +19,8 @@ import {deduceTableSchema} from '../../schema/deduce-table-schema'; export function makeTableFromData(data: unknown[][]): ArrayRowTable; export function makeTableFromData(data: {[column: string]: unknown}[]): ObjectRowTable; export function makeTableFromData(data: {[column: string]: ArrayLike}): ColumnarTable; +export function makeTableFromData(data: arrow.Table): ArrowTable; + export function makeTableFromData(data: unknown): Table { let table: Table; switch (getTableShapeFromData(data)) { @@ -24,6 +33,9 @@ export function makeTableFromData(data: unknown): Table { case 'columnar-table': table = {shape: 'columnar-table', data: data as {[column: string]: ArrayLike}}; break; + case 'arrow-table': + table = {shape: 'arrow-table', data: data as arrow.Table}; + break; default: throw new Error('table'); } @@ -33,6 +45,10 @@ export function makeTableFromData(data: unknown): Table { /** Helper function to get shape of data */ function getTableShapeFromData(data) { + if (data instanceof arrow.Table) { + return 'arrow-table'; + } + if (Array.isArray(data)) { if (data.length === 0) { throw new Error('cannot deduce type of empty table'); diff --git a/modules/schema-utils/src/lib/table/tables/table-types.ts b/modules/schema-utils/src/lib/table/tables/table-types.ts new file mode 100644 index 0000000000..965b9d050e --- /dev/null +++ b/modules/schema-utils/src/lib/table/tables/table-types.ts @@ -0,0 +1,37 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type { + Table, + ObjectRowTable, + ArrayRowTable, + ColumnarTable, + GeoJSONTable, + ArrowTable +} from '@loaders.gl/schema'; + +/** Checks if a table is of array row layout */ +export function isArrayRowTable(table: Table): table is ArrayRowTable { + return table.shape === 'array-row-table'; +} + +/** Checks if a table is of array row layout */ +export function isObjectRowTable(table: Table): table is ObjectRowTable { + return table.shape === 'object-row-table'; +} + +/** Checks if a table is of columnar layout */ +export function isColumnarTable(table: Table): table is ColumnarTable { + return table.shape === 'columnar-table'; +} + +/** Checks if a table is of GeoJSON format */ +export function isGeoJSONTable(table: Table): table is GeoJSONTable { + return table.shape === 'geojson-table'; +} + +/** Checks if table wraps an Apache Arrow table */ +export function isArrowTable(table: Table): table is ArrowTable { + return table.shape === 'arrow-table'; +}