diff --git a/packages/core/__tests__/hierarchy.test.ts b/packages/core/__tests__/hierarchy.test.ts index f58e41d1..306160cb 100644 --- a/packages/core/__tests__/hierarchy.test.ts +++ b/packages/core/__tests__/hierarchy.test.ts @@ -42,7 +42,7 @@ describe("Array", () => { fill_value: arr.fill_value, attrs: await arr.attrs(), path: arr.path, - codec_pipeline: arr.codec_pipeline, + codec: arr.codec, store: arr.store, }).toMatchInlineSnapshot(` { @@ -53,7 +53,7 @@ describe("Array", () => { 5, 5, ], - "codec_pipeline": { + "codec": { "decode": [Function], "encode": [Function], }, diff --git a/packages/core/__tests__/util.test.ts b/packages/core/__tests__/util.test.ts index fcff50a6..f45dd63a 100644 --- a/packages/core/__tests__/util.test.ts +++ b/packages/core/__tests__/util.test.ts @@ -87,6 +87,23 @@ describe("is_dtype", () => { expect(is_dtype(dtype, "number")).toBe(expected); }); + test.each<[DataType, boolean]>([ + ["int8", false], + ["int16", false], + ["int32", false], + ["uint8", false], + ["uint16", false], + ["uint32", false], + ["float32", false], + ["float64", false], + ["bool", true], + ["int64", false], + ["uint64", false], + ["r42", false], + ])("is_dtype(%s, 'boolean') -> %s", (dtype, expected) => { + expect(is_dtype(dtype, "boolean")).toBe(expected); + }); + test.each<[DataType, boolean]>([ ["int8", false], ["int16", false], diff --git a/packages/core/src/codecs.ts b/packages/core/src/codecs.ts index 60dddf0c..7a5e0c53 100644 --- a/packages/core/src/codecs.ts +++ b/packages/core/src/codecs.ts @@ -1,143 +1,108 @@ import type { Codec } from "numcodecs"; -import type { ArrayMetadata, Chunk, DataType, TypedArray } from "./metadata.js"; -import { byteswap_inplace, get_ctr, get_strides } from "./util.js"; +import type { ArrayMetadata, Chunk, DataType } from "./metadata.js"; -const LITTLE_ENDIAN_OS = system_is_little_endian(); +import { TransposeCodec } from "./codecs/transpose.js"; +import { EndianCodec } from "./codecs/endian.js"; -function 
system_is_little_endian(): boolean { - const a = new Uint32Array([0x12345678]); - const b = new Uint8Array(a.buffer, a.byteOffset, a.byteLength); - return !(b[0] === 0x12); -} - -function bytes_per_element(data_type: DataType): number { - const mapping: any = { - int8: 1, - int16: 2, - int32: 4, - int64: 8, - uint8: 1, - uint16: 2, - uint32: 4, - uint64: 8, - float32: 4, - float64: 8, - }; - let b = mapping[data_type]; - if (!b) throw new Error(`Unknown data type: ${data_type}`); - return b; -} - -class EndianCodec { - constructor( - public configuration: { endian: "little" | "big" }, - public array_metadata: ArrayMetadata, - ) {} - - static fromConfig( - configuration: { endian: "little" | "big"; bytes_per_element: number }, - array_metadata: ArrayMetadata, - ): EndianCodec { - return new EndianCodec(configuration, array_metadata); - } - - encode(bytes: Uint8Array): Uint8Array { - if (LITTLE_ENDIAN_OS && this.configuration.endian === "big") { - byteswap_inplace(bytes, bytes_per_element(this.array_metadata.data_type)); - } - return bytes; - } +type CodecEntry = { + fromConfig: (config: Record, meta: ArrayMetadata) => Codec; + kind?: "array_to_array" | "array_to_bytes" | "bytes_to_bytes"; +}; - decode(bytes: Uint8Array): Uint8Array { - if (LITTLE_ENDIAN_OS && this.configuration.endian === "big") { - byteswap_inplace(bytes, bytes_per_element(this.array_metadata.data_type)); - } - return bytes; - } -} - -class TransposeCodec { - constructor( - public configuration: { order: "C" | "F" }, - public array_metadata: ArrayMetadata, - ) {} - - static fromConfig( - configuration: { order: "C" | "F" }, - array_metadata: ArrayMetadata, - ): TransposeCodec { - return new TransposeCodec(configuration, array_metadata); - } - - encode(bytes: Uint8Array): Uint8Array { - return bytes; - } - - decode(bytes: Uint8Array): Uint8Array { - return bytes; - } -} - -type InitCodec = (config: Record, meta: ArrayMetadata) => Codec; -type CodecImporter = () => Promise<{ fromConfig: InitCodec }>; 
- -function create_default_registry(): Map { +function create_default_registry(): Map< + string, + () => Promise +> { return new Map() .set("blosc", () => import("numcodecs/blosc").then((m) => m.default)) .set("gzip", () => import("numcodecs/gzip").then((m) => m.default)) .set("lz4", () => import("numcodecs/lz4").then((m) => m.default)) .set("zlib", () => import("numcodecs/zlib").then((m) => m.default)) .set("zstd", () => import("numcodecs/zstd").then((m) => m.default)) - .set("endian", () => EndianCodec) - .set("transpose", () => TransposeCodec); + .set("transpose", () => TransposeCodec) + .set("endian", () => EndianCodec); } export const registry = create_default_registry(); -export function create_codec_pipeline( - array_metadata: ArrayMetadata, - codec_registry: typeof registry = registry, +export function create_codec_pipeline( + array_metadata: ArrayMetadata, ) { - let codecs: Promise[] | undefined; - - // allows us to laziy load codecs only if they are required - function init_codecs() { - let metadata = array_metadata.codecs; - return metadata.map(async (meta) => { - let Codec = await codec_registry.get(meta.name)?.(); - if (!Codec) { - throw new Error(`Unknown codec: ${meta.name}`); - } - return Codec.fromConfig(meta.configuration, array_metadata); - }); - } - + let codecs: Awaited>; return { - async encode( - data: TypedArray, - ): Promise { - if (!codecs) codecs = init_codecs(); - let bytes = new Uint8Array(data.buffer); - for await (const codec of codecs) { + async encode(chunk: Chunk): Promise { + if (!codecs) codecs = await load_codecs(array_metadata); + for (const codec of codecs.array_to_array) { + chunk = await codec.encode(chunk); + } + let bytes = await codecs.array_to_bytes.encode(chunk); + for (const codec of codecs.bytes_to_bytes) { bytes = await codec.encode(bytes); } - return bytes as unknown as Uint8Array; + return bytes; }, - async decode(bytes: Uint8Array): Promise> { - if (!codecs) codecs = init_codecs(); - for 
(let i = codecs.length - 1; i >= 0; i--) { - let codec = await codecs[i]; - bytes = await codec.decode(bytes); + async decode(bytes: Uint8Array): Promise> { + if (!codecs) codecs = await load_codecs(array_metadata); + for (let i = codecs.bytes_to_bytes.length - 1; i >= 0; i--) { + bytes = await codecs.bytes_to_bytes[i].decode(bytes); } - let ctr = get_ctr(array_metadata.data_type); - return { - data: new ctr(bytes.buffer), - shape: array_metadata.chunk_grid.configuration.chunk_shape, - stride: get_strides( - array_metadata.chunk_grid.configuration.chunk_shape, - "C", - ), - }; + let chunk = await codecs.array_to_bytes.decode(bytes); + for (let i = codecs.array_to_array.length - 1; i >= 0; i--) { + chunk = await codecs.array_to_array[i].decode(chunk); + } + return chunk; }, }; } + +type ArrayToArrayCodec = { + encode: (data: Chunk) => Promise> | Chunk; + decode: (data: Chunk) => Promise> | Chunk; +}; + +type ArrayToBytesCodec = { + encode: (data: Chunk) => Promise | Uint8Array; + decode: (data: Uint8Array) => Promise> | Chunk; +}; + +type BytesToBytesCodec = { + encode: (data: Uint8Array) => Promise; + decode: (data: Uint8Array) => Promise; +}; + +async function load_codecs( + array_metadata: ArrayMetadata, +) { + let promises = array_metadata.codecs.map(async (meta) => { + let Codec = await registry.get(meta.name)?.(); + if (!Codec) { + throw new Error(`Unknown codec: ${meta.name}`); + } + return { Codec, meta }; + }); + let array_to_array: ArrayToArrayCodec[] = []; + let array_to_bytes: ArrayToBytesCodec = EndianCodec.fromConfig({ + endian: "little", + }, array_metadata); + let bytes_to_bytes: BytesToBytesCodec[] = []; + for await (let { Codec, meta } of promises) { + let codec = Codec.fromConfig(meta.configuration, array_metadata); + switch (codec.kind) { + case "array_to_array": + array_to_array.push(codec); + break; + case "array_to_bytes": + array_to_bytes = codec; + break; + default: + bytes_to_bytes.push(codec); + } + } + if (array_to_array.length === 0) { + 
array_to_array.push( + TransposeCodec.fromConfig({ order: "C" }, array_metadata), + ); + } + return { array_to_array, array_to_bytes, bytes_to_bytes }; +} diff --git a/packages/core/src/codecs/endian.ts b/packages/core/src/codecs/endian.ts new file mode 100644 index 00000000..70e407ce --- /dev/null +++ b/packages/core/src/codecs/endian.ts @@ -0,0 +1,69 @@ +import type { ArrayMetadata, Chunk, DataType } from "../metadata.js"; +import { byteswap_inplace, get_ctr, get_strides } from "../util.js"; + +const LITTLE_ENDIAN_OS = system_is_little_endian(); + +function system_is_little_endian(): boolean { + const a = new Uint32Array([0x12345678]); + const b = new Uint8Array(a.buffer, a.byteOffset, a.byteLength); + return !(b[0] === 0x12); +} + +function bytes_per_element(data_type: DataType): number { + const mapping: any = { + int8: 1, + int16: 2, + int32: 4, + int64: 8, + uint8: 1, + uint16: 2, + uint32: 4, + uint64: 8, + float32: 4, + float64: 8, + }; + let b = mapping[data_type]; + if (!b) { + throw new Error(`Unknown or unsupported data type: ${data_type}`); + } + return b; +} + +export class EndianCodec { + kind = "array_to_bytes"; + + constructor( + public configuration: { endian: "little" | "big" }, + public array_metadata: ArrayMetadata, + ) {} + + static fromConfig( + configuration: { endian: "little" | "big" }, + array_metadata: ArrayMetadata, + ): EndianCodec { + return new EndianCodec(configuration, array_metadata); + } + + encode(arr: Chunk): Uint8Array { + let bytes = new Uint8Array(arr.data.buffer); + if (LITTLE_ENDIAN_OS && this.configuration.endian === "big") { + byteswap_inplace(bytes, bytes_per_element(this.array_metadata.data_type)); + } + return bytes; + } + + decode(bytes: Uint8Array): Chunk { + if (LITTLE_ENDIAN_OS && this.configuration.endian === "big") { + byteswap_inplace(bytes, bytes_per_element(this.array_metadata.data_type)); + } + let ctr = get_ctr(this.array_metadata.data_type); + return { + data: new ctr(bytes.buffer) as any, + shape: 
this.array_metadata.chunk_grid.configuration.chunk_shape, + stride: get_strides( + this.array_metadata.chunk_grid.configuration.chunk_shape, + "C", // TODO: this should be configurable? + ), + }; + } +} diff --git a/packages/core/src/codecs/transpose.ts b/packages/core/src/codecs/transpose.ts new file mode 100644 index 00000000..071f34da --- /dev/null +++ b/packages/core/src/codecs/transpose.ts @@ -0,0 +1,137 @@ +import type { + ArrayMetadata, + Chunk, + DataType, + Scalar, + TypedArray, + TypedArrayConstructor, +} from "../metadata.js"; +import { + BoolArray, + ByteStringArray, + UnicodeStringArray, +} from "@zarrita/typedarray"; +import { get_strides } from "../util.js"; + +type TypedArrayProxy = { + [x: number]: Scalar; +}; + +function proxy(arr: TypedArray): TypedArrayProxy { + if ( + arr instanceof BoolArray || + arr instanceof ByteStringArray || + arr instanceof UnicodeStringArray + ) { + return new Proxy(arr as any, { + get(target, prop) { + return target.get(Number(prop)); + }, + set(target, prop, value) { + target.set(Number(prop), value as any); + return true; + }, + }); + } + return arr as any; +} + +function empty_like( + chunk: Chunk, + order: "C" | "F", +): Chunk { + let data: TypedArray; + if ( + chunk.data instanceof ByteStringArray || + chunk.data instanceof UnicodeStringArray + ) { + data = new (chunk.data.constructor as TypedArrayConstructor)( + chunk.data.length, + // @ts-expect-error + chunk.data.chars, + ); + } else { + data = new (chunk.data.constructor as TypedArrayConstructor)( + chunk.data.length, + ); + } + return { + data, + shape: chunk.shape, + stride: get_strides(chunk.shape, order), + }; +} + +function convert_array_order( + src: Chunk, + target: "C" | "F", +): Chunk { + let out = empty_like(src, target); + let n_dims = src.shape.length; + let size = src.data.length; + let index = Array(n_dims).fill(0); + + let src_data = proxy(src.data); + let out_data = proxy(out.data); + + for (let src_idx = 0; src_idx < size; src_idx++) { + let out_idx = 
0; + for (let dim = 0; dim < n_dims; dim++) { + out_idx += index[dim] * out.stride[dim]; + } + out_data[out_idx] = src_data[src_idx]; + + index[0] += 1; + for (let dim = 0; dim < n_dims; dim++) { + if (index[dim] === src.shape[dim]) { + if (dim + 1 === n_dims) { + break; + } + index[dim] = 0; + index[dim + 1] += 1; + } + } + } + + return out; +} + +function get_order(arr: Chunk): "C" | "F" { + // Assume C order if no stride is given + if (!arr.stride) return "C"; + let row_major_strides = get_strides(arr.shape, "C"); + return arr.stride.every((s, i) => s === row_major_strides[i]) ? "C" : "F"; +} + +export class TransposeCodec { + kind = "array_to_array"; + + constructor( + public configuration: { order: "C" | "F" }, + public array_metadata: ArrayMetadata, + ) {} + + static fromConfig( + configuration: { order: "C" | "F" }, + array_metadata: ArrayMetadata, + ) { + return new TransposeCodec(configuration, array_metadata); + } + + encode(arr: Chunk): Chunk { + if (get_order(arr) === this.configuration.order) { + return arr; + } + return convert_array_order(arr, this.configuration.order); + } + + decode(arr: Chunk): Chunk { + if (get_order(arr) === this.configuration.order) { + return arr; + } + return convert_array_order( + arr, + this.configuration.order === "C" ? 
"F" : "C", + ); + } +} diff --git a/packages/core/src/create.ts b/packages/core/src/create.ts index 7e3d87a5..45363bad 100644 --- a/packages/core/src/create.ts +++ b/packages/core/src/create.ts @@ -1,4 +1,5 @@ import type { Async, Readable, Writeable } from "@zarrita/storage"; + import type { ArrayMetadata, Attributes, diff --git a/packages/core/src/hierarchy.ts b/packages/core/src/hierarchy.ts index 1ebe98ab..762bfa59 100644 --- a/packages/core/src/hierarchy.ts +++ b/packages/core/src/hierarchy.ts @@ -79,7 +79,7 @@ export class Array< Dtype extends DataType, Store extends Readable | Async = Readable | Async, > extends Location { - codec_pipeline: ReturnType; + codec: ReturnType; #metadata: ArrayMetadata; #attributes: Record | undefined; @@ -89,7 +89,7 @@ export class Array< metadata: ArrayMetadata, ) { super(store, path); - this.codec_pipeline = create_codec_pipeline(metadata); + this.codec = create_codec_pipeline(metadata); this.#metadata = metadata; if (typeof metadata.attributes === "object") { this.#attributes = metadata.attributes; @@ -112,7 +112,7 @@ export class Array< if (!maybe_bytes) { throw new KeyError(chunk_path); } - return this.codec_pipeline.decode(maybe_bytes); + return this.codec.decode(maybe_bytes); } get shape() { diff --git a/packages/core/src/metadata.ts b/packages/core/src/metadata.ts index b068ce67..345e49ac 100644 --- a/packages/core/src/metadata.ts +++ b/packages/core/src/metadata.ts @@ -131,9 +131,6 @@ export type TypedArray = D extends Int8 ? 
Int8Array export type TypedArrayConstructor = { new (length: number): TypedArray; new (array: ArrayLike> | ArrayBufferLike): TypedArray; - // TODO: implement for Bool/Unicode arrays - // new(buffer: ArrayBufferLike, byteOffset?: number, length?: number): TypedArray - // new(elements: Iterable>): TypedArray }; export type Chunk = { diff --git a/packages/core/src/util.ts b/packages/core/src/util.ts index c51660be..7b148cc8 100644 --- a/packages/core/src/util.ts +++ b/packages/core/src/util.ts @@ -185,6 +185,7 @@ export function v2_to_v3_group_metadata(_meta: GroupMetadataV2): GroupMetadata { export type DataTypeQuery = | DataType + | "boolean" | "number" | "bigint" | "raw"; @@ -201,12 +202,19 @@ export function is_dtype( dtype: DataType, query: Query, ): dtype is NarrowDataType { - if (query !== "raw" && query !== "number" && query !== "bigint") { + if ( + query !== "raw" && + query !== "number" && + query !== "bigint" && + query !== "boolean" + ) { return dtype === query; } + const is_boolean = dtype === "bool"; + if (query === "boolean") return is_boolean; const is_raw = dtype.startsWith("r"); if (query === "raw") return is_raw; const is_bigint = dtype === "int64" || dtype === "uint64"; if (query === "bigint") return is_bigint; - return !is_raw && !is_bigint && !(dtype === "bool"); + return !is_raw && !is_bigint && !is_boolean; } diff --git a/packages/indexing/src/set.ts b/packages/indexing/src/set.ts index ab76f856..56624f2d 100644 --- a/packages/indexing/src/set.ts +++ b/packages/indexing/src/set.ts @@ -93,7 +93,11 @@ export async function set>( } } // encode chunk - const encoded_chunk_data = await arr.codec_pipeline.encode(cdata); + const encoded_chunk_data = await arr.codec.encode({ + data: cdata, + shape: arr.chunk_shape, + stride: stride, + }); // store await arr.store.set(chunk_path, encoded_chunk_data); });