From a902b460fad38730855358fa471f4567ac862e73 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 29 Mar 2024 11:48:41 -0400 Subject: [PATCH 1/4] GH-40891: [JS] Store Dates as TimestampMillisecond --- js/src/factories.ts | 4 ++-- js/src/type.ts | 2 +- js/src/visitor/get.ts | 6 +++++- js/test/generate-test-data.ts | 5 ++++- js/test/unit/vector/date-vector-tests.ts | 19 ++++++++++++++----- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/js/src/factories.ts b/js/src/factories.ts index aa54498c50bc0..657ae1b95ab92 100644 --- a/js/src/factories.ts +++ b/js/src/factories.ts @@ -65,7 +65,7 @@ export function makeBuilder(option export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector; export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector; export function vectorFromArray = dtypes.Dictionary>(values: readonly (null | undefined | string)[], type?: T): Vector; -export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; +export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type: T): Vector; export function vectorFromArray(values: readonly (null | undefined | bigint)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type?: T): Vector; @@ -145,7 +145,7 @@ function inferType(value: readonly unknown[]): dtypes.DataType { } else if (booleansCount + nullsCount === value.length) { return new dtypes.Bool; } else if (datesCount + nullsCount === value.length) { - return new dtypes.DateMillisecond; + return new dtypes.TimestampMillisecond; } else if (arraysCount + nullsCount === value.length) { const array = value as Array[]; const childType = inferType(array[array.findIndex((ary) => ary != null)]); diff --git a/js/src/type.ts b/js/src/type.ts index ae3aefa025999..2bb330c164bc5 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -406,7 +406,7 @@ type Timestamps = Type.Timestamp | Type.TimestampSecond | Type.TimestampMillisec /** @ignore */ interface Timestamp_ extends DataType { TArray: Int32Array; - TValue: number; + TValue: number | Date; ArrayType: TypedArrayConstructor; } diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 3ab3bcb68c386..425accd19382e 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -180,7 +180,11 @@ const getDate = (data: Data, index: number): T['TValue'] => /** @ignore */ const getTimestampSecond = ({ values }: Data, index: number): T['TValue'] => 1000 * epochMillisecondsLongToMs(values, index * 2); /** @ignore */ -const getTimestampMillisecond = ({ values }: Data, index: number): T['TValue'] => epochMillisecondsLongToMs(values, index * 2); +const getTimestampMillisecond = ({ values, type }: Data, index: number): T['TValue'] => { + const value = epochMillisecondsLongToMs(values, index * 2); + // js dates are timezone agnostic so we only convert to date if there is no timezome + return type.timezone ? value : epochMillisecondsToDate(value); +}; /** @ignore */ const getTimestampMicrosecond = ({ values }: Data, index: number): T['TValue'] => epochMicrosecondsLongToMs(values, index * 2); /** @ignore */ diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 8e6e47de836eb..559283680ad67 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -415,7 +415,10 @@ function generateTimestamp(this: TestDataVectorGenerator, t type.unit === TimeUnit.MICROSECOND ? 1000000 : type.unit === TimeUnit.MILLISECOND ? 1000 : 1; const data = createTimestamp(length, nullBitmap, multiple, values); - return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, data })]) }; + return { + values: () => type.unit === TimeUnit.MILLISECOND && !type.timezone ? values.map((x) => x == null ? null : new Date(x)) : values, + vector: new Vector([makeData({ type, length, nullCount, nullBitmap, data })]) + }; } function generateTime(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { diff --git a/js/test/unit/vector/date-vector-tests.ts b/js/test/unit/vector/date-vector-tests.ts index f8b4c1c7976d2..61d50d48715b7 100644 --- a/js/test/unit/vector/date-vector-tests.ts +++ b/js/test/unit/vector/date-vector-tests.ts @@ -15,10 +15,19 @@ // specific language governing permissions and limitations // under the License. -import { DateDay, DateMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; +import { DateDay, DateMillisecond, TimestampMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; + +describe(`TimeStampVector`, () => { + test(`Dates are stored in TimestampMillisecond`, () => { + const date = new Date('2023-02-01T12:34:56Z'); + const vec = vectorFromArray([date]); + expect(vec.type).toBeInstanceOf(TimestampMillisecond); + expect(vec.get(0)).toBeInstanceOf(Date); + }); +}); describe(`DateVector`, () => { - it('returns days since the epoch as correct JS Dates', () => { + test(`returns days since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis32(); const date32 = table.getChildAt(0)!; @@ -28,7 +37,7 @@ describe(`DateVector`, () => { } }); - it('returns millisecond longs since the epoch as correct JS Dates', () => { + test(`returns millisecond longs since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis64(); const date64 = table.getChildAt(1)!; @@ -38,9 +47,9 @@ describe(`DateVector`, () => { } }); - it('returns the same date that was in the vector', () => { + test(`returns the same date that was in the vector`, () => { const dates = [new Date(1950, 1, 0)]; - const vec = vectorFromArray(dates); + const vec = vectorFromArray(dates, new DateMillisecond()); for (const date of vec) { expect(date).toEqual(dates.shift()); } From 66fb984edf3d1b1ecba3f76eecbce8b48f719c27 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 29 Mar 2024 11:59:33 -0400 Subject: [PATCH 2/4] Fix type --- js/src/visitor/set.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index 5dc42283c36f0..f828b4d3685c0 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -178,13 +178,13 @@ export const setDate = (data: Data, index: number, value: T[ }; /** @ignore */ -export const setTimestampSecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, value / 1000); +export const setTimestampSecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, (value as number) / 1000); /** @ignore */ -export const setTimestampMillisecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, value); +export const setTimestampMillisecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, value instanceof Date ? value.getTime() : value); /** @ignore */ -export const setTimestampMicrosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMicrosecondsLong(values, index * 2, value); +export const setTimestampMicrosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMicrosecondsLong(values, index * 2, value as number); /** @ignore */ -export const setTimestampNanosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToNanosecondsLong(values, index * 2, value); +export const setTimestampNanosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToNanosecondsLong(values, index * 2, value as number); /* istanbul ignore next */ /** @ignore */ export const setTimestamp = (data: Data, index: number, value: T['TValue']): void => { From 45a8a87037c5dd9fd4d9d307e8c9091be11775e7 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 1 Apr 2024 20:14:35 -0400 Subject: [PATCH 3/4] Don't convert timestamps to dates --- js/src/type.ts | 2 +- js/src/visitor/get.ts | 6 +----- js/src/visitor/set.ts | 8 ++++---- js/test/generate-test-data.ts | 5 +---- js/test/unit/vector/date-vector-tests.ts | 4 ++-- 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/js/src/type.ts b/js/src/type.ts index 2bb330c164bc5..ae3aefa025999 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -406,7 +406,7 @@ type Timestamps = Type.Timestamp | Type.TimestampSecond | Type.TimestampMillisec /** @ignore */ interface Timestamp_ extends DataType { TArray: Int32Array; - TValue: number | Date; + TValue: number; ArrayType: TypedArrayConstructor; } diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 425accd19382e..3ab3bcb68c386 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -180,11 +180,7 @@ const getDate = (data: Data, index: number): T['TValue'] => /** @ignore */ const getTimestampSecond = ({ values }: Data, index: number): T['TValue'] => 1000 * epochMillisecondsLongToMs(values, index * 2); /** @ignore */ -const getTimestampMillisecond = ({ values, type }: Data, index: number): T['TValue'] => { - const value = epochMillisecondsLongToMs(values, index * 2); - // js dates are timezone agnostic so we only convert to date if there is no timezome - return type.timezone ? value : epochMillisecondsToDate(value); -}; +const getTimestampMillisecond = ({ values }: Data, index: number): T['TValue'] => epochMillisecondsLongToMs(values, index * 2); /** @ignore */ const getTimestampMicrosecond = ({ values }: Data, index: number): T['TValue'] => epochMicrosecondsLongToMs(values, index * 2); /** @ignore */ diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index f828b4d3685c0..5dc42283c36f0 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -178,13 +178,13 @@ export const setDate = (data: Data, index: number, value: T[ }; /** @ignore */ -export const setTimestampSecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, (value as number) / 1000); +export const setTimestampSecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, value / 1000); /** @ignore */ -export const setTimestampMillisecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, value instanceof Date ? value.getTime() : value); +export const setTimestampMillisecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMillisecondsLong(values, index * 2, value); /** @ignore */ -export const setTimestampMicrosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMicrosecondsLong(values, index * 2, value as number); +export const setTimestampMicrosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToMicrosecondsLong(values, index * 2, value); /** @ignore */ -export const setTimestampNanosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToNanosecondsLong(values, index * 2, value as number); +export const setTimestampNanosecond = ({ values }: Data, index: number, value: T['TValue']): void => setEpochMsToNanosecondsLong(values, index * 2, value); /* istanbul ignore next */ /** @ignore */ export const setTimestamp = (data: Data, index: number, value: T['TValue']): void => { diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 559283680ad67..8e6e47de836eb 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -415,10 +415,7 @@ function generateTimestamp(this: TestDataVectorGenerator, t type.unit === TimeUnit.MICROSECOND ? 1000000 : type.unit === TimeUnit.MILLISECOND ? 1000 : 1; const data = createTimestamp(length, nullBitmap, multiple, values); - return { - values: () => type.unit === TimeUnit.MILLISECOND && !type.timezone ? values.map((x) => x == null ? null : new Date(x)) : values, - vector: new Vector([makeData({ type, length, nullCount, nullBitmap, data })]) - }; + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, data })]) }; } function generateTime(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { diff --git a/js/test/unit/vector/date-vector-tests.ts b/js/test/unit/vector/date-vector-tests.ts index 61d50d48715b7..e5cd49933eac5 100644 --- a/js/test/unit/vector/date-vector-tests.ts +++ b/js/test/unit/vector/date-vector-tests.ts @@ -17,12 +17,12 @@ import { DateDay, DateMillisecond, TimestampMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; -describe(`TimeStampVector`, () => { +describe(`TimestampVector`, () => { test(`Dates are stored in TimestampMillisecond`, () => { const date = new Date('2023-02-01T12:34:56Z'); const vec = vectorFromArray([date]); expect(vec.type).toBeInstanceOf(TimestampMillisecond); - expect(vec.get(0)).toBeInstanceOf(Date); + expect(vec.get(0)).toBe(date.valueOf()); }); }); From 55ef9ff516c182587c9e4a25418a3e2eb40cda95 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Tue, 2 Apr 2024 19:48:14 -0400 Subject: [PATCH 4/4] Describe why DateMillisecond is bad --- js/src/type.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/js/src/type.ts b/js/src/type.ts index ae3aefa025999..a42552d65ad27 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -349,7 +349,19 @@ export class Date_ extends DataType { /** @ignore */ export class DateDay extends Date_ { constructor() { super(DateUnit.DAY); } } -/** @ignore */ +/** + * A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) in milliseconds. + * According to the specification, this should be treated as the number of days, in milliseconds, since the UNIX epoch. + * Therefore, values must be evenly divisible by `86_400_000` (the number of milliseconds in a standard day). + * + * Practically, validation that values of this type are evenly divisible by `86_400_000` is not enforced by this library + * for performance and usability reasons. + * + * Users should prefer to use {@link DateDay} to cleanly represent the number of days. For JS dates, + * {@link TimestampMillisecond} is the preferred type. + * + * @ignore + */ export class DateMillisecond extends Date_ { constructor() { super(DateUnit.MILLISECOND); } } /** @ignore */