Skip to content

Commit

Permalink
Serialization and deserialization to file, new serialization formats (#…
Browse files Browse the repository at this point in the history
…53)

Add the ability to serialize a bitmap to a file and deserialize it from a file directly with the serializeFileAsync and deserializeFileAsync methods - those two are faster than loading a file into memory and deserializing/serializing in memory, and are fully asynchronous. They use mmap if possible.
Adds the ability to serialize to a file in text formats, and to deserialize from a buffer or from a file in text formats: newline_separated_values, comma_separated_values, tab_separated_values, json_array
Adds the ability to serialize and deserialize a binary array of little endian uint32_t values
Remove COW being enabled by default: it could cause issues with asynchronous operations and multithreading - a flag will be provided in a new version
  • Loading branch information
SalvatorePreviti authored May 17, 2023
1 parent 1b476d8 commit 20bc4ff
Show file tree
Hide file tree
Showing 23 changed files with 2,843 additions and 700 deletions.
78 changes: 9 additions & 69 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,84 +77,24 @@
},
"files.associations": {
"__bit_reference": "cpp",
"__bits": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__nullptr": "cpp",
"__split_buffer": "cpp",
"__string": "cpp",
"__threading_support": "cpp",
"__tree": "cpp",
"__tuple": "cpp",
"array": "cpp",
"atomic": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"chrono": "cpp",
"cinttypes": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"exception": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"ios": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"__memory": "cpp",
"limits": "cpp",
"locale": "cpp",
"map": "cpp",
"memory": "cpp",
"mutex": "cpp",
"new": "cpp",
"numeric": "cpp",
"optional": "cpp",
"ostream": "cpp",
"queue": "cpp",
"random": "cpp",
"ratio": "cpp",
"set": "cpp",
"sstream": "cpp",
"stack": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"string": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"typeinfo": "cpp",
"unordered_map": "cpp",
"variant": "cpp",
"vector": "cpp",
"algorithm": "cpp",
"roaring.c": "cpp",
"isadetection.c": "cpp",
"array_util.c": "cpp",
"bitset.c": "cpp",
"bitset_util.c": "cpp",
"containers.c": "cpp",
"__verbose_abort": "cpp",
"bit": "cpp",
"filesystem": "cpp"
"chrono": "cpp",
"filesystem": "cpp",
"random": "cpp",
"__config": "cpp",
"__nullptr": "cpp",
"atomic": "cpp",
"locale": "cpp",
"cstddef": "cpp"
}
}
175 changes: 165 additions & 10 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,83 @@ export enum SerializationFormat {
*
*/
unsafe_frozen_croaring = "unsafe_frozen_croaring",

/**
* A plain binary array of 32 bits integers in little endian format. 4 bytes per value.
*/
uint32_array = "uint32_array",
}

/**
 * Formats that can be used when serializing a bitmap directly to a file.
 *
 * Contains the binary formats (croaring, portable, uint32_array, unsafe_frozen_croaring)
 * and the text formats (comma/tab/newline separated values, JSON array).
 * Every member's value is equal to its own name.
 */
export enum FileSerializationFormat {
  /**
   * Stable Optimized non portable C/C++ format. Used by croaring. Can be smaller than the portable format.
   */
  croaring = "croaring",

  /**
   * Stable Portable Java and Go format.
   */
  portable = "portable",

  /**
   * A plain binary array of 32 bits integers in little endian format. 4 bytes per value.
   */
  uint32_array = "uint32_array",

  /**
   * Non portable C/C++ frozen format.
   * Is considered unsafe and unstable because the format might change at any new version.
   * Can be useful for temporary storage or for sending data over the network between similar machines.
   * If the content is corrupted when deserialized or when a frozen view is created, the behavior is undefined!
   * The application may crash, buffer overrun, could be a vector of attack!
   *
   * When this option is used in the serialize function, the new returned buffer (if no buffer was provided) will be aligned to a 32 bytes boundary.
   * This is required to create a frozen view with the method unsafeFrozenView.
   */
  unsafe_frozen_croaring = "unsafe_frozen_croaring",

  /**
   * Comma separated values, all values are in decimal and in one line without spaces or other characters.
   */
  comma_separated_values = "comma_separated_values",

  /**
   * Tab "\t" separated values, all values are in decimal and in one line without other characters.
   */
  tab_separated_values = "tab_separated_values",

  /**
   * Newline (\n) separated values, all values are in decimal and one per line with a terminating newline.
   */
  newline_separated_values = "newline_separated_values",

  /**
   * A JSON file in the format "[1,2,3,4...]"
   */
  json_array = "json_array",
}

/**
 * Every value accepted as the "format" argument when serializing to a buffer:
 * a SerializationFormat enum member, the equivalent string literal, or a boolean
 * (true means "portable", false means "croaring").
 */
export type SerializationFormatType =
  | boolean
  | SerializationFormat
  | "uint32_array"
  | "unsafe_frozen_croaring"
  | "portable"
  | "croaring";

/**
 * Every value accepted as the "format" argument when serializing to a file:
 * anything accepted for buffer serialization, a FileSerializationFormat enum member,
 * or the string literal of one of the text formats.
 */
export type FileSerializationFormatType =
  | FileSerializationFormat
  | SerializationFormatType
  | "json_array"
  | "newline_separated_values"
  | "tab_separated_values"
  | "comma_separated_values";

/**
 * Format values that are valid for both serialization and deserialization:
 * the intersection of SerializationFormatType and DeserializationFormatType.
 */
export type SerializationDeserializationFormatType = SerializationFormatType & DeserializationFormatType;

/**
 * Format values that are valid for both file serialization and file deserialization:
 * the intersection of FileSerializationFormatType and FileDeserializationFormatType.
 */
export type FileSerializationDeserializationFormatType = FileSerializationFormatType & FileDeserializationFormatType;

export enum DeserializationFormat {
/** Stable Optimized non portable C/C++ format. Used by croaring. Can be smaller than the portable format. */
croaring = "croaring",
Expand All @@ -190,16 +258,69 @@ export enum DeserializationFormat {
* The application may crash, buffer overrun, could be a vector of attack!
*/
unsafe_frozen_portable = "unsafe_frozen_portable",

/**
* A plain binary array of 32 bits integers in little endian format. 4 bytes per value.
*/
uint32_array = "uint32_array",

comma_separated_values = "comma_separated_values",
tab_separated_values = "tab_separated_values",
newline_separated_values = "newline_separated_values",
json_array = "json_array",
}

/**
 * Every value accepted as the "format" argument when deserializing from a buffer:
 * a SerializationFormat or DeserializationFormat enum member, the equivalent string literal,
 * or a boolean (true means "portable", false means "croaring").
 */
export type DeserializationFormatType =
  | boolean
  | DeserializationFormat
  | SerializationFormat
  | "json_array"
  | "newline_separated_values"
  | "tab_separated_values"
  | "comma_separated_values"
  | "uint32_array"
  | "unsafe_frozen_portable"
  | "unsafe_frozen_croaring"
  | "portable"
  | "croaring";

/**
 * Formats that can be used when deserializing a bitmap directly from a file.
 * Every member's value is equal to its own name.
 */
export enum FileDeserializationFormat {
  /**
   * Stable, optimized, non portable C/C++ format used by croaring.
   * Can be smaller than the portable format.
   */
  croaring = "croaring",

  /**
   * Stable portable format, compatible with Java and Go.
   */
  portable = "portable",

  /**
   * Non portable C/C++ frozen format.
   *
   * Unsafe and unstable: the format might change at any new version.
   * Can be useful for temporary storage or for sending data over the network between similar machines.
   * If the content is corrupted when loaded, or the buffer is modified when a frozen view is created,
   * the behavior is undefined! The application may crash, buffer overrun, could be a vector of attack!
   */
  unsafe_frozen_croaring = "unsafe_frozen_croaring",

  /**
   * Portable version of the frozen view, compatible with Go and Java.
   *
   * Unsafe and unstable: the format might change at any new version.
   * Can be useful for temporary storage or for sending data over the network between similar machines.
   * If the content is corrupted when loaded, or the buffer is modified when a frozen view is created,
   * the behavior is undefined! The application may crash, buffer overrun, could be a vector of attack!
   */
  unsafe_frozen_portable = "unsafe_frozen_portable",

  /**
   * Plain binary array of 32 bits integers, little endian, 4 bytes per value.
   */
  uint32_array = "uint32_array",

  /** Text format: decimal values separated by commas. */
  comma_separated_values = "comma_separated_values",

  /** Text format: decimal values separated by tabs. */
  tab_separated_values = "tab_separated_values",

  /** Text format: decimal values separated by newlines. */
  newline_separated_values = "newline_separated_values",

  /** Text format: a JSON array of values. */
  json_array = "json_array",
}

/**
 * Every value accepted as the "format" argument when deserializing from a file:
 * anything accepted for buffer deserialization, or a FileDeserializationFormat enum member.
 */
export type FileDeserializationFormatType = DeserializationFormatType | FileDeserializationFormat;

export enum FrozenViewFormat {
/**
* Non portable C/C++ frozen format.
Expand Down Expand Up @@ -1006,6 +1127,18 @@ export interface ReadonlyRoaringBitmap32 extends ReadonlySet<number> {
format: SerializationFormatType,
): Promise<Buffer>;

/**
 * Serializes the bitmap into a file, asynchronously.
 * The bitmap will be temporarily frozen until the operation completes.
 *
 * This is faster and consumes less memory than serializing to a Buffer and then writing that Buffer to a file:
 * everything runs in its own thread, and internally it uses memory mapped files (when possible) and skips all the JS overhead.
 *
 * @param {string} filePath The path of the file to write.
 * @param {FileSerializationFormatType} format One of the FileSerializationFormat enum values, or a boolean value: if false, optimized C/C++ format is used. If true, Java and Go portable format is used.
 * @returns {Promise<void>} A promise that resolves when the operation completes.
 * @memberof ReadonlyRoaringBitmap32
 */
serializeFileAsync(filePath: string, format: FileSerializationFormatType): Promise<void>;

/**
* Returns a new ReadonlyRoaringBitmap32 that is a copy of this bitmap, same as new ReadonlyRoaringBitmap32(copy)
*
Expand Down Expand Up @@ -1174,7 +1307,6 @@ export interface RoaringBitmap32 extends ReadonlyRoaringBitmap32, Set<number> {
* Overwrite the content of this bitmap copying it from an Iterable or another RoaringBitmap32.
*
* Is faster to pass a Uint32Array instance instead of an array or an iterable.
*
* Is even faster if a RoaringBitmap32 instance is used (it performs a simple copy).
*
* @param {Iterable<number>} values The new values or a RoaringBitmap32 instance.
Expand Down Expand Up @@ -1476,9 +1608,17 @@ export class RoaringBitmap32 {

public readonly SerializationFormat: typeof SerializationFormat;

public static readonly DeserializationFormat: typeof SerializationFormat;
public static readonly FileSerializationFormat: typeof FileSerializationFormat;

public readonly FileSerializationFormat: typeof FileSerializationFormat;

public static readonly FileDeserializationFormat: typeof FileDeserializationFormat;

public readonly DeserializationFormat: typeof SerializationFormat;
public readonly FileDeserializationFormat: typeof FileDeserializationFormat;

public static readonly DeserializationFormat: typeof DeserializationFormat;

public readonly DeserializationFormat: typeof DeserializationFormat;

public static readonly FrozenViewFormat: typeof FrozenViewFormat;

Expand Down Expand Up @@ -1735,12 +1875,10 @@ export class RoaringBitmap32 {
*
* Returns a Promise that resolves to a new RoaringBitmap32 instance.
*
* Setting the portable flag to false enable a custom format that can save space compared to the portable format (e.g., for very sparse bitmaps).
* The portable version is meant to be compatible with Java and Go versions.
* The croaring version is compatible with the C version, it can be smaller than the portable version.
* When a frozen format is used, the buffer will be copied and the bitmap will be frozen.
*
* NOTE: portable argument was optional before, now is required and an Error is thrown if the portable flag is not passed.
*
* @static
* @param {Uint8Array | Uint8ClampedArray | Int8Array | ArrayBuffer| SharedArrayBuffer | null | undefined} serialized An Uint8Array or a node Buffer that contains the serialized data.
* @param {DeserializationFormatType} format The format of the serialized data. true means "portable". false means "croaring".
Expand All @@ -1758,12 +1896,10 @@ export class RoaringBitmap32 {
*
* When deserialization is completed or failed, the given callback will be executed.
*
* Setting the portable flag to false enable a custom format that can save space compared to the portable format (e.g., for very sparse bitmaps).
* The portable version is meant to be compatible with Java and Go versions.
* The croaring version is compatible with the C version, it can be smaller than the portable version.
* When a frozen format is used, the buffer will be copied and the bitmap will be frozen.
*
* NOTE: portable argument was optional before, now is required and an Error is thrown if the portable flag is not passed.
*
* @static
* @param {Uint8Array | Uint8ClampedArray | Int8Array | ArrayBuffer| SharedArrayBuffer | null | undefined} serialized An Uint8Array or a node Buffer that contains the.
* @param {DeserializationFormatType} format The format of the serialized data. true means "portable". false means "croaring".
Expand All @@ -1777,6 +1913,25 @@ export class RoaringBitmap32 {
callback: RoaringBitmap32Callback,
): void;

/**
 * Deserializes the bitmap from a file asynchronously.
 * Returns a Promise that resolves to a new RoaringBitmap32 instance.
 *
 * The portable version is meant to be compatible with Java and Go versions.
 * The croaring version is compatible with the C version, it can be smaller than the portable version.
 * When a frozen format is used, the buffer will be copied and the bitmap will be frozen.
 *
 * This is faster and consumes less memory than reading the file into a Buffer and then deserializing that Buffer:
 * everything runs in its own thread, and internally it uses memory mapped files (when possible) and skips all the JS overhead.
 *
 * @static
 * @param {string} filePath The path of the file to read.
 * @param {FileDeserializationFormatType} format The format of the serialized data. true means "portable". false means "croaring".
 * @returns {Promise<RoaringBitmap32>} A promise that resolves to a new RoaringBitmap32 instance.
 * @memberof RoaringBitmap32
 */
public static deserializeFileAsync(filePath: string, format: FileDeserializationFormatType): Promise<RoaringBitmap32>;

/**
*
* Deserializes many bitmaps from an array of Uint8Array or an array of Buffer asynchronously in multiple parallel threads.
Expand Down
Loading

0 comments on commit 20bc4ff

Please sign in to comment.