From 518ee54b17d1d3832fb8ef1bbd6ff0b61bd9b96d Mon Sep 17 00:00:00 2001 From: Harish Kumar Gangula Date: Thu, 24 Oct 2024 16:10:47 +0530 Subject: [PATCH 1/3] #I285: removed empty objects and arrays from sample data --- .../GenerateDataSchema/GenerateDataSchema.ts | 5 +- .../DataSchemaService.ts | 54 +++++++++++++------ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts b/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts index a37e5d13..11bcee7e 100644 --- a/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts +++ b/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts @@ -32,7 +32,7 @@ const dataSchema = async (req: Request, res: Response) => { const request = req.body.request const dataSchemaSpec = schemaGenerate(request.data, request.config) ResponseHandler.successResponse(req, res, { status: httpStatus.OK, data: dataSchemaSpec }); - + } const schemaGenerate = (sample: Map[], config: Record): any => { @@ -46,13 +46,14 @@ const schemaGenerate = (sample: Map[], config: Record) result.schema = removeFormats(result.schema) return result } else { - let schema = isBatch ? schemaInference.inferBatchSchema([]>sample, extractionKey) : schemaInference.inferSchema(sample); + let { schema, removedKeys } = isBatch ? schemaInference.inferBatchSchema([]>sample, extractionKey) : schemaInference.inferSchema(sample); schema = schemaArrayValidator.validate(schema) const schemaCardinalityAnalyser = new SchemaCardinalityAnalyser(sample, schema) rollupInfo = schemaCardinalityAnalyser.analyse() const result = process(schema, dataset) result.schema = removeNonIndexColumns(result.schema) result.schema = removeFormats(result.schema) + !_.isEmpty(removedKeys) && _.set(result, "removedKeys", removedKeys) return result } } diff --git a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts index 67bde9e7..78a8556b 100644 --- a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts +++ b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts @@ -5,17 +5,17 @@ import moment from "moment"; import { SchemaGenerationException } from "../../exceptions/SchemaGenerationException"; const DATE_FORMATS = [ - "MM/DD/YYYY","DD/MM/YYYY", "YYYY-MM-DD", "YYYY-DD-MM", "YYYY/MM/DD", + "MM/DD/YYYY", "DD/MM/YYYY", "YYYY-MM-DD", "YYYY-DD-MM", "YYYY/MM/DD", "DD-MM-YYYY", "MM-DD-YYYY", "MM-DD-YYYY HH:mm:ss", "YYYY/MM/DD HH:mm:ss", - "YYYY-MM-DD HH:mm:ss", "YYYY-DD-MM HH:mm:ss", "DD/MM/YYYY HH:mm:ss", - "DD-MM-YYYY HH:mm:ss", "MM-DD-YYYY HH:mm:ss.SSS", "YYYY-MM-DD HH:mm:ss.SSS", - "YYYY-DD-MM HH:mm:ss.SSS", "YYYY/MM/DD HH:mm:ss.SSS", "DD/MM/YYYY HH:mm:ss.SSS", - "DD-MM-YYYY HH:mm:ss.SSS", "DD-MM-YYYYTHH:mm:ss.SSSZ", "YYYY-MM-DDTHH:mm:ss.SSSZ", + "YYYY-MM-DD HH:mm:ss", "YYYY-DD-MM HH:mm:ss", "DD/MM/YYYY HH:mm:ss", + "DD-MM-YYYY HH:mm:ss", "MM-DD-YYYY HH:mm:ss.SSS", "YYYY-MM-DD HH:mm:ss.SSS", + "YYYY-DD-MM HH:mm:ss.SSS", "YYYY/MM/DD HH:mm:ss.SSS", "DD/MM/YYYY HH:mm:ss.SSS", + "DD-MM-YYYY HH:mm:ss.SSS", "DD-MM-YYYYTHH:mm:ss.SSSZ", "YYYY-MM-DDTHH:mm:ss.SSSZ", "YYYY-DD-MMTHH:mm:ss.SSSZ", "YYYY/MM/DDTHH:mm:ss.SSSZ", "DD/MM/YYYYTHH:mm:ss.SSSZ", "YYYY-DD-MMTHH:mm:ss.SSS", "YYYY/MM/DDTHH:mm:ss.SSS", "DD/MM/YYYYTHH:mm:ss.SSS", "MM-DD-YYYYTHH:mm:ss.SSSZ", "DD-MM-YYYYTHH:mm:ssZ", "YYYY-MM-DDTHH:mm:ssZ", - "YYYY-DD-MMTHH:mm:ssZ", "YYYY/MM/DDTHH:mm:ssZ", "DD/MM/YYYYTHH:mm:ssZ", "MM-DD-YYYYTHH:mm:ssZ", - "MM-DD-YYYYTHH:mm:ss", "DD-MM-YYYYTHH:mm:ss", "YYYY-MM-DDTHH:mm:ss", "YYYY-DD-MMTHH:mm:ss", + "YYYY-DD-MMTHH:mm:ssZ", "YYYY/MM/DDTHH:mm:ssZ", "DD/MM/YYYYTHH:mm:ssZ", "MM-DD-YYYYTHH:mm:ssZ", + "MM-DD-YYYYTHH:mm:ss", "DD-MM-YYYYTHH:mm:ss", "YYYY-MM-DDTHH:mm:ss", "YYYY-DD-MMTHH:mm:ss", "YYYY/MM/DDTHH:mm:ss", "DD/MM/YYYYTHH:mm:ss", "DD-MM-YYYY HH:mm:ss.SSSZ", "YYYY-MM-DD HH:mm:ss.SSSZ", "YYYY-DD-MM HH:mm:ss.SSSZ", "YYYY/MM/DD HH:mm:ss.SSSZ", "DD/MM/YYYY HH:mm:ss.SSSZ", "MM-DD-YYYY HH:mm:ss.SSSZ", "DD-MM-YYYY HH:mm:ssZ", "YYYY-MM-DD HH:mm:ssZ", "YYYY-DD-MM HH:mm:ssZ", @@ -38,16 +38,24 @@ const DATE_FORMATS = [ export class SchemaInference { public inferSchema(sample: any) { - const schema = _.map(sample, (value): any => this.validateEpoch(inferSchema(value).toJSONSchema({ includeSchema: true }), value, "properties")) - return schema + const removedAllKeys: any[] = [] + const schema = _.map(sample, (value): any => { + const { cleanedData, removedKeys } = this.removeEmpty(value) + removedAllKeys.push(...removedKeys) + return this.validateEpoch(inferSchema(cleanedData).toJSONSchema({ includeSchema: true }), cleanedData, "properties") + }) + return { schema, removedKeys: removedAllKeys } } public inferBatchSchema(sample: Map[], extractionKey: string) { - return _.flatMap(sample, (value) => { + const removedAllKeys: any[] = [] + const schema = _.flatMap(sample, (value) => { if (extractionKey) { const extracted = _.get(value, extractionKey); if (extracted) { - return this.inferSchema(extracted); + const { schema, removedKeys } = this.inferSchema(extracted); + removedAllKeys.push(...removedKeys) + return schema } else { throw new SchemaGenerationException("Unable to extract the batch data.", httpStatus.BAD_REQUEST); } @@ -55,6 +63,22 @@ export class SchemaInference { throw new SchemaGenerationException("Extraction key not found.", httpStatus.BAD_REQUEST); } }) + return { schema, removedKeys: removedAllKeys } + } + // Only removes empty object and array at all the levels + private removeEmpty(data: any, parentKey = '', removedKeys: any[] = []) { + Object.keys(data).forEach((key) => { + const value = data[key]; + const fullKey = parentKey ? `${parentKey}.${key}` : key; + if (typeof value === 'object' && value !== null) { + this.removeEmpty(value, fullKey, removedKeys); + if (_.isEmpty(value)) { + delete data[key]; + removedKeys.push({ "key": fullKey, value }); + } + } + }); + return { cleanedData: data, removedKeys }; } private validateEpoch(schema: any, sample: any, path: any) { @@ -67,7 +91,7 @@ export class SchemaInference { if (isValidTimestamp) { _.set(schema, `${path}.${key}.format`, type) } - else if(format && ["date-time", "time", "date"].includes(format) && !isValidTimestamp) { + else if (format && ["date-time", "time", "date"].includes(format) && !isValidTimestamp) { _.unset(schema, `${path}.${key}.format`) } }); @@ -79,7 +103,7 @@ export class SchemaInference { const epochRegex = /^\d+$/ig; switch (dataType) { case "string": - if(epochRegex.test(value)){ + if (epochRegex.test(value)) { const parsedValue = parseInt(value, 10); // Timestamp should be greater than Jan 01 2000 00:00:00 UTC/GMT in seconds return { @@ -93,12 +117,12 @@ export class SchemaInference { case "number": // Timestamp should be greater than Jan 01 2000 00:00:00 UTC/GMT in seconds return { - isValidTimestamp: value >= 946684800 && moment(value).isValid(), + isValidTimestamp: value >= 946684800 && moment(value).isValid(), type: "epoch" }; default: return { - isValidTimestamp: false, + isValidTimestamp: false, type: "" }; } From 3827e9f8322a6340cb39c763d5ca0e21a5eb73a8 Mon Sep 17 00:00:00 2001 From: Harish Kumar Gangula Date: Thu, 24 Oct 2024 16:22:27 +0530 Subject: [PATCH 2/3] Fix code scanning alert no. 98: Loop bound injection Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .../src/services/SchemaGenerateService/DataSchemaService.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts index 78a8556b..e648c2ec 100644 --- a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts +++ b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts @@ -38,6 +38,9 @@ const DATE_FORMATS = [ export class SchemaInference { public inferSchema(sample: any) { + if (!Array.isArray(sample)) { + throw new SchemaGenerationException("Invalid input: sample must be an array.", httpStatus.BAD_REQUEST); + } const removedAllKeys: any[] = [] const schema = _.map(sample, (value): any => { const { cleanedData, removedKeys } = this.removeEmpty(value) @@ -48,6 +51,9 @@ export class SchemaInference { } public inferBatchSchema(sample: Map[], extractionKey: string) { + if (!Array.isArray(sample)) { + throw new SchemaGenerationException("Invalid input: sample must be an array.", httpStatus.BAD_REQUEST); + } const removedAllKeys: any[] = [] const schema = _.flatMap(sample, (value) => { if (extractionKey) { From 856e6d72ad8c9f57255c075172b2e7537c1c03f7 Mon Sep 17 00:00:00 2001 From: Harish Kumar Gangula Date: Thu, 24 Oct 2024 16:45:53 +0530 Subject: [PATCH 3/3] #I285: lint issues fixed --- .../src/services/SchemaGenerateService/DataSchemaService.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts index e648c2ec..aec2f8d3 100644 --- a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts +++ b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts @@ -72,11 +72,11 @@ export class SchemaInference { return { schema, removedKeys: removedAllKeys } } // Only removes empty object and array at all the levels - private removeEmpty(data: any, parentKey = '', removedKeys: any[] = []) { + private removeEmpty(data: any, parentKey = "", removedKeys: any[] = []) { Object.keys(data).forEach((key) => { const value = data[key]; const fullKey = parentKey ? `${parentKey}.${key}` : key; - if (typeof value === 'object' && value !== null) { + if (typeof value === "object" && value !== null) { this.removeEmpty(value, fullKey, removedKeys); if (_.isEmpty(value)) { delete data[key];