diff --git a/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts b/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts index a37e5d13..11bcee7e 100644 --- a/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts +++ b/api-service/src/controllers/GenerateDataSchema/GenerateDataSchema.ts @@ -32,7 +32,7 @@ const dataSchema = async (req: Request, res: Response) => { const request = req.body.request const dataSchemaSpec = schemaGenerate(request.data, request.config) ResponseHandler.successResponse(req, res, { status: httpStatus.OK, data: dataSchemaSpec }); - + } const schemaGenerate = (sample: Map[], config: Record): any => { @@ -46,13 +46,14 @@ const schemaGenerate = (sample: Map[], config: Record) result.schema = removeFormats(result.schema) return result } else { - let schema = isBatch ? schemaInference.inferBatchSchema([]>sample, extractionKey) : schemaInference.inferSchema(sample); + let { schema, removedKeys } = isBatch ? schemaInference.inferBatchSchema([]>sample, extractionKey) : schemaInference.inferSchema(sample); schema = schemaArrayValidator.validate(schema) const schemaCardinalityAnalyser = new SchemaCardinalityAnalyser(sample, schema) rollupInfo = schemaCardinalityAnalyser.analyse() const result = process(schema, dataset) result.schema = removeNonIndexColumns(result.schema) result.schema = removeFormats(result.schema) + !_.isEmpty(removedKeys) && _.set(result, "removedKeys", removedKeys) return result } } diff --git a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts index 67bde9e7..aec2f8d3 100644 --- a/api-service/src/services/SchemaGenerateService/DataSchemaService.ts +++ b/api-service/src/services/SchemaGenerateService/DataSchemaService.ts @@ -5,17 +5,17 @@ import moment from "moment"; import { SchemaGenerationException } from "../../exceptions/SchemaGenerationException"; const DATE_FORMATS = [ - "MM/DD/YYYY","DD/MM/YYYY", "YYYY-MM-DD", "YYYY-DD-MM", "YYYY/MM/DD", + "MM/DD/YYYY", "DD/MM/YYYY", "YYYY-MM-DD", "YYYY-DD-MM", "YYYY/MM/DD", "DD-MM-YYYY", "MM-DD-YYYY", "MM-DD-YYYY HH:mm:ss", "YYYY/MM/DD HH:mm:ss", - "YYYY-MM-DD HH:mm:ss", "YYYY-DD-MM HH:mm:ss", "DD/MM/YYYY HH:mm:ss", - "DD-MM-YYYY HH:mm:ss", "MM-DD-YYYY HH:mm:ss.SSS", "YYYY-MM-DD HH:mm:ss.SSS", - "YYYY-DD-MM HH:mm:ss.SSS", "YYYY/MM/DD HH:mm:ss.SSS", "DD/MM/YYYY HH:mm:ss.SSS", - "DD-MM-YYYY HH:mm:ss.SSS", "DD-MM-YYYYTHH:mm:ss.SSSZ", "YYYY-MM-DDTHH:mm:ss.SSSZ", + "YYYY-MM-DD HH:mm:ss", "YYYY-DD-MM HH:mm:ss", "DD/MM/YYYY HH:mm:ss", + "DD-MM-YYYY HH:mm:ss", "MM-DD-YYYY HH:mm:ss.SSS", "YYYY-MM-DD HH:mm:ss.SSS", + "YYYY-DD-MM HH:mm:ss.SSS", "YYYY/MM/DD HH:mm:ss.SSS", "DD/MM/YYYY HH:mm:ss.SSS", + "DD-MM-YYYY HH:mm:ss.SSS", "DD-MM-YYYYTHH:mm:ss.SSSZ", "YYYY-MM-DDTHH:mm:ss.SSSZ", "YYYY-DD-MMTHH:mm:ss.SSSZ", "YYYY/MM/DDTHH:mm:ss.SSSZ", "DD/MM/YYYYTHH:mm:ss.SSSZ", "YYYY-DD-MMTHH:mm:ss.SSS", "YYYY/MM/DDTHH:mm:ss.SSS", "DD/MM/YYYYTHH:mm:ss.SSS", "MM-DD-YYYYTHH:mm:ss.SSSZ", "DD-MM-YYYYTHH:mm:ssZ", "YYYY-MM-DDTHH:mm:ssZ", - "YYYY-DD-MMTHH:mm:ssZ", "YYYY/MM/DDTHH:mm:ssZ", "DD/MM/YYYYTHH:mm:ssZ", "MM-DD-YYYYTHH:mm:ssZ", - "MM-DD-YYYYTHH:mm:ss", "DD-MM-YYYYTHH:mm:ss", "YYYY-MM-DDTHH:mm:ss", "YYYY-DD-MMTHH:mm:ss", + "YYYY-DD-MMTHH:mm:ssZ", "YYYY/MM/DDTHH:mm:ssZ", "DD/MM/YYYYTHH:mm:ssZ", "MM-DD-YYYYTHH:mm:ssZ", + "MM-DD-YYYYTHH:mm:ss", "DD-MM-YYYYTHH:mm:ss", "YYYY-MM-DDTHH:mm:ss", "YYYY-DD-MMTHH:mm:ss", "YYYY/MM/DDTHH:mm:ss", "DD/MM/YYYYTHH:mm:ss", "DD-MM-YYYY HH:mm:ss.SSSZ", "YYYY-MM-DD HH:mm:ss.SSSZ", "YYYY-DD-MM HH:mm:ss.SSSZ", "YYYY/MM/DD HH:mm:ss.SSSZ", "DD/MM/YYYY HH:mm:ss.SSSZ", "MM-DD-YYYY HH:mm:ss.SSSZ", "DD-MM-YYYY HH:mm:ssZ", "YYYY-MM-DD HH:mm:ssZ", "YYYY-DD-MM HH:mm:ssZ", @@ -38,16 +38,30 @@ const DATE_FORMATS = [ export class SchemaInference { public inferSchema(sample: any) { - const schema = _.map(sample, (value): any => this.validateEpoch(inferSchema(value).toJSONSchema({ includeSchema: true }), value, "properties")) - return schema + if (!Array.isArray(sample)) { + throw new SchemaGenerationException("Invalid input: sample must be an array.", httpStatus.BAD_REQUEST); + } + const removedAllKeys: any[] = [] + const schema = _.map(sample, (value): any => { + const { cleanedData, removedKeys } = this.removeEmpty(value) + removedAllKeys.push(...removedKeys) + return this.validateEpoch(inferSchema(cleanedData).toJSONSchema({ includeSchema: true }), cleanedData, "properties") + }) + return { schema, removedKeys: removedAllKeys } } public inferBatchSchema(sample: Map[], extractionKey: string) { - return _.flatMap(sample, (value) => { + if (!Array.isArray(sample)) { + throw new SchemaGenerationException("Invalid input: sample must be an array.", httpStatus.BAD_REQUEST); + } + const removedAllKeys: any[] = [] + const schema = _.flatMap(sample, (value) => { if (extractionKey) { const extracted = _.get(value, extractionKey); if (extracted) { - return this.inferSchema(extracted); + const { schema, removedKeys } = this.inferSchema(extracted); + removedAllKeys.push(...removedKeys) + return schema } else { throw new SchemaGenerationException("Unable to extract the batch data.", httpStatus.BAD_REQUEST); } @@ -55,6 +69,22 @@ export class SchemaInference { throw new SchemaGenerationException("Extraction key not found.", httpStatus.BAD_REQUEST); } }) + return { schema, removedKeys: removedAllKeys } + } + // Only removes empty object and array at all the levels + private removeEmpty(data: any, parentKey = "", removedKeys: any[] = []) { + Object.keys(data).forEach((key) => { + const value = data[key]; + const fullKey = parentKey ? `${parentKey}.${key}` : key; + if (typeof value === "object" && value !== null) { + this.removeEmpty(value, fullKey, removedKeys); + if (_.isEmpty(value)) { + delete data[key]; + removedKeys.push({ "key": fullKey, value }); + } + } + }); + return { cleanedData: data, removedKeys }; } private validateEpoch(schema: any, sample: any, path: any) { @@ -67,7 +97,7 @@ export class SchemaInference { if (isValidTimestamp) { _.set(schema, `${path}.${key}.format`, type) } - else if(format && ["date-time", "time", "date"].includes(format) && !isValidTimestamp) { + else if (format && ["date-time", "time", "date"].includes(format) && !isValidTimestamp) { _.unset(schema, `${path}.${key}.format`) } }); @@ -79,7 +109,7 @@ export class SchemaInference { const epochRegex = /^\d+$/ig; switch (dataType) { case "string": - if(epochRegex.test(value)){ + if (epochRegex.test(value)) { const parsedValue = parseInt(value, 10); // Timestamp should be greater than Jan 01 2000 00:00:00 UTC/GMT in seconds return { @@ -93,12 +123,12 @@ export class SchemaInference { case "number": // Timestamp should be greater than Jan 01 2000 00:00:00 UTC/GMT in seconds return { - isValidTimestamp: value >= 946684800 && moment(value).isValid(), + isValidTimestamp: value >= 946684800 && moment(value).isValid(), type: "epoch" }; default: return { - isValidTimestamp: false, + isValidTimestamp: false, type: "" }; }