Skip to content

Commit

Permalink
#I285: removed empty objects and arrays from sample data (#261)
Browse files Browse the repository at this point in the history
* #I285: removed empty objects and arrays from sample data

* Fix code scanning alert no. 98: Loop bound injection

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

* #I285: lint issues fixed

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
  • Loading branch information
1 parent 3c4901f commit 5048efe
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const dataSchema = async (req: Request, res: Response) => {
const request = <DatasetSchemeRequest>req.body.request
const dataSchemaSpec = schemaGenerate(request.data, request.config)
ResponseHandler.successResponse(req, res, { status: httpStatus.OK, data: dataSchemaSpec });

}

const schemaGenerate = (sample: Map<string, any>[], config: Record<string, any>): any => {
Expand All @@ -46,13 +46,14 @@ const schemaGenerate = (sample: Map<string, any>[], config: Record<string, any>)
result.schema = removeFormats(result.schema)
return result
} else {
let schema = isBatch ? schemaInference.inferBatchSchema(<Map<string, any>[]>sample, extractionKey) : schemaInference.inferSchema(sample);
let { schema, removedKeys } = isBatch ? schemaInference.inferBatchSchema(<Map<string, any>[]>sample, extractionKey) : schemaInference.inferSchema(sample);
schema = schemaArrayValidator.validate(schema)
const schemaCardinalityAnalyser = new SchemaCardinalityAnalyser(sample, schema)
rollupInfo = schemaCardinalityAnalyser.analyse()
const result = process(schema, dataset)
result.schema = removeNonIndexColumns(result.schema)
result.schema = removeFormats(result.schema)
!_.isEmpty(removedKeys) && _.set(result, "removedKeys", removedKeys)
return result
}
}
Expand Down
60 changes: 45 additions & 15 deletions api-service/src/services/SchemaGenerateService/DataSchemaService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ import moment from "moment";
import { SchemaGenerationException } from "../../exceptions/SchemaGenerationException";

const DATE_FORMATS = [
"MM/DD/YYYY","DD/MM/YYYY", "YYYY-MM-DD", "YYYY-DD-MM", "YYYY/MM/DD",
"MM/DD/YYYY", "DD/MM/YYYY", "YYYY-MM-DD", "YYYY-DD-MM", "YYYY/MM/DD",
"DD-MM-YYYY", "MM-DD-YYYY", "MM-DD-YYYY HH:mm:ss", "YYYY/MM/DD HH:mm:ss",
"YYYY-MM-DD HH:mm:ss", "YYYY-DD-MM HH:mm:ss", "DD/MM/YYYY HH:mm:ss",
"DD-MM-YYYY HH:mm:ss", "MM-DD-YYYY HH:mm:ss.SSS", "YYYY-MM-DD HH:mm:ss.SSS",
"YYYY-DD-MM HH:mm:ss.SSS", "YYYY/MM/DD HH:mm:ss.SSS", "DD/MM/YYYY HH:mm:ss.SSS",
"DD-MM-YYYY HH:mm:ss.SSS", "DD-MM-YYYYTHH:mm:ss.SSSZ", "YYYY-MM-DDTHH:mm:ss.SSSZ",
"YYYY-MM-DD HH:mm:ss", "YYYY-DD-MM HH:mm:ss", "DD/MM/YYYY HH:mm:ss",
"DD-MM-YYYY HH:mm:ss", "MM-DD-YYYY HH:mm:ss.SSS", "YYYY-MM-DD HH:mm:ss.SSS",
"YYYY-DD-MM HH:mm:ss.SSS", "YYYY/MM/DD HH:mm:ss.SSS", "DD/MM/YYYY HH:mm:ss.SSS",
"DD-MM-YYYY HH:mm:ss.SSS", "DD-MM-YYYYTHH:mm:ss.SSSZ", "YYYY-MM-DDTHH:mm:ss.SSSZ",
"YYYY-DD-MMTHH:mm:ss.SSSZ", "YYYY/MM/DDTHH:mm:ss.SSSZ", "DD/MM/YYYYTHH:mm:ss.SSSZ",
"YYYY-DD-MMTHH:mm:ss.SSS", "YYYY/MM/DDTHH:mm:ss.SSS", "DD/MM/YYYYTHH:mm:ss.SSS",
"MM-DD-YYYYTHH:mm:ss.SSSZ", "DD-MM-YYYYTHH:mm:ssZ", "YYYY-MM-DDTHH:mm:ssZ",
"YYYY-DD-MMTHH:mm:ssZ", "YYYY/MM/DDTHH:mm:ssZ", "DD/MM/YYYYTHH:mm:ssZ", "MM-DD-YYYYTHH:mm:ssZ",
"MM-DD-YYYYTHH:mm:ss", "DD-MM-YYYYTHH:mm:ss", "YYYY-MM-DDTHH:mm:ss", "YYYY-DD-MMTHH:mm:ss",
"YYYY-DD-MMTHH:mm:ssZ", "YYYY/MM/DDTHH:mm:ssZ", "DD/MM/YYYYTHH:mm:ssZ", "MM-DD-YYYYTHH:mm:ssZ",
"MM-DD-YYYYTHH:mm:ss", "DD-MM-YYYYTHH:mm:ss", "YYYY-MM-DDTHH:mm:ss", "YYYY-DD-MMTHH:mm:ss",
"YYYY/MM/DDTHH:mm:ss", "DD/MM/YYYYTHH:mm:ss", "DD-MM-YYYY HH:mm:ss.SSSZ", "YYYY-MM-DD HH:mm:ss.SSSZ",
"YYYY-DD-MM HH:mm:ss.SSSZ", "YYYY/MM/DD HH:mm:ss.SSSZ", "DD/MM/YYYY HH:mm:ss.SSSZ",
"MM-DD-YYYY HH:mm:ss.SSSZ", "DD-MM-YYYY HH:mm:ssZ", "YYYY-MM-DD HH:mm:ssZ", "YYYY-DD-MM HH:mm:ssZ",
Expand All @@ -38,23 +38,53 @@ const DATE_FORMATS = [
export class SchemaInference {

/**
 * Infers one JSON schema per sample event after stripping empty objects/arrays from it.
 *
 * Each element of `sample` is first passed through `removeEmpty` (which mutates the
 * element in place), then through the imported `inferSchema(...).toJSONSchema(...)`,
 * and finally through `validateEpoch` using the "properties" path.
 *
 * @param sample - array of sample events
 * @returns `schema`: the inferred schemas, in the same order as `sample`;
 *          `removedKeys`: every `{ key, value }` entry reported by `removeEmpty`, across all events
 * @throws SchemaGenerationException (BAD_REQUEST) when `sample` is not an array
 */
public inferSchema(sample: any) {
    if (!Array.isArray(sample)) {
        throw new SchemaGenerationException("Invalid input: sample must be an array.", httpStatus.BAD_REQUEST);
    }
    // Collects the removed keys of every event so the caller can report them together.
    const removedAllKeys: any[] = []
    const schema = _.map(sample, (value): any => {
        const { cleanedData, removedKeys } = this.removeEmpty(value)
        removedAllKeys.push(...removedKeys)
        // Schema inference and epoch validation both run on the cleaned event,
        // so keys that were stripped never appear in the resulting schema.
        const eventSchema = inferSchema(cleanedData).toJSONSchema({ includeSchema: true })
        return this.validateEpoch(eventSchema, cleanedData, "properties")
    })
    return { schema, removedKeys: removedAllKeys }
}

/**
 * Infers schemas for batched samples: every element of `sample` is expected to hold
 * an array of events under `extractionKey`.
 *
 * For each batch the events are read with `_.get(batch, extractionKey)` and handed to
 * `inferSchema`; the per-batch schemas are flattened into a single list.
 *
 * @param sample - array of batch objects
 * @param extractionKey - lodash path of the event array inside each batch
 * @returns `schema`: flattened list of inferred schemas;
 *          `removedKeys`: all removed-key entries reported by `inferSchema` for every batch
 * @throws SchemaGenerationException (BAD_REQUEST) when `sample` is not an array, when
 *         `extractionKey` is falsy, or when a batch has no truthy value at `extractionKey`
 */
public inferBatchSchema(sample: Map<string, any>[], extractionKey: string) {
    if (!Array.isArray(sample)) {
        throw new SchemaGenerationException("Invalid input: sample must be an array.", httpStatus.BAD_REQUEST);
    }
    const removedAllKeys: any[] = []
    const schema = _.flatMap(sample, (value) => {
        // Checked per batch (not hoisted) so an empty `sample` still yields an empty result
        // instead of an error when no extraction key was supplied.
        if (!extractionKey) {
            throw new SchemaGenerationException("Extraction key not found.", httpStatus.BAD_REQUEST);
        }
        const extracted = _.get(value, extractionKey);
        if (!extracted) {
            throw new SchemaGenerationException("Unable to extract the batch data.", httpStatus.BAD_REQUEST);
        }
        // Return only the schema list to flatMap; the removed keys are accumulated separately.
        const { schema: batchSchema, removedKeys } = this.inferSchema(extracted);
        removedAllKeys.push(...removedKeys)
        return batchSchema
    })
    return { schema, removedKeys: removedAllKeys }
}
/**
 * Recursively removes empty objects and empty arrays from `data`, at every nesting level.
 *
 * `data` is mutated in place and returned as `cleanedData`. Children are cleaned before
 * their parent is tested, so a container that holds only empty containers is removed too.
 * Primitive and `null` values are never removed.
 *
 * @param data - value to clean; anything that is not a non-null object is returned untouched
 * @param parentKey - dot-separated path of `data` inside the root value ("" for the root)
 * @param removedKeys - accumulator shared by the recursive calls; one `{ key, value }`
 *                      entry is appended per removed container, `key` being its full path
 * @returns the (mutated) input as `cleanedData` together with the accumulator
 */
private removeEmpty(data: any, parentKey = "", removedKeys: any[] = []) {
    // Object.keys(null | undefined) throws; a non-object has nothing to clean anyway.
    if (typeof data !== "object" || data === null) {
        return { cleanedData: data, removedKeys };
    }
    const isArray = Array.isArray(data);
    // Indexes of array elements to drop; collected first so iteration is not disturbed.
    const emptyIndexes: number[] = [];
    Object.keys(data).forEach((key) => {
        const value = data[key];
        if (typeof value !== "object" || value === null) {
            return;
        }
        const fullKey = parentKey ? `${parentKey}.${key}` : key;
        // Clean the child first so that it can become empty as a result.
        this.removeEmpty(value, fullKey, removedKeys);
        if (!_.isEmpty(value)) {
            return;
        }
        const index = Number(key);
        if (isArray && Number.isInteger(index) && String(index) === key) {
            // `delete` on an array element would leave a hole and keep `length` unchanged,
            // so the array would never be seen as empty by its parent.
            emptyIndexes.push(index);
        } else {
            delete data[key];
        }
        removedKeys.push({ "key": fullKey, value });
    });
    // Object.keys yields array indexes in ascending order; splice from the end so the
    // remaining collected indexes stay valid.
    for (let i = emptyIndexes.length - 1; i >= 0; i--) {
        data.splice(emptyIndexes[i], 1);
    }
    return { cleanedData: data, removedKeys };
}

private validateEpoch(schema: any, sample: any, path: any) {
Expand All @@ -67,7 +97,7 @@ export class SchemaInference {
if (isValidTimestamp) {
_.set(schema, `${path}.${key}.format`, type)
}
else if(format && ["date-time", "time", "date"].includes(format) && !isValidTimestamp) {
else if (format && ["date-time", "time", "date"].includes(format) && !isValidTimestamp) {
_.unset(schema, `${path}.${key}.format`)
}
});
Expand All @@ -79,7 +109,7 @@ export class SchemaInference {
const epochRegex = /^\d+$/ig;
switch (dataType) {
case "string":
if(epochRegex.test(value)){
if (epochRegex.test(value)) {
const parsedValue = parseInt(value, 10);
// Timestamp should be greater than Jan 01 2000 00:00:00 UTC/GMT in seconds
return {
Expand All @@ -93,12 +123,12 @@ export class SchemaInference {
case "number":
// Timestamp should be greater than Jan 01 2000 00:00:00 UTC/GMT in seconds
return {
isValidTimestamp: value >= 946684800 && moment(value).isValid(),
isValidTimestamp: value >= 946684800 && moment(value).isValid(),
type: "epoch"
};
default:
return {
isValidTimestamp: false,
isValidTimestamp: false,
type: ""
};
}
Expand Down

0 comments on commit 5048efe

Please sign in to comment.