|
24 | 24 | Add the following parameters to the Lambda function's environment variables (a boto3 configuration sketch follows the memory setting below):
25 | 25 | SCRIPT_BUCKET THE S3 BUCKET WHERE YOU SAVED THIS SCRIPT
26 | 26 | SPARK_SCRIPT THE SCRIPT'S PATH AND FILE NAME WITHIN THAT BUCKET
27 | | - input_path s3a://redshift-downloads/spatial-data/accommodations.csv |
28 | | - output_path THE PATH WHERE THE VERIFICATION RESULTS AND METRICS WILL BE STORED |
| 27 | + INPUT_PATH s3a://redshift-downloads/spatial-data/accommodations.csv |
| 28 | + OUTPUT_PATH THE PATH WHERE THE VERIFICATION RESULTS AND METRICS WILL BE STORED |
29 | 29 |
|
30 | 30 | Lambda general configuration for the above input file. Increase the memory for larger input files.
31 | 31 | Memory 2048 MB |
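
If you prefer to apply this configuration programmatically, the sketch below sets the environment variables and memory in one call. It is a minimal sketch, assuming a hypothetical function name (`spark-dq-lambda`) and placeholder bucket and output values; only the `INPUT_PATH` sample file comes from this page.

```python
# Minimal sketch: apply the environment variables and memory size above
# with boto3. The function name and bucket/path values are placeholders.
import boto3

lambda_client = boto3.client('lambda')

lambda_client.update_function_configuration(
    FunctionName='spark-dq-lambda',  # hypothetical function name
    MemorySize=2048,                 # raise this for larger input files
    Environment={
        'Variables': {
            'SCRIPT_BUCKET': 'my-script-bucket',    # bucket where you saved the script
            'SPARK_SCRIPT': 'scripts/spark-dq.py',  # script path and file name
            'INPUT_PATH': 's3a://redshift-downloads/spatial-data/accommodations.csv',
            'OUTPUT_PATH': 's3a://my-output-bucket/dq-results',  # placeholder output prefix
        }
    },
)
```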
|
42 | 42 | print("Usage: spark-dq [input-folder] [output-folder]") |
43 | 43 | sys.exit(0) |
44 | 44 |
|
45 | | - input_path = os.environ['input_path'] |
46 | | - output_path = os.environ['output_path'] |
| 45 | + input_path = os.environ['INPUT_PATH'] |
| 46 | + output_path = os.environ['OUTPUT_PATH'] |
47 | 47 |
|
48 | 48 |
|
49 | 49 | aws_region = os.environ['AWS_REGION'] |
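
The bare `os.environ[...]` lookups above raise a `KeyError` if a variable is missing from the Lambda configuration. A small defensive variant (an optional sketch, not part of this patch; `AWS_REGION` is set automatically by the Lambda runtime):

```python
# Optional sketch: fail with a clear message when a variable is missing.
import os
import sys

def require_env(name: str) -> str:
    value = os.environ.get(name)
    if not value:
        sys.exit(f"Missing required environment variable: {name}")
    return value

input_path = require_env('INPUT_PATH')
output_path = require_env('OUTPUT_PATH')
aws_region = os.environ['AWS_REGION']  # provided automatically by the Lambda runtime
```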
|
108 | 108 | checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult) |
109 | 109 | checkResult_df.show() |
110 | 110 |
|
111 | | - checkResult_df.repartition(1).write.csv(output_path+"/"+str(uuid.uuid4())+"/", sep=',') |
| 111 | + checkResult_df.repartition(1).write.mode('overwrite').csv(output_path+"/verification-results/", sep=',') |
112 | 112 |
|
113 | 113 | print("Showing VerificationResults metrics:") |
114 | 114 | checkResult_df = VerificationResult.successMetricsAsDataFrame(spark, checkResult) |
115 | 115 | checkResult_df.show() |
116 | 116 |
|
117 | | - checkResult_df.repartition(1).write.csv(output_path+"/"+str(uuid.uuid4())+"/", sep=',') |
| 117 | + checkResult_df.repartition(1).write.mode('overwrite').csv(output_path+"/verification-results-metrics/", sep=',') |
118 | 118 |
|
119 | 119 | spark.sparkContext._gateway.shutdown_callback_server() |
120 | 120 | spark.stop() |
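
The patch swaps the per-run `uuid.uuid4()` folders for two fixed prefixes written in overwrite mode, so each run replaces the previous output at a stable location instead of accumulating UUID-named folders a consumer would have to discover. A minimal sketch of reading the results back, assuming the same `OUTPUT_PATH` and a placeholder default:

```python
# Minimal sketch: read the two fixed result prefixes back into DataFrames.
import os
from pyspark.sql import SparkSession

# Placeholder default; in the Lambda this comes from OUTPUT_PATH.
output_path = os.environ.get('OUTPUT_PATH', 's3a://my-output-bucket/dq-results')

spark = SparkSession.builder.getOrCreate()
results_df = spark.read.csv(output_path + "/verification-results/", sep=',')
metrics_df = spark.read.csv(output_path + "/verification-results-metrics/", sep=',')
results_df.show()
metrics_df.show()
```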