Commit 5a708b9

Merge pull request #9 from aws-samples/develop

Added enhancements related to Apache Hudi and a PySpark script loaded from S3

2 parents: cd55098 + f6260d8

File tree

5 files changed: +94 −4 lines

Dockerfile

Lines changed: 7 additions & 3 deletions

@@ -6,6 +6,7 @@ ARG HADOOP_VERSION=3.2.4
 ARG AWS_SDK_VERSION=1.11.901
 ARG PYSPARK_VERSION=3.3.0
 ARG SOURCE_REGION_NAME='us-east-1'
+ARG HUDI_VERSION=0.12.2


 # yum updates, security updates for zlib, java installation and pyspark installation
@@ -15,9 +16,11 @@ RUN yum update -y && \
     yum -y install yum-plugin-versionlock && \
     yum -y versionlock add java-1.8.0-openjdk-1.8.0.352.b08-0.amzn2.0.1.x86_64 && \
     yum -y install java-1.8.0-openjdk && \
+    pip install --upgrade pip && \
     pip install pyspark==$PYSPARK_VERSION && \
     yum clean all

+
 # setting the environment variable and Spark path
 ENV SPARK_HOME="/var/lang/lib/python3.8/site-packages/pyspark"
 ENV PATH=$PATH:$SPARK_HOME/bin
@@ -34,7 +37,8 @@ ENV PATH=$SPARK_HOME/python:$PATH

 RUN mkdir $SPARK_HOME/conf && \
     echo "SPARK_LOCAL_IP=127.0.0.1" > $SPARK_HOME/conf/spark-env.sh && \
-    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ && \
+    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ && \
+    wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark3.3-bundle_2.12/${HUDI_VERSION}/hudi-spark3.3-bundle_2.12-${HUDI_VERSION}.jar -P ${SPARK_HOME}/jars/ && \
     wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -P ${SPARK_HOME}/jars/

 # JAVA_HOME depends upon the java version used
@@ -56,7 +60,7 @@ RUN chmod -R 755 $SPARK_HOME

 # Copy the Pyspark script to container

-COPY sparkOnAWSLambda.py ${LAMBDA_TASK_ROOT}
+COPY sparkLambdaHandler.py ${LAMBDA_TASK_ROOT}

 # calling the Lambda handler
-CMD [ "/var/task/sparkOnAWSLambda.lambda_handler" ]
+CMD [ "/var/task/sparkLambdaHandler.lambda_handler" ]

images/Github-diagram.jpg

39 KB (binary image)

sparkOnAWSLambda.py renamed to spark-scripts/sparkOnAWSLambda.py

Lines changed: 2 additions & 1 deletion
@@ -42,4 +42,5 @@ def lambda_handler(event, context):


     print("Started Writing the CSV file to Target S3 location ", target_path)
-    df.write.format("csv").save(target_path)
+    #df.write.format("csv").save(target_path)
+    df.write.format("hudi").save(target_path)

spark-scripts/spark_script_hudi.py

Lines changed: 69 additions & 0 deletions
New file:

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp
import sys
import os

def spark_script():
    print("start...................")

    input_path = os.environ['input_path']
    target_path = os.environ['output_path']
    s3_bucket = os.environ['s3_bucket']

    aws_region = os.environ['REGION']
    aws_access_key_id = os.environ['ACCESS_KEY_ID']
    aws_secret_access_key = os.environ['SECRET_ACCESS_KEY']
    session_token = os.environ['SESSION_TOKEN']

    input_path = "s3a://" + s3_bucket + "/" + input_path
    target_path = "s3a://" + s3_bucket + "/" + target_path

    print(" ******* Input path ", input_path)
    print(" ******* Target path ", target_path)

    spark = SparkSession.builder \
        .appName("Spark-on-AWS-Lambda") \
        .master("local[*]") \
        .config("spark.driver.bindAddress", "127.0.0.1") \
        .config("spark.driver.memory", "5g") \
        .config("spark.executor.memory", "5g") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.sql.hive.convertMetastoreParquet", "false") \
        .config("spark.hadoop.hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \
        .config("hoodie.meta.sync.client.tool.class", "org.apache.hudi.aws.sync.AwsGlueCatalogSyncTool") \
        .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id) \
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key) \
        .config("spark.hadoop.fs.s3a.session.token", session_token) \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider") \
        .enableHiveSupport().getOrCreate()

    print("Started Reading the CSV file from S3 location ", input_path)

    df = spark.read.option('header', 'true').csv(input_path)
    df = df.withColumn("last_upd_timestamp", current_timestamp())
    df.show()

    hudi_options = {
        'hoodie.table.name': 'customer_table',
        'hoodie.datasource.write.recordkey.field': 'Customer_ID',
        'hoodie.datasource.write.precombine.field': 'last_upd_timestamp',
        'hoodie.insert.shuffle.parallelism': 2,
        "hoodie.datasource.hive_sync.enable": "false",
        "hoodie.datasource.hive_sync.database": "default",
        "hoodie.datasource.hive_sync.table": "customer_table",
        "hoodie.datasource.hive_sync.use_jdbc": "false",
        "hoodie.datasource.hive_sync.mode": "hms",
        "hoodie.write.markers.type": "direct",    # Not advisable; a workaround that avoids this config is in progress.
        "hoodie.embed.timeline.server": "false"   # Not advisable; a workaround that avoids this config is in progress.
    }

    print("Started Writing the CSV file to Target hudi table ", target_path)
    df.write.format("hudi").options(**hudi_options).mode("overwrite").save(target_path)
    # df.write.format("csv").save(target_path)

if __name__ == '__main__':
    spark_script()
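
Because the script reads everything from environment variables, a local dry run only needs those set before calling spark_script(). A sketch with placeholder values; in the Lambda setup the temporary credentials would come from the execution role rather than being set by hand:

# Hypothetical local driver for spark-scripts/spark_script_hudi.py; all values are placeholders.
import os

os.environ["s3_bucket"] = "my-example-bucket"
os.environ["input_path"] = "input/customers.csv"
os.environ["output_path"] = "output/customer_table"
os.environ["REGION"] = "us-east-1"
os.environ["ACCESS_KEY_ID"] = "AKIA..."          # temporary credentials, e.g. from STS
os.environ["SECRET_ACCESS_KEY"] = "..."
os.environ["SESSION_TOKEN"] = "..."

from spark_script_hudi import spark_script
spark_script()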

sparkLambdaHandler.py

Lines changed: 16 additions & 0 deletions
New file:

import boto3
import sys
import os
import subprocess

def lambda_handler(event, context):
    print("start...................")
    s3_bucket_script = os.environ['SCRIPT_BUCKET']
    input_script = os.environ['SPARK_SCRIPT']
    # Fetch the PySpark script from S3 into Lambda's writable /tmp directory
    s3_client = boto3.client("s3")
    s3_client.download_file(s3_bucket_script, input_script, "/tmp/spark_script.py")
    # Set the environment variables for the Spark application
    os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local pyspark-shell"
    # Run the spark-submit command
    subprocess.run(["spark-submit", "/tmp/spark_script.py"])
