Commit 64251f0

Merge branch 'main' into release-0.3.0
2 parents: 6f7ae86 + 2a44cbf

File tree

4 files changed: +14 -6 lines


Architecture-Diagram.jpg

39 KB

Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,7 @@ ARG HADOOP_VERSION=3.2.4
 ARG AWS_SDK_VERSION=1.11.901
 ARG PYSPARK_VERSION=3.3.0
 
-#FRAMEWORK will passed during the Docker build
+#FRAMEWORK will be passed during the Docker build. For Apache Iceberg, downgrading PYSPARK_VERSION to 3.2.0 may be needed in some cases.
 ARG FRAMEWORK
 ARG DELTA_FRAMEWORK_VERSION=2.2.0
 ARG HUDI_FRAMEWORK_VERSION=0.12.2
@@ -23,8 +23,10 @@ RUN yum update -y && \
     yum -y install yum-plugin-versionlock && \
     yum -y versionlock add java-1.8.0-openjdk-1.8.0.362.b08-0.amzn2.0.1.x86_64 && \
     yum -y install java-1.8.0-openjdk && \
+
     pip install --upgrade pip && \
     pip install pyspark==$PYSPARK_VERSION boto3==1.28.27 && \
+
     yum clean all
 
 # Install pydeequ if FRAMEWORK is DEEQU
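
For context, FRAMEWORK (and, per the updated comment, a downgraded PYSPARK_VERSION for Apache Iceberg) is supplied as a build argument at image build time. A minimal sketch of such a build command follows; the FRAMEWORK value and image tag are illustrative assumptions, not taken from this commit:

docker build \
    --build-arg FRAMEWORK=ICEBERG \
    --build-arg PYSPARK_VERSION=3.2.0 \
    -t spark-on-lambda-iceberg .   # tag name is an assumption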

README.md

Lines changed: 3 additions & 0 deletions
@@ -19,7 +19,9 @@ Once the container is deployed on AWS Lambda, it remains the same until the func
 
 The Spark logs will be part of the AWS Lambda logs stored in AWS Cloudwatch.
 
+
 ![Architecture](https://github.com/aws-samples/spark-on-aws-lambda/blob/release-0.3.0/images/SoAL-Architecture.jpg)
+
 ***
 
 ### Current Challenge
@@ -63,6 +65,7 @@ This script is invoked in AWS Lambda when an event is triggered. The script down
 
 Here is a summary of the main steps in the script:
 
+
 1. **Entry Point**: The `lambda_handler` function is the entry point for the Lambda function. It receives an event object and a context object as parameters.
 2. **S3 Script Location**: The `s3_bucket_script` and `input_script` variables are used to specify the Amazon S3 bucket and object key where the Spark script is located.
 3. **Download Script**: The `boto3` module is used to download the Spark script from Amazon S3 to a temporary file on the Lambda function's file system.
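
For readers following along, here is a minimal sketch of the handler flow those three steps describe. The event keys SCRIPT_BUCKET and SPARK_SCRIPT are illustrative assumptions; the real names live in sparkLambdaHandler.py, not here:

import logging
import boto3

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def lambda_handler(event, context):
    # 1. Entry point: Lambda invokes this function with the triggering event.
    s3_bucket_script = event['SCRIPT_BUCKET']   # assumed event key
    input_script = event['SPARK_SCRIPT']        # assumed event key

    # 2. + 3. Download the Spark script from S3 to Lambda's writable /tmp filesystem.
    local_path = '/tmp/spark_script.py'
    boto3.client('s3').download_file(s3_bucket_script, input_script, local_path)
    logger.info(f'Downloaded s3://{s3_bucket_script}/{input_script} to {local_path}')

    # Hand-off to spark-submit happens next:
    # spark_submit(s3_bucket_script, input_script, event)  # defined in sparkLambdaHandler.py (see diff below)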

sparkLambdaHandler.py

Lines changed: 8 additions & 5 deletions
@@ -34,14 +34,17 @@ def spark_submit(s3_bucket_script: str, input_script: str, event: dict) -> None:
     Submits a local Spark script using spark-submit.
     """
     # Set the environment variables for the Spark application
-    pyspark_submit_args = event.get('PYSPARK_SUBMIT_ARGS', '')
-    # Source input and output if available in event
-    input_path = event.get('INPUT_PATH', '')
-    output_path = event.get('OUTPUT_PATH', '')
+    # pyspark_submit_args = event.get('PYSPARK_SUBMIT_ARGS', '')
+    # # Source input and output if available in event
+    # input_path = event.get('INPUT_PATH', '')
+    # output_path = event.get('OUTPUT_PATH', '')
+
+    for key, value in event.items():
+        os.environ[key] = value
     # Run the spark-submit command on the local copy of the script
     try:
         logger.info(f'Spark-Submitting the Spark script {input_script} from {s3_bucket_script}')
-        subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True)
+        subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True, env=os.environ)
     except Exception as e:
         logger.error(f'Error Spark-Submit with exception: {e}')
         raise e
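
The effect of this change is that every key/value pair in the invocation event is exported into os.environ and inherited by the spark-submit child process, so the Spark script can read its configuration from environment variables rather than parsed arguments. A hedged sketch of both sides, reusing the INPUT_PATH/OUTPUT_PATH names from the commented-out code (note that os.environ requires string values, so this assumes a flat string-to-string event payload):

# Example invocation event (all values must be strings):
#   {"INPUT_PATH": "s3a://my-bucket/raw/", "OUTPUT_PATH": "s3a://my-bucket/curated/"}

# Inside the downloaded /tmp/spark_script.py, the same values can be read back:
import os

input_path = os.environ.get('INPUT_PATH', '')     # exported by spark_submit() above
output_path = os.environ.get('OUTPUT_PATH', '')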
