Try adding setup hadoop step to features
Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
ankatiyar committed Oct 26, 2023
1 parent f2a7d0a commit 1cd7ab9
Showing 5 changed files with 57 additions and 8 deletions.
18 changes: 10 additions & 8 deletions .github/workflows/all-checks.yml
@@ -38,16 +38,18 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install test requirements
run: make install-test-requirements
- uses: actions/setup-java@v1
with:
java-version: '11'
- uses: vemonet/setup-spark@v1
with:
spark-version: '3.4.1'
hadoop-version: '3'
# - name: Setup Java
# uses: actions/setup-java@v1
# with:
# java-version: '11'
# - name: Setup spark
# uses: vemonet/setup-spark@v1
# with:
# spark-version: '3.4.1'
# hadoop-version: '3'
- name: Run `kedro run` end to end tests for all starters
run: |
behave features/run.feature
behave features/run.feature --tags=pyspark
lint:
strategy:
Empty file added .github/workflows/run-test.yml
Empty file.
6 changes: 6 additions & 0 deletions features/run.feature
@@ -23,10 +23,12 @@ Feature: Run all starters
When I execute the CLI command to list Kedro pipelines
Then I should get a successful exit code

@pyspark
Scenario: Run a Kedro project created from pyspark-iris
Given I have prepared a config file
And I have run a non-interactive kedro new with the starter pyspark-iris
And I have installed the Kedro project's dependencies
And I have setup hadoop binary
When I run the Kedro pipeline
Then I should get a successful exit code

@@ -44,16 +46,20 @@
When I run the Kedro pipeline
Then I should get a successful exit code

@pyspark
Scenario: Run a Kedro project created from spaceflights-pyspark
Given I have prepared a config file
And I have run a non-interactive kedro new with the starter spaceflights-pyspark
And I have installed the Kedro project's dependencies
And I have setup hadoop binary
When I run the Kedro pipeline
Then I should get a successful exit code

@pyspark
Scenario: Run a Kedro project created from spaceflights-pyspark-viz
Given I have prepared a config file
And I have run a non-interactive kedro new with the starter spaceflights-pyspark-viz
And I have installed the Kedro project's dependencies
And I have setup hadoop binary
When I run the Kedro pipeline
Then I should get a successful exit code
38 changes: 38 additions & 0 deletions features/steps/run_steps.py
@@ -1,6 +1,7 @@
import subprocess

import yaml
import os
import platform
import requests
from behave import given, then, when

OK_EXIT_CODE = 0
@@ -31,6 +32,7 @@ def create_configuration_file(context):
@given("I have run a non-interactive kedro new with the starter {starter_name}")
def create_project_from_config_file(context, starter_name):
"""Behave step to run Kedro new given the config I previously created."""
res = subprocess.run(
[
context.kedro,
@@ -41,6 +43,7 @@ def create_project_from_config_file(context, starter_name):
context.starters_paths[starter_name],
]
)

assert res.returncode == OK_EXIT_CODE
# prevent telemetry from prompting for input during e2e tests
telemetry_file = context.root_project_dir / ".telemetry"
@@ -55,6 +58,41 @@ def install_project_dependencies(context):
)
assert res.returncode == OK_EXIT_CODE

@given("I have setup hadoop binary")
def setup_hadoop(context):
if platform.system() != 'Windows':
return
# Define the URLs of the files to download
winutils_url = "https://github.com/steveloughran/winutils/raw/master/hadoop-2.7.1/bin/winutils.exe"
hadoop_dll_url = "https://github.com/steveloughran/winutils/raw/master/hadoop-2.7.1/bin/hadoop.dll"

# Specify the local file paths
winutils_local_path = "winutils.exe"
hadoop_dll_local_path = "hadoop.dll"
hadoop_bin_dir = "C:\\hadoop\\bin"

# Download winutils.exe and hadoop.dll
response1 = requests.get(winutils_url)
with open(winutils_local_path, "wb") as file1:
file1.write(response1.content)

response2 = requests.get(hadoop_dll_url)
with open(hadoop_dll_local_path, "wb") as file2:
file2.write(response2.content)

# Move hadoop.dll to C:\Windows\System32
os.rename(hadoop_dll_local_path, os.path.join("C:\\Windows\\System32", os.path.basename(hadoop_dll_local_path)))

# Create C:\hadoop\bin directory
if not os.path.exists(hadoop_bin_dir):
os.makedirs(hadoop_bin_dir)

# Move winutils.exe to C:\hadoop\bin
os.rename(winutils_local_path, os.path.join(hadoop_bin_dir, os.path.basename(winutils_local_path)))

# Set the HADOOP_HOME environment variable; winutils expects HADOOP_HOME
# to point at the parent of the bin directory (C:\hadoop, not C:\hadoop\bin).
# setx only affects future processes, so also set it for the current one.
hadoop_home = os.path.dirname(hadoop_bin_dir)
os.environ["HADOOP_HOME"] = hadoop_home
os.system(f"setx /M HADOOP_HOME {hadoop_home}")


@when("I run the Kedro pipeline")
def run_kedro_pipeline(context):
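The `setup_hadoop` step above downloads the winutils binaries without checking the HTTP status, and `os.rename` can fail when source and destination are on different drives. A more defensive variant might look like this — a sketch, not the committed code; `download_file` and `install_hadoop_binaries` are hypothetical names, and the winutils URLs are the same ones the step uses:

```python
import os
import platform
import shutil

import requests

# Same steveloughran/winutils release used by the setup_hadoop step.
WINUTILS_BASE = (
    "https://github.com/steveloughran/winutils/raw/master/hadoop-2.7.1/bin"
)


def download_file(url: str, dest: str) -> str:
    """Download url to dest, raising on HTTP errors instead of
    silently writing an error page to disk."""
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(dest, "wb") as f:
        f.write(response.content)
    return dest


def install_hadoop_binaries(hadoop_home: str = "C:\\hadoop") -> None:
    """Place winutils.exe and hadoop.dll where Spark expects them.

    No-op on anything other than Windows.
    """
    if platform.system() != "Windows":
        return
    bin_dir = os.path.join(hadoop_home, "bin")
    os.makedirs(bin_dir, exist_ok=True)
    # shutil.move works across drives, unlike os.rename.
    shutil.move(
        download_file(f"{WINUTILS_BASE}/winutils.exe", "winutils.exe"),
        os.path.join(bin_dir, "winutils.exe"),
    )
    shutil.move(
        download_file(f"{WINUTILS_BASE}/hadoop.dll", "hadoop.dll"),
        os.path.join("C:\\Windows\\System32", "hadoop.dll"),
    )
    # HADOOP_HOME must be the parent of the bin directory.
    os.environ["HADOOP_HOME"] = hadoop_home
```

`raise_for_status()` turns a 404 into a visible test failure rather than a corrupt `winutils.exe`, which matters here because the step only runs on Windows CI runners where debugging is slow.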
3 changes: 3 additions & 0 deletions test_requirements.txt
@@ -4,3 +4,6 @@ black~=22.0
PyYAML>=4.2, <7.0
ruff~=0.0.290
git+https://github.com/kedro-org/kedro.git@develop#egg=kedro
requests
