Commit a36e209: final changes
fanxu30 committed Dec 14, 2024
1 parent fc94288 commit a36e209
Showing 6 changed files with 77 additions and 43 deletions.
54 changes: 53 additions & 1 deletion README.md
@@ -1 +1,53 @@
# Pyspark
# PySpark

[![CI](https://github.com/nogibjj/fan_xu_pyspark/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/fan_xu_pyspark/actions/workflows/cicd.yml)

This project demonstrates PySpark functionality on a dataset of NBA player statistics. The dataset is explored, queried with Spark SQL, and transformed, and the output is written to a markdown file.

## Requirements

- Use PySpark to perform data processing on a large dataset

- Include at least one Spark SQL query and one data transformation

## Project Structure

```
📦 fan_xu_pyspark
├─ .github
│  └─ workflows
│     └─ cicd.yml
├─ Makefile
├─ NBA_24_stats.csv
├─ README.md
├─ __pycache__
│  └─ script.cpython-312.pyc
├─ .gitignore
├─ lib.py
├─ output.md
├─ requirements.txt
├─ script.py
└─ test_lib.py
```
©generated by [Project Tree Generator](https://woochanleee.github.io/project-tree-generator)

## Highlights

- **EDA**: The first 3 rows are displayed, along with summary statistics for the age, assists, and steals columns.
- **Query**: The top 10 highest-scoring players are retrieved with Spark SQL.
- **Transformation**: A column is added showing each player's assist/turnover ratio (all three steps are sketched below).
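
A minimal, self-contained sketch of these highlights, assuming Basketball-Reference-style column names (`Age`, `AST`, `STL`, `TOV`); the project's actual implementation lives in `script.py` and `lib.py`:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("nba_pyspark_sketch").getOrCreate()
df = spark.read.csv("NBA_24_stats.csv", header=True, inferSchema=True)

# EDA: first 3 rows plus summary statistics for selected columns
df.show(3)
df.select("Age", "AST", "STL").describe().show()

# Spark SQL query: top 10 highest-scoring players (mirrors script.py)
df.createOrReplaceTempView("nba_players")
spark.sql(
    """
    SELECT Player, Team, Pos, PTS
    FROM nba_players
    WHERE PTS > 25
    ORDER BY PTS DESC
    LIMIT 10
    """
).show()

# Transformation: add an assist/turnover ratio column
df.withColumn("AST/TOV", F.col("AST") / F.col("TOV")).show(10)
```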

## Installation

Requirements:
- Python
- PySpark
- Java
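
A typical setup, assuming the pinned dependencies (including `pyspark`) live in `requirements.txt` and a JDK is already installed:

```
pip install -r requirements.txt
```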
Binary file modified __pycache__/script.cpython-312.pyc
Binary file not shown.
8 changes: 3 additions & 5 deletions lib.py
@@ -5,8 +5,6 @@
def save_to_markdown(filename, content):
"""
Save the given content to a markdown file.
:param filename: Name of the markdown file.
:param content: Content to write into the file.
"""
with open(filename, "w") as f:
f.write(content)
@@ -39,14 +37,14 @@ def main():
process_output = process_data(spark, df)
markdown_content += "## SQL Queries\n\n"
markdown_content += (
"### Query 1: Top 10 High-Scoring Games\n\n" + process_output["query"] + "\n\n"
"### Top 10 Highest Scoring Players\n\n" + process_output["query"] + "\n\n"
)

# Transformation 1
winner_output = transformation(df)
markdown_content += "## Data Transformation 1\n\n"
markdown_content += "## Data Transformation\n\n"
markdown_content += (
"### Creating a winner column for each game and calculate the point difference (1st 10 rows)\n\n"
"### Creating a column to calculate Assist/Turnover Ratio (1st 10 rows)\n\n"
+ winner_output
+ "\n\n"
)
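
For context, a hedged usage sketch of the `save_to_markdown` helper shown above; the report string here is illustrative, while `main()` assembles the real content from the EDA, query, and transformation steps:

```python
from lib import save_to_markdown

# Illustrative report body; lib.main() builds the real one
report = "# NBA Report\n\nData loaded successfully.\n"
save_to_markdown("output.md", report)
```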
36 changes: 18 additions & 18 deletions output.md
@@ -28,24 +28,24 @@ Data loaded successfully.

## SQL Queries

### Query 1: Top 10 High-Scoring Games

| Player | Team | Pos |
|:------------------------|:-------|:------|
| De'Aaron Fox | SAC | PG |
| Jayson Tatum | BOS | PF |
| Kevin Durant | PHO | PF |
| Devin Booker | PHO | PG |
| Jalen Brunson | NYK | PG |
| Shai Gilgeous-Alexander | OKC | PG |
| Giannis Antetokounmpo | MIL | PF |
| Luka Dončić | DAL | PG |
| Ja Morant | MEM | PG |
| Kyrie Irving | DAL | SG |

## Data Transformation 1

### Creating a winner column for each game and calculate the point difference (1st 10 rows)
### Top 10 Highest Scoring Players

| Player | Team | Pos | PTS |
|:------------------------|:-------|:------|------:|
| Joel Embiid | PHI | C | 34.7 |
| Luka Dončić | DAL | PG | 33.9 |
| Giannis Antetokounmpo | MIL | PF | 30.4 |
| Shai Gilgeous-Alexander | OKC | PG | 30.1 |
| Jalen Brunson | NYK | PG | 28.7 |
| Devin Booker | PHO | PG | 27.1 |
| Kevin Durant | PHO | PF | 27.1 |
| Jayson Tatum | BOS | PF | 26.9 |
| De'Aaron Fox | SAC | PG | 26.6 |
| Donovan Mitchell | CLE | SG | 26.6 |

## Data Transformation

### Creating a column to calculate Assist/Turnover Ratio (1st 10 rows)

| Player | Team | Pos | AST/TOV |
|:------------------------|:-------|:------|----------:|
20 changes: 2 additions & 18 deletions script.py
@@ -3,22 +3,6 @@ def load_data(spark, local_path="NBA_24_stats.csv"):
Load data from a local CSV file into a PySpark DataFrame.
"""
df = spark.read.csv(local_path, header=True, inferSchema=True)

# # Clean column names
# for column_name in df.columns:
# new_col_name = re.sub(r"[^a-zA-Z0-9]", "", column_name)
# df = df.withColumnRenamed(column_name, new_col_name)

# # Remove duplicate columns
# column_set = set()
# duplicate_columns = [
# col for col in df.columns if col in column_set or column_set.add(col)
# ]
# if duplicate_columns:
# print(f"Duplicate Columns Detected: {duplicate_columns}")
# for col in duplicate_columns:
# df = df.drop(col)

return df


@@ -56,10 +40,10 @@ def process_data(spark, df):
# Query 1
query1_df = spark.sql(
"""
SELECT Player, Team, Pos
SELECT Player, Team, Pos, PTS
FROM nba_players
WHERE PTS > 25
ORDER BY Rk DESC
ORDER BY PTS DESC
LIMIT 10
"""
)
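
For this query to run, `df` must already be registered as a temporary view named `nba_players` earlier in `process_data` (that part of the file is outside this diff). A sketch of the assumed registration:

```python
# Assumed earlier in process_data: expose the DataFrame to Spark SQL
df.createOrReplaceTempView("nba_players")
```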
2 changes: 1 addition & 1 deletion test_lib.py
@@ -35,7 +35,7 @@ def test_explore_data(sample_data):
output = explore_data(sample_data)
assert "first_rows" in output
assert "row_count" in output
assert output["row_count"] == 3 # Check sample data row count
assert output["row_count"] == 3
assert "summary_stats" in output


