-
Notifications
You must be signed in to change notification settings - Fork 0
/
sparkData.py
33 lines (24 loc) · 1.07 KB
/
sparkData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import sys
from os.path import splitext
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
def convert_csv_to_json(input_csv, output_json, separator) -> None:
    """Read a delimited text file with Spark and write its rows out as JSON.

    Args:
        input_csv: Path of the CSV input. Its first row is treated as the
            header, and column types are inferred by Spark.
        output_json: Destination path handed to Spark's JSON writer. Any
            existing output at that path is overwritten.
        separator: Field delimiter used when parsing the CSV.

    Any exception raised by Spark while reading or writing propagates to the
    caller; the session is stopped in either case.
    """
    spark = SparkSession.builder.appName("CSVtoJSON").getOrCreate()
    try:
        # header=True: column names come from the first row.
        # inferSchema=True: let Spark infer column types from the data.
        df = spark.read.csv(input_csv, header=True, inferSchema=True, sep=separator)

        # "overwrite" mode replaces whatever already exists at output_json.
        df.write.mode("overwrite").json(output_json)
    finally:
        # Stop the session even when the read or write fails, so a failed
        # conversion does not leave the Spark session running.
        spark.stop()
if __name__ == "__main__":
    # Command line: <input.csv> [separator]
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python script.py input.csv [separator]")
        sys.exit(1)

    input_csv = cli_args[0]

    # Output path is the input path with its extension replaced by ".json".
    stem, _ext = splitext(input_csv)
    output_json = stem + ".json"

    # The separator is optional and falls back to a comma.
    if len(cli_args) > 1:
        separator = cli_args[1]
    else:
        separator = ","

    convert_csv_to_json(input_csv, output_json, separator)