!pip install pyspark
+Collecting pyspark + Downloading pyspark-3.5.3.tar.gz (317.3 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.1 MB/s eta 0:00:00 + Preparing metadata (setup.py) ... done +Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7) +Building wheels for collected packages: pyspark + Building wheel for pyspark (setup.py) ... done + Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=c95ba513b8ea87737447a56790e3342645a001f55d94dc06cf038953cf54d9d5 + Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab +Successfully built pyspark +Installing collected packages: pyspark +Successfully installed pyspark-3.5.3 ++
!pip install findspark
+Collecting findspark + Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes) +Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB) +Installing collected packages: findspark +Successfully installed findspark-2.0.1 ++
!pip install pyarrow
+Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (16.1.0) +Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4) ++
import findspark

# findspark.init() is called before pyspark is imported so it can put Spark
# on sys.path if it is not already importable.
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# NOTE: the wildcard import is kept because a later cell calls `trim`
# without importing it by name. Be aware that it shadows Python builtins
# such as `sum`, `min`, `max` and `round` with Spark column functions.
from pyspark.sql.functions import *
# Imported after the wildcard import (as in the original cell) so the name
# `time` is guaranteed to refer to the standard-library module.
import time

# Initialize SparkSession
spark = SparkSession.builder.appName("CSVReader").getOrCreate()

# Wall-clock time is used for the human-readable log lines; a monotonic
# counter is used for the duration so clock adjustments cannot skew it.
start_time = time.time()
start_counter = time.perf_counter()
print(f"Execution started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")

# Read the CSV data. inferSchema=True makes Spark take an extra pass over
# the file to guess the column types.
df = spark.read.csv("/content/drive/MyDrive/CSV/yelp_review.csv", header=True, inferSchema=True)

# Preview the DataFrame: show() prints only the first 20 rows and truncates
# long cell values, so this is safe for large datasets.
df.show()

# End time
end_time = time.time()
execution_time = time.perf_counter() - start_counter
print(f"Execution ended at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")

# The measured time covers the read (including schema inference) and show().
print(f"Execution time: {execution_time:.2f} seconds")
+Execution started at: 2024-10-11 23:54:45 ++--------------------+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+ +| _id| review_id| user_id| business_id|stars|useful|funny|cool| text| date| ++--------------------+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+ +|66ea4ea9e59c7c5b6...|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...| 3| 0| 0| 0|If you decide to ...|2018-07-07 22:09:11| +|66ea4ea9e59c7c5b6...|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...| 5| 1| 0| 1|I've taken a lot ...|2012-01-03 15:28:18| +|66ea4ea9e59c7c5b6...|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...| 3| 0| 0| 0|Family diner. Had...|2014-02-05 20:30:30| +|66ea4ea9e59c7c5b6...|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...| 5| 1| 0| 1|Wow! Yummy, diff...|2015-01-04 00:01:03| +|66ea4ea9e59c7c5b6...|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...| 4| 1| 0| 1|Cute interior and...|2017-01-14 20:54:15| +|66ea4ea9e59c7c5b6...|JrIxlS1TzJ-iCu79u...|eUta8W_HdHMXPzLBB...|04UD14gamNjLY0IDY...| 1| 1| 2| 1|I am a long term ...|2015-09-23 23:10:31| +|66ea4ea9e59c7c5b6...|6AxgBCNX_PNTOxmbR...|r3zeYsv1XFBRA4dJp...|gmjsEdUsKpj9Xxu6p...| 5| 0| 2| 0|Loved this tour! 
...|2015-01-03 23:21:18| +|66ea4ea9e59c7c5b6...|_ZeMknuYdlQcUqng_...|yfFzsLmaWF2d4Sr0U...|LHSTtnW3YHCeUkRDG...| 5| 2| 0| 0|Amazingly amazing...|2015-08-07 02:29:16| +|66ea4ea9e59c7c5b6...|ZKvDG2sBvHVdF5oBN...|wSTuiTk-sKNdcFypr...|B5XSoSG3SfvQGtKEG...| 3| 1| 1| 0|This easter inste...|2016-03-30 22:46:33| +|66ea4ea9e59c7c5b6...|pUycOfUwM8vqX7KjR...|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...| 3| 0| 0| 0|Had a party of 6 ...|2016-07-25 07:31:06| +|66ea4ea9e59c7c5b6...|rGQRf8UafX7OTlMNN...|1WHRWwQmZOZDAhp2Q...|uMvVYRgGNXf5boolA...| 5| 2| 0| 0|My experience wit...|2015-06-21 14:48:06| +|66ea4ea9e59c7c5b6...|l3Wk_mvAog6XANIuG...|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...| 4| 0| 0| 0|Locals recommende...|2015-08-19 14:31:45| +|66ea4ea9e59c7c5b6...|XW_LfMv0fV21l9c6x...|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...| 4| 0| 0| 0|Love going here f...|2014-06-27 22:44:01| +|66ea4ea9e59c7c5b6...|8JFGBuHMoiNDyfcxu...|smOvOajNG0lS4Pq7d...|RZtGWDLCAtuipwaZ-...| 4| 0| 0| 0|Good food--loved ...|2009-10-14 19:57:14| +|66ea4ea9e59c7c5b6...|UBp0zWyH60Hmw6Fsa...|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...| 4| 0| 2| 0|The bun makes the...|2011-10-27 17:12:05| +|66ea4ea9e59c7c5b6...|OAhBYw8IQ6wlfw1ow...|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...| 5| 0| 0| 0|Great place for b...|2014-10-11 16:22:06| +|66ea4ea9e59c7c5b6...|oyaMhzBSwfGgemSGu...|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...| 5| 0| 0| 0|Tremendous servic...|2013-06-24 11:21:25| +|66ea4ea9e59c7c5b6...|LnGZB0fjfgeVDVz5I...|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...| 4| 1| 0| 0|The hubby and I h...|2014-08-10 19:41:43| +|66ea4ea9e59c7c5b6...|u2vzZaOqJ2feRshaa...|NDZvyYHTUWWu-kqgQ...|CLEWowfkj-wKYJlQD...| 5| 2| 0| 1|I go to blow bar ...|2016-03-07 00:02:18| +|66ea4ea9e59c7c5b6...|Xs8Z8lmKkosqW5mw_...|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...| 5| 0| 0| 0|My absolute favor...|2014-11-12 15:30:27| ++--------------------+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+ 
+only showing top 20 rows + +Execution ended at: 2024-10-11 23:56:18 +Execution time: 93.61 seconds ++
# Total number of records in the DataFrame.
# count() is an action, so this line triggers a Spark job over the CSV source.
total_records = df.count()
print(f"Total records in the dataframe: {total_records}")
+Total records in the dataframe: 6990280 ++
# Drop the `_id` column. The result is bound to a new name, so `df` itself
# still contains `_id`.
df_review = df.drop('_id')
df_review.show()
++--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+ +| review_id| user_id| business_id|stars|useful|funny|cool| text| date| ++--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+ +|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...| 3| 0| 0| 0|If you decide to ...|2018-07-07 22:09:11| +|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...| 5| 1| 0| 1|I've taken a lot ...|2012-01-03 15:28:18| +|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...| 3| 0| 0| 0|Family diner. Had...|2014-02-05 20:30:30| +|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...| 5| 1| 0| 1|Wow! Yummy, diff...|2015-01-04 00:01:03| +|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...| 4| 1| 0| 1|Cute interior and...|2017-01-14 20:54:15| +|JrIxlS1TzJ-iCu79u...|eUta8W_HdHMXPzLBB...|04UD14gamNjLY0IDY...| 1| 1| 2| 1|I am a long term ...|2015-09-23 23:10:31| +|6AxgBCNX_PNTOxmbR...|r3zeYsv1XFBRA4dJp...|gmjsEdUsKpj9Xxu6p...| 5| 0| 2| 0|Loved this tour! 
...|2015-01-03 23:21:18| +|_ZeMknuYdlQcUqng_...|yfFzsLmaWF2d4Sr0U...|LHSTtnW3YHCeUkRDG...| 5| 2| 0| 0|Amazingly amazing...|2015-08-07 02:29:16| +|ZKvDG2sBvHVdF5oBN...|wSTuiTk-sKNdcFypr...|B5XSoSG3SfvQGtKEG...| 3| 1| 1| 0|This easter inste...|2016-03-30 22:46:33| +|pUycOfUwM8vqX7KjR...|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...| 3| 0| 0| 0|Had a party of 6 ...|2016-07-25 07:31:06| +|rGQRf8UafX7OTlMNN...|1WHRWwQmZOZDAhp2Q...|uMvVYRgGNXf5boolA...| 5| 2| 0| 0|My experience wit...|2015-06-21 14:48:06| +|l3Wk_mvAog6XANIuG...|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...| 4| 0| 0| 0|Locals recommende...|2015-08-19 14:31:45| +|XW_LfMv0fV21l9c6x...|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...| 4| 0| 0| 0|Love going here f...|2014-06-27 22:44:01| +|8JFGBuHMoiNDyfcxu...|smOvOajNG0lS4Pq7d...|RZtGWDLCAtuipwaZ-...| 4| 0| 0| 0|Good food--loved ...|2009-10-14 19:57:14| +|UBp0zWyH60Hmw6Fsa...|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...| 4| 0| 2| 0|The bun makes the...|2011-10-27 17:12:05| +|OAhBYw8IQ6wlfw1ow...|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...| 5| 0| 0| 0|Great place for b...|2014-10-11 16:22:06| +|oyaMhzBSwfGgemSGu...|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...| 5| 0| 0| 0|Tremendous servic...|2013-06-24 11:21:25| +|LnGZB0fjfgeVDVz5I...|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...| 4| 1| 0| 0|The hubby and I h...|2014-08-10 19:41:43| +|u2vzZaOqJ2feRshaa...|NDZvyYHTUWWu-kqgQ...|CLEWowfkj-wKYJlQD...| 5| 2| 0| 1|I go to blow bar ...|2016-03-07 00:02:18| +|Xs8Z8lmKkosqW5mw_...|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...| 5| 0| 0| 0|My absolute favor...|2014-11-12 15:30:27| ++--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+ +only showing top 20 rows + ++
# Print one value from the "text" column.
# limit(1) keeps the collect() to a single row on the driver; the loop simply
# prints nothing if the DataFrame is empty.
for row in df_review.select("text").limit(1).collect():
    print(row.text)
+If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker. ++
# Create two columns, `date` and `time`, from the original `date` field:
# trim it and parse it into a temporary timestamp column named
# `date_trimmed`, derive both columns from that, and then drop `date_trimmed`.

# `trim` and `col` are imported by name so this cell does not depend on an
# earlier wildcard import; `split` is kept from the original import list.
from pyspark.sql.functions import col, date_format, split, to_timestamp, trim

# NOTE: this cell overwrites `date` with a date-only string, so running it a
# second time on the same `df_review` would no longer match the
# "yyyy-MM-dd HH:mm:ss" pattern below.
df_review = (
    df_review
    # Parse the trimmed `date` value into a proper timestamp.
    .withColumn("date_trimmed", to_timestamp(trim(col("date")), "yyyy-MM-dd HH:mm:ss"))
    # Format the two parts explicitly instead of splitting the timestamp on a
    # space, which relied on an implicit timestamp-to-string cast.
    .withColumn("date", date_format(col("date_trimmed"), "yyyy-MM-dd"))
    .withColumn("time", date_format(col("date_trimmed"), "HH:mm:ss"))
    # The temporary timestamp column is no longer needed.
    .drop("date_trimmed")
)

# Show the updated DataFrame
df_review.show()
++--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+--------+ +| review_id| user_id| business_id|stars|useful|funny|cool| text| date| time| ++--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+--------+ +|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...| 3| 0| 0| 0|If you decide to ...|2018-07-07|22:09:11| +|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...| 5| 1| 0| 1|I've taken a lot ...|2012-01-03|15:28:18| +|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...| 3| 0| 0| 0|Family diner. Had...|2014-02-05|20:30:30| +|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...| 5| 1| 0| 1|Wow! Yummy, diff...|2015-01-04|00:01:03| +|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...| 4| 1| 0| 1|Cute interior and...|2017-01-14|20:54:15| +|JrIxlS1TzJ-iCu79u...|eUta8W_HdHMXPzLBB...|04UD14gamNjLY0IDY...| 1| 1| 2| 1|I am a long term ...|2015-09-23|23:10:31| +|6AxgBCNX_PNTOxmbR...|r3zeYsv1XFBRA4dJp...|gmjsEdUsKpj9Xxu6p...| 5| 0| 2| 0|Loved this tour! 
...|2015-01-03|23:21:18| +|_ZeMknuYdlQcUqng_...|yfFzsLmaWF2d4Sr0U...|LHSTtnW3YHCeUkRDG...| 5| 2| 0| 0|Amazingly amazing...|2015-08-07|02:29:16| +|ZKvDG2sBvHVdF5oBN...|wSTuiTk-sKNdcFypr...|B5XSoSG3SfvQGtKEG...| 3| 1| 1| 0|This easter inste...|2016-03-30|22:46:33| +|pUycOfUwM8vqX7KjR...|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...| 3| 0| 0| 0|Had a party of 6 ...|2016-07-25|07:31:06| +|rGQRf8UafX7OTlMNN...|1WHRWwQmZOZDAhp2Q...|uMvVYRgGNXf5boolA...| 5| 2| 0| 0|My experience wit...|2015-06-21|14:48:06| +|l3Wk_mvAog6XANIuG...|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...| 4| 0| 0| 0|Locals recommende...|2015-08-19|14:31:45| +|XW_LfMv0fV21l9c6x...|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...| 4| 0| 0| 0|Love going here f...|2014-06-27|22:44:01| +|8JFGBuHMoiNDyfcxu...|smOvOajNG0lS4Pq7d...|RZtGWDLCAtuipwaZ-...| 4| 0| 0| 0|Good food--loved ...|2009-10-14|19:57:14| +|UBp0zWyH60Hmw6Fsa...|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...| 4| 0| 2| 0|The bun makes the...|2011-10-27|17:12:05| +|OAhBYw8IQ6wlfw1ow...|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...| 5| 0| 0| 0|Great place for b...|2014-10-11|16:22:06| +|oyaMhzBSwfGgemSGu...|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...| 5| 0| 0| 0|Tremendous servic...|2013-06-24|11:21:25| +|LnGZB0fjfgeVDVz5I...|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...| 4| 1| 0| 0|The hubby and I h...|2014-08-10|19:41:43| +|u2vzZaOqJ2feRshaa...|NDZvyYHTUWWu-kqgQ...|CLEWowfkj-wKYJlQD...| 5| 2| 0| 1|I go to blow bar ...|2016-03-07|00:02:18| +|Xs8Z8lmKkosqW5mw_...|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...| 5| 0| 0| 0|My absolute favor...|2014-11-12|15:30:27| ++--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+--------+ +only showing top 20 rows + ++
# Write the processed reviews as CSV (with a header row) to
# /content/drive/MyDrive/ProcessedCSV on Google Drive.
# NOTE: Spark writes a directory named review.csv containing part files
# rather than a single file, and with the default save mode this call raises
# an error if that path already exists (e.g. when the cell is re-run).
df_review.write.csv("/content/drive/MyDrive/ProcessedCSV/review.csv", header=True)
+
+
RecBizPred
+