Initial Commit

LearningJournal · LearningJournal · commit 455fcef1726c · 2020-06-27T10:31:05.000+05:30
diff --git a/12-MiscDemoNotebook/MiscDemo.ipynb b/12-MiscDemoNotebook/MiscDemo.ipynb
@@ -0,0 +1,316 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- name: string (nullable = true)\n",
+      " |-- day: string (nullable = true)\n",
+      " |-- month: string (nullable = true)\n",
+      " |-- year: string (nullable = true)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import findspark\n",
+    "findspark.init()\n",
+    "\n",
+    "import pyspark\n",
+    "from pyspark.sql import *\n",
+    "from pyspark.sql.functions import *\n",
+    "from pyspark.sql.types import *\n",
+    "\n",
+    "spark = SparkSession.builder \\\n",
+    "            .master(\"local[3]\") \\\n",
+    "            .appName(\"MiscDemo\") \\\n",
+    "            .getOrCreate()\n",
+    "\n",
+    "data_list = [(\"Ravi\", \"28\", \"1\", \"2002\"),\n",
+    "             (\"Abdul\", \"23\", \"5\", \"81\"), # 1981\n",
+    "             (\"John\", \"12\", \"12\", \"6\"), # 2006\n",
+    "             (\"Rosy\", \"7\", \"8\", \"63\"), # 1963\n",
+    "             (\"Abdul\", \"23\", \"5\", \"81\") # 1981\n",
+    "            ]\n",
+    "raw_df = spark.createDataFrame(data_list).toDF(\"name\", \"day\", \"month\", \"year\").repartition(3)\n",
+    "raw_df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+---+-----+----+-----------+\n",
+      "| name|day|month|year|         id|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "|Abdul| 23|    5|  81|          0|\n",
+      "| Ravi| 28|    1|2002| 8589934592|\n",
+      "|Abdul| 23|    5|  81| 8589934593|\n",
+      "| John| 12|   12|   6|17179869184|\n",
+      "| Rosy|  7|    8|  63|17179869185|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df1 = raw_df.withColumn(\"id\", monotonically_increasing_id())\n",
+    "df1.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+---+-----+------+-----------+\n",
+      "| name|day|month|  year|         id|\n",
+      "+-----+---+-----+------+-----------+\n",
+      "|Abdul| 23|    5|1981.0|          0|\n",
+      "| Ravi| 28|    1|  2002| 8589934592|\n",
+      "|Abdul| 23|    5|1981.0| 8589934593|\n",
+      "| John| 12|   12|2006.0|17179869184|\n",
+      "| Rosy|  7|    8|1963.0|17179869185|\n",
+      "+-----+---+-----+------+-----------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df2 = df1.withColumn(\"year\", expr(\"\"\"\n",
+    "         case when year < 21 then year + 2000\n",
+    "         when year < 100 then year + 1900\n",
+    "         else year\n",
+    "         end\"\"\"))\n",
+    "df2.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+---+-----+----+-----------+\n",
+      "| name|day|month|year|         id|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "|Abdul| 23|    5|1981|          0|\n",
+      "| Ravi| 28|    1|2002| 8589934592|\n",
+      "|Abdul| 23|    5|1981| 8589934593|\n",
+      "| John| 12|   12|2006|17179869184|\n",
+      "| Rosy|  7|    8|1963|17179869185|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df3 = df1.withColumn(\"year\", expr(\"\"\"\n",
+    "         case when year < 21 then cast(year as int) + 2000\n",
+    "         when year < 100 then cast(year as int) + 1900\n",
+    "         else year\n",
+    "         end\"\"\"))\n",
+    "df3.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+---+-----+----+-----------+\n",
+      "| name|day|month|year|         id|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "|Abdul| 23|    5|1981|          0|\n",
+      "| Ravi| 28|    1|2002| 8589934592|\n",
+      "|Abdul| 23|    5|1981| 8589934593|\n",
+      "| John| 12|   12|2006|17179869184|\n",
+      "| Rosy|  7|    8|1963|17179869185|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "\n",
+      "root\n",
+      " |-- name: string (nullable = true)\n",
+      " |-- day: string (nullable = true)\n",
+      " |-- month: string (nullable = true)\n",
+      " |-- year: integer (nullable = true)\n",
+      " |-- id: long (nullable = false)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df4 = df1.withColumn(\"year\", expr(\"\"\"\n",
+    "         case when year < 21 then year + 2000\n",
+    "         when year < 100 then year + 1900\n",
+    "         else year\n",
+    "         end\"\"\").cast(IntegerType()))\n",
+    "df4.show()\n",
+    "df4.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+---+-----+----+-----------+\n",
+      "| name|day|month|year|         id|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "|Abdul| 23|    5|1981|          0|\n",
+      "| Ravi| 28|    1|2002| 8589934592|\n",
+      "|Abdul| 23|    5|1981| 8589934593|\n",
+      "| John| 12|   12|2006|17179869184|\n",
+      "| Rosy|  7|    8|1963|17179869185|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df5 = df1.withColumn(\"day\", col(\"day\").cast(IntegerType())) \\\n",
+    "         .withColumn(\"month\", col(\"month\").cast(IntegerType())) \\\n",
+    "         .withColumn(\"year\", col(\"year\").cast(IntegerType())) \n",
+    "\n",
+    "df6 = df5.withColumn(\"year\", expr(\"\"\"\n",
+    "          case when year < 21 then year + 2000\n",
+    "          when year < 100 then year + 1900\n",
+    "          else year\n",
+    "          end\"\"\"))\n",
+    "df6.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+---+-----+----+-----------+\n",
+      "| name|day|month|year|         id|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "|Abdul| 23|    5|1981|          0|\n",
+      "| Ravi| 28|    1|2002| 8589934592|\n",
+      "|Abdul| 23|    5|1981| 8589934593|\n",
+      "| John| 12|   12|2006|17179869184|\n",
+      "| Rosy|  7|    8|1963|17179869185|\n",
+      "+-----+---+-----+----+-----------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df7 = df5.withColumn(\"year\", \\\n",
+    "                    when(col(\"year\") < 21, col(\"year\") + 2000) \\\n",
+    "                    .when(col(\"year\") < 100, col(\"year\") + 1900) \\\n",
+    "                    .otherwise(col(\"year\")))\n",
+    "df7.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+---+-----+----+-----------+----------+\n",
+      "| name|day|month|year|         id|       dob|\n",
+      "+-----+---+-----+----+-----------+----------+\n",
+      "|Abdul| 23|    5|1981|          0|1981-05-23|\n",
+      "| Ravi| 28|    1|2002| 8589934592|2002-01-28|\n",
+      "|Abdul| 23|    5|1981| 8589934593|1981-05-23|\n",
+      "| John| 12|   12|2006|17179869184|2006-12-12|\n",
+      "| Rosy|  7|    8|1963|17179869185|1963-08-07|\n",
+      "+-----+---+-----+----+-----------+----------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df8 = df7.withColumn(\"dob\", expr(\"to_date(concat(day,'/',month,'/',year), 'd/M/y')\"))\n",
+    "df8.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----+-----------+----------+\n",
+      "| name|         id|       dob|\n",
+      "+-----+-----------+----------+\n",
+      "| Rosy|17179869185|1963-08-07|\n",
+      "|Abdul|          0|1981-05-23|\n",
+      "| Ravi| 8589934592|2002-01-28|\n",
+      "| John|17179869184|2006-12-12|\n",
+      "+-----+-----------+----------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df9 = df7.withColumn(\"dob\", to_date(expr(\"concat(day,'/',month,'/',year)\"), 'd/M/y')) \\\n",
+    "         .drop(\"day\", \"month\", \"year\") \\\n",
+    "         .dropDuplicates([\"name\", \"dob\"]) \\\n",
+    "         .sort(expr(\"dob desc\"))\n",
+    "df9.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}