Skip to content

Commit 455fcef

Browse files
Initial Commit
1 parent e3d6509 commit 455fcef

File tree

1 file changed

+316
-0
lines changed

1 file changed

+316
-0
lines changed

Diff for: 12-MiscDemoNotebook/MiscDemo.ipynb

+316
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 5,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"root\n",
13+
" |-- name: string (nullable = true)\n",
14+
" |-- day: string (nullable = true)\n",
15+
" |-- month: string (nullable = true)\n",
16+
" |-- year: string (nullable = true)\n",
17+
"\n"
18+
]
19+
}
20+
],
21+
"source": [
22+
"import findspark\n",
23+
"findspark.init()\n",
24+
"\n",
25+
"import pyspark\n",
26+
"from pyspark.sql import *\n",
27+
"from pyspark.sql.functions import *\n",
28+
"from pyspark.sql.types import *\n",
29+
"\n",
30+
"spark = SparkSession.builder \\\n",
31+
" .master(\"local[3]\") \\\n",
32+
" .appName(\"MiscDemo\") \\\n",
33+
" .getOrCreate()\n",
34+
"\n",
35+
"data_list = [(\"Ravi\", \"28\", \"1\", \"2002\"),\n",
36+
" (\"Abdul\", \"23\", \"5\", \"81\"), # 1981\n",
37+
" (\"John\", \"12\", \"12\", \"6\"), # 2006\n",
38+
" (\"Rosy\", \"7\", \"8\", \"63\"), # 1963\n",
39+
" (\"Abdul\", \"23\", \"5\", \"81\") # 1981\n",
40+
" ]\n",
41+
"raw_df = spark.createDataFrame(data_list).toDF(\"name\", \"day\", \"month\", \"year\").repartition(3)\n",
42+
"raw_df.printSchema()"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 7,
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"name": "stdout",
52+
"output_type": "stream",
53+
"text": [
54+
"+-----+---+-----+----+-----------+\n",
55+
"| name|day|month|year| id|\n",
56+
"+-----+---+-----+----+-----------+\n",
57+
"|Abdul| 23| 5| 81| 0|\n",
58+
"| Ravi| 28| 1|2002| 8589934592|\n",
59+
"|Abdul| 23| 5| 81| 8589934593|\n",
60+
"| John| 12| 12| 6|17179869184|\n",
61+
"| Rosy| 7| 8| 63|17179869185|\n",
62+
"+-----+---+-----+----+-----------+\n",
63+
"\n"
64+
]
65+
}
66+
],
67+
"source": [
68+
"df1 = raw_df.withColumn(\"id\", monotonically_increasing_id())\n",
69+
"df1.show()"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": 8,
75+
"metadata": {},
76+
"outputs": [
77+
{
78+
"name": "stdout",
79+
"output_type": "stream",
80+
"text": [
81+
"+-----+---+-----+------+-----------+\n",
82+
"| name|day|month| year| id|\n",
83+
"+-----+---+-----+------+-----------+\n",
84+
"|Abdul| 23| 5|1981.0| 0|\n",
85+
"| Ravi| 28| 1| 2002| 8589934592|\n",
86+
"|Abdul| 23| 5|1981.0| 8589934593|\n",
87+
"| John| 12| 12|2006.0|17179869184|\n",
88+
"| Rosy| 7| 8|1963.0|17179869185|\n",
89+
"+-----+---+-----+------+-----------+\n",
90+
"\n"
91+
]
92+
}
93+
],
94+
"source": [
95+
"df2 = df1.withColumn(\"year\", expr(\"\"\"\n",
96+
" case when year < 21 then year + 2000\n",
97+
" when year < 100 then year + 1900\n",
98+
" else year\n",
99+
" end\"\"\"))\n",
100+
"df2.show()"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": 9,
106+
"metadata": {},
107+
"outputs": [
108+
{
109+
"name": "stdout",
110+
"output_type": "stream",
111+
"text": [
112+
"+-----+---+-----+----+-----------+\n",
113+
"| name|day|month|year| id|\n",
114+
"+-----+---+-----+----+-----------+\n",
115+
"|Abdul| 23| 5|1981| 0|\n",
116+
"| Ravi| 28| 1|2002| 8589934592|\n",
117+
"|Abdul| 23| 5|1981| 8589934593|\n",
118+
"| John| 12| 12|2006|17179869184|\n",
119+
"| Rosy| 7| 8|1963|17179869185|\n",
120+
"+-----+---+-----+----+-----------+\n",
121+
"\n"
122+
]
123+
}
124+
],
125+
"source": [
126+
"df3 = df1.withColumn(\"year\", expr(\"\"\"\n",
127+
" case when year < 21 then cast(year as int) + 2000\n",
128+
" when year < 100 then cast(year as int) + 1900\n",
129+
" else year\n",
130+
" end\"\"\"))\n",
131+
"df3.show()"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": 11,
137+
"metadata": {},
138+
"outputs": [
139+
{
140+
"name": "stdout",
141+
"output_type": "stream",
142+
"text": [
143+
"+-----+---+-----+----+-----------+\n",
144+
"| name|day|month|year| id|\n",
145+
"+-----+---+-----+----+-----------+\n",
146+
"|Abdul| 23| 5|1981| 0|\n",
147+
"| Ravi| 28| 1|2002| 8589934592|\n",
148+
"|Abdul| 23| 5|1981| 8589934593|\n",
149+
"| John| 12| 12|2006|17179869184|\n",
150+
"| Rosy| 7| 8|1963|17179869185|\n",
151+
"+-----+---+-----+----+-----------+\n",
152+
"\n",
153+
"root\n",
154+
" |-- name: string (nullable = true)\n",
155+
" |-- day: string (nullable = true)\n",
156+
" |-- month: string (nullable = true)\n",
157+
" |-- year: integer (nullable = true)\n",
158+
" |-- id: long (nullable = false)\n",
159+
"\n"
160+
]
161+
}
162+
],
163+
"source": [
164+
"df4 = df1.withColumn(\"year\", expr(\"\"\"\n",
165+
" case when year < 21 then year + 2000\n",
166+
" when year < 100 then year + 1900\n",
167+
" else year\n",
168+
" end\"\"\").cast(IntegerType()))\n",
169+
"df4.show()\n",
170+
"df4.printSchema()"
171+
]
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": 14,
176+
"metadata": {},
177+
"outputs": [
178+
{
179+
"name": "stdout",
180+
"output_type": "stream",
181+
"text": [
182+
"+-----+---+-----+----+-----------+\n",
183+
"| name|day|month|year| id|\n",
184+
"+-----+---+-----+----+-----------+\n",
185+
"|Abdul| 23| 5|1981| 0|\n",
186+
"| Ravi| 28| 1|2002| 8589934592|\n",
187+
"|Abdul| 23| 5|1981| 8589934593|\n",
188+
"| John| 12| 12|2006|17179869184|\n",
189+
"| Rosy| 7| 8|1963|17179869185|\n",
190+
"+-----+---+-----+----+-----------+\n",
191+
"\n"
192+
]
193+
}
194+
],
195+
"source": [
196+
"df5 = df1.withColumn(\"day\", col(\"day\").cast(IntegerType())) \\\n",
197+
" .withColumn(\"month\", col(\"month\").cast(IntegerType())) \\\n",
198+
" .withColumn(\"year\", col(\"year\").cast(IntegerType())) \n",
199+
"\n",
200+
"df6 = df5.withColumn(\"year\", expr(\"\"\"\n",
201+
" case when year < 21 then year + 2000\n",
202+
" when year < 100 then year + 1900\n",
203+
" else year\n",
204+
" end\"\"\"))\n",
205+
"df6.show()"
206+
]
207+
},
208+
{
209+
"cell_type": "code",
210+
"execution_count": 15,
211+
"metadata": {},
212+
"outputs": [
213+
{
214+
"name": "stdout",
215+
"output_type": "stream",
216+
"text": [
217+
"+-----+---+-----+----+-----------+\n",
218+
"| name|day|month|year| id|\n",
219+
"+-----+---+-----+----+-----------+\n",
220+
"|Abdul| 23| 5|1981| 0|\n",
221+
"| Ravi| 28| 1|2002| 8589934592|\n",
222+
"|Abdul| 23| 5|1981| 8589934593|\n",
223+
"| John| 12| 12|2006|17179869184|\n",
224+
"| Rosy| 7| 8|1963|17179869185|\n",
225+
"+-----+---+-----+----+-----------+\n",
226+
"\n"
227+
]
228+
}
229+
],
230+
"source": [
231+
"df7 = df5.withColumn(\"year\", \\\n",
232+
" when(col(\"year\") < 21, col(\"year\") + 2000) \\\n",
233+
" .when(col(\"year\") < 100, col(\"year\") + 1900) \\\n",
234+
" .otherwise(col(\"year\")))\n",
235+
"df7.show()"
236+
]
237+
},
238+
{
239+
"cell_type": "code",
240+
"execution_count": 16,
241+
"metadata": {},
242+
"outputs": [
243+
{
244+
"name": "stdout",
245+
"output_type": "stream",
246+
"text": [
247+
"+-----+---+-----+----+-----------+----------+\n",
248+
"| name|day|month|year| id| dob|\n",
249+
"+-----+---+-----+----+-----------+----------+\n",
250+
"|Abdul| 23| 5|1981| 0|1981-05-23|\n",
251+
"| Ravi| 28| 1|2002| 8589934592|2002-01-28|\n",
252+
"|Abdul| 23| 5|1981| 8589934593|1981-05-23|\n",
253+
"| John| 12| 12|2006|17179869184|2006-12-12|\n",
254+
"| Rosy| 7| 8|1963|17179869185|1963-08-07|\n",
255+
"+-----+---+-----+----+-----------+----------+\n",
256+
"\n"
257+
]
258+
}
259+
],
260+
"source": [
261+
"df8 = df7.withColumn(\"dob\", expr(\"to_date(concat(day,'/',month,'/',year), 'd/M/y')\"))\n",
262+
"df8.show()"
263+
]
264+
},
265+
{
266+
"cell_type": "code",
267+
"execution_count": 20,
268+
"metadata": {},
269+
"outputs": [
270+
{
271+
"name": "stdout",
272+
"output_type": "stream",
273+
"text": [
274+
"+-----+-----------+----------+\n",
275+
"| name| id| dob|\n",
276+
"+-----+-----------+----------+\n",
277+
"| Rosy|17179869185|1963-08-07|\n",
278+
"|Abdul| 0|1981-05-23|\n",
279+
"| Ravi| 8589934592|2002-01-28|\n",
280+
"| John|17179869184|2006-12-12|\n",
281+
"+-----+-----------+----------+\n",
282+
"\n"
283+
]
284+
}
285+
],
286+
"source": [
287+
"df9 = df7.withColumn(\"dob\", to_date(expr(\"concat(day,'/',month,'/',year)\"), 'd/M/y')) \\\n",
288+
" .drop(\"day\", \"month\", \"year\") \\\n",
289+
" .dropDuplicates([\"name\", \"dob\"]) \\\n",
290+
" .sort(expr(\"dob desc\"))\n",
291+
"df9.show()"
292+
]
293+
}
294+
],
295+
"metadata": {
296+
"kernelspec": {
297+
"display_name": "Python 3",
298+
"language": "python",
299+
"name": "python3"
300+
},
301+
"language_info": {
302+
"codemirror_mode": {
303+
"name": "ipython",
304+
"version": 3
305+
},
306+
"file_extension": ".py",
307+
"mimetype": "text/x-python",
308+
"name": "python",
309+
"nbconvert_exporter": "python",
310+
"pygments_lexer": "ipython3",
311+
"version": "3.7.6"
312+
}
313+
},
314+
"nbformat": 4,
315+
"nbformat_minor": 2
316+
}

0 commit comments

Comments
 (0)