forked from qalhata/Python-Scripts-Repo-on-Data-Science
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDat_Clean_Analysis.py
103 lines (64 loc) · 1.96 KB
/
Dat_Clean_Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 7 02:30:02 2017
@author: Shabaka
"""
# ''Load and View Data ''''''''''#
# Import pandas
import pandas as pd
import matplotlib.pyplot as plt
# Read the two input files into DataFrames: df (fixations) and df2 (aero data).
# NOTE: this script was adapted from one that read
# 'dob_job_application_filings_subset.csv'.
df = pd.read_csv('fixations.csv')
df2 = pd.read_csv('aerodata.csv')

# First and last rows of df
print(df.head())
print(df.tail())

# First and last rows of df2
print('AERO DATA OUTPUT')
print(df2.head())
print(df2.tail())

# Shape of each frame as (rows, columns)
print(df.shape)
print(df2.shape)

# Column labels of each frame
print(df.columns)
print(df2.columns)

# Summary of each frame. DataFrame.info() writes its report to stdout itself
# and returns None, so it must not be wrapped in print() -- doing so emits a
# stray "None" line after each report.
df.info()
df2.info()
# ---- Frequency counts (missing values are counted too: dropna=False) ----

# 'duration': occurrences of each distinct value, then the column's shape
duration_col = df['duration']
print(duration_col.value_counts(dropna=False))
print(duration_col.shape)

# 'confidence': occurrences of each distinct value, then the column's shape
confidence_col = df['confidence']
print(confidence_col.value_counts(dropna=False))
print(confidence_col.shape)

# 'avg_pupil_size': occurrences of each distinct value
pupil_size_col = df['avg_pupil_size']
print(pupil_size_col.value_counts(dropna=False))
# ---- Single-variable histogram ----
# Histogram of 'duration' with both axes log-scaled and labels rotated
# 70 degrees; .plot.hist(...) is the accessor form of .plot(kind='hist', ...).
df['duration'].plot.hist(rot=70, logx=True, logy=True)
# Render the figure
plt.show()
# ---- Multi-variable box plot ----
# Box plot of 'duration' grouped by 'avg_pupil_size', labels rotated 90 degrees.
boxplot_kwargs = {'column': 'duration', 'by': 'avg_pupil_size', 'rot': 90}
df.boxplot(**boxplot_kwargs)
# Render the figure
plt.show()
# ---- Multi-variable scatter plots ----
# NOTE(review): this section previously plotted 'initial_cost' against
# 'total_est_fee' -- column names left over from the DOB job-filings dataset
# this script was adapted from -- and plotted `df_subset`, a name that was
# never defined (NameError). It now uses columns this script already reads
# from df; confirm these are the variables you want to compare.
scatter_x = 'duration'
scatter_y = 'avg_pupil_size'

# First scatter plot: every row of df
df.plot(kind='scatter', x=scatter_x, y=scatter_y, rot=70)
plt.show()

# Second scatter plot: a subset of df.
# NOTE(review): the original definition of df_subset is not in this file.
# As a stand-in, keep only rows at or below the 99th percentile on both
# axes so extreme values do not dominate the plot -- replace with the
# intended subset if it differs.
upper_bounds = df[[scatter_x, scatter_y]].quantile(0.99)
within_bounds = (
    (df[scatter_x] <= upper_bounds[scatter_x])
    & (df[scatter_y] <= upper_bounds[scatter_y])
)
df_subset = df[within_bounds]
df_subset.plot(kind='scatter', x=scatter_x, y=scatter_y, rot=70)
plt.show()