This notebook contains the following sections:
- Importing necessary libraries & dataset
- Building a dataset module
- Building a performance module
- Building an evaluator module
  - Evaluated algorithm submodule
  - Evaluated data submodule
- Building a hybrid module
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
!pip install surprise
Requirement already satisfied: surprise in /usr/local/lib/python3.6/dist-packages (0.1)
Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.6/dist-packages (from surprise) (1.1.0)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.4.1)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (0.14.1)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.12.0)
Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.18.2)
folderpath='drive/My Drive/datasets/'
import os
import csv
import sys
import re
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import dump
from collections import defaultdict
This module takes the raw dataset and provides the processed dataset along with other details.
It has the following functions:
- loadDataset
- getUserRatings
- getPopularityRanks
- getArtistName
- getArtistID
#user_id artist_mbid artist_name plays norm_plays rating
class DataLoader:
    path = 'drive/My Drive/datasets/user-songs-rating-3000.csv'
    artistID_to_name = {}
    name_to_artistID = {}

    #user_id artist_mbid norm_plays rating
    def loadDataset(self):
        ratingsDataset = 0
        self.artistID_to_name = {}
        self.name_to_artistID = {}
        reader = Reader(rating_scale=(0, 5))
        df_matrix = pd.read_csv(self.path)
        #df_matrix = df_matrix.iloc[:200000, :]
        ratingsDataset = Dataset.load_from_df(df_matrix[['user_id', 'artist_mbid', 'rating']], reader)
        with open(self.path, newline='', encoding='ISO-8859-1') as csvfile:
            artistReader = csv.reader(csvfile)
            next(artistReader)  # Skip header line
            for row in artistReader:
                artistID = row[1]
                artistName = row[2]
                self.artistID_to_name[artistID] = artistName
                self.name_to_artistID[artistName] = artistID
        return ratingsDataset

    def getUserRatings(self, user):
        userRatings = []
        hitUser = False
        with open(self.path, newline='', encoding='ISO-8859-1') as csvfile:
            ratingReader = csv.reader(csvfile)
            next(ratingReader)
            for row in ratingReader:
                userID = row[0]
                if (user == userID):
                    artistID = row[1]
                    rating = float(row[5])
                    userRatings.append((artistID, rating))
                    hitUser = True
                if (hitUser and (user != userID)):
                    break
        return userRatings

    def getPopularityRanks(self):
        ratings = defaultdict(int)
        rankings = defaultdict(int)
        with open(self.path, newline='', encoding='ISO-8859-1') as csvfile:
            ratingReader = csv.reader(csvfile)
            next(ratingReader)
            for row in ratingReader:
                artistID = row[1]
                ratings[artistID] += 1
        rank = 1
        for artistID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
            rankings[artistID] = rank
            rank += 1
        return rankings

    def getArtistName(self, artistID):
        if artistID in self.artistID_to_name:
            return self.artistID_to_name[artistID]
        else:
            return ""

    def getArtistID(self, artistName):
        if artistName in self.name_to_artistID:
            return self.name_to_artistID[artistName]
        else:
            return 0
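A minimal usage sketch of the DataLoader (the artist MBID below is a hypothetical placeholder, not a real identifier):
# Hypothetical usage sketch; 'some-artist-mbid' is a placeholder, not a real ID
loader = DataLoader()
ratings_data = loader.loadDataset()                              # surprise Dataset built from the CSV
print(loader.getArtistName('some-artist-mbid'))                  # '' when the MBID is unknown
print(loader.getPopularityRanks().get('some-artist-mbid', 0))    # rank 1 = most-played artist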
This module generates metrics from the predictions of the models. It outputs two metrics:
- Mean Absolute Error (MAE)
- Root Mean Square Error (RMSE)
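For N predictions with actual rating r and estimated rating r_hat, MAE = (1/N) * sum(|r_hat - r|) and RMSE = sqrt((1/N) * sum((r_hat - r)^2)); RMSE penalizes large individual errors more heavily than MAE.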
from surprise import accuracy
class PerformanceMetrics:
    @staticmethod
    def MAE(predictions):
        return accuracy.mae(predictions)

    @staticmethod
    def RMSE(predictions):
        return accuracy.rmse(predictions)
This module wraps an algorithm/model so it can be trained on the dataset. It has the following methods:
- GetName - returns the name of the model
- GetModel - returns the model
- SaveModel - saves the model
- Evaluate - trains the model and returns the metrics
class ModelBuilder:
    def __init__(self, model, name):
        self.model = model
        self.name = name

    def GetName(self):
        return self.name

    def GetModel(self):
        return self.model

    def SaveModel(self, predictions):
        dump.dump(folderpath + self.name, predictions, self.model)
        print('Model saved at ' + folderpath + self.name)

    def Evaluate(self, evaluationData, save=False):
        metrics = {}
        # Compute accuracy
        print("Evaluating accuracy...")
        predictions = self.model.fit(evaluationData.GetTrainSet()).test(evaluationData.GetTestSet())
        metrics["RMSE"] = PerformanceMetrics.RMSE(predictions)
        metrics["MAE"] = PerformanceMetrics.MAE(predictions)
        print("Analysis complete.")
        if (save):
            print('saving the model.....')
            self.SaveModel(predictions)
        return metrics
This module loads a set of models and returns the metrics/performance of each algorithm.
It has the following functions:
- AddModel
- Evaluate
- flushModels
class ModelFactory:
    models = []

    def __init__(self, dataset):
        ed = DataGenerator(dataset)
        self.dataset = ed
        self.models = []

    def AddModel(self, model, name):
        alg = ModelBuilder(model, name)
        self.models.append(alg)

    def Evaluate(self, save=False):
        results = {}
        for model in self.models:
            print("Evaluating ", model.GetName(), "...")
            results[model.GetName()] = model.Evaluate(self.dataset, save)
        # Print results
        print("\n")
        print(results)

    def flushModels(self):
        self.models = []
This module takes the dataset, splits it into a training set and a testing set, and returns them.
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
class DataGenerator:
    def __init__(self, data):
        # Build a 75/25 train/test split for measuring accuracy
        self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet
This module takes multiple models along with their preference weights and returns a weighted blend of their predictions.
from surprise import AlgoBase
class HybridModel(AlgoBase):
    def __init__(self, models, weights, sim_options={}):
        AlgoBase.__init__(self)
        self.models = models
        self.weights = weights

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        for model in self.models:
            model.fit(trainset)
        return self

    def estimate(self, user_id, item_id):
        scores_sum = 0
        weights_sum = 0
        for i in range(len(self.models)):
            # Weighted sum of each model's estimate, e.g. 3*1/4 + 4*3/4
            scores_sum += self.models[i].estimate(user_id, item_id) * self.weights[i]
            weights_sum += self.weights[i]  # sums to one when the weights are normalized
        return scores_sum / weights_sum
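As a worked example with hypothetical estimates: for weights [0.25, 0.75] and model estimates of 3.0 and 4.0, the blended score is (3.0*0.25 + 4.0*0.75) / (0.25 + 0.75) = 3.75; with the equal weights of 0.5 used below, weights_sum is 1.0 and the result is a plain average.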
def LoadData():
    ml = DataLoader()
    print("Loading songs ratings...")
    data = ml.loadDataset()
    return (ml, data)
# Load up common data set for the recommender algorithms
(ml, evaluationData) = LoadData()
Loading songs ratings...
from surprise import BaselineOnly
# Construct a model factory to evaluate the algorithms
modelfactory = ModelFactory(evaluationData)
# BaselineOnly
baseline= BaselineOnly()
modelfactory.AddModel(baseline, "baseline")
modelfactory.Evaluate(True)
Evaluating baseline ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9934
MAE: 0.6817
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/baseline
{'baseline': {'RMSE': 0.9934226190571713, 'MAE': 0.6816700266309835}}
from surprise import SVD
# SVD
svd= SVD()
modelfactory.AddModel(svd, "svd")
modelfactory.Evaluate()
Evaluating baseline ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9934
MAE: 0.6817
Analysis complete.
Evaluating svd ...
Evaluating accuracy...
RMSE: 1.0063
MAE: 0.6831
Analysis complete.
{'baseline': {'RMSE': 0.9934226190571713, 'MAE': 0.6816700266309835}, 'svd': {'RMSE': 1.0063057804737225, 'MAE': 0.6830706904210475}}
#Combine them
Hybrid = HybridModel([svd, baseline], [0.5, 0.5])
# Evaluate the individual models and the hybrid together
modelfactory.AddModel(Hybrid, "Hybrid")
modelfactory.Evaluate(True)
Evaluating baseline ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9934
MAE: 0.6817
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/baseline
Evaluating svd ...
Evaluating accuracy...
RMSE: 1.0058
MAE: 0.6821
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/svd
Evaluating Hybrid ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9971
MAE: 0.6795
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/Hybrid
{'baseline': {'RMSE': 0.9934226190571713, 'MAE': 0.6816700266309835}, 'svd': {'RMSE': 1.005848123790534, 'MAE': 0.6821244100215875}, 'Hybrid': {'RMSE': 0.9970686978736479, 'MAE': 0.6794833671716486}}
RBMs have two layers: an input layer, also known as the visible layer, and a hidden layer. Neurons in each layer communicate with neurons in the other layer but not with neurons in the same layer; there is no intra-layer communication among the neurons.
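The RBM and RBMModel modules imported below live in the Drive folder and are not shown in this notebook. As a rough structural illustration of the two-layer idea described above (hypothetical layer sizes, not the code in RBM.py), one Gibbs step of a small Bernoulli RBM looks like this:
# Rough structural illustration only (hypothetical sizes); NOT the RBM.py implementation.
import numpy as np

n_visible, n_hidden = 6, 4
rng = np.random.default_rng(0)
W = rng.normal(0, 0.1, size=(n_visible, n_hidden))   # weights connect the two layers only
b_visible = np.zeros(n_visible)
b_hidden = np.zeros(n_hidden)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

v0 = rng.integers(0, 2, size=n_visible).astype(float)   # a binary visible vector
p_h = sigmoid(v0 @ W + b_hidden)          # visible -> hidden; no hidden-hidden connections
h0 = (rng.random(n_hidden) < p_h).astype(float)
p_v = sigmoid(h0 @ W.T + b_visible)       # hidden -> visible; no visible-visible connections
print(p_h, p_v)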
import sys
sys.path.append('/content/drive/My Drive/datasets/')
import RBM
import RBMModel
import importlib
importlib.reload(RBM)
importlib.reload(RBMModel)
# Construct a model factory to evaluate the algorithms
deep_factory= ModelFactory(evaluationData)
#Simple RBM
SimpleRBM = RBMModel.RBMAlgorithm(epochs=10)
deep_factory.AddModel(SimpleRBM,'rbm')
svd= SVD()
deep_factory.AddModel(svd, "svd")
#Combine them
Hybrid = HybridModel([svd, SimpleRBM], [0.5, 0.5])
# Evaluate the individual models and the hybrid together
deep_factory.AddModel(Hybrid, "Hybrid")
deep_factory.Evaluate()
Evaluating rbm ...
Evaluating accuracy...
Trained epoch 0
Trained epoch 1
Trained epoch 2
Trained epoch 3
Trained epoch 4
Trained epoch 5
Trained epoch 6
Trained epoch 7
Trained epoch 8
Trained epoch 9
Processing user 0
Processing user 50
...
Processing user 2950
RMSE: 1.5733
MAE: 1.4437
Analysis complete.
Evaluating svd ...
Evaluating accuracy...
RMSE: 1.0070
MAE: 0.6827
Analysis complete.
Evaluating Hybrid ...
Evaluating accuracy...
Trained epoch 0
Trained epoch 1
Trained epoch 2
Trained epoch 3
Trained epoch 4
Trained epoch 5
Trained epoch 6
Trained epoch 7
Trained epoch 8
Trained epoch 9
Processing user 0
Processing user 50
...
Processing user 2950
RMSE: 1.1650
MAE: 1.0040
Analysis complete.
{'rbm': {'RMSE': 1.5732653327536739, 'MAE': 1.4437298577219584}, 'svd': {'RMSE': 1.0069576973032366, 'MAE': 0.6826501315236461}, 'Hybrid': {'RMSE': 1.1649843535460234, 'MAE': 1.0040280997374962}}
Customer segmentation is one of the most important applications of unsupervised learning. Using clustering techniques, we can identify several segments of customers, which allows targeting of the potential user base.
This notebook has the following sections:
- Importing necessary libraries
- Loading the data
- Exploratory data analysis
- Applying clustering techniques
- Finding the optimal clusters
- Applying PCA
- Finding the optimal clusters after PCA
- Visualization of clusters
import numpy as np
import pandas as pd
# for standardizing the dataset
from sklearn.preprocessing import StandardScaler
# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# for the clustering techniques
from scipy.cluster.hierarchy import dendrogram,linkage
from sklearn.cluster import KMeans
df_users=pd.read_csv('user_side_data_raw.tsv',sep='\t',names=['id','gender','age','country','date'])
df_users.head()
id | gender | age | country | date | |
---|---|---|---|---|---|
0 | 00000c289a1829a808ac09c00daf10bc3c4e223b | f | 22.0 | Germany | Feb 1, 2007 |
1 | 00001411dc427966b17297bf4d69e7e193135d89 | f | NaN | Canada | Dec 4, 2007 |
2 | 00004d2ac9316e22dc007ab2243d6fcb239e707d | NaN | NaN | Germany | Sep 1, 2006 |
3 | 000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | m | 19.0 | Mexico | Apr 28, 2008 |
4 | 00007a47085b9aab8af55f52ec8846ac479ac4fe | m | 28.0 | United States | Jan 27, 2006 |
df_users.shape
(359347, 5)
df_users.isnull().sum()
id 0
gender 32775
age 74900
country 0
date 0
dtype: int64
# proportions of null values
(df_users.gender.isnull().sum()/df_users.shape[0])*100
9.12071062232327
(df_users.age.isnull().sum()/df_users.shape[0])*100
20.84336310029025
df_users.gender.fillna(df_users.gender.mode()[0],inplace=True)
df_users.age.fillna(int(df_users.age.mean()),inplace=True)
df_users.isnull().sum()
id 0
gender 0
age 0
country 0
date 0
dtype: int64
df_users.gender.value_counts()
m 274417
f 84930
Name: gender, dtype: int64
df_users.age.mean()
25.07761022076155
df_users.country.value_counts()
United States 67044
Germany 31651
United Kingdom 29902
Poland 20987
Russian Federation 19833
Brazil 14534
Sweden 13122
Spain 13051
Finland 11579
Netherlands 9650
Canada 8679
France 7529
Italy 7525
Australia 7135
Japan 6637
Turkey 6452
Norway 5155
Mexico 4834
Czech Republic 4774
Ukraine 4396
Belgium 3803
Portugal 3196
Switzerland 3053
Bulgaria 2800
Austria 2796
Chile 2794
Argentina 2640
Romania 2636
Denmark 2508
Hungary 1985
...
Equatorial Guinea 10
Saint Kitts and Nevis 10
San Marino 9
Cameroon 9
Namibia 9
Senegal 9
Saint Lucia 8
Rwanda 8
Gabon 8
Comoros 8
Mayotte 8
Tonga 8
Grenada 7
Myanmar 7
Mali 7
Malawi 6
Belize 6
Eritrea 6
Guinea-Bissau 5
Central African Republic 5
Sudan 5
Guyana 5
Suriname 5
Mauritania 5
Marshall Islands 4
French Guiana 4
Palau 3
Benin 3
Liberia 3
Gambia 3
Name: country, Length: 239, dtype: int64
df_users['joined year']=df_users.date.apply(lambda x:x[-4:])
df_users.head()
id | gender | age | country | date | joined year | |
---|---|---|---|---|---|---|
0 | 00000c289a1829a808ac09c00daf10bc3c4e223b | f | 22.0 | Germany | Feb 1, 2007 | 2007 |
1 | 00001411dc427966b17297bf4d69e7e193135d89 | f | 25.0 | Canada | Dec 4, 2007 | 2007 |
2 | 00004d2ac9316e22dc007ab2243d6fcb239e707d | m | 25.0 | Germany | Sep 1, 2006 | 2006 |
3 | 000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | m | 19.0 | Mexico | Apr 28, 2008 | 2008 |
4 | 00007a47085b9aab8af55f52ec8846ac479ac4fe | m | 28.0 | United States | Jan 27, 2006 | 2006 |
df_users.drop(['date'],axis=1,inplace=True)
df_users.set_index(['id'],inplace=True)
df_users.head()
gender | age | country | joined year | |
---|---|---|---|---|
id | ||||
00000c289a1829a808ac09c00daf10bc3c4e223b | f | 22.0 | Germany | 2007 |
00001411dc427966b17297bf4d69e7e193135d89 | f | 25.0 | Canada | 2007 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | m | 25.0 | Germany | 2006 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | m | 19.0 | Mexico | 2008 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | m | 28.0 | United States | 2006 |
df_seg=pd.get_dummies(df_users,['gender','country','joined year'])
df_seg.head()
age | gender_f | gender_m | country_Afghanistan | country_Albania | country_Algeria | country_American Samoa | country_Andorra | country_Angola | country_Anguilla | ... | country_Zambia | country_Zimbabwe | joined year_2002 | joined year_2003 | joined year_2004 | joined year_2005 | joined year_2006 | joined year_2007 | joined year_2008 | joined year_2009 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||||||
00000c289a1829a808ac09c00daf10bc3c4e223b | 22.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
00001411dc427966b17297bf4d69e7e193135d89 | 25.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | 25.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | 19.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | 28.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 250 columns
df_users.describe(include='all')
gender | age | country | joined year | |
---|---|---|---|---|
count | 359347 | 359347.000000 | 359347 | 359347 |
unique | 2 | NaN | 239 | 8 |
top | m | NaN | United States | 2008 |
freq | 274417 | NaN | 67044 | 120808 |
mean | NaN | 25.077610 | NaN | NaN |
std | NaN | 19.276048 | NaN | NaN |
min | NaN | -1337.000000 | NaN | NaN |
25% | NaN | 21.000000 | NaN | NaN |
50% | NaN | 25.000000 | NaN | NaN |
75% | NaN | 26.000000 | NaN | NaN |
max | NaN | 1002.000000 | NaN | NaN |
from sklearn.preprocessing import LabelEncoder
lb_encoder=LabelEncoder()
countries=lb_encoder.fit_transform(df_users.country)
year_encoder=LabelEncoder()
years=year_encoder.fit_transform(df_users['joined year'])
years
array([5, 5, 4, ..., 5, 3, 6])
gender_encoder=LabelEncoder()
genders=gender_encoder.fit_transform(df_users.gender)
genders
array([0, 0, 1, ..., 1, 1, 1])
Clustering is the task of grouping a set of objects so that objects in the same cluster are more similar to each other than to objects in other clusters. Similarity is a metric that reflects the strength of the relationship between two data objects. Clustering is mainly used for exploratory data mining.
We have two ways to find the optimal number of clusters:
- By hierarchical clustering, observing the dendrogram
- By plotting the WCSS for a range of cluster counts (the bottom-up, elbow approach)
The first option crashed the kernel on the full dataset because the laptop lacks sufficient hardware, so I'm opting for the second one; a sampled dendrogram sketch follows below.
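A sketch of the dendrogram route on a small random sample only (the sample size is an arbitrary choice; it was the full 359k rows that exhausted memory):
# Dendrogram on a 2,000-user random sample; the full dataset is too large for linkage()
sample = df_seg.sample(n=2000, random_state=42)
Z = linkage(sample, method='ward')
plt.figure(figsize=(12, 6))
dendrogram(Z, truncate_mode='level', p=5, no_labels=True)
plt.title('Hierarchical clustering dendrogram (2,000-user sample)')
plt.show()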
K-Means attempts to group data without having first been trained on labeled data. Once the algorithm has been run and the groups are defined, any new data can easily be assigned to the most relevant group, as the toy example below shows.
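A tiny toy illustration (made-up 2-D points, not the user data) of how a fitted K-Means model assigns new points to the nearest centroid:
# Toy example: fit on four 2-D points, then assign two new points to clusters
toy_points = np.array([[1, 1], [1, 2], [8, 8], [8, 9]])
toy_km = KMeans(n_clusters=2, random_state=0).fit(toy_points)
print(toy_km.predict(np.array([[0, 0], [9, 9]])))   # each new point gets its nearest cluster's label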
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(df_seg)
    wcss.append(kmeans.inertia_)
    print(kmeans.inertia_)
134258498.48275355
28492159.521132484
15087558.004872149
8737924.294237286
5680436.533094462
4150917.3284346457
3079109.5899161794
2474939.295240826
2105803.1053973585
1820427.9377313748
wcss
[134258498.48275355,
28492159.521132484,
15087558.004872149,
8737924.294237286,
5680436.533094462,
4150917.3284346457,
3079109.5899161794,
2474939.295240826,
2105803.1053973585,
1820427.9377313748]
f = open('wcss.txt', 'w')
for i in wcss:
    f.write(str(i))
    f.write('\n')
f.close()
plt.figure(figsize=(12,9))
plt.plot(range(1,11),wcss,marker='o',linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()
We graph the relationship between the number of clusters and the Within-Cluster Sum of Squares (WCSS), then select the number of clusters at which the decrease in WCSS begins to level off (the elbow method).
WCSS is defined as the sum of the squared distances between each member of a cluster and its centroid.
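WCSS is what scikit-learn exposes as inertia_. A small sketch that recomputes it by hand on a 5,000-user sample (sample size is an arbitrary choice to keep memory modest; summed over every row it would equal the inertia_ of whichever KMeans model the kmeans variable currently holds, i.e. the 10-cluster model from the loop above):
# Recompute WCSS by hand: squared distance of each point to its assigned centroid.
# The 5,000-row sample below only gives that sample's contribution to the total.
idx = np.random.RandomState(0).choice(len(df_seg), 5000, replace=False)
pts = df_seg.values[idx]
assigned_centroids = kmeans.cluster_centers_[kmeans.predict(pts)]
print(((pts - assigned_centroids) ** 2).sum())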
kmeans=KMeans(n_clusters=3,init='k-means++',random_state=42)
kmeans.fit(df_seg)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=42, tol=0.0001, verbose=0)
df_segm_kmeans=df_seg.copy()
df_segm_kmeans['segment K-means']=kmeans.labels_
df_users.head()
gender | age | country | joined year | |
---|---|---|---|---|
id | ||||
00000c289a1829a808ac09c00daf10bc3c4e223b | f | 22.0 | Germany | 2007 |
00001411dc427966b17297bf4d69e7e193135d89 | f | 25.0 | Canada | 2007 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | m | 25.0 | Germany | 2006 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | m | 19.0 | Mexico | 2008 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | m | 28.0 | United States | 2006 |
df_users_en=df_users.copy()
df_users_en['gender']=genders
df_users_en['country']=countries
df_users_en['joined year']=years
df_users_en.head()
gender | age | country | joined year | |
---|---|---|---|---|
id | ||||
00000c289a1829a808ac09c00daf10bc3c4e223b | 0 | 22.0 | 79 | 5 |
00001411dc427966b17297bf4d69e7e193135d89 | 0 | 25.0 | 37 | 5 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | 1 | 25.0 | 79 | 4 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | 1 | 19.0 | 136 | 6 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | 1 | 28.0 | 225 | 4 |
test_wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(df_users_en)
    test_wcss.append(kmeans.inertia_)
    print(kmeans.inertia_)
2086135898.8955686
431290273.3553824
290135430.90301967
184763290.25917345
86585872.56605095
66684087.42352505
52578488.68448655
44171563.90171374
38522141.4220099
33634118.314277805
plt.figure(figsize=(12,9))
plt.plot(range(1,11),test_wcss,marker='o',linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()
From the above graph we can conclude, using the elbow method, that three optimal clusters can be formed.
kmeans=KMeans(n_clusters=3,init='k-means++',random_state=42)
kmeans.fit(df_users_en)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=42, tol=0.0001, verbose=0)
df_en_segm_kmeans=df_users_en.copy()
df_en_segm_kmeans['segment K-means']=kmeans.labels_
df_en_segm_kmeans.head()
gender | age | country | joined year | segment K-means | |
---|---|---|---|---|---|
id | |||||
00000c289a1829a808ac09c00daf10bc3c4e223b | 0 | 22.0 | 79 | 5 | 0 |
00001411dc427966b17297bf4d69e7e193135d89 | 0 | 25.0 | 37 | 5 | 0 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | 1 | 25.0 | 79 | 4 | 0 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | 1 | 19.0 | 136 | 6 | 2 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | 1 | 28.0 | 225 | 4 | 1 |
df_en_segm_kmeans.groupby(['segment K-means']).count()
gender | age | country | joined year | |
---|---|---|---|---|
segment K-means | ||||
0 | 139132 | 139132 | 139132 | 139132 |
1 | 143475 | 143475 | 143475 | 143475 |
2 | 76740 | 76740 | 76740 | 76740 |
x_axis=df_en_segm_kmeans['age']
y_axis=df_en_segm_kmeans['gender']
plt.figure(figsize=(10,8))
sns.scatterplot(x_axis,y_axis,hue=df_en_segm_kmeans['segment K-means'])
plt.title('Segmentation K-means')
plt.show()
It is common practice to apply PCA (principal component analysis) before a clustering algorithm such as K-Means; in practice it tends to improve the clustering results by reducing noise.
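StandardScaler was imported at the top but is never applied; since age has a much larger numeric range than the 0/1 dummy columns, a variant of the pipeline with scaling would look like the sketch below (an optional alternative, not what the following cells run):
# Optional variant (not used below): standardize so that 'age' does not dominate
# the principal components purely because of its larger numeric range.
from sklearn.decomposition import PCA
scaler = StandardScaler()
df_seg_std = scaler.fit_transform(df_seg)
pca_std = PCA(n_components=10)
scores_std = pca_std.fit_transform(df_seg_std)
kmeans_std = KMeans(n_clusters=3, init='k-means++', random_state=42).fit(scores_std)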
df_seg.head()
age | gender_f | gender_m | country_Afghanistan | country_Albania | country_Algeria | country_American Samoa | country_Andorra | country_Angola | country_Anguilla | ... | country_Zambia | country_Zimbabwe | joined year_2002 | joined year_2003 | joined year_2004 | joined year_2005 | joined year_2006 | joined year_2007 | joined year_2008 | joined year_2009 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||||||
00000c289a1829a808ac09c00daf10bc3c4e223b | 22.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
00001411dc427966b17297bf4d69e7e193135d89 | 25.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | 25.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | 19.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | 28.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 250 columns
df_seg.shape
(359347, 250)
from sklearn.decomposition import PCA
pca=PCA()
pca.fit(df_seg)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
On applying PCA to the dataset we get the variance explained by each principal component. As a rule of thumb, we retain the components that together explain more than 80% of the variance.
pca.explained_variance_ratio_
array([9.94508186e-01, 9.73522817e-04, 8.20579773e-04, 5.95721570e-04,
4.23043046e-04, 3.16620110e-04, 2.36815861e-04, 2.24564767e-04,
1.94659919e-04, 1.52096641e-04, 1.32009525e-04, 1.04611178e-04,
9.71849613e-05, 9.02506282e-05, 7.90944697e-05, 6.88602904e-05,
6.24582865e-05, 5.91033460e-05, 5.59826220e-05, 5.40947256e-05,
5.09339156e-05, 4.85152270e-05, 4.18295955e-05, 3.72284667e-05,
3.57277045e-05, 3.34830053e-05, 2.97241284e-05, 2.62015022e-05,
2.33735151e-05, 2.21352569e-05, 2.08321749e-05, 2.08007512e-05,
2.00785538e-05, 1.96385456e-05, 1.88673504e-05, 1.61374506e-05,
1.47573341e-05, 1.44643374e-05, 1.40240498e-05, 1.37709835e-05,
1.31064515e-05, 1.23692322e-05, 1.18111175e-05, 1.10000004e-05,
1.00279260e-05, 9.47876445e-06, 9.14714313e-06, 8.86760541e-06,
8.67800866e-06, 7.70307208e-06, 7.67159210e-06, 5.76894105e-06,
5.19930343e-06, 4.80708919e-06, 4.56295442e-06, 4.46261434e-06,
4.26692383e-06, 3.83936073e-06, 3.67754963e-06, 3.34438438e-06,
3.18773258e-06, 3.12815711e-06, 2.87014131e-06, 2.75656584e-06,
2.72712673e-06, 2.47237075e-06, 2.38166637e-06, 2.19906353e-06,
2.05900936e-06, 1.97978608e-06, 1.90307330e-06, 1.68047567e-06,
1.59470740e-06, 1.50658856e-06, 1.46615802e-06, 1.33682246e-06,
1.27957460e-06, 1.07718066e-06, 9.98022358e-07, 9.68251415e-07,
9.57905326e-07, 9.19400987e-07, 8.45930243e-07, 7.94271484e-07,
7.54658274e-07, 7.44790552e-07, 7.20438788e-07, 6.95982213e-07,
6.74934359e-07, 6.59264476e-07, 6.50888084e-07, 6.25582753e-07,
6.10743920e-07, 5.90678072e-07, 5.73134299e-07, 5.59405831e-07,
5.51150575e-07, 5.40057633e-07, 5.20446256e-07, 5.04597269e-07,
4.87954010e-07, 4.76684985e-07, 4.71534318e-07, 4.63970861e-07,
4.38285746e-07, 4.16822315e-07, 3.99990982e-07, 3.95251715e-07,
3.89967669e-07, 3.82741298e-07, 3.75220497e-07, 3.67580918e-07,
3.56091579e-07, 3.47480286e-07, 3.42618219e-07, 3.42610019e-07,
3.30604562e-07, 3.26406772e-07, 3.20276480e-07, 3.06736254e-07,
2.97923602e-07, 2.93015723e-07, 2.85970480e-07, 2.76959634e-07,
2.68104924e-07, 2.64372514e-07, 2.58769842e-07, 2.53241597e-07,
2.53223936e-07, 2.49628719e-07, 2.45792638e-07, 2.45707335e-07,
2.40951568e-07, 2.36566801e-07, 2.30896537e-07, 2.30892295e-07,
2.30889871e-07, 2.30885957e-07, 2.26742891e-07, 2.23448883e-07,
2.23447035e-07, 2.23439483e-07, 2.19663069e-07, 2.16001219e-07,
2.16000859e-07, 2.15998211e-07, 2.15996978e-07, 2.15995929e-07,
2.15979867e-07, 2.09163783e-07, 2.02775823e-07, 2.01098199e-07,
1.94860441e-07, 1.89790492e-07, 1.86207356e-07, 1.86205351e-07,
1.86196977e-07, 1.79897678e-07, 1.75267874e-07, 1.71310697e-07,
1.71305824e-07, 1.71303737e-07, 1.67462478e-07, 1.63862863e-07,
1.63862532e-07, 1.63860469e-07, 1.63858676e-07, 1.63857267e-07,
1.63857164e-07, 1.58171815e-07, 1.56413481e-07, 1.56410913e-07,
1.56410211e-07, 1.50704835e-07, 1.48965558e-07, 1.48965319e-07,
1.48959351e-07, 1.42463851e-07, 1.41512222e-07, 1.35211289e-07,
1.34064397e-07, 1.28641246e-07, 1.26620115e-07, 1.26619194e-07,
1.26529159e-07, 1.20225344e-07, 1.19170003e-07, 1.13592850e-07,
1.11723423e-07, 1.11721851e-07, 1.07657805e-07, 1.04276435e-07,
1.04275964e-07, 1.04275338e-07, 1.04274754e-07, 1.04273034e-07,
1.04271979e-07, 1.04267122e-07, 1.04266543e-07, 9.86215397e-08,
9.68281310e-08, 9.68280733e-08, 9.68274111e-08, 9.68261044e-08,
9.68248163e-08, 9.68229630e-08, 9.01822837e-08, 8.93774534e-08,
8.93753397e-08, 8.34459092e-08, 8.19308868e-08, 8.19304574e-08,
8.19298303e-08, 8.19292872e-08, 8.19169571e-08, 7.53686097e-08,
7.44816457e-08, 7.44791372e-08, 7.43903163e-08, 6.79021000e-08,
6.70341310e-08, 6.70330907e-08, 6.70325489e-08, 6.06907254e-08,
5.95865186e-08, 5.95863903e-08, 5.95858414e-08, 5.95854838e-08,
5.95849659e-08, 5.26219570e-08, 5.21376438e-08, 5.21365906e-08,
4.51453399e-08, 4.46894504e-08, 4.46888105e-08, 3.79871192e-08,
3.72415398e-08, 3.72414852e-08, 3.72413734e-08, 3.72403576e-08,
3.72401695e-08, 2.99931770e-08, 2.97931632e-08, 2.26592117e-08,
2.23449135e-08, 2.23447407e-08, 2.23443392e-08, 3.07797804e-29,
4.13256137e-31, 1.35450131e-31])
plt.figure(figsize=(12,9))
plt.plot(range(1,50),pca.explained_variance_ratio_.cumsum()[:49],marker='o',linestyle='--')
plt.title('Explained variance by components')
plt.xlabel('No.of components')
plt.ylabel('Explained variance')
plt.show()
From the above graph we can see that 10 components already explain more than 98% of the variance.
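The cutoff can also be chosen programmatically from the cumulative explained-variance ratio. With the 80% rule of thumb, the first component alone is already enough here (it explains about 99.4% on its own), so 10 components is a deliberately generous choice:
# Smallest number of components whose cumulative explained variance reaches 80%
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.argmax(cum_var >= 0.80)) + 1
print(n_keep, cum_var[n_keep - 1])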
pca=PCA(n_components=10)
pca.fit(df_seg)
PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
scores_pca=pca.transform(df_seg)
scores_pca
array([[ 3.07924934e+00, -9.65742459e-01, 8.40539686e-01, ...,
7.06646168e-01, 3.95826494e-01, 7.82931891e-03],
[ 7.93175181e-02, -9.83262193e-01, 8.53291430e-01, ...,
-2.34652219e-03, -9.89962221e-02, -5.87799257e-02],
[ 7.68214240e-02, 4.37602123e-01, 8.39268432e-02, ...,
6.78797567e-01, 3.46492084e-01, 2.94501037e-02],
...,
[ 5.07686694e+00, 3.86188080e-01, 6.60299973e-01, ...,
-7.11075126e-02, 1.01004688e-01, 7.92880964e-03],
[ 5.07701841e+00, 3.97111786e-01, 8.15172939e-03, ...,
3.78636281e-01, -5.27255987e-01, -6.20991725e-01],
[ 4.07736262e+00, 2.01415376e-01, -7.59616724e-01, ...,
-4.35712057e-02, -8.14134376e-02, -1.57825945e-02]])
pca_wcss = []
for i in range(1, 11):
    kmeans_pca = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans_pca.fit(scores_pca)
    pca_wcss.append(kmeans_pca.inertia_)
    print(kmeans_pca.inertia_)
134049835.27223751
28283517.23694611
14868999.474239191
8548741.72116922
5472147.923170721
3942546.1647670097
2881208.2835925072
2260121.1429546513
1886633.3945844688
1612138.7509535719
plt.figure(figsize=(12,9))
plt.plot(range(1,11),pca_wcss,marker='o',linestyle='--')
plt.xlabel('components')
plt.ylabel('wcss')
plt.show()
From the above graph, using the elbow method, we can conclude that the optimal number of clusters is 3, which agrees with the K-Means results obtained without PCA as well as without label encoding.
kmeans_pca=KMeans(n_clusters=3,init='k-means++',random_state=42)
kmeans_pca.fit(scores_pca)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=42, tol=0.0001, verbose=0)
df_users_en.head()
gender | age | country | joined year | |
---|---|---|---|---|
id | ||||
00000c289a1829a808ac09c00daf10bc3c4e223b | 0 | 22.0 | 79 | 5 |
00001411dc427966b17297bf4d69e7e193135d89 | 0 | 25.0 | 37 | 5 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | 1 | 25.0 | 79 | 4 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | 1 | 19.0 | 136 | 6 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | 1 | 28.0 | 225 | 4 |
df_segm_kmeans_pca=df_users.copy()
df_segm_kmeans_pca['segment K-means']=kmeans_pca.labels_
df_segm_kmeans_pca.head()
gender | age | country | joined year | segment K-means | |
---|---|---|---|---|---|
id | |||||
00000c289a1829a808ac09c00daf10bc3c4e223b | f | 22.0 | Germany | 2007 | 0 |
00001411dc427966b17297bf4d69e7e193135d89 | f | 25.0 | Canada | 2007 | 0 |
00004d2ac9316e22dc007ab2243d6fcb239e707d | m | 25.0 | Germany | 2006 | 0 |
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf | m | 19.0 | Mexico | 2008 | 0 |
00007a47085b9aab8af55f52ec8846ac479ac4fe | m | 28.0 | United States | 2006 | 0 |
df_segm_kmeans_pca=pd.concat([df_segm_kmeans_pca.reset_index(drop=True),pd.DataFrame(scores_pca)],axis=1)
df_segm_kmeans_pca.columns.values[-10:]=['component_'+str(i) for i in range(1,11)]
df_segm_kmeans_pca['Labels'] = df_segm_kmeans_pca['segment K-means'].map({
    0: "cluster_1",
    1: "cluster_2",
    2: "cluster_3"
})
df_segm_kmeans_pca.head()
gender | age | country | joined year | segment K-means | component_1 | component_2 | component_3 | component_4 | component_5 | component_6 | component_7 | component_8 | component_9 | component_10 | Labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | f | 22.0 | Germany | 2007 | 0 | 3.079249 | -0.965742 | 0.840540 | -0.434122 | -0.271597 | -0.097946 | -0.395199 | 0.706646 | 0.395826 | 0.007829 | cluster_1 |
1 | f | 25.0 | Canada | 2007 | 0 | 0.079318 | -0.983262 | 0.853291 | -0.389060 | -0.142788 | -0.104055 | 0.059069 | -0.002347 | -0.098996 | -0.058780 | cluster_1 |
2 | m | 25.0 | Germany | 2006 | 0 | 0.076821 | 0.437602 | 0.083927 | 0.725797 | -0.504299 | -0.281414 | -0.455975 | 0.678798 | 0.346492 | 0.029450 | cluster_1 |
3 | m | 19.0 | Mexico | 2008 | 0 | 6.077364 | 0.200957 | -0.766265 | -0.276504 | -0.052819 | -0.101826 | 0.002710 | -0.052166 | -0.080768 | -0.017021 | cluster_1 |
4 | m | 28.0 | United States | 2006 | 0 | -2.923463 | 0.394624 | 0.137001 | 0.997569 | 0.572417 | -0.371891 | -0.118602 | -0.063246 | 0.063210 | 0.007534 | cluster_1 |
x_axis=df_segm_kmeans_pca['component_1']
y_axis=df_segm_kmeans_pca['component_10']
plt.figure(figsize=(10,8))
sns.scatterplot(x_axis,y_axis,hue=df_segm_kmeans_pca['Labels'])
plt.title('Segmentation K-means PCA')
plt.show()
import pickle
pickle.dump(pca,open('pca.pickle','wb'))
pickle.dump(kmeans_pca,open('kmeans_pca.pickle','wb'))
df_segm_kmeans_pca.to_csv('user_segm_kmeans_pca.csv',index=False)
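A short sketch of reusing the saved artifacts to segment a user; the "new" row below is just the first row of df_seg standing in for a freshly encoded user, and any real input must have the same 250 dummy columns:
# Reload the saved PCA and KMeans objects and predict a segment for one (stand-in) user
pca_loaded = pickle.load(open('pca.pickle', 'rb'))
kmeans_loaded = pickle.load(open('kmeans_pca.pickle', 'rb'))
new_row = df_seg.iloc[[0]]                                   # stand-in for a new, already-encoded user
segment = kmeans_loaded.predict(pca_loaded.transform(new_row))
print(segment)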