srinivas365/Hybrid_Recommender_Systems

A hybrid recommendation system built on top of the surprise library, using user-song ratings, with user segmentation via clustering techniques.

Building a Hybrid Recommendation System

This notebook contains the following sections:

  1. Importing necessary libraries & dataset
  2. Building a dataset module
  3. Building a performance module
  4. Building an evaluator module
     1. Evaluated algorithm submodule
     2. Evaluated data submodule
  5. Building a hybrid module

from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
!pip install surprise
Requirement already satisfied: surprise in /usr/local/lib/python3.6/dist-packages (0.1)
Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.6/dist-packages (from surprise) (1.1.0)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.4.1)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (0.14.1)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.12.0)
Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.18.2)
folderpath='drive/My Drive/datasets/'

Importing Necessary Libraries

import os
import csv
import sys
import re

import numpy as np
import pandas as pd

from surprise import Dataset
from surprise import Reader
from surprise import dump

from collections import defaultdict


DataLoader Module

This module takes the raw dataset and returns the processed dataset along with other details.

It has the following functions:

  1. loadDataset
  2. getUserRatings
  3. getPopularityRanks
  4. getArtistName
  5. getArtistID
#user_id	artist_mbid	artist_name	plays	norm_plays	rating

class DataLoader:
    path='drive/My Drive/datasets/user-songs-rating-3000.csv'
    artistID_to_name={}
    name_to_artistID={}
    # CSV columns: user_id, artist_mbid, artist_name, plays, norm_plays, rating
    
    def loadDataset(self):

        self.artistID_to_name = {}
        self.name_to_artistID = {}

        # Ratings in the CSV are already normalized to a 0-5 scale
        reader = Reader(rating_scale=(0, 5))
        df_matrix = pd.read_csv(self.path)
        ratingsDataset = Dataset.load_from_df(df_matrix[['user_id', 'artist_mbid', 'rating']], reader)

        # Build the artist-ID <-> artist-name lookup tables
        with open(self.path, newline='', encoding='ISO-8859-1') as csvfile:
            artistReader = csv.reader(csvfile)
            next(artistReader)  # skip header line
            for row in artistReader:
                artistID = row[1]
                artistName = row[2]
                self.artistID_to_name[artistID] = artistName
                self.name_to_artistID[artistName] = artistID

        return ratingsDataset
    
    def getUserRatings(self, user):
        # Assumes the CSV rows are grouped by user_id, so scanning can
        # stop once we move past the requested user's block of rows
        userRatings = []
        hitUser = False
        with open(self.path, newline='', encoding='ISO-8859-1') as csvfile:
            ratingReader = csv.reader(csvfile)
            next(ratingReader)
            for row in ratingReader:
                userID = row[0]
                if (user == userID):
                    artistID = row[1]
                    rating = float(row[5])
                    userRatings.append((artistID, rating))
                    hitUser = True
                if (hitUser and (user != userID)):
                    break

        return userRatings
    
    def getPopularityRanks(self):
        ratings = defaultdict(int)
        rankings = defaultdict(int)
        with open(self.path, newline='', encoding='ISO-8859-1') as csvfile:
            ratingReader = csv.reader(csvfile)
            next(ratingReader)
            for row in ratingReader:
                artistID = row[1]
                ratings[artistID] += 1
        rank = 1
        for artistID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
            rankings[artistID] = rank
            rank += 1
        return rankings
    
    def getArtistName(self, artistID):
        if artistID in self.artistID_to_name:
            return self.artistID_to_name[artistID]
        else:
            return ""
        
    def getArtistID(self, artistName):
        if artistName in self.name_to_artistID:
            return self.name_to_artistID[artistName]
        else:
            return 0
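
A quick usage sketch (a minimal example, assuming the ratings CSV exists at DataLoader.path):

```python
dl = DataLoader()
ratings = dl.loadDataset()        # surprise Dataset, ready for training
ranks = dl.getPopularityRanks()   # artist_mbid -> popularity rank (1 = most rated)
some_artist = next(iter(ranks))   # an arbitrary artist ID
print(dl.getArtistName(some_artist), ranks[some_artist])
```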
    

PerformanceMetrics Module

This module generates metrics from the predictions of the models. It outputs two metrics:

  1. Mean Absolute Error (MAE)
  2. Root Mean Square Error (RMSE)
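
As formulas, over the set of predictions $\hat{R}$ (this is what surprise's `accuracy.mae` and `accuracy.rmse` compute):

$$\mathrm{MAE} = \frac{1}{|\hat{R}|} \sum_{\hat{r}_{ui} \in \hat{R}} |r_{ui} - \hat{r}_{ui}| \qquad \mathrm{RMSE} = \sqrt{\frac{1}{|\hat{R}|} \sum_{\hat{r}_{ui} \in \hat{R}} (r_{ui} - \hat{r}_{ui})^2}$$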
from surprise import accuracy

class PerformanceMetrics:

    @staticmethod
    def MAE(predictions):
        return accuracy.mae(predictions)

    @staticmethod
    def RMSE(predictions):
        return accuracy.rmse(predictions)

ModelBuilder Module

This module wraps an algorithm/model to be trained on the dataset. It has the following methods:

  1. GetName - returns the name of the model
  2. GetModel - returns the model
  3. SaveModel - saves the model
  4. Evaluate - trains the model and returns the metrics
class ModelBuilder:
    def __init__(self, model, name):
        self.model = model
        self.name = name
    def GetName(self):
        return self.name
    
    def GetModel(self):
        return self.model

    def SaveModel(self,predictions):
        
        dump.dump(folderpath+self.name,predictions,self.model)
        print('Model saved at '+folderpath+self.name)
        
    
    def Evaluate(self, evaluationData, save=False):
        metrics = {}

        # Fit on the train set, then score the held-out test set
        print("Evaluating accuracy...")
        predictions = self.model.fit(evaluationData.GetTrainSet()).test(evaluationData.GetTestSet())
        metrics["RMSE"] = PerformanceMetrics.RMSE(predictions)
        metrics["MAE"] = PerformanceMetrics.MAE(predictions)

        print("Analysis complete.")

        if save:
            print('saving the model.....')
            self.SaveModel(predictions)

        return metrics

ModelFactory Module

This module holds a set of models and returns the metrics/performance of each algorithm.

It has the following functions:

  1. AddModel
  2. Evaluate
  3. flushModels
class ModelFactory:

    def __init__(self, dataset):
        # Wrap the raw dataset in a train/test split
        self.dataset = DataGenerator(dataset)
        self.models = []
        
    def AddModel(self, model, name):
        alg = ModelBuilder(model, name)
        self.models.append(alg)
        
    def Evaluate(self,save=False):
        results = {}
        for model in self.models:
            print("Evaluating ", model.GetName(), "...")
            results[model.GetName()] = model.Evaluate(self.dataset,save)

        # Print results
        print("\n")
        print(results)
    def flushModels(self):
        self.models=[]

DataGenerator Module

This module takes the dataset, splits it into a training set and a testing set, and returns them.

from surprise.model_selection import train_test_split

class DataGenerator:
    
    def __init__(self, data):
        #Build a 75/25 train/test split for measuring accuracy
        self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
            
    def GetTrainSet(self):
        return self.trainSet
    
    def GetTestSet(self):
        return self.testSet
    

Hybrid Algorithm Module

This module takes multiple models along with their preference weights and blends their predictions into a single weighted score.
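
Concretely, given each component model's estimate $\hat{r}^{(k)}_{u,i}$ and weight $w_k$, the hybrid prediction below is the weighted average

$$\hat{r}_{u,i} = \frac{\sum_k w_k \, \hat{r}^{(k)}_{u,i}}{\sum_k w_k}$$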

from surprise import AlgoBase

class HybridModel(AlgoBase):

    def __init__(self, models, weights, sim_options={}):
        AlgoBase.__init__(self)
        self.models = models
        self.weights = weights

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        
        for model in self.models:
            model.fit(trainset)
                
        return self

    def estimate(self, user_id, item_id):
        # Weighted average of the component estimates,
        # e.g. 3 * (1/4) + 4 * (3/4) for weights [0.25, 0.75]
        scores_sum = 0
        weights_sum = 0  # sums to 1 when the weights are normalized

        for i in range(len(self.models)):
            scores_sum += self.models[i].estimate(user_id, item_id) * self.weights[i]
            weights_sum += self.weights[i]

        return scores_sum / weights_sum
def LoadData():
    ml = DataLoader()
    print("Loading songs ratings...")
    data = ml.loadDataset()
    return (ml, data)
# Load up common data set for the recommender algorithms
(ml, evaluationData) = LoadData()
Loading songs ratings...
from surprise import BaselineOnly
# Construct a ModelFactory to evaluate the models
modelfactory = ModelFactory(evaluationData)

# BaselineOnly
baseline= BaselineOnly()
modelfactory.AddModel(baseline, "baseline")
modelfactory.Evaluate(True)
Evaluating  baseline ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9934
MAE:  0.6817
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/baseline


{'baseline': {'RMSE': 0.9934226190571713, 'MAE': 0.6816700266309835}}
from surprise import SVD
# SVD
svd= SVD()
modelfactory.AddModel(svd, "svd")
modelfactory.Evaluate()
Evaluating  baseline ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9934
MAE:  0.6817
Analysis complete.
Evaluating  svd ...
Evaluating accuracy...
RMSE: 1.0063
MAE:  0.6831
Analysis complete.


{'baseline': {'RMSE': 0.9934226190571713, 'MAE': 0.6816700266309835}, 'svd': {'RMSE': 1.0063057804737225, 'MAE': 0.6830706904210475}}
#Combine them
Hybrid = HybridModel([svd, baseline], [0.5, 0.5])
# Fight!
modelfactory.AddModel(Hybrid, "Hybrid")
modelfactory.Evaluate(True)
Evaluating  baseline ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9934
MAE:  0.6817
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/baseline
Evaluating  svd ...
Evaluating accuracy...
RMSE: 1.0058
MAE:  0.6821
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/svd
Evaluating  Hybrid ...
Evaluating accuracy...
Estimating biases using als...
RMSE: 0.9971
MAE:  0.6795
Analysis complete.
saving the model.....
Model saved at drive/My Drive/datasets/Hybrid


{'baseline': {'RMSE': 0.9934226190571713, 'MAE': 0.6816700266309835}, 'svd': {'RMSE': 1.005848123790534, 'MAE': 0.6821244100215875}, 'Hybrid': {'RMSE': 0.9970686978736479, 'MAE': 0.6794833671716486}}

Training the Dataset Using a Deep Learning Technique

RBMs (Restricted Boltzmann Machines) have two layers: the input layer, also known as the visible layer, and the hidden layer. The neurons in each layer communicate with neurons in the other layer but not with neurons in the same layer; there is no intra-layer communication among the neurons.
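
A minimal sketch of this bipartite structure (hypothetical layer sizes; the actual implementation lives in the RBM and RBMModel modules imported below):

```python
import numpy as np

# Connections only run between the visible and hidden layers (via W);
# there are no visible-visible or hidden-hidden weights.
n_visible, n_hidden = 6, 3
rng = np.random.default_rng(0)
W = rng.normal(scale=0.1, size=(n_visible, n_hidden))  # inter-layer weights
b_visible = np.zeros(n_visible)                        # visible-layer biases
b_hidden = np.zeros(n_hidden)                          # hidden-layer biases

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

v = rng.integers(0, 2, size=n_visible)           # a binary visible vector
p_hidden = sigmoid(v @ W + b_hidden)             # hidden activations given v
p_visible = sigmoid(p_hidden @ W.T + b_visible)  # reconstruction given h
```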

import sys
sys.path.append('/content/drive/My Drive/datasets/')
import RBM 
import RBMModel

import importlib
importlib.reload(RBM)
importlib.reload(RBMModel)

# Construct a ModelFactory to evaluate the models
deep_factory= ModelFactory(evaluationData)

#Simple RBM
SimpleRBM = RBMModel.RBMAlgorithm(epochs=10)
deep_factory.AddModel(SimpleRBM,'rbm')

svd= SVD()
deep_factory.AddModel(svd, "svd")



#Combine them
Hybrid = HybridModel([svd, SimpleRBM], [0.5, 0.5])
# Fight!
deep_factory.AddModel(Hybrid, "Hybrid")

deep_factory.Evaluate()
Evaluating  rbm ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
...
Processing user  2950
RMSE: 1.5733
MAE:  1.4437
Analysis complete.
Evaluating  svd ...
Evaluating accuracy...
RMSE: 1.0070
MAE:  0.6827
Analysis complete.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
...
Processing user  2950
RMSE: 1.1650
MAE:  1.0040
Analysis complete.


{'rbm': {'RMSE': 1.5732653327536739, 'MAE': 1.4437298577219584}, 'svd': {'RMSE': 1.0069576973032366, 'MAE': 0.6826501315236461}, 'Hybrid': {'RMSE': 1.1649843535460234, 'MAE': 1.0040280997374962}}

User Segmentation using Clustering Techniques

Customer segmentation is one of the most important applications of unsupervised learning. Using clustering techniques, one can identify several segments of customers, allowing a business to target its potential user base.

This notebook has the following sections

  1. Importing necessary libraries
  2. Loading the data
  3. Exploratory data analysis
  4. Applying clustering techniques
  5. Finding the optimal clusters
  6. Applying PCA
  7. Finding the optimal clusters (after PCA)
  8. Visualization of clusters

1. Importing the libraries

import numpy as np
import pandas as pd

# for standardizing the dataset
from sklearn.preprocessing import StandardScaler

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# for the clustering techniques
from scipy.cluster.hierarchy import dendrogram,linkage
from sklearn.cluster import KMeans

Loading the dataset

df_users=pd.read_csv('user_side_data_raw.tsv',sep='\t',names=['id','gender','age','country','date'])
df_users.head()
id gender age country date
0 00000c289a1829a808ac09c00daf10bc3c4e223b f 22.0 Germany Feb 1, 2007
1 00001411dc427966b17297bf4d69e7e193135d89 f NaN Canada Dec 4, 2007
2 00004d2ac9316e22dc007ab2243d6fcb239e707d NaN NaN Germany Sep 1, 2006
3 000063d3fe1cf2ba248b9e3c3f0334845a27a6bf m 19.0 Mexico Apr 28, 2008
4 00007a47085b9aab8af55f52ec8846ac479ac4fe m 28.0 United States Jan 27, 2006
df_users.shape
(359347, 5)
df_users.isnull().sum()
id             0
gender     32775
age        74900
country        0
date           0
dtype: int64
# proportions of null values
(df_users.gender.isnull().sum()/df_users.shape[0])*100
9.12071062232327
(df_users.age.isnull().sum()/df_users.shape[0])*100
20.84336310029025
df_users.gender.fillna(df_users.gender.mode()[0],inplace=True)
df_users.age.fillna(int(df_users.age.mean()),inplace=True)
df_users.isnull().sum()
id         0
gender     0
age        0
country    0
date       0
dtype: int64

Exploratory data analysis

df_users.gender.value_counts()
m    274417
f     84930
Name: gender, dtype: int64
df_users.age.mean()
25.07761022076155
df_users.country.value_counts()
United States               67044
Germany                     31651
United Kingdom              29902
Poland                      20987
Russian Federation          19833
Brazil                      14534
Sweden                      13122
Spain                       13051
Finland                     11579
Netherlands                  9650
Canada                       8679
France                       7529
Italy                        7525
Australia                    7135
Japan                        6637
Turkey                       6452
Norway                       5155
Mexico                       4834
Czech Republic               4774
Ukraine                      4396
Belgium                      3803
Portugal                     3196
Switzerland                  3053
Bulgaria                     2800
Austria                      2796
Chile                        2794
Argentina                    2640
Romania                      2636
Denmark                      2508
Hungary                      1985
                            ...  
Equatorial Guinea              10
Saint Kitts and Nevis          10
San Marino                      9
Cameroon                        9
Namibia                         9
Senegal                         9
Saint Lucia                     8
Rwanda                          8
Gabon                           8
Comoros                         8
Mayotte                         8
Tonga                           8
Grenada                         7
Myanmar                         7
Mali                            7
Malawi                          6
Belize                          6
Eritrea                         6
Guinea-Bissau                   5
Central African Republic        5
Sudan                           5
Guyana                          5
Suriname                        5
Mauritania                      5
Marshall Islands                4
French Guiana                   4
Palau                           3
Benin                           3
Liberia                         3
Gambia                          3
Name: country, Length: 239, dtype: int64
df_users['joined year']=df_users.date.apply(lambda x:x[-4:])
df_users.head()
id gender age country date joined year
0 00000c289a1829a808ac09c00daf10bc3c4e223b f 22.0 Germany Feb 1, 2007 2007
1 00001411dc427966b17297bf4d69e7e193135d89 f 25.0 Canada Dec 4, 2007 2007
2 00004d2ac9316e22dc007ab2243d6fcb239e707d m 25.0 Germany Sep 1, 2006 2006
3 000063d3fe1cf2ba248b9e3c3f0334845a27a6bf m 19.0 Mexico Apr 28, 2008 2008
4 00007a47085b9aab8af55f52ec8846ac479ac4fe m 28.0 United States Jan 27, 2006 2006
df_users.drop(['date'],axis=1,inplace=True)
df_users.set_index(['id'],inplace=True)
df_users.head()
gender age country joined year
id
00000c289a1829a808ac09c00daf10bc3c4e223b f 22.0 Germany 2007
00001411dc427966b17297bf4d69e7e193135d89 f 25.0 Canada 2007
00004d2ac9316e22dc007ab2243d6fcb239e707d m 25.0 Germany 2006
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf m 19.0 Mexico 2008
00007a47085b9aab8af55f52ec8846ac479ac4fe m 28.0 United States 2006
# one-hot encode the categorical columns (gender, country, joined year);
# the list supplies the prefix for each encoded column
df_seg=pd.get_dummies(df_users,prefix=['gender','country','joined year'])
df_seg.head()
age gender_f gender_m country_Afghanistan country_Albania country_Algeria country_American Samoa country_Andorra country_Angola country_Anguilla ... country_Zambia country_Zimbabwe joined year_2002 joined year_2003 joined year_2004 joined year_2005 joined year_2006 joined year_2007 joined year_2008 joined year_2009
id
00000c289a1829a808ac09c00daf10bc3c4e223b 22.0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
00001411dc427966b17297bf4d69e7e193135d89 25.0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
00004d2ac9316e22dc007ab2243d6fcb239e707d 25.0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf 19.0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
00007a47085b9aab8af55f52ec8846ac479ac4fe 28.0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0

5 rows × 250 columns

df_users.describe(include='all')
gender age country joined year
count 359347 359347.000000 359347 359347
unique 2 NaN 239 8
top m NaN United States 2008
freq 274417 NaN 67044 120808
mean NaN 25.077610 NaN NaN
std NaN 19.276048 NaN NaN
min NaN -1337.000000 NaN NaN
25% NaN 21.000000 NaN NaN
50% NaN 25.000000 NaN NaN
75% NaN 26.000000 NaN NaN
max NaN 1002.000000 NaN NaN
from sklearn.preprocessing import LabelEncoder
lb_encoder=LabelEncoder()
countries=lb_encoder.fit_transform(df_users.country)
year_encoder=LabelEncoder()
years=year_encoder.fit_transform(df_users['joined year'])
years
array([5, 5, 4, ..., 5, 3, 6])
gender_encoder=LabelEncoder()
genders=gender_encoder.fit_transform(df_users.gender)
genders
array([0, 0, 1, ..., 1, 1, 1])

Applying Clustering Techniques

Clustering is the task of grouping together a set of objects in such a way that objects in the same cluster are more similar to each other than to objects in other clusters. Similarity is a metric that reflects the strength of the relationship between two data objects. Clustering is mainly used for exploratory data mining.

We have two ways to find the optimal number of clusters:

  1. Hierarchical clustering: find the optimal number of clusters by inspecting the dendrogram
  2. Plotting the WCSS over a range of cluster counts using the bottom-up (elbow) approach

The first option crashed the kernel for lack of hardware resources on the laptop, so I'm opting for the second one (though, as the sketch below shows, the dendrogram is still feasible on a subsample).
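
A sketch of the hierarchical option on a random subsample (the 1,000-row sample size is an assumption chosen to fit in memory; uses the scipy imports from above):

```python
sample = df_seg.sample(1000, random_state=42)  # subsample to avoid the memory crash
Z = linkage(sample, method='ward')             # agglomerative (Ward) clustering
plt.figure(figsize=(12, 6))
dendrogram(Z, no_labels=True)
plt.title('Dendrogram (1,000-user sample)')
plt.show()
```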

K-Means attempts to classify data without having first been trained with labeled data. Once the algorithm has been run and the groups are defined, any new data can be easily assigned to the most relevant group.
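
Once a K-means model is fitted (as below), assigning new data is a single call — a sketch, assuming new rows are encoded with the same dummy columns as df_seg:

```python
new_users = df_seg.iloc[:5]       # stand-in for genuinely new, encoded users
print(kmeans.predict(new_users))  # index of the nearest learned centroid per row
```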

wcss=[]
for i in range(1,11):
    kmeans=KMeans(n_clusters=i,init='k-means++',random_state=42)
    kmeans.fit(df_seg)
    wcss.append(kmeans.inertia_)
    print(kmeans.inertia_)
134258498.48275355
28492159.521132484
15087558.004872149
8737924.294237286
5680436.533094462
4150917.3284346457
3079109.5899161794
2474939.295240826
2105803.1053973585
1820427.9377313748
wcss
[134258498.48275355,
 28492159.521132484,
 15087558.004872149,
 8737924.294237286,
 5680436.533094462,
 4150917.3284346457,
 3079109.5899161794,
 2474939.295240826,
 2105803.1053973585,
 1820427.9377313748]
f=open('wcss.txt','w')
for i in wcss:
    f.write(str(i))
    f.write('\n')
f.close()
plt.figure(figsize=(12,9))
plt.plot(range(1,11),wcss,marker='o',linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()

[png: WCSS vs. number of clusters (elbow plot)]

We graph the relationship between the number of clusters and the Within-Cluster Sum of Squares (WCSS), then select the number of clusters at which the decrease in WCSS begins to level off (the elbow method).

WCSS is defined as the sum of the squared distances between each member of a cluster and its centroid.
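
In symbols, with clusters $C_k$ and centroids $\mu_k$ (this is the quantity scikit-learn exposes as `inertia_`):

$$\mathrm{WCSS} = \sum_{k=1}^{K} \sum_{x \in C_k} \lVert x - \mu_k \rVert^2$$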

kmeans=KMeans(n_clusters=3,init='k-means++',random_state=42)
kmeans.fit(df_seg)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)
df_segm_kmeans=df_seg.copy()
df_segm_kmeans['segment K-means']=kmeans.labels_

Applying the clustering technique with Label Encoding

df_users.head()
gender age country joined year
id
00000c289a1829a808ac09c00daf10bc3c4e223b f 22.0 Germany 2007
00001411dc427966b17297bf4d69e7e193135d89 f 25.0 Canada 2007
00004d2ac9316e22dc007ab2243d6fcb239e707d m 25.0 Germany 2006
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf m 19.0 Mexico 2008
00007a47085b9aab8af55f52ec8846ac479ac4fe m 28.0 United States 2006
df_users_en=df_users.copy()
df_users_en['gender']=genders
df_users_en['country']=countries
df_users_en['joined year']=years
df_users_en.head()
gender age country joined year
id
00000c289a1829a808ac09c00daf10bc3c4e223b 0 22.0 79 5
00001411dc427966b17297bf4d69e7e193135d89 0 25.0 37 5
00004d2ac9316e22dc007ab2243d6fcb239e707d 1 25.0 79 4
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf 1 19.0 136 6
00007a47085b9aab8af55f52ec8846ac479ac4fe 1 28.0 225 4
test_wcss=[]
for i in range(1,11):
    kmeans=KMeans(n_clusters=i,init='k-means++',random_state=42)
    kmeans.fit(df_users_en)
    test_wcss.append(kmeans.inertia_)
    print(kmeans.inertia_)
2086135898.8955686
431290273.3553824
290135430.90301967
184763290.25917345
86585872.56605095
66684087.42352505
52578488.68448655
44171563.90171374
38522141.4220099
33634118.314277805
plt.figure(figsize=(12,9))
plt.plot(range(1,11),test_wcss,marker='o',linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()

[png: WCSS vs. number of clusters for the label-encoded data (elbow plot)]

From the above graph, using the elbow method, we can conclude that three clusters is the optimal choice.

kmeans=KMeans(n_clusters=3,init='k-means++',random_state=42)
kmeans.fit(df_users_en)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)
df_en_segm_kmeans=df_users_en.copy()
df_en_segm_kmeans['segment K-means']=kmeans.labels_
df_en_segm_kmeans.head()
gender age country joined year segment K-means
id
00000c289a1829a808ac09c00daf10bc3c4e223b 0 22.0 79 5 0
00001411dc427966b17297bf4d69e7e193135d89 0 25.0 37 5 0
00004d2ac9316e22dc007ab2243d6fcb239e707d 1 25.0 79 4 0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf 1 19.0 136 6 2
00007a47085b9aab8af55f52ec8846ac479ac4fe 1 28.0 225 4 1
df_en_segm_kmeans.groupby(['segment K-means']).count()
gender age country joined year
segment K-means
0 139132 139132 139132 139132
1 143475 143475 143475 143475
2 76740 76740 76740 76740
x_axis=df_en_segm_kmeans['age']
y_axis=df_en_segm_kmeans['gender']
plt.figure(figsize=(10,8))
sns.scatterplot(x_axis,y_axis,hue=df_en_segm_kmeans['segment K-means'])
plt.title('Segmentation K-means')
plt.show()

[png: age vs. gender scatter plot, colored by K-means segment]

PCA

It is a common practice to apply PCA (principal component analysis) before a clustering algorithm (such as k-means). It is believed that it improves the clustering results in practice (noise reduction).
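
A minimal sketch of the PCA-then-K-means pipeline; standardizing first (with the StandardScaler imported earlier) is a common variant so that no single column, such as age, dominates the components. Note the scaling step is an assumption on my part — the notebook below applies PCA to the raw dummy matrix:

```python
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

X = StandardScaler().fit_transform(df_seg)      # zero mean, unit variance per column
scores = PCA(n_components=10).fit_transform(X)  # project onto 10 components
labels = KMeans(n_clusters=3, random_state=42).fit_predict(scores)
```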

df_seg.head()
age gender_f gender_m country_Afghanistan country_Albania country_Algeria country_American Samoa country_Andorra country_Angola country_Anguilla ... country_Zambia country_Zimbabwe joined year_2002 joined year_2003 joined year_2004 joined year_2005 joined year_2006 joined year_2007 joined year_2008 joined year_2009
id
00000c289a1829a808ac09c00daf10bc3c4e223b 22.0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
00001411dc427966b17297bf4d69e7e193135d89 25.0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
00004d2ac9316e22dc007ab2243d6fcb239e707d 25.0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf 19.0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
00007a47085b9aab8af55f52ec8846ac479ac4fe 28.0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0

5 rows × 250 columns

df_seg.shape
(359347, 250)
from sklearn.decomposition import PCA
pca=PCA()
pca.fit(df_seg)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

Applying PCA to the dataset gives us the variance explained by each component. As a rule of thumb, we retain enough components to explain more than 80% of the variance.
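
The required component count can also be read off programmatically — a sketch using the `pca` object fitted above:

```python
cum = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.argmax(cum >= 0.80)) + 1  # smallest count explaining >= 80% of variance
print(n_keep, cum[n_keep - 1])
```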

pca.explained_variance_ratio_
array([9.94508186e-01, 9.73522817e-04, 8.20579773e-04, 5.95721570e-04,
       4.23043046e-04, 3.16620110e-04, 2.36815861e-04, 2.24564767e-04,
       1.94659919e-04, 1.52096641e-04, 1.32009525e-04, 1.04611178e-04,
       9.71849613e-05, 9.02506282e-05, 7.90944697e-05, 6.88602904e-05,
       6.24582865e-05, 5.91033460e-05, 5.59826220e-05, 5.40947256e-05,
       5.09339156e-05, 4.85152270e-05, 4.18295955e-05, 3.72284667e-05,
       3.57277045e-05, 3.34830053e-05, 2.97241284e-05, 2.62015022e-05,
       2.33735151e-05, 2.21352569e-05, 2.08321749e-05, 2.08007512e-05,
       2.00785538e-05, 1.96385456e-05, 1.88673504e-05, 1.61374506e-05,
       1.47573341e-05, 1.44643374e-05, 1.40240498e-05, 1.37709835e-05,
       1.31064515e-05, 1.23692322e-05, 1.18111175e-05, 1.10000004e-05,
       1.00279260e-05, 9.47876445e-06, 9.14714313e-06, 8.86760541e-06,
       8.67800866e-06, 7.70307208e-06, 7.67159210e-06, 5.76894105e-06,
       5.19930343e-06, 4.80708919e-06, 4.56295442e-06, 4.46261434e-06,
       4.26692383e-06, 3.83936073e-06, 3.67754963e-06, 3.34438438e-06,
       3.18773258e-06, 3.12815711e-06, 2.87014131e-06, 2.75656584e-06,
       2.72712673e-06, 2.47237075e-06, 2.38166637e-06, 2.19906353e-06,
       2.05900936e-06, 1.97978608e-06, 1.90307330e-06, 1.68047567e-06,
       1.59470740e-06, 1.50658856e-06, 1.46615802e-06, 1.33682246e-06,
       1.27957460e-06, 1.07718066e-06, 9.98022358e-07, 9.68251415e-07,
       9.57905326e-07, 9.19400987e-07, 8.45930243e-07, 7.94271484e-07,
       7.54658274e-07, 7.44790552e-07, 7.20438788e-07, 6.95982213e-07,
       6.74934359e-07, 6.59264476e-07, 6.50888084e-07, 6.25582753e-07,
       6.10743920e-07, 5.90678072e-07, 5.73134299e-07, 5.59405831e-07,
       5.51150575e-07, 5.40057633e-07, 5.20446256e-07, 5.04597269e-07,
       4.87954010e-07, 4.76684985e-07, 4.71534318e-07, 4.63970861e-07,
       4.38285746e-07, 4.16822315e-07, 3.99990982e-07, 3.95251715e-07,
       3.89967669e-07, 3.82741298e-07, 3.75220497e-07, 3.67580918e-07,
       3.56091579e-07, 3.47480286e-07, 3.42618219e-07, 3.42610019e-07,
       3.30604562e-07, 3.26406772e-07, 3.20276480e-07, 3.06736254e-07,
       2.97923602e-07, 2.93015723e-07, 2.85970480e-07, 2.76959634e-07,
       2.68104924e-07, 2.64372514e-07, 2.58769842e-07, 2.53241597e-07,
       2.53223936e-07, 2.49628719e-07, 2.45792638e-07, 2.45707335e-07,
       2.40951568e-07, 2.36566801e-07, 2.30896537e-07, 2.30892295e-07,
       2.30889871e-07, 2.30885957e-07, 2.26742891e-07, 2.23448883e-07,
       2.23447035e-07, 2.23439483e-07, 2.19663069e-07, 2.16001219e-07,
       2.16000859e-07, 2.15998211e-07, 2.15996978e-07, 2.15995929e-07,
       2.15979867e-07, 2.09163783e-07, 2.02775823e-07, 2.01098199e-07,
       1.94860441e-07, 1.89790492e-07, 1.86207356e-07, 1.86205351e-07,
       1.86196977e-07, 1.79897678e-07, 1.75267874e-07, 1.71310697e-07,
       1.71305824e-07, 1.71303737e-07, 1.67462478e-07, 1.63862863e-07,
       1.63862532e-07, 1.63860469e-07, 1.63858676e-07, 1.63857267e-07,
       1.63857164e-07, 1.58171815e-07, 1.56413481e-07, 1.56410913e-07,
       1.56410211e-07, 1.50704835e-07, 1.48965558e-07, 1.48965319e-07,
       1.48959351e-07, 1.42463851e-07, 1.41512222e-07, 1.35211289e-07,
       1.34064397e-07, 1.28641246e-07, 1.26620115e-07, 1.26619194e-07,
       1.26529159e-07, 1.20225344e-07, 1.19170003e-07, 1.13592850e-07,
       1.11723423e-07, 1.11721851e-07, 1.07657805e-07, 1.04276435e-07,
       1.04275964e-07, 1.04275338e-07, 1.04274754e-07, 1.04273034e-07,
       1.04271979e-07, 1.04267122e-07, 1.04266543e-07, 9.86215397e-08,
       9.68281310e-08, 9.68280733e-08, 9.68274111e-08, 9.68261044e-08,
       9.68248163e-08, 9.68229630e-08, 9.01822837e-08, 8.93774534e-08,
       8.93753397e-08, 8.34459092e-08, 8.19308868e-08, 8.19304574e-08,
       8.19298303e-08, 8.19292872e-08, 8.19169571e-08, 7.53686097e-08,
       7.44816457e-08, 7.44791372e-08, 7.43903163e-08, 6.79021000e-08,
       6.70341310e-08, 6.70330907e-08, 6.70325489e-08, 6.06907254e-08,
       5.95865186e-08, 5.95863903e-08, 5.95858414e-08, 5.95854838e-08,
       5.95849659e-08, 5.26219570e-08, 5.21376438e-08, 5.21365906e-08,
       4.51453399e-08, 4.46894504e-08, 4.46888105e-08, 3.79871192e-08,
       3.72415398e-08, 3.72414852e-08, 3.72413734e-08, 3.72403576e-08,
       3.72401695e-08, 2.99931770e-08, 2.97931632e-08, 2.26592117e-08,
       2.23449135e-08, 2.23447407e-08, 2.23443392e-08, 3.07797804e-29,
       4.13256137e-31, 1.35450131e-31])
plt.figure(figsize=(12,9))
plt.plot(range(1,50),pca.explained_variance_ratio_.cumsum()[:49],marker='o',linestyle='--')
plt.title('Explained variance by components')
plt.xlabel('No.of components')
plt.ylabel('Explained variance')
plt.show()

[png: cumulative explained variance by number of components]

From the above graph we can see that the first 10 components alone explain more than 98% of the variance.

pca=PCA(n_components=10)
pca.fit(df_seg)
PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
scores_pca=pca.transform(df_seg)
scores_pca
array([[ 3.07924934e+00, -9.65742459e-01,  8.40539686e-01, ...,
         7.06646168e-01,  3.95826494e-01,  7.82931891e-03],
       [ 7.93175181e-02, -9.83262193e-01,  8.53291430e-01, ...,
        -2.34652219e-03, -9.89962221e-02, -5.87799257e-02],
       [ 7.68214240e-02,  4.37602123e-01,  8.39268432e-02, ...,
         6.78797567e-01,  3.46492084e-01,  2.94501037e-02],
       ...,
       [ 5.07686694e+00,  3.86188080e-01,  6.60299973e-01, ...,
        -7.11075126e-02,  1.01004688e-01,  7.92880964e-03],
       [ 5.07701841e+00,  3.97111786e-01,  8.15172939e-03, ...,
         3.78636281e-01, -5.27255987e-01, -6.20991725e-01],
       [ 4.07736262e+00,  2.01415376e-01, -7.59616724e-01, ...,
        -4.35712057e-02, -8.14134376e-02, -1.57825945e-02]])
pca_wcss=[]
for i in range(1,11):
    kmeans_pca=KMeans(n_clusters=i,init='k-means++',random_state=42)
    kmeans_pca.fit(scores_pca)
    pca_wcss.append(kmeans_pca.inertia_)
    print(kmeans_pca.inertia_)
134049835.27223751
28283517.23694611
14868999.474239191
8548741.72116922
5472147.923170721
3942546.1647670097
2881208.2835925072
2260121.1429546513
1886633.3945844688
1612138.7509535719
plt.figure(figsize=(12,9))
plt.plot(range(1,11),pca_wcss,marker='o',linestyle='--')
plt.xlabel('components')
plt.ylabel('wcss')
plt.show()

[png: WCSS vs. number of clusters on the PCA scores (elbow plot)]

From the above graph we can again conclude that the optimal number of clusters is 3, which agrees with the K-means results both without PCA and without label encoding.

kmeans_pca=KMeans(n_clusters=3,init='k-means++',random_state=42)
kmeans_pca.fit(scores_pca)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)
df_users_en.head()
gender age country joined year
id
00000c289a1829a808ac09c00daf10bc3c4e223b 0 22.0 79 5
00001411dc427966b17297bf4d69e7e193135d89 0 25.0 37 5
00004d2ac9316e22dc007ab2243d6fcb239e707d 1 25.0 79 4
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf 1 19.0 136 6
00007a47085b9aab8af55f52ec8846ac479ac4fe 1 28.0 225 4
df_segm_kmeans_pca=df_users.copy()
df_segm_kmeans_pca['segment K-means']=kmeans_pca.labels_
df_segm_kmeans_pca.head()
gender age country joined year segment K-means
id
00000c289a1829a808ac09c00daf10bc3c4e223b f 22.0 Germany 2007 0
00001411dc427966b17297bf4d69e7e193135d89 f 25.0 Canada 2007 0
00004d2ac9316e22dc007ab2243d6fcb239e707d m 25.0 Germany 2006 0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf m 19.0 Mexico 2008 0
00007a47085b9aab8af55f52ec8846ac479ac4fe m 28.0 United States 2006 0
df_segm_kmeans_pca=pd.concat([df_segm_kmeans_pca.reset_index(drop=True),pd.DataFrame(scores_pca)],axis=1)
df_segm_kmeans_pca.columns.values[-10:]=['component_'+str(i) for i in range(1,11)]
df_segm_kmeans_pca['Labels']=df_segm_kmeans_pca['segment K-means'].map({
    0:"cluster_1",
    1:"cluster_2",
    2:"cluster_3"            
})
df_segm_kmeans_pca.head()
gender age country joined year segment K-means component_1 component_2 component_3 component_4 component_5 component_6 component_7 component_8 component_9 component_10 Labels
0 f 22.0 Germany 2007 0 3.079249 -0.965742 0.840540 -0.434122 -0.271597 -0.097946 -0.395199 0.706646 0.395826 0.007829 cluster_1
1 f 25.0 Canada 2007 0 0.079318 -0.983262 0.853291 -0.389060 -0.142788 -0.104055 0.059069 -0.002347 -0.098996 -0.058780 cluster_1
2 m 25.0 Germany 2006 0 0.076821 0.437602 0.083927 0.725797 -0.504299 -0.281414 -0.455975 0.678798 0.346492 0.029450 cluster_1
3 m 19.0 Mexico 2008 0 6.077364 0.200957 -0.766265 -0.276504 -0.052819 -0.101826 0.002710 -0.052166 -0.080768 -0.017021 cluster_1
4 m 28.0 United States 2006 0 -2.923463 0.394624 0.137001 0.997569 0.572417 -0.371891 -0.118602 -0.063246 0.063210 0.007534 cluster_1
x_axis=df_segm_kmeans_pca['component_1']
y_axis=df_segm_kmeans_pca['component_10']
plt.figure(figsize=(10,8))
sns.scatterplot(x_axis,y_axis,hue=df_segm_kmeans_pca['Labels'])
plt.title('Segmentation K-means PCA')
plt.show()

[png: component_1 vs. component_10 scatter plot, colored by cluster label]

import pickle

pickle.dump(pca,open('pca.pickle','wb'))

pickle.dump(kmeans_pca,open('kmeans_pca.pickle','wb'))
df_segm_kmeans_pca.to_csv('user_segm_kmeans_pca.csv',index=False)
