55 changes: 36 additions & 19 deletions machine_learning/k_nearest_neighbours.py
@@ -12,12 +12,11 @@
Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
"""

from collections import Counter
from heapq import nsmallest

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

Check failure on line 19 in machine_learning/k_nearest_neighbours.py (GitHub Actions / ruff): Ruff (I001) machine_learning/k_nearest_neighbours.py:15:1: Import block is un-sorted or un-formatted


class KNN:
@@ -26,23 +25,36 @@
train_data: np.ndarray[float],
train_target: np.ndarray[int],
class_labels: list[str],
distance_metric: str = "euclidean",
p: int = 2,
) -> None:
"""
Create a kNN classifier using the given training data and class labels
Create a kNN classifier using the given training data and class labels.

Parameters:
-----------
distance_metric : str
Type of distance metric to use ('euclidean', 'manhattan', 'minkowski')
p : int
Power parameter for Minkowski distance (default 2)
"""
self.data = zip(train_data, train_target)
self.data = list(zip(train_data, train_target))
self.labels = class_labels
self.distance_metric = distance_metric
self.p = p

@staticmethod
def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
def _calculate_distance(self, a: np.ndarray[float], b: np.ndarray[float]) -> float:
"""
Calculate the Euclidean distance between two points
>>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
5.0
>>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
10.0
Calculate distance between two points based on the selected metric.
"""
return float(np.linalg.norm(a - b))
if self.distance_metric == "euclidean":
return float(np.linalg.norm(a - b))
elif self.distance_metric == "manhattan":
return float(np.sum(np.abs(a - b)))
elif self.distance_metric == "minkowski":
return float(np.sum(np.abs(a - b) ** self.p) ** (1 / self.p))
else:
raise ValueError("Invalid distance metric. Choose 'euclidean', 'manhattan', or 'minkowski'.")

Check failure on line 57 in machine_learning/k_nearest_neighbours.py (GitHub Actions / ruff): Ruff (E501) machine_learning/k_nearest_neighbours.py:57:89: Line too long (105 > 88)
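
For orientation, here are the three metrics evaluated on the points from the doctest that the old _euclidean_distance carried, [0, 0] and [3, 4]. This is a standalone sketch using the same NumPy expressions as the new method, not code from the PR; note that Minkowski with p=1 reproduces the Manhattan result and p=2 the Euclidean one.

import numpy as np

a, b = np.array([0, 0]), np.array([3, 4])
print(float(np.linalg.norm(a - b)))                  # euclidean: 5.0
print(float(np.sum(np.abs(a - b))))                  # manhattan: 7.0
print(float(np.sum(np.abs(a - b) ** 3) ** (1 / 3)))  # minkowski, p=3: ~4.498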

def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
"""
@@ -57,23 +69,18 @@
>>> knn.classify(point)
'A'
"""
# Distances of all points from the point to be classified
distances = (
(self._euclidean_distance(data_point[0], pred_point), data_point[1])
(self._calculate_distance(data_point[0], pred_point), data_point[1])
for data_point in self.data
)

# Choosing k points with the shortest distances
votes = (i[1] for i in nsmallest(k, distances))

# Most commonly occurring class is the one into which the point is classified
result = Counter(votes).most_common(1)[0][0]
return self.labels[result]
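
The voting step in isolation, with made-up (distance, class_index) pairs to show how nsmallest and Counter combine; the numbers below are illustrative only and not taken from the PR.

from collections import Counter
from heapq import nsmallest

# Hypothetical (distance, class_index) pairs for a query point.
neighbour_distances = [(2.1, 0), (0.5, 1), (1.3, 0), (3.7, 0), (0.9, 1)]
votes = (label for _, label in nsmallest(3, neighbour_distances))
print(Counter(votes).most_common(1)[0][0])  # 1, since class 1 holds two of the three nearest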


if __name__ == "__main__":
import doctest

doctest.testmod()

iris = datasets.load_iris()
@@ -84,5 +91,15 @@

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
iris_point = np.array([4.4, 3.1, 1.3, 1.4])
classifier = KNN(X_train, y_train, iris_classes)
print(classifier.classify(iris_point, k=3))

print("\nUsing Euclidean Distance:")
classifier1 = KNN(X_train, y_train, iris_classes, distance_metric="euclidean")
print(classifier1.classify(iris_point, k=3))

print("\nUsing Manhattan Distance:")
classifier2 = KNN(X_train, y_train, iris_classes, distance_metric="manhattan")
print(classifier2.classify(iris_point, k=3))

print("\nUsing Minkowski Distance (p=3):")
classifier3 = KNN(X_train, y_train, iris_classes, distance_metric="minkowski", p=3)
print(classifier3.classify(iris_point, k=3))
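
A quick sanity check one could append (a sketch, not part of the PR): since the Minkowski formula with p=2 is algebraically the Euclidean distance, a classifier built with distance_metric="minkowski" and p=2 should agree with classifier1 on this point.

# Sketch: Minkowski with p=2 should match the Euclidean classifier's prediction.
classifier_p2 = KNN(X_train, y_train, iris_classes, distance_metric="minkowski", p=2)
assert classifier_p2.classify(iris_point, k=3) == classifier1.classify(iris_point, k=3)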