|
| 1 | +""" |
| 2 | +k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning |
| 3 | +algorithm used for classification. Given some labelled training data, a given |
| 4 | +point is classified using its k nearest neighbours according to some distance |
| 5 | +metric. The most commonly occurring label among the neighbours becomes the label |
| 6 | +of the given point. In effect, the label of the given point is decided by a |
| 7 | +majority vote. |
| 8 | +
|
| 9 | +This implementation uses the commonly used Euclidean distance metric, but other |
| 10 | +distance metrics can also be used. |
| 11 | +
|
| 12 | +Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm |
| 13 | +""" |
| 14 | + |
1 | 15 | from collections import Counter
|
| 16 | +from heapq import nsmallest |
2 | 17 |
|
3 | 18 | import numpy as np
|
4 | 19 | from sklearn import datasets
|
5 | 20 | from sklearn.model_selection import train_test_split
|
6 | 21 |
|
7 |
| -data = datasets.load_iris() |
8 |
| - |
9 |
| -X = np.array(data["data"]) |
10 |
| -y = np.array(data["target"]) |
11 |
| -classes = data["target_names"] |
12 |
| - |
13 |
| -X_train, X_test, y_train, y_test = train_test_split(X, y) |
14 |
| - |
15 |
| - |
16 |
| -def euclidean_distance(a, b): |
17 |
| - """ |
18 |
| - Gives the euclidean distance between two points |
19 |
| - >>> euclidean_distance([0, 0], [3, 4]) |
20 |
| - 5.0 |
21 |
| - >>> euclidean_distance([1, 2, 3], [1, 8, 11]) |
22 |
| - 10.0 |
23 |
| - """ |
24 |
| - return np.linalg.norm(np.array(a) - np.array(b)) |
25 |
| - |
26 |
| - |
27 |
| -def classifier(train_data, train_target, classes, point, k=5): |
28 |
| - """ |
29 |
| - Classifies the point using the KNN algorithm |
30 |
| - k closest points are found (ranked in ascending order of euclidean distance) |
31 |
| - Params: |
32 |
| - :train_data: Set of points that are classified into two or more classes |
33 |
| - :train_target: List of classes in the order of train_data points |
34 |
| - :classes: Labels of the classes |
35 |
| - :point: The data point that needs to be classified |
36 |
| -
|
37 |
| - >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] |
38 |
| - >>> y_train = [0, 0, 0, 0, 1, 1, 1] |
39 |
| - >>> classes = ['A','B']; point = [1.2,1.2] |
40 |
| - >>> classifier(X_train, y_train, classes,point) |
41 |
| - 'A' |
42 |
| - """ |
43 |
| - data = zip(train_data, train_target) |
44 |
| - # List of distances of all points from the point to be classified |
45 |
| - distances = [] |
46 |
| - for data_point in data: |
47 |
| - distance = euclidean_distance(data_point[0], point) |
48 |
| - distances.append((distance, data_point[1])) |
49 |
| - # Choosing 'k' points with the least distances. |
50 |
| - votes = [i[1] for i in sorted(distances)[:k]] |
51 |
| - # Most commonly occurring class among them |
52 |
| - # is the class into which the point is classified |
53 |
| - result = Counter(votes).most_common(1)[0][0] |
54 |
| - return classes[result] |
| 22 | + |
class KNN:
    """
    A k-nearest-neighbours classifier using the Euclidean distance metric.
    """

    def __init__(
        self,
        train_data: np.ndarray,
        train_target: np.ndarray,
        class_labels: list[str],
    ) -> None:
        """
        Create a kNN classifier using the given training data and class labels

        :param train_data: 2D array of training points, one point per row
        :param train_target: class index (into class_labels) for each point
        :param class_labels: human-readable label for each class index
        """
        # Materialise the (point, class) pairs into a list: a bare zip()
        # iterator would be exhausted by the first classify() call, making
        # every later call fail with an IndexError on the empty Counter.
        self.data = list(zip(train_data, train_target))
        self.labels = class_labels

    @staticmethod
    def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
        """
        Calculate the Euclidean distance between two points
        >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
        5.0
        >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
        10.0
        """
        # float() converts numpy's scalar type so the declared return type is
        # accurate and the doctest output stays stable across numpy versions
        # (numpy 2.x would otherwise print np.float64(5.0)).
        return float(np.linalg.norm(a - b))

    def classify(self, pred_point: np.ndarray, k: int = 5) -> str:
        """
        Classify a given point using the kNN algorithm
        >>> train_X = np.array(
        ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
        ... )
        >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
        >>> classes = ['A', 'B']
        >>> knn = KNN(train_X, train_y, classes)
        >>> point = np.array([1.2, 1.2])
        >>> knn.classify(point)
        'A'
        """
        # Distances of all training points from the point to be classified
        distances = (
            (self._euclidean_distance(data_point[0], pred_point), data_point[1])
            for data_point in self.data
        )

        # Choosing k points with the shortest distances
        votes = (i[1] for i in nsmallest(k, distances))

        # Most commonly occurring class is the one into which the point is classified
        result = Counter(votes).most_common(1)[0][0]
        return self.labels[result]
55 | 72 |
|
56 | 73 |
|
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # Demonstrate the classifier on the classic iris dataset.
    iris_dataset = datasets.load_iris()

    features = np.array(iris_dataset["data"])
    targets = np.array(iris_dataset["target"])
    label_names = iris_dataset["target_names"]

    # Hold out a test split; only the training portion is used below.
    train_features, test_features, train_targets, test_targets = train_test_split(
        features, targets, random_state=0
    )

    sample_point = np.array([4.4, 3.1, 1.3, 1.4])
    iris_knn = KNN(train_features, train_targets, label_names)
    print(iris_knn.classify(sample_point, k=3))
0 commit comments