# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing logistic regressions (DEPRECATED).
Classes:
- LogisticRegression Holds information for a LogisticRegression classifier.
Functions:
- train Train a new classifier.
- calculate Calculate the probabilities of each class, given an observation.
- classify Classify an observation into a class.
This module has been deprecated, please consider an alternative like scikit-learn
insead.
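
Example (an illustrative sketch with made-up observations; skipped as a
doctest because the fitted values depend on the iterative solver):

>>> xs = [[0.1], [1.2], [0.8], [1.9]]
>>> ys = [0, 0, 1, 1]
>>> model = train(xs, ys)  # doctest: +SKIP
>>> classify(model, [1.8])  # doctest: +SKIP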
"""
import warnings

from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.LogisticRegression' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)

try:
    import numpy as np
    import numpy.linalg
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.LogisticRegression. "
        "See http://www.numpy.org/"
    ) from None


class LogisticRegression:
    """Holds information necessary to do logistic regression classification.

    Attributes:
     - beta - List of the weights for each dimension.

    """

    def __init__(self):
        """Initialize the class."""
        self.beta = []


def train(xs, ys, update_fn=None, typecode=None):
    """Train a logistic regression classifier on a training set.

    Argument xs is a list of observations and ys is a list of the class
    assignments, which should be 0 or 1. xs and ys should contain the
    same number of elements. update_fn is an optional callback function
    that takes as parameters the iteration number and log likelihood.
    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != {0, 1}:
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = "d"

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.  Check before indexing
    # xs[0], so that an empty training set raises ValueError rather
    # than IndexError.
    if len(xs) == 0 or len(xs[0]) == 0:
        raise ValueError("No observations or observation of 0 dimension.")
    N, ndims = len(xs), len(xs[0]) + 1

    # Make an X array, with a constant first dimension.
    X = np.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = np.transpose(X)
    y = np.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = np.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = np.exp(np.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * np.log(p) + (1 - y) * np.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(i, llik)  # Pass the loop counter, not the builtin iter.

        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize /= 2.0
                beta = old_beta
            # If I've converged, then stop.
            if np.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1
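
        # Newton-Raphson update: the gradient of the log likelihood is
        # X^T (y - p).  Note that W below is diag(p), whereas the textbook
        # IRLS weight matrix is diag(p * (1 - p)); the step-halving above
        # compensates when a step overshoots.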
        W = np.identity(N) * p
        Xtyp = np.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = np.dot(np.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if np.fabs(stepsize - 1.0) > 0.001:
            delta *= stepsize
        # Rebind rather than update in place, so that old_beta (which may
        # alias the current array) is not silently overwritten.
        beta = beta + delta  # Update beta.
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = list(beta)
    return lr


def calculate(lr, x):
    """Calculate the probability for each class.

    Arguments:
     - lr is a LogisticRegression object.
     - x is the observed data.

    Returns a list of the probability that it fits each class.
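
    A minimal sketch (hypothetical weights; skipped as a doctest because
    the printed floats depend on the NumPy version):

    >>> lr = LogisticRegression()
    >>> lr.beta = [-3.0, 2.0]  # hypothetical intercept and weight
    >>> calculate(lr, [1.5])  # doctest: +SKIP
    [0.5, 0.5]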
"""
# Insert a constant term for x.
x = np.asarray([1.0] + x)
# Calculate the probability. p = e^(beta X) / (1+e^(beta X))
ebetaX = np.exp(np.dot(lr.beta, x))
p = ebetaX / (1 + ebetaX)
return [1 - p, p]


def classify(lr, x):
    """Classify an observation into a class."""
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
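

if __name__ == "__main__":
    # A small self-contained demo (made-up data, not part of the original
    # module).  Train on four one-dimensional observations and classify
    # two new points; only the predicted classes are meaningful, since the
    # fitted weights depend on where the solver stops.
    xs = [[0.1], [1.2], [0.8], [1.9]]
    ys = [0, 0, 1, 1]
    model = train(xs, ys)
    print("beta:", model.beta)
    print("classify [0.2]:", classify(model, [0.2]))
    print("classify [1.8]:", classify(model, [1.8]))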