Skip to content

Commit

Permalink
Merge pull request #117 from markmipt/master
Browse files Browse the repository at this point in the history
Add optional quantile regression for get_RCs
  • Loading branch information
levitsky authored Aug 11, 2023
2 parents 3078e5e + 680dbe5 commit 30ab0ad
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 13 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
4.6.1a2
4.6.1a3
-------

- Make :py:func:`pyteomics.mgf.write` work with a regular list of ints as "charge" param.
- Fix #115.
- Add mean absolute error (MAE) regression in :py:mod:`pyteomics.achrom`
(`#117 <https://github.com/levitsky/pyteomics/pull/117>`_ by Mark Ivanov).

4.6
---
Expand Down
52 changes: 42 additions & 10 deletions pyteomics/achrom.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,8 @@
Dependencies
------------
This module requires :py:mod:`numpy`.
This module requires :py:mod:`numpy` and, optionally, :py:mod:`scikit-learn`
(for MAE regression).
--------------------------------------------------------------------------------
"""
Expand All @@ -348,10 +349,14 @@

import numpy as np
from .auxiliary import linear_regression, PyteomicsError
try:
from sklearn.linear_model import QuantileRegressor
except ImportError:
QuantileRegressor = None

from . import parser

def get_RCs(sequences, RTs, lcp = -0.21,
term_aa = False, **kwargs):
def get_RCs(sequences, RTs, lcp=-0.21, term_aa=False, metric='mse', **kwargs):
"""Calculate the retention coefficients of amino acids using
retention times of a peptide sample and a fixed value of length
correction parameter.
Expand All @@ -369,6 +374,15 @@ def get_RCs(sequences, RTs, lcp = -0.21,
If :py:const:`True`, terminal amino acids are treated as being
modified with 'ntermX'/'ctermX' modifications. :py:const:`False`
by default.
metric : str, optional
Metric for the regression problem. Set to "mse" (mean squared
error) by default. Alternative: "mae" (mean absolute error),
which uses quantile regression.
.. note ::
`"mae"` requires :py:mod:`scikit-learn` for
`quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_.
labels : list of str, optional
List of all possible amino acids and terminal groups
If not given, any modX labels are allowed.
Expand Down Expand Up @@ -435,8 +449,19 @@ def get_RCs(sequences, RTs, lcp = -0.21,
composition_array.append(normalizing_peptide)
RTs.append(0.0)

# Use least square linear regression.
RCs, res, rank, s = np.linalg.lstsq(np.array(composition_array), np.array(RTs), rcond=None)
if metric == 'mse':
# # Use least square linear regression.
RCs, _, _, _ = np.linalg.lstsq(np.array(composition_array), np.array(RTs), rcond=None)

elif metric == 'mae':
if QuantileRegressor is None:
raise PyteomicsError("`metric='mae'` requires scikit-learn.")
# Use Quantile regression.
QR = QuantileRegressor(fit_intercept=False, alpha=0, solver='highs')
QR.fit(np.array(composition_array), np.array(RTs))
RCs = QR.coef_
else:
raise PyteomicsError('Invalid metric "{}". Must be "mse" or "mae".'.format(metric))

# Remove normalizing elements from the RTs vector.
if term_aa:
Expand Down Expand Up @@ -478,10 +503,8 @@ def get_RCs(sequences, RTs, lcp = -0.21,

return RC_dict

def get_RCs_vary_lcp(sequences, RTs,
term_aa = False,
lcp_range = (-1.0, 1.0),
**kwargs):

def get_RCs_vary_lcp(sequences, RTs, term_aa=False, lcp_range=(-1.0, 1.0), metric='mse', **kwargs):
"""Find the best combination of a length correction parameter and
retention coefficients for a given peptide sample.
Expand All @@ -494,6 +517,14 @@ def get_RCs_vary_lcp(sequences, RTs,
term_aa : bool, optional
If True, terminal amino acids are treated as being
modified with 'ntermX'/'ctermX' modifications. False by default.
metric : str, optional
Metric for the regression problem. Set to "mse" (mean squared
error) by default. Alternative: "mae" (mean absolute error).
.. note ::
`"mae"` requires :py:mod:`scikit-learn` for
`quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_.
lcp_range : 2-tuple of float, optional
Range of possible values of the length correction parameter.
labels : list of str, optional
Expand Down Expand Up @@ -540,7 +571,7 @@ def get_RCs_vary_lcp(sequences, RTs,
lcp_grid = np.arange(min_lcp, max_lcp,
(max_lcp - min_lcp) / 10.0)
for lcp in lcp_grid:
RC_dict = get_RCs(peptide_dicts, RTs, lcp, term_aa, labels=labels)
RC_dict = get_RCs(peptide_dicts, RTs, lcp, term_aa, labels=labels, metric=metric)
regression_coeffs = linear_regression(
RTs,
[calculate_RT(peptide, RC_dict) for peptide in peptide_dicts])
Expand All @@ -553,6 +584,7 @@ def get_RCs_vary_lcp(sequences, RTs,

return best_RC_dict


def calculate_RT(peptide, RC_dict, raise_no_mod=True):
"""Calculate the retention time of a peptide using a given set
of retention coefficients.
Expand Down
2 changes: 1 addition & 1 deletion pyteomics/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"""

__version__ = '4.6.1a2'
__version__ = '4.6.1a3'

from collections import namedtuple
import re
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ matplotlib
pandas
sqlalchemy
pynumpress
scikit-learn
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_version(rel_path):
'numpress': ['pynumpress'],
'mzMLb': ['h5py', 'hdf5plugin'],
'proforma': ['psims > v0.1.42']}
extras_require['all'] = sum(extras_require.values(), [])
extras_require['all'] = sum(extras_require.values(), ['scikit-learn'])


setup(
Expand Down

0 comments on commit 30ab0ad

Please sign in to comment.