"""
`MCPCounter`_ is a method described by Becht et al., Genome Biology (2015)
for deconvolution of tissue-infiltrating immune and stromal cell populations.
Besides presenting a method for gene-expression based deconvolution they put a
lot of effort into curating signatures.
This module
* re-implements MCP counter in python as SignatureTester
* implements a SignatureGenerator based on the MCP method for curating signatures.
.. _MCPCounter:
http://dx.doi.org/10.1186/s13059-016-1070-5
"""
from pygenesig.validation import SignatureGenerator, SignatureTester
import numpy as np
from sklearn.metrics import roc_auc_score
[docs]def fold_change(expr, positive_mask):
"""
Compute the fold change between the
positive and negative samples for a single gene `G`.
According to `Becht et al.`_ the fold change is defined as
.. math::
FC = X - \overline{X}
where :math:`X` is the mean of positive and :math:`\overline{X}` is the mean of the negative samples.
Args:
expr (np.ndarray): expression of `G` for each sample.
positive_mask (np.ndarray, dtype=np.bool): boolean mask for `expr` indicating which samples belong to
the positive class.
Returns:
float: fold change
>>> expr = np.array([[2, 3, 5, 4, 9, 15]])
>>> target = np.array(["A", "B", "C", "A", "B", "C"])
>>> fold_change(expr, target == "A")
array([-5.])
.. _Becht et al.:
http://dx.doi.org/10.1186/s13059-016-1070-5
"""
return np.mean(expr[:, positive_mask], axis=1) - np.mean(
expr[:, ~positive_mask], axis=1
)
[docs]def specific_fold_change(expr, positive_mask, negative_masks):
"""
Compute the specific fold change of the positive class with respect to all other classes
for a single gene `G`.
According to `Becht et al.`_ the specific fold change is defined as
.. math::
sFC = (X - \overline{X}_{min})/(\overline{X}_{max} - \overline{X}_{min})
where :math:`X` is the mean of positive and :math:`\overline{X}_{max}` is the maximum mean over
all negative classes and :math:`\overline{X}_{min}` is the minimal mean over all negative classes.
Args:
expr (np.ndarray): expression of `G` for each sample.
positive_mask (np.ndarray): boolean mask for `expr` indicating which samples belong to
the positive class.
negative_masks (list of np.ndarray): list of boolean masks for `expr` indicating which samples belong
to the different negative classes.
Returns:
float: specific fold change
>>> expr = np.array([[2, 3, 5, 4, 9, 15], [2, 3, 5, 4, 9, 15]])
>>> target = np.array(["A", "B", "C", "A", "B", "C"])
>>> specific_fold_change(expr, target == 'A', [target == "B", target == "C"])
array([-0.75, -0.75])
.. _Becht et al.:
http://dx.doi.org/10.1186/s13059-016-1070-5
"""
mean_per_class = np.hstack(
[
np.mean(expr[:, class_inds], axis=1)[:, np.newaxis]
for class_inds in negative_masks
]
)
x_min = np.min(mean_per_class, axis=1)
x_max = np.max(mean_per_class, axis=1)
return np.divide(np.mean(expr[:, positive_mask], axis=1) - x_min, x_max - x_min)
[docs]def roc_auc(expr, positive_mask):
"""
Compute the Receiver Operator Characteristics Area under the Curve (ROC AUC) for a single gene `G`.
This tells how well the gene discriminates between the two classes.
This is a wrapper for the scikit-learn `roc_auc_score`_ function.
Args:
expr (np.ndarray): expression of `G` for each sample.
positive_mask (np.ndarray, dtype=np.bool): boolean mask for `expr` indicating which samples belong to
the positive class.
Returns:
float: roc auc score
.. _roc_auc_score:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
>>> expr = np.array([2, 3, 5, 4, 9, 15])
>>> target = np.array(["A", "B", "C", "A", "B", "C"])
>>> roc_auc(expr, target == "A")
0.125
"""
return roc_auc_score(positive_mask, expr)
[docs]class MCPSignatureGenerator(SignatureGenerator):
"""
Implements the procedure described by `Becht et al.`_ for curating signatures.
A gene is considered a valid `Transcription Marker` (TM) if it meets the following criteria
* fold change >= ``min_fc`` (default=2)
* specific fold change >= ``min_sfc`` (default=1.5)
* AUC ROC >= ``min_auc`` (default=0.97)
Args:
expr: m x n gene expression matrix with m genes and n samples.
target: m-vector with true tissue for each sample
min_fc: minimal fold change for a gene to be considered as valid TM
min_sfc: minimal specific fold change for a gene to be considered as valid TM
min_auc: minimal ROC AUC for a gene to be considered as valid TM
.. _Becht et al.:
http://dx.doi.org/10.1186/s13059-016-1070-5
"""
def __init__(self, expr, target, min_fc=2, min_sfc=1.5, min_auc=0.97):
super().__init__(expr, target)
self.min_fc = min_fc
self.min_sfc = min_sfc
self.min_auc = min_auc
def _mk_signatures(self, expr, target):
classes = list(set(target))
masks = {cls: target == cls for cls in classes}
signatures = {cls: [] for cls in classes}
for cls in classes:
fc = fold_change(expr, masks[cls])
sfc = specific_fold_change(
expr, masks[cls], [mask for k, mask in masks.items() if k != cls]
)
for i in range(expr.shape[0]):
auc = roc_auc(expr[i, :], masks[cls])
if (
fc[i] >= self.min_fc
and sfc[i] >= self.min_sfc
and auc >= self.min_auc
):
signatures[cls].append(i)
return signatures
[docs]class MCPSignatureTester(SignatureTester):
"""
Implements the `MCPCounter`_ described by Becht et al. in python.
The principle is super-simple: take the mean of all marker genes as indicator.
Also see their `R script`_.
.. _R script:
https://github.com/ebecht/MCPcounter/blob/a79614eee002c88c64725d69140c7653e7c379b4/Source/R/MCPcounter.R
"""
def _score_signatures(self, expr, signatures):
result = np.empty((len(signatures), expr.shape[1]))
classes = self.sort_signatures(signatures)
for i, cls in enumerate(classes):
inds = np.array(list(signatures[cls]))
for j in range(expr.shape[1]):
result[i, j] = np.mean(expr[inds, j]) if len(inds) > 0 else np.NAN
return result