"""
The Gini-coefficient (gini index) measures the inequality among values of
a frequency distribution. Originally applied to measure income inequality
we can also use it on gene expression data to find genes that are
over-represented in certain samples.
"""
import numpy as np
import pandas as pd
from pygenesig.validation import SignatureGenerator
from pygenesig.tools import collapse_matrix
[docs]def gini(array):
"""
Calculate the Gini coefficient of a numpy array.
Based on: https://github.com/oliviaguest/gini
Args:
array (array-like): input array
Returns:
float: gini-index of ``array``
>>> a = np.zeros((10000))
>>> a[0] = 1.0
>>> '%.3f' % gini(a)
'1.000'
>>> a = np.ones(100)
>>> '%.3f' % gini(a)
'0.000'
>>> a = np.random.uniform(-1,0,1000000)
>>> '%.2f' % gini(a)
'0.33'
"""
# based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif
# from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
if np.amin(array) < 0:
array -= np.amin(array) # values cannot be negative
array += 1e-12 # values cannot be 0
array = np.sort(array) # values must be sorted
index = np.arange(1, array.shape[0] + 1) # index per array element
n = array.shape[0] # number of array elements
return (np.sum((2 * index - n - 1) * array)) / (
n * np.sum(array)
) # Gini coefficient
[docs]def get_gini_signatures(
df_aggr, min_gini=0.7, max_rk=3, max_rel_rk=None, min_expr=None
):
"""
Generate gene signatures using gini index.
Finds over-represented genes for each sample group, given the specified thresholds.
Args:
df_aggr (pd.DataFrame): m x n pandas DataFrame with m Genes and n tissues
min_gini (float): gini cutoff, genes need to have a gini index larger than this value.
max_rk (int): rank cutoff, include genes if they rank <= max_rank among all tissues.
max_rel_rk (float): rank cutoff, include genes if they rank <= mal_rel_rk * m among all genes
in the current sample.
min_expr (float): minimal expression
Returns:
dict of list: A signature dictionary.
Example::
{
"tissue1" : [list, of, gene, ids],
"tissue2" : [list, of, other, genes],
...
}
"""
if max_rk is None:
max_rk = float("inf")
if min_gini is None:
min_gini = 0
if min_expr is None:
min_expr = float("-inf")
if max_rel_rk is None:
max_rel_rk = 1
# convert relative rank to absolute rank
max_gene_rk = max_rel_rk * df_aggr.shape[0]
expr_gini = df_aggr.apply(gini, axis=1)
expr_rank_sample = df_aggr.rank(
axis=1, ascending=False
) # rank across samples (along row)
expr_rank_gene = df_aggr.rank(
axis=0, ascending=False
) # rank across genes (along column)
sig_mask = (
(expr_rank_sample <= max_rk)
& (df_aggr >= min_expr)
& (expr_rank_gene <= max_gene_rk)
)
sig_mask.loc[
expr_gini < min_gini,
] = False
signatures = {}
for tissue, sig_series in sig_mask.iteritems():
signatures[tissue] = set(np.flatnonzero(sig_series))
return signatures
[docs]class GiniSignatureGenerator(SignatureGenerator):
"""
Use gini index to generate gene signatures.
Genes, which are specific for a tissue result in a high gini index,
whereas genes equally present in all tissues have a gini index close to zero.
The idea is, that genes with a high gini index will reliably identify
their tissue of origin.
Args:
expr (np.ndarray): m x n matrix with m samples and n genes
target (array-like): m-vector with true tissue for each sample
min_gini (float): gini cutoff, genes need to have a gini index larger than this value.
max_rk (int): rank cutoff, include genes if they rank ``<= max_rank`` among all tissues.
max_rel_rk (float): rank cutoff, include genes if they rank <= max_rel_rk * m among all genes
in the current sample. Default: .33, i.e. genes need to appear in the first third of the sample.
min_expr (float): genes need to have at least an expression ``>= min_expr`` to be included.
aggregate_fun (function): function used to aggregate samples of the same tissue.
"""
def __init__(
self,
expr,
target,
min_gini=0.7,
max_rk=3,
min_expr=1,
max_rel_rk=0.33,
aggregate_fun=np.median,
):
super().__init__(expr, target)
self.min_gini = min_gini
self.max_rk = max_rk
self.min_expr = min_expr
self.aggregate_fun = aggregate_fun
self.max_rel_rk = max_rel_rk
def _mk_signatures(self, expr, target):
df_aggr = collapse_matrix(
expr, target, axis=1, aggregate_fun=self.aggregate_fun
)
return get_gini_signatures(
df_aggr,
min_gini=self.min_gini,
max_rk=self.max_rk,
min_expr=self.min_expr,
max_rel_rk=self.max_rel_rk,
)