Source code for pygenesig.gini

"""
The Gini-coefficient (gini index) measures the inequality among values of
a frequency distribution. Originally applied to measure income inequality
we can also use it on gene expression data to find genes that are
over-represented in certain samples.
"""

import numpy as np
import pandas as pd
from pygenesig.validation import SignatureGenerator
from pygenesig.tools import collapse_matrix


[docs]def gini(array):
    """
    Calculate the Gini coefficient of a numpy array.

    Based on: https://github.com/oliviaguest/gini

    Args:
        array (array-like): input array

    Returns:
        float: gini-index of ``array``

    >>> a = np.zeros((10000))
    >>> a[0] = 1.0
    >>> '%.3f' % gini(a)
    '1.000'
    >>> a = np.ones(100)
    >>> '%.3f' % gini(a)
    '0.000'
    >>> a = np.random.uniform(-1,0,1000000)
    >>> '%.2f' % gini(a)
    '0.33'
    """
    # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif
    # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    if np.amin(array) < 0:
        array -= np.amin(array)  # values cannot be negative
    array += 1e-12  # values cannot be 0
    array = np.sort(array)  # values must be sorted
    index = np.arange(1, array.shape[0] + 1)  # index per array element
    n = array.shape[0]  # number of array elements
    return (np.sum((2 * index - n - 1) * array)) / (
        n * np.sum(array)
    )  # Gini coefficient


[docs]def get_rogini_format(df_aggr, min_gini=0.7, max_rk=None, min_expr=1):
    """
    Imitate the *rogini* output format.

    Args:
        df_aggr (pd.DataFrame): m x n pandas DataFrame with m Genes and n tissues
        min_gini (float): gini cutoff, genes need to have a gini index larger than this value.
        max_rk (int): rank cutoff, include genes if they rank <= max_rank among all tissues.
        min_expr (float): minimal expression

    Returns:
        pd.DataFrame: equal to *rogini* output.

        Example::

            GENEID  CATEGORY        VALUE   RANKING GINI_IDX
            54165   Whole blood (ribopure)  491.359000      1       0.441296
            54165   CD34 cells differentiated to erythrocyte lineage        148.124000      2       0.441296
            54165   Mast cell - stimulated  68.973000       3       0.441296
            101927596       CD4+CD25-CD45RA+ naive conventional T cells     15.505000       1       0.948804
            101927596       CD8+ T Cells (pluriselect)      15.376000       2       0.948804
            101927596       CD4+CD25-CD45RA- memory conventional T cells    10.769000       3       0.948804

    """
    if max_rk is None:
        max_rk = float("inf")
    if min_gini is None:
        min_gini = 0
    if min_expr is None:
        min_expr = float("-inf")

    rogini_rows = []
    expr_gini = df_aggr.apply(gini, axis=1)
    expr_rank = df_aggr.rank(axis=1, ascending=False)
    for i in range(df_aggr.shape[0]):
        geneid = df_aggr.index[i]
        expr_row = df_aggr.iloc[i]
        gini_row = expr_gini.iloc[i]
        rk_row = expr_rank.iloc[i]
        which_cols = np.flatnonzero(
            (rk_row <= max_rk) & (gini_row >= min_gini) & (expr_row >= min_expr)
        )
        for j in which_cols:
            rogini_rows.append(
                [geneid, df_aggr.columns[j], expr_row[j], rk_row[j], gini_row]
            )
    columns = ["GENEID", "CATEGORY", "VALUE", "RANKING", "GINI_IDX"]
    df = pd.DataFrame(rogini_rows)
    df.columns = columns
    df.sort_values(["GENEID", "RANKING"], inplace=True)

    # the index does not have any meaning, we therefore want a consecutive index in the final data frame
    df.reset_index(drop=True, inplace=True)
    return df


[docs]def get_gini_signatures(
    df_aggr, min_gini=0.7, max_rk=3, max_rel_rk=None, min_expr=None
):
    """
    Generate gene signatures using gini index.

    Finds over-represented genes for each sample group, given the specified thresholds.

    Args:
        df_aggr (pd.DataFrame): m x n pandas DataFrame with m Genes and n tissues
        min_gini (float): gini cutoff, genes need to have a gini index larger than this value.
        max_rk (int): rank cutoff, include genes if they rank <= max_rank among all tissues.
        max_rel_rk (float): rank cutoff, include genes if they rank <= mal_rel_rk * m among all genes
            in the current sample.
        min_expr (float): minimal expression

    Returns:
        dict of list: A signature dictionary.

        Example::

            {
                "tissue1" : [list, of, gene, ids],
                "tissue2" : [list, of, other, genes],
                ...
            }

    """
    if max_rk is None:
        max_rk = float("inf")
    if min_gini is None:
        min_gini = 0
    if min_expr is None:
        min_expr = float("-inf")
    if max_rel_rk is None:
        max_rel_rk = 1

    # convert relative rank to absolute rank
    max_gene_rk = max_rel_rk * df_aggr.shape[0]

    expr_gini = df_aggr.apply(gini, axis=1)
    expr_rank_sample = df_aggr.rank(
        axis=1, ascending=False
    )  # rank across samples (along row)
    expr_rank_gene = df_aggr.rank(
        axis=0, ascending=False
    )  # rank across genes (along column)
    sig_mask = (
        (expr_rank_sample <= max_rk)
        & (df_aggr >= min_expr)
        & (expr_rank_gene <= max_gene_rk)
    )
    sig_mask.loc[
        expr_gini < min_gini,
    ] = False

    signatures = {}
    for tissue, sig_series in sig_mask.iteritems():
        signatures[tissue] = set(np.flatnonzero(sig_series))
    return signatures


[docs]class GiniSignatureGenerator(SignatureGenerator):
    """
    Use gini index to generate gene signatures.

    Genes, which are specific for a tissue result in a high gini index,
    whereas genes equally present in all tissues have a gini index close to zero.

    The idea is, that genes with a high gini index will reliably identify
    their tissue of origin.

    Args:
        expr (np.ndarray): m x n matrix with m samples and n genes
        target (array-like): m-vector with true tissue for each sample
        min_gini (float): gini cutoff, genes need to have a gini index larger than this value.
        max_rk (int): rank cutoff, include genes if they rank ``<= max_rank`` among all tissues.
        max_rel_rk (float): rank cutoff, include genes if they rank <= max_rel_rk * m among all genes
            in the current sample. Default: .33, i.e. genes need to appear in the first third of the sample.
        min_expr (float): genes need to have at least an expression ``>= min_expr`` to be included.
        aggregate_fun (function): function used to aggregate samples of the same tissue.
    """

    def __init__(
        self,
        expr,
        target,
        min_gini=0.7,
        max_rk=3,
        min_expr=1,
        max_rel_rk=0.33,
        aggregate_fun=np.median,
    ):
        super().__init__(expr, target)
        self.min_gini = min_gini
        self.max_rk = max_rk
        self.min_expr = min_expr
        self.aggregate_fun = aggregate_fun
        self.max_rel_rk = max_rel_rk

    def _mk_signatures(self, expr, target):
        df_aggr = collapse_matrix(
            expr, target, axis=1, aggregate_fun=self.aggregate_fun
        )
        return get_gini_signatures(
            df_aggr,
            min_gini=self.min_gini,
            max_rk=self.max_rk,
            min_expr=self.min_expr,
            max_rel_rk=self.max_rel_rk,
        )

[docs]    def get_rogini_format(self):
        df_aggr = collapse_matrix(
            self.expr, self.target, axis=1, aggregate_fun=self.aggregate_fun
        )
        return get_rogini_format(
            df_aggr, min_gini=self.min_gini, max_rk=self.max_rk, min_expr=self.min_expr
        )