Source code for pygenesig.tools

import numpy as np
import itertools
import pandas as pd
from pprint import pprint
from .tests import TESTDATA
from pygenesig.file_formats import load_gmt


def translate_signatures(signatures, rosetta, ignore_missing=False):
    """
    Map every gene identifier of a signature dictionary through a lookup table.

    Args:
        signatures (dict of list): signature name -> list of gene identifiers
        rosetta (dict): lookup table mapping one gene identifier to another
        ignore_missing (boolean): when true, identifiers that are not a key of
            ``rosetta`` are dropped from the result instead of raising.

    Returns:
        dict of list: signature dictionary with translated identifiers

    Raises:
        KeyError: if an identifier is missing from ``rosetta`` and
            ``ignore_missing`` is not set.
    """
    translated = {}
    for tissue, genes in signatures.items():
        if ignore_missing:
            # Keep only the identifiers that can actually be translated.
            genes = [gene for gene in genes if gene in rosetta]
        translated[tissue] = [rosetta[gene] for gene in genes]
    return translated
def combine_signatures(*args, function=set.intersection):
    """Merge several signature dictionaries key by key with a set operation.

    Args:
        *args: signature dictionaries; all of them must have the same keys.
        function: set operation applied to the per-key gene sets
            (defaults to the intersection).

    Returns:
        dict of set: combined signature dictionary.

    >>> s1 = {"A": {1, 2, 3}, "B": {2, 3, 4}}
    >>> s2 = {"A": {1, 3, 4, 5}, "B": {42}}
    >>> pprint(combine_signatures(s1, s2))
    {'A': {1, 3}, 'B': set()}

    """
    assert len(args) > 0, "No signatures provided"
    reference_keys = args[0].keys()
    assert all(
        sig.keys() == reference_keys for sig in args
    ), "All signature dictonaries must have identical keys."

    combined = {}
    for key in reference_keys:
        # Convert each entry to a set so that list-valued signatures work too.
        gene_sets = [set(sig[key]) for sig in args]
        combined[key] = function(*gene_sets)
    return combined
def jaccard_ind(set1, set2, *args):
    """
    Jaccard index (size of intersection over size of union) of two or more sets.

    Args:
        set1 (list-like): first collection, converted to a set
        set2 (list-like): second collection, converted to a set
        *args: arbitrary number of further collections.

    Returns:
        float: jaccard index of all sets
    """
    first, second = set(set1), set(set2)
    n_shared = len(first.intersection(second, *args))
    n_total = len(first.union(second, *args))
    # np.divide rather than ``/`` so an empty union does not raise.
    return np.divide(n_shared, n_total)
def pairwise_jaccard_ind(list_of_signatures):
    """
    Jaccard index between every ordered pair of signature dictionaries.

    Useful for calculating the pairwise overlap between the different
    crossvalidation folds.

    Args:
        list_of_signatures: list of signature dicts.

    Returns:
        dict: signature_name -> [list, of, jaccard, indices]

    Note:
        The signature names are taken from the first dict in
        ``list_of_signatures``. Pairs are drawn from the full cartesian
        product of the list with itself, so each list contains
        ``len(list_of_signatures) ** 2`` values, including every dict compared
        with itself and both orderings of each pair.
    """
    assert len(list_of_signatures) > 0, "no signatures provided."
    return {
        signame: [
            jaccard_ind(left[signame], right[signame])
            for left, right in itertools.product(list_of_signatures, repeat=2)
        ]
        for signame in list_of_signatures[0]
    }
def performance_per_tissue(list_of_confusion_matrices, sig_labels, perf_fun):
    """
    Derive one-vs-rest performance measures per signature from
    all-against-all confusion matrices.

    Args:
        list_of_confusion_matrices (list of np.array): list of confusion matrices
        sig_labels (array-like): list of signatures in the same order as in
            the confusion matrices.
        perf_fun (function): ``(TP, FN, FP, TN)`` computing a performance
            measure from the binary confusion matrix. See ``perfmeasures``
            module.

    Returns:
        dict: signature_name -> list of performance measures, one for each
        confusion matrix provided.
    """
    assert len(list_of_confusion_matrices) > 0, "no matrices provided."

    def _one_vs_rest(confmat, idx):
        # Collapse the matrix to a binary one for the signature at ``idx``:
        # row ``idx`` minus the diagonal gives FN, column ``idx`` minus the
        # diagonal gives FP, everything else is TN.
        true_pos = confmat[idx, idx]
        false_neg = np.sum(confmat[idx, :]) - true_pos
        false_pos = np.sum(confmat[:, idx]) - true_pos
        true_neg = np.sum(confmat) - true_pos - false_neg - false_pos
        return perf_fun(true_pos, false_neg, false_pos, true_neg)

    return {
        sig: [_one_vs_rest(confmat, idx) for confmat in list_of_confusion_matrices]
        for idx, sig in enumerate(sig_labels)
    }
def jaccard_mat(sigs1, sigs2, colname1="set_1", colname2="set_2", as_matrix=False):
    """
    Jaccard index of every signature in ``sigs1`` against every signature in
    ``sigs2``, to quantify the overlap of two signature sets.

    Args:
        sigs1: signature dictionary
        sigs2: signature dictionary
        colname1: Name of the column for sigs1 in the dataframe
        colname2: Name of the column for sigs2 in the dataframe
        as_matrix: if False, a long-form dataframe will be returned,
            if True, the dataframe is pivoted into a 2d matrix instead.

    Returns:
        pd.DataFrame: Jaccard indices in long format with the columns
        ``colname1``, ``colname2`` and ``"jaccard index"``; or, with
        ``as_matrix``, that frame pivoted on ``colname1`` (rows) and
        ``colname2`` (columns). The pivot is done without ``values``, so the
        resulting columns carry ``"jaccard index"`` as an outer level.

    Plot the overlap of signatures:

    >>> import seaborn as sns
    >>> signatures = load_gmt(TESTDATA / "bioqc/test_bioqc_log_pvalue.gmt")
    >>> df = jaccard_mat(signatures, signatures)
    >>> sns.heatmap(
    ...     df.pivot(index="set_1", columns="set_2", values="jaccard index")
    ... )  # doctest: +ELLIPSIS
    <Axes...>

    """
    records = [
        (name1, name2, jaccard_ind(set(genes1), set(genes2)))
        for name1, genes1 in sigs1.items()
        for name2, genes2 in sigs2.items()
    ]
    long_df = pd.DataFrame(records, columns=(colname1, colname2, "jaccard index"))
    if not as_matrix:
        return long_df
    return long_df.pivot(index=colname1, columns=colname2)
def collapse_matrix(mat, group_by, axis=0, aggregate_fun=np.median):
    """
    Aggregate expression by annotation (collapse samples of the same tissue)

    Args:
        mat (np.array): m x n gene expression matrix with m genes and n samples.
        group_by (list-like): list of length m (if axis=0) or list of length n (if axis=1)
        axis (int): 0 for rows, 1 for columns
        aggregate_fun (function): aggregate to apply, defaults to ``numpy.median``

    Returns:
        pd.DataFrame: collapsed matrix with annotation from `group_by`.
        The grouped axis is labelled with the distinct values of `group_by`;
        the other axis keeps the positional labels of `mat`.

    Raises:
        AssertionError: if `group_by` does not match the length of the
            selected axis of `mat`.
        ValueError: if `axis` is neither 0 nor 1.
    """
    group_by = list(group_by)  # strip index from series
    mat_df = pd.DataFrame(mat)

    if axis in (0, "index"):
        assert mat.shape[0] == len(group_by)
        return mat_df.groupby(group_by).aggregate(aggregate_fun)

    if axis in (1, "columns"):
        assert mat.shape[1] == len(group_by)
        # ``DataFrame.groupby(axis=1)`` is deprecated in recent pandas and
        # rejected by newer releases. Grouping the rows of the transposed
        # frame and transposing back is the documented replacement.
        return mat_df.T.groupby(group_by).aggregate(aggregate_fun).T

    raise ValueError("No axis named {}".format(axis))
def normalize(array):
    """Rescale a vector linearly so that its values lie between 0 and 1.

    The minimum and maximum are taken with ``np.nanmin`` / ``np.nanmax``, so
    NaN entries do not influence the range. If all values are equal, a list
    of zeros of the same length is returned.
    """
    lower = float(np.nanmin(array))
    upper = float(np.nanmax(array))
    span = upper - lower
    if span == 0:
        # Constant input: there is no range to scale by.
        return [0] * len(array)
    return [(value - lower) / span for value in array]
def normalize_sum(array):
    """Scale a vector so that its entries sum to 1.

    Each entry is divided by the sum of all entries; the result is a list.
    """
    total = sum(array)
    scaled = [value / total for value in array]
    return scaled