# Source code for pygenesig.file_formats

import pandas as pd
import numpy as np
import itertools


def _read_flatfile(input_file):
    """read array from file, one entry per line"""
    with open(input_file) as f:
        output_array = np.array(f.read().splitlines())
    return output_array


def _write_flatfile(output_file, input_array):
    """write array to file, one entry per line"""
    with open(output_file, "w") as f:
        for e in input_array:
            f.write(e + "\n")


#################################################################
# expression
#################################################################
def write_expr(expression_matrix, file):
    """Store a m x n gene expression matrix as numpy object.

    Args:
        expression_matrix (np.array): the matrix to store.
        file: destination handed to ``numpy.save`` unchanged.

    Note:
        Per the numpy documentation, ``numpy.save`` appends ``.npy`` to a
        file *name* that does not already carry that extension.
    """
    np.save(file, expression_matrix)
def read_expr(expr_file):
    """Read a m x n gene expression matrix from a numpy object.

    Args:
        expr_file: source handed to ``numpy.load`` unchanged.

    Returns:
        The object returned by ``numpy.load``.
    """
    return np.load(expr_file)
def read_gct(file):
    """
    Read a `GCT file`_ to a gene expression matrix.

    The first two lines of the file are skipped, the first column is used
    as the row index and the following (description) column is dropped.

    Args:
        file (str): path to GCT file

    Returns:
        np.array: gene expression matrix.

    .. _GCT file:
        http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide#gct
    """
    gct = pd.read_csv(file, sep="\t", skiprows=2, index_col=0)
    exprs = gct.iloc[:, 1:].values  # get rid of description column
    return exprs
def write_gct(file, exprs, samples=None, description=None, name=None):
    """
    Write a gct file.

    The file starts with the ``#1.2`` version line and a line holding the
    number of rows and columns of ``exprs``, followed by a tab-separated
    table indexed by ``NAME`` with a ``Description`` column and one column
    per sample.

    Args:
        file (str): path to output file
        exprs (np.array): m x n matrix with m genes and n samples
        samples: n-array with the labels for the n samples.
            Defaults to ``0 .. n-1``.
        description: m array with a description for each gene
            (e.g. gene symbol). Defaults to ``"na"`` for every gene.
        name: m array with the name for each gene (e.g. gene index).
            Defaults to ``0 .. m-1``.

    Raises:
        ValueError: if ``exprs`` is not two-dimensional or the lengths of
            ``samples``, ``description`` or ``name`` do not match it.
    """
    # Normalise to arrays so that plain lists/tuples are accepted too.
    exprs = np.asarray(exprs)
    if exprs.ndim != 2:
        raise ValueError("exprs must be a two-dimensional (genes x samples) matrix.")
    n_genes, n_samples = exprs.shape

    if description is None:
        description = np.repeat("na", n_genes)
    else:
        description = np.asarray(description)
    if name is None:
        name = np.arange(0, n_genes)
    else:
        name = np.asarray(name)
    if samples is None:
        samples = np.arange(0, n_samples)
    else:
        samples = np.asarray(samples)

    # Explicit checks instead of `assert`, which is stripped under `-O`.
    if not (n_genes == description.size == name.size):
        raise ValueError(
            "description and name need one entry per row of exprs."
        )
    if n_samples != samples.size:
        raise ValueError("samples needs one entry per column of exprs.")

    gct = pd.DataFrame(exprs)
    gct.columns = samples
    fdata = pd.DataFrame({"NAME": name, "Description": description})
    gct = pd.concat((fdata, gct), axis=1)
    gct.set_index("NAME", inplace=True)

    with open(file, "w") as f:
        f.write("#1.2\n")
        f.write("{} {}\n".format(*exprs.shape))
    # Append the table only after the header handle has been closed, so the
    # buffered header is on disk before pandas reopens the file.
    gct.to_csv(file, mode="a", sep="\t")
##################################################################
# target (=tissue annotations)
##################################################################
def write_target(target_array, file):
    """Given a m x n gene expression matrix with m genes and n samples.
    Write an n-array with one target annotation for each sample.

    Args:
        target_array: the annotations, written one per line.
        file (str): path to output file.
    """
    _write_flatfile(file, target_array)
def read_target(target_file):
    """Given a m x n gene expression matrix with m genes and n samples.
    Read an n-array with one target annotation for each sample.

    Args:
        target_file (str): path to a file with one annotation per line.

    Returns:
        np.array: the annotations in file order.
    """
    return _read_flatfile(target_file)
##################################################################
# feature (=gene symbol annotation)
##################################################################
def write_rosetta(rosetta_array, rosetta_file):
    """Given a m x n gene expression matrix with m genes and n samples.
    Write a m-array with one identifier for each gene.

    This can be used to map the index-based signature back to gene symbols.

    Args:
        rosetta_array: the gene identifiers, written one per line.
        rosetta_file (str): path to output file.
    """
    # NOTE(review): the previous docstring called this an alias for
    # `write_feature`; no such function is visible here -- confirm.
    _write_flatfile(rosetta_file, rosetta_array)
def read_rosetta(rosetta_file, as_dict=True, inverse=False):
    """Given a m x n gene expression matrix with m genes and n samples.
    Read an m-array with one identifier for each gene.

    If `as_dict` is True, this will be converted into a dictionary
    mapping the gene-index to the gene identifier. This can be used to
    map the index-based signatures back to gene symbols.

    If `inverse` is True, the mapping is inverse, and the gene-symbol
    will be mapped to the gene-index.

    Args:
        rosetta_file (str): path to file
        as_dict (boolean): If True, a dictionary will be returned.
            Else it will be a flat numpy.array.
        inverse (boolean): If true, map gene-symbol to index.
            Only used when `as_dict` is True.

    .. Important::
        Be careful when using `inverse = True` when the list in
        rosetta_file is not unique. In that case only the last
        entry makes it into the mapping!

    Returns:
        dict or np.array: mapping or numpy array.
    """
    fdata = _read_flatfile(rosetta_file)
    if as_dict:
        return make_rosetta_dict(fdata, inverse)
    return fdata
def make_rosetta_dict(array, inverse=False):
    """
    Convert an array to a dictionary mapping the index to the
    corresponding array entry.

    Use `inverse` to reverse the mapping, i.e. the array entry to the index.
    In the inverse mapping, entries equal to ``"-"`` are left out and, for
    duplicated entries, the last index wins.

    Args:
        array (array-like): the entries to map.
        inverse (boolean): If True, map entry to index instead.

    Returns:
        dict: the map
    """
    if inverse:
        return {
            # '-' is an artifact from ribiosAnnotation
            gene_symbol: i
            for i, gene_symbol in enumerate(array)
            if gene_symbol != "-"
        }
    return dict(enumerate(array))
##################################################################
# signatures
##################################################################
def write_gmt(signatures, file, description="na"):
    """
    Writes signatures to a `GMT file`_.

    One tab-separated line is written per signature: the signature name,
    the description and then the genes. Signatures are written in sorted
    order of their names; the genes of each signature are converted to
    strings and sorted as strings.

    Args:
        signatures (dict of list): signature dictionary
        file: path to output file
        description: text to fill in the gmt description field.

    .. _GMT file:
        http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29
    """
    with open(file, "w") as f:
        for sig, genes in sorted(signatures.items()):
            # str() on the name lets non-string keys (e.g. ints) be joined.
            fields = [str(sig), description] + sorted(str(g) for g in genes)
            f.write("\t".join(fields) + "\n")
def load_gmt(file):
    """
    Deprecated. Alias for read_gmt.

    Args:
        file: path to GMT file

    Returns:
        Whatever ``read_gmt`` returns for ``file``.
    """
    return read_gmt(file)
def read_gmt(file):
    """
    Read a `GMT file`_ into a signature dictionary.

    Each non-blank line is split on tabs: the first field is the signature
    name, the second field (description) is ignored and the remaining
    fields are the genes. Blank lines are skipped. If a name occurs more
    than once, the last occurrence wins.

    Args:
        file: path to GMT file

    Returns:
        dict of list: signature dictionary

        Example::

            {
                "tissue1" : [list, of, gene, ids],
                "tissue2" : [list, of, other, genes],
                ...
            }

    .. _GMT file:
        http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29
    """
    signatures = {}
    with open(file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise register an empty-named signature.
                continue
            cols = line.split("\t")
            signatures[cols[0]] = cols[2:]
    return signatures