Source code for pygenesig.file_formats

import pandas as pd
import numpy as np
import itertools


def _read_flatfile(input_file):
    """Load a text file into a numpy array with one element per line.

    Args:
        input_file (str): path of the file to read.

    Returns:
        np.array: the lines of the file, line terminators removed.
    """
    with open(input_file) as handle:
        entries = handle.read().splitlines()
    return np.array(entries)


def _write_flatfile(output_file, input_array):
    """Write an array to a text file, one entry per line.

    Every entry is converted with ``str`` before it is written, so arrays
    of non-string values (e.g. integer gene indices or numeric labels) can
    be stored as well as arrays of strings.

    Args:
        output_file (str): path of the file to (over)write.
        input_array (iterable): the entries to write, in order.
    """
    with open(output_file, "w") as f:
        # str() instead of plain concatenation: `e + "\n"` raises a
        # TypeError for anything that is not already a string.
        f.writelines(str(e) + "\n" for e in input_array)


#################################################################
# expression
#################################################################
def write_expr(expression_matrix, file):
    """Persist a m x n gene expression matrix with `numpy.save`.

    Args:
        expression_matrix (np.array): m x n matrix with m genes and n samples.
        file: destination, handed to `numpy.save` unchanged.
    """
    np.save(file, arr=expression_matrix)


def read_expr(expr_file):
    """Load a m x n gene expression matrix that was stored as a numpy object.

    Args:
        expr_file: source, handed to `numpy.load` unchanged.

    Returns:
        whatever `numpy.load` yields for `expr_file`; for a matrix stored
        with `write_expr` this is the gene expression matrix.
    """
    expression_matrix = np.load(expr_file)
    return expression_matrix


def read_gct(file):
    """
    Parse a `GCT file`_ and return its expression values.

    The two leading lines of the file are skipped, the first column is used
    as the row index and the following (description) column is discarded.

    Args:
        file (str): path to GCT file

    Returns:
        np.array: gene expression matrix.

    .. _GCT file:
        http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide#gct

    """
    table = pd.read_csv(file, sep="\t", skiprows=2, index_col=0)
    # The first column became the index, so position 0 now holds the
    # description column; everything after it is expression data.
    value_columns = table.iloc[:, 1:]
    return value_columns.values


def write_gct(file, exprs, samples=None, description=None, name=None):
    """
    Write a gct file.

    The file consists of the version line (``#1.2``), a line holding the
    tab-separated number of genes and samples, and a tab-separated table
    with the columns ``NAME``, ``Description`` and one column per sample.

    Args:
        file (str): path to output file
        exprs (np.array): m x n matrix with m genes and n samples
        samples: n-array (or list) with the labels for the n samples.
            Defaults to the column indices ``0..n-1``.
        description: m-array (or list) with a description for each gene
            (e.g. gene symbol). Defaults to ``"na"`` for every gene.
        name: m-array (or list) with the name for each gene (e.g. gene index).
            Defaults to the row indices ``0..m-1``.

    Raises:
        AssertionError: if the lengths of `samples`, `description` or `name`
            do not match the shape of `exprs`.

    """
    n_genes, n_samples = exprs.shape[0], exprs.shape[1]

    # np.asarray lets callers pass plain lists/tuples; `.size` would
    # otherwise fail on them with an AttributeError.
    if description is None:
        description = np.repeat("na", n_genes)
    else:
        description = np.asarray(description)
    if name is None:
        name = np.arange(0, n_genes)
    else:
        name = np.asarray(name)
    if samples is None:
        samples = np.arange(0, n_samples)
    else:
        samples = np.asarray(samples)

    # Raised explicitly (instead of `assert`) so the checks survive
    # `python -O`; AssertionError is kept for backward compatibility.
    if not (n_genes == description.size == name.size):
        raise AssertionError(
            "description and name need one entry per gene "
            "(expected {}, got {} and {})".format(
                n_genes, description.size, name.size
            )
        )
    if n_samples != samples.size:
        raise AssertionError(
            "samples needs one entry per sample (expected {}, got {})".format(
                n_samples, samples.size
            )
        )

    gct = pd.DataFrame(exprs)
    gct.columns = samples
    fdata = pd.DataFrame({"NAME": name, "Description": description})
    gct = pd.concat((fdata, gct), axis=1)
    gct.set_index("NAME", inplace=True)

    with open(file, "w") as f:
        f.write("#1.2\n")
        # The GCT specification requires the two dimensions to be
        # separated by a tab, not a space.
        f.write("{}\t{}\n".format(n_genes, n_samples))

    # Append the table below the two header lines written above.
    gct.to_csv(file, mode="a", sep="\t")


##################################################################
# target (=tissue annotations)
##################################################################
def write_target(target_array, file):
    """Store the target annotation of the samples, one entry per line.

    For a m x n gene expression matrix with m genes and n samples,
    `target_array` is the n-array holding one target annotation per sample.

    Args:
        target_array: n-array with the target annotations.
        file (str): path to output file.
    """
    _write_flatfile(output_file=file, input_array=target_array)


def read_target(target_file):
    """Load the target annotation of the samples from a one-entry-per-line file.

    For a m x n gene expression matrix with m genes and n samples, the file
    is expected to hold one target annotation per sample, i.e. n lines.

    Args:
        target_file (str): path to the file.

    Returns:
        np.array: array with one entry per line of the file.
    """
    targets = _read_flatfile(target_file)
    return targets


##################################################################
# feature (=gene symbol annotation)
##################################################################
def write_rosetta(rosetta_array, rosetta_file):
    """Store the gene identifiers, one entry per line.

    For a m x n gene expression matrix with m genes and n samples,
    `rosetta_array` is the m-array holding one identifier per gene.

    The resulting file can be used to map index-based signatures back to
    gene symbols.

    Args:
        rosetta_array: m-array with the gene identifiers.
        rosetta_file (str): path to output file.
    """
    _write_flatfile(output_file=rosetta_file, input_array=rosetta_array)


def read_rosetta(rosetta_file, as_dict=True, inverse=False):
    """Load the gene identifiers belonging to a gene expression matrix.

    For a m x n gene expression matrix with m genes and n samples, the file
    is expected to hold one identifier per gene, one per line.

    With `as_dict` set, the identifiers are turned into a dictionary that maps
    the gene-index to the gene identifier, which allows translating
    index-based signatures back to gene symbols. With `inverse` set in
    addition, the dictionary maps the gene identifier to the gene-index
    instead.

    Args:
        rosetta_file (str): path to file
        as_dict (boolean): If True, a dictionary will be returned. Else it will be a flat numpy.array.
        inverse (boolean): If true, map gene-symbol to index.

    .. Important::
        Be careful when using `inverse = True` when the entries in
        rosetta_file are not unique. In that case only the last
        occurrence of an identifier makes it into the dictionary!

    Returns:
        dict or np.array: mapping or numpy array.

    """
    identifiers = _read_flatfile(rosetta_file)
    if not as_dict:
        return identifiers
    return make_rosetta_dict(identifiers, inverse)


def make_rosetta_dict(array, inverse=False):
    """
    Build a dictionary that maps each position of `array` to its entry.

    With `inverse` set, the direction is flipped: each entry is mapped to its
    position. Entries equal to ``"-"`` are left out of the inverse mapping,
    and for repeated entries the last position wins.

    Args:
        array (array-like): the entries to index.
        inverse (boolean): map entry -> index instead of index -> entry.

    Returns:
        dict: the map

    """
    if not inverse:
        return {index: entry for index, entry in enumerate(array)}

    mapping = {}
    for index, entry in enumerate(array):
        # '-' is an artifact from ribiosAnnotation
        if entry != "-":
            mapping[entry] = index
    return mapping


##################################################################
# signatures
##################################################################
def write_gmt(signatures, file, description="na"):
    """
    Store signatures in a `GMT file`_.

    One line is written per signature, ordered by signature name. Each line
    holds the signature name, the description and the genes (converted to
    strings and sorted), separated by tabs.

    Args:
        signatures (dict of list): signature dictionary
        file: path to output file
        description: text to fill in the gmt description field.

    .. _GMT file:
        http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29

    """
    with open(file, "w") as handle:
        for sig_name, members in sorted(signatures.items()):
            gene_fields = sorted(map(str, members))
            record = "\t".join([sig_name, description] + gene_fields)
            handle.write(record + "\n")


def load_gmt(file):
    """
    Deprecated alias, kept for backward compatibility.

    Forwards `file` to `read_gmt` and returns its result.
    """
    signatures = read_gmt(file)
    return signatures


def read_gmt(file):
    """
    Read a `GMT file`_ into a signature dictionary.

    Every non-blank line is split at tabs: the first field is the signature
    name, the second field (description) is ignored and all remaining fields
    are the genes. Blank lines are skipped. If a signature name occurs more
    than once, the last occurrence wins.

    Args:
        file: path to GMT file

    Returns:
        dict of list: signature dictionary

        Example::

            {
                "tissue1" : [list, of, gene, ids],
                "tissue2" : [list, of, other, genes],
                ...
            }

    .. _GMT file:
        http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29


    """
    signatures = {}
    with open(file) as f:
        # Iterate the handle directly instead of materializing all lines.
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise register a
                # bogus signature with the empty string as its name.
                continue
            cols = stripped.split("\t")
            # cols[1] is the description field and is intentionally dropped.
            signatures[cols[0]] = cols[2:]
    return signatures