# Source code for genopandas.ngs.rna

import numpy as np

from genopandas.core.matrix import AnnotatedMatrix


class ExpressionMatrix(AnnotatedMatrix):
    """Matrix containing (gene) expression values (features-by-samples)."""

    @classmethod
    def from_subread(cls,
                     file_path,
                     sample_data=None,
                     sample_mapping=None,
                     **kwargs):
        """Reads expression from a subread output file.

        The file is read as tab-separated text, using the first column as
        index. The ``Chr``, ``Start``, ``End``, ``Strand`` and ``Length``
        columns are dropped, leaving only the remaining columns
        (presumably the per-sample counts) as matrix values.

        Parameters
        ----------
        file_path
            Path to the subread output file, passed on to ``from_csv``.
        sample_data
            Forwarded unchanged to the parent class' ``from_csv``.
        sample_mapping
            Forwarded unchanged to the parent class' ``from_csv``.
        **kwargs
            Any additional keyword arguments are forwarded to ``from_csv``.

        Returns
        -------
        ExpressionMatrix
            Whatever the parent class' ``from_csv`` builds for ``cls``
            (presumably an ExpressionMatrix; ``from_csv`` is defined on
            the base class, outside this module).

        """

        return super().from_csv(
            file_path,
            sample_data=sample_data,
            sample_mapping=sample_mapping,
            drop_cols=['Chr', 'Start', 'End', 'Strand', 'Length'],
            index_col=0,
            sep='\t',
            **kwargs)

    def normalize(self, size_factors=None, log2=False):
        """Normalizes expression counts for sequencing depth.

        Normalizes by dividing sample counts using the given (sample) size
        factors. If no size factors are given, they are calculated using the
        median-of-ratios approach used by DESeq2.

        Parameters
        ----------
        size_factors : np.array
            Array of size factors, length should be equal to the number
            of samples. If None, size factors are estimated from the
            matrix values.
        log2 : bool
            Whether to also log2-transform the normalized counts. The
            transform is applied as ``log2(normalized + 1)``.

        Returns
        -------
        ExpressionMatrix
            ExpressionMatrix containing normalized counts.

        """

        # Zero counts yield log(0) during size factor estimation and a zero
        # size factor yields a division by zero; both only trigger numpy
        # 'divide' warnings, which are silenced here.
        with np.errstate(divide="ignore"):
            if size_factors is None:
                size_factors = self._estimate_size_factors(self._values)

            # axis=1 matches the size factors against the columns (samples).
            normalized = self._values.divide(size_factors, axis=1)

        if log2:
            # Pseudo-count of 1 keeps zero counts at log2(1) == 0.
            normalized = np.log2(normalized + 1)

        return self._constructor(normalized)

    @staticmethod
    def _estimate_size_factors(counts):
        """Calculate size factors for DESeq's median-of-ratios normalization.

        Parameters
        ----------
        counts
            Two-dimensional (features-by-samples) counts. ``normalize``
            passes ``self._values`` here (presumably a pandas DataFrame;
            confirm against the base class).

        Returns
        -------
        np.ndarray
            Array containing one size factor per column (sample) of
            ``counts``.

        """

        def _estimate_size_factors_col(counts, log_geo_means):
            """Returns the size factor for a single column of counts."""
            log_counts = np.log(counts)

            # Only use features with a finite log geometric mean (a zero
            # count in any sample makes the mean -inf) and a positive
            # count in this sample.
            mask = np.isfinite(log_geo_means) & (counts > 0)

            # Median of the log-ratios, transformed back to linear scale.
            return np.exp(np.median((log_counts - log_geo_means)[mask]))

        # log(0) is expected for zero counts and is masked out above, so
        # the corresponding numpy warning is suppressed.
        with np.errstate(divide="ignore"):
            # Per-feature (row-wise) mean of log counts, i.e. the log of
            # the geometric mean across samples.
            log_geo_means = np.mean(np.log(counts), axis=1)

            # axis=0 hands each column (sample) to the helper in turn.
            size_factors = np.apply_along_axis(
                _estimate_size_factors_col,
                axis=0,
                arr=counts,
                log_geo_means=log_geo_means)

        return size_factors