Source code for scrnatools.tools._create_isoform_lookup_tables

"""
Creates the lookup tables for isoform data transcripts and genes.
From scrnatools package

Created on Mon Jan 10 15:57:46 2022

@author: joe germino (joe.germino@ucsf.edu)
"""

# external imports
from anndata import AnnData
from pandas import DataFrame
import numpy as np
import pandas as pd
from typing import Tuple, Dict

# scrnatools package imports
from .._configs import configs

logger = configs.create_logger(__name__.split('_', 1)[1])


# -------------------------------------------------------function----------------------------------------------------- #


[docs]def create_isoform_lookup_tables(
        adata: AnnData,
        t2enst_path: DataFrame,
        t2g_path: DataFrame,
) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
    """Creates the lookup tables for isoform data transcripts and genes.

    Args:
        adata (AnnData): The AnnData containing kallisto isoform data.
        t2enst_path (DataFrame): Path to the transcript to ensembl transcript id mapping dataframe (from kallisto alignment).
        t2g_path (DataFrame): Path to the transcript to gene mapping dataframe (from the kallisto reference used for alignment).

    Returns:
        Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]: The equivalence class to transcript dict, the equivalence class to gene dict, and the gene to equivalence class dict.
    """

    # Import the transcript list from kallisto
    t2enst = pd.read_csv(t2enst_path, header=None,)
    t2enst.columns = ['enst']

    # Import the transcript to gene mapping dataframe
    t2g = pd.read_csv(t2g_path, header=None, sep='\t')
    t2g.columns = ['enst', 'ensg', 'gname']
    t2g.index = t2g.enst

    # Get ecs from adata
    ecs = np.array(adata.var_names.values, dtype=str)
    # Convert ec string to a list of tx indices
    ecdict = {}
    for ec, txs in enumerate(ecs):
        ecdict[str(ec)] = list(np.array(txs.split(','), dtype=int))
    # Save ec list to adata
    adata.var['ecs'] = ecs
    # Set adata var_names to be new ec index
    # var names are set names
    adata.var_names = np.array(np.arange(adata.shape[1]), dtype=str)
    # Create dictionary mapping tx indices to tx ids for each ec
    ec2tx = {}
    for (ec, txs) in ecdict.items():
        ec2tx[ec] = list(t2enst.loc[txs].enst.values)
    # Create mapping of each ec to the genes whose transcripts are contained in it
    ec2g = {}
    key_errors = {}
    ec_error = []
    for (ec, txs) in ec2tx.items():
        try:
            # get unique gene names for each ec list of transcripts
            ec2g[ec] = list(np.unique(t2g.loc[txs].gname.values))
            ec_error.append(False)
        except KeyError:
            # if there's a tx that doesn't have a t2g mapping
            valid_txs = [i for i in txs if i in t2g.index]
            if len(valid_txs) > 0:
                # if there is at least one valid gene for the ec txs use them instead
                ec2g[ec] = list(np.unique(t2g.loc[valid_txs].gname.values))
            else:
                # Otherwise, fill in none for ec2g mapping
                ec2g[ec] = "None"
            ec_error.append(True)
            # If the ec list of transcripts contains a tx not in the tx to gene mapping dict, note the ec and txs that
            # are the problem
            errors = [i for i in txs if i not in t2g.index]
            key_errors[ec] = errors
    adata.var["ec_tx_error"] = ec_error
    # Create an inverse mapping dictionary getting all ecs and their txs for a given gene
    inv_map = {}
    for ec, genes in ec2g.items():
        for gene in genes:
            # Get the current dict of ec's for that gene
            inv_map[gene] = inv_map.get(gene, {})
            # Append the current ec
            inv_map[gene][ec] = ec2tx[ec]
    # save the dicts
    logger.info(f"Number of ec errors: {len(key_errors)}")
    adata.uns['key_errors'] = key_errors
    return ec2tx, ec2g, inv_map