Source code for scrnatools.tools._create_isoform_lookup_tables

"""
Creates the lookup tables for isoform data transcripts and genes.
From scrnatools package

Created on Mon Jan 10 15:57:46 2022

@author: joe germino (joe.germino@ucsf.edu)
"""

# external imports
from anndata import AnnData
from pandas import DataFrame
import numpy as np
import pandas as pd
from typing import Tuple, Dict

# scrnatools package imports
from .._configs import configs

logger = configs.create_logger(__name__.split('_', 1)[1])


# -------------------------------------------------------function----------------------------------------------------- #


[docs]def create_isoform_lookup_tables( adata: AnnData, t2enst_path: DataFrame, t2g_path: DataFrame, ) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]: """Creates the lookup tables for isoform data transcripts and genes. Args: adata (AnnData): The AnnData containing kallisto isoform data. t2enst_path (DataFrame): Path to the transcript to ensembl transcript id mapping dataframe (from kallisto alignment). t2g_path (DataFrame): Path to the transcript to gene mapping dataframe (from the kallisto reference used for alignment). Returns: Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]: The equivalence class to transcript dict, the equivalence class to gene dict, and the gene to equivalence class dict. """ # Import the transcript list from kallisto t2enst = pd.read_csv(t2enst_path, header=None,) t2enst.columns = ['enst'] # Import the transcript to gene mapping dataframe t2g = pd.read_csv(t2g_path, header=None, sep='\t') t2g.columns = ['enst', 'ensg', 'gname'] t2g.index = t2g.enst # Get ecs from adata ecs = np.array(adata.var_names.values, dtype=str) # Convert ec string to a list of tx indices ecdict = {} for ec, txs in enumerate(ecs): ecdict[str(ec)] = list(np.array(txs.split(','), dtype=int)) # Save ec list to adata adata.var['ecs'] = ecs # Set adata var_names to be new ec index # var names are set names adata.var_names = np.array(np.arange(adata.shape[1]), dtype=str) # Create dictionary mapping tx indices to tx ids for each ec ec2tx = {} for (ec, txs) in ecdict.items(): ec2tx[ec] = list(t2enst.loc[txs].enst.values) # Create mapping of each ec to the genes whose transcripts are contained in it ec2g = {} key_errors = {} ec_error = [] for (ec, txs) in ec2tx.items(): try: # get unique gene names for each ec list of transcripts ec2g[ec] = list(np.unique(t2g.loc[txs].gname.values)) ec_error.append(False) except KeyError: # if there's a tx that doesn't have a t2g mapping valid_txs = [i for i in txs if i in t2g.index] if len(valid_txs) > 0: # if there is at least one valid gene for the ec txs use them instead ec2g[ec] = list(np.unique(t2g.loc[valid_txs].gname.values)) else: # Otherwise, fill in none for ec2g mapping ec2g[ec] = "None" ec_error.append(True) # If the ec list of transcripts contains a tx not in the tx to gene mapping dict, note the ec and txs that # are the problem errors = [i for i in txs if i not in t2g.index] key_errors[ec] = errors adata.var["ec_tx_error"] = ec_error # Create an inverse mapping dictionary getting all ecs and their txs for a given gene inv_map = {} for ec, genes in ec2g.items(): for gene in genes: # Get the current dict of ec's for that gene inv_map[gene] = inv_map.get(gene, {}) # Append the current ec inv_map[gene][ec] = ec2tx[ec] # save the dicts logger.info(f"Number of ec errors: {len(key_errors)}") adata.uns['key_errors'] = key_errors return ec2tx, ec2g, inv_map