# Source code for DOLPHIN.AS.convert_modality_ohe

import pandas as pd
import anndata

def run_modality_ohe(
    anchor_output: str,
    adata_psi: str,
    cluster_name: str,
    out_directory: str,
) -> "anndata.AnnData":
    """
    Encode Anchor splicing modalities as a per-cell integer matrix and save it as AnnData.

    This function processes the modality classification of splicing events
    generated by the [Anchor](https://github.com/yeolab/anchor) module of
    Expedition. Every cell inherits, for each event, the modality that the
    Anchor table assigns to that event in the cell's cluster.

    Despite the function name, the result is not a one-hot expansion: each
    (cell, event) entry holds a single integer code:

    ====  =============================================================
    code  meaning
    ====  =============================================================
    0     "none" - the cell has no PSI value (NaN) for the event
    1     "uncategorized"
    2     "included"
    3     "excluded"
    4     "middle"
    5     "bimodal"
    99    the cell has a PSI value, but the Anchor table has no modality
          for that event in the cell's cluster (missing value, or the
          event is absent from the table)
    ====  =============================================================

    Parameters
    ----------
    anchor_output : str
        Path to the CSV file containing modality annotations from Anchor.
        The first column is read as the index (event identifiers, matched
        against the variable names of the PSI AnnData); every remaining
        column is treated as one cluster.
    adata_psi : str
        Path to the .h5ad file containing the PSI matrix and cell metadata.
    cluster_name : str
        Column name in ``adata_psi.obs`` indicating the cluster identity of
        each cell. Its values must match the column names of the Anchor
        output; cells whose label matches no Anchor column are left out of
        the result.
    out_directory : str
        Path of the .h5ad file to write. The value is passed directly to
        ``AnnData.write``, so it is a file path despite the parameter name.

    Returns
    -------
    adata : anndata.AnnData
        AnnData whose ``X`` holds the modality codes. Rows are grouped by
        cluster, in the column order of the Anchor output; ``obs`` is taken
        from the input AnnData for the retained cells and ``var`` is indexed
        by the event names.

    Raises
    ------
    ValueError
        If the Anchor output contains no cluster columns.
    KeyError
        If ``cluster_name`` is not a column of ``adata_psi.obs``.
    """
    # Integer code assigned to each modality label.
    modality_codes = {
        "none": 0,
        "uncategorized": 1,
        "included": 2,
        "excluded": 3,
        "middle": 4,
        "bimodal": 5,
    }
    # Temporary marker for "PSI observed"; it is swapped for the modality code
    # below and survives only where Anchor provides no modality.
    observed_marker = 99

    modality_table = pd.read_csv(anchor_output, index_col=0)
    if modality_table.shape[1] == 0:
        raise ValueError(
            f"Anchor output {anchor_output!r} contains no cluster columns."
        )
    # Labels that are not keys of modality_codes are left unchanged by replace().
    coded_table = modality_table.replace(modality_codes)

    psi_adata = anndata.read_h5ad(adata_psi)
    cell_clusters = psi_adata.obs[cluster_name]

    # Collapse the PSI matrix to "observed" (marker) versus "missing" (NaN).
    # The cluster labels are kept in a separate Series so the frame only ever
    # contains event columns.
    psi = psi_adata.to_df()
    psi = psi.mask(psi.notna(), observed_marker)

    encoded_parts = []
    for cluster in modality_table.columns:
        print(cluster)
        # Event -> code for this cluster, skipping events without a modality.
        event_codes = coded_table[cluster].dropna().to_dict()

        # Positional boolean mask; obs and the PSI frame share the same row order.
        in_cluster = (cell_clusters == cluster).to_numpy()
        cluster_cells = psi.loc[in_cluster]

        if event_codes:
            # One column-wise pass: in column `event`, marker -> modality code.
            # Events that are not columns of the PSI frame are ignored.
            cluster_cells = cluster_cells.replace(
                {
                    event: {observed_marker: code}
                    for event, code in event_codes.items()
                }
            )

        # Events with no PSI value in a cell get the "none" code.
        encoded_parts.append(cluster_cells.fillna(modality_codes["none"]))

    encoded = pd.concat(encoded_parts)

    # Build the output AnnData, reusing the input cell metadata.
    obs = psi_adata.obs.loc[encoded.index]
    var = pd.DataFrame(index=encoded.columns.values)
    adata = anndata.AnnData(encoded.values, obs=obs, var=var)
    adata.write(out_directory)
    return adata