Source code for DOLPHIN.AS.convert_modality_ohe

import anndata
import numpy as np
import pandas as pd

def run_modality_ohe(
    anchor_output: str,
    adata_psi: str,
    cluster_name: str,
    out_directory: str,
) -> "anndata.AnnData":
    """
    Encode Anchor splicing modalities per cell as integer codes and save as AnnData.

    This function processes the modality classification of splicing events generated by the
    [Anchor](https://github.com/yeolab/anchor) module of Expedition. Each cell receives, for
    every event, the modality that Anchor assigned to that event in the cell's cluster,
    written as an integer code:

    ====================================================  ====
    Situation                                             Code
    ====================================================  ====
    PSI missing (NaN) for this cell/event ("none")        0
    "uncategorized"                                       1
    "included"                                            2
    "excluded"                                            3
    "middle"                                              4
    "bimodal"                                             5
    PSI observed, but no modality call for event/cluster  99
    ====================================================  ====

    Cells are emitted grouped by cluster, in the column order of the Anchor table; cells
    whose cluster label is not a column of the Anchor table are dropped from the result.

    Parameters
    ----------
    anchor_output : str
        Path to the CSV file containing modality annotations for splicing events from Anchor.
        The first column is used as the event index and the remaining columns are read as
        cluster labels.

    adata_psi : str
        Path to the .h5ad file containing the PSI matrix and cell metadata.

    cluster_name : str
        Column name in adata_psi.obs indicating cluster identity of each cell.
        Its values must match the column names in the Anchor output.

    out_directory : str
        Output path handed to ``AnnData.write``, i.e. the .h5ad file to create.

    Returns
    -------
    adata : anndata.AnnData
        AnnData whose X holds the integer modality codes (cells x events), whose obs is the
        matching subset of the input obs, and whose var is indexed by the input event names.

    Raises
    ------
    ValueError
        If a cluster that has cells carries a modality label that is not one of the
        categories listed above.
    """
    # Integer code written for each modality category.
    modality_codes = {
        "none": 0,
        "uncategorized": 1,
        "included": 2,
        "excluded": 3,
        "middle": 4,
        "bimodal": 5,
    }
    # Placeholder for entries whose PSI is observed but for which the Anchor
    # table holds no modality call for that event in the cell's cluster.
    unassigned_code = 99

    df_mod = pd.read_csv(anchor_output, index_col=0)

    # Keep the path parameter and the loaded object under separate names.
    psi_adata = anndata.read_h5ad(adata_psi)
    psi = psi_adata.to_df()
    psi_values = psi.to_numpy()
    observed = ~pd.isna(psi_values)
    events = psi.columns
    labels = psi_adata.obs[cluster_name]

    # Keep the PSI matrix's numeric dtype for the encoded matrix.
    if np.issubdtype(psi_values.dtype, np.number):
        out_dtype = psi_values.dtype
    else:
        out_dtype = np.dtype("float64")

    # Seeded with empty chunks so the concatenations below are valid even when
    # the Anchor table has no columns or no cluster matches any cell.
    row_chunks = [np.empty(0, dtype=np.intp)]
    encoded_chunks = [np.empty((0, psi_values.shape[1]), dtype=out_dtype)]

    for cluster in df_mod.columns:
        print(cluster)
        in_cluster = (labels == cluster).fillna(False).to_numpy(dtype=bool)
        rows = np.flatnonzero(in_cluster)
        if rows.size == 0:
            # Columns that match no cell contribute nothing to the output.
            continue

        # Modality call per event for this cluster, aligned to the PSI events.
        # If an event is listed more than once, the last entry wins.
        calls = df_mod[cluster].dropna()
        calls = calls[~calls.index.duplicated(keep="last")]
        calls = calls.reindex(events)

        codes = calls.map(modality_codes).astype("float64")
        unknown = calls[calls.notna() & codes.isna()]
        if not unknown.empty:
            bad_labels = sorted({str(label) for label in unknown})
            raise ValueError(
                f"Unknown modality label(s) for cluster {cluster!r}: {bad_labels}; "
                f"expected one of {sorted(modality_codes)}"
            )
        codes = codes.fillna(unassigned_code).to_numpy()

        # One broadcast per cluster: observed PSI -> the event's code, missing PSI -> "none".
        block = np.where(observed[rows], codes[np.newaxis, :], modality_codes["none"])
        row_chunks.append(rows)
        encoded_chunks.append(block.astype(out_dtype, copy=False))

    positions = np.concatenate(row_chunks)
    X = np.concatenate(encoded_chunks, axis=0)

    # create a new adata; obs rows are taken by position so they always line up with X
    obs = psi_adata.obs.iloc[positions]
    var = pd.DataFrame(index=events.to_numpy())
    adata = anndata.AnnData(X, obs=obs, var=var)
    adata.write(out_directory)

    return adata