import pandas as pd
import anndata
def run_modality_ohe(
    anchor_output: str,
    adata_psi: str,
    cluster_name: str,
    out_directory: str,
) -> "anndata.AnnData":
    """
    Encode Anchor splicing modalities per cell and save them as AnnData.

    Reads the per-cluster modality calls produced by the
    [Anchor](https://github.com/yeolab/anchor) module of Expedition, maps each
    modality label to an integer code, and gives every cell the codes of its
    cluster for the events that have a PSI value in that cell.

    Note that, despite the function name, the result is a single matrix of
    integer category codes (one column per event), not one indicator column
    per category.

    Parameters
    ----------
    anchor_output : str
        Path to the CSV file with Anchor modality annotations. The first
        column is read as the index (event identifiers, matched against the
        columns of the PSI matrix); every other column is treated as one
        cluster.
    adata_psi : str
        Path to the .h5ad file containing the PSI matrix and cell metadata.
    cluster_name : str
        Column in ``adata_psi.obs`` holding the cluster label of each cell.
        Labels are compared against the column names of the Anchor CSV; cells
        whose label matches no CSV column are left out of the result.
    out_directory : str
        Destination passed unchanged to ``AnnData.write``, i.e. the path of
        the output .h5ad file.

    Returns
    -------
    adata : anndata.AnnData
        Cells x events matrix of modality codes: 0 = none (no PSI value in
        that cell), 1 = uncategorized, 2 = included, 3 = excluded,
        4 = middle, 5 = bimodal, and 99 where a PSI value exists but the CSV
        holds no modality for that event in the cell's cluster. Cells are
        ordered cluster by cluster, following the column order of the Anchor
        CSV. ``obs`` is taken from the input AnnData; ``var`` only carries
        the event identifiers as its index. The same object is also written
        to ``out_directory``.
    """
    # Integer code assigned to each modality category.
    modality_codes = {
        "none": 0,
        "uncategorized": 1,
        "included": 2,
        "excluded": 3,
        "middle": 4,
        "bimodal": 5,
    }
    # Temporary marker for "PSI measured, modality not assigned yet". Entries
    # that never receive a modality keep this value in the output.
    measured_marker = 99

    df_mod = pd.read_csv(anchor_output, index_col=0)
    df_mod_num = df_mod.replace(modality_codes)

    adata_psi = anndata.read_h5ad(adata_psi)
    psi = adata_psi.to_df()
    # Keep NaN where PSI is missing; flag every measured entry with the marker.
    marked = psi.where(psi.isna(), measured_marker)
    # Plain Python objects, so the comparison with the CSV column names works
    # the same way for categorical and non-categorical label columns.
    cell_clusters = adata_psi.obs[cluster_name].astype(object)

    encoded_parts = []
    for cluster in df_mod.columns:
        print(cluster)
        in_cluster = (cell_clusters == cluster).to_numpy()
        part = marked[in_cluster]

        # event -> code, restricted to events Anchor classified in this cluster
        event_codes = df_mod_num[cluster].dropna().to_dict()
        if event_codes:
            # A single column-wise replace per cluster instead of one
            # full-frame replace per event; events that are not columns of
            # the PSI matrix are ignored by pandas.
            part = part.replace(
                {
                    event: {measured_marker: code}
                    for event, code in event_codes.items()
                }
            )

        # Whatever is still NaN had no PSI value in that cell.
        encoded_parts.append(part.fillna(modality_codes["none"]))

    if encoded_parts:
        encoded = pd.concat(encoded_parts)
    else:
        # Anchor CSV without any cluster column: produce an empty result
        # rather than failing further down.
        encoded = marked.iloc[:0]

    # Build the new AnnData; var_names are the column labels of the PSI matrix.
    obs = adata_psi.obs.loc[encoded.index]
    var = pd.DataFrame(index=encoded.columns.values)
    adata = anndata.AnnData(encoded.values, obs=obs, var=var)
    adata.write(out_directory)
    return adata