Source code for DOLPHIN.AS.convert_psi_to_h5ad

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import reduce
import scanpy as sc
import anndata

[docs]def run_convert_psi( metadata_path: str, outrigger_path: str, out_name: str, out_directory: str = "./" ): """ Convert Outrigger PSI output into an AnnData object for alternative splicing analysis. This function processes PSI (Percent Spliced In) values generated by Outrigger, constructs a sample (cell) by event matrix, annotates events with gene names, and returns a fully structured AnnData object for downstream analysis. Parameters ---------- metadata_path : str Path to the metadata file. Must be a tab-separated file containing a column named 'CB' for cell barcodes. outrigger_path : str Path to the directory containing Outrigger results. This directory must include: - "psi/outrigger_summary.csv": PSI values per cell and event. - "index/se/validated/events.csv" and "index/mxe/validated/events.csv": Event-to-gene mapping. out_name : str Output filename prefix (without extension). Final output will be named "<out_name>_PSI.h5ad". out_directory : str, optional Directory where the output `.h5ad` file will be saved. Default is the current directory ("./"). Returns ------- adata : anndata.AnnData A structured AnnData object with the following components: - X : np.ndarray The PSI matrix (cells by events), with PSI values for each splicing event. - obs : pandas.DataFrame Cell-level metadata, with cell barcodes and additional columns from the input metadata file. - var : pandas.DataFrame Event-level metadata with gene names The output adata will be saved as: `<out_directory>/alternative_splicing/<out_name>_PSI.h5ad`. """ final_out_dir = os.path.join(out_directory, "alternative_splicing") os.makedirs(final_out_dir, exist_ok=True) df_meta = pd.read_csv(metadata_path, sep='\t') all_sample = list(df_meta["CB"]) pd_psi_single = pd.read_csv(os.path.join(outrigger_path, "psi", "outrigger_summary.csv")) pd_psi_single["sample_id"] = pd_psi_single["sample_id"].apply(lambda x:x.split(".")[0]) d = {} for i, _srr in enumerate(tqdm(all_sample)): _temp_df = pd_psi_single[pd_psi_single["sample_id"] == _srr] _temp_df = _temp_df.rename(columns={"psi":_srr}) _temp_df = _temp_df[["event_id",_srr]] d["{0}".format(_srr)] = _temp_df df_merge_list = [] for key in d: df_merge_list.append(d[key]) df_merged = reduce(lambda left,right: pd.merge(left, right, on = "event_id", how='outer'), df_merge_list) df_merged = df_merged.set_index("event_id") df_recon = df_merged.transpose() df_obs_org = pd.merge(df_meta.set_index("CB"), df_recon, left_index=True, right_index=True) ### get AS event and it's corresponding gene name pd_mxe_event = pd.read_csv(os.path.join(outrigger_path, "index", "mxe/validated/events.csv"),low_memory=False) pd_se_event = pd.read_csv(os.path.join(outrigger_path, "index", "se/validated/events.csv"),low_memory=False) #get event dataframe pd_mxe_event["AS_event_type"] = "MXE" pd_se_event["AS_event_type"] = "SE" pd_event = pd.concat([pd_mxe_event, pd_se_event], ignore_index=True) #use gene id to replace nan gene name pd_event["isoform1_gene_name_mod"] = pd_event["isoform1_gene_name"] pd_event["isoform1_gene_name_mod"] = pd_event["isoform1_gene_name_mod"].fillna(pd_event["isoform1_gene_id"]) pd_event["isoform2_gene_name_mod"] = pd_event["isoform2_gene_name"] pd_event["isoform2_gene_name_mod"] = pd_event["isoform2_gene_name"].fillna(pd_event["isoform2_gene_id"]) #get the event id the corresponding most frequent gene name pd_event_isoform1 = pd_event[["event_id", "isoform1_gene_name_mod"]] pd_event_isoform1_freq = pd_event_isoform1.groupby(['event_id', "isoform1_gene_name_mod"], dropna=False).size().to_frame('count1').reset_index() pd_event_isoform1_freq = pd_event_isoform1_freq.sort_values(["event_id","count1"],ascending=False).groupby('event_id').head(1) pd_event_isoform2 = pd_event[["event_id", "isoform2_gene_name_mod"]] pd_event_isoform2_freq = pd_event_isoform2.groupby(['event_id', "isoform2_gene_name_mod"], dropna=False).size().to_frame('count2').reset_index() pd_event_isoform2_freq = pd_event_isoform2_freq.sort_values(["event_id","count2"],ascending=False).groupby('event_id').head(1) #merge two isoform table to get final genes per event pd_event_gene = pd.merge(pd_event_isoform1_freq, pd_event_isoform2_freq, left_on=["event_id"], right_on=["event_id"]) #remove both isoforms has nan gene_name/id pd_event_gene["gene_name"] = np.select( [(pd_event_gene["isoform1_gene_name_mod"].notna() & (pd_event_gene["isoform1_gene_name_mod"] == pd_event_gene["isoform2_gene_name_mod"])), (pd_event_gene["isoform1_gene_name_mod"].notna() & pd_event_gene["isoform2_gene_name_mod"].isna()), (pd_event_gene["isoform2_gene_name_mod"].notna() & pd_event_gene["isoform1_gene_name_mod"].isna()), (pd_event_gene["isoform1_gene_name_mod"].notna() & pd_event_gene["isoform2_gene_name_mod"].notna() & (pd_event_gene["isoform1_gene_name_mod"] != pd_event_gene["isoform2_gene_name_mod"])), (pd_event_gene["isoform2_gene_name_mod"].isna() & pd_event_gene["isoform1_gene_name_mod"].isna()) ], [pd_event_gene["isoform1_gene_name_mod"], pd_event_gene["isoform1_gene_name_mod"], pd_event_gene["isoform2_gene_name_mod"], pd_event_gene["isoform1_gene_name_mod"] + "," + pd_event_gene["isoform2_gene_name_mod"], "Empty" ] ) #remove duplicate gene names if more than one genes pd_event_gene['gene_name'] = pd_event_gene['gene_name'].apply(lambda x: (",").join(list(set(x.split(",")))) if "," in x else x) pd_event_gene = pd_event_gene[["event_id", "gene_name"]] dict_event_gene = dict(zip(pd_event_gene.event_id, pd_event_gene.gene_name)) #save original psi count table obs = df_obs_org.iloc[:, :df_meta.shape[1]-1] var_names = df_obs_org.T.index.values[df_meta.shape[1]-1:] #use gene_id as index since gene name is not unique var = pd.DataFrame(index=var_names) var["gene_name"] = var.index var = var.replace({"gene_name": dict_event_gene}) X = df_obs_org.iloc[:,df_meta.shape[1]-1:].values adata = anndata.AnnData(X, obs=obs, var=var) adata.write(os.path.join(final_out_dir, out_name+"_PSI.h5ad")) return adata