import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import reduce
import scanpy as sc
import anndata
[docs]def run_convert_psi(
metadata_path: str,
outrigger_path: str,
out_name: str,
out_directory: str = "./"
):
"""
Convert Outrigger PSI output into an AnnData object for alternative splicing analysis.
This function processes PSI (Percent Spliced In) values generated by Outrigger,
constructs a sample (cell) by event matrix, annotates events with gene names, and returns a fully structured AnnData object
for downstream analysis.
Parameters
----------
metadata_path : str
Path to the metadata file. Must be a tab-separated file containing a column named 'CB' for cell barcodes.
outrigger_path : str
Path to the directory containing Outrigger results. This directory must include:
- "psi/outrigger_summary.csv": PSI values per cell and event.
- "index/se/validated/events.csv" and "index/mxe/validated/events.csv": Event-to-gene mapping.
out_name : str
Output filename prefix (without extension). Final output will be named "<out_name>_PSI.h5ad".
out_directory : str, optional
Directory where the output `.h5ad` file will be saved. Default is the current directory ("./").
Returns
-------
adata : anndata.AnnData
A structured AnnData object with the following components:
- X : np.ndarray
The PSI matrix (cells by events), with PSI values for each splicing event.
- obs : pandas.DataFrame
Cell-level metadata, with cell barcodes and additional columns from the input metadata file.
- var : pandas.DataFrame
Event-level metadata with gene names
The output adata will be saved as: `<out_directory>/alternative_splicing/<out_name>_PSI.h5ad`.
"""
final_out_dir = os.path.join(out_directory, "alternative_splicing")
os.makedirs(final_out_dir, exist_ok=True)
df_meta = pd.read_csv(metadata_path, sep='\t')
all_sample = list(df_meta["CB"])
pd_psi_single = pd.read_csv(os.path.join(outrigger_path, "psi", "outrigger_summary.csv"))
pd_psi_single["sample_id"] = pd_psi_single["sample_id"].apply(lambda x:x.split(".")[0])
d = {}
for i, _srr in enumerate(tqdm(all_sample)):
_temp_df = pd_psi_single[pd_psi_single["sample_id"] == _srr]
_temp_df = _temp_df.rename(columns={"psi":_srr})
_temp_df = _temp_df[["event_id",_srr]]
d["{0}".format(_srr)] = _temp_df
df_merge_list = []
for key in d:
df_merge_list.append(d[key])
df_merged = reduce(lambda left,right: pd.merge(left, right, on = "event_id", how='outer'), df_merge_list)
df_merged = df_merged.set_index("event_id")
df_recon = df_merged.transpose()
df_obs_org = pd.merge(df_meta.set_index("CB"), df_recon, left_index=True, right_index=True)
### get AS event and it's corresponding gene name
pd_mxe_event = pd.read_csv(os.path.join(outrigger_path, "index", "mxe/validated/events.csv"),low_memory=False)
pd_se_event = pd.read_csv(os.path.join(outrigger_path, "index", "se/validated/events.csv"),low_memory=False)
#get event dataframe
pd_mxe_event["AS_event_type"] = "MXE"
pd_se_event["AS_event_type"] = "SE"
pd_event = pd.concat([pd_mxe_event, pd_se_event], ignore_index=True)
#use gene id to replace nan gene name
pd_event["isoform1_gene_name_mod"] = pd_event["isoform1_gene_name"]
pd_event["isoform1_gene_name_mod"] = pd_event["isoform1_gene_name_mod"].fillna(pd_event["isoform1_gene_id"])
pd_event["isoform2_gene_name_mod"] = pd_event["isoform2_gene_name"]
pd_event["isoform2_gene_name_mod"] = pd_event["isoform2_gene_name"].fillna(pd_event["isoform2_gene_id"])
#get the event id the corresponding most frequent gene name
pd_event_isoform1 = pd_event[["event_id", "isoform1_gene_name_mod"]]
pd_event_isoform1_freq = pd_event_isoform1.groupby(['event_id', "isoform1_gene_name_mod"], dropna=False).size().to_frame('count1').reset_index()
pd_event_isoform1_freq = pd_event_isoform1_freq.sort_values(["event_id","count1"],ascending=False).groupby('event_id').head(1)
pd_event_isoform2 = pd_event[["event_id", "isoform2_gene_name_mod"]]
pd_event_isoform2_freq = pd_event_isoform2.groupby(['event_id', "isoform2_gene_name_mod"], dropna=False).size().to_frame('count2').reset_index()
pd_event_isoform2_freq = pd_event_isoform2_freq.sort_values(["event_id","count2"],ascending=False).groupby('event_id').head(1)
#merge two isoform table to get final genes per event
pd_event_gene = pd.merge(pd_event_isoform1_freq, pd_event_isoform2_freq, left_on=["event_id"], right_on=["event_id"])
#remove both isoforms has nan gene_name/id
pd_event_gene["gene_name"] = np.select(
[(pd_event_gene["isoform1_gene_name_mod"].notna() & (pd_event_gene["isoform1_gene_name_mod"] == pd_event_gene["isoform2_gene_name_mod"])),
(pd_event_gene["isoform1_gene_name_mod"].notna() & pd_event_gene["isoform2_gene_name_mod"].isna()),
(pd_event_gene["isoform2_gene_name_mod"].notna() & pd_event_gene["isoform1_gene_name_mod"].isna()),
(pd_event_gene["isoform1_gene_name_mod"].notna() & pd_event_gene["isoform2_gene_name_mod"].notna() & (pd_event_gene["isoform1_gene_name_mod"] != pd_event_gene["isoform2_gene_name_mod"])),
(pd_event_gene["isoform2_gene_name_mod"].isna() & pd_event_gene["isoform1_gene_name_mod"].isna())
],
[pd_event_gene["isoform1_gene_name_mod"],
pd_event_gene["isoform1_gene_name_mod"],
pd_event_gene["isoform2_gene_name_mod"],
pd_event_gene["isoform1_gene_name_mod"] + "," + pd_event_gene["isoform2_gene_name_mod"],
"Empty"
]
)
#remove duplicate gene names if more than one genes
pd_event_gene['gene_name'] = pd_event_gene['gene_name'].apply(lambda x: (",").join(list(set(x.split(",")))) if "," in x else x)
pd_event_gene = pd_event_gene[["event_id", "gene_name"]]
dict_event_gene = dict(zip(pd_event_gene.event_id, pd_event_gene.gene_name))
#save original psi count table
obs = df_obs_org.iloc[:, :df_meta.shape[1]-1]
var_names = df_obs_org.T.index.values[df_meta.shape[1]-1:] #use gene_id as index since gene name is not unique
var = pd.DataFrame(index=var_names)
var["gene_name"] = var.index
var = var.replace({"gene_name": dict_event_gene})
X = df_obs_org.iloc[:,df_meta.shape[1]-1:].values
adata = anndata.AnnData(X, obs=obs, var=var)
adata.write(os.path.join(final_out_dir, out_name+"_PSI.h5ad"))
return adata