Source code for DOLPHIN.AS.convert_random_psi

import os
import pandas as pd
import numpy as np
import anndata

def run_psi_random(
    outrigger_psi_data: str,
    out_name: str,
    out_directory: str = "./",
    seed_num: int = 0,
) -> "anndata.AnnData":
    """
    Randomly impute missing PSI values for downstream clustering analysis.

    This function reads an AnnData ``<out_name>_PSI.h5ad`` file containing PSI
    values generated by Outrigger. It replaces all missing values (NaNs) in
    the PSI matrix with random values uniformly sampled between 0 and 1,
    enabling compatibility with dimensionality reduction and clustering
    methods (e.g., PCA, Leiden). The output is a new ``.h5ad`` file holding
    the imputed matrix together with the input's ``obs`` and ``var`` tables.

    Parameters
    ----------
    outrigger_psi_data : str
        Path to the input ``<out_name>_PSI.h5ad`` file containing PSI values
        with NaNs.
    out_name : str
        Output filename prefix (without extension). The result will be saved
        as ``"<out_name>_PSI_random.h5ad"``.
    out_directory : str, optional
        Directory under which the ``alternative_splicing`` output folder is
        created. Default is the current directory (``"./"``).
    seed_num : int, optional
        Random seed for reproducibility. Default is 0.

    Returns
    -------
    adata : anndata.AnnData
        An AnnData object with random-imputed PSI values. The file is saved
        to ``<out_directory>/alternative_splicing/<out_name>_PSI_random.h5ad``.

    Raises
    ------
    FileNotFoundError
        If ``outrigger_psi_data`` does not exist. No output directory is
        created in that case.
    """
    # Validate the input first so a bad path does not leave an empty
    # "alternative_splicing" directory behind.
    if not os.path.exists(outrigger_psi_data):
        raise FileNotFoundError(f"Input file not found: {outrigger_psi_data}")

    final_out_dir = os.path.join(out_directory, "alternative_splicing")
    os.makedirs(final_out_dir, exist_ok=True)

    # A local legacy generator produces the same stream as
    # np.random.seed(seed_num) + np.random.uniform, but does not reseed the
    # process-wide NumPy RNG as a side effect of calling this function.
    rng = np.random.RandomState(seed_num)

    source = anndata.read_h5ad(outrigger_psi_data)
    psi = source.to_df()
    nan_mask = psi.isna()

    # Draw a full-size matrix (not just one value per NaN) so the imputed
    # value at a given cell depends only on the seed and the matrix shape.
    random_vals = pd.DataFrame(
        rng.uniform(0, 1, psi.shape),
        index=psi.index,
        columns=psi.columns,
    )

    # Keep observed PSI values; take the random draw only where PSI was NaN.
    psi_filled = psi.where(~nan_mask, random_vals)

    # Only the matrix, obs and var are carried over into the new object.
    adata = anndata.AnnData(psi_filled.values, obs=source.obs, var=source.var)
    adata.write(os.path.join(final_out_dir, out_name + "_PSI_random.h5ad"))
    return adata