import os
import pandas as pd
import numpy as np
import anndata
def run_psi_random(
    outrigger_psi_data: str,
    out_name: str,
    out_directory: str = "./",
    seed_num: int = 0,
) -> "anndata.AnnData":
    """
    Randomly impute missing PSI values for downstream clustering analysis.

    This function reads an AnnData ``<out_name>_PSI.h5ad`` file containing PSI
    values generated by Outrigger. Every missing value (NaN) in the PSI matrix
    is replaced with a random value drawn uniformly from [0, 1), so that the
    matrix can be fed to dimensionality reduction and clustering methods
    (e.g., PCA, Leiden) that do not accept NaNs. Observed (non-NaN) values are
    left untouched.

    The result is built from the filled matrix together with the ``obs`` and
    ``var`` tables of the input object, and is written to a new ``.h5ad`` file.

    Parameters
    ----------
    outrigger_psi_data : str
        Path to the input ``<out_name>_PSI.h5ad`` file containing PSI values
        with NaNs.
    out_name : str
        Output filename prefix (without extension). The result is saved as
        ``<out_name>_PSI_random.h5ad``.
    out_directory : str, optional
        Base directory for the output; the file is placed in its
        ``alternative_splicing`` subdirectory, which is created if needed.
        Default is the current directory ("./").
    seed_num : int, optional
        Random seed for reproducibility. Default is 0.

    Returns
    -------
    adata : anndata.AnnData
        An AnnData object with random-imputed PSI values. The file is saved to
        ``<out_directory>/alternative_splicing/<out_name>_PSI_random.h5ad``.

    Raises
    ------
    FileNotFoundError
        If ``outrigger_psi_data`` does not exist. No output directory is
        created in that case.
    """
    # Validate the input first so a wrong path does not leave an empty
    # output directory behind.
    if not os.path.exists(outrigger_psi_data):
        raise FileNotFoundError(f"Input file not found: {outrigger_psi_data}")

    final_out_dir = os.path.join(out_directory, "alternative_splicing")
    os.makedirs(final_out_dir, exist_ok=True)

    # A private legacy generator yields the same stream as
    # ``np.random.seed(seed_num)`` followed by ``np.random.uniform``, but does
    # not reseed NumPy's process-global RNG as a side effect.
    rng = np.random.RandomState(seed_num)

    _adata = anndata.read_h5ad(outrigger_psi_data)
    df_nan = _adata.to_df()
    nan_mask = df_nan.isna()

    # Draw a full matrix of candidates; only the positions flagged in
    # ``nan_mask`` are actually taken from it below.
    random_vals = pd.DataFrame(
        rng.uniform(0, 1, df_nan.shape),
        index=df_nan.index,
        columns=df_nan.columns,
    )
    df_filled = df_nan.where(~nan_mask, random_vals)

    adata = anndata.AnnData(df_filled.values, obs=_adata.obs, var=_adata.var)
    adata.write(os.path.join(final_out_dir, out_name + "_PSI_random.h5ad"))
    return adata