Source code for DOLPHIN.cell_reads_aggregation.find_cell_neighbor

import anndata
from sklearn.neighbors import kneighbors_graph
import numpy as np
import pandas as pd
import os

def run_find_neighbor(
    embedding_data: str,
    out_name: str,
    N_neighbor: int = 10,
    out_directory: str = "./",
    n_jobs: int = 20,
) -> None:
    """
    Identify nearest neighbors for each cell based on the embedding space.

    A k-nearest-neighbor connectivity graph is built on the `X_z` embedding
    and written out as a long-format table with one row per
    (cell, neighbor) pair. Rows are ordered by the position of the target
    cell in `obs`, then by the position of the neighbor in `obs`.

    Parameters
    ----------
    embedding_data : str
        Path to the `.h5ad` file generated by the DOLPHIN model. The file must contain
        the embedding matrix `X_z` in `obsm`.
    out_name : str
        Output filename prefix.
    N_neighbor : int, optional
        Number of neighbors to find for each cell (including itself). Default is 10.
    out_directory : str, optional
        Directory where the neighbor list CSV file will be saved. It is created
        if it does not exist. Default is current directory.
    n_jobs : int, optional
        Number of parallel jobs passed to `kneighbors_graph`. Default is 20.

    Returns
    -------
    None
        Saves a CSV file named `N_<out_name>_<N_neighbor>.csv` containing two columns:
        - `main_name`: the target cell
        - `neighbor`: a neighboring cell ID from the embedding space

    Raises
    ------
    KeyError
        If `X_z` is not present in `obsm` of the loaded file.
    """
    adata = anndata.read_h5ad(embedding_data)

    if "X_z" not in adata.obsm:
        raise KeyError(f"'X_z' embedding not found in obsm of {embedding_data}")

    # Keep the graph sparse: densifying it would allocate an
    # (n_cells x n_cells) array, which does not scale to large datasets.
    connectivity = kneighbors_graph(
        adata.obsm["X_z"],
        N_neighbor,
        mode="connectivity",
        include_self=True,
        n_jobs=n_jobs,
    )

    # (row, col) positions of every edge. Sort explicitly by row, then by
    # column, so the output order does not depend on how the sparse matrix
    # happens to store its indices.
    rows, cols = connectivity.nonzero()
    order = np.lexsort((cols, rows))
    rows, cols = rows[order], cols[order]

    # Translate integer positions into cell identifiers in one vectorized step.
    cell_ids = adata.obs.index.to_numpy()
    neighbor_table = pd.DataFrame(
        {"main_name": cell_ids[rows], "neighbor": cell_ids[cols]}
    )

    # An empty out_directory means "current directory"; makedirs("") would fail.
    if out_directory:
        os.makedirs(out_directory, exist_ok=True)
    out_path = os.path.join(out_directory, f"N_{out_name}_{N_neighbor}.csv")
    neighbor_table.to_csv(out_path, index=False)
    print(f"Saved neighbor list to {out_path}")