import anndata
from sklearn.neighbors import kneighbors_graph
import numpy as np
import pandas as pd
import os
def run_find_neighbor(
    embedding_data: str,
    out_name: str,
    N_neighbor: int = 10,
    out_directory: str = "./",
    n_jobs: int = 20,
):
    """
    Identify nearest neighbors for each cell based on the embedding space.

    Parameters
    ----------
    embedding_data : str
        Path to the `.h5ad` file generated by the DOLPHIN model. The file must contain
        the embedding matrix `X_z` in `obsm`.
    out_name : str
        Output filename prefix.
    N_neighbor : int, optional
        Number of neighbors to find for each cell (including itself). Default is 10.
    out_directory : str, optional
        Directory where the neighbor list CSV file will be saved. It is created if it
        does not exist yet. Default is current directory.
    n_jobs : int, optional
        Number of parallel jobs forwarded to `kneighbors_graph`. Default is 20.

    Returns
    -------
    None
        Saves a CSV file named `N_<out_name>_<N_neighbor>.csv` containing two columns:
        - `main_name`: the target cell
        - `neighbor`: a neighboring cell ID from the embedding space

    Raises
    ------
    KeyError
        If the loaded file has no `X_z` entry in `obsm`.
    """
    adata = anndata.read_h5ad(embedding_data)

    try:
        embedding = adata.obsm["X_z"]
    except KeyError as err:
        raise KeyError(
            f"'X_z' not found in obsm of {embedding_data}; "
            "the neighbor search needs the DOLPHIN cell embedding."
        ) from err

    # Build the kNN connectivity graph on the cell embedding. The result is kept
    # sparse: a dense n_cells x n_cells array is never needed just to list the
    # (cell, neighbor) pairs.
    connectivity = kneighbors_graph(
        embedding,
        N_neighbor,
        mode="connectivity",
        include_self=True,
        n_jobs=n_jobs,
    ).tocsr()

    # Sort column indices within each row so the pairs are emitted row by row,
    # neighbors in ascending cell position.
    connectivity.sort_indices()

    # In CSR form, row r owns indices[indptr[r]:indptr[r + 1]]; repeating each
    # row number by its entry count yields the row index of every stored entry.
    neighbors_per_cell = np.diff(connectivity.indptr)
    row_idx = np.repeat(np.arange(connectivity.shape[0]), neighbors_per_cell)
    col_idx = connectivity.indices

    cell_ids = adata.obs.index.to_numpy()
    neighbor_table = pd.DataFrame(
        {"main_name": cell_ids[row_idx], "neighbor": cell_ids[col_idx]}
    )

    # An empty out_directory means "current directory"; os.makedirs("") would fail.
    if out_directory:
        os.makedirs(out_directory, exist_ok=True)

    out_path = os.path.join(out_directory, f"N_{out_name}_{N_neighbor}.csv")
    neighbor_table.to_csv(out_path, index=False)
    print(f"Saved neighbor list to {out_path}")