"""
@authors: Guihong Wan
@date: July 22, 2023
"""
import scanpy as sc
from sklearn.cluster import AgglomerativeClustering
[docs]
def estimateInitialDistance(adata, markers_of_interest, sampling_ratio=0.1):
    """
    Use hierarchical clustering to get the checkpoints to estimate the distance parameter
    for density-based clustering algorithms, e.g., DBSCAN.
    :param adata: the anndata object
    :param markers_of_interest: the list of marker names to subset the data
    :param sampling_ratio: the sampling ratio to subsample the data
    :returns: the list of distances
    """
    # get the subset data
    assert len(markers_of_interest) > 0, "markers_of_interest is empty?"
    condition = False
    for target_marker in markers_of_interest:
        try:
            condition = condition | (adata.obs[target_marker])
        except:
            print(target_marker, "is not found.")
            return None
    adata_tmp = adata[condition]
    assert (sampling_ratio >= 0) and (
        sampling_ratio <= 1
    ), "sampling_ratio should be [0, 1]"
    if sampling_ratio < 1:
        sc.pp.subsample(adata_tmp, fraction=sampling_ratio, random_state=42)
    X = adata_tmp.obs[["X_centroid", "Y_centroid"]].to_numpy()
    # compute the distances
    model = AgglomerativeClustering(
        distance_threshold=0, n_clusters=None, compute_distances=True
    )
    model = model.fit(X)
    print("Computing distances...")
    nsamples = X.shape[0]
    newnodes = []
    layer = 0
    distances = []
    for i, item in enumerate(model.children_):
        # print("i="+str(i)+":", item[0], item[1],"d:", model.distances_[i], "-->",  nsamples+i)
        newnodes.append(nsamples + i)
        if (item[0] in newnodes) and (item[1] in newnodes):
            # print("-------",layer, model.distances_[i])
            newnodes = []
            layer += 1
            distances.append(model.distances_[i])
    # print(min(model.distances_), max(model.distances_), np.median(model.distances_), np.mean(model.distances_))
    return distances