Source code for trackcell.tl.spatial

"""
Tools for spatial analyses.
"""

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from scipy.spatial.distance import cdist



[docs]
def hd_labeldist(adata, groupby: str, label: str, inplace: bool = True, method: str = "kdtree"):
    """
    Compute the distance from every cell to the nearest cell annotated with a specific label (10x HD data).
    
    Distances are reported both in the pixel coordinate system stored in `.obsm["spatial"]`
    (SpaceRanger target/hires layer) and in microns using the scalefactors embedded in
    `adata.uns["spatial"]`.
    
    The function automatically detects whether coordinates are in hires or full-res resolution
    by comparing the computed tissue size with the expected chip size (6.5mm for 10X HD).
    This ensures compatibility with both SpaceRanger output and bin2cell-processed data.
    
    Parameters
    ----------
    adata : sc.AnnData
        Annotated data matrix with spatial coordinates and SpaceRanger scalefactors.
    groupby : str
        Column name in `adata.obs` that contains the annotation labels.
    label : str
        Target label within `groupby` for which distances will be computed.
    inplace : bool, default True
        If True, the function adds two columns to `adata.obs`:
        `{label}_px` (pixel distance on the hires/registered image) and
        `{label}_dist` (physical distance in microns).
        If False, the function returns a dataframe with the two columns.
    method : str, default "kdtree"
        Method to use for distance computation:
        - "kdtree": Use KDTree spatial indexing (recommended, O(n log n) time, O(n) memory).
                   Best for large datasets with many cells.
        - "cdist": Use scipy's cdist function (O(n*m) time and memory, where m is number of label cells).
                   Faster for small datasets but memory-intensive for large ones.
    
    Returns
    -------
    pandas.DataFrame | None
        Returns a DataFrame with `{label}_px` and `{label}_dist` when `inplace=False`.
        Otherwise, modifies `adata.obs` in place and returns None.
    """
    if "spatial" not in adata.obsm:
        raise ValueError("`adata.obsm['spatial']` is required but missing.")
    if groupby not in adata.obs.columns:
        raise ValueError(f"`{groupby}` not found in `adata.obs`.")
    if label not in adata.obs[groupby].unique():
        raise ValueError(f"`{label}` not present in `adata.obs['{groupby}']`.")
    
    coords = adata.obsm["spatial"]
    if coords is None or len(coords) == 0:
        raise ValueError("Spatial coordinates are empty.")
    
    mask_label = (adata.obs[groupby] == label).to_numpy()
    if not mask_label.any():
        raise ValueError(f"No observations with label `{label}` were found.")
    
    coords_all = np.asarray(coords, dtype=float)
    coords_label = coords_all[mask_label]
    
    # Compute distances using selected method
    if method == "kdtree":
        # Method 1: KDTree (recommended for large datasets)
        # Memory: O(n + m), Time: O(n log m) where n=all cells, m=label cells
        tree = cKDTree(coords_label)
        dist_px, _ = tree.query(coords_all, k=1)
    elif method == "cdist":
        # Method 2: cdist (faster for small datasets, but memory-intensive)
        # Memory: O(n*m), Time: O(n*m)
        dist_matrix = cdist(coords_all, coords_label, metric='euclidean')
        dist_px = dist_matrix.min(axis=1)
    else:
        raise ValueError(f"Unknown method: {method}. Choose 'kdtree' or 'cdist'.")
    
    spatial_meta = adata.uns.get("spatial")
    if not spatial_meta:
        raise ValueError("`adata.uns['spatial']` is missing scalefactor information.")
    sample_key = next(iter(spatial_meta))
    scalefactors = spatial_meta[sample_key].get("scalefactors", {})
    
    microns_per_pixel = scalefactors.get("microns_per_pixel")
    if microns_per_pixel is None:
        raise ValueError("`microns_per_pixel` not found in scalefactors.")
    
    hires_scale = scalefactors.get("tissue_hires_scalef") or scalefactors.get("regist_target_img_scalef") or 1.0
    
    # Auto-detect coordinate resolution by comparing computed size with expected chip size
    # 10X HD chip is 6.5mm x 6.5mm, VisiumHD is similar
    # Calculate coordinate range in both possible resolutions
    coord_range = np.ptp(coords_all, axis=0)  # [max_x - min_x, max_y - min_y]
    max_range_px = np.max(coord_range)
    
    # Test both hypotheses:
    # 1. Coordinates are hires: need to divide by hires_scale
    size_um_hires = max_range_px * (microns_per_pixel / hires_scale)
    # 2. Coordinates are full-res: use directly
    size_um_fullres = max_range_px * microns_per_pixel
    
    # Expected chip size: 6.5mm = 6500um (with some tolerance for tissue coverage)
    # Typical tissue coverage is 60-90% of chip, so reasonable range is 4000-7000um
    expected_max_size_um = 7000  # Upper bound for reasonable chip size
    
    # Determine which resolution gives more reasonable results
    # If hires calculation gives reasonable size (< expected_max_size_um), coordinates are hires
    # If fullres calculation gives reasonable size, coordinates are full-res
    # If hires_scale is 1.0 or very close, assume full-res
    if hires_scale < 1.01:  # hires_scale close to 1.0 means likely full-res
        is_hires = False
    elif size_um_hires <= expected_max_size_um and size_um_fullres > expected_max_size_um:
        # hires calculation gives reasonable result, fullres doesn't
        is_hires = True
    elif size_um_fullres <= expected_max_size_um and size_um_hires > expected_max_size_um:
        # fullres calculation gives reasonable result, hires doesn't
        is_hires = False
    else:
        # Both or neither give reasonable results, prefer hires if hires_scale is significantly < 1.0
        # This handles edge cases where tissue might be larger than chip
        is_hires = (hires_scale < 0.5)  # If hires_scale is significantly < 1, likely hires coords
    
    # Calculate physical distance based on detected resolution
    if is_hires:
        dist_um = dist_px * (microns_per_pixel / hires_scale)
    else:
        dist_um = dist_px * microns_per_pixel
    
    col_px = f"{label}_px"
    col_um = f"{label}_dist"
    
    if inplace:
        adata.obs[col_px] = dist_px
        adata.obs[col_um] = dist_um
        return None
    
    return pd.DataFrame({col_px: dist_px, col_um: dist_um}, index=adata.obs.index)