Source code for trackcell.tl.spatial
"""
Tools for spatial analyses.
"""
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from scipy.spatial.distance import cdist
[docs]
def hd_labeldist(adata, groupby: str, label: str, inplace: bool = True, method: str = "kdtree"):
"""
Compute the distance from every cell to the nearest cell annotated with a specific label (10x HD data).
Distances are reported both in the pixel coordinate system stored in `.obsm["spatial"]`
(SpaceRanger target/hires layer) and in microns using the scalefactors embedded in
`adata.uns["spatial"]`.
The function automatically detects whether coordinates are in hires or full-res resolution
by comparing the computed tissue size with the expected chip size (6.5mm for 10X HD).
This ensures compatibility with both SpaceRanger output and bin2cell-processed data.
Parameters
----------
adata : sc.AnnData
Annotated data matrix with spatial coordinates and SpaceRanger scalefactors.
groupby : str
Column name in `adata.obs` that contains the annotation labels.
label : str
Target label within `groupby` for which distances will be computed.
inplace : bool, default True
If True, the function adds two columns to `adata.obs`:
`{label}_px` (pixel distance on the hires/registered image) and
`{label}_dist` (physical distance in microns).
If False, the function returns a dataframe with the two columns.
method : str, default "kdtree"
Method to use for distance computation:
- "kdtree": Use KDTree spatial indexing (recommended, O(n log n) time, O(n) memory).
Best for large datasets with many cells.
- "cdist": Use scipy's cdist function (O(n*m) time and memory, where m is number of label cells).
Faster for small datasets but memory-intensive for large ones.
Returns
-------
pandas.DataFrame | None
Returns a DataFrame with `{label}_px` and `{label}_dist` when `inplace=False`.
Otherwise, modifies `adata.obs` in place and returns None.
"""
if "spatial" not in adata.obsm:
raise ValueError("`adata.obsm['spatial']` is required but missing.")
if groupby not in adata.obs.columns:
raise ValueError(f"`{groupby}` not found in `adata.obs`.")
if label not in adata.obs[groupby].unique():
raise ValueError(f"`{label}` not present in `adata.obs['{groupby}']`.")
coords = adata.obsm["spatial"]
if coords is None or len(coords) == 0:
raise ValueError("Spatial coordinates are empty.")
mask_label = (adata.obs[groupby] == label).to_numpy()
if not mask_label.any():
raise ValueError(f"No observations with label `{label}` were found.")
coords_all = np.asarray(coords, dtype=float)
coords_label = coords_all[mask_label]
# Compute distances using selected method
if method == "kdtree":
# Method 1: KDTree (recommended for large datasets)
# Memory: O(n + m), Time: O(n log m) where n=all cells, m=label cells
tree = cKDTree(coords_label)
dist_px, _ = tree.query(coords_all, k=1)
elif method == "cdist":
# Method 2: cdist (faster for small datasets, but memory-intensive)
# Memory: O(n*m), Time: O(n*m)
dist_matrix = cdist(coords_all, coords_label, metric='euclidean')
dist_px = dist_matrix.min(axis=1)
else:
raise ValueError(f"Unknown method: {method}. Choose 'kdtree' or 'cdist'.")
spatial_meta = adata.uns.get("spatial")
if not spatial_meta:
raise ValueError("`adata.uns['spatial']` is missing scalefactor information.")
sample_key = next(iter(spatial_meta))
scalefactors = spatial_meta[sample_key].get("scalefactors", {})
microns_per_pixel = scalefactors.get("microns_per_pixel")
if microns_per_pixel is None:
raise ValueError("`microns_per_pixel` not found in scalefactors.")
hires_scale = scalefactors.get("tissue_hires_scalef") or scalefactors.get("regist_target_img_scalef") or 1.0
# Auto-detect coordinate resolution by comparing computed size with expected chip size
# 10X HD chip is 6.5mm x 6.5mm, VisiumHD is similar
# Calculate coordinate range in both possible resolutions
coord_range = np.ptp(coords_all, axis=0) # [max_x - min_x, max_y - min_y]
max_range_px = np.max(coord_range)
# Test both hypotheses:
# 1. Coordinates are hires: need to divide by hires_scale
size_um_hires = max_range_px * (microns_per_pixel / hires_scale)
# 2. Coordinates are full-res: use directly
size_um_fullres = max_range_px * microns_per_pixel
# Expected chip size: 6.5mm = 6500um (with some tolerance for tissue coverage)
# Typical tissue coverage is 60-90% of chip, so reasonable range is 4000-7000um
expected_max_size_um = 7000 # Upper bound for reasonable chip size
# Determine which resolution gives more reasonable results
# If hires calculation gives reasonable size (< expected_max_size_um), coordinates are hires
# If fullres calculation gives reasonable size, coordinates are full-res
# If hires_scale is 1.0 or very close, assume full-res
if hires_scale < 1.01: # hires_scale close to 1.0 means likely full-res
is_hires = False
elif size_um_hires <= expected_max_size_um and size_um_fullres > expected_max_size_um:
# hires calculation gives reasonable result, fullres doesn't
is_hires = True
elif size_um_fullres <= expected_max_size_um and size_um_hires > expected_max_size_um:
# fullres calculation gives reasonable result, hires doesn't
is_hires = False
else:
# Both or neither give reasonable results, prefer hires if hires_scale is significantly < 1.0
# This handles edge cases where tissue might be larger than chip
is_hires = (hires_scale < 0.5) # If hires_scale is significantly < 1, likely hires coords
# Calculate physical distance based on detected resolution
if is_hires:
dist_um = dist_px * (microns_per_pixel / hires_scale)
else:
dist_um = dist_px * microns_per_pixel
col_px = f"{label}_px"
col_um = f"{label}_dist"
if inplace:
adata.obs[col_px] = dist_px
adata.obs[col_um] = dist_um
return None
return pd.DataFrame({col_px: dist_px, col_um: dist_um}, index=adata.obs.index)