Source code for nuee.dissimilarity.anosim

"""
ANOSIM (Analysis of Similarities).

This module provides an implementation of ANOSIM that follows the
approach used in ``vegan::anosim``: it ranks distances, computes the
between- vs. within-group statistic, and estimates p-values by permutation.
"""

from __future__ import annotations

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from typing import Dict, Optional, Union


ArrayLike = Union[np.ndarray, pd.DataFrame, pd.Series]


def _as_numpy_distance(matrix: ArrayLike) -> np.ndarray:
    if isinstance(matrix, pd.DataFrame):
        matrix = matrix.values
    matrix = np.asarray(matrix, dtype=float)
    if matrix.ndim == 1:
        matrix = squareform(matrix)
    if matrix.shape[0] != matrix.shape[1]:
        raise ValueError("Distance matrix must be square or condensed.")
    if not np.allclose(matrix, matrix.T, atol=1e-12):
        raise ValueError("Distance matrix must be symmetric.")
    return matrix


def _prepare_grouping(grouping: ArrayLike, n: int) -> np.ndarray:
    if isinstance(grouping, pd.Series):
        grouping = grouping.values
    grouping = np.asarray(grouping)
    if grouping.ndim != 1:
        raise ValueError("Grouping vector must be one-dimensional.")
    if grouping.shape[0] != n:
        raise ValueError("Grouping vector length must match the distance matrix.")
    return grouping


[docs] def anosim(distance_matrix: ArrayLike, grouping: ArrayLike, permutations: int = 999, random_state: Optional[Union[int, np.random.Generator]] = None, **kwargs) -> Dict[str, Union[float, int]]: """ Analysis of similarities (ANOSIM). Parameters ---------- distance_matrix: Square or condensed distance matrix. grouping: Group assignments for each observation. permutations: Number of permutations used to estimate the p-value. Set to 0 to skip. random_state: Seed controlling permutation reproducibility. Returns ------- dict Dictionary with ``r_statistic``, ``p_value`` and ``permutations``. """ D = _as_numpy_distance(distance_matrix) n = D.shape[0] grouping = _prepare_grouping(grouping, n) iu = np.triu_indices(n, k=1) distances = D[iu] ranks = distances.argsort().argsort().astype(float) + 1.0 group_labels, inverse = np.unique(grouping, return_inverse=True) n_groups = len(group_labels) if n_groups < 2: raise ValueError("ANOSIM requires at least two groups.") between_values = [] within_values = [] idx = 0 for i in range(n - 1): gi = inverse[i] for j in range(i + 1, n): gj = inverse[j] if gi == gj: within_values.append(ranks[idx]) else: between_values.append(ranks[idx]) idx += 1 between_values = np.asarray(between_values, dtype=float) within_values = np.asarray(within_values, dtype=float) rb = between_values.mean() if between_values.size else 0.0 rw = within_values.mean() if within_values.size else 0.0 m = ranks.size denominator = m / 2 if m > 0 else 1.0 R_obs = (rb - rw) / denominator p_value = np.nan if permutations and permutations > 0: rng = np.random.default_rng(random_state) exceed = 0 for _ in range(permutations): permuted = rng.permutation(inverse) between_p = [] within_p = [] idx = 0 for i in range(n - 1): gi = permuted[i] for j in range(i + 1, n): gj = permuted[j] if gi == gj: within_p.append(ranks[idx]) else: between_p.append(ranks[idx]) idx += 1 between_p = np.asarray(between_p, dtype=float) within_p = np.asarray(within_p, dtype=float) denom = m / 2 if m > 0 else 1.0 rb_p = between_p.mean() if between_p.size else 0.0 rw_p = within_p.mean() if within_p.size else 0.0 R_perm = (rb_p - rw_p) / denom if R_perm >= R_obs - 1e-12: exceed += 1 p_value = (exceed + 1) / (permutations + 1) return { "r_statistic": float(R_obs), "p_value": float(p_value) if np.isfinite(p_value) else np.nan, "permutations": permutations, }