# Source code for xrspatial.kde

"""Kernel density estimation (KDE) for point-to-raster conversion.

Produces a continuous density surface from point (or line) data.  Each
output pixel accumulates weighted kernel contributions from all nearby
features, yielding a smooth density field.

Supports numpy, cupy, dask+numpy, and dask+cupy backends.
"""
from __future__ import annotations

import math
from math import pi, sqrt
from functools import partial
from typing import Optional, Tuple, Union

import numpy as np
import xarray as xr

from xrspatial.utils import ngjit, has_cuda_and_cupy, has_dask_array
from xrspatial.interpolate._vector import (
    is_geodataframe, points_from_geodataframe,
)

try:
    import cupy
except ImportError:
    cupy = None

try:
    import dask
    import dask.array as da
except ImportError:
    da = None

from numba import cuda


# ---------------------------------------------------------------------------
# Kernel constants
# ---------------------------------------------------------------------------
_KERNEL_NAMES = ('gaussian', 'epanechnikov', 'quartic')

# Normalisation constants for 2-D radially symmetric kernels, written in
# terms of the scaled squared distance u^2 = (dx^2 + dy^2) / bw^2.  Each
# constant makes the kernel integrate to 1 over the plane (before the
# additional 1/bw^2 factor applied by the compute kernels).
# Gaussian     : 1/(2*pi)  for exp(-u^2 / 2)
# Epanechnikov : 2/pi      for (1 - u^2) on the unit disk
# Quartic      : 3/pi      for (1 - u^2)^2 on the unit disk
# NOTE: the jitted CPU/CUDA kernels in this file inline the same factors
# rather than reading these names.
_NORM_GAUSSIAN = 1.0 / (2.0 * pi)
_NORM_EPANECHNIKOV = 2.0 / pi
_NORM_QUARTIC = 3.0 / pi


# ---------------------------------------------------------------------------
# Memory guard
# ---------------------------------------------------------------------------

def _available_memory_bytes():
    """Return a best-effort estimate of the available memory, in bytes.

    Sources are tried in order and the first one that works wins:

    1. the ``MemAvailable:`` entry of ``/proc/meminfo`` (value in kiB),
    2. ``psutil.virtual_memory().available`` when psutil is importable,
    3. a fixed 2 GiB fallback.
    """
    # 1) Linux: parse /proc/meminfo.  Any read/parse problem just moves
    #    on to the next source.
    try:
        with open('/proc/meminfo', 'r') as meminfo:
            for entry in meminfo:
                if entry.startswith('MemAvailable:'):
                    kib = int(entry.split()[1])
                    return kib * 1024
    except (OSError, ValueError, IndexError):
        pass

    # 2) psutil, if installed.
    try:
        import psutil
        return psutil.virtual_memory().available
    except (ImportError, AttributeError):
        pass

    # 3) Nothing worked: assume 2 GiB.
    return 2 * 1024 ** 3


def _check_grid_memory(rows, cols):
    """Guard against allocating an output grid that cannot fit in memory.

    A single ``(rows, cols)`` float64 buffer costs ``rows * cols * 8``
    bytes.  If that exceeds half of the memory reported by
    ``_available_memory_bytes()``, a ``MemoryError`` with an actionable
    message is raised instead of letting the allocation itself fail.

    Raises
    ------
    MemoryError
        When the grid would need more than 50 % of available memory.
    """
    needed_bytes = int(rows) * int(cols) * 8
    free_bytes = _available_memory_bytes()
    if needed_bytes <= 0.5 * free_bytes:
        return
    raise MemoryError(
        f"output grid of {rows}x{cols} float64 needs "
        f"~{needed_bytes / 1e9:.1f} GB, but only "
        f"{free_bytes / 1e9:.1f} GB is available. "
        f"Use smaller width/height or pass a dask-backed template."
    )


# ---------------------------------------------------------------------------
# Bandwidth selection
# ---------------------------------------------------------------------------

def _silverman_bandwidth(x, y):
    """Estimate a 2-D bandwidth with Silverman's rule of thumb.

    The result is ``n ** (-1/6) * (scale_x + scale_y) / 2`` where each
    per-axis scale is ``min(std, IQR / 1.34)``, falling back to the plain
    standard deviation when the IQR is not positive.

    Fewer than two points yields ``1.0``; a zero combined scale is
    replaced by ``1.0`` so the bandwidth is never zero.
    """
    count = len(x)
    if count < 2:
        return 1.0

    scales = []
    for axis_values in (x, y):
        std = float(np.std(axis_values))
        upper = float(np.percentile(axis_values, 75))
        lower = float(np.percentile(axis_values, 25))
        scaled_iqr = (upper - lower) / 1.34
        # Only trust the IQR-based estimate when it is strictly positive.
        scales.append(min(std, scaled_iqr) if scaled_iqr > 0 else std)

    spread = (scales[0] + scales[1]) / 2.0
    if spread == 0:
        spread = 1.0
    return spread * (count ** (-1.0 / 6.0))


# ---------------------------------------------------------------------------
# CPU kernels (numba-jitted)
# ---------------------------------------------------------------------------

@ngjit
def _kde_cpu(xs, ys, ws, out, x0, y0, dx, dy, bw, kernel_id):
    """Accumulate kernel density values into *out* (in place).

    Parameters
    ----------
    xs, ys : 1-D float64 arrays of point coordinates.
    ws : 1-D float64 array of weights (same length as xs).
    out : 2-D float64 output array (rows, cols), pre-zeroed.
    x0, y0 : coordinates of the centre of pixel ``out[0, 0]``.
    dx, dy : pixel spacing in x and y (may be negative for descending
        coordinates).
    bw : bandwidth (same units as coordinates).
    kernel_id : 0 = gaussian, 1 = epanechnikov, 2 = quartic.

    Notes
    -----
    Each point only touches the pixels whose centre lies within
    ``cutoff`` of the point along each axis, where ``cutoff`` is
    ``4 * bw`` for the Gaussian and ``bw`` for the compact kernels.  The
    window bounds use ``ceil``/``floor`` rather than ``int()``: ``int()``
    truncates toward zero, which lets a point whose window ends at a
    fractional index in (-1, 0) spill onto pixel 0 and makes the result
    depend on where the grid (or a dask tile) happens to start.
    """
    rows, cols = out.shape
    n_pts = xs.shape[0]
    inv_bw = 1.0 / bw
    inv_bw2 = inv_bw * inv_bw

    # Pre-compute normalisation (kernel constant times 1/bw^2).
    if kernel_id == 0:
        norm = inv_bw2 / (2.0 * pi)
    elif kernel_id == 1:
        norm = inv_bw2 * 2.0 / pi
    else:
        norm = inv_bw2 * 3.0 / pi

    # Cutoff distance in coordinate units.
    # Gaussian uses 4*bw; compact kernels use exactly bw.
    if kernel_id == 0:
        cutoff = 4.0 * bw
    else:
        cutoff = bw

    for p in range(n_pts):
        px = xs[p]
        py = ys[p]
        w = ws[p]

        # Fractional pixel indices of the two window edges.  With
        # negative spacing the edges swap, hence the min/max below.
        c_a = (px - cutoff - x0) / dx
        c_b = (px + cutoff - x0) / dx
        col_lo = max(0, int(math.ceil(min(c_a, c_b))))
        col_hi = min(cols - 1, int(math.floor(max(c_a, c_b)))) + 1
        r_a = (py - cutoff - y0) / dy
        r_b = (py + cutoff - y0) / dy
        row_lo = max(0, int(math.ceil(min(r_a, r_b))))
        row_hi = min(rows - 1, int(math.floor(max(r_a, r_b)))) + 1

        for r in range(row_lo, row_hi):
            cy = y0 + r * dy
            uy = (cy - py) * inv_bw
            uy2 = uy * uy
            for c in range(col_lo, col_hi):
                cx = x0 + c * dx
                ux = (cx - px) * inv_bw
                u2 = ux * ux + uy2

                if kernel_id == 0:
                    # Gaussian
                    val = norm * w * np.exp(-0.5 * u2)
                elif kernel_id == 1:
                    # Epanechnikov
                    if u2 <= 1.0:
                        val = norm * w * (1.0 - u2)
                    else:
                        val = 0.0
                else:
                    # Quartic
                    if u2 <= 1.0:
                        t = 1.0 - u2
                        val = norm * w * t * t
                    else:
                        val = 0.0

                out[r, c] += val


@ngjit
def _line_density_cpu(x1s, y1s, x2s, y2s, ws, out,
                      x0, y0, dx, dy, bw, kernel_id):
    """Accumulate line density into *out* (in place).

    Each line segment is sampled at sub-segment intervals (step = bw/4,
    at least one sample per segment) and each sample acts as a weighted
    point whose weight is the segment weight times the sub-segment
    length.

    Parameters
    ----------
    x1s, y1s, x2s, y2s : 1-D float64 arrays of segment endpoints.
    ws : 1-D float64 array of per-segment weights.
    out : 2-D float64 output array (rows, cols), pre-zeroed.
    x0, y0 : coordinates of the centre of pixel ``out[0, 0]``.
    dx, dy : pixel spacing in x and y (may be negative).
    bw : bandwidth (same units as coordinates).
    kernel_id : 0 = gaussian, 1 = epanechnikov, 2 = quartic.

    Notes
    -----
    Segments shorter than 1e-12 are skipped, and nothing is written at
    all when ``bw / 4`` is below 1e-12.  The per-sample pixel window uses
    ``ceil``/``floor`` rather than ``int()`` truncation, so a sample whose
    window ends at a fractional index in (-1, 0) does not spill onto
    pixel 0.
    """
    rows, cols = out.shape
    n_segs = x1s.shape[0]
    inv_bw = 1.0 / bw
    inv_bw2 = inv_bw * inv_bw

    # Kernel constant times 1/bw^2.
    if kernel_id == 0:
        norm = inv_bw2 / (2.0 * pi)
    elif kernel_id == 1:
        norm = inv_bw2 * 2.0 / pi
    else:
        norm = inv_bw2 * 3.0 / pi

    # Cutoff distance in coordinate units.
    if kernel_id == 0:
        cutoff = 4.0 * bw
    else:
        cutoff = bw

    step = bw / 4.0
    if step < 1e-12:
        return

    for s in range(n_segs):
        ax = x1s[s]
        ay = y1s[s]
        bx = x2s[s]
        by = y2s[s]
        seg_len = sqrt((bx - ax) ** 2 + (by - ay) ** 2)
        if seg_len < 1e-12:
            continue

        w = ws[s]
        n_steps = max(1, int(seg_len / step))
        # Each sample carries the length of the sub-segment it stands for.
        sub_w = w * (seg_len / n_steps)

        for i in range(n_steps):
            # Sample at the midpoint of the i-th sub-segment.
            t = (i + 0.5) / n_steps
            px = ax + t * (bx - ax)
            py = ay + t * (by - ay)

            # Fractional pixel indices of the window edges; min/max keeps
            # lo <= hi when the spacing is negative.
            c_a = (px - cutoff - x0) / dx
            c_b = (px + cutoff - x0) / dx
            col_lo = max(0, int(math.ceil(min(c_a, c_b))))
            col_hi = min(cols - 1, int(math.floor(max(c_a, c_b)))) + 1
            r_a = (py - cutoff - y0) / dy
            r_b = (py + cutoff - y0) / dy
            row_lo = max(0, int(math.ceil(min(r_a, r_b))))
            row_hi = min(rows - 1, int(math.floor(max(r_a, r_b)))) + 1

            for r in range(row_lo, row_hi):
                cy = y0 + r * dy
                uy = (cy - py) * inv_bw
                uy2 = uy * uy
                for c in range(col_lo, col_hi):
                    cx = x0 + c * dx
                    ux = (cx - px) * inv_bw
                    u2 = ux * ux + uy2

                    if kernel_id == 0:
                        val = norm * sub_w * np.exp(-0.5 * u2)
                    elif kernel_id == 1:
                        if u2 <= 1.0:
                            val = norm * sub_w * (1.0 - u2)
                        else:
                            val = 0.0
                    else:
                        if u2 <= 1.0:
                            tt = 1.0 - u2
                            val = norm * sub_w * tt * tt
                        else:
                            val = 0.0

                    out[r, c] += val


# ---------------------------------------------------------------------------
# GPU kernels (CUDA)
# ---------------------------------------------------------------------------

@cuda.jit
def _kde_cuda(xs, ys, ws, out, x0, y0, dx, dy, bw, kernel_id, n_pts):
    """CUDA kernel: one thread sums all point contributions for a pixel.

    ``x0``, ``y0``, ``dx``, ``dy``, ``bw``, ``kernel_id`` and ``n_pts``
    are read at index 0, i.e. scalars are passed as arrays.  The thread
    for pixel ``(row, col)`` loops over every point and writes the total
    to ``out[row, col]``.
    """
    row, col = cuda.grid(2)
    # Threads beyond the array bounds have no pixel to write.
    if row >= out.shape[0] or col >= out.shape[1]:
        return

    kernel_code = kernel_id[0]
    bw_recip = 1.0 / bw[0]
    bw_recip_sq = bw_recip * bw_recip

    # Kernel constant times 1/bw^2.
    if kernel_code == 0:
        scale = bw_recip_sq / (2.0 * 3.141592653589793)
    elif kernel_code == 1:
        scale = bw_recip_sq * 2.0 / 3.141592653589793
    else:
        scale = bw_recip_sq * 3.0 / 3.141592653589793

    # Centre of this thread's pixel.
    pixel_x = x0[0] + col * dx[0]
    pixel_y = y0[0] + row * dy[0]

    acc = 0.0
    for idx in range(n_pts[0]):
        off_x = (pixel_x - xs[idx]) * bw_recip
        off_y = (pixel_y - ys[idx]) * bw_recip
        dist_sq = off_x * off_x + off_y * off_y

        if kernel_code == 0:
            # Gaussian: no hard cutoff, every point contributes.
            acc += scale * ws[idx] * math.exp(-0.5 * dist_sq)
        elif dist_sq <= 1.0:
            # Compact kernels only contribute inside the unit disk.
            if kernel_code == 1:
                acc += scale * ws[idx] * (1.0 - dist_sq)
            else:
                falloff = 1.0 - dist_sq
                acc += scale * ws[idx] * falloff * falloff

    out[row, col] = acc


# ---------------------------------------------------------------------------
# Backend wrappers
# ---------------------------------------------------------------------------

def _kernel_id(kernel: str) -> int:
    """Map a kernel name to the integer code used by the compute kernels.

    ``'gaussian'`` -> 0, ``'epanechnikov'`` -> 1, ``'quartic'`` -> 2.

    Raises
    ------
    ValueError
        If *kernel* is not one of the three supported names.
    """
    for code, label in enumerate(('gaussian', 'epanechnikov', 'quartic')):
        if kernel == label:
            return code
    raise ValueError(
        f"kernel must be one of {_KERNEL_NAMES}, got {kernel!r}"
    )


def _run_kde_numpy(xs, ys, ws, shape, x0, y0, dx, dy, bw, kernel_id):
    """Eager NumPy backend: allocate a zeroed float64 grid of *shape*,
    let ``_kde_cpu`` fill it in place, and return it."""
    grid = np.zeros(shape, dtype=np.float64)
    _kde_cpu(xs, ys, ws, grid, x0, y0, dx, dy, bw, kernel_id)
    return grid


def _run_kde_cupy(xs, ys, ws, shape, x0, y0, dx, dy, bw, kernel_id):
    """Eager CuPy backend: launch ``_kde_cuda`` over a grid of *shape*.

    Scalars are wrapped in one-element device arrays because the CUDA
    kernel reads them at index 0.  Point arrays are moved to the device
    as float64.  Returns the device-side float64 result array.
    """
    f64 = cupy.float64
    i64 = cupy.int64

    grid = cupy.zeros(shape, dtype=f64)
    count_dev = cupy.array([len(xs)], dtype=i64)
    x0_dev = cupy.array([x0], dtype=f64)
    y0_dev = cupy.array([y0], dtype=f64)
    dx_dev = cupy.array([dx], dtype=f64)
    dy_dev = cupy.array([dy], dtype=f64)
    bw_dev = cupy.array([bw], dtype=f64)
    kernel_dev = cupy.array([kernel_id], dtype=i64)

    xs_dev = cupy.asarray(xs, dtype=f64)
    ys_dev = cupy.asarray(ys, dtype=f64)
    ws_dev = cupy.asarray(ws, dtype=f64)

    # 16x16 threads per block; enough blocks to cover every pixel
    # (ceiling division).
    threads = (16, 16)
    blocks = (
        -(-shape[0] // threads[0]),
        -(-shape[1] // threads[1]),
    )
    _kde_cuda[blocks, threads](
        xs_dev, ys_dev, ws_dev, grid,
        x0_dev, y0_dev, dx_dev, dy_dev, bw_dev, kernel_dev, count_dev,
    )
    return grid


def _filter_points_to_tile(xs, ys, ws, tile_x0, tile_y0, dx, dy,
                           tile_rows, tile_cols, cutoff):
    """Keep only the points that can contribute to one tile.

    The tile extent is widened by *cutoff* on every side and points
    outside that box are discarded.  Returns ``(xs, ys, ws)``; when no
    point is dropped the input arrays are returned unchanged (no copy).
    """
    # The far edges overshoot the last pixel centre by one spacing, which
    # only makes the filter more conservative.  dx/dy may be negative
    # (descending coordinates), so each pair of edges is ordered with
    # min/max before being widened by the cutoff.
    x_edges = (tile_x0, tile_x0 + tile_cols * dx)
    y_edges = (tile_y0, tile_y0 + tile_rows * dy)
    x_min = min(x_edges) - cutoff
    x_max = max(x_edges) + cutoff
    y_min = min(y_edges) - cutoff
    y_max = max(y_edges) + cutoff

    inside_x = (xs >= x_min) & (xs <= x_max)
    inside_y = (ys >= y_min) & (ys <= y_max)
    keep = inside_x & inside_y
    if keep.all():
        return xs, ys, ws
    return xs[keep], ys[keep], ws[keep]


def _run_kde_dask_numpy(xs, ys, ws, shape, x0, y0, dx, dy, bw, kernel_id,
                        chunks):
    """Dask+NumPy backend: build one delayed ``_run_kde_numpy`` per tile.

    The output grid is cut into tiles of at most ``chunks`` pixels
    (default: up to 256 x 256).  For each tile the points are filtered
    to those within the kernel cutoff of the tile, so every task only
    receives the points that can affect it.  The tiles are assembled
    with ``da.block`` and nothing is computed until the array is
    evaluated.
    """
    if chunks is None:
        chunks = (min(256, shape[0]), min(256, shape[1]))
    row_sizes = _split_sizes(shape[0], chunks[0])
    col_sizes = _split_sizes(shape[1], chunks[1])

    # Same cutoff as the CPU kernel: 4*bw for gaussian, bw otherwise.
    if kernel_id == 0:
        reach = 4.0 * bw
    else:
        reach = bw

    tile_grid = []
    first_row = 0
    for n_rows in row_sizes:
        origin_y = y0 + first_row * dy
        strip = []
        first_col = 0
        for n_cols in col_sizes:
            origin_x = x0 + first_col * dx
            extent = (n_rows, n_cols)
            sub_x, sub_y, sub_w = _filter_points_to_tile(
                xs, ys, ws, origin_x, origin_y, dx, dy,
                n_rows, n_cols, reach)
            lazy_tile = dask.delayed(_run_kde_numpy)(
                sub_x, sub_y, sub_w, extent,
                origin_x, origin_y, dx, dy, bw, kernel_id,
            )
            strip.append(
                da.from_delayed(lazy_tile, shape=extent, dtype=np.float64)
            )
            first_col += n_cols
        tile_grid.append(strip)
        first_row += n_rows

    return da.block(tile_grid)


def _run_kde_dask_cupy(xs, ys, ws, shape, x0, y0, dx, dy, bw, kernel_id,
                       chunks):
    """Dask+CuPy backend: build one delayed ``_run_kde_cupy`` per tile.

    Tiling and per-tile point filtering are the same as in the
    dask+numpy path; each tile is declared to dask with a CuPy ``meta``
    so the resulting array is treated as CuPy-backed.
    """
    if chunks is None:
        chunks = (min(256, shape[0]), min(256, shape[1]))
    row_sizes = _split_sizes(shape[0], chunks[0])
    col_sizes = _split_sizes(shape[1], chunks[1])

    # Filtering radius: 4*bw for gaussian, bw for the compact kernels.
    if kernel_id == 0:
        reach = 4.0 * bw
    else:
        reach = bw

    tile_grid = []
    first_row = 0
    for n_rows in row_sizes:
        origin_y = y0 + first_row * dy
        strip = []
        first_col = 0
        for n_cols in col_sizes:
            origin_x = x0 + first_col * dx
            extent = (n_rows, n_cols)
            sub_x, sub_y, sub_w = _filter_points_to_tile(
                xs, ys, ws, origin_x, origin_y, dx, dy,
                n_rows, n_cols, reach)
            lazy_tile = dask.delayed(_run_kde_cupy)(
                sub_x, sub_y, sub_w, extent,
                origin_x, origin_y, dx, dy, bw, kernel_id,
            )
            strip.append(
                da.from_delayed(lazy_tile, shape=extent,
                                dtype=np.float64,
                                meta=cupy.array((), dtype=cupy.float64))
            )
            first_col += n_cols
        tile_grid.append(strip)
        first_row += n_rows

    return da.block(tile_grid)


def _split_sizes(total, chunk):
    """Split *total* into pieces of size *chunk* plus one trailing
    remainder piece when *total* is not a multiple of *chunk*.

    The returned list sums to *total*, e.g. ``(10, 3) -> [3, 3, 3, 1]``.
    """
    pieces = [chunk] * (total // chunk)
    leftover = total % chunk
    if leftover:
        pieces.append(leftover)
    return pieces


# ---------------------------------------------------------------------------
# Public API -- kde
# ---------------------------------------------------------------------------


# [docs]  (link text left over from the Sphinx source listing; not code)
def kde(
    x: Union[np.ndarray, list],
    y: Optional[Union[np.ndarray, list]] = None,
    *,
    weights: Optional[Union[np.ndarray, list]] = None,
    bandwidth: Union[float, str] = 'silverman',
    kernel: str = 'gaussian',
    template: Optional[xr.DataArray] = None,
    x_range: Optional[Tuple[float, float]] = None,
    y_range: Optional[Tuple[float, float]] = None,
    width: int = 256,
    height: int = 256,
    name: str = 'kde',
    column: Optional[str] = None,
) -> xr.DataArray:
    """Compute 2-D kernel density estimation from point data.

    Each output pixel accumulates weighted kernel contributions from all
    input points, producing a smooth continuous density surface.

    Parameters
    ----------
    x, y : array-like
        1-D arrays of point coordinates.  Alternatively, pass a GeoDataFrame
        of Point geometries as the first argument and leave *y* unset.
        Points with non-finite (NaN or infinite) coordinates or weights
        are dropped; a ``ValueError`` is raised if that filtering removes
        every point.
    weights : array-like, optional
        Per-point weights.  Defaults to uniform weights of 1.
    column : str, optional
        When the first argument is a GeoDataFrame, the column used as
        per-point weights.  Omit for a pure (uniform-weight) point density.
        Mutually exclusive with *weights*.
    bandwidth : float or ``'silverman'``
        Kernel bandwidth in the same units as *x*/*y*.  Must be a
        positive, finite number.  ``'silverman'`` (default) uses
        Silverman's rule of thumb.
    kernel : ``{'gaussian', 'epanechnikov', 'quartic'}``
        Kernel shape.
    template : xr.DataArray, optional
        If provided, the output matches this array's shape, extent, and
        coordinates, and its backend (numpy, cupy, dask) selects the
        compute path.  *x_range*, *y_range*, *width*, and *height* are
        ignored when *template* is given.
    x_range, y_range : (min, max), optional
        Spatial extent of the output grid.  Defaults to the data extent
        padded on each side by the larger of the bandwidth and 10 % of
        the data extent.
    width, height : int
        Number of columns and rows in the output grid.  Ignored when
        *template* is provided.
    name : str
        Name of the output DataArray.

    Returns
    -------
    xr.DataArray
        2-D density surface.  When *template* is omitted, ``attrs['res']``
        carries the ``(x, y)`` cell size of the grid built from the data
        extent; when *template* is given, the output inherits its attrs.

    Raises
    ------
    ValueError
        On inconsistent arguments (mismatched lengths, *column* without a
        GeoDataFrame, both *weights* and *column*, missing *y*), an
        unknown *kernel*, a non-positive or non-finite *bandwidth*, no
        finite points left after filtering, or empty input when the
        extent has to be derived from the data.
    MemoryError
        On the eager (non-dask) paths, when the output grid would need
        more than half of the available memory.
    """
    # -- Resolve GeoDataFrame input ----------------------------------------
    if is_geodataframe(x):
        if y is not None:
            raise ValueError(
                "kde(): when the first argument is a GeoDataFrame, pass "
                "weights via 'column' or 'weights' as keywords, not y"
            )
        if weights is not None and column is not None:
            raise ValueError(
                "kde(): pass either 'weights' or 'column', not both"
            )
        x, y, col_weights, _crs = points_from_geodataframe(
            x, column=column, value_required=False, func_name='kde'
        )
        if col_weights is not None:
            weights = col_weights
    else:
        if column is not None:
            raise ValueError(
                "kde(): 'column' is only valid when the first argument is a "
                "GeoDataFrame"
            )
        if y is None:
            raise ValueError("kde(): y is required when x is not a GeoDataFrame")

    # -- Validate and coerce inputs ----------------------------------------
    x_arr = np.asarray(x, dtype=np.float64).ravel()
    y_arr = np.asarray(y, dtype=np.float64).ravel()
    if x_arr.shape[0] != y_arr.shape[0]:
        raise ValueError("x and y must have the same length")
    n = x_arr.shape[0]

    if weights is not None:
        w_arr = np.asarray(weights, dtype=np.float64).ravel()
        if w_arr.shape[0] != n:
            raise ValueError("weights must have the same length as x and y")
    else:
        w_arr = np.ones(n, dtype=np.float64)

    # Drop points with non-finite coordinates or weights (#3628).  Same
    # policy as xrspatial.interpolate: without this, a single NaN poisons
    # the auto-computed extent (all backends) or the whole grid (cupy
    # gaussian, which has no cutoff), while the other backends silently
    # skip the point.
    valid = np.isfinite(x_arr) & np.isfinite(y_arr) & np.isfinite(w_arr)
    if not valid.all():
        x_arr, y_arr, w_arr = x_arr[valid], y_arr[valid], w_arr[valid]
        if x_arr.shape[0] == 0:
            raise ValueError(
                "kde(): no valid (finite) points remain after filtering "
                "non-finite coordinates and weights"
            )

    kid = _kernel_id(kernel)

    # -- Bandwidth ---------------------------------------------------------
    if isinstance(bandwidth, str):
        if bandwidth != 'silverman':
            raise ValueError(
                "bandwidth must be a positive number or 'silverman', "
                f"got {bandwidth!r}"
            )
        bw = _silverman_bandwidth(x_arr, y_arr)
    else:
        bw = float(bandwidth)
        # NaN slips through a plain `bw <= 0` test and +inf makes the
        # kernel window unbounded, so reject both explicitly.
        if not math.isfinite(bw):
            raise ValueError(f"bandwidth must be finite, got {bw}")
        if bw <= 0:
            raise ValueError(f"bandwidth must be positive, got {bw}")

    # -- Output grid -------------------------------------------------------
    if template is not None:
        _validate_template(template)
        y_coords = template.coords[template.dims[0]].values
        x_coords = template.coords[template.dims[1]].values
        rows, cols = template.shape
        # Pixel spacing (falls back to 1.0 along a length-1 axis, where
        # it cannot be derived from the coordinates).
        dy = float(y_coords[1] - y_coords[0]) if rows > 1 else 1.0
        dx = float(x_coords[1] - x_coords[0]) if cols > 1 else 1.0
        x0 = float(x_coords[0])
        y0 = float(y_coords[0])
        use_dask = has_dask_array() and isinstance(template.data, da.Array)
        use_cupy = (has_cuda_and_cupy() and cupy is not None
                    and _is_cupy_backed(template))
        out_chunks = template.data.chunksize if use_dask else None
    else:
        # The extent is derived from the data unless both ranges are
        # given; with no points the min()/max() below would fail with an
        # opaque numpy reduction error, so fail early with a clear one.
        if x_arr.shape[0] == 0 and (x_range is None or y_range is None):
            raise ValueError(
                "kde(): cannot derive the output extent from empty input; "
                "pass x_range and y_range, or a template"
            )
        if x_range is None:
            pad = max(bw, (float(x_arr.max()) - float(x_arr.min())) * 0.1)
            x_range = (float(x_arr.min()) - pad, float(x_arr.max()) + pad)
        if y_range is None:
            pad = max(bw, (float(y_arr.max()) - float(y_arr.min())) * 0.1)
            y_range = (float(y_arr.min()) - pad, float(y_arr.max()) + pad)
        rows, cols = height, width
        dx = (x_range[1] - x_range[0]) / max(cols - 1, 1)
        dy = (y_range[1] - y_range[0]) / max(rows - 1, 1)
        x0 = x_range[0]
        y0 = y_range[0]
        x_coords = np.linspace(x_range[0], x_range[1], cols)
        y_coords = np.linspace(y_range[0], y_range[1], rows)
        use_dask = False
        use_cupy = False
        out_chunks = None

    shape = (rows, cols)

    # -- Memory guard for eager backends ------------------------------------
    # Dask paths build per-tile allocations lazily, so chunk size already
    # bounds peak memory. The eager numpy/cupy paths allocate the full
    # (rows, cols) float64 buffer up front and need an explicit guard.
    if not use_dask:
        _check_grid_memory(rows, cols)

    # -- Dispatch -----------------------------------------------------------
    if use_dask and use_cupy:
        data = _run_kde_dask_cupy(
            x_arr, y_arr, w_arr, shape, x0, y0, dx, dy, bw, kid, out_chunks,
        )
    elif use_dask:
        data = _run_kde_dask_numpy(
            x_arr, y_arr, w_arr, shape, x0, y0, dx, dy, bw, kid, out_chunks,
        )
    elif use_cupy:
        data = _run_kde_cupy(
            x_arr, y_arr, w_arr, shape, x0, y0, dx, dy, bw, kid,
        )
    else:
        data = _run_kde_numpy(
            x_arr, y_arr, w_arr, shape, x0, y0, dx, dy, bw, kid,
        )

    # -- Build output DataArray --------------------------------------------
    if template is not None:
        return xr.DataArray(
            data, name=name,
            coords=template.coords, dims=template.dims,
            attrs=template.attrs,
        )
    # No template: the grid was built from x_range/y_range above, so the
    # cell spacing `dx`/`dy` is the output resolution.  Record it as
    # `res` so downstream tools (which prefer `attrs['res']` over deriving
    # cellsize from coords) read the true spacing.
    return xr.DataArray(
        data, name=name,
        dims=['y', 'x'],
        coords={'y': y_coords, 'x': x_coords},
        attrs={'res': (abs(dx), abs(dy))},
    )



# ---------------------------------------------------------------------------
# Public API -- line_density
# ---------------------------------------------------------------------------


# [docs]  (link text left over from the Sphinx source listing; not code)
def line_density(
    x1: Union[np.ndarray, list],
    y1: Union[np.ndarray, list],
    x2: Union[np.ndarray, list],
    y2: Union[np.ndarray, list],
    *,
    weights: Optional[Union[np.ndarray, list]] = None,
    bandwidth: Union[float, str] = 'silverman',
    kernel: str = 'gaussian',
    template: Optional[xr.DataArray] = None,
    x_range: Optional[Tuple[float, float]] = None,
    y_range: Optional[Tuple[float, float]] = None,
    width: int = 256,
    height: int = 256,
    name: str = 'line_density',
) -> xr.DataArray:
    """Compute line density from line-segment data.

    Each segment is uniformly sampled and the samples are convolved with
    the chosen kernel, producing a smooth density surface that represents
    the concentration of linear features.  The result is always computed
    on the CPU into a NumPy array, whatever backs *template*.

    Parameters
    ----------
    x1, y1, x2, y2 : array-like
        Start and end coordinates of each line segment.  Segments with
        non-finite (NaN or infinite) endpoints or weights are dropped;
        a ``ValueError`` is raised if that filtering removes every
        segment.
    weights : array-like, optional
        Per-segment weights.  Defaults to uniform weights of 1.
    bandwidth : float or ``'silverman'``
        Kernel bandwidth; must be a positive, finite number.
        ``'silverman'`` uses an automatic estimate based on all segment
        endpoints.
    kernel : ``{'gaussian', 'epanechnikov', 'quartic'}``
        Kernel shape.
    template : xr.DataArray, optional
        Output grid specification (same as :func:`kde`).
    x_range, y_range : (min, max), optional
        Spatial extent.  Defaults to the extent of all endpoints padded
        on each side by the larger of the bandwidth and 10 % of that
        extent.
    width, height : int
        Grid dimensions.  Ignored when *template* is provided.
    name : str
        Name of the output DataArray.

    Returns
    -------
    xr.DataArray
        2-D line-density surface.  When *template* is omitted,
        ``attrs['res']`` carries the ``(x, y)`` cell size of the grid built
        from the data extent; when *template* is given, the output inherits
        its attrs.

    Raises
    ------
    ValueError
        On mismatched array lengths, an unknown *kernel*, a non-positive
        or non-finite *bandwidth*, no finite segments left after
        filtering, or empty input when the extent has to be derived from
        the data.
    MemoryError
        When the output grid would need more than half of the available
        memory.
    """
    x1a = np.asarray(x1, dtype=np.float64).ravel()
    y1a = np.asarray(y1, dtype=np.float64).ravel()
    x2a = np.asarray(x2, dtype=np.float64).ravel()
    y2a = np.asarray(y2, dtype=np.float64).ravel()
    n = x1a.shape[0]
    if not (y1a.shape[0] == n and x2a.shape[0] == n and y2a.shape[0] == n):
        raise ValueError("x1, y1, x2, y2 must all have the same length")

    if weights is not None:
        w_arr = np.asarray(weights, dtype=np.float64).ravel()
        if w_arr.shape[0] != n:
            raise ValueError(
                "weights must have the same length as the segment arrays"
            )
    else:
        w_arr = np.ones(n, dtype=np.float64)

    # Drop segments with non-finite endpoints or weights (#3628); a
    # single NaN endpoint otherwise poisons the auto-computed extent
    # and the output collapses to zeros with NaN coordinates.
    valid = (np.isfinite(x1a) & np.isfinite(y1a) &
             np.isfinite(x2a) & np.isfinite(y2a) & np.isfinite(w_arr))
    if not valid.all():
        x1a, y1a, x2a, y2a, w_arr = (
            x1a[valid], y1a[valid], x2a[valid], y2a[valid], w_arr[valid])
        if x1a.shape[0] == 0:
            raise ValueError(
                "line_density(): no valid (finite) segments remain after "
                "filtering non-finite endpoints and weights"
            )

    kid = _kernel_id(kernel)

    # Bandwidth (and default extent) are derived from all endpoints.
    all_x = np.concatenate([x1a, x2a])
    all_y = np.concatenate([y1a, y2a])

    if isinstance(bandwidth, str):
        if bandwidth != 'silverman':
            raise ValueError(
                "bandwidth must be a positive number or 'silverman', "
                f"got {bandwidth!r}"
            )
        bw = _silverman_bandwidth(all_x, all_y)
    else:
        bw = float(bandwidth)
        # NaN slips through a plain `bw <= 0` test and +inf makes the
        # kernel window unbounded, so reject both explicitly.
        if not math.isfinite(bw):
            raise ValueError(f"bandwidth must be finite, got {bw}")
        if bw <= 0:
            raise ValueError(f"bandwidth must be positive, got {bw}")

    # Grid
    if template is not None:
        _validate_template(template)
        y_coords = template.coords[template.dims[0]].values
        x_coords = template.coords[template.dims[1]].values
        rows, cols = template.shape
        # Spacing falls back to 1.0 along a length-1 axis, where it
        # cannot be derived from the coordinates.
        dy = float(y_coords[1] - y_coords[0]) if rows > 1 else 1.0
        dx = float(x_coords[1] - x_coords[0]) if cols > 1 else 1.0
        x0 = float(x_coords[0])
        y0 = float(y_coords[0])
    else:
        # With no segments the min()/max() below would fail with an
        # opaque numpy reduction error, so fail early with a clear one.
        if all_x.shape[0] == 0 and (x_range is None or y_range is None):
            raise ValueError(
                "line_density(): cannot derive the output extent from "
                "empty input; pass x_range and y_range, or a template"
            )
        if x_range is None:
            pad = max(bw, (float(all_x.max()) - float(all_x.min())) * 0.1)
            x_range = (float(all_x.min()) - pad, float(all_x.max()) + pad)
        if y_range is None:
            pad = max(bw, (float(all_y.max()) - float(all_y.min())) * 0.1)
            y_range = (float(all_y.min()) - pad, float(all_y.max()) + pad)
        rows, cols = height, width
        dx = (x_range[1] - x_range[0]) / max(cols - 1, 1)
        dy = (y_range[1] - y_range[0]) / max(rows - 1, 1)
        x0 = x_range[0]
        y0 = y_range[0]
        x_coords = np.linspace(x_range[0], x_range[1], cols)
        y_coords = np.linspace(y_range[0], y_range[1], rows)

    shape = (rows, cols)
    # The full grid is allocated eagerly, so guard the allocation.
    _check_grid_memory(rows, cols)
    out = np.zeros(shape, dtype=np.float64)
    _line_density_cpu(x1a, y1a, x2a, y2a, w_arr, out,
                      x0, y0, dx, dy, bw, kid)

    if template is not None:
        return xr.DataArray(
            out, name=name,
            coords=template.coords, dims=template.dims,
            attrs=template.attrs,
        )
    # No template: `dx`/`dy` is the output cell spacing, so record it as
    # `res` for downstream tools that prefer `attrs['res']` over coords.
    return xr.DataArray(
        out, name=name,
        dims=['y', 'x'],
        coords={'y': y_coords, 'x': x_coords},
        attrs={'res': (abs(dx), abs(dy))},
    )



# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _validate_template(template):
    """Check that *template* is a 2-D ``xr.DataArray``.

    Raises
    ------
    TypeError
        If *template* is not an ``xr.DataArray``.
    ValueError
        If *template* does not have exactly two dimensions.
    """
    if isinstance(template, xr.DataArray):
        n_dims = template.ndim
        if n_dims == 2:
            return
        raise ValueError(
            f"template must be 2-D, got {n_dims}-D"
        )
    raise TypeError(
        "template must be an xr.DataArray, "
        f"got {type(template).__qualname__}"
    )


def _is_cupy_backed(agg):
    """Return True when *agg*'s data is a CuPy array, plain or via dask.

    If ``agg.data`` exposes a ``_meta`` attribute, the top-level package
    of the meta object's type decides.  Otherwise ``agg.data`` itself is
    tested against ``cupy.ndarray``, provided cupy could be imported.
    """
    try:
        meta = agg.data._meta
        root_package, _, _ = type(meta).__module__.partition('.')
        return root_package == 'cupy'
    except AttributeError:
        # No `_meta` attribute: fall back to a direct type check.
        if cupy is None:
            return False
        return isinstance(agg.data, cupy.ndarray)