Source code for xrspatial.geotiff._writers.eager

"""Eager (CPU) writer entry point and helpers.

Holds ``to_geotiff`` (the public eager writer),
``_write_single_tile`` (per-tile worker used by ``_write_vrt_tiled``),
and ``_write_vrt_tiled`` (the deprecated ``vrt_tiled=True`` path on
``to_geotiff``). Companion modules ``_writers/gpu.py`` and
``_writers/vrt.py`` hold the GPU writer and the public ``write_vrt``;
``to_geotiff`` dispatches to them when the caller asks for a GPU
output or a ``.vrt`` path.
"""
from __future__ import annotations

import numbers
import os
import shutil
import uuid
import warnings
from typing import TYPE_CHECKING

import numpy as np
import xarray as xr

if TYPE_CHECKING:
    from typing import BinaryIO

from .._attrs import (_EXPERIMENTAL_CODECS, _LEVEL_RANGES, _VALID_COMPRESSIONS, _extract_rich_tags,
                      _resolve_nodata_attr, _should_restore_nan_sentinel)
from .._backends._gpu_helpers import _is_gpu_data
from .._coords import _BAND_DIM_NAMES, ROTATION_SHEAR_TOL, _has_no_georef_marker
from .._coords import require_transform_for_georeferenced as _require_transform_for_georeferenced
from .._coords import resolve_georef as _resolve_georef
from .._crs import _validate_crs_arg, _validate_crs_fallback, _wkt_to_epsg
from .._geotags import RASTER_PIXEL_IS_AREA, GeoTransform
from .._nodata import NodataLifecycle as _NL
from .._runtime import (GeoTIFFFallbackWarning, _geotiff_strict_mode, _gpu_fallback_warning_message,
                        _resolve_spatial_coords)
from .._validation import (_validate_3d_writer_dims, _validate_gpu_arg,
                           _validate_no_rotated_affine, _validate_nodata_arg,
                           _validate_tile_size_arg, _validate_writer_spatial_shape,
                           validate_write_metadata)
from .._writer import _COG_REQUIRES_TILED_MSG, write
from .gpu import write_geotiff_gpu


[docs] def to_geotiff(data: xr.DataArray | np.ndarray, path: str | BinaryIO, *, crs: int | str | None = None, nodata: float | int | None = None, compression: str = 'zstd', compression_level: int | None = None, tiled: bool = True, tile_size: int = 256, predictor: bool | int = False, cog: bool = False, overview_levels: list[int] | None = None, overview_resampling: str = 'mean', bigtiff: bool | None = None, gpu: bool | None = None, streaming_buffer_bytes: int = 256 * 1024 * 1024, max_z_error: float = 0.0, photometric: str | int = 'auto', allow_internal_only_jpeg: bool = False, allow_experimental_codecs: bool = False, allow_unparseable_crs: bool = False, drop_rotation: bool = False) -> str | BinaryIO: """Write data as a GeoTIFF or Cloud Optimized GeoTIFF. Release-contract tier (see ``docs/source/reference/release_gate_geotiff.rst`` and ``docs/source/reference/geotiff_release_contract.rst``): * [stable] Local-file output on an axis-aligned grid with ``compression`` in ``{'none', 'deflate', 'lzw', 'packbits', 'zstd'}``; CRS / transform / nodata attrs round-trip; ``bigtiff`` auto-promotion; ``cog=True`` (the IFD-first tiled COG layout with a stable codec, covered by ``SUPPORTED_FEATURES['writer.cog']``). * [advanced] Internal overview pyramid generation (``SUPPORTED_FEATURES['writer.overviews']``): the ``overview_levels`` and ``overview_resampling`` knobs and the pyramid bytes themselves. Also explicit ``bigtiff=True``; ``photometric=`` overrides; ``extra_tags`` pass-through. * [experimental] GPU dispatch via ``gpu=True``; ``compression`` in ``{'lerc', 'jpeg2000', 'j2k', 'lz4'}`` behind the explicit ``allow_experimental_codecs=True`` opt-in; ``allow_unparseable_crs=True``. * [internal-only] ``compression='jpeg'`` behind ``allow_internal_only_jpeg=True``. The produced files do not round-trip through libtiff / GDAL / rasterio; the path exists for xrspatial's own use and is not part of the externally interoperable surface. * Out of scope for this release (allowed to raise): rotated / sheared write support (no ``ModelTransformationTag`` emit path); silent mixed-metadata flattening. See :data:`xrspatial.geotiff.SUPPORTED_FEATURES` for the full tier map. Per-parameter tier markers below describe the tier the parameter itself carries; a parameter's effective tier is bounded by the function-level surface above (e.g. ``[stable]`` ``nodata`` is still only stable when combined with a ``[stable]`` codec and options). Dask-backed DataArrays are written in streaming mode: one tile-row at a time, without materialising the full array into RAM. Peak memory is roughly ``tile_size * width * bytes_per_sample``. COG output (``cog=True``) still materialises because overviews need the full array. Automatically dispatches to GPU compression when: - ``gpu=True`` is passed, or - The input data is CuPy-backed (auto-detected) GPU write uses nvCOMP batch compression (deflate/ZSTD) and keeps the array on device. Falls back to CPU if nvCOMP is not available. Parameters ---------- data : xr.DataArray or np.ndarray [stable] 2D raster data. path : str or binary file-like [stable for local file paths; advanced for ``io.BytesIO`` and other in-memory file-likes] Output file path, or any object exposing a ``write`` method (e.g. ``io.BytesIO``). When a file-like is passed, the encoded TIFF bytes are written to that object once assembly completes. ``cog=True`` and ``.vrt`` outputs require a string path. crs : int, numpy.integer, str, or None [stable for int EPSG codes; advanced for WKT/PROJ strings] EPSG code (int or numpy integer scalar), WKT string, or PROJ string. If None and data is a DataArray, tries to read from attrs ('crs' for EPSG, 'crs_wkt' for WKT). EPSG codes are strongly preferred for interop. The WKT-only path writes ``ProjectedCSType`` / ``GeographicType`` = 32767 with the WKT stored in ``GTCitationGeoKey`` -- libgeotiff and GDAL can round-trip this but many other GeoTIFF readers treat the citation as a free-form name and lose the CRS. A ``UserWarning`` is emitted when the WKT-only path is taken. nodata : float, int, or None [stable] NoData value. compression : str [stable for ``{'none', 'deflate', 'lzw', 'packbits', 'zstd'}``; experimental for ``{'lerc', 'jpeg2000', 'j2k', 'lz4'}`` behind ``allow_experimental_codecs=True``; internal-only for ``'jpeg'`` behind ``allow_internal_only_jpeg=True``] Codec name. One of ``'none'``, ``'deflate'``, ``'lzw'``, ``'jpeg'``, ``'packbits'``, ``'zstd'``, ``'lz4'``, ``'jpeg2000'`` (alias ``'j2k'``), or ``'lerc'``. Stable codecs (Tier 1, lossless, byte-for-byte round-trip): ``'none'``, ``'deflate'``, ``'lzw'``, ``'packbits'``, ``'zstd'``. Experimental codecs (Tier 3): ``'lerc'``, ``'jpeg2000'`` / ``'j2k'``, ``'lz4'``. Rejected by default; pass ``allow_experimental_codecs=True`` to opt in. The opt-in emits ``GeoTIFFFallbackWarning`` once per call so the caller knows the chosen codec carries no cross-backend numerical parity claim and uneven reader support across GDAL versions. ``'lerc'`` accepts ``max_z_error`` for lossy compression with a bounded per-pixel error. Internal-only codec (Tier 4): ``'jpeg'``. Rejected on write by default because the encoder omits the JPEGTables tag and the produced files do not round-trip through libtiff / GDAL / rasterio. Pass ``allow_internal_only_jpeg=True`` to opt in to the internal-reader-only path (see that parameter for details). ``allow_experimental_codecs=True`` does NOT cover ``'jpeg'``: internal-only is a stricter tier than experimental, and the two flags do not collapse into one switch. compression_level : int or None [stable] Compression effort level. None uses each codec's default (6 for deflate/zstd). Valid ranges: deflate 1-9, zstd 1-22, lz4 0-16. Codecs without a level concept (lzw, packbits, jpeg) accept any value and ignore it. tiled : bool [stable] Use tiled layout (default True). Incompatible with ``cog=True`` because the COG specification requires a tiled internal layout; passing ``cog=True, tiled=False`` raises ``ValueError``. tile_size : int [stable] Tile size in pixels (default 256). Must be a positive multiple of 16 when ``tiled=True``; this is a TIFF 6 spec requirement on TileWidth and TileLength for broad reader compatibility. Ignored when ``tiled=False``; a warning is emitted if a non-default value is passed alongside strip mode. predictor : bool or int [stable] TIFF predictor. Accepted values: * ``False``, ``0``, or ``1`` -> no predictor. * ``True`` or ``2`` -> horizontal differencing (good for integer data; ``True`` and ``2`` are exactly equivalent). * ``3`` -> floating-point predictor (float dtypes only; typically gives better deflate/zstd ratios on float data than predictor 2). cog : bool [stable] Write as Cloud Optimized GeoTIFF. The CPU writer emits the spec-conforming COG layout (IFD-first, tiled, internal overviews, lossless codec) covered by ``SUPPORTED_FEATURES['writer.cog']``. Requires ``tiled=True`` (the default): the COG specification mandates a tiled internal layout, so ``cog=True, tiled=False`` raises ``ValueError``. COG output also materialises the full array, because the overview pyramid needs random access to every pixel; the ``streaming_buffer_bytes`` kwarg is a no-op on this path. Customisation of the overview pyramid itself (``overview_levels``, ``overview_resampling``) is tracked separately as advanced under ``SUPPORTED_FEATURES['writer.overviews']``. overview_levels : list[int] or None [advanced] Overview pyramids are an optional COG feature; the decimation factors and resampling choice affect downstream analytics in ways that are not byte-for-byte reproducible across backends. Overview decimation factors relative to full resolution. Each entry must be a power-of-two integer >= 2, and the list must be strictly increasing (e.g. ``[2, 4, 8]`` writes overviews at 1/2, 1/4 and 1/8 of the full resolution). Invalid values raise ``ValueError``. Only used when ``cog=True``. If None and ``cog=True``, levels auto-generate as ``[2, 4, 8, ...]`` until the next halving would fall below ``tile_size`` (capped at 8 levels). overview_resampling : str [advanced] Resampling method for overviews: 'mean' (default), 'nearest', 'min', 'max', 'median', 'mode', or 'cubic'. bigtiff : bool or None [advanced] BigTIFF uses 64-bit offsets; older readers that only speak classic TIFF cannot open the output. Force BigTIFF (64-bit offsets). None (default) auto-promotes when the estimated file size would exceed the classic-TIFF 4 GB limit. Matches the same kwarg on ``write_geotiff_gpu``. gpu : bool or None [experimental] Requires cupy + numba CUDA, plus the optional nvCOMP / nvJPEG / nvJPEG2K libraries for codec-specific acceleration; backend parity with the CPU writer is tested for the Tier 1 codec set only. Force GPU compression. None (default) auto-detects CuPy data. streaming_buffer_bytes : int [stable] Soft cap on bytes materialised per dask compute call when streaming a dask-backed DataArray. Defaults to 256 MB. Wide rasters whose tile-row exceeds this budget are split into horizontal segments. Only relevant for dask-backed inputs; the kwarg is a no-op for numpy / CuPy / COG paths (the COG path materialises the full array because the overview pyramid needs it). max_z_error : float [experimental] Per-pixel error budget for LERC compression. ``0.0`` (default) is lossless; larger values let the encoder approximate values within the bound, producing smaller files at the cost of accuracy bounded by ``abs(decoded - original) <= max_z_error``. Only used when ``compression='lerc'`` (which itself requires ``allow_experimental_codecs=True``); passing a non-zero value with any other codec raises ``ValueError``. photometric : str or int [advanced] Photometric interpretation for the TIFF Photometric tag (262). * ``'auto'`` (default) -- MinIsBlack (1) for any band count. ExtraSamples for every band beyond the first is tagged ``0`` (unspecified). Multispectral rasters (e.g. R, G, B, NIR) round-trip through this default without being silently labelled as RGB+alpha. Prior versions treated any 3+ band array as RGB and the 4th band as unassociated alpha -- the behaviour change is intentional. * ``'rgb'`` -- RGB (Photometric=2). Three colour bands; any additional bands are tagged ``0`` (unspecified). * ``'rgba'`` -- RGB with the 4th band tagged as unassociated alpha (TIFF ExtraSamples=2). Requires at least 4 bands. * ``'minisblack'`` or ``'miniswhite'`` -- grayscale; multi-band extras tagged ``0``. Signed-integer pixel types with ``'miniswhite'`` are rejected with ``NotImplementedError`` -- xrspatial has no semantically correct inversion for signed MinIsWhite and the silent passthrough that used to happen produced files that disagreed with the on-disk Photometric tag against every standards-compliant TIFF reader. Cast to an unsigned dtype or pass ``photometric='minisblack'``. * An ``int`` -- written verbatim into Photometric for advanced callers (e.g. ``3`` for Palette, ``5`` for CMYK). A user-supplied ``extra_tags`` entry of ``(TAG_PHOTOMETRIC, ...)`` or ``(TAG_EXTRA_SAMPLES, ...)`` overrides the writer's chosen value; only these two tag ids are overridable so other auto-emitted tags such as ``ImageWidth`` or ``StripOffsets`` remain protected. allow_experimental_codecs : bool [experimental] Opt in to the Tier 3 experimental codecs ``'lerc'``, ``'jpeg2000'`` / ``'j2k'``, and ``'lz4'`` (default ``False``). Setting ``compression=`` to one of those codecs without this flag raises ``ValueError`` whose message names the flag. With the flag set, the write proceeds and a ``GeoTIFFFallbackWarning`` is emitted once per call so the caller knows the chosen codec carries no cross-backend numerical parity claim and uneven reader support across GDAL versions. Does NOT cover ``compression='jpeg'``: the internal-only JPEG path keeps its own dedicated ``allow_internal_only_jpeg`` flag because internal-only is a stricter tier than experimental. The kwarg is forwarded unchanged to ``write_geotiff_gpu`` on the GPU dispatch path. allow_internal_only_jpeg : bool [internal-only] Opt in to the ``compression='jpeg'`` encode path (default ``False``). The encoder writes self-contained JFIF tiles without the TIFF JPEGTables tag (347); the file decodes through this library's reader but not through libtiff, GDAL, or rasterio. This codec is internal-only for the release contract: it is not externally interoperable and the path exists so xrspatial can round-trip its own JPEG output. With the flag set, the write proceeds and a ``GeoTIFFFallbackWarning`` is emitted at call time. Without the flag, ``compression='jpeg'`` raises ``ValueError``. The kwarg is forwarded unchanged to ``write_geotiff_gpu`` on the GPU dispatch path so callers can reach the same experimental encode via ``to_geotiff(..., gpu=True)``. allow_unparseable_crs : bool [experimental] Opt in to writing an unvalidatable CRS string into ``GTCitationGeoKey`` (default ``False``). When ``False`` (the default), a ``crs=`` value that is neither an EPSG int nor a string that pyproj can resolve and is not structurally WKT (no ``PROJCS`` / ``GEOGCS`` / ``PROJCRS`` / ``GEOGCRS`` root) raises ``ValueError`` instead of landing verbatim in the citation field. Set to ``True`` to keep the permissive behaviour. The fail-closed default protects against files where a typo'd PROJ string or an ``"EPSG:4326"`` token on a host without pyproj produces a citation that most readers cannot interpret. drop_rotation : bool, default False [advanced] Opt in to writing a DataArray that carries ``attrs['rotated_affine']``. The reader sets that attr when called with ``allow_rotated=True`` on a file whose ``ModelTransformationTag`` contains rotation, shear, or z-coupling terms. The writer does not emit a ``ModelTransformationTag``, so passing such a DataArray through ``to_geotiff`` produces an identity-affine output and loses the rotated mapping; a subsequent ``open_geotiff`` round-trip cannot recover it. Default ``False`` refuses the write with ``ValueError`` so the loss is impossible without an explicit signal. ``drop_rotation=True`` accepts the loss and lets the write proceed; consumers reading the output will see an axis-aligned, non-rotated TIFF. Returns ------- str or binary file-like The ``path`` argument (a string for filesystem paths, the file-like object for BytesIO destinations). Returning the path lines up with ``write_vrt`` and lets callers chain a write into a read without round-tripping through a variable; existing callers that discarded the previous ``None`` return are unaffected. Raises ------ ValueError If ``data.attrs['transform']`` is a rotated or skewed affine (``b != 0`` or ``d != 0`` in rasterio ``Affine`` ordering). The on-disk GeoTIFF is axis-aligned; reproject onto an axis-aligned grid first. ValueError If ``data.attrs['rotated_affine']`` is set and ``drop_rotation`` is False. The reader sets that attr on the ``allow_rotated=True`` path; the writer cannot emit a ``ModelTransformationTag`` so writing would silently lose the rotation. Pass ``drop_rotation=True`` to accept the loss explicitly. ValueError If ``data`` is a 3D DataArray whose ``dims`` is not ``(band, y, x)`` or ``(y, x, band)`` (accepting the band-name aliases ``bands`` / ``channel`` and spatial-name aliases ``lat`` / ``lon`` / ``latitude`` / ``longitude`` / ``row`` / ``col``). A leading non-band dim such as ``time`` is rejected because the writer cannot infer the band axis from arbitrary names and used to silently treat the leading axis as ``y``. """ from .._reader import _coerce_path path = _coerce_path(path) # Reject bool / np.bool_ nodata up front. ``bool`` is a subclass of # ``int`` in Python, so a typo like ``nodata=True`` slips past every # downstream ``isinstance(nodata, (int, float))`` guard. The geotag # builder then writes ``str(True)`` -> ``"True"`` into GDAL_NODATA, # which no reader parses as numeric, so the round-trip drops the # sentinel without warning. Refuse rather than coerce ``True`` to 1: # the caller almost certainly meant a real numeric sentinel. if isinstance(nodata, (bool, np.bool_)): raise TypeError( f"nodata must be numeric (int or float), got {nodata!r}") # Reject non-bool ``gpu`` before the truthiness dispatch below. # ``use_gpu`` keys the GPU path off ``gpu`` directly, so a value # like ``gpu="False"`` (truthy) would silently select the GPU # writer. ``None`` stays valid here: it means "auto-detect from the # data" on the write path. _validate_gpu_arg(gpu, allow_none=True) # tiled=False ignores tile_size for the strip-layout pixel data, so # historically validation only ran when tiled=True. The COG path # (cog=True) still reads tile_size to auto-generate overviews in # ``_writer.py`` regardless of the strip-vs-tiled choice, so a # non-positive tile_size with cog=True drove the overview loop # into a hang once oh, ow halved to 0. Validate # tile_size whenever either path will consume it: tiled output OR # COG overview generation. Shared with write_geotiff_gpu via # _validate_tile_size_arg so both writers keep identical validation. if tiled or cog: _validate_tile_size_arg(tile_size) _validate_nodata_arg(nodata) # Refuse to silently drop the rotated 6-tuple that the reader # surfaces on ``attrs['rotated_affine']`` when called with # ``allow_rotated=True``. The writer has no ``ModelTransformationTag`` # emit path, so writing a rotated input produces an identity-affine # file with no signal to the caller. Run the check # at the entry point so the GPU dispatch path, the VRT path, and the # eager path all hit the same gate. _drop_rotation_attrs = getattr(data, 'attrs', None) or {} _validate_no_rotated_affine( _drop_rotation_attrs, drop_rotation=drop_rotation, entry_point="to_geotiff", ) # ``attrs['transform']`` carrying a rasterio ``Affine`` with # non-zero rotation/shear (``b != 0`` or ``d != 0``) silently # slipped past ``transform_from_attr`` because ``Affine`` iterates # as a 9-element augmented matrix and the 6-tuple gate there ran # ``len(seq) != 6 -> return None``. Without the rejection the writer # falls back to coord-derived or no-georef output and the rotation # is lost on disk. Detect the Affine shape by duck-typing the (b, d) # attrs and surface the same diagnostic the 6-tuple branch raises # (wording kept verbatim so the existing match patterns still hit). _attr_transform = _drop_rotation_attrs.get('transform') if (_attr_transform is not None and hasattr(_attr_transform, 'b') and hasattr(_attr_transform, 'd')): try: _b = float(_attr_transform.b) _d = float(_attr_transform.d) except (TypeError, ValueError) as _exc: # Fail-closed on a malformed ``.b`` / ``.d`` rather than # zero-defaulting: an unconvertable value inside an attr # claiming to be an affine transform is itself a writer # input contract violation. Without the explicit raise the # branch would bypass every downstream georef gate that # would otherwise catch the bad value. raise ValueError( f"attrs['transform'] has unconvertable rotation/shear " f"terms (b={_attr_transform.b!r}, " f"d={_attr_transform.d!r}); expected numeric values on " f"a rasterio Affine-like object." ) from _exc if abs(_b) > ROTATION_SHEAR_TOL or abs(_d) > ROTATION_SHEAR_TOL: raise ValueError( f"attrs['transform'] has non-zero rotation/shear " f"(b={_b!r}, d={_d!r}); rotated or skewed affines are " f"not supported by the GeoTIFF writers in this module " f"because the on-disk GeoTIFF representation is " f"axis-aligned and would be written at the wrong " f"location. Reproject the raster to an axis-aligned " f"grid before writing." ) # Reject zero-height / zero-width inputs before any dispatch # decision. Clip / window pipelines naturally produce empty # rasters and the writers used to accept them, produce a TIFF whose # IFD claimed shape ``(0, N)`` / ``(N, 0)``, and surface a generic # "Invalid image dimensions" only at read time. Fail closed at the # entry point with a message that names the offending dim. _shape = getattr(data, 'shape', None) _dims = getattr(data, 'dims', None) _validate_writer_spatial_shape(_shape, _dims) # Ambiguous-metadata checks. The hook is a no-op when no check is # registered, so this call is safe even if every check is later # unregistered for a specific entry point. # # Resolve documented spatial-dim aliases (lat/lon, # latitude/longitude, row/col) here so the NonUniformCoordsError # check fires consistently regardless of which alias the caller # picked. Before this, only literal coords['y'] / coords['x'] # were passed in, and alias-named coords slipped past the # validator entirely; the later transform-synthesis path caught # them only as plain ValueError. _attrs = getattr(data, 'attrs', None) or {} _coord_y, _coord_x = _resolve_spatial_coords(data) validate_write_metadata({ 'crs_kwarg': crs, 'attrs_crs': _attrs.get('crs'), 'attrs_crs_wkt': _attrs.get('crs_wkt'), 'nodata_kwarg': nodata, 'attrs_nodata': _attrs.get('nodata'), 'attrs_nodatavals': _attrs.get('nodatavals'), 'coord_y': _coord_y, 'coord_x': _coord_x, 'no_georef_marker': _has_no_georef_marker(data), }) # Up-front validation: catch bad compression names before they reach # any of the deeper write paths (streaming, GPU, VRT, COG) where the # error surfaces from _compression_tag with a less obvious traceback. if isinstance(compression, str): if compression.lower() not in _VALID_COMPRESSIONS: raise ValueError( f"Unknown compression {compression!r}. " f"Valid options: {list(_VALID_COMPRESSIONS)}.") # JPEG-in-TIFF (compression=7) requires the JPEGTables tag (347) # carrying the abbreviated quantization/Huffman tables. The # current encoder writes a self-contained JFIF stream per # tile/strip and omits JPEGTables, which makes the resulting # files unreadable by libtiff / GDAL / rasterio: they reject the # tile data with "TIFFReadEncodedStrip() failed". The internal # reader round-trips because Pillow re-decodes the JFIF stream # directly, masking the interop break. Refuse the write by # default and surface the same ``allow_internal_only_jpeg=True`` # opt-in that ``write_geotiff_gpu`` already accepts, so the # auto-dispatch entry point can reach the experimental # internal-reader-only path the explicit GPU entry point # exposes. if compression.lower() == 'jpeg' and not allow_internal_only_jpeg: raise ValueError( "compression='jpeg' is not supported: the encoder writes " "self-contained JFIF streams without the required " "JPEGTables tag (347), so other readers (libtiff, GDAL, " "rasterio) reject the file. Use 'deflate', 'zstd', or " "'lzw' instead. Pass allow_internal_only_jpeg=True to " "opt in to the experimental internal-reader-only path " "(issue #1845).") # The JPEG opt-in warning is emitted below once we know the # dispatch decision: ``write_geotiff_gpu`` emits its own warning # on the GPU path, so emitting here would double-warn callers # of ``to_geotiff(gpu=True, compression='jpeg', # allow_internal_only_jpeg=True)``. # Tier 3 experimental-codec gate. Lerc, jpeg2000 / # j2k, and lz4 sit in ``_VALID_COMPRESSIONS`` for wire-format # reasons but their cross-backend numerical parity, reader # support across GDAL versions, and (for lerc) bounded lossy # behaviour all carry caveats the default writer should not # silently accept. Mirror the ``allow_internal_only_jpeg`` shape # so callers learn the opt-in name from the rejection message # and can fix the call site in one line. The opt-in warning is # emitted below once the GPU dispatch decision is known so the # GPU path does not double-warn (``write_geotiff_gpu`` emits its # own warning on the GPU path). if (compression.lower() in _EXPERIMENTAL_CODECS and not allow_experimental_codecs): raise ValueError( f"compression={compression!r} is experimental: cross-backend " "numerical parity is not claimed and reader support across " "GDAL versions is uneven. Pass allow_experimental_codecs=True " "to opt in, or use 'deflate', 'zstd', or 'lzw' for a " "stable lossless codec (issue #2137).") # max_z_error only applies to LERC; reject negative values and reject # non-zero values paired with any other codec so the caller learns the # parameter was ignored before bytes hit disk. if max_z_error < 0: raise ValueError( f"max_z_error must be >= 0, got {max_z_error}") if max_z_error != 0 and ( not isinstance(compression, str) or compression.lower() != 'lerc'): raise ValueError( "max_z_error is only valid with compression='lerc'") # File-like (BytesIO etc.) destinations: the streaming, GPU, COG, and # VRT writers all need a real filesystem path (atomic rename, overview # passes, sidecar writes). Reject those combos up front so the user # gets a clear error instead of a deep traceback. _path_is_file_like = (not isinstance(path, str)) and hasattr(path, 'write') if _path_is_file_like: if cog: raise ValueError( "cog=True is not supported for file-like destinations. " "Pass a string path or write to BytesIO without cog=True.") elif not isinstance(path, str): raise TypeError( f"path must be a str or a binary file-like with a write() " f"method, got {type(path).__name__}") _is_vrt_path = ( isinstance(path, str) and path.lower().endswith('.vrt')) # Resolve GPU dispatch up front so the JPEG opt-in warning fires # exactly once. ``write_geotiff_gpu`` emits its own warning on the # GPU path; emitting here as well would double-warn callers of # ``to_geotiff(gpu=True, compression='jpeg', # allow_internal_only_jpeg=True)``. VRT and CPU paths receive the # warning here. On GPU-to-CPU fallback the GPU writer has already # warned before raising, so the CPU fallback does not warn twice. auto_detected_gpu = gpu is None use_gpu = gpu if gpu is not None else _is_gpu_data(data) if (isinstance(compression, str) and compression.lower() == 'jpeg' and allow_internal_only_jpeg and not use_gpu): warnings.warn( "to_geotiff(compression='jpeg', " "allow_internal_only_jpeg=True) writes JFIF tiles " "without the TIFF JPEGTables tag (347); the file decodes " "through xrspatial but may fail in libtiff, GDAL, or " "rasterio. See issue #1845.", GeoTIFFFallbackWarning, stacklevel=2, ) # Tier 3 experimental-codec opt-in warning. Mirrors the JPEG # flag's "warn once, after dispatch is resolved" shape: # ``write_geotiff_gpu`` emits its own warning on the GPU path with # a backend-specific caveat, so the CPU dispatcher only warns when # the write is staying on CPU. if (isinstance(compression, str) and compression.lower() in _EXPERIMENTAL_CODECS and allow_experimental_codecs and not use_gpu): warnings.warn( f"to_geotiff(compression={compression!r}, " "allow_experimental_codecs=True): experimental codec, " "no cross-backend parity claim and uneven reader support " "across GDAL versions. See issue #2137.", GeoTIFFFallbackWarning, stacklevel=2, ) # Reject ``gdal_metadata_xml`` / ``extra_tags`` pass-through writes # unless the caller opted in via ``allow_experimental_codecs=True``. # Both surfaces ride the Experimental tier in ``SUPPORTED_FEATURES`` # because the on-disk bytes are written verbatim and downstream # interop with rasterio / libtiff / GDAL depends on the payload. _data_attrs_for_optin = ( data.attrs if isinstance(data, xr.DataArray) else {} ) from .._attrs import _validate_write_rich_tag_optin _validate_write_rich_tag_optin( _data_attrs_for_optin, allow_experimental_codecs=allow_experimental_codecs, entry_point="to_geotiff", ) # ``cog=True`` requires a tiled internal layout per the COG spec. # The writer used to accept ``cog=True, tiled=False``, warn that # ``tile_size`` was ignored, and then write strips via ``_write`` -- # silently producing a file that violates the stable COG contract. # Reject the combination at the public boundary with the same # actionable-error shape as the other COG input gates: the message # names the violated constraint and lists both fixes the caller can # apply in one line. The defense-in-depth gate in ``_writer._write`` # catches direct callers that bypass this wrapper. if cog and not tiled: raise ValueError(_COG_REQUIRES_TILED_MSG) # tile_size only applies to tiled output; warn if the caller passed a # non-default size alongside strip mode (it would otherwise be silently # ignored). The VRT path always tiles, so the warning would be # misleading there -- the VRT branch below rejects tiled=False up front # instead. The ``cog=True, tiled=False`` arm of this warning is dead # under the cog/tiled gate above (that combination raises before # reaching this line), so the condition below only fires for ``cog=False, # tiled=False, tile_size != 256``. if not tiled and tile_size != 256 and not _is_vrt_path: warnings.warn( f"tile_size={tile_size} is ignored when tiled=False " "(strip layout). Pass tiled=True to use tile_size, or drop " "tile_size to silence this warning.", stacklevel=2, ) # VRT tiled output (string paths only -- VRT writes a real .vrt file # plus per-tile GeoTIFFs to a directory) if _is_vrt_path: if not tiled: raise ValueError( "tiled=False is not compatible with VRT output. " "VRT writes a directory of tiled GeoTIFFs; pass " "tiled=True or omit it.") # The early ``if tiled: _validate_tile_size_arg(tile_size)`` check # above already validates tile_size when tiled=True, but call it # here as well so the VRT path stays self-contained against future # changes to the early-validation gate (no-op on re-entry; either # raises or returns). _validate_tile_size_arg(tile_size) if cog: raise ValueError( "cog=True is not compatible with VRT output. " "VRT writes tiled GeoTIFFs, not a single COG.") if overview_levels is not None: raise ValueError( "overview_levels is not compatible with VRT output. " "VRT tiles do not include overviews.") _write_vrt_tiled(data, path, crs=crs, nodata=nodata, compression=compression, compression_level=compression_level, tile_size=tile_size, predictor=predictor, bigtiff=bigtiff, max_z_error=max_z_error, photometric=photometric, allow_unparseable_crs=allow_unparseable_crs, allow_internal_only_jpeg=allow_internal_only_jpeg, drop_rotation=drop_rotation) return path # Dispatch to write_geotiff_gpu when GPU was selected (explicit # ``gpu=True`` or auto-detected CuPy data). ``auto_detected_gpu`` # and ``use_gpu`` were computed above to gate the JPEG opt-in # warning; reuse them so the call sites stay in sync. if use_gpu and _path_is_file_like: # write_geotiff_gpu's nvCOMP path materialises tile parts and then # calls _write_bytes(path), which would write at the buffer's # current cursor without truncating. More importantly, the GPU # path was never tested with file-like destinations; refuse rather # than silently produce something untested. raise ValueError( "gpu=True is not supported for file-like destinations. " "Pass a string path (or set gpu=False).") if use_gpu: # GPU writer uses nvCOMP and does not support LERC; refuse rather # than silently dropping the requested error budget. if max_z_error != 0: raise ValueError( "max_z_error is not supported on the GPU writer " "(nvCOMP has no LERC backend). Use the CPU path " "(gpu=False) or omit max_z_error.") # Strip output is not implemented on the GPU path; reject up # front rather than silently producing a tiled file. if not tiled: raise ValueError( "tiled=False is not supported on the GPU writer. " "Pass gpu=False or omit tiled=False.") try: write_geotiff_gpu( data, path, crs=crs, nodata=nodata, compression=compression, compression_level=compression_level, tiled=tiled, tile_size=tile_size, predictor=predictor, cog=cog, overview_levels=overview_levels, overview_resampling=overview_resampling, bigtiff=bigtiff, streaming_buffer_bytes=streaming_buffer_bytes, photometric=photometric, allow_internal_only_jpeg=allow_internal_only_jpeg, allow_experimental_codecs=allow_experimental_codecs, allow_unparseable_crs=allow_unparseable_crs, drop_rotation=drop_rotation, ) return path except ImportError as e: # ``write_geotiff_gpu`` raises ImportError when cupy itself # can't be imported. nvCOMP absence doesn't surface here: # ``_try_nvcomp_from_device_bufs`` returns None when the # library can't load, and the writer drops to CPU # compression internally instead of re-raising. Fall back # to the CPU writer with a typed warning so callers see # that gpu=True (or auto-detected CuPy data) didn't go # through. Strict mode re-raises so CI can fail loudly on # missing GPU stacks. if _geotiff_strict_mode(): raise warnings.warn( _gpu_fallback_warning_message(auto_detected_gpu, e), GeoTIFFFallbackWarning, stacklevel=2, ) except RuntimeError as e: # Only fall back when the message names a GPU-availability # problem; any other RuntimeError is a real bug in the GPU # writer and the broad ``except (ImportError, Exception)`` # used to hide it from the user. Keep the keyword list # tight: nvCOMP / CUDA / no device / no GPU / cuInit cover # the realistic "no GPU present" failure modes without # masking, e.g., a CRS or compression error that happens to # raise RuntimeError. Strict mode re-raises in either case. _gpu_unavail_tokens = ( 'nvcomp', 'cuda', 'no device', 'no gpu', 'cuinit', ) msg = str(e).lower() if not any(tok in msg for tok in _gpu_unavail_tokens): raise if _geotiff_strict_mode(): raise warnings.warn( _gpu_fallback_warning_message(auto_detected_gpu, e), GeoTIFFFallbackWarning, stacklevel=2, ) geo_transform = None epsg = None wkt_fallback = None # WKT string when EPSG is not available raster_type = RASTER_PIXEL_IS_AREA x_res = None y_res = None res_unit = None gdal_meta_xml = None extra_tags_list = None # Default for the NaN-sentinel restore gate. Overwritten with the # actual ``attrs['masked_nodata']`` value below when ``data`` is a # DataArray; bare numpy / cupy positional inputs have no attrs # so they keep the NaN->sentinel rewrite. restore_sentinel = True # Resolve crs argument: can be int (EPSG) or str (WKT/PROJ). # ``numbers.Integral`` covers plain ``int`` and numpy integer scalars # (``np.int32``, ``np.int64``); ``_validate_crs_arg`` already rejects # bool. Coerce to plain ``int`` so downstream ``epsg`` is a real int. _validate_crs_arg(crs) if isinstance(crs, numbers.Integral): epsg = int(crs) elif isinstance(crs, str): epsg = _wkt_to_epsg(crs) # try to extract EPSG from WKT/PROJ if epsg is None: wkt_fallback = crs if isinstance(data, xr.DataArray): raw = data.data # Extract metadata from DataArray attrs (no materialisation needed). # Resolve the affine through the centralised resolver so this # writer, the GPU writer, and the per-tile VRT path share # the same precedence rule: prefer ``attrs['transform']`` (which # round-trips bit-exactly) over a coord-derived transform (which # drifts on fractional pixel sizes because ``x[1] - x[0]`` is # computed in float64 from already-rounded coords). if geo_transform is None: geo_transform = _resolve_georef(data).transform # Fail closed when coords are present but no transform could be # derived (e.g. 1x1 without ``attrs['transform']``) instead of # silently writing a non-georeferenced TIFF that round-trips back # with integer pixel coords. _require_transform_for_georeferenced(data, geo_transform) if epsg is None and crs is None: crs_attr = data.attrs.get('crs') if isinstance(crs_attr, str): epsg = _wkt_to_epsg(crs_attr) if epsg is None and wkt_fallback is None: wkt_fallback = crs_attr elif crs_attr is not None: # Same gate as the kwarg path: reject bool / non-int # types and confirm the EPSG resolves before writing it # to disk. Without this, ``attrs={'crs': True}`` round- # trips as EPSG=1. _validate_crs_arg(crs_attr) epsg = int(crs_attr) if epsg is None: wkt = data.attrs.get('crs_wkt') if isinstance(wkt, str): epsg = _wkt_to_epsg(wkt) if epsg is None and wkt_fallback is None: wkt_fallback = wkt if nodata is None: nodata = _resolve_nodata_attr(data.attrs) # ``attrs['masked_nodata']`` records whether the reader promoted # the sentinel to NaN. Writers reverse that rewrite only when the # read side did the forward step. Default (attr absent) is True, # which preserves behaviour for external DataArrays. restore_sentinel = _should_restore_nan_sentinel(data.attrs) # Pull raster_type, gdal_metadata_xml, extra_tags (folded with # the friendly image_description / extra_samples / colormap # attrs), x/y_resolution, and resolution_unit via the shared # helper so all three writers stay in lockstep. _rich = _extract_rich_tags(data.attrs) raster_type = _rich['raster_type'] gdal_meta_xml = _rich['gdal_metadata_xml'] extra_tags_list = _rich['extra_tags'] x_res = _rich['x_resolution'] y_res = _rich['y_resolution'] res_unit = _rich['resolution_unit'] # Dask-backed: stream tiles to avoid materialising the full array. # COG requires overviews from the full array, so it falls through # to the eager path. Streaming write needs a real filesystem path # (it builds a temp file then atomic-renames); for file-like # destinations we materialise eagerly and assemble in-memory. if hasattr(raw, 'dask') and not cog and not _path_is_file_like: dask_arr = raw # Reject ambiguous 3D layouts at the entry point so a leading # non-band dim like ``('time', 'y', 'x')`` cannot silently # round-trip as a TIFF whose ``y`` axis carries the time # values. if raw.ndim == 3: _validate_3d_writer_dims(data.dims) # Handle band-first dimension order (band, y, x) -> (y, x, band) if raw.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: import dask.array as da dask_arr = da.moveaxis(raw, 0, -1) if dask_arr.ndim not in (2, 3): raise ValueError( f"Expected 2D or 3D array, got {dask_arr.ndim}D") # Validate compression_level if compression_level is not None: level_range = _LEVEL_RANGES.get(compression.lower()) if level_range is not None: lo, hi = level_range if not (lo <= compression_level <= hi): raise ValueError( f"compression_level={compression_level} out of " f"range for {compression} (valid: {lo}-{hi})") from .._writer import write_streaming # Refuse to write an unvalidatable CRS string into # GTCitationGeoKey unless the caller opts in. ``epsg`` is # set when ``_wkt_to_epsg`` succeeded; only the fallback # branch needs the guard. if epsg is None: _validate_crs_fallback(wkt_fallback, allow_unparseable_crs) write_streaming( dask_arr, path, geo_transform=geo_transform, crs_epsg=epsg, crs_wkt=wkt_fallback if epsg is None else None, nodata=nodata, compression=compression, compression_level=compression_level, tiled=tiled, tile_size=tile_size, predictor=predictor, raster_type=raster_type, x_resolution=x_res, y_resolution=y_res, resolution_unit=res_unit, gdal_metadata_xml=gdal_meta_xml, extra_tags=extra_tags_list, bigtiff=bigtiff, streaming_buffer_bytes=streaming_buffer_bytes, max_z_error=max_z_error, photometric=photometric, restore_sentinel=restore_sentinel, # ``to_geotiff`` ran the JPEG opt-in and the CRS # fallback gates upstream; forwarding the kwargs lets # ``_write_streaming``'s push-down check stay aligned # rather than rejecting input the wrapper accepted. allow_internal_only_jpeg=allow_internal_only_jpeg, allow_unparseable_crs=allow_unparseable_crs, ) return path # Eager compute (numpy, CuPy, or dask+COG) if hasattr(raw, 'get'): arr = raw.get() # CuPy -> numpy elif hasattr(raw, 'compute'): arr = raw.compute() # Dask -> numpy if hasattr(arr, 'get'): arr = arr.get() # Dask+CuPy -> numpy else: arr = np.asarray(raw) # Reject ambiguous 3D layouts. The validator runs # on ``data.dims`` (the original DataArray's dim names) rather # than on ``arr`` so the error fires before the move-axis even # for COG and file-like destinations that fall through here. if arr.ndim == 3: _validate_3d_writer_dims(data.dims) # Handle band-first dimension order (band, y, x) -> (y, x, band) if arr.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: arr = np.moveaxis(arr, 0, -1) else: if hasattr(data, 'get'): arr = data.get() # CuPy -> numpy else: arr = np.asarray(data) if arr.ndim not in (2, 3): raise ValueError(f"Expected 2D or 3D array, got {arr.ndim}D") # Auto-promote unsupported dtypes if arr.dtype == np.float16: arr = arr.astype(np.float32) elif arr.dtype == np.bool_: arr = arr.astype(np.uint8) # Restore NaN pixels to the nodata sentinel value so the written file # has sentinel values matching the GDAL_NODATA tag. # # The defensive ``arr.copy()`` here is intentional: ``arr`` may be # ``np.asarray(raw)`` of a caller-owned numpy DataArray (a view, # not a copy) or ``np.moveaxis(arr, 0, -1)`` of one (also a view). # Mutating without a copy would corrupt the user's input buffer. # # ``NodataLifecycle.writer_restore_sentinel`` owns the "should we # restore?" decision (no declared sentinel, non-float buffer, NaN # sentinel all collapse to False there). The ``restore_sentinel`` # local was resolved upstream from ``attrs['masked_nodata']`` and # is passed through the helper's ``restore_sentinel=`` kwarg as a # single bool. if _NL(declared=nodata, dtype_in=arr.dtype).writer_restore_sentinel( buffer_dtype=arr.dtype, restore_sentinel=restore_sentinel, ): nan_mask = np.isnan(arr) if nan_mask.any(): arr = arr.copy() arr[nan_mask] = arr.dtype.type(nodata) # Validate compression_level against codec-specific range if compression_level is not None: level_range = _LEVEL_RANGES.get(compression.lower()) if level_range is not None: lo, hi = level_range if not (lo <= compression_level <= hi): raise ValueError( f"compression_level={compression_level} out of range " f"for {compression} (valid: {lo}-{hi})") # Refuse to write an unvalidatable CRS string into GTCitationGeoKey # unless the caller opts in. if epsg is None: _validate_crs_fallback(wkt_fallback, allow_unparseable_crs) write( arr, path, geo_transform=geo_transform, crs_epsg=epsg, crs_wkt=wkt_fallback if epsg is None else None, nodata=nodata, compression=compression, compression_level=compression_level, tiled=tiled, tile_size=tile_size, predictor=predictor, cog=cog, overview_levels=overview_levels, overview_resampling=overview_resampling, raster_type=raster_type, x_resolution=x_res, y_resolution=y_res, resolution_unit=res_unit, gdal_metadata_xml=gdal_meta_xml, extra_tags=extra_tags_list, bigtiff=bigtiff, max_z_error=max_z_error, photometric=photometric, restore_sentinel=restore_sentinel, # ``to_geotiff`` ran the JPEG opt-in and the CRS fallback # gates upstream; forwarding the kwargs keeps ``_write``'s # push-down check from rejecting input the wrapper accepted. allow_internal_only_jpeg=allow_internal_only_jpeg, allow_unparseable_crs=allow_unparseable_crs, ) return path
def _write_single_tile(chunk_data, path, geo_transform, epsg, wkt, nodata, compression, compression_level, tile_size, predictor, bigtiff, max_z_error: float = 0.0, raster_type: int = RASTER_PIXEL_IS_AREA, x_resolution=None, y_resolution=None, resolution_unit=None, gdal_metadata_xml=None, extra_tags=None, photometric: str | int = 'auto', restore_sentinel: bool = True, allow_internal_only_jpeg: bool = False, allow_unparseable_crs: bool = False): """Write a single tile GeoTIFF. Used by _write_vrt_tiled. Forwards the same rich-tag set that ``to_geotiff`` passes through to ``write`` (raster_type, x/y resolution, GDAL metadata, extra tags) so every per-tile file under a VRT carries the same metadata it would have received from a single-file ``to_geotiff(..., out.tif)`` write. Without this, ``to_geotiff(da, "out.vrt")`` silently drops everything except the per-tile geo_transform / crs / nodata. """ if hasattr(chunk_data, 'compute'): chunk_data = chunk_data.compute() if hasattr(chunk_data, 'get'): chunk_data = chunk_data.get() # CuPy -> numpy arr = np.asarray(chunk_data) # Auto-promote unsupported dtypes if arr.dtype == np.float16: arr = arr.astype(np.float32) elif arr.dtype == np.bool_: arr = arr.astype(np.uint8) # Restore NaN to nodata sentinel. # # The defensive ``arr.copy()`` here is intentional: ``arr`` came # from ``np.asarray(chunk_data)`` where ``chunk_data`` may be a # caller-owned numpy buffer. Mutating without a copy would corrupt # the user's input. # # ``NodataLifecycle.writer_restore_sentinel`` owns the gate; see # the matching block in ``to_geotiff`` above. if _NL(declared=nodata, dtype_in=arr.dtype).writer_restore_sentinel( buffer_dtype=arr.dtype, restore_sentinel=restore_sentinel, ): nan_mask = np.isnan(arr) if nan_mask.any(): arr = arr.copy() arr[nan_mask] = arr.dtype.type(nodata) write(arr, path, geo_transform=geo_transform, crs_epsg=epsg, crs_wkt=wkt if epsg is None else None, nodata=nodata, compression=compression, tiled=True, tile_size=tile_size, predictor=predictor, compression_level=compression_level, raster_type=raster_type, x_resolution=x_resolution, y_resolution=y_resolution, resolution_unit=resolution_unit, gdal_metadata_xml=gdal_metadata_xml, extra_tags=extra_tags, bigtiff=bigtiff, max_z_error=max_z_error, photometric=photometric, restore_sentinel=restore_sentinel, # Forward the JPEG / CRS-fallback opt-ins so the per-tile # write does not re-trip the push-down gate ``to_geotiff`` # / ``_write_vrt_tiled`` already cleared upstream. allow_internal_only_jpeg=allow_internal_only_jpeg, allow_unparseable_crs=allow_unparseable_crs) def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None, compression='zstd', compression_level=None, tile_size=256, predictor: bool | int = False, bigtiff=None, max_z_error: float = 0.0, photometric: str | int = 'auto', allow_unparseable_crs: bool = False, allow_internal_only_jpeg: bool = False, drop_rotation: bool = False): """Write a DataArray as a directory of tiled GeoTIFFs with a VRT index. This enables streaming dask arrays to disk without materializing the full array in RAM. """ _validate_nodata_arg(nodata) # Mirror to_geotiff's rotated_affine gate so the VRT path rejects # rotated inputs at the same boundary. ``to_geotiff`` already ran # the check upstream when the caller reached this branch through # the dispatcher, but a direct call to ``_write_vrt_tiled`` would # otherwise bypass the gate. ``entry_point`` names the private helper # so a direct caller sees the function actually running the check; # the public ``to_geotiff`` dispatch path raised at its own gate # before reaching this point so the helper-name surface is only # visible to direct callers. _drop_rotation_attrs = getattr(data, 'attrs', None) or {} _validate_no_rotated_affine( _drop_rotation_attrs, drop_rotation=drop_rotation, entry_point="_write_vrt_tiled", ) # Ambiguous-metadata checks; mirrors the call in ``to_geotiff`` so # the dask-VRT write path enforces the same crs/crs_wkt / nodata / # coord rules. Alias resolution keeps the validator consistent # across both entry points. _attrs = getattr(data, 'attrs', None) or {} _coord_y, _coord_x = _resolve_spatial_coords(data) validate_write_metadata({ 'crs_kwarg': crs, 'attrs_crs': _attrs.get('crs'), 'attrs_crs_wkt': _attrs.get('crs_wkt'), 'nodata_kwarg': nodata, 'attrs_nodata': _attrs.get('nodata'), 'attrs_nodatavals': _attrs.get('nodatavals'), 'coord_y': _coord_y, 'coord_x': _coord_x, 'no_georef_marker': _has_no_georef_marker(data), }) # Validate compression_level against codec-specific range if compression_level is not None: level_range = _LEVEL_RANGES.get(compression.lower()) if level_range is not None: lo, hi = level_range if not (lo <= compression_level <= hi): raise ValueError( f"compression_level={compression_level} out of range " f"for {compression} (valid: {lo}-{hi})") # Derive tiles directory from VRT path stem vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) stem = os.path.splitext(os.path.basename(vrt_path))[0] tiles_dir_name = stem + '_tiles' tiles_dir = os.path.join(vrt_dir, tiles_dir_name) # Validate tiles directory. A non-empty ``tiles_dir`` is treated as a # prior completed write and refused. Partial output from a *failed* # write never reaches this name -- tiles are staged in a temp dir and # only renamed into place once every tile is written (see below), so a # retry after a failure starts from a clean slate. if os.path.isdir(tiles_dir) and os.listdir(tiles_dir): raise FileExistsError( f"Tiles directory already contains files: {tiles_dir}") # Stage tiles in a temp directory next to the final one. Same parent # filesystem keeps the final ``os.replace`` atomic. The unique suffix # avoids collisions between concurrent writers targeting the same VRT. staging_dir = os.path.join( vrt_dir, f'{tiles_dir_name}.tmp-{uuid.uuid4().hex}') os.makedirs(staging_dir, exist_ok=True) # Resolve CRS. ``numbers.Integral`` covers numpy integer scalars # (``np.int32``, ``np.int64``) so ``crs=np.int64(4326)`` does not # silently fall through to ``epsg=None``. Validator already # rejects bool. _validate_crs_arg(crs) epsg = None wkt_fallback = None if isinstance(crs, numbers.Integral): epsg = int(crs) elif isinstance(crs, str): epsg = _wkt_to_epsg(crs) if epsg is None: wkt_fallback = crs geo_transform = None raster_type = RASTER_PIXEL_IS_AREA x_res = None y_res = None res_unit = None gdal_meta_xml = None extra_tags_list = None if isinstance(data, xr.DataArray): raw = data.data if epsg is None and crs is None: crs_attr = data.attrs.get('crs') if isinstance(crs_attr, str): epsg = _wkt_to_epsg(crs_attr) if epsg is None and wkt_fallback is None: wkt_fallback = crs_attr elif crs_attr is not None: # Same gate as the kwarg path: reject bool / non-int # types and confirm the EPSG resolves before writing it # to disk. Without this, ``attrs={'crs': True}`` round- # trips as EPSG=1. _validate_crs_arg(crs_attr) epsg = int(crs_attr) if epsg is None: wkt = data.attrs.get('crs_wkt') if isinstance(wkt, str): epsg = _wkt_to_epsg(wkt) if epsg is None and wkt_fallback is None: wkt_fallback = wkt if nodata is None: # Use the same alias-aware resolver that to_geotiff / # write_geotiff_gpu apply so a rioxarray-style DataArray # (``attrs['nodatavals']``) or a CF-style one # (``attrs['_FillValue']``) round-trips through ``.vrt`` # the same way it does through ``.tif``. Using # ``attrs.get('nodata')`` directly would silently drop both # aliases. nodata = _resolve_nodata_attr(data.attrs) # Mirror the ``to_geotiff`` gate so per-tile writes only # NaN-rewrite when the read side promoted the sentinel. See # ``_should_restore_nan_sentinel`` for the semantics; default # True keeps existing behaviour. restore_sentinel = _should_restore_nan_sentinel(data.attrs) # Resolve via the centralised resolver. Same precedence as # ``to_geotiff``: attrs['transform'] wins over coord-derived. geo_transform = _resolve_georef(data).transform # Match the to_geotiff fail-closed guard so VRT writes don't # silently produce non-georeferenced tiles either. _require_transform_for_georeferenced(data, geo_transform) # Pull the same rich-tag set that to_geotiff forwards to # ``write`` so per-tile files under the VRT carry it too. _rich = _extract_rich_tags(data.attrs) raster_type = _rich['raster_type'] gdal_meta_xml = _rich['gdal_metadata_xml'] extra_tags_list = _rich['extra_tags'] x_res = _rich['x_resolution'] y_res = _rich['y_resolution'] res_unit = _rich['resolution_unit'] else: raw = data # Refuse to write an unvalidatable CRS string into the per-tile # GTCitationGeoKey fields unless the caller opts in. # ``_write_single_tile`` forwards ``wkt_fallback`` to the geokey # builder once per tile, so validate once up front rather than # per-tile. if epsg is None: _validate_crs_fallback(wkt_fallback, allow_unparseable_crs) # Check for dask backing is_dask = hasattr(raw, 'dask') if is_dask: if raw.ndim != 2: raise ValueError( "VRT tiled output currently supports 2D arrays only, " f"got {raw.ndim}D. Squeeze or select a band first.") # Use dask chunk grid import dask row_chunks = raw.chunks[0] # tuple of chunk sizes along y col_chunks = raw.chunks[1] # tuple of chunk sizes along x n_row_tiles = len(row_chunks) n_col_tiles = len(col_chunks) else: # Numpy: tile using tile_size if hasattr(raw, 'get'): np_arr = raw.get() # CuPy elif hasattr(raw, 'compute'): np_arr = raw.compute() else: np_arr = np.asarray(raw) if np_arr.ndim != 2: raise ValueError( "VRT tiled output currently supports 2D arrays only, " f"got {np_arr.ndim}D. Squeeze or select a band first.") height, width = np_arr.shape[:2] n_row_tiles = (height + tile_size - 1) // tile_size n_col_tiles = (width + tile_size - 1) // tile_size # Zero-padding width for tile names pad_width = max(2, len(str(max(n_row_tiles, n_col_tiles) - 1))) # Tiles are written into ``staging_dir``; ``tile_names`` records the # bare filenames so the final tile paths (under ``tiles_dir``) can be # rebuilt for the VRT index after the atomic rename below. tile_names = [] delayed_tasks = [] def _cleanup_staging(): shutil.rmtree(staging_dir, ignore_errors=True) def _safe_write_tile(*args, **kwargs): # Return the exception instead of raising so a failure in one # threaded dask task does not abort ``dask.compute`` while sibling # threads are still writing into the staging dir. The caller raises # the first captured failure after every task has settled. try: _write_single_tile(*args, **kwargs) except BaseException as exc: # noqa: BLE001 - re-raised by caller return exc return None try: row_offset = 0 for ri in range(n_row_tiles): if is_dask: chunk_h = row_chunks[ri] else: chunk_h = min(tile_size, height - row_offset) col_offset = 0 for ci in range(n_col_tiles): if is_dask: chunk_w = col_chunks[ci] else: chunk_w = min(tile_size, width - col_offset) tile_name = f'tile_{ri:0{pad_width}d}_{ci:0{pad_width}d}.tif' tile_path = os.path.join(staging_dir, tile_name) tile_names.append(tile_name) # Compute per-tile geo_transform tile_gt = None if geo_transform is not None: tile_gt = GeoTransform( origin_x=geo_transform.origin_x + col_offset * geo_transform.pixel_width, origin_y=geo_transform.origin_y + row_offset * geo_transform.pixel_height, pixel_width=geo_transform.pixel_width, pixel_height=geo_transform.pixel_height, ) if is_dask: # Slice the dask array for this chunk r_end = row_offset + chunk_h c_end = col_offset + chunk_w chunk_data = raw[row_offset:r_end, col_offset:c_end] task = dask.delayed(_safe_write_tile)( chunk_data, tile_path, tile_gt, epsg, wkt_fallback, nodata, compression, compression_level, tile_size, predictor, bigtiff, max_z_error, raster_type=raster_type, x_resolution=x_res, y_resolution=y_res, resolution_unit=res_unit, gdal_metadata_xml=gdal_meta_xml, extra_tags=extra_tags_list, photometric=photometric, restore_sentinel=restore_sentinel, allow_internal_only_jpeg=allow_internal_only_jpeg, allow_unparseable_crs=allow_unparseable_crs) delayed_tasks.append(task) else: # Numpy: slice and write directly chunk_data = np_arr[row_offset:row_offset + chunk_h, col_offset:col_offset + chunk_w] _write_single_tile( chunk_data, tile_path, tile_gt, epsg, wkt_fallback, nodata, compression, compression_level, tile_size, predictor, bigtiff, max_z_error, raster_type=raster_type, x_resolution=x_res, y_resolution=y_res, resolution_unit=res_unit, gdal_metadata_xml=gdal_meta_xml, extra_tags=extra_tags_list, photometric=photometric, restore_sentinel=restore_sentinel, allow_internal_only_jpeg=allow_internal_only_jpeg, allow_unparseable_crs=allow_unparseable_crs) col_offset += chunk_w row_offset += chunk_h # Execute all dask tasks. # # Each delayed task is an independent ``_write_single_tile`` call on # a distinct output path, with no shared mutable Python state, so # the writes are embarrassingly parallel. Using ``scheduler='threads'`` # lets zlib / zstd / LZW release the GIL during compression and the # OS coalesce concurrent writes; in a 256-tile zstd write on a # 4096x4096 dask DataArray the wall time drops ~33% versus the # ``synchronous`` scheduler this used to call. # # ``dask.compute`` raises as soon as one task errors, while sibling # worker threads may still be writing tiles into the staging dir. # Cleaning up at that point races those threads and can leave a # partial staging dir behind. Capture each task's outcome instead # so every worker reaches a barrier before we inspect results, then # re-raise the first failure once no thread is still writing. if delayed_tasks: import dask results = dask.compute(*delayed_tasks, scheduler='threads') for r in results: if isinstance(r, BaseException): raise r except BaseException: # Any tile failure leaves no partial output: drop the staging dir # so the final ``tiles_dir`` name is never created and a retry # starts clean. _cleanup_staging() raise # Every tile is written. Promote the staging dir to its final name in # one atomic rename so the VRT only ever references a complete tile set. # A pre-existing *empty* ``tiles_dir`` passes the leftover-state guard # above (the old code reused it via ``makedirs(exist_ok=True)``), but # ``os.replace`` onto an existing directory raises on Windows even when # the target is empty. Drop the empty target first so the rename has a # clear slot on every platform. try: if os.path.isdir(tiles_dir): os.rmdir(tiles_dir) # only succeeds if empty; guard ensured that os.replace(staging_dir, tiles_dir) except BaseException: _cleanup_staging() raise # Write VRT index with relative paths. The VRT lives at ``vrt_path``; # tile paths now resolve under the final ``tiles_dir``. tile_paths = [os.path.join(tiles_dir, name) for name in tile_names] from .._vrt import write_vrt as _write_vrt_fn try: _write_vrt_fn(vrt_path, tile_paths, relative=True, nodata=nodata) except BaseException: # The index step failed after the rename. Remove the now-renamed # tile dir too so a retry is not blocked by the leftover-state # guard above. shutil.rmtree(tiles_dir, ignore_errors=True) raise