Source code for wf_psf.data.data_utils

"""
Data utilities and lightweight runtime data structures.

Provides lightweight dataset containers, runtime conversion contexts,
and helper utilities used throughout the dataset normalization and
preprocessing pipeline.

This module includes:

- Dictionary-like dataset container abstractions
- Dataset inspection and normalization helpers
- Runtime conversion context objects used during field-level processing
- Domain-specific preprocessing contexts (e.g. SED processing)

These utilities support schema-driven dataset conversion workflows used
by training, validation, and inference pipelines.

Notes
-----
The conversion context system is intentionally extensible to support
additional scientific domains and instrument pipelines beyond the
current Euclid-specific workflows. Future extensions
may include dedicated contexts for:

- PSF modeling
- Instrument calibration
- Detector noise simulation

Authors
-------
Jennifer Pollack <jennifer.pollack@cea.fr>
"""

from __future__ import annotations
from collections.abc import MutableMapping
from dataclasses import dataclass, is_dataclass, fields
from typing import Any, Optional 

from wf_psf.sims.psf_simulator import PSFSimulator



[docs]
@dataclass(frozen=True)
class SEDContext:
    """
    Context object containing parameters required for SED processing.

    This context encapsulates the runtime dependencies needed for
    spectral energy distribution (SED) transformations within the
    dataset conversion pipeline.

    The dataclass is declared with ``frozen=True``: instances are
    immutable, and assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.

    Parameters
    ----------
    simPSF : PSFSimulator
        PSF simulator instance used during SED processing. This object
        is responsible for modelling instrument response effects applied
        to spectral data.
    n_bins_lambda : int
        Number of wavelength bins used for discretizing the SED during
        conversion.
    """

    # Both fields are required (no defaults) and must be supplied at construction.
    simPSF: PSFSimulator
    n_bins_lambda: int




[docs]
@dataclass(frozen=True)
class ConversionContext:
    """
    Global runtime context for dataset conversion operations.

    This object aggregates optional domain-specific contexts required
    during dataset preprocessing and conversion. It is passed through
    the conversion pipeline and accessed by field-specific handlers.

    Currently, it contains a single optional SED context used for
    spectral energy distribution processing. The dataclass is declared
    with ``frozen=True``, so instances are immutable once created.

    Design Note
    -----------
    This structure is intentionally extensible to support additional
    scientific domains beyond SED processing. Future extensions may
    include, for example:

    - PSFContext: instrument point spread function modeling
    - InstrumentContext: instrument-specific calibration and metadata
    - NoiseContext: detector noise or simulation noise models

    This design allows the framework to generalize beyond Euclid and
    support additional instruments or simulation pipelines without
    modifying the core converter logic.

    Parameters
    ----------
    seds : SEDContext or None, optional
        Optional context required for SED-related field processing.
        Defaults to None. If None, SED-dependent handlers should not
        be invoked.
    """

    # Defaults to None so a context can be built without any SED configuration.
    seds: SEDContext | None = None




[docs]
class DatasetContainer(MutableMapping):
    """
    Lightweight container for structured dataset data.

    Stores data internally as a dictionary, while providing
    dictionary-style and attribute-style access for convenience.

    Parameters
    ----------
    data : dict[str, Any]
        Dictionary containing dataset tensors and metadata. The dictionary
        is stored by reference (it is not copied), so changes made through
        the container are visible in the original dictionary and vice versa.

    Attributes
    ----------
    _data : dict[str, Any]
        Internal storage for dataset contents.

    Notes
    -----
    Attribute-style access is a read-only fallback: it is only consulted
    when normal attribute lookup fails. Keys that collide with existing
    attributes or mapping methods (for example ``keys``, ``items`` or
    ``get``) must therefore be read with item access. Assigning
    ``container.x = value`` creates a regular instance attribute and does
    not modify the stored data; use ``container["x"] = value`` instead.

    Examples
    --------
    >>> container = DatasetContainer({'x': np.array([1, 2, 3]), 'y': np.array([4, 5, 6])})
    >>> container['x']
    array([1, 2, 3])
    >>> container.x
    array([1, 2, 3])
    >>> container.to_dict()
    {'x': array([1, 2, 3]), 'y': array([4, 5, 6])}
    """

    def __init__(self, data: dict[str, Any]):
        self._data = data

    def __repr__(self) -> str:
        """Return a debug representation showing the wrapped dictionary."""
        return f"{type(self).__name__}({self._data!r})"

    def __getitem__(self, key: str) -> Any:
        """Return the value stored under `key`."""
        return self._data[key]

    def __setitem__(self, key: str, value: Any) -> None:
        """Store `value` at `key`."""
        self._data[key] = value

    def __delitem__(self, key: str) -> None:
        """Delete `key`."""
        del self._data[key]

    def __iter__(self):
        """Iterate over the stored keys."""
        return iter(self._data)

    def __len__(self) -> int:
        """Return the number of stored entries."""
        return len(self._data)

    def __getattr__(self, name: str) -> Any:
        """
        Return the value stored under key `name` (attribute-style read).

        Raises
        ------
        AttributeError
            If `name` is not a stored key, or if the instance has no
            storage yet (e.g. while it is being copied or unpickled).
        """
        # This hook only runs when normal attribute lookup has failed.
        # The storage is fetched from the instance ``__dict__`` rather than
        # via ``self._data``: on an instance whose ``_data`` is not set yet
        # (copy/pickle create the object without calling ``__init__`` and
        # then probe for ``__setstate__``), ``self._data`` would re-enter
        # this method and recurse until RecursionError.
        try:
            return self.__dict__["_data"][name]
        except KeyError:
            # Covers both a missing key and missing storage.
            raise AttributeError(name) from None

    def to_dict(self) -> dict[str, Any]:
        """Return the underlying dictionary (the same object, not a copy)."""
        return self._data





[docs]
def to_container(obj) -> Optional[DatasetContainer]:
    """Normalize a dataset-like object into a ``DatasetContainer``.

    The input is inspected in a fixed order and the first matching rule
    decides how the container is built:

    1. ``None`` and existing :class:`DatasetContainer` instances are
       returned unchanged.
    2. A ``dict`` is wrapped directly.
    3. A dataclass is converted to a new dictionary mapping each field
       name to its current value.
    4. Any other object exposing ``__dict__`` is wrapped through
       ``vars(obj)``.

    Parameters
    ----------
    obj : Any
        Object representing dataset data.

    Returns
    -------
    DatasetContainer or None
        Container wrapping the dataset data, or None when `obj` is None.

    Raises
    ------
    TypeError
        If the input type is not supported.

    Notes
    -----
    For dictionaries and ``__dict__``-based objects the container wraps
    the existing dictionary object itself rather than a copy; only the
    dataclass path builds a new dictionary.
    """
    # Pass-through cases: nothing to wrap.
    if obj is None or isinstance(obj, DatasetContainer):
        return obj

    # Resolve the dictionary to wrap; the order of the checks matters because
    # dataclass instances usually expose ``__dict__`` as well.
    if isinstance(obj, dict):
        payload = obj
    elif is_dataclass(obj):
        payload = {}
        for spec in fields(obj):
            payload[spec.name] = getattr(obj, spec.name)
    elif hasattr(obj, "__dict__"):
        payload = vars(obj)
    else:
        raise TypeError(f"Unsupported dataset type: {type(obj)}")

    return DatasetContainer(payload)