Source code for metatrain.utils.data.readers.readers

import importlib
import logging
from functools import lru_cache
from pathlib import Path
from types import ModuleType
from typing import Callable, Dict, List, Optional, Tuple

from metatensor.torch import TensorMap
from metatomic.torch import ModelCapabilities, ModelOutput, System
from omegaconf import DictConfig

from ..target_info import TargetInfo


# Names of the reader libraries; each name is resolved to a submodule of this package
# by ``_load_reader_module`` and is listed in error messages as an alternative.
AVAILABLE_READERS = ["ase", "metatensor"]
""":py:class:`list`: list containing all implemented reader libraries"""

# Fallback used when no explicit ``reader`` is given: the file suffix (with its
# leading dot, as returned by ``Path(...).suffix``) selects the reader library.
DEFAULT_READER = {
    ".xyz": "ase",
    ".extxyz": "ase",
    ".mts": "metatensor",
}
""":py:class:`dict`: mapping file extensions to a default reader"""

# Module-level logger named after this module.
logger = logging.getLogger(__name__)



[docs]
def read_systems(
    filename: str,
    reader: Optional[str] = None,
) -> List[System]:
    """Read system information from a file.

    :param filename: name of the file to read
    :param reader: reader library for parsing the file. If :py:obj:`None`, the
        library is determined from the file extension via ``DEFAULT_READER``.
    :return: list of :py:class:`System` objects read from ``filename``
    :raises ValueError: if no default reader is linked to the file extension, if the
        reader library cannot be loaded or does not provide ``read_systems``, or if
        the loaded systems are not in double precision
    """
    # Determine reader if not provided
    if reader is None:
        file_suffix = Path(filename).suffix
        try:
            reader = DEFAULT_READER[file_suffix]
        except KeyError:
            # The failed dict lookup is an implementation detail: suppress it so the
            # traceback only shows the user-facing error.
            raise ValueError(
                f"File extension {file_suffix!r} is not linked to a default reader "
                "library. You can try reading it by setting a specific 'reader' from "
                f"the known ones: {', '.join(AVAILABLE_READERS)} "
            ) from None

    module = _load_reader_module(reader)

    # Fetch and call read_systems
    try:
        reader_fn = module.read_systems
    except AttributeError as e:
        # note the trailing space: the two literals are concatenated into one message
        raise ValueError(
            f"Reader library {reader!r} cannot read systems. "
            f"You can try with other readers: {AVAILABLE_READERS}"
        ) from e

    systems = reader_fn(filename)

    # elements in data are `torch.ScriptObject`s and their `dtype` is an integer.
    # A C++ double/torch.float64 is `7` according to
    # https://github.com/pytorch/pytorch/blob/207564bab1c4fe42750931765734ee604032fb69/c10/core/ScalarType.h#L54-L93
    if not all(s.dtype == 7 for s in systems):
        raise ValueError("The loaded systems are not in double precision.")

    return systems




[docs]
def read_targets(
    conf: DictConfig,
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """Read every target described in a fully expanded config.

    A suitable config can be obtained with :func:`expand_dataset_config
    <metatrain.utils.omegaconf.expand_dataset_config>`. All targets are stored in
    double precision; data in any other precision is rejected.

    Each entry is dispatched to a function of the selected reader library: entries
    describing a single, per-structure scalar ``energy`` are parsed by ``read_energy``,
    every other entry by ``read_generic``. Within an ``energy`` section, gradients such
    as ``forces``, the ``stress`` or the ``virial`` can be added.

    :param conf: config containing the keys for what should be read.
    :return: Dictionary containing a list of TensorMaps for each target section in the
        config as well as a ``Dict[str, TargetInfo]`` object containing the metadata of
        the targets.

    :raises ValueError: if the target name is not valid. Valid target names are those
        that either start with ``mtt::`` or those that are in the list of standard
        outputs of ``metatomic`` (see
        https://docs.metatensor.org/metatomic/latest/outputs/)
    """
    # positional order follows ``_read_conf_section(conf, decide_reader, validate_key)``
    loaded = _read_conf_section(conf, _decide_target_reader, _validate_target)
    return loaded



def read_extra_data(
    conf: DictConfig,
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """Read the extra data described in a fully expanded config.

    Works like :func:`read_targets`, except that the entry names are not validated and
    every entry is parsed with the ``read_generic`` function of its reader library.
    It is meant for auxiliary data that is not part of the main targets but may be
    useful for training or evaluation.

    :param conf: config containing the keys for what should be read.
    :return: Dictionary containing a list of TensorMaps for each extra data section in
        the config as well as a ``Dict[str, TargetInfo]`` object containing the metadata
        of the extra data.
    """
    # positional order follows ``_read_conf_section(conf, decide_reader, validate_key)``
    loaded = _read_conf_section(conf, _decide_generic_reader, _no_validate)
    return loaded


def _read_conf_section(
    conf: DictConfig,
    decide_reader: Callable[[str, DictConfig], str],
    validate_key: Callable[[str, DictConfig], None],
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """
    Generic loader for any DictConfig section (targets, extra_data, …).

    :param conf:          mapping of section names to entry configs
    :param decide_reader: callback(key, entry) -> either "energy" or "generic"
    :param validate_key:  callback(key, entry) -> None (may raise or log)
    :return: (data_dict, info_dict)
    :raises ValueError: on unsupported file types, readers, or dtype mismatch
    """
    data_dict: Dict[str, List[TensorMap]] = {}
    info_dict: Dict[str, TargetInfo] = {}

    for key, entry in conf.items():
        # section-specific key validation
        validate_key(key, entry)

        # decide which reader method to call
        reader_kind = decide_reader(key, entry)

        # resolve reader name (explicit or default via suffix)
        reader = entry.get("reader")
        filename = entry.get("read_from")
        if reader is None:
            suffix = Path(filename).suffix
            try:
                reader = DEFAULT_READER[suffix]
            except KeyError:
                raise ValueError(
                    f"File extension {suffix!r} has no default reader. "
                    f"Set 'reader' explicitly from: {AVAILABLE_READERS}"
                )

        module = _load_reader_module(reader)

        # fetch the appropriate read_* function
        method_name = f"read_{reader_kind}"
        try:
            reader_fn = getattr(module, method_name)
        except AttributeError as e:
            available = [m for m in dir(module) if m.startswith("read_")]
            raise ValueError(
                f"Reader {reader!r} has no method {method_name!r}. "
                f"Available methods: {available}"
            ) from e

        # execute reader and collect outputs
        tensormaps, info = reader_fn(key, entry)

        # enforce double precision (dtype == 7)
        if not all(t.dtype == 7 for t in tensormaps):
            raise ValueError(f"Data for '{key}' not in double precision (dtype==7).")

        data_dict[key] = tensormaps
        info_dict[key] = info

    return data_dict, info_dict


def _validate_target(key: str, entry: DictConfig) -> None:
    """Check that ``key`` is an acceptable target name.

    :param key: target name to validate
    :param entry: target config (unused, present to match the callback signature
        expected by ``_read_conf_section``)
    :raises ValueError: if ``ModelCapabilities`` rejects the name, or if the name uses
        the ``::`` notation without the ``mtt::`` prefix
    """
    # use `ModelCapabilities` to verify if `key` is valid
    try:
        _ = ModelCapabilities({key: ModelOutput()})
    except ValueError as e:
        # adjust error message for gradients
        if any(name in key.lower() for name in ("force", "virial", "stress")):
            # note the trailing space after the first sentence: adjacent literals are
            # concatenated into a single message
            message = (
                f"Target name '{key}' resembles to a gradient of `energies`. "
                "Gradient targets must be either specified as sub-entries of an "
                "`energy` quantity or if they are a direct target prefixed with "
                "non_conservative_<gradient>."
            )
        else:
            message = e.args[0]

        raise ValueError(message) from e

    # Require the exact ``mtt::`` prefix; a substring test would also accept
    # prefixes such as ``xmtt::`` that merely contain "mtt".
    if "::" in key and not key.startswith("mtt::"):
        raise ValueError(
            f"Target name '{key}' is not valid. Non-standard names "
            "(using '::' notation) are only allowed with the 'mtt::' prefix."
        )


def _decide_target_reader(key: str, entry: DictConfig) -> str:
    is_energy = (
        entry.get("quantity") == "energy"
        and not entry.get("per_atom", False)
        and entry.get("num_subtargets", 1) == 1
        and entry.get("type") == "scalar"
    )
    return "energy" if is_energy else "generic"


# Callbacks for "extra_data"
def _no_validate(key: str, entry: DictConfig) -> None:
    pass


def _decide_generic_reader(key: str, entry: DictConfig) -> str:
    """
    Return "generic" for any input.

    :param key: target name
    :param entry: target config
    :return: always "generic"
    """
    return "generic"


@lru_cache(maxsize=None)
def _load_reader_module(reader_name: str) -> ModuleType:
    """
    Import the reader submodule called ``reader_name`` and memoise the result.

    The module is looked up under ``metatrain.utils.data.readers``. Calls that raise
    are not stored by ``lru_cache``, so a failing name is retried on the next call.

    :param reader_name: Name of the reader module to load.
    :return: The imported module.
    :raises ValueError: if importing the module raises ``ImportError``
    """
    dotted_name = f"metatrain.utils.data.readers.{reader_name}"
    try:
        loaded = importlib.import_module(dotted_name)
    except ImportError as err:
        known = ", ".join(AVAILABLE_READERS)
        raise ValueError(
            f"Reader library {reader_name!r} not supported. Choose from {known}"
        ) from err
    return loaded