import importlib
import logging
from functools import lru_cache
from pathlib import Path
from types import ModuleType
from typing import Callable, Dict, List, Optional, Tuple
from metatensor.torch import TensorMap
from metatomic.torch import ModelCapabilities, ModelOutput, System
from omegaconf import DictConfig
from ..target_info import TargetInfo
# Names of the reader backends known to this module; they are listed in the
# error messages below so the user knows which values 'reader' accepts.
AVAILABLE_READERS = ["ase", "metatensor"]
""":py:class:`list`: list containing all implemented reader libraries"""
# Reader used when the caller does not name one. Keys are file suffixes
# including the leading dot, i.e. what ``Path(filename).suffix`` returns.
DEFAULT_READER = {
".xyz": "ase",
".extxyz": "ase",
".mts": "metatensor",
}
""":py:class:`dict`: mapping file extensions to a default reader"""
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def read_systems(
    filename: str,
    reader: Optional[str] = None,
) -> List[System]:
    """Read system information from a file.

    :param filename: name of the file to read
    :param reader: reader library for parsing the file. If :py:obj:`None`, the library
        is determined from the file extension.
    :return: list of :py:class:`System` objects read from the file.
    :raises ValueError: if no default reader is linked to the file extension, if the
        reader library cannot be loaded or cannot read systems, or if the data is not
        in double precision
    """
    # Determine reader if not provided
    if reader is None:
        file_suffix = Path(filename).suffix
        try:
            reader = DEFAULT_READER[file_suffix]
        except KeyError as e:
            raise ValueError(
                f"File extension {file_suffix!r} is not linked to a default reader "
                "library. You can try reading it by setting a specific 'reader' from "
                f"the known ones: {', '.join(AVAILABLE_READERS)} "
            ) from e

    module = _load_reader_module(reader)

    # Fetch and call read_systems
    try:
        reader_fn = module.read_systems
    except AttributeError as e:
        # Note the trailing space after the first sentence: the two literals are
        # concatenated into a single message.
        raise ValueError(
            f"Reader library {reader!r} cannot read systems. "
            f"You can try with other readers: {AVAILABLE_READERS}"
        ) from e

    systems = reader_fn(filename)

    # elements in data are `torch.ScriptObject`s and their `dtype` is an integer.
    # A C++ double/torch.float64 is `7` according to
    # https://github.com/pytorch/pytorch/blob/207564bab1c4fe42750931765734ee604032fb69/c10/core/ScalarType.h#L54-L93
    if not all(s.dtype == 7 for s in systems):
        raise ValueError("The loaded systems are not in double precision.")

    return systems
def read_targets(
    conf: DictConfig,
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """Read every target described by a fully expanded config.

    To get such a config you can use :func:`expand_dataset_config
    <metatrain.utils.omegaconf.expand_dataset_config>`.

    Each section name is first checked to be a valid target name. A section whose
    ``quantity`` is ``energy``, that is not per-atom, has a single sub-target and is of
    ``scalar`` type is parsed with the reader's ``read_energy`` function; every other
    section is parsed with ``read_generic``. The data that is read must be in double
    precision.

    :param conf: config containing the keys for what should be read.
    :return: Dictionary containing a list of TensorMaps for each target section in the
        config as well as a ``Dict[str, TargetInfo]`` object containing the metadata of
        the targets.
    :raises ValueError: if the target name is not valid. Valid target names are those
        that either start with ``mtt::`` or those that are in the list of standard
        outputs of ``metatomic`` (see
        https://docs.metatensor.org/metatomic/latest/outputs/)
    """
    targets, target_infos = _read_conf_section(
        conf,
        validate_key=_validate_target,
        decide_reader=_decide_target_reader,
    )
    return targets, target_infos
def read_extra_data(
    conf: DictConfig,
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """Read the extra data described by a fully expanded config.

    Works like :func:`read_targets`, except that the section names are not validated
    and every section is parsed with the reader's ``read_generic`` function. The data
    that is read must be in double precision.

    :param conf: config containing the keys for what should be read.
    :return: Dictionary containing a list of TensorMaps for each extra data section in
        the config as well as a ``Dict[str, TargetInfo]`` object containing the metadata
        of the extra data.
    """
    extra_data, extra_data_infos = _read_conf_section(
        conf,
        validate_key=_no_validate,
        decide_reader=_decide_generic_reader,
    )
    return extra_data, extra_data_infos
def _read_conf_section(
    conf: DictConfig,
    decide_reader: Callable[[str, DictConfig], str],
    validate_key: Callable[[str, DictConfig], None],
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """
    Generic loader for any DictConfig section (targets, extra_data, …).

    For every ``(key, entry)`` pair of ``conf`` the key is validated, a reader library
    is resolved (the entry's ``reader`` if set, otherwise the default for the suffix of
    the entry's ``read_from``), and the library's ``read_<kind>`` function is called
    with ``(key, entry)``.

    :param conf: mapping of section names to entry configs
    :param decide_reader: callback(key, entry) -> either "energy" or "generic"
    :param validate_key: callback(key, entry) -> None (may raise or log)
    :return: (data_dict, info_dict)
    :raises ValueError: on unsupported file types, readers, or dtype mismatch, and
        when neither ``reader`` nor ``read_from`` is set for an entry
    """
    data_dict: Dict[str, List[TensorMap]] = {}
    info_dict: Dict[str, TargetInfo] = {}

    for key, entry in conf.items():
        # section-specific key validation
        validate_key(key, entry)

        # decide which reader method to call
        reader_kind = decide_reader(key, entry)

        # resolve reader name (explicit or default via suffix)
        reader = entry.get("reader")
        if reader is None:
            filename = entry.get("read_from")
            if filename is None:
                # Without a file name there is no suffix to look up; fail with a
                # clear message instead of a TypeError from ``Path(None)``.
                raise ValueError(
                    f"Cannot choose a default reader for '{key}': 'read_from' is "
                    "not set. Set 'read_from', or set 'reader' explicitly from: "
                    f"{AVAILABLE_READERS}"
                )
            suffix = Path(filename).suffix
            try:
                reader = DEFAULT_READER[suffix]
            except KeyError as e:
                raise ValueError(
                    f"File extension {suffix!r} has no default reader. "
                    f"Set 'reader' explicitly from: {AVAILABLE_READERS}"
                ) from e

        module = _load_reader_module(reader)

        # fetch the appropriate read_* function
        method_name = f"read_{reader_kind}"
        try:
            reader_fn = getattr(module, method_name)
        except AttributeError as e:
            available = [m for m in dir(module) if m.startswith("read_")]
            raise ValueError(
                f"Reader {reader!r} has no method {method_name!r}. "
                f"Available methods: {available}"
            ) from e

        # execute reader and collect outputs
        tensormaps, info = reader_fn(key, entry)

        # enforce double precision (dtype == 7)
        if not all(t.dtype == 7 for t in tensormaps):
            raise ValueError(f"Data for '{key}' not in double precision (dtype==7).")

        data_dict[key] = tensormaps
        info_dict[key] = info

    return data_dict, info_dict
def _validate_target(key: str, entry: DictConfig) -> None:
    """
    Check that ``key`` is an acceptable target name.

    The name is first passed through ``ModelCapabilities``; names using the ``::``
    notation are additionally required to start with the ``mtt::`` prefix.

    :param key: target name
    :param entry: target config (unused, present to match the callback signature)
    :raises ValueError: if ``ModelCapabilities`` rejects the name, or if the name uses
        ``::`` without the ``mtt::`` prefix
    """
    # use `ModelCapabilities` to verify if `key` is valid
    try:
        ModelCapabilities({key: ModelOutput()})
    except ValueError as e:
        # adjust error message for gradients
        if any(name in key.lower() for name in ("force", "virial", "stress")):
            message = (
                f"Target name '{key}' resembles to a gradient of `energies`. "
                "Gradient targets must be either specified as sub-entries of an "
                "`energy` quantity or if they are a direct target prefixed with "
                "non_conservative_<gradient>."
            )
        else:
            message = e.args[0]
        raise ValueError(message) from e

    # The prefix must be exactly "mtt"; a substring test would also let names such
    # as "not_mtt::x" through.
    if "::" in key and not key.startswith("mtt::"):
        raise ValueError(
            f"Target name '{key}' is not valid. Non-standard names "
            "(using '::' notation) are only allowed with the 'mtt::' prefix."
        )
def _decide_target_reader(key: str, entry: DictConfig) -> str:
is_energy = (
entry.get("quantity") == "energy"
and not entry.get("per_atom", False)
and entry.get("num_subtargets", 1) == 1
and entry.get("type") == "scalar"
)
return "energy" if is_energy else "generic"
# Callbacks for "extra_data"
def _no_validate(key: str, entry: DictConfig) -> None:
pass
def _decide_generic_reader(key: str, entry: DictConfig) -> str:
"""
Return "generic" for any input.
:param key: target name
:param entry: target config
:return: always "generic"
"""
return "generic"
@lru_cache(maxsize=None)
def _load_reader_module(reader_name: str) -> ModuleType:
    """
    Import the reader module called ``reader_name`` and memoize the result.

    The module is looked up inside the ``metatrain.utils.data.readers`` package.

    :param reader_name: Name of the reader module to load.
    :return: The imported module.
    :raises ValueError: if importing the module raises :py:class:`ImportError`
    """
    dotted_path = f"metatrain.utils.data.readers.{reader_name}"
    try:
        loaded = importlib.import_module(dotted_path)
    except ImportError as err:
        known_readers = ", ".join(AVAILABLE_READERS)
        raise ValueError(
            f"Reader library {reader_name!r} not supported. Choose from "
            f"{known_readers}"
        ) from err
    else:
        return loaded