Source code for formulation_bench.dataset
import json
from pathlib import Path
from .download import download_dataset
from .problem import Problem
from .reformulation import Reformulation
[docs]
class Dataset:
"""FormulationBench dataset.
Parameters
----------
root : str or pathlib.Path
Path to the root directory containing the FormulationBench dataset. See
:doc:`/schema` for the expected directory structure.
Attributes
----------
root : pathlib.Path
Resolved absolute path to the dataset root.
problems : dict[int, Problem]
Mapping from integer problem ID (e.g., ``1`` for ``p1``) to :class:`Problem`.
reformulations : list[Reformulation]
List of all labelled reformulation pairs in the dataset.
Examples
--------
Load the dataset from a local ``./dataset`` directory::
>>> from formulation_bench import Dataset
>>> ds = Dataset("./dataset")
>>> ds
Dataset(root=..., n_problems=20, n_reformulations=96)
Access a specific problem and one of its formulations::
>>> p1 = ds.problems[1]
>>> p1.formulations["a"].valid
True
Iterate over labelled reformulations::
>>> pos = [r for r in ds.reformulations if r.is_reformulation]
>>> neg = [r for r in ds.reformulations if not r.is_reformulation]
>>> len(pos), len(neg)
(70, 26)
"""
def __init__(self, root: str | Path) -> None:
self.root = Path(root).resolve()
self._raw = json.loads((self.root / "dataset.json").read_text())
self.problems: dict[int, Problem] = {
pid: Problem(self.root / "problems" / f"p{pid}")
for pid in self._raw["problems"]
}
self.reformulations: list[Reformulation] = [
Reformulation(
a=self.problems[entry["a"]["problem"]].formulations[
entry["a"]["formulation"]
],
b=self.problems[entry["b"]["problem"]].formulations[
entry["b"]["formulation"]
],
is_reformulation=entry["reformulation"],
)
for entry in self._raw.get("reformulations", [])
]
[docs]
@classmethod
def load(
cls,
version: str | None = None,
cache_dir: str | Path | None = None,
force: bool = False,
) -> "Dataset":
"""Load the FormulationBench dataset, downloading it if necessary.
Thin wrapper around :func:`formulation_bench.download_dataset` that
downloads the dataset and constructs a :class:`Dataset`. See that
function for versioning and caching semantics.
Parameters
----------
version, cache_dir, force
Passed through to :func:`formulation_bench.download_dataset`.
Returns
-------
dataset : Dataset
The loaded dataset.
Examples
--------
Download the default version of the dataset (or load from cache)::
>>> from formulation_bench import Dataset
>>> ds = Dataset.load()
>>> sorted(ds.problems)[:5]
[1, 2, 3, 4, 5]
"""
root = download_dataset(version, cache_dir=cache_dir, force=force)
return cls(root)
def __repr__(self) -> str:
return (
f"Dataset(root={self.root!r},"
f" n_problems={len(self.problems)},"
f" n_reformulations={len(self.reformulations)})"
)