from pathlib import Path
from typing import Any, NamedTuple
import numpy as np
from lir.config.base import config_parser, pop_field
from lir.config.substitution import ContextAwareDict
from lir.data.models import DataProvider, FeatureData
[docs]
class SynthesizedDimension(NamedTuple):
    """
    Distribution parameters of a single synthesized feature dimension.

    Attributes
    ----------
    population_mean : float
        Mean of the normal distribution from which the per-source values are drawn.
    population_std : float
        Standard deviation of the normal distribution from which the per-source values are drawn.
    sources_std : float
        Standard deviation of the zero-mean normal noise that is added to every individual instance.
    """

    population_mean: float
    population_std: float
    sources_std: float
[docs]
class SynthesizedNormalMulticlassData(DataProvider):
    """
    Implementation of a data source generating normally distributed multiclass data.

    For every dimension, each source gets a single value drawn from the population
    distribution of that dimension. Every source is then observed `sources_size` times,
    where each observation is the value of the source plus zero-mean normal noise.

    Parameters
    ----------
    dimensions : list[SynthesizedDimension]
        Distribution parameters of the feature dimensions; one feature column is generated per entry.
    population_size : int
        Number of sources to sample in the synthetic population.
    sources_size : int
        Number of instances that are generated for each source.
    seed : int | None
        Seed passed to `numpy.random.default_rng` when the instances are generated.
    """

    def __init__(
        self,
        dimensions: list[SynthesizedDimension],
        population_size: int,
        sources_size: int,
        seed: int | None,
    ):
        self.dimensions = dimensions
        self.population_size = population_size
        self.sources_size = sources_size
        self.seed = seed

    def _generate_dimension(self, rng: np.random.Generator, dimension: SynthesizedDimension) -> np.ndarray:
        """
        Draw the values of all instances for a single feature dimension.

        Parameters
        ----------
        rng : np.random.Generator
            Random number generator to draw from.
        dimension : SynthesizedDimension
            Distribution parameters of the dimension.

        Returns
        -------
        np.ndarray
            One-dimensional array of length `population_size * sources_size`. The array consists of
            `sources_size` consecutive blocks, and each block holds one instance of every source.
        """
        # The per-source values are drawn first and the noise second; this order determines
        # which random numbers end up where for a given seed.
        population = rng.normal(
            loc=dimension.population_mean,
            scale=dimension.population_std,
            size=self.population_size,
        )
        measurement_error = rng.normal(
            loc=0,
            scale=dimension.sources_std,
            size=self.population_size * self.sources_size,
        )

        # Repeat the full block of per-source values once for each instance of a source.
        return np.tile(population, self.sources_size) + measurement_error

    def get_instances(self) -> FeatureData:
        """
        Return instances with randomly synthesized data and multi-class labels.

        The features are drawn from a normal distribution, as configured. A new random number
        generator is created from `seed` on every call.

        Returns
        -------
        FeatureData
            Synthesized instances. The features have one row per instance and one column per
            dimension; the source ids run from 0 to `population_size - 1` and this range is
            repeated `sources_size` times, in line with the rows of the features.

        Raises
        ------
        ValueError
            If there are no dimensions, if `sources_size` is smaller than 1, or if
            `population_size` is negative.
        """
        # These cases would otherwise fail inside numpy with a ValueError that does not
        # point at the offending setting.
        if not self.dimensions:
            raise ValueError('at least one dimension is required to synthesize data')
        if self.sources_size < 1:
            raise ValueError(f'sources_size must be at least 1; found: {self.sources_size}')
        if self.population_size < 0:
            raise ValueError(f'population_size must not be negative; found: {self.population_size}')

        rng = np.random.default_rng(seed=self.seed)

        # All dimensions share one generator, so they are generated in the configured order.
        columns = [self._generate_dimension(rng, dim) for dim in self.dimensions]
        features = np.stack(columns, axis=1)

        # Same block layout as the features: ids 0..population_size-1, once per instance.
        source_ids = np.tile(np.arange(self.population_size), self.sources_size)

        return FeatureData(features=features, source_ids=source_ids)
@config_parser
def synthesized_normal_multiclass(config: ContextAwareDict, _: Path) -> SynthesizedNormalMulticlassData:
    """
    Build a data provider of normally distributed multiclass data from its configuration.

    The following fields are read from the configuration:

    - `seed`: validated with `int`, not required;
    - `population`: holds the fields `size` and `instances_per_source`, both validated with `int`;
    - `dimensions`: a sequence of entries, each holding the fields `mean`, `std` and `error_std`,
      all validated with `float`.

    Parameters
    ----------
    config : ContextAwareDict
        Configuration mapping used to construct this component.
    _ : Path
        Unused argument required by the parser interface.

    Returns
    -------
    SynthesizedNormalMulticlassData
        Configured multiclass synthesized data provider.
    """
    rng_seed = pop_field(config, 'seed', validate=int, required=False)

    population_cfg = pop_field(config, 'population')
    n_sources = pop_field(population_cfg, 'size', validate=int)
    n_instances = pop_field(population_cfg, 'instances_per_source', validate=int)

    # One distribution spec per configured dimension, in configuration order.
    dimension_specs = [
        SynthesizedDimension(
            pop_field(entry, 'mean', validate=float),
            pop_field(entry, 'std', validate=float),
            pop_field(entry, 'error_std', validate=float),
        )
        for entry in pop_field(config, 'dimensions')
    ]

    return SynthesizedNormalMulticlassData(
        dimensions=dimension_specs,
        population_size=n_sources,
        sources_size=n_instances,
        seed=rng_seed,
    )