import logging
from abc import ABC, abstractmethod
from typing import Any, Self
import numpy as np
from lir import Transformer
from lir.data.models import FeatureData, InstanceData, LLRData
from lir.util import check_type
LOG = logging.getLogger(__name__)
class LLRBounder(Transformer, ABC):
    """
    Abstract base for transformers that clip LLRs to a lower and/or upper bound.

    LLRs that already lie within the bounds pass through unchanged; LLRs outside the bounds are moved to the
    nearest bound. Subclasses decide how the bounds are obtained by implementing `calculate_bounds()`.

    Parameters
    ----------
    lower_llr_bound : float | None
        The lower bound for the LLRs. If `None`, no lower bound is applied.
    upper_llr_bound : float | None
        The upper bound for the LLRs. If `None`, no upper bound is applied.
    """

    def __init__(
        self,
        lower_llr_bound: float | None = None,
        upper_llr_bound: float | None = None,
    ):
        self.lower_llr_bound = lower_llr_bound
        self.upper_llr_bound = upper_llr_bound

    @abstractmethod
    def calculate_bounds(self, llrdata: LLRData) -> tuple[float | None, float | None]:
        """
        Determine the bounds that fit a set of LLRs.

        Parameters
        ----------
        llrdata : LLRData
            The LLR data for which to calculate the bounds. This includes the LLRs,
            their labels, and any other relevant information.

        Returns
        -------
        tuple[float | None, float | None]
            The lower and the upper LLR bound, in that order; `None` means "no bound" on that side.
        """
        raise NotImplementedError

    @staticmethod
    def _validate(instances: InstanceData) -> LLRData:
        """
        Make sure the input can be handled as `LLRData`.

        The input is first passed through `check_type(FeatureData, ...)`; anything that is not already an
        `LLRData` instance is then converted with `replace_as(LLRData)`, and the conversion is logged.
        """
        checked = check_type(FeatureData, instances)
        if isinstance(checked, LLRData):
            return checked

        LOG.info(f'casting `{type(checked)}` to `LLRData`')
        return checked.replace_as(LLRData)

    def fit(self, instances: InstanceData) -> Self:
        """
        Configure this bounder by calculating its bounds from labeled data.

        The original documentation states the assumed label convention: y=1 corresponds to Hp, y=0 to Hd.

        Parameters
        ----------
        instances : InstanceData
            The data to fit the bounder on. This should include the LLRs and their corresponding labels.

        Returns
        -------
        Self
            The fitted bounder instance.

        Raises
        ------
        ValueError
            If the data has no labels, or if the calculated lower bound exceeds the calculated upper bound.
        """
        llrdata = self._validate(instances)
        if llrdata.labels is None:
            raise ValueError(f'{type(self)}.fit() requires labeled data')

        # Let the concrete bounder decide on the bounds and store them on the instance.
        self.lower_llr_bound, self.upper_llr_bound = self.calculate_bounds(llrdata)

        # A one-sided (or absent) bound cannot be inconsistent, so there is nothing left to check.
        lower, upper = self.lower_llr_bound, self.upper_llr_bound
        if lower is None or upper is None:
            return self

        if lower > upper:
            raise ValueError(
                'the lower bound must be lower than the upper bound; '
                f'lower_llr_bound={lower}; upper_llr_bound={upper}'
            )
        return self

    def apply(self, instances: InstanceData) -> LLRData:
        """
        Clip the LLRs to the bounds currently stored on this bounder.

        Parameters
        ----------
        instances : InstanceData
            The data to apply the bounder to. Its features are treated as the LLRs.

        Returns
        -------
        LLRData
            A copy of the data (via `replace()`) holding the clipped LLRs as features, with the applied lower
            and upper bound passed along as `llr_lower_bound` and `llr_upper_bound`.
        """
        llrdata = self._validate(instances)

        # A bound of `None` tells np.clip to leave that side unclipped.
        bounded = np.clip(llrdata.features, self.lower_llr_bound, self.upper_llr_bound)

        return llrdata.replace(
            features=bounded,
            llr_upper_bound=self.upper_llr_bound,
            llr_lower_bound=self.lower_llr_bound,
        )
class StaticBounder(LLRBounder):
    """
    Bounder with fixed, user-supplied bounds.

    Both bounds are given to the constructor. Either may be `None`, in which case no bound is applied on that
    side.

    Parameters
    ----------
    lower_llr_bound : float | None
        The lower bound for the LLRs. If `None`, no lower bound is applied.
    upper_llr_bound : float | None
        The upper bound for the LLRs. If `None`, no upper bound is applied.
    """

    def __init__(self, lower_llr_bound: float | None, upper_llr_bound: float | None):
        # Unlike the base class, both bounds are required arguments here (though each may be `None`).
        super().__init__(lower_llr_bound=lower_llr_bound, upper_llr_bound=upper_llr_bound)

    def calculate_bounds(self, llrdata: LLRData) -> tuple[float | None, float | None]:
        """
        Return the bounds currently stored on this bounder.

        Parameters
        ----------
        llrdata : LLRData
            Not used, but included for compatibility with the base class.

        Returns
        -------
        tuple[float | None, float | None]
            The lower and upper LLR bounds, as specified in the constructor.
        """
        bounds = (self.lower_llr_bound, self.upper_llr_bound)
        return bounds
class NSourceBounder(LLRBounder):
    """
    Bound LLRs based on the number of sources.

    This bounder sets the lower LLR bound to -log10(N) and the upper bound to log10(N), where N is the number of
    distinct source IDs in the data. In non-log space, this corresponds to bounding likelihood ratios to [1/N, N].
    This is a logical consequence of having N sources: no source can provide more than N support for one
    hypothesis over the other.
    """

    def calculate_bounds(self, llrdata: LLRData) -> tuple[float | None, float | None]:
        """
        Calculate and return the lower and upper LLR bounds.

        Parameters
        ----------
        llrdata : LLRData
            The LLR data for which to calculate the bounds. This should include the source IDs.

        Returns
        -------
        tuple[float | None, float | None]
            The bounds `(-log10(N), log10(N))`, where N is the number of distinct source IDs.

        Raises
        ------
        ValueError
            If the data has no source IDs, or if the source IDs are empty.
        """
        if llrdata.source_ids is None:
            raise ValueError(f'{type(self)} requires source IDs to calculate bounds')

        # Only the number of distinct sources matters, so their order is irrelevant.
        unique_sources = np.unique(llrdata.source_ids, sorted=False)
        n_sources = len(unique_sources)

        # log10(0) is -inf, which would yield the inverted bounds (inf, -inf); fail early with a clear message.
        if n_sources == 0:
            raise ValueError(f'{type(self)} requires at least one source to calculate bounds')

        log_n_sources = np.log10(n_sources)
        LOG.debug(f'NSourceBounder: number of sources: N={n_sources}')
        LOG.debug(f'NSourceBounder: calculated bounds: -log(N)={-log_n_sources}, log(N)={log_n_sources}')
        return -log_n_sources, log_n_sources
# Public API of this module. The entries must be name strings rather than the objects themselves:
# `from <module> import *` raises a TypeError for any non-string item in `__all__`.
__all__: list[str] = [
    'LLRBounder',
    'StaticBounder',
    'NSourceBounder',
]