# Source code for lir.bounding

import logging
from abc import ABC, abstractmethod
from typing import Self

import numpy as np

from lir import Transformer
from lir.data.models import FeatureData, InstanceData, LLRData
from lir.util import check_type


# Module-level logger, named after this module via `__name__`.
LOG = logging.getLogger(__name__)



class LLRBounder(Transformer, ABC):
    """
    Base class for LLR bounders.

    A bounder updates any LLRs that are out of bounds. Any LLR values within bounds remain unchanged. LLR values that
    are out-of-bounds are updated to the nearest bound.

    Subclasses implement `calculate_bounds()`. `fit()` stores the bounds it returns and `apply()` clips the LLRs
    to the stored bounds.

    Parameters
    ----------
    lower_llr_bound : float | None
        The lower bound for the LLRs. If `None`, no lower bound is applied.
    upper_llr_bound : float | None
        The upper bound for the LLRs. If `None`, no upper bound is applied.
    """

    def __init__(
        self,
        lower_llr_bound: float | None = None,
        upper_llr_bound: float | None = None,
    ):
        self.lower_llr_bound = lower_llr_bound
        self.upper_llr_bound = upper_llr_bound

    @abstractmethod
    def calculate_bounds(self, llrdata: LLRData) -> tuple[float | None, float | None]:
        """
        Calculate and return appropriate bounds for a set of LLRs and their labels.

        Parameters
        ----------
        llrdata : LLRData
            The LLR data for which to calculate the bounds. This includes the LLRs,
            their labels, and any other relevant information.

        Returns
        -------
        tuple[float | None, float | None]
            The lower and upper LLR bound, in that order. `None` means that no bound is applied on that side.
        """
        raise NotImplementedError

    @staticmethod
    def _validate(instances: InstanceData) -> LLRData:
        """
        Return `instances` as `LLRData`.

        The input is first passed through `check_type(FeatureData, ...)`. If the result is not already an
        `LLRData` instance, it is converted with `replace_as(LLRData)` and the cast is logged at INFO level.

        Parameters
        ----------
        instances : InstanceData
            The data to validate.

        Returns
        -------
        LLRData
            The input data, as `LLRData`.
        """
        # NOTE(review): presumably `check_type` raises if `instances` is not `FeatureData` -- confirm in lir.util.
        instances = check_type(FeatureData, instances)
        if not isinstance(instances, LLRData):
            LOG.info('casting `%s` to `LLRData`', type(instances))
            instances = instances.replace_as(LLRData)
        return instances

    def fit(self, instances: InstanceData) -> Self:
        """
        Configure this bounder by calculating bounds.

        assuming that y=1 corresponds to Hp, y=0 to Hd

        The bounds returned by `calculate_bounds()` are stored on this instance only after they passed the sanity
        check, so a failed fit leaves the previously configured bounds untouched.

        Parameters
        ----------
        instances : InstanceData
            The data to fit the bounder on. This should include the LLRs and their corresponding labels.

        Returns
        -------
        Self
            The fitted bounder instance.

        Raises
        ------
        ValueError
            If the data has no labels, or if the calculated lower bound is greater than the calculated upper bound.
        """
        instances = self._validate(instances)

        if instances.labels is None:
            raise ValueError(f'{type(self)}.fit() requires labeled data')

        # calculate the bounds, but do not store them before they are known to be sane
        lower_llr_bound, upper_llr_bound = self.calculate_bounds(instances)

        # check the sanity of the bounds; a missing bound is always fine, and equal bounds are accepted
        if (
            lower_llr_bound is not None
            and upper_llr_bound is not None
            and lower_llr_bound > upper_llr_bound
        ):
            raise ValueError(
                'the lower bound must be lower than the upper bound; '
                f'lower_llr_bound={lower_llr_bound}; upper_llr_bound={upper_llr_bound}'
            )

        self.lower_llr_bound = lower_llr_bound
        self.upper_llr_bound = upper_llr_bound

        return self

    def apply(self, instances: InstanceData) -> LLRData:
        """
        Clip the LLRs to the bounds that are configured on this bounder.

        Parameters
        ----------
        instances : InstanceData
            The data to apply the bounder to. This should include the LLRs; labels are not read here.

        Returns
        -------
        LLRData
            The LLR data with the LLRs bounded according to the configured bounds. The bounds themselves are
            passed along to `replace()` as `llr_lower_bound` and `llr_upper_bound`.
        """
        instances = self._validate(instances)

        llrs = instances.features

        # Clip the LLRs to the bounds, where np.clip handles the None values correctly.
        llrs = np.clip(llrs, self.lower_llr_bound, self.upper_llr_bound)

        return instances.replace(
            features=llrs, llr_upper_bound=self.upper_llr_bound, llr_lower_bound=self.lower_llr_bound
        )





class StaticBounder(LLRBounder):
    """
    Bound LLRs to fixed, caller-supplied values.

    Both bounds are given to the constructor. Either of them may be `None`, in which case no bound is applied on
    that side.

    Parameters
    ----------
    lower_llr_bound : float | None
        The lower bound for the LLRs. If `None`, no lower bound is applied.
    upper_llr_bound : float | None
        The upper bound for the LLRs. If `None`, no upper bound is applied.
    """

    def __init__(self, lower_llr_bound: float | None, upper_llr_bound: float | None):
        super().__init__(lower_llr_bound=lower_llr_bound, upper_llr_bound=upper_llr_bound)

    def calculate_bounds(self, llrdata: LLRData) -> tuple[float | None, float | None]:
        """
        Return the bounds that are currently configured on this bounder.

        Parameters
        ----------
        llrdata : LLRData
            Ignored; accepted only to match the signature of the base class.

        Returns
        -------
        tuple[float | None, float | None]
            The lower and upper LLR bound, in that order.
        """
        lower, upper = self.lower_llr_bound, self.upper_llr_bound
        return lower, upper





class NSourceBounder(LLRBounder):
    """
    Bound LLRs based on the number of sources.

    This bounder sets the lower LLR bound to -log10(N) and the upper bound to log10(N), where N is the number of
    distinct source IDs in the data.

    In non-log space, this corresponds to bounding likelihood ratios to [1/N, N]. This is a logical consequence of
    having N sources: no source can provide more than N support for one hypothesis over the other.
    """

    def calculate_bounds(self, llrdata: LLRData) -> tuple[float | None, float | None]:
        """
        Calculate and return the lower and upper LLR bounds.

        Parameters
        ----------
        llrdata : LLRData
            The LLR data for which to calculate the bounds. This should include the source IDs.

        Returns
        -------
        tuple[float | None, float | None]
            The lower and upper LLR bounds, `(-log10(N), log10(N))`, where N is the number of distinct source IDs.

        Raises
        ------
        ValueError
            If the data has no source IDs, or if the source IDs are empty.
        """
        if llrdata.source_ids is None:
            raise ValueError(f'{type(self)} requires source IDs to calculate bounds')

        # only the number of distinct IDs is used, so the unique values do not need to be sorted
        unique_source_ids = np.unique(llrdata.source_ids, sorted=False)
        n_sources = len(unique_source_ids)

        # without this guard, log10(0) = -inf would yield the inverted bounds (inf, -inf)
        if n_sources == 0:
            raise ValueError(f'{type(self)} requires at least one source ID to calculate bounds')

        log_n_sources = np.log10(n_sources)

        LOG.debug('NSourceBounder: number of sources: N=%s', n_sources)
        LOG.debug('NSourceBounder: calculated bounds: -log(N)=%s, log(N)=%s', -log_n_sources, log_n_sources)
        return -log_n_sources, log_n_sources




# Public API of this module: the names that a star-import of this module binds.
__all__ = [
    'LLRBounder',
    'StaticBounder',
    'NSourceBounder',
]