Source code for simdec.sensitivity_indices

from dataclasses import dataclass

import numpy as np
import pandas as pd
from scipy import stats


__all__ = ["sensitivity_indices"]


def number_of_bins(n_runs: int, n_factors: int) -> tuple[int, int]:
    """Optimal number of bins for first & second-order sensitivity_indices indices.

    Linear approximation of experimental results from (Marzban & Lahmer, 2016).
    """
    n_bins_foe = 36 - 2.7 * n_factors + (0.0017 - 0.00008 * n_factors) * n_runs
    n_bins_foe = np.ceil(n_bins_foe)
    if n_bins_foe <= 30:
        n_bins_foe = 10  # setting a limit to fit the experimental results

    n_bins_soe = max(4, np.round(np.sqrt(n_bins_foe)))

    return n_bins_foe, n_bins_soe


def _weighted_var(x: np.ndarray, weights: np.ndarray) -> np.ndarray:
    avg = np.average(x, weights=weights)
    variance = np.average((x - avg) ** 2, weights=weights)
    return variance


@dataclass
class SensitivityAnalysisResult:
    si: np.ndarray
    first_order: np.ndarray
    second_order: np.ndarray



[docs]
def sensitivity_indices(
    inputs: pd.DataFrame | np.ndarray,
    output: pd.DataFrame | np.ndarray,
    print_indices: bool = False,
) -> SensitivityAnalysisResult:
    """Sensitivity indices.

    The sensitivity_indices express how much variability of the output is
    explained by the inputs.

    Parameters
    ----------
    inputs : ndarray or DataFrame of shape (n_runs, n_factors)
        Input variables.
    output : ndarray or DataFrame of shape (n_runs, 1)
        Target variable.
    print_indices : bool, default False
        If True, displays computed indices.

    Returns
    -------
    res : SensitivityAnalysisResult
        An object with attributes:

        si : ndarray of shape (n_factors, 1)
            Sensitivity indices, combined effect of each input.
        foe : ndarray of shape (n_factors, 1)
            First-order effects (also called 'main' or 'individual').
        soe : ndarray of shape (n_factors, n_factors)
            Second-order effects (also called 'interaction').

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import qmc
    >>> import simdec as sd

    We define first the function that we want to analyse. We use the
    well studied Ishigami function:

    >>> def f_ishigami(x):
    ...     return (np.sin(x[0]) + 7 * np.sin(x[1]) ** 2
    ...             + 0.1 * (x[2] ** 4) * np.sin(x[0]))

    Then we generate inputs using the Quasi-Monte Carlo method of Sobol' in
    order to cover uniformly our space. And we compute outputs of the function.

    >>> rng = np.random.default_rng()
    >>> inputs = qmc.Sobol(d=3, seed=rng).random(2**18)
    >>> inputs = qmc.scale(
    ...     sample=inputs,
    ...     l_bounds=[-np.pi, -np.pi, -np.pi],
    ...     u_bounds=[np.pi, np.pi, np.pi]
    ... )
    >>> output = f_ishigami(inputs.T)

    We can now pass our inputs and outputs to the `sensitivity_indices` function:

    >>> res = sd.sensitivity_indices(inputs=inputs, output=output)
    >>> res.si
    array([0.43157591, 0.44241433, 0.11767249])

    """
    # Handle inputs conversion
    if isinstance(inputs, pd.DataFrame):
        var_names = inputs.columns.tolist()
        cat_cols = inputs.select_dtypes(include=["category", "O", "string"]).columns
        if not cat_cols.empty:
            inputs = inputs.copy()  # Avoid SettingWithCopyWarning
            inputs[cat_cols] = inputs[cat_cols].apply(
                lambda x: x.astype("category").cat.codes
            )
        inputs = inputs.to_numpy()
    else:
        inputs = np.asarray(inputs)
        # Fallback names if it's just a numpy array
        var_names = [f"x{i}" for i in range(inputs.shape[1])]

    # Handle output conversion first, then flatten
    if isinstance(output, (pd.DataFrame, pd.Series)):
        output = output.to_numpy()

    # Flatten output if it's (N, 1)
    output = output.flatten()

    n_runs, n_factors = inputs.shape
    n_bins_foe, n_bins_soe = number_of_bins(n_runs, n_factors)

    # Overall variance of the output
    var_y = np.var(output)

    si = np.empty(n_factors)
    foe = np.empty(n_factors)
    soe = np.zeros((n_factors, n_factors))

    for i in range(n_factors):
        # 1. First-order effects (FOE)
        xi = inputs[:, i]

        bin_avg, _, binnumber = stats.binned_statistic(
            x=xi, values=output, bins=n_bins_foe, statistic="mean"
        )

        # Filter empty bins and get weights (counts)
        mask_foe = ~np.isnan(bin_avg)
        mean_i_foe = bin_avg[mask_foe]
        # binnumber starts at 1; 0 is for values outside range
        bin_counts_foe = np.unique(binnumber[binnumber > 0], return_counts=True)[1]

        foe[i] = _weighted_var(mean_i_foe, weights=bin_counts_foe) / var_y

        # 2. Second-order effects (SOE)
        for j in range(n_factors):
            if j <= i:
                continue

            xj = inputs[:, j]

            # 2D Binned Statistic for Var(E[Y|Xi, Xj])
            bin_avg_ij, x_edges, y_edges, binnumber_ij = stats.binned_statistic_2d(
                x=xi, y=xj, values=output, bins=n_bins_soe, expand_binnumbers=False
            )

            mask_ij = ~np.isnan(bin_avg_ij)
            mean_ij = bin_avg_ij[mask_ij]
            counts_ij = np.unique(binnumber_ij[binnumber_ij > 0], return_counts=True)[1]
            var_ij = _weighted_var(mean_ij, weights=counts_ij)

            # Marginal Var(E[Y|Xi]) using n_bins_soe to match MATLAB logic
            bin_avg_i_soe, _, binnumber_i_soe = stats.binned_statistic(
                x=xi, values=output, bins=n_bins_soe, statistic="mean"
            )
            mask_i = ~np.isnan(bin_avg_i_soe)
            counts_i = np.unique(
                binnumber_i_soe[binnumber_i_soe > 0], return_counts=True
            )[1]
            var_i_soe = _weighted_var(bin_avg_i_soe[mask_i], weights=counts_i)

            # Marginal Var(E[Y|Xj]) using n_bins_soe to match MATLAB logic
            bin_avg_j_soe, _, binnumber_j_soe = stats.binned_statistic(
                x=xj, values=output, bins=n_bins_soe, statistic="mean"
            )
            mask_j = ~np.isnan(bin_avg_j_soe)
            counts_j = np.unique(
                binnumber_j_soe[binnumber_j_soe > 0], return_counts=True
            )[1]
            var_j_soe = _weighted_var(bin_avg_j_soe[mask_j], weights=counts_j)

            soe[i, j] = (var_ij - var_i_soe - var_j_soe) / var_y

    # Mirror SOE and calculate Combined Effect (SI)
    # SI is FOE + half of all interactions associated with that variable
    soe = soe + soe.T
    for k in range(n_factors):
        si[k] = foe[k] + (soe[:, k].sum() / 2)

    if print_indices:
        df_foe = pd.DataFrame(foe, index=var_names, columns=["First-order effect"])
        df_soe = pd.DataFrame(soe, index=var_names, columns=var_names)
        df_si = pd.DataFrame(si, index=var_names, columns=["Combined effect"])

        df_indices = pd.concat([df_foe, df_soe, df_si], axis=1)
        print(f"\n{df_indices}\n")

    return SensitivityAnalysisResult(si, foe, soe)