Source code for mlconfound.simulate

import numpy as np
from warnings import warn


def _scale(x):
    return (x - x.mean()) / x.std()


[docs]def sinh_arcsinh(x, delta=1, epsilon=0): """ Sinh-arcsinh transformation Notes ----- The sinh-arcsinh transformation of Jones and Pewsey [1]_ can be used to transfrom Normal distribution to non-normal. Parameters ---------- x : array_like Normally distributed input data. delta : float Parameter to control kurtosis, delta=1 means no change. epsilon : float Parameter to control skewness, epsilon=0 means no change. Returns ------- array_like Transformed data. Examples -------- See `validation/simulation.py` for an application example. >>> result = sinh_arcsinh([-1, -0.5, -0.1, 0.1, 0.5, 1], delta=2, epsilon=1) >>> print(result) [-7.8900947 -3.48801839 -1.50886059 -0.88854985 -0.03758519 0.83888754] >>> result = sinh_arcsinh([-1, -0.5, -0.1, 0.1, 0.5, 1], delta=1, epsilon=0) >>> print(result) [-1. -0.5 -0.1 0.1 0.5 1. ] See Also -------- simulate_y_c_yhat References ---------- [1] Jones, M. C. and Pewsey A. (2009). Sinh-arcsinh distributions. Biometrika 96: 761–780 """ return np.sinh(delta * np.arcsinh(x) - epsilon)
[docs]def identity(x): """ Identity transformation Notes ----- To be used as a transformation function in simulate_y_c_yhat. Parameters ---------- x Returns ------- See Also -------- simulate_y_c_yhat """ return x
def polynomial(x, coefs=[1, 1]): ret = 0 for c_i, coef in enumerate(coefs): ret += coef * np.power(x, c_i + 1) return ret def sigmoid(x, method='tanh'): if method == 'tanh': return np.tanh(x) else: raise NotImplementedError("Currently only tanh is implemented.")
[docs]def simulate_y_c_yhat(w_yc, w_yyhat, w_cyhat, n, delta=1, epsilon=0, nonlin_trf_fun=identity, bin_y=False, bin_c=False, bin_yhat=False, scale=True, random_state=None): """ Simulate normally distributed target (y), confounder (c) and predictions (yhat). Notes ----- .. math:: y \sim \mathcal{N}(0, 1) .. math:: c | y_i \sim \mathcal{N}(w_{y,c} f(y_i), 1) .. math:: \hat{y} | y_i, c_i \sim sinh(\delta sinh^{-1}( \mathcal{N}(w_{y,c} f(y_i), 1))-\epsilon) Parameters ---------- w_yc: float The weight of y in c. w_yyhat: float The weight of y in yhat. w_cyhat: float The weight of c in yhat. Set it to zero for H0. n: int Number of observations. delta: float The delta param of the sinh_archsin transformation on y's contribution in yhat (only affects yhat). epsilon: float The epsilon param of the sinh_archsin transformation on y's contribution in yhat (only affects yhat). nonlin_trf_fun: callable Callable to introduce non-linearity in the conditional distributions. (default: no non-linearity). scale: bool Scale variables to zero mean and unit variance. random_state: int Numpy random state. Returns ------- tuple - y: the simulated target variable - c: the simulated confounder variable - yhat: the simulated predictions See Also -------- sinh_arcsinh Examples -------- >>> y, c, yhat = simulate_y_c_yhat(0.3, 0.2, 0.2, n=3, random_state=42) >>> print(y, c, yhat) [ 0.30471708 -1.03998411 0.7504512 ] [ 1.32193037 -1.09613765 -0.22579272] [ 1.03955979 -1.35013318 0.31057339] """ rng = np.random.default_rng(random_state) y = rng.normal(0, 1, n) if bin_y: y = (y > 0).astype(int) c = np.array([sinh_arcsinh(rng.normal(0, 1, 1), delta=delta, epsilon=epsilon) + w_yc * nonlin_trf_fun(yi) for yi in y]).flatten() if scale: c = _scale(c) if bin_c: c = (c > 0).astype(int) yhat = np.array([sinh_arcsinh(rng.normal(0, 1, 1), delta=delta, epsilon=epsilon) + w_yyhat * nonlin_trf_fun(yi) + w_cyhat * nonlin_trf_fun(ci) for yi, ci in zip(y, c)]).flatten() if scale: yhat = _scale(yhat) if bin_yhat: yhat = (yhat > 0).astype(int) return y, c, yhat