Source code for pygrinder.missing_completely_at_random.little_test

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

from typing import Union

import numpy as np
import pandas as pd
from scipy.stats import chi2



[docs]
def mcar_little_test(X: Union[pd.DataFrame, np.ndarray]) -> float:
    """Little's MCAR Test. Refer to :cite:`little1988TestMCAR` for more details.

    Notes
    -----
    This implementation is inspired by
    https://github.com/RianneSchouten/pyampute/blob/master/pyampute/exploration/mcar_statistical_tests.py.
    Note that this function should be used carefully. Rejecting the null hypothesis may not always mean that
    the data is not MCAR, nor is accepting the null hypothesis a guarantee that the data is MCAR.

    Parameters
    ----------
    X:
        Time series data containing missing values that should be in shape of [n_steps, n_features],
        i.e. have 2 dimensions.

    Returns
    -------
    p_value:
        The p-value of a chi-square hypothesis test.
        Null hypothesis: the time series is missing completely at random (MCAR).

    """

    if isinstance(X, np.ndarray):
        assert len(X.shape) == 2
        dataset = pd.DataFrame(X)
    elif isinstance(X, pd.DataFrame):
        dataset = X.copy()
    else:
        raise RuntimeError(f"X should be np.ndarray or pd.DataFrame, but got {type(X)}")

    vars = dataset.dtypes.index.values
    n_features = dataset.shape[1]

    # mean and covariance estimates
    # ideally, this is done with a maximum likelihood estimator
    global_mean = dataset.mean()
    global_covariance = dataset.cov()

    # set up missing data patterns
    r = 1 * dataset.isnull()
    mdp = np.dot(r, list(map(lambda x: pow(2, x), range(n_features))))
    sorted_mdp = sorted(np.unique(mdp))
    n_pat = len(sorted_mdp)
    correct_mdp = list(map(lambda x: sorted_mdp.index(x), mdp))
    dataset["mdp"] = pd.Series(correct_mdp, index=dataset.index)

    # calculate statistic and df
    pj = 0
    d2 = 0
    for i in range(n_pat):
        dataset_temp = dataset.loc[dataset["mdp"] == i, vars]
        select_vars = ~dataset_temp.isnull().any()
        pj += np.sum(select_vars)
        select_vars = vars[select_vars]
        means = dataset_temp[select_vars].mean() - global_mean[select_vars]
        select_cov = global_covariance.loc[select_vars, select_vars]
        mj = len(dataset_temp)
        parta = np.dot(
            means.T, np.linalg.solve(select_cov, np.identity(select_cov.shape[1]))
        )
        d2 += mj * (np.dot(parta, means))

    df = pj - n_features

    # perform test and output p value
    p_value = 1 - chi2.cdf(d2, df)
    return p_value