Source code for pygrinder.missing_completely_at_random.little_test

# Created by Wenjie Du <>
# License: BSD-3-Clause

from typing import Union

import numpy as np
import pandas as pd
from scipy.stats import chi2

[docs] def mcar_little_test(X: Union[pd.DataFrame, np.ndarray]) -> float: """Little's MCAR Test. Refer to :cite:`little1988TestMCAR` for more details. Notes ----- This implementation is inspired by Note that this function should be used carefully. Rejecting the null hypothesis may not always mean that the data is not MCAR, nor is accepting the null hypothesis a guarantee that the data is MCAR. Parameters ---------- X: Time series data containing missing values that should be in shape of [n_steps, n_features], i.e. have 2 dimensions. Returns ------- p_value: float The p-value of a chi-square hypothesis test. Null hypothesis: the time series is missing completely at random (MCAR). """ if isinstance(X, np.ndarray): assert len(X.shape) == 2 dataset = pd.DataFrame(X) elif isinstance(X, pd.DataFrame): dataset = X.copy() else: raise RuntimeError(f"X should be np.ndarray or pd.DataFrame, but got {type(X)}") vars = dataset.dtypes.index.values n_features = dataset.shape[1] # mean and covariance estimates # ideally, this is done with a maximum likelihood estimator global_mean = dataset.mean() global_covariance = dataset.cov() # set up missing data patterns r = 1 * dataset.isnull() mdp =, list(map(lambda x: pow(2, x), range(n_features)))) sorted_mdp = sorted(np.unique(mdp)) n_pat = len(sorted_mdp) correct_mdp = list(map(lambda x: sorted_mdp.index(x), mdp)) dataset["mdp"] = pd.Series(correct_mdp, index=dataset.index) # calculate statistic and df pj = 0 d2 = 0 for i in range(n_pat): dataset_temp = dataset.loc[dataset["mdp"] == i, vars] select_vars = ~dataset_temp.isnull().any() pj += np.sum(select_vars) select_vars = vars[select_vars] means = dataset_temp[select_vars].mean() - global_mean[select_vars] select_cov = global_covariance.loc[select_vars, select_vars] mj = len(dataset_temp) parta = means.T, np.linalg.solve(select_cov, np.identity(select_cov.shape[1])) ) d2 += mj * (, means)) df = pj - n_features # perform test and output p value p_value = 1 - chi2.cdf(d2, df) return p_value