Source code for pygrinder.missing_completely_at_random.little_test
# Created by Wenjie Du <wenjay.du@gmail.com># License: BSD-3-ClausefromtypingimportUnionimportnumpyasnpimportpandasaspdfromscipy.statsimportchi2
[docs]defmcar_little_test(X:Union[pd.DataFrame,np.ndarray])->float:"""Little's MCAR Test. Refer to :cite:`little1988TestMCAR` for more details. Notes ----- This implementation is inspired by https://github.com/RianneSchouten/pyampute/blob/master/pyampute/exploration/mcar_statistical_tests.py. Note that this function should be used carefully. Rejecting the null hypothesis may not always mean that the data is not MCAR, nor is accepting the null hypothesis a guarantee that the data is MCAR. Parameters ---------- X: Time series data containing missing values that should be in shape of [n_steps, n_features], i.e. have 2 dimensions. Returns ------- p_value: The p-value of a chi-square hypothesis test. Null hypothesis: the time series is missing completely at random (MCAR). """ifisinstance(X,np.ndarray):assertlen(X.shape)==2dataset=pd.DataFrame(X)elifisinstance(X,pd.DataFrame):dataset=X.copy()else:raiseRuntimeError(f"X should be np.ndarray or pd.DataFrame, but got {type(X)}")vars=dataset.dtypes.index.valuesn_features=dataset.shape[1]# mean and covariance estimates# ideally, this is done with a maximum likelihood estimatorglobal_mean=dataset.mean()global_covariance=dataset.cov()# set up missing data patternsr=1*dataset.isnull()mdp=np.dot(r,list(map(lambdax:pow(2,x),range(n_features))))sorted_mdp=sorted(np.unique(mdp))n_pat=len(sorted_mdp)correct_mdp=list(map(lambdax:sorted_mdp.index(x),mdp))dataset["mdp"]=pd.Series(correct_mdp,index=dataset.index)# calculate statistic and dfpj=0d2=0foriinrange(n_pat):dataset_temp=dataset.loc[dataset["mdp"]==i,vars]select_vars=~dataset_temp.isnull().any()pj+=np.sum(select_vars)select_vars=vars[select_vars]means=dataset_temp[select_vars].mean()-global_mean[select_vars]select_cov=global_covariance.loc[select_vars,select_vars]mj=len(dataset_temp)parta=np.dot(means.T,np.linalg.solve(select_cov,np.identity(select_cov.shape[1])))d2+=mj*(np.dot(parta,means))df=pj-n_features# perform test and output p valuep_value=1-chi2.cdf(d2,df)returnp_value