# Source code for pygrinder.utils

"""
Utility functions for pygrinder.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

from typing import Union, Tuple

import numpy as np
import pandas as pd
import torch



# [docs]
def calc_missing_rate(
    X: Union[list, np.ndarray, torch.Tensor, pd.DataFrame],
) -> float:
    """Calculate the originally missing rate of the raw data.

    Parameters
    ----------
    X:
        Data list/array/tensor/frame that may contain missing values.
        A list is converted with ``np.asarray`` before counting.

    Returns
    -------
    missing_rate:
        The fraction of entries in ``X`` that are missing, as a built-in ``float``.
        Its value is in the range [0,1]; ``nan`` is returned when ``X`` holds no elements.

    Raises
    ------
    TypeError
        If ``X`` is not a list, numpy.ndarray, torch.Tensor, or pandas.DataFrame.

    """
    if isinstance(X, list):
        X = np.asarray(X)

    # Collect plain Python integers from every backend so the final division is
    # always done in double precision and always yields a built-in float.
    # (Dividing inside torch would round the rate to the default float32.)
    if isinstance(X, np.ndarray):
        n_missing = int(np.isnan(X).sum())
        n_total = int(X.size)
    elif isinstance(X, torch.Tensor):
        n_missing = int(torch.isnan(X).sum().item())
        n_total = int(X.numel())
    elif isinstance(X, pd.DataFrame):
        n_missing = int(pd.isna(X).sum().sum())
        n_total = int(X.size)
    else:
        raise TypeError(
            f"X must be type of list/numpy.ndarray/torch.Tensor/pandas.DataFrame, but got {type(X)}"
        )

    # The rate of an empty container is undefined; report NaN explicitly
    # instead of relying on a 0/0 division.
    if n_total == 0:
        return float("nan")

    return n_missing / n_total




# [docs]
def masked_fill(
    X: Union[list, np.ndarray, torch.Tensor],
    mask: Union[list, np.ndarray, torch.Tensor],
    val: float,
) -> Union[np.ndarray, torch.Tensor]:
    """Like torch.Tensor.masked_fill(), fill elements in given `X` with `val` where `mask` is True.

    Neither `X` nor `mask` is modified; the result is a new array/tensor.

    Parameters
    ----------
    X:
        The data vector. A list is converted with ``np.asarray``.

    mask:
        The boolean mask (anything castable to bool), with the same shape and
        container type as `X`. A list is converted with ``np.asarray``.

    val:
        The value to fill in with.

    Returns
    -------
    filled_X:
        Mask filled X.

    Raises
    ------
    TypeError
        If `X` is not a list, numpy.ndarray, or torch.Tensor.
    AssertionError
        If `X` and `mask` differ in container type or in shape.

    """
    # Convert each argument on its own, so an ndarray `X` paired with a
    # list `mask` is accepted as well.
    if isinstance(X, list):
        X = np.asarray(X)
    if isinstance(mask, list):
        mask = np.asarray(mask)

    # Reject unsupported containers before touching `.shape`.
    if not isinstance(X, (np.ndarray, torch.Tensor)):
        raise TypeError(
            f"X must be type of list/numpy.ndarray/torch.Tensor, but got {type(X)}"
        )

    # Check the container type first: comparing shapes across backends (or on an
    # object without `.shape`) would otherwise produce a misleading failure.
    assert isinstance(X, type(mask)), (
        "Data types of X and mask must match, " f"but got {type(X)} and {type(mask)}"
    )
    assert X.shape == mask.shape, (
        "Shapes of X and mask must match, "
        f"but X.shape={X.shape}, mask.shape={mask.shape}"
    )

    if isinstance(X, np.ndarray):
        filled_X = X.copy()
        # astype() returns a new array, so the caller's mask stays untouched.
        filled_X[mask.astype(bool)] = val
    else:
        filled_X = torch.clone(X)
        # The cast mask is only read for indexing, never written to.
        filled_X[mask.type(torch.bool)] = val

    return filled_X




# [docs]
def fill_and_get_mask_numpy(
    X: np.ndarray,
    nan: Union[float, int] = 0,
) -> Tuple[np.ndarray, np.ndarray]:
    """Fill missing values in numpy array X with `nan` and return the missing mask.

    Only NaN entries are treated as missing. Infinite entries are left as they
    are and are marked as observed in the mask. The input array is not modified.

    Parameters
    ----------
    X : np.ndarray
        Data array that may contain NaN values.

    nan : int/float, optional, default=0
        Value used to fill NaN values.

    Returns
    -------
    X :
        A copy of the input in which every NaN is replaced with the given parameter `nan`.

    missing_mask :
        The float32 mask that indicates all missing values in X.
        In it, 1 indicates observed values, and 0 indicates missing values.

    """
    X_missing_mask = (~np.isnan(X)).astype(np.float32)
    # np.nan_to_num by default also rewrites +/-inf to the largest finite floats.
    # Pin both replacements to the infinities themselves so that only NaN changes,
    # which keeps the filled data consistent with the mask computed above.
    X = np.nan_to_num(X, nan=nan, posinf=np.inf, neginf=-np.inf)
    return X, X_missing_mask




# [docs]
def fill_and_get_mask_torch(
    X: torch.Tensor,
    nan: Union[float, int] = 0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Fill missing values in torch tensor X with `nan` and return the missing mask.

    Only NaN entries are treated as missing. Infinite entries are left as they
    are and are marked as observed in the mask. The input tensor is not modified.

    Parameters
    ----------
    X :
        Data tensor that may contain NaN values.

    nan : int/float, optional, default=0
        Value used to fill NaN values.

    Returns
    -------
    X :
        A new tensor in which every NaN is replaced with the given parameter `nan`.

    missing_mask :
        The float32 mask that indicates all missing values in X.
        In it, 1 indicates observed values, and 0 indicates missing values.

    """
    missing_mask = (~torch.isnan(X)).type(torch.float32)
    # torch.nan_to_num by default also rewrites +/-inf to the dtype's largest
    # finite values. Pin both replacements to the infinities themselves so that
    # only NaN changes, keeping the filled data consistent with the mask above.
    X = torch.nan_to_num(X, nan=nan, posinf=float("inf"), neginf=float("-inf"))
    return X, missing_mask




# [docs]
def fill_and_get_mask(
    X: Union[torch.Tensor, np.ndarray],
    nan: Union[float, int] = 0,
) -> Union[Tuple[np.ndarray, ...], Tuple[torch.Tensor, ...]]:
    """Fill missing values in X with `nan` and return the missing mask.

    Dispatches on the container type of `X`: numpy arrays (and lists, which are
    first converted with ``np.asarray``) go to ``fill_and_get_mask_numpy``;
    torch tensors go to ``fill_and_get_mask_torch``.

    Parameters
    ----------
    X :
        Data with missing values.

    nan : int/float, optional, default=0
        Value used to fill NaN values.

    Returns
    -------
    X :
        The data with missing values filled with the given parameter `nan`.

    missing_mask :
        The mask indicates all missing values in X.
        In it, 1 indicates observed values, and 0 indicates missing values.

    Raises
    ------
    TypeError
        If `X` is not a list, numpy.ndarray, or torch.Tensor.

    """
    data = np.asarray(X) if isinstance(X, list) else X

    if isinstance(data, np.ndarray):
        filled, observed_mask = fill_and_get_mask_numpy(data, nan)
        return filled, observed_mask

    if isinstance(data, torch.Tensor):
        filled, observed_mask = fill_and_get_mask_torch(data, nan)
        return filled, observed_mask

    raise TypeError(
        f"X must be type of list/numpy.ndarray/torch.Tensor, but got {type(X)}"
    )