Source code for limix.qc.impute

from __future__ import division

from numpy import isnan, nanmean


def mean_impute(X):
    r"""Column-wise impute ``NaN`` values by column mean.

    It works well with `Dask`_ array.

    Parameters
    ----------
    X : array_like
        Matrix to have its missing values imputed. It is indexed as a
        two-dimensional array (rows by columns).

    Returns
    -------
    array_like
        Imputed array. For a Dask array a new lazy array is returned. For any
        other input the missing values are filled **in place** and the very
        same object ``X`` is returned.

    Examples
    --------
    .. doctest::

        >>> from numpy.random import RandomState
        >>> from numpy import nan, array_str
        >>> from limix.qc import mean_impute
        >>>
        >>> random = RandomState(0)
        >>> X = random.randn(5, 2)
        >>> X[0, 0] = nan
        >>>
        >>> print(array_str(mean_impute(X), precision=4))
        [[ 0.9233  0.4002]
         [ 0.9787  2.2409]
         [ 1.8676 -0.9773]
         [ 0.9501 -0.1514]
         [-0.1032  0.4106]]

    .. _Dask: https://dask.pydata.org/
    """
    # Dask is only needed to recognise and process Dask arrays; in-memory
    # inputs must keep working when it is not installed.
    try:
        import dask.array as da
    except ImportError:
        da = None

    if da is not None and isinstance(X, da.Array):
        # The column means are materialised once so that every block can be
        # imputed independently afterwards.
        col_means = da.nanmean(X, axis=0).compute()

        start = 0
        arrs = []
        # Walk the column chunks so that each imputer only receives the means
        # of the columns contained in its own blocks.
        for width in X.chunks[1]:
            end = start + width
            impute = _get_imputer(col_means[start:end])
            arrs.append(X[:, start:end].map_blocks(impute, dtype=float))
            start = end
        return da.concatenate(arrs, axis=1)

    # Objects exposing ``.values`` are imputed through that array.
    # NOTE(review): presumably these are pandas objects, in which case this
    # relies on ``.values`` being a writable view of the data - confirm.
    x = X.values if hasattr(X, "values") else X

    col_means = nanmean(x, axis=0)
    for i, mean_i in enumerate(col_means):
        x[isnan(x[:, i]), i] = mean_i

    return X
def _get_imputer(m): def impute(X): A = X.copy() isn = isnan(A) A[:] = 0 A[isn] = 1 X[isn] = 0 X += A * m return X return impute