Source code for limix.qc.kinship

from __future__ import division

from numpy import copyto, asarray


[docs]def normalise_covariance(K, out=None):
    r"""Variance rescaling of covariance matrix ``K``.

    Let :math:`n` be the number of rows (or columns) of ``K`` and let
    :math:`m_i` be the average of the values in the i-th column.
    Gower rescaling is defined as

    .. math::

        \mathrm K \frac{n - 1}{\text{trace}(\mathrm K) - \sum m_i}.

    It works well with `Dask`_ array as log as ``out`` is ``None``.

    Notes
    -----
    The reasoning of the scaling is as follows.
    Let :math:`\mathbf g` be a vector of :math:`n` independent samples and let
    :math:`\mathrm C` be the Gower's centering matrix.
    The unbiased variance estimator is

    .. math::
        v = \sum_i \frac{(g_i-\overline g)^2}{n-1}
        =\frac{\mathrm{Tr}
        [(\mathbf g-\overline g\mathbf 1)^t(\mathbf g-\overline g\mathbf 1)]}
        {n-1}
        = \frac{\mathrm{Tr}[\mathrm C\mathbf g\mathbf g^t\mathrm C]}{n-1}

    Let :math:`\mathrm K` be the covariance matrix of :math:`\mathbf g`.
    The expectation of the unbiased variance estimator is

    .. math::

        \mathbb E[v] =
        \frac{\mathrm{Tr}[\mathrm C\mathbb E[\mathbf g\mathbf g^t]\mathrm C]}
        {n-1}
        = \frac{\mathrm{Tr}[\mathrm C\mathrm K\mathrm C]}{n-1}

    assuming that :math:`\mathbb E[g_i]=0`.
    We thus divide :math:`\mathrm K` by :math:`\mathbb E[v]` to achieve the
    desired normalisation.

    Parameters
    ----------
    K : array_like
        Covariance matrix to be normalised.
    out : array_like, optional
        Result destination. Defaults to ``None``.

    Examples
    --------
    .. doctest::

        >>> from numpy import dot, mean, zeros
        >>> from numpy.random import RandomState
        >>> from limix.qc import normalise_covariance
        >>>
        >>> random = RandomState(0)
        >>> X = random.randn(10, 10)
        >>> K = dot(X, X.T)
        >>> Z = random.multivariate_normal(zeros(10), K, 500)
        >>> print("%.3f" % mean(Z.var(1, ddof=1)))
        9.824
        >>> Kn = normalise_covariance(K)
        >>> Zn = random.multivariate_normal(zeros(10), Kn, 500)
        >>> print("%.3f" % mean(Zn.var(1, ddof=1)))
        1.008

    .. _Dask: https://dask.pydata.org/
    """
    K = asarray(K, float)
    c = (K.shape[0] - 1) / (K.trace() - K.mean(0).sum())
    if out is None:
        return c * K

    copyto(out, K)
    out *= c