from __future__ import division
import sys
from numpy import ones
from limix.display import timer_text
from .._dataset import _normalise_dataset
from ..display import session_text
from .._likelihood import assert_likelihood_name, normalise_extreme_values
from ._model import QTLModel
[docs]def scan(G, y, lik, K=None, M=None, verbose=True):
r"""Single-variant association testing via generalised linear mixed models.
It supports Normal (linear mixed model), Bernoulli, Probit, Binomial, and Poisson
residual errors, defined by ``lik``.
The columns of ``G`` define the candidates to be tested for association
with the phenotype ``y``.
The covariance matrix is set by ``K``.
If not provided, or set to ``None``, the generalised linear model
without random effects is assumed.
The covariates can be set via the parameter ``M``.
We recommend to always provide a column of ones when covariates are actually
provided.
Parameters
----------
G : array_like
:math:`N` individuals by :math:`S` candidate markers.
y : tuple, array_like
Either a tuple of two arrays of :math:`N` individuals each (Binomial
phenotypes) or an array of :math:`N` individuals (Normal, Poisson, Bernoulli, or
Probit phenotypes).
lik : "normal", "bernoulli", "probit", binomial", "poisson"
Sample likelihood describing the residual distribution.
K : array_like, optional
:math:`N`-by-:math:`N` covariance matrix (e.g., kinship coefficients).
Set to ``None`` for a generalised linear model without random effects.
Defaults to ``None``.
M : array_like, optional
`N` individuals by `S` covariates.
It will create a :math:`N`-by-:math:`1` matrix ``M`` of ones representing the
offset covariate if ``None`` is passed. If an array is passed, it will used as
is. Defaults to ``None``.
verbose : bool, optional
``True`` to display progress and summary; ``False`` otherwise.
Returns
-------
:class:`limix.qtl.QTLModel`
QTL representation.
Examples
--------
.. doctest::
>>> from numpy import dot, exp, sqrt, ones
>>> from numpy.random import RandomState
>>> from pandas import DataFrame
>>> import pandas as pd
>>> from limix.qtl import scan
>>>
>>> random = RandomState(1)
>>> pd.options.display.float_format = "{:9.6f}".format
>>>
>>> n = 30
>>> p = 3
>>> samples_index = range(n)
>>>
>>> M = DataFrame(dict(offset=ones(n), age=random.randint(10, 60, n)))
>>> M.index = samples_index
>>>
>>> X = random.randn(n, 100)
>>> K = dot(X, X.T)
>>>
>>> candidates = random.randn(n, p)
>>> candidates = DataFrame(candidates, index=samples_index,
... columns=['rs0', 'rs1', 'rs2'])
>>>
>>> y = random.poisson(exp(random.randn(n)))
>>>
>>> model = scan(candidates, y, 'poisson', K, M=M, verbose=False)
>>>
>>> model.variant_pvalues.to_dataframe() # doctest: +FLOAT_CMP
pv
candidate
rs0 0.554444
rs1 0.218996
rs2 0.552200
>>> model.variant_effsizes.to_dataframe() # doctest: +FLOAT_CMP
effsizes
candidate
rs0 -0.130867
rs1 -0.315078
rs2 -0.143869
>>> model.variant_effsizes_se.to_dataframe() # doctest: +FLOAT_CMP
effsizes std
candidate
rs0 0.221390
rs1 0.256327
rs2 0.242013
>>> model # doctest: +FLOAT_CMP
Variants
--------
effsizes effsizes_se pvalues
count 3.000000 3.000000 3.000000
mean -0.196604 0.239910 0.441880
std 0.102807 0.017563 0.193027
min -0.315077 0.221389 0.218996
25% -0.229473 0.231701 0.385598
50% -0.143869 0.242013 0.552200
75% -0.137367 0.249170 0.553322
max -0.130866 0.256326 0.554443
<BLANKLINE>
Covariate effect sizes for H0
-----------------------------
age offset
-0.005568 0.395287
Notes
-----
It will raise a ``ValueError`` exception if non-finite values are passed. Please,
refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
"""
from numpy_sugar import is_all_finite
from numpy_sugar.linalg import economic_qs
from xarray import DataArray
from pandas import Series
if not isinstance(lik, (tuple, list)):
lik = (lik,)
lik_name = lik[0].lower()
assert_likelihood_name(lik_name)
if M is None:
M = ones((len(y), 1))
if isinstance(y, (DataArray, Series)):
M = DataArray(M, encoding={"dtype": "float64"})
M = M.rename({M.dims[0]: "sample"})
if hasattr(y, "index"):
M.coords["sample"] = y.index.values
else:
M.coords["sample"] = y.coords["sample"].values
with session_text("qtl analysis", disable=not verbose):
with timer_text("Normalising input... ", disable=not verbose):
data = _normalise_dataset(y, M, G=G, K=K)
y = data["y"]
M = data["M"]
G = data["G"]
K = data["K"]
if not is_all_finite(y):
raise ValueError("Outcome must have finite values only.")
if not is_all_finite(M):
raise ValueError("Covariates must have finite values only.")
if K is not None:
if not is_all_finite(K):
raise ValueError("Covariate matrix must have finite values only.")
QS = economic_qs(K)
else:
QS = None
y = normalise_extreme_values(data["y"], lik)
if lik_name == "normal":
model = _perform_lmm(y.values, M, QS, G, verbose)
else:
model = _perform_glmm(y.values, lik, M, K, QS, G, verbose)
return model
def _perform_lmm(y, M, QS, G, verbose):
from glimix_core.lmm import LMM
from pandas import Series, DataFrame
from xarray import DataArray
lmm = LMM(y, M.values, QS)
lmm.fit(verbose=verbose)
sys.stdout.flush()
null_lml = lmm.lml()
beta = lmm.beta
covariates = list(M.coords["covariate"].values)
ncov_effsizes = Series(beta, covariates)
flmm = lmm.get_fast_scanner()
if isinstance(G, DataFrame):
alt_lmls, effsizes = flmm.fast_scan(G.values, verbose=verbose)
else:
alt_lmls, effsizes = flmm.fast_scan(G, verbose=verbose)
coords = {
k: ("candidate", G.coords[k].values)
for k in G.coords.keys()
if G.coords[k].dims[0] == "candidate"
}
alt_lmls = DataArray(alt_lmls, dims=["candidate"], coords=coords)
effsizes = DataArray(effsizes, dims=["candidate"], coords=coords)
return QTLModel(null_lml, alt_lmls, effsizes, ncov_effsizes)
def _perform_glmm(y, lik, M, K, QS, G, verbose):
from glimix_core.glmm import GLMMExpFam, GLMMNormal
from pandas import Series, DataFrame
from xarray import DataArray
glmm = GLMMExpFam(y.ravel(), lik, M.values, QS)
glmm.fit(verbose=verbose)
sys.stdout.flush()
eta = glmm.site.eta
tau = glmm.site.tau
gnormal = GLMMNormal(eta, tau, M.values, QS)
gnormal.fit(verbose=verbose)
beta = gnormal.beta
covariates = list(M.coords["covariate"].values)
ncov_effsizes = Series(beta, covariates)
flmm = gnormal.get_fast_scanner()
flmm.set_scale(1.0)
null_lml = flmm.null_lml()
if isinstance(G, DataFrame):
alt_lmls, effsizes = flmm.fast_scan(G.values, verbose=verbose)
else:
alt_lmls, effsizes = flmm.fast_scan(G, verbose=verbose)
coords = {
k: ("candidate", G.coords[k].values)
for k in G.coords.keys()
if G.coords[k].dims[0] == "candidate"
}
alt_lmls = DataArray(alt_lmls, dims=["candidate"], coords=coords)
effsizes = DataArray(effsizes, dims=["candidate"], coords=coords)
return QTLModel(null_lml, alt_lmls, effsizes, ncov_effsizes)