from __future__ import unicode_literals
from collections import Counter
from numpy import array_equal, asarray, unique, dtype
from ._dask import array_shape_reveal
def _normalise_dataset(y, M=None, G=None, K=None):
    r"""Convert data types to DataArray.

    This is a fundamental function for :mod:`limix` as it standardise outcome,
    covariates, candidates, and kinship arrays into :class:`xarray.DataArray` data type.
    Data arrays are :mod:`numpy`/:mod:`dask` arrays with indexed coordinates,
    therefore generalising data frames from :mod:`pandas`. It allows for lazy loading of
    data via dask arrays. It also supports arrays with different dimensionality and
    types, mixture of indexed and non-indexed arrays, and repeated sample labels.

    Examples
    --------
    .. doctest::

        >>> from __future__ import unicode_literals
        >>> import pytest
        >>> from numpy.random import RandomState
        >>> from pandas import DataFrame
        >>> from xarray import DataArray
        >>> from limix._dataset import _normalise_dataset
        >>>
        >>> random = RandomState(0)
        >>>
        >>> y = random.randn(4)
        >>> y = DataFrame(y, index=["sample0", "sample0", "sample1", "sample2"])
        >>>
        >>> G = random.randn(5, 6)
        >>>
        >>> data = _normalise_dataset(y, G=G)
        >>> print(data["y"])
        <xarray.DataArray 'outcome' (sample: 4, trait: 1)>
        array([[1.764052],
               [0.400157],
               [0.978738],
               [2.240893]])
        Coordinates:
          * sample   (sample) object 'sample0' 'sample0' 'sample1' 'sample2'
          * trait    (trait) int64 0
        >>> print(data["G"])
        <xarray.DataArray 'candidates' (sample: 4, candidate: 6)>
        array([[ 1.867558, -0.977278,  0.950088, -0.151357, -0.103219,  0.410599],
               [ 0.144044,  1.454274,  0.761038,  0.121675,  0.443863,  0.333674],
               [ 1.494079, -0.205158,  0.313068, -0.854096, -2.55299 ,  0.653619],
               [ 0.864436, -0.742165,  2.269755, -1.454366,  0.045759, -0.187184]])
        Coordinates:
          * sample   (sample) object 'sample0' 'sample0' 'sample1' 'sample2'
        Dimensions without coordinates: candidate
        >>> K = random.randn(3, 3)
        >>> K = K.dot(K.T)
        >>> K = DataArray(K)
        >>> K.coords["dim_0"] = ["sample0", "sample1", "sample2"]
        >>> K.coords["dim_1"] = ["sample0", "sample1", "sample2"]
        >>>
        >>> data = _normalise_dataset(y, K=K)
        >>> print(data["y"])
        <xarray.DataArray 'outcome' (sample: 4, trait: 1)>
        array([[1.764052],
               [0.400157],
               [0.978738],
               [2.240893]])
        Coordinates:
          * sample   (sample) object 'sample0' 'sample0' 'sample1' 'sample2'
          * trait    (trait) int64 0
        >>> print(data["K"])
        <xarray.DataArray 'variance-covariance' (sample_0: 4, sample_1: 4)>
        array([[ 1.659103,  1.659103, -0.850801, -1.956422],
               [ 1.659103,  1.659103, -0.850801, -1.956422],
               [-0.850801, -0.850801,  1.687126, -0.194938],
               [-1.956422, -1.956422, -0.194938,  6.027272]])
        Coordinates:
          * sample_0  (sample_0) <U7 'sample0' 'sample0' 'sample1' 'sample2'
          * sample_1  (sample_1) <U7 'sample0' 'sample0' 'sample1' 'sample2'
        >>> with pytest.raises(ValueError):
        ...     _normalise_dataset(y, G=G, K=K)
    """
    # Upcast every input to a 2d float64 DataArray with canonical dimension
    # names; `None` inputs are passed through by the helpers.
    y = _rename_dims(_dataarray_upcast(y), "sample", "trait")
    M = _rename_dims(_dataarray_upcast(M), "sample", "covariate")
    G = _rename_dims(_dataarray_upcast(G), "sample", "candidate")
    K = _rename_dims(_dataarray_upcast(K), "sample_0", "sample_1")
    data = {"y": y, "M": M, "G": G, "K": K}
    dim_name = {
        "y": ["sample"],
        "M": ["sample"],
        "G": ["sample"],
        "K": ["sample_0", "sample_1"],
    }
    arrname = {
        "y": "outcome",
        "M": "covariates",
        "G": "candidates",
        "K": "variance-covariance",
    }
    data.update(_assign_index_to_nonindexed(_fout(data), dim_name))
    valid_samples = _infer_samples_index(_fout(data), dim_name)
    for k in data:
        for dn in dim_name[k]:
            # BUG FIX: accumulate on data[k] instead of the pre-loop value.
            # K has two sample dimensions; using the original array for the
            # second dimension discarded the coordinates assigned for the
            # first one.
            data[k] = _assign_coords(data[k], dn, valid_samples)
    for k, v in _fout(data).items():
        if v.name is None:
            v.name = arrname[k]
    # If every array already has exactly as many samples as the outcome, the
    # dataset is consistent and no further checking is required.
    n = len(data["y"].coords["sample"])
    o = [v.coords[dn].size == n for k, v in _fout(data).items() for dn in dim_name[k]]
    if all(o):
        return data
    _check_uniqueness(data, dim_name, arrname)
    _check_sample_compatibility(data, dim_name, arrname)
    return data
def _assign_index_to_nonindexed(data, dim_name):
    """Give sample labels to arrays that lack them.

    Arrays whose sample dimensions carry no coordinates borrow the labels of
    the indexed arrays (truncated to the shortest array); if no array is
    indexed, synthetic labels ``sample0``, ``sample1``, ... are generated.
    Raises :class:`ValueError` when the indexed arrays disagree on labels.
    """
    indexed = {}
    nonindexed = {}
    for k, v in data.items():
        for dn in dim_name[k]:
            if dn not in v.coords:
                nonindexed[k] = v
            else:
                indexed[k] = v
    if len(nonindexed) == 0:
        # Everything is already labelled; nothing to do.
        return data
    if len(indexed) == 0:
        # No labels anywhere: fabricate them.
        for k, v in data.items():
            for dn in dim_name[k]:
                # NOTE(review): v.coords[dn] is read before being assigned;
                # presumably xarray resolves this to the dimension here —
                # confirm against an unindexed DataArray.
                n = len(v.coords[dn])
                v.coords[dn] = ["sample{}".format(j) for j in range(n)]
        return data
    # Use the first indexed array as the reference labelling and demand that
    # every other indexed array agrees with it.
    k = next(iter(indexed.keys()))
    index = indexed[k].coords[dim_name[k][0]]
    for k, v in indexed.items():
        for dn in dim_name[k]:
            if not array_equal(index, v.coords[dn]):
                # BUG FIX: the second assignment used to overwrite the first
                # sentence instead of appending to it.
                msg = "Please, check the provided sample labels in your arrays."
                msg += " There are some inconsistences in them."
                raise ValueError(msg)
    # Truncate every array to the number of samples of the shortest one.
    n = min(v.coords[dn].size for (k, v) in data.items() for dn in dim_name[k])
    index = index[:n]
    for k in data:
        for dn in dim_name[k]:
            # BUG FIX: truncate the accumulated data[k], not the pre-loop
            # value; K has two sample dimensions and the second _take call
            # used to discard the first truncation.
            data[k] = _take(data[k], dn, n)
    nonindexed = {}
    for k, v in data.items():
        for dn in dim_name[k]:
            if dn not in v.coords:
                nonindexed[k] = v
    for k, v in nonindexed.items():
        for dn in dim_name[k]:
            v.coords[dn] = index.values
    return data
def _rename_dims(x, dim_0, dim_1):
if x is None:
return None
x = x.rename({x.dims[0]: dim_0})
x = x.rename({x.dims[1]: dim_1})
return x
def _dataarray_upcast(x):
    """Coerce ``x`` into a 2d float64 :class:`xarray.DataArray`.

    ``None`` passes through. Dask series/frames have their (computed) index
    attached as row coordinates and, for frames, their column labels attached
    as column coordinates. One-dimensional inputs are promoted to a
    single-column matrix.
    """
    import dask.dataframe as dd
    import dask.array as da
    import xarray as xr

    if x is None:
        return None
    if isinstance(x, (dd.Series, dd.DataFrame)):
        xidx = x.index.compute()
        # BUG FIX: capture the column labels BEFORE ``x`` is replaced by a
        # bare dask array below; the original re-tested
        # ``isinstance(x, dd.DataFrame)`` after the reassignment, which is
        # always False, so column labels were never attached.
        columns = x.columns if isinstance(x, dd.DataFrame) else None
        x = da.asarray(x)
        x = array_shape_reveal(x)
        x0 = xr.DataArray(x)
        x0.coords[x0.dims[0]] = xidx
        if columns is not None:
            x0.coords[x0.dims[1]] = columns
        x = x0
    if not isinstance(x, xr.DataArray):
        x = xr.DataArray(x, encoding={"dtype": "float64"})
    if x.dtype != dtype("float64"):
        x = x.astype("float64")
    if x.ndim < 2:
        # Promote vectors to single-column matrices.
        x = x.expand_dims("dim_1", 1)
    return x
def _assign_coords(x, dim_name, samples):
    """Label dimension ``dim_name`` of ``x`` with ``samples``.

    If the dimension already carries coordinates, the array is instead
    restricted to the labels present in ``samples``. ``None`` passes through.
    """
    if x is None:
        return None
    if dim_name in x.coords:
        keep = x.get_index(dim_name).isin(samples)
        return x.loc[{dim_name: keep}]
    return x.assign_coords(**{dim_name: list(samples)})
def _infer_samples_index(data, dim_name):
k, v = next(iter(data.items()))
samples = v.coords[dim_name[k][0]].values
for k, v in data.items():
for dn in dim_name[k]:
if not array_equal(v.coords[dn].values, samples):
break
else:
continue
break
else:
return Counter(samples)
samples_sets = [
Counter(v.coords[dn].values) for k, v in data.items() for dn in dim_name[k]
]
set_intersection = samples_sets[0]
for ss in samples_sets[1:]:
set_intersection = set_intersection & ss
membership_size = [
asarray([ss[si] for ss in samples_sets], int) for si in set_intersection
]
valid_samples = Counter()
for i, k in enumerate(set_intersection.keys()):
if sum(membership_size[0] > 1) <= 1:
valid_samples[k] = set_intersection[k]
return valid_samples
def _has_sample_index(x):
if hasattr(x, "index"):
return True
return hasattr(x, "coords") and "sample" == x.dims[0]
def _get_sample_index(x):
if hasattr(x, "index"):
return asarray(x.index.values)
return asarray(x.coords["sample"].values)
def _index_set_intersection(arrs):
    """Indices that are present in every indexed DataFrame/Series."""
    common = None
    for a in arrs:
        labels = set(asarray(_get_sample_index(a)))
        common = labels if common is None else common.intersection(labels)
    if common is None:
        # No arrays at all: nothing in common.
        return set()
    return common
def _same_order_if_possible(index_set, arrs):
    """Pick an ordering of ``index_set`` from the arrays, when one exists.

    Returns a copy of the labels of the first array whose label set matches
    ``index_set`` exactly; otherwise an empty integer array.
    """
    for a in arrs:
        labels = _get_sample_index(a)
        if len(labels) == len(index_set) and set(labels) == index_set:
            return labels.copy()
    return asarray([], int)
def _fout(data):
return {k: v for k, v in data.items() if v is not None}
def _take(x, dim_name, n):
i = 0
sl = [slice(None)] * x.ndim
while i < x.ndim and x.dims[i] != dim_name:
i += 1
sl[i] = slice(0, n)
return x[tuple(sl)]
def _check_uniqueness(data, dim_name, arrname):
    """Raise ``ValueError`` when a non-outcome array has duplicated sample ids."""
    msg = "Non-unique sample ids are not allowed in the {} array"
    msg += " if the sample ids are not equal nor in the same order."
    for k, v in _fout(data).items():
        if k == "y":
            # The outcome is allowed to carry repeated sample labels.
            continue
        for dn in dim_name[k]:
            labels = v.coords[dn].values
            if unique(labels).size < len(labels):
                raise ValueError(msg.format(arrname[k]))
def _check_sample_compatibility(data, dim_name, arrname):
    """Align every non-outcome array to the outcome's sample labels.

    Each array is re-selected along its sample dimension(s) using the
    outcome's labels; an ``IndexError`` from the selection is converted into a
    ``ValueError`` naming the offending array.
    """
    inc_msg = "The provided outcome and {} arrays are sample-wise incompatible."
    for k in _fout(data).keys():
        if k == "y":
            continue
        for dn in dim_name[k]:
            try:
                data[k] = data[k].sel(**{dn: data["y"].coords["sample"].values})
            except IndexError as e:
                raise ValueError(str(e) + "\n\n" + inc_msg.format(arrname[k]))