Source code for limix._dataset

from __future__ import unicode_literals
from collections import Counter

from numpy import array_equal, asarray, unique, dtype

from ._dask import array_shape_reveal


[docs]def _normalise_dataset(y, M=None, G=None, K=None): r"""Convert data types to DataArray. This is a fundamental function for :mod:`limix` as it standardise outcome, covariates, candidates, and kinship arrays into :class:`xarray.DataArray` data type. Data arrays are :mod:`numpy`/:mod:`dask` arrays with indexed coordinates, therefore generalising data frames from :mod:`pandas`. It allows for lazy loading of data via dask arrays. It also supports arrays with different dimensionality and types, mixture of indexed and non-indexed arrays, and repeated sample labels. Examples -------- .. doctest:: >>> from __future__ import unicode_literals >>> import pytest >>> from numpy.random import RandomState >>> from pandas import DataFrame >>> from xarray import DataArray >>> from limix._dataset import _normalise_dataset >>> >>> random = RandomState(0) >>> >>> y = random.randn(4) >>> y = DataFrame(y, index=["sample0", "sample0", "sample1", "sample2"]) >>> >>> G = random.randn(5, 6) >>> >>> data = _normalise_dataset(y, G=G) >>> print(data["y"]) <xarray.DataArray 'outcome' (sample: 4, trait: 1)> array([[1.764052], [0.400157], [0.978738], [2.240893]]) Coordinates: * sample (sample) object 'sample0' 'sample0' 'sample1' 'sample2' * trait (trait) int64 0 >>> print(data["G"]) <xarray.DataArray 'candidates' (sample: 4, candidate: 6)> array([[ 1.867558, -0.977278, 0.950088, -0.151357, -0.103219, 0.410599], [ 0.144044, 1.454274, 0.761038, 0.121675, 0.443863, 0.333674], [ 1.494079, -0.205158, 0.313068, -0.854096, -2.55299 , 0.653619], [ 0.864436, -0.742165, 2.269755, -1.454366, 0.045759, -0.187184]]) Coordinates: * sample (sample) object 'sample0' 'sample0' 'sample1' 'sample2' Dimensions without coordinates: candidate >>> K = random.randn(3, 3) >>> K = K.dot(K.T) >>> K = DataArray(K) >>> K.coords["dim_0"] = ["sample0", "sample1", "sample2"] >>> K.coords["dim_1"] = ["sample0", "sample1", "sample2"] >>> >>> data = _normalise_dataset(y, K=K) >>> print(data["y"]) <xarray.DataArray 'outcome' (sample: 4, trait: 1)> array([[1.764052], [0.400157], [0.978738], [2.240893]]) Coordinates: * sample (sample) object 'sample0' 'sample0' 'sample1' 'sample2' * trait (trait) int64 0 >>> print(data["K"]) <xarray.DataArray 'variance-covariance' (sample_0: 4, sample_1: 4)> array([[ 1.659103, 1.659103, -0.850801, -1.956422], [ 1.659103, 1.659103, -0.850801, -1.956422], [-0.850801, -0.850801, 1.687126, -0.194938], [-1.956422, -1.956422, -0.194938, 6.027272]]) Coordinates: * sample_0 (sample_0) <U7 'sample0' 'sample0' 'sample1' 'sample2' * sample_1 (sample_1) <U7 'sample0' 'sample0' 'sample1' 'sample2' >>> with pytest.raises(ValueError): ... _normalise_dataset(y, G=G, K=K) """ y = _rename_dims(_dataarray_upcast(y), "sample", "trait") M = _rename_dims(_dataarray_upcast(M), "sample", "covariate") G = _rename_dims(_dataarray_upcast(G), "sample", "candidate") K = _rename_dims(_dataarray_upcast(K), "sample_0", "sample_1") data = {"y": y, "M": M, "G": G, "K": K} dim_name = { "y": ["sample"], "M": ["sample"], "G": ["sample"], "K": ["sample_0", "sample_1"], } arrname = { "y": "outcome", "M": "covariates", "G": "candidates", "K": "variance-covariance", } data.update(_assign_index_to_nonindexed(_fout(data), dim_name)) valid_samples = _infer_samples_index(_fout(data), dim_name) for k, v in data.items(): for dn in dim_name[k]: data[k] = _assign_coords(v, dn, valid_samples) for k, v in _fout(data).items(): if v.name is None: v.name = arrname[k] n = len(data["y"].coords["sample"]) o = [v.coords[dn].size == n for k, v in _fout(data).items() for dn in dim_name[k]] if all(o): return data _check_uniqueness(data, dim_name, arrname) _check_sample_compatibility(data, dim_name, arrname) return data
def _assign_index_to_nonindexed(data, dim_name): indexed = {} nonindexed = {} for k, v in data.items(): for dn in dim_name[k]: if dn not in v.coords: nonindexed[k] = v else: indexed[k] = v if len(nonindexed) == 0: return data if len(indexed) == 0: for k, v in data.items(): for dn in dim_name[k]: n = len(v.coords[dn]) v.coords[dn] = ["sample{}".format(j) for j in range(n)] return data k = next(iter(indexed.keys())) index = indexed[k].coords[dim_name[k][0]] for k, v in indexed.items(): for dn in dim_name[k]: if not array_equal(index, v.coords[dn]): msg = "Please, check the provided sample labels in your arrays." msg = " There are some inconsistences in them." raise ValueError(msg) n = min(v.coords[dn].size for (k, v) in data.items() for dn in dim_name[k]) index = index[:n] for k, v in data.items(): for dn in dim_name[k]: data[k] = _take(v, dn, n) nonindexed = {} for k, v in data.items(): for dn in dim_name[k]: if dn not in v.coords: nonindexed[k] = v for k, v in nonindexed.items(): for dn in dim_name[k]: v.coords[dn] = index.values return data def _rename_dims(x, dim_0, dim_1): if x is None: return None x = x.rename({x.dims[0]: dim_0}) x = x.rename({x.dims[1]: dim_1}) return x def _dataarray_upcast(x): import dask.dataframe as dd import dask.array as da import xarray as xr if x is None: return None if isinstance(x, (dd.Series, dd.DataFrame)): xidx = x.index.compute() x = da.asarray(x) x = array_shape_reveal(x) x0 = xr.DataArray(x) x0.coords[x0.dims[0]] = xidx if isinstance(x, dd.DataFrame): x0.coords[x0.dims[1]] = x.columns x = x0 if not isinstance(x, xr.DataArray): x = xr.DataArray(x, encoding={"dtype": "float64"}) if x.dtype != dtype("float64"): x = x.astype("float64") if x.ndim < 2: x = x.expand_dims("dim_1", 1) return x def _assign_coords(x, dim_name, samples): if x is None: return None if dim_name not in x.coords: x = x.assign_coords(**{dim_name: list(samples)}) else: x = x.loc[{dim_name: x.get_index(dim_name).isin(samples)}] return x def _infer_samples_index(data, dim_name): k, v = next(iter(data.items())) samples = v.coords[dim_name[k][0]].values for k, v in data.items(): for dn in dim_name[k]: if not array_equal(v.coords[dn].values, samples): break else: continue break else: return Counter(samples) samples_sets = [ Counter(v.coords[dn].values) for k, v in data.items() for dn in dim_name[k] ] set_intersection = samples_sets[0] for ss in samples_sets[1:]: set_intersection = set_intersection & ss membership_size = [ asarray([ss[si] for ss in samples_sets], int) for si in set_intersection ] valid_samples = Counter() for i, k in enumerate(set_intersection.keys()): if sum(membership_size[0] > 1) <= 1: valid_samples[k] = set_intersection[k] return valid_samples def _has_sample_index(x): if hasattr(x, "index"): return True return hasattr(x, "coords") and "sample" == x.dims[0] def _get_sample_index(x): if hasattr(x, "index"): return asarray(x.index.values) return asarray(x.coords["sample"].values) def _index_set_intersection(arrs): """Indices that are present in every indexed DataFrame/Series.""" index_set = None for a in arrs: i = set(asarray(_get_sample_index(a))) if index_set is None: index_set = i else: index_set = index_set.intersection(i) if index_set is None: return set() return index_set def _same_order_if_possible(index_set, arrs): for a in arrs: i = _get_sample_index(a) if len(index_set) == len(i) and index_set == set(i): return i.copy() return asarray([], int) def _fout(data): return {k: v for k, v in data.items() if v is not None} def _take(x, dim_name, n): i = 0 sl = [slice(None)] * x.ndim while i < x.ndim and x.dims[i] != dim_name: i += 1 sl[i] = slice(0, n) return x[tuple(sl)] def _check_uniqueness(data, dim_name, arrname): msg = "Non-unique sample ids are not allowed in the {} array" msg += " if the sample ids are not equal nor in the same order." for k, v in _fout(data).items(): if k == "y": continue for dn in dim_name[k]: idx = v.coords[dn].values if len(unique(idx)) < len(idx): raise ValueError(msg.format(arrname[k])) def _check_sample_compatibility(data, dim_name, arrname): inc_msg = "The provided outcome and {} arrays are sample-wise incompatible." for k in _fout(data).keys(): if k == "y": continue for dn in dim_name[k]: try: data[k] = data[k].sel(**{dn: data["y"].coords["sample"].values}) except IndexError as e: raise ValueError(str(e) + "\n\n" + inc_msg.format(arrname[k]))