from .._dask import array_shape_reveal
[docs]def allele_frequency(expec):
r"""Compute allele frequency from its expectation.
Parameters
----------
expec : array_like
Allele expectations encoded as a variants-by-samples-by-alleles matrix.
Returns
-------
:class:`numpy.ndarray`
Allele frequencies encoded as a variants-by-alleles matrix.
"""
ploidy = expec.shape[-1]
if expec.ndim < 3:
n = 1
else:
n = expec.shape[1]
return expec.sum(-2) / (ploidy * n)
[docs]def compute_dosage(expec, alt=None):
r"""Compute dosage from allele expectation.
Parameters
----------
expec : array_like
Allele expectations encoded as a variants-by-samples-by-alleles matrix.
ref : array_like
Allele reference of each locus. The allele having the minor allele frequency
for the provided ``expec`` is used as the reference if `None`. Defaults to
`None`.
Returns
-------
:class:`numpy.ndarray`
Dosage encoded as a variants-by-samples matrix.
Examples
--------
.. doctest::
>>> from bgen_reader import read_bgen, allele_expectation, example_files
>>> from bgen_reader import compute_dosage
>>>
>>> with example_files("example.32bits.bgen") as filepath:
... bgen = read_bgen(filepath, verbose=False)
... e = allele_expectation(bgen["genotype"], nalleles=2, ploidy=2)
... dosage = compute_dosage(e).compute()
... print(dosage.shape)
... print(dosage)
(199, 500)
[[ nan 1.93575854 1.91558579 ... 1.94351192 0.10894776 1.01101689]
[1.98779296 1.97802735 0.02111815 ... 1.95492412 1.00897216 1.02255316]
[0.01550294 0.99383543 1.97933958 ... 1.98681641 1.99041748 1.99603272]
...
[1.99319479 1.980896 1.98767124 ... 1.9943846 1.99716186 1.98712159]
[0.01263467 0.09661863 0.00869752 ... 0.00643921 0.00494384 0.01504517]
[0.99185182 1.94860838 0.99734497 ... 0.02914425 1.97827146 0.9515991 ]]
"""
from numpy import asarray
if alt is None:
return expec[..., -1]
try:
return expec[alt, :, alt]
except NotImplementedError:
alt = asarray(alt, int)
return asarray(expec, float)[alt, :, alt]
[docs]def allele_expectation(p, nalleles, ploidy):
r"""Allele expectation.
Compute the expectation of each allele from the given probabilities.
It accepts three shapes of matrices:
- unidimensional array of probabilities;
- bidimensional samples-by-alleles probabilities array;
- and three dimensional variants-by-samples-by-alleles array.
Parameters
----------
p : array_like
Allele probabilities.
nalleles : int
Number of alleles.
ploidy : int
Number of complete sets of chromosomes.
Returns
-------
:class:`numpy.ndarray`
Last dimension will contain the expectation of each allele.
Examples
--------
.. doctest::
>>> from texttable import Texttable
>>> from bgen_reader import read_bgen, allele_expectation, example_files
>>>
>>> sampleid = "sample_005"
>>> rsid = "RSID_6"
>>>
>>> with example_files("example.32bits.bgen") as filepath:
... bgen = read_bgen(filepath, verbose=False)
...
... locus = bgen["variants"].query("rsid == '{}'".format(rsid)).index
... sample = bgen["samples"].query("id == '{}'".format(sampleid)).index
...
... nalleles = bgen["variants"].loc[locus, "nalleles"].item()
... ploidy = 2
...
... p = bgen["genotype"][locus[0], sample[0]].compute()
... # For unphased genotypes only.
... e = allele_expectation(bgen["genotype"][locus[0], sample[0]], nalleles, ploidy)
...
... alleles = bgen["variants"].loc[locus, "allele_ids"].item().split(",")
...
... tab = Texttable()
...
... tab.add_rows(
... [
... ["", "AA", "AG", "GG", "E[.]"],
... ["p"] + list(p) + [1.0],
... ["#" + alleles[0], 2, 1, 0, e[0]],
... ["#" + alleles[1], 0, 1, 2, e[1]],
... ]
... )
>>> print(tab.draw())
+----+-------+-------+-------+-------+
| | AA | AG | GG | E[.] |
+====+=======+=======+=======+=======+
| p | 0.012 | 0.987 | 0.001 | 1 |
+----+-------+-------+-------+-------+
| #A | 2 | 1 | 0 | 1.011 |
+----+-------+-------+-------+-------+
| #G | 0 | 1 | 2 | 0.989 |
+----+-------+-------+-------+-------+
>>> print("variant: {}".format(rsid))
variant: RSID_6
>>> print("sample : {}".format(sampleid))
sample : sample_005
Note
----
This function supports unphased genotypes only.
"""
from numpy import asarray, newaxis
g = _get_genotypes(ploidy, nalleles)
c = asarray(_genotypes_to_allele_counts(g), float)
c = c.T.reshape((1,) * (p.ndim - 1) + (c.shape[1], c.shape[0]))
p = array_shape_reveal(p)
return (c * p[..., newaxis, :]).sum(-1)
def _get_genotypes(ploidy, nalleles):
g = _make_genotypes(ploidy, 1, nalleles)
g = sorted([list(reversed(i)) for i in g])
g = [list(reversed(i)) for i in g]
return g
def _make_genotypes(ploidy, start, end):
tups = []
if ploidy == 0:
return tups
if ploidy == 1:
return [[i] for i in range(start, end + 1)]
for i in range(start, end + 1):
t = _make_genotypes(ploidy - 1, i, end)
for ti in t:
tups += [[i] + ti]
return tups
def _genotypes_to_allele_counts(genotypes):
nalleles = genotypes[-1][0]
counts = []
for g in genotypes:
count = [0] * nalleles
for gi in g:
count[gi - 1] += 1
counts.append(count)
return counts