import HLAfreq
import HLAfreq.HLAfreq_pymc as HLAhdi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.

# Fetch the HLA-A allele frequency records for Venezuela:
# build the query URL first, then download the data it points to.
country = "Venezuela"
base_url = HLAfreq.makeURL(
    country,
    locus="A",
    standard="g",
)
venezuelaAF = HLAfreq.getAFdata(base_url)

1 pages of results
Download complete

# Clean the Venezuelan studies and bring every allele to resolution 2
# (presumably only_complete drops incomplete studies -- confirm in the
# HLAfreq docs).
venezuelaAF = HLAfreq.decrease_resolution(
    HLAfreq.only_complete(venezuelaAF),
    2,
)

# Combine the individual studies into a single estimate and label it.
cafV = HLAfreq.combineAF(venezuelaAF)
cafV["study"] = "Venezuela"

# Plot the combined estimate together with the per-study data.
HLAfreq.plotAF(cafV, venezuelaAF)

# Hand-written prior: one concentration value per allele, with extra
# weight (10) on the second entry.
# NOTE(review): assumes cafV contains exactly five alleles, in the same
# order as this list -- confirm against the downloaded data.
manual_prior = [1, 10, 1, 1, 1]

# Visualise the prior on its own, labelled with the allele names.
HLAfreq.plot_prior(
    concentration=manual_prior,
    labels=cafV["allele"],
)

# Combine the Venezuelan studies again, this time passing the manual
# prior as `alpha`.
cafMP = HLAfreq.combineAF(venezuelaAF, alpha=manual_prior)

# Visualise the resulting posterior alongside the study data.
HLAfreq.plotAF(cafMP, AFtab=venezuelaAF)

# Fetch the HLA-A allele frequency records for Colombia:
# build the query URL first, then download the data it points to.
country = "Colombia"
base_url = HLAfreq.makeURL(
    country,
    locus="A",
    standard="g",
)
colombiaAF = HLAfreq.getAFdata(base_url)

1 pages of results
Download complete

# Build a national average for Colombia: clean the studies, bring every
# allele to resolution 2, then combine them into one estimate.
colombiaAF = HLAfreq.decrease_resolution(
    HLAfreq.only_complete(colombiaAF),
    2,
)
cafC = HLAfreq.combineAF(colombiaAF)

# Tag the combined estimate with a population name; this table is later
# concatenated with the Venezuelan studies to serve as a prior.
cafC["population"] = "Colombia"

# Derive a prior from the Colombian national average. Work on a copy so
# cafC itself is left untouched, and scale the sample size down to 1% so
# the prior carries far less weight than its original sample size.
study_prior = cafC.copy()
study_prior["sample_size"] = study_prior["sample_size"] * 0.01

# Visualise the prior. The concentration for each allele is
# 2 * sample_size * allele_freq (presumably two allele copies per
# sampled individual -- confirm).
HLAfreq.plot_prior(
    concentration=(
        2 * study_prior["sample_size"] * study_prior["allele_freq"]
    ).tolist()
)

# AFtabSP: Allele Frequency TABle with Study Prior.
# Stack the Venezuelan studies and the Colombian prior, keeping only the
# columns that both tables share (inner join on columns).
AFtabSP = pd.concat(objs=[venezuelaAF, study_prior], join="inner")

# cafSP: Combined Allele Frequency with Study Prior.
# Because the prior is supplied as a dataset, the explicit prior argument
# `alpha` is set to 0 for every allele. Its length must equal the number
# of distinct alleles in the stacked table, so count them rather than
# hard-coding the number.
nalleles = AFtabSP["allele"].nunique(dropna=False)
cafSP = HLAfreq.combineAF(AFtabSP, alpha=[0] * nalleles)

# Visualise the posterior for every allele.
HLAfreq.plotAF(cafSP, AFtabSP)

# Visualise the posterior again, restricted to the alleles that were
# actually observed in Venezuela. Both the combined estimate and the
# underlying table are filtered with the same allele list.
venezuela_alleles = cafV["allele"]
HLAfreq.plotAF(
    cafSP.loc[cafSP["allele"].isin(venezuela_alleles)],
    AFtabSP.loc[AFtabSP["allele"].isin(venezuela_alleles)],
)

# Target population and locus for the credible-interval example.
country = "Oman"
locus = "A"

# Download the dataset and clean it.
base_url = HLAfreq.makeURL(country, locus=locus)
aftab = HLAfreq.only_complete(HLAfreq.getAFdata(base_url))

# Report the allele resolutions present, then bring everything to
# resolution 2 so the studies can be combined.
HLAfreq.check_resolution(aftab)
aftab = HLAfreq.decrease_resolution(aftab, 2)

# Default combined allele frequency estimate.
caf = HLAfreq.combineAF(aftab)

# High density intervals (95% credible interval) from the compound model.
hdi = HLAhdi.AFhdi(aftab, credible_interval=0.95)

# Plot the combined estimate, the study data and the intervals together.
HLAfreq.plotAF(caf, aftab, hdi=hdi)

1 pages of results
Download complete
2    80
3     3
4     2
Name: allele, dtype: int64
Multiple resolutions in AFtab. Fix with decrease_resolution()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [frac, conc]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 20 seconds.

WARNING: The default allele frequency estimate is outside of the CI estimated by the compound method for some alleles!
There are several possible reasons, see the credible intervals example: https://github.com/Vaccitech/HLAfreq/blob/main/examples/credible_intervals.ipynb
If you have set `credible_interval` to < 0.95, this may be a non-issue.

# Compare the default estimate (x axis) with the compound model (y axis).
# NOTE(review): assumes caf and hdi list the alleles in the same order --
# the values are paired by position; confirm.
default_af = caf["allele_freq"]

# Posterior means as points, credible intervals as vertical bars.
plt.scatter(default_af, hdi.post_mean)
plt.vlines(default_af, hdi.lo, hdi.hi)

# Dashed y = x reference line from 0 to 0.17; points on it mean the two
# estimates agree.
plt.plot([0, .17], [0, .17], c="black", linestyle="--")

plt.xlabel("Default model AF")
plt.ylabel("Compound model AF")
plt.show()

# Alternative prior for the compound model: a concentration of 1/k for
# each of the k alleles in the combined estimate.
k = len(caf)
perks_hdi = HLAhdi.AFhdi(
    aftab,
    credible_interval=0.95,
    prior=[1 / k] * k,
)

# Plot the default estimate with the intervals from the 1/k prior.
HLAfreq.plotAF(caf, aftab, hdi=perks_hdi)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [frac, conc]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 293 seconds.

WARNING: The default allele frequency estimate is outside of the CI estimated by the compound method for some alleles!
There are several possible reasons, see the credible intervals example: https://github.com/Vaccitech/HLAfreq/blob/main/examples/credible_intervals.ipynb
If you have set `credible_interval` to < 0.95, this may be a non-issue.

# Compare the default estimate (x axis) with the compound model fitted
# under the 1/k prior (y axis).
# NOTE(review): assumes caf and perks_hdi list the alleles in the same
# order -- the values are paired by position; confirm.
default_af = caf["allele_freq"]

# Posterior means as points, credible intervals as vertical bars.
plt.scatter(default_af, perks_hdi.post_mean)
plt.vlines(default_af, perks_hdi.lo, perks_hdi.hi)

# Dashed y = x reference line from 0 to 0.17; points on it mean the two
# estimates agree.
plt.plot([0, .17], [0, .17], c="black", linestyle="--")

plt.xlabel("Default model AF")
plt.ylabel("1/k Compound model AF")
plt.show()

Working with priors

Alternate priors