import pandas as pd
import matplotlib.pyplot as plt
import HLAfreq
from HLAfreq import HLAfreq_pymc as HLAhdi

WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.

# Download dataset
country = "Mongolia"
locus = "DQB1"

base_url = HLAfreq.makeURL(country, locus=locus)
aftab = HLAfreq.getAFdata(base_url)
aftab = HLAfreq.only_complete(aftab)
HLAfreq.check_resolution(aftab)
aftab = HLAfreq.decrease_resolution(aftab, 2)
caf = HLAfreq.combineAF(aftab)

1 pages of results
Download complete
2    64
3     3
Name: allele, dtype: int64
Multiple resolutions in AFtab. Fix with decrease_resolution()

# Calculate high density intervals
hdi = HLAhdi.AFhdi(aftab, credible_interval=0.95)
# Add to the combined allele frequency dataset
caf = pd.merge(caf, hdi, how="left", on="allele")
caf[['allele', 'allele_freq', 'lo','hi','post_mean']]

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [frac, conc]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 7 seconds.

HLAfreq.plotAF(caf, aftab, hdi=hdi)

# When k is the number of alleles
prior=[1/k]*k

base_url = HLAfreq.makeURL("Italy", locus="DQB1")
aftab = HLAfreq.getAFdata(base_url)
aftab = HLAfreq.only_complete(aftab)
HLAfreq.check_resolution(aftab)
aftab = HLAfreq.decrease_resolution(aftab, 2)
caf = HLAfreq.combineAF(aftab)
hdi = HLAhdi.AFhdi(aftab,credible_interval=0.95)
HLAfreq.plotAF(caf, aftab, hdi=hdi)

3 pages of results
Download complete
population   loci
Italy pop 4  DQB1    0.574
Name: allele_freq, dtype: float64
1 studies have total allele frequency < 0.95
2    206
3      1
Name: allele, dtype: int64
Multiple resolutions in AFtab. Fix with decrease_resolution()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [frac, conc]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 11 seconds.

WARNING: The default allele frequency estimate is outside of the CI estimated by the compound method for some alleles!
There are several possible reasons, see the credible intervals example: https://github.com/BarinthusBio/HLAfreq/blob/main/examples/credible_intervals.ipynb
If you have set `credible_interval` to < 0.95, this may be a non-issue.

aftab[aftab.allele=="DQB1*02:01"]

# Format data and fit variance model
c_array, allele_names = HLAhdi._make_c_array(aftab)
idata = HLAhdi._fit_Dirichlet_Multinomial(c_array)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [frac, conc]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 12 seconds.

import arviz as az
az.summary(idata)

az.plot_energy(idata)

<AxesSubplot: >

az.plot_trace(idata, compact=False, combined=False)

/home/dwells/miniconda3/envs/hlafreq/lib/python3.10/site-packages/arviz/plots/traceplot.py:217: UserWarning: rcParams['plot.max_subplots'] (20) is smaller than the number of variables to plot (45), generating only 20 plots
  warnings.warn(

array([[<AxesSubplot: title={'center': 'frac\n0'}>,
        <AxesSubplot: title={'center': 'frac\n0'}>],
       [<AxesSubplot: title={'center': 'frac\n1'}>,
        <AxesSubplot: title={'center': 'frac\n1'}>],
       [<AxesSubplot: title={'center': 'frac\n2'}>,
        <AxesSubplot: title={'center': 'frac\n2'}>],
       [<AxesSubplot: title={'center': 'frac\n3'}>,
        <AxesSubplot: title={'center': 'frac\n3'}>],
       [<AxesSubplot: title={'center': 'frac\n4'}>,
        <AxesSubplot: title={'center': 'frac\n4'}>],
       [<AxesSubplot: title={'center': 'frac\n5'}>,
        <AxesSubplot: title={'center': 'frac\n5'}>],
       [<AxesSubplot: title={'center': 'frac\n6'}>,
        <AxesSubplot: title={'center': 'frac\n6'}>],
       [<AxesSubplot: title={'center': 'frac\n7'}>,
        <AxesSubplot: title={'center': 'frac\n7'}>],
       [<AxesSubplot: title={'center': 'frac\n8'}>,
        <AxesSubplot: title={'center': 'frac\n8'}>],
       [<AxesSubplot: title={'center': 'frac\n9'}>,
        <AxesSubplot: title={'center': 'frac\n9'}>],
       [<AxesSubplot: title={'center': 'frac\n10'}>,
        <AxesSubplot: title={'center': 'frac\n10'}>],
       [<AxesSubplot: title={'center': 'frac\n11'}>,
        <AxesSubplot: title={'center': 'frac\n11'}>],
       [<AxesSubplot: title={'center': 'frac\n12'}>,
        <AxesSubplot: title={'center': 'frac\n12'}>],
       [<AxesSubplot: title={'center': 'frac\n13'}>,
        <AxesSubplot: title={'center': 'frac\n13'}>],
       [<AxesSubplot: title={'center': 'frac\n14'}>,
        <AxesSubplot: title={'center': 'frac\n14'}>],
       [<AxesSubplot: title={'center': 'frac\n15'}>,
        <AxesSubplot: title={'center': 'frac\n15'}>],
       [<AxesSubplot: title={'center': 'frac\n16'}>,
        <AxesSubplot: title={'center': 'frac\n16'}>],
       [<AxesSubplot: title={'center': 'frac\n17'}>,
        <AxesSubplot: title={'center': 'frac\n17'}>],
       [<AxesSubplot: title={'center': 'frac\n18'}>,
        <AxesSubplot: title={'center': 'frac\n18'}>],
       [<AxesSubplot: title={'center': 'frac\n19'}>,
        <AxesSubplot: title={'center': 'frac\n19'}>]], dtype=object)

	allele	allele_freq	lo	hi	post_mean
0	DQB1*02:01	0.102965	0.070885	0.141614	0.107
1	DQB1*02:02	0.062476	0.026928	0.080378	0.052
2	DQB1*03:01	0.271881	0.216031	0.325208	0.274
3	DQB1*03:02	0.045532	0.023916	0.069979	0.044
4	DQB1*03:03	0.128528	0.065484	0.140824	0.102
5	DQB1*03:04	0.001752	0.000079	0.010115	0.004
6	DQB1*03:13	0.001752	0.000117	0.010007	0.004
7	DQB1*04:01	0.038527	0.016878	0.059678	0.038
8	DQB1*04:02	0.021780	0.012336	0.048523	0.031
9	DQB1*05:01	0.060791	0.035718	0.09095	0.062
10	DQB1*05:02	0.039710	0.020024	0.065927	0.042
11	DQB1*05:03	0.020403	0.00555	0.033167	0.018
12	DQB1*05:04	0.001170	0.000059	0.008448	0.004
13	DQB1*05:42	0.001170	0.000095	0.008946	0.004
14	DQB1*06:01	0.065409	0.03638	0.091338	0.063
15	DQB1*06:02	0.071234	0.03162	0.088836	0.061
16	DQB1*06:03	0.029325	0.018884	0.062921	0.04
17	DQB1*06:04	0.018674	0.010062	0.04377	0.026
18	DQB1*06:05	0.001165	0.00005	0.008732	0.004
19	DQB1*06:07	0.001743	0.000112	0.009819	0.004
20	DQB1*06:08	0.001165	0.000012	0.008881	0.004
21	DQB1*06:09	0.012848	0.001486	0.023908	0.012

	allele	loci	population	allele_freq	sample_size
0	DQB1*02:01	DQB1	Germany DKMS - Italy minority	0.1898	1159
17	DQB1*02:01	DQB1	Italy Bergamo	0.0910	101
54	DQB1*02:01	DQB1	Italy Central	0.1730	380
69	DQB1*02:01	DQB1	Italy North pop 3	0.0730	97
102	DQB1*02:01	DQB1	Italy Rome	0.1200	100
144	DQB1*02:01	DQB1	Italy Sardinia pop2	0.5900	1129
159	DQB1*02:01	DQB1	Italy pop 2	0.1980	53
171	DQB1*02:01	DQB1	Italy pop 3	0.2300	179
185	DQB1*02:01	DQB1	Italy pop 5	0.0780	975

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
frac[0]	0.144	0.025	0.095	0.189	0.000	0.000	5064.0	2570.0	1.0
frac[1]	0.020	0.008	0.007	0.034	0.000	0.000	4900.0	2553.0	1.0
frac[2]	0.003	0.002	0.000	0.008	0.000	0.000	4228.0	2307.0	1.0
frac[3]	0.209	0.030	0.152	0.264	0.000	0.000	4433.0	2760.0	1.0
frac[4]	0.072	0.017	0.042	0.104	0.000	0.000	5285.0	2477.0	1.0
frac[5]	0.042	0.011	0.021	0.063	0.000	0.000	4996.0	2143.0	1.0
frac[6]	0.015	0.006	0.005	0.025	0.000	0.000	4635.0	2491.0	1.0
frac[7]	0.008	0.004	0.002	0.015	0.000	0.000	4582.0	2966.0	1.0
frac[8]	0.002	0.002	0.000	0.005	0.000	0.000	2501.0	1628.0	1.0
frac[9]	0.002	0.002	0.000	0.004	0.000	0.000	2900.0	1691.0	1.0
frac[10]	0.002	0.002	0.000	0.005	0.000	0.000	2636.0	1445.0	1.0
frac[11]	0.002	0.002	0.000	0.005	0.000	0.000	3434.0	1832.0	1.0
frac[12]	0.002	0.002	0.000	0.004	0.000	0.000	2833.0	1737.0	1.0
frac[13]	0.003	0.002	0.000	0.007	0.000	0.000	4053.0	2244.0	1.0
frac[14]	0.002	0.002	0.000	0.004	0.000	0.000	2968.0	1520.0	1.0
frac[15]	0.002	0.002	0.000	0.005	0.000	0.000	2586.0	1530.0	1.0
frac[16]	0.003	0.002	0.000	0.008	0.000	0.000	3245.0	1888.0	1.0
frac[17]	0.005	0.003	0.001	0.011	0.000	0.000	3413.0	2071.0	1.0
frac[18]	0.040	0.011	0.021	0.062	0.000	0.000	4816.0	2444.0	1.0
frac[19]	0.003	0.002	0.000	0.007	0.000	0.000	3833.0	1695.0	1.0
frac[20]	0.089	0.019	0.055	0.123	0.000	0.000	5406.0	2574.0	1.0
frac[21]	0.070	0.016	0.042	0.101	0.000	0.000	5408.0	2315.0	1.0
frac[22]	0.052	0.013	0.029	0.077	0.000	0.000	5539.0	2855.0	1.0
frac[23]	0.005	0.003	0.001	0.010	0.000	0.000	3347.0	1889.0	1.0
frac[24]	0.027	0.008	0.012	0.043	0.000	0.000	3708.0	2691.0	1.0
frac[25]	0.048	0.012	0.026	0.071	0.000	0.000	5204.0	2544.0	1.0
frac[26]	0.049	0.013	0.026	0.072	0.000	0.000	4751.0	2326.0	1.0
frac[27]	0.044	0.012	0.022	0.066	0.000	0.000	4116.0	2295.0	1.0
frac[28]	0.003	0.002	0.000	0.008	0.000	0.000	3708.0	2002.0	1.0
frac[29]	0.002	0.002	0.000	0.004	0.000	0.000	2600.0	1636.0	1.0
frac[30]	0.002	0.002	0.000	0.004	0.000	0.000	2956.0	1951.0	1.0
frac[31]	0.002	0.002	0.000	0.004	0.000	0.000	3457.0	2407.0	1.0
frac[32]	0.012	0.005	0.004	0.021	0.000	0.000	3938.0	2234.0	1.0
frac[33]	0.002	0.002	0.000	0.005	0.000	0.000	2976.0	1941.0	1.0
frac[34]	0.002	0.002	0.000	0.005	0.000	0.000	3412.0	1919.0	1.0
frac[35]	0.002	0.002	0.000	0.004	0.000	0.000	2827.0	1542.0	1.0
frac[36]	0.002	0.002	0.000	0.004	0.000	0.000	3072.0	1448.0	1.0
frac[37]	0.002	0.002	0.000	0.004	0.000	0.000	2261.0	1217.0	1.0
frac[38]	0.002	0.002	0.000	0.004	0.000	0.000	2664.0	1623.0	1.0
frac[39]	0.002	0.002	0.000	0.005	0.000	0.000	3466.0	2163.0	1.0
frac[40]	0.002	0.002	0.000	0.005	0.000	0.000	3012.0	1648.0	1.0
frac[41]	0.002	0.002	0.000	0.004	0.000	0.000	3740.0	2475.0	1.0
frac[42]	0.002	0.002	0.000	0.005	0.000	0.000	2703.0	1464.0	1.0
frac[43]	0.002	0.002	0.000	0.005	0.000	0.000	3380.0	2648.0	1.0
conc	19.339	3.001	13.982	25.118	0.054	0.038	3001.0	2386.0	1.0

Calculating credible intervals¶

Explaining differences between `allele_freq` and `post_mean`¶

Inappropriate prior¶

Heterogeneity in estimated frequency for a single allele¶

Issues with model convergence¶

Model details¶

Fitting the model¶

Validating the model¶

Calculating credible intervals¶

Explaining differences between allele_freq and post_mean¶

Inappropriate prior¶

Heterogeneity in estimated frequency for a single allele¶

Issues with model convergence¶

Model details¶

Fitting the model¶

Validating the model¶

Explaining differences between `allele_freq` and `post_mean`¶