Combine multi-country HLA data¶
We can calculate the average allele frequency across multiple countries by combining studies within countries and then between countries. This second step is very similar, except that we set datasetID to the country rather than the population, which is the default. The estimates from different countries can be weighted by sample size or by another supplied variable such as population size; both approaches are described below.
import HLAfreq
from HLAfreq import HLAfreq_pymc as HLAhdi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import arviz as az
WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
Download HLA data for each specified country¶
countries = ['Cameroon','Cape+Verde','Ghana','Guinea',
'Guinea-Bissau', 'Kenya','Sao+Tome+and+Principe','Senegal',
'South+Africa','Uganda','Zimbabwe']
for country in countries:
    print(country)
    base_url = HLAfreq.makeURL(
        country, standard='s', locus="A",
        resolution_pattern="bigger_equal_than", resolution=2)
    aftab = HLAfreq.getAFdata(base_url)
    aftab.to_csv("../data/example/multi_country/%s_raw.csv" %country, index=False)
Cameroon 2 pages of results Download complete
Cape+Verde 1 pages of results Download complete
Ghana 1 pages of results Download complete
Guinea 3 pages of results Download complete
Guinea-Bissau 2 pages of results Download complete
Kenya 2 pages of results Download complete
Sao+Tome+and+Principe 1 pages of results Download complete
Senegal 1 pages of results Download complete
South+Africa 3 pages of results Download complete
Uganda 1 pages of results Download complete
Zimbabwe 1 pages of results Download complete
Combine allele frequencies within country¶
cafs = []
for country in countries:
    # Load raw country data
    aftab = pd.read_csv("../data/example/multi_country/%s_raw.csv" %country)
    # Drop any incomplete studies
    aftab = HLAfreq.only_complete(aftab)
    # Ensure all alleles have the same resolution
    aftab = HLAfreq.decrease_resolution(aftab, 2)
    # Combine studies within country
    caf = HLAfreq.combineAF(aftab)
    # Add country name to dataset, this is used as `datasetID` going forward
    caf['country'] = country
    cafs.append(caf)
population                 loci
South Africa Natal Zulu    A       0.935
Name: allele_freq, dtype: float64
1 studies have total allele frequency < 0.95
Combine all country data¶
Concatenate each single-country dataset into a single multi-country dataset, then combine allele frequencies using the manually added country column as datasetID.
cafs = pd.concat(cafs, ignore_index=True)
international = HLAfreq.combineAF(cafs, datasetID='country')
View allele frequencies¶
Plot allele frequencies averaged across countries. Filter to only alleles with >1% frequency after averaging.
# Plot international averages as bar plot
mask = international.allele_freq > 0.01
international[mask].plot.barh('allele', 'allele_freq')
plt.show()
mask2 = cafs.allele.isin(international.allele[mask])
# Plot national averages as grouped bar plot
cafs[mask2].pivot(index='allele', columns='country', values='allele_freq').plot.bar()
plt.show()
# Plot international allele frequencies estimates and individual countries
# Without filtering
HLAfreq.plotAF(international, cafs, datasetID='country')
# Plot specific alleles
# Select alleles to plot
hifreq = international[international.allele_freq > 0.01].allele
# Must be a list
hifreq = hifreq.tolist()
# Plot only selected alleles
HLAfreq.plotAF(
international[international.allele.isin(hifreq)],
cafs[cafs.allele.isin(hifreq)],
datasetID='country')
From these plots we can clearly see that one country has much higher frequencies of A*24:02, A*11:01, and A*34:01. It is also clear that the international average allele frequency for A*24:02 is being skewed by this country. We can view the cafs dataset to see which country this is: Guinea. For more information on dealing with outliers and this kind of skewing, see the credible intervals example.
This method can also be applied within individual countries to identify studies which differ from the majority, possibly because they focus on a specific ethnic group; an illustrative within-country check is sketched after the Guinea example below.
cafs[cafs.allele == "A*24:02"].sort_values('allele_freq')
| | allele | loci | wav | c | sample_size | alpha | allele_freq | country |
|---|---|---|---|---|---|---|---|---|
| 383 | A*24:02 | A | 0.004000 | 1.8400 | 230 | 1 | 0.005818 | Zimbabwe |
| 262 | A*24:02 | A | 0.005000 | 1.6500 | 165 | 1 | 0.007458 | Senegal |
| 183 | A*24:02 | A | 0.010700 | 16.0280 | 749 | 1 | 0.010936 | Kenya |
| 11 | A*24:02 | A | 0.010322 | 5.9660 | 289 | 1 | 0.011301 | Cameroon |
| 136 | A*24:02 | A | 0.020740 | 7.9640 | 192 | 1 | 0.021445 | Guinea-Bissau |
| 342 | A*24:02 | A | 0.025292 | 16.9960 | 336 | 1 | 0.024831 | Uganda |
| 231 | A*24:02 | A | 0.025449 | 4.9880 | 98 | 1 | 0.026148 | Sao+Tome+and+Principe |
| 45 | A*24:02 | A | 0.068500 | 16.9880 | 124 | 1 | 0.064939 | Cape+Verde |
| 297 | A*24:02 | A | 0.077965 | 61.5924 | 395 | 1 | 0.074961 | South+Africa |
| 99 | A*24:02 | A | 0.327856 | 419.6560 | 640 | 1 | 0.319246 | Guinea |
It is important to do a sanity check on any apparent outliers. In this case the apparent outlier, Guinea, is actually caused by our search returning studies for Guinea-Bissau and Papua New Guinea.
aftab = pd.read_csv("../data/example/multi_country/Guinea_raw.csv")
aftab.population.unique()
array(['Guinea Bissau Balanta', 'Guinea Bissau Bijago', 'Guinea Bissau Fula', 'Guinea Bissau Papel', 'Guinea Bissau', 'Papua New Guinea East New Britain Rabaul', 'Papua New Guinea Eastern Highlands Goroka Asaro', 'Papua New Guinea Karimui Plateau Pawaia', 'Papua New Guinea Madang', 'Papua New Guinea West Schrader Ranges Haruai', 'Papua New Guinea Wosera Abelam'], dtype=object)
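The same kind of per-study check can be run within a single country before combining, as mentioned above. The sketch below is illustrative only; Kenya and A*24:02 are arbitrary choices here.
# Illustrative within-country check: sort studies by their reported frequency
# of one allele to spot populations that differ from the rest
ke = pd.read_csv("../data/example/multi_country/Kenya_raw.csv")
ke = HLAfreq.only_complete(ke)
ke = HLAfreq.decrease_resolution(ke, 2)
ke[ke.allele == "A*24:02"].sort_values("allele_freq")[["population", "allele_freq", "sample_size"]]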
Weighting countries by population size¶
When combining allele frequencies across studies, larger studies are given more weight. This is handled by the weights parameter: specifically, weights is multiplied by the allele frequency to calculate the "concentration" of the Dirichlet distribution before the prior is added. By default weights is double the sample size, as two alleles are measured for each person sampled due to diploidy. However, alternative weights can be specified. The interpretation of the column used for weights is that $n$ samples were observed with allele $x$.
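To make the weighting concrete, here is a small numerical sketch of the idea; it uses made-up frequencies and weights and is not HLAfreq's internal code.
# Illustrative sketch: how per-dataset weights enter the Dirichlet concentration
# when combining two hypothetical datasets covering three alleles
af = np.array([[0.5, 0.3, 0.2],   # dataset 1 allele frequencies
               [0.6, 0.3, 0.1]])  # dataset 2 allele frequencies
weights = np.array([200, 800])    # default weight is 2 * sample_size per dataset
prior = np.ones(af.shape[1])      # flat prior of 1 per allele
# Each dataset contributes allele_freq * weight to the concentration
concentration = (af * weights[:, None]).sum(axis=0) + prior
# Posterior mean allele frequency given this concentration
concentration / concentration.sum()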
If estimating allele frequency for a region, it may be important to account for the size of national populations. Below we calculate an individual weight for each country: a country's population, as a proportion of the sum of all the countries' populations, multiplied by the number of countries, is used as the weight for each individual in that country. This means that individuals from large countries contribute more to the total sample but the total sample size is unchanged. Therefore, the uncertainty of the estimates is still determined by the overall sample size.
population_sizes = {'Cameroon':24348251,
'Cape+Verde':563198,
'Ghana':30832019,
'Guinea':12907395,
'Guinea-Bissau':1646077,
'Kenya':47564296,
'Sao+Tome+and+Principe':214610,
'Senegal':17223497,
'South+Africa':60604992,
'Uganda':42885900,
'Zimbabwe':15178979
}
country_data = pd.DataFrame(
{'country':population_sizes.keys(),
'population':population_sizes.values()}
)
# What proportion of the regional population does each country account for
country_data['proportion'] = country_data.population/country_data.population.sum()
# How much will each individual in the country count towards the sample size?
country_data['individual_weight'] = country_data['proportion'] * len(country_data.country)
# Add country data to Combined Allele Frequency data
cafs = pd.merge(cafs, country_data, how="left", on='country')
# Sample size is multiplied by this individual weight and doubled
# this accounts for diploid samples from each individual
cafs['weighted_sample_size'] = cafs.sample_size * 2 * cafs.individual_weight
# Calculate allele frequency, weighting each country by the column weighted_sample_size
winternational = HLAfreq.combineAF(cafs, datasetID='country', weights='weighted_sample_size')
hifreq = winternational[winternational.allele_freq > 0.01].allele
# Must be a list
hifreq = hifreq.tolist()
# Plot only selected alleles
HLAfreq.plotAF(
winternational[winternational.allele.isin(hifreq)],
cafs[cafs.allele.isin(hifreq)],
datasetID='country')
Credible intervals and population size¶
The approach above should only be used for the default model. If you want to account for population size while using the compound model to get credible intervals, the approach is a little different. This is because the compound model accounts for study variance, and the weighting above would chiefly affect that variance rather than the total allele frequency. The solution is to calculate the country-specific allele frequencies first and then weight the country estimates before combining them. In this case, slightly differently from above, we weight by the country's proportion of the total population; this different weighting is used because sample size is already accounted for when estimating the country allele frequencies.
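In symbols, for each posterior draw the weighted international frequency of allele $k$ is $\sum_c w_c \, p_{c,k}$, where $p_{c,k}$ is country $c$'s posterior draw for allele $k$ and $w_c$ is country $c$'s share of the summed national populations; because the $w_c$ sum to one, the combined frequencies still sum to one.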
Load the allele frequency data so that each country is an element in a list. This is the non-combined data we downloaded earlier. This time we're skipping Guinea because the data wasn't actually from Guinea.
countries = ['Cameroon','Cape+Verde','Ghana',
'Guinea-Bissau', 'Kenya','Sao+Tome+and+Principe','Senegal',
'South+Africa','Uganda','Zimbabwe']
aftabs = []
for country in countries:
    # Load raw country data
    aftab = pd.read_csv("../data/example/multi_country/%s_raw.csv" %country)
    # Drop any incomplete studies
    aftab = HLAfreq.only_complete(aftab)
    # Ensure all alleles have the same resolution
    aftab = HLAfreq.decrease_resolution(aftab, 2)
    aftab['country'] = country
    aftabs.append(aftab)
population                 loci
South Africa Natal Zulu    A       0.935
Name: allele_freq, dtype: float64
1 studies have total allele frequency < 0.95
In order to combine different countries they must have the same alleles. The easiest way to do this is using HLAfreq.unmeasured_alleles(), so we will join all the country datasets up, add the missing alleles, then split them apart again.
# Join them up
all_aftabs = pd.concat(aftabs)
# Add the missing alleles
all_aftabs = HLAfreq.unmeasured_alleles(all_aftabs, datasetID='population')
# Split them apart again
aftabs = []
for country in countries:
    # List all populations in a given country
    country_populations = all_aftabs[all_aftabs.country == country].population.unique()
    # Select all data from a population in the given country
    mask = all_aftabs.population.isin(country_populations)
    aftabs.append(all_aftabs[mask])
Next we fit the compound model for each country separately. Because we need access to the underlying trace, we fit the model with internal functions rather than AFhdi().
idatas = []
for i,aftab in enumerate(aftabs):
    print(countries[i])
    # fit pymc model to each dataset separately
    c_array, allele_names = HLAhdi._make_c_array(aftab)
    idata = HLAhdi._fit_Dirichlet_Multinomial(c_array)
    idatas.append(idata)
Cameroon
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 22 seconds.
Cape+Verde
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 24 seconds.
Ghana
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 22 seconds.
Guinea-Bissau
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 27 seconds.
Kenya
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 16 seconds.
Sao+Tome+and+Principe
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 23 seconds.
Senegal
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 23 seconds.
South+Africa
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 23 seconds.
Uganda
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 22 seconds.
Zimbabwe
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 22 seconds.
As we're not including Guinea this time, we have to recalculate the population size proportions; otherwise our weights will not sum to one and neither will our allele frequencies.
population_sizes = {'Cameroon':24348251,
'Cape+Verde':563198,
'Ghana':30832019,
# 'Guinea':12907395,
'Guinea-Bissau':1646077,
'Kenya':47564296,
'Sao+Tome+and+Principe':214610,
'Senegal':17223497,
'South+Africa':60604992,
'Uganda':42885900,
'Zimbabwe':15178979
}
country_data = pd.DataFrame(
{'country':population_sizes.keys(),
'population':population_sizes.values()}
)
# What proportion of the regional population does each country account for
country_data['proportion'] = country_data.population/country_data.population.sum()
Next we take the estimate for each country and multiply it by the population proportion for that country. Finally we sum these weighted estimates.
weighted_country_estimates = []
for i,country in enumerate(countries):
    # Country specific weight
    weight = country_data[country_data.country==country].proportion.values
    # Weighting the country estimate
    weighted_country_estimate = idatas[i].posterior['frac'] * weight
    weighted_country_estimates.append(weighted_country_estimate)
# Sum all the weighted country estimates
weighted_international_estimate = sum(weighted_country_estimates)
# Get model summary
summary = az.summary(weighted_international_estimate)
# Add the allele names on to the summary
# these were produced when we made c_array
summary.index = allele_names
summary
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| A*01:01 | 0.040 | 0.007 | 0.028 | 0.054 | 0.0 | 0.0 | 4128.0 | 3365.0 | 1.0 |
| A*01:02 | 0.009 | 0.003 | 0.005 | 0.014 | 0.0 | 0.0 | 4615.0 | 3709.0 | 1.0 |
| A*01:03 | 0.008 | 0.002 | 0.004 | 0.013 | 0.0 | 0.0 | 3747.0 | 3452.0 | 1.0 |
| A*01:06 | 0.005 | 0.002 | 0.002 | 0.009 | 0.0 | 0.0 | 5086.0 | 3560.0 | 1.0 |
| A*01:09 | 0.006 | 0.002 | 0.002 | 0.010 | 0.0 | 0.0 | 4828.0 | 3588.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| A*69:01 | 0.007 | 0.002 | 0.003 | 0.011 | 0.0 | 0.0 | 5330.0 | 3755.0 | 1.0 |
| A*74:01 | 0.027 | 0.005 | 0.017 | 0.037 | 0.0 | 0.0 | 4433.0 | 3294.0 | 1.0 |
| A*74:02 | 0.005 | 0.002 | 0.002 | 0.008 | 0.0 | 0.0 | 5237.0 | 3646.0 | 1.0 |
| A*74:03 | 0.009 | 0.003 | 0.004 | 0.014 | 0.0 | 0.0 | 4620.0 | 3717.0 | 1.0 |
| A*80:01 | 0.010 | 0.003 | 0.005 | 0.016 | 0.0 | 0.0 | 3889.0 | 3798.0 | 1.0 |
85 rows × 9 columns
It's important to check the compound models by comparing to the default model. Below we plot the population size weighted default model estimates against the population weighted compound model estimates. Unfortunately they are not as similar as we'd like. The compound model is overestimating rare alleles and underestimating common alleles. This suggests an overly informative prior pulling all alleles to an average frequency. For more detail on this issue see the working with priors example.
plt.scatter(winternational.allele_freq, summary['mean'])
plt.vlines(x=winternational.allele_freq, ymin=summary['hdi_3%'], ymax=summary['hdi_97%'], color="lightskyblue", zorder=0)
plt.plot([0,.1],[0,.1], c="black", linestyle="--")
plt.xlabel('Weighted default AF')
plt.ylabel('Weighted compound AF')
plt.show()
The solution is to use a different prior. Instead of $1$ for each allele, use $1/k$ where $k$ is the number of alleles. It's a simple fix but the downside is that it runs much slower.
idatas2 = []
for i,aftab in enumerate(aftabs):
    print(countries[i])
    # fit pymc model to each dataset separately
    c_array, allele_names = HLAhdi._make_c_array(aftab)
    # number of alleles
    k = len(aftab.allele.unique())
    # use 1/k prior
    idata = HLAhdi._fit_Dirichlet_Multinomial(c_array, prior=[1/k]*k)
    idatas2.append(idata)
Cameroon
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 272 seconds.
Cape+Verde
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 285 seconds.
Ghana
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 308 seconds.
Guinea-Bissau
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 319 seconds.
Kenya
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 348 seconds.
Sao+Tome+and+Principe
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 288 seconds.
Senegal
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 308 seconds.
South+Africa
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 317 seconds.
Uganda
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 297 seconds.
Zimbabwe
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [frac, conc]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 277 seconds.
Now that the models have finished running, weight and sum the traces. This is exactly the same as before.
weighted_country_estimates2 = []
for i,country in enumerate(countries):
    # Country specific weight
    weight = country_data[country_data.country==country].proportion.values
    # Weighting the country estimate
    weighted_country_estimate = idatas2[i].posterior['frac'] * weight
    weighted_country_estimates2.append(weighted_country_estimate)
# Sum all the weighted country estimates
weighted_international_estimate2 = sum(weighted_country_estimates2)
# Get model summary
summary2 = az.summary(weighted_international_estimate2)
# Add the allele names on to the summary
# these were produced when we made c_array
summary2.index = allele_names
summary2
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| A*01:01 | 0.063 | 0.009 | 0.047 | 0.079 | 0.0 | 0.0 | 1774.0 | 2231.0 | 1.00 |
| A*01:02 | 0.004 | 0.002 | 0.002 | 0.007 | 0.0 | 0.0 | 1002.0 | 1315.0 | 1.00 |
| A*01:03 | 0.003 | 0.001 | 0.001 | 0.005 | 0.0 | 0.0 | 1903.0 | 2290.0 | 1.00 |
| A*01:06 | 0.000 | 0.000 | 0.000 | 0.001 | 0.0 | 0.0 | 1894.0 | 1317.0 | 1.00 |
| A*01:09 | 0.001 | 0.001 | 0.000 | 0.002 | 0.0 | 0.0 | 1221.0 | 1575.0 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| A*69:01 | 0.002 | 0.001 | 0.000 | 0.004 | 0.0 | 0.0 | 1373.0 | 1856.0 | 1.00 |
| A*74:01 | 0.042 | 0.006 | 0.030 | 0.053 | 0.0 | 0.0 | 886.0 | 1661.0 | 1.01 |
| A*74:02 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.0 | 1240.0 | 1675.0 | 1.00 |
| A*74:03 | 0.004 | 0.001 | 0.001 | 0.006 | 0.0 | 0.0 | 2653.0 | 2560.0 | 1.00 |
| A*80:01 | 0.010 | 0.004 | 0.003 | 0.017 | 0.0 | 0.0 | 3136.0 | 2721.0 | 1.00 |
85 rows × 9 columns
Now when we compare the weighted default and weighted compound estimates they are in much better agreement. Remember, though, that the weighted default model still includes the incorrect Guinea data, so for a strictly fair comparison it should really be redone without Guinea (a sketch of this follows the plot below).
plt.scatter(winternational.allele_freq, summary2['mean'])
plt.vlines(x=winternational.allele_freq, ymin=summary2['hdi_3%'], ymax=summary2['hdi_97%'], color="lightskyblue", zorder=0)
plt.plot([0,.1],[0,.1], c="black", linestyle="--")
plt.xlabel('Weighted default AF')
plt.ylabel('Weighted 1/k compound AF')
plt.show()
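If you want the strictly fair comparison mentioned above, the weighted default model can be re-run without Guinea. The sketch below is one possible way to do this, reusing cafs and the Guinea-free country_data from the cells above (cafs_noguinea and winternational2 are just assumed names); after aligning on allele names it can be plotted against summary2 exactly as above.
# Sketch: redo the weighted default model without Guinea
cafs_noguinea = cafs[cafs.country != "Guinea"].copy()
# Drop the old weighting columns before merging the recalculated proportions
cafs_noguinea = cafs_noguinea.drop(
    columns=["population", "proportion", "individual_weight", "weighted_sample_size"])
cafs_noguinea = pd.merge(cafs_noguinea, country_data, how="left", on="country")
# Recompute each individual's weight from the Guinea-free proportions
cafs_noguinea["individual_weight"] = cafs_noguinea["proportion"] * len(country_data)
cafs_noguinea["weighted_sample_size"] = cafs_noguinea.sample_size * 2 * cafs_noguinea.individual_weight
winternational2 = HLAfreq.combineAF(cafs_noguinea, datasetID="country", weights="weighted_sample_size")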
# az.summary() gives 94% credible intervals by default
# but any interval can be obtained with az.hdi()
az.hdi(weighted_international_estimate2, 0.95).frac.values
array([[0.02759661, 0.0548428 ], [0.00452285, 0.01433677], [0.00377304, 0.0129694 ], [0.00176486, 0.00881422], [0.00231861, 0.01024124], [0.00203228, 0.00954437], [0.03415686, 0.06497219], [0.01573759, 0.03394972], [0.00235762, 0.01074451], [0.0021822 , 0.00932612], [0.01403252, 0.03101837], [0.00373039, 0.01356755], [0.00196443, 0.00987427], [0.00434116, 0.01517492], [0.00564147, 0.01691425], [0.0026366 , 0.01053915], [0.00148889, 0.00845054], [0.00176905, 0.00898124], [0.00234081, 0.01046179], [0.00186188, 0.00876754], [0.00203182, 0.00905159], [0.00193571, 0.00895337], [0.00179998, 0.00900889], [0.02321528, 0.04599152], [0.00311518, 0.01205201], [0.00896835, 0.02769771], [0.00189439, 0.00985112], [0.00206027, 0.00961964], [0.02288719, 0.04653844], [0.00210568, 0.0104285 ], [0.00180538, 0.00896166], [0.00150521, 0.00843743], [0.00204784, 0.0101851 ], [0.01709138, 0.03842565], [0.00212325, 0.01014264], [0.00165893, 0.00846217], [0.00372449, 0.01475766], [0.00152746, 0.00848436], [0.00193521, 0.00922991], [0.00157473, 0.00838175], [0.00378695, 0.01341583], [0.01160147, 0.02687958], [0.00178218, 0.00884404], [0.00191682, 0.00977162], [0.00165529, 0.00880991], [0.00330384, 0.01207244], [0.00703055, 0.01960981], [0.0193696 , 0.03994969], [0.00181336, 0.0088939 ], [0.00228535, 0.01033887], [0.00184695, 0.00903161], [0.02108054, 0.04292904], [0.02220973, 0.04393149], [0.0018361 , 0.00901249], [0.00890208, 0.02225097], [0.00170556, 0.00899271], [0.00199869, 0.00952631], [0.0020753 , 0.00991832], [0.00976027, 0.02484884], [0.00276726, 0.01087186], [0.00335449, 0.01247954], [0.01155675, 0.02631822], [0.00185561, 0.00900223], [0.0019529 , 0.00912835], [0.00216055, 0.01033339], [0.00690024, 0.01936623], [0.01479347, 0.03303133], [0.00246119, 0.01059669], [0.01388516, 0.03169402], [0.01108828, 0.02599472], [0.00311409, 0.0128932 ], [0.01510048, 0.03311533], [0.00578678, 0.01594144], [0.00361539, 0.01296951], [0.0160388 , 0.03481026], [0.02409609, 0.04690669], [0.00162889, 0.00867585], [0.00184908, 0.00935288], [0.00160451, 0.00851681], [0.001937 , 0.00959509], [0.00289136, 0.0113081 ], [0.01694051, 0.03715183], [0.00159336, 0.00858231], [0.00447347, 0.0143532 ], [0.00440853, 0.01568641]])
# This object can be fully explored using arviz like any other pymc model
az.plot_trace(weighted_international_estimate2)
array([[<AxesSubplot: title={'center': 'frac'}>, <AxesSubplot: title={'center': 'frac'}>]], dtype=object)
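As a further example of exploring the trace, the posterior for a single allele can be pulled out and summarised on its own. This sketch assumes the alleles lie on the last dimension of the weighted trace, in the same order as allele_names (the order used for the summary index above); the allele choice is arbitrary.
# Sketch: inspect the weighted posterior for one allele
idx = list(allele_names).index("A*74:01")
single_allele = weighted_international_estimate2[:, :, idx]
az.plot_posterior(single_allele)
plt.show()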