HLAfreq.HLAfreq

Download and combine HLA allele frequencies from multiple datasets.

Download allele frequency data from allelefrequencies.net. Allele frequencies from different populations can be combined to estimate HLA frequencies of countries or other regions such as global HLA frequencies.

  1"""
  2Download and combine HLA allele frequencies from multiple datasets.
  3
  4Download allele frequency data from
  5[allelefrequencies.net](https://www.allelefrequencies.net). Allele
  6frequencies from different populations can be combined to
  7estimate HLA frequencies of countries or other regions such as
  8global HLA frequencies.
  9"""
 10
 11from collections.abc import Iterable
 12from bs4 import BeautifulSoup
 13import requests
 14import pandas as pd
 15import numpy as np
 16import matplotlib.pyplot as plt
 17import math
 18import scipy as sp
 19import matplotlib.colors as mcolors
 20
 21
 22def simulate_population(alleles: Iterable[str], locus: str, population: str):
 23    pop_size = np.random.randint(len(alleles), 50)
 24    samples = np.random.choice(alleles, pop_size, replace=True)
 25    counts = pd.Series(samples).value_counts()
 26    counts.values / pop_size
 27    pop = pd.DataFrame(
 28        {
 29            "allele": counts.index,
 30            "loci": locus,
 31            "population": population,
 32            "allele_freq": counts.values / pop_size,
 33            "sample_size": pop_size,
 34        }
 35    )
 36    return pop
 37
 38
 39def simulate_study(alleles, populations, locus):
 40    study = []
 41    for i in range(populations):
 42        pop = simulate_population(alleles=alleles, locus=locus, population=f"pop_{i}")
 43        study.append(pop)
 44
 45    study = pd.concat(study)
 46    return study
 47
 48
 49def makeURL(
 50    country="",
 51    standard="s",
 52    locus="",
 53    resolution_pattern="bigger_equal_than",
 54    resolution=2,
 55    region="",
 56    ethnic="",
 57    study_type="",
 58    dataset_source="",
 59    sample_year="",
 60    sample_year_pattern="",
 61    sample_size="",
 62    sample_size_pattern="",
 63):
 64    """Create URL for search of allele frequency net database.
 65
 66    All arguments are documented [here](https://www.allelefrequencies.net/extaccess.asp)
 67
 68    Args:
 69        country (str, optional): Country name to retrieve records from. Defaults to "".
 70        standard (str, optional): Filter study quality standard to this or higher.
 71            {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
 72        locus (str, optional): The locus to return allele data for. Defaults to "".
 73        resolution_pattern (str, optional): Resolution comparitor {'equal', 'different',
 74            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
 75            Filter created using `resolution` and `resolution_pattern`.
 76            Defaults to "bigger_equal_than".
 77        resolution (int, optional): Number of fields of resolution of allele. Filter
 78            created using `resolution` and `resolution_pattern`. Defaults to 2.
 79        region (str, optional): Filter to geographic region. {Asia, Australia,
 80            Eastern Europe, ...}.
 81            All regions listed [here](https://www.allelefrequencies.net/pop6003a.asp).
 82            Defaults to "".
 83        ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}.
 84            All ethnicities listed [here](https://www.allelefrequencies.net/pop6003a.asp).
 85            Defaults to "".
 86        study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor",
 87            "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients",
 88            "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
 89        dataset_source (str, optional): Source of data. {"Literature",
 90            "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
 91        sample_year (int, optional): Sample year to compare to. Filter created using
 92            sample_year and sample_year_pattern. Defaults to "".
 93        sample_year_pattern (str, optional): Pattern to compare sample year to. Filter
 94            created using sample_year and sample_year_pattern. {'equal', 'different',
 95            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
 96            Defaults to "".
 97        sample_size (int, optional): Sample size to compare to. Filter created using
 98            sample_size and sample_size_pattern. Defaults to "".
 99        sample_size_pattern (str, optional): Pattern to compare sample size to. Filter
100            created using sample_size and sample_size_pattern. {'equal', 'different',
101            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
102            Defaults to "".
103
104    Returns:
105        str: URL to search allelefrequencies.net
106    """
107    base = "https://www.allelefrequencies.net/hla6006a.asp?"
108    locus_type = "hla_locus_type=Classical&"
109    hla_locus = "hla_locus=%s&" % (locus)
110    country = "hla_country=%s&" % (country)
111    region = "hla_region=%s&" % (region)
112    ethnic = "hla_ethnic=%s&" % (ethnic)
113    study_type = "hla_study=%s&" % (study_type)
114    dataset_source = "hla_dataset_source=%s&" % (dataset_source)
115    sample_year = "hla_sample_year=%s&" % (sample_year)
116    sample_year_pattern = "hla_sample_year_pattern=%s&" % (sample_year_pattern)
117    sample_size = "hla_sample_size=%s&" % (sample_size)
118    sample_size_pattern = "hla_sample_size_pattern=%s&" % (sample_size_pattern)
119    hla_level_pattern = "hla_level_pattern=%s&" % (resolution_pattern)
120    hla_level = "hla_level=%s&" % (resolution)
121    standard = "standard=%s&" % standard
122    url = (
123        base
124        + locus_type
125        + hla_locus
126        + country
127        + hla_level_pattern
128        + hla_level
129        + standard
130        + region
131        + ethnic
132        + study_type
133        + dataset_source
134        + sample_year
135        + sample_year_pattern
136        + sample_size
137        + sample_size_pattern
138    )
139    return url
140
141
def parseAF(bs):
    """Parse one allelefrequencies.net results page into a dataframe.

    Args:
        bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page

    Returns:
        pd.DataFrame: Table of allele, loci, population, allele_freq,
            carriers%, and sample_size.
    """
    # Column headers of the results table inside the `divGenDetail` div
    column_names = [
        "line",
        "allele",
        "flag",
        "population",
        "carriers%",
        "allele_freq",
        "AF_graphic",
        "sample_size",
        "database",
        "distribution",
        "haplotype_association",
        "notes",
    ]
    detail = bs.find("div", {"id": "divGenDetail"})
    table = detail.find("table", {"class": "tblNormal"})
    body = [
        [cell.get_text(strip=True) for cell in tr.find_all("td")]
        for tr in table.find_all("tr")
    ]
    # The first row holds `th` headers (so yields no `td` cells) — skip it
    df = pd.DataFrame(body[1:], columns=column_names)
    # The locus is the part of the allele name before '*'
    df["loci"] = df.allele.apply(lambda name: name.split("*")[0])
    # Keep only the columns downstream code uses
    return df[
        ["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]
    ]
181
182
def Npages(bs):
    """How many pages of results are there?

    Args:
        bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page

    Returns:
        int: Total number of results pages
    """
    # Pagination controls live in a table inside the `divGenNavig` div
    navtab = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
    if not navtab:
        raise AssertionError(
            "navtab does not evaluate to True. Check URL returns results in web browser."
        )
    # Exactly one cell should read "<current> of <total>"
    cells = [
        td.get_text(strip=True) for td in navtab.find_all("td") if " of " in td.text
    ]
    if len(cells) != 1:
        raise AssertionError("divGenNavig should contain 1 of not %s" % len(cells))
    # The text after "of " is the total page count
    return int(cells[0].split("of ")[1])
209
210
def formatAF(AFtab, ignoreG=True):
    """Format allele frequency table.

    Converts sample_size and allele_freq to numeric dtypes: thousands
    separators are stripped from sample sizes, and the "(*)" G-group
    marker is stripped from allele frequencies when `ignoreG` is True.
    Used internally by `combineAF()` and `getAFdata()` by default.

    Args:
        AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
            using `getAFdata()`.
        ignoreG (bool, optional): Treat G group alleles as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.

    Returns:
        pd.DataFrame: The formatted allele frequency data.
    """
    out = AFtab.copy()
    if pd.api.types.is_string_dtype(out.sample_size.dtype):
        # Sample sizes may contain thousands separators, e.g. "1,234"
        out.sample_size = pd.to_numeric(out.sample_size.str.replace(",", ""))
    if pd.api.types.is_string_dtype(out.allele_freq.dtype):
        if ignoreG:
            # Remove the literal "(*)" marker before numeric conversion
            out.allele_freq = out.allele_freq.str.replace("(*)", "", regex=False)
        out.allele_freq = pd.to_numeric(out.allele_freq)
    return out
236
237
def _fetch_soup(url, timeout):
    """GET `url` and return the parsed HTML.

    Args:
        url (str): Page to download.
        timeout (int): How long to wait to receive a response.

    Returns:
        bs4.BeautifulSoup: Parsed page.

    Raises:
        Exception: If the request times out, with advice to raise `timeout`.
    """
    try:
        return BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
    except requests.exceptions.ReadTimeout:
        # Suppress the requests traceback; the advice is the useful part
        raise Exception(
            "Requests timeout, try a larger `timeout` value for `getAFdata()`"
        ) from None


def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
    """Get all allele frequency data from a search base_url.

    Iterates over all pages regardless of which page is based.

    Args:
        base_url (str): URL for base search.
        timeout (int): How long to wait to receive a response.
        format (bool): Format the downloaded data using `formatAF()`.
        ignoreG (bool): treat allele G groups as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Default = True

    Returns:
        pd.DataFrame: allele frequency data parsed into a pandas dataframe
    """
    # Fetch the base search page to learn how many result pages exist.
    # (The download/timeout handling was previously duplicated inline;
    # it is factored into _fetch_soup.)
    bs = _fetch_soup(base_url, timeout)
    N = Npages(bs)
    print("%s pages of results" % N)
    # Download and parse each page, collecting the tables
    tabs = []
    for i in range(N):
        print(" Parsing page %s" % (i + 1), end="\r")
        url = base_url + "page=" + str(i + 1)
        tabs.append(parseAF(_fetch_soup(url, timeout)))
    print("Download complete")
    tabs = pd.concat(tabs)
    if format:
        try:
            tabs = formatAF(tabs, ignoreG)
        except AttributeError:
            # Best-effort formatting: leave raw strings in place on failure
            print("Formatting failed, non-numeric datatypes may remain.")
    return tabs
285
286
def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Report any studies with allele freqs that don't sum to 1

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study

    Returns:
        pd.Series: Allele frequency sums of the incomplete studies,
            indexed by (datasetID, loci). Empty if all studies are complete.
    """
    # Total reported allele frequency per (study, locus) pair
    poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum()
    lmask = poplocs < llimit
    # BUG FIX: was `if sum(lmask > 0):` which only worked because
    # `mask > 0` is a no-op on a boolean mask; compare the count itself.
    if lmask.sum() > 0:
        print(poplocs[lmask])
        print(f"{sum(lmask)} studies have total allele frequency < {llimit}")
    umask = poplocs > ulimit
    if umask.sum() > 0:
        print(poplocs[umask])
        print(f"{sum(umask)} studies have total allele frequency > {ulimit}")
    incomplete = pd.concat([poplocs[lmask], poplocs[umask]])
    return incomplete
309
310
def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Returns only complete studies.

    Data is dropped if the locus for that population is not complete, i.e.
    its allele frequencies don't sum to between `llimit` and `ulimit`. This
    prevents throwing away data if another loci in the population is
    incomplete.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study. Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
    """
    flagged = incomplete_studies(
        AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID
    )

    def is_complete(record):
        # Keep a row only if its (dataset, locus) PAIR was not flagged.
        # Checking the pair — not the dataset alone — avoids discarding a
        # population's other, complete loci.
        return (record[datasetID], record.loci) not in flagged.index

    return AFtab[AFtab.apply(is_complete, axis=1)]
341
342
def check_resolution(AFtab):
    """Check if all alleles in AFtab have the same resolution.

    Prints the number of records at each resolution when they differ.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Returns:
        bool: True only if all alleles have the same resolution, else False.
    """
    # Number of fields = number of ':' separators + 1
    fields = AFtab.allele.str.count(":") + 1
    field_counts = fields.value_counts()
    uniform = len(field_counts) == 1
    if not uniform:
        print(field_counts)
        print("Multiple resolutions in AFtab. Fix with decrease_resolution()")
    return uniform
360
361
def decrease_resolution(AFtab, newres, datasetID="population"):
    """Decrease allele resolution so all alleles have the same resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        newres (int): The desired number of fields for resolution.
        datasetID (str, optional): Column to use as study identifier.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with all alleles of requested resolution.

    Raises:
        AssertionError: If any allele has fewer than `newres` fields.
    """
    df = AFtab.copy()
    fields = 1 + df.allele.str.count(":")
    if not all(fields >= newres):
        raise AssertionError(f"Some alleles have resolution below {newres} fields")
    # Truncate each allele name to its first `newres` colon-separated fields
    df.allele = df.allele.str.split(":").apply(lambda parts: ":".join(parts[:newres]))
    # Truncation can create duplicate allele names within a study;
    # merge them (summing frequencies)
    return collapse_reduced_alleles(df, datasetID=datasetID)
382
383
def collapse_reduced_alleles(AFtab, datasetID="population"):
    """Merge duplicate allele records within each study.

    After reducing allele resolution several records in a study may share
    the same (reduced) allele name; their frequencies are summed into a
    single row.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        datasetID (str, optional): Unique identifier column for study.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: One row per allele per study.

    Raises:
        AssertionError: If a single allele within a single study maps to
            multiple loci or multiple sample sizes.
    """
    df = AFtab.copy()
    # Group by alleles within datasets
    grouped = df.groupby([datasetID, "allele"])
    # Sum allele freq but keep the (unique) sample size and locus; also
    # count distinct loci/sample sizes per group for validation below
    collapsed = grouped.apply(
        lambda row: [
            sum(row.allele_freq),
            row.sample_size.unique()[0],
            row.loci.unique()[0],
            len(row.loci.unique()),
            len(row.sample_size.unique()),
        ],
        include_groups=False
    )
    collapsed = pd.DataFrame(
        collapsed.tolist(),
        index=collapsed.index,
        columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"],
    ).reset_index()
    # Within a study all identical alleles should have the same loci and sample size
    if not all(collapsed["#loci"] == 1):
        raise AssertionError(
            "Multiple loci found for a single allele in a single population"
        )
    if not all(collapsed["#sample_sizes"] == 1):
        raise AssertionError(
            "Multiple sample_sizes found for a single allele in a single population"
        )
    # BUG FIX: select the datasetID column (was hard-coded "population",
    # which broke any non-default datasetID)
    collapsed = collapsed[
        ["allele", "loci", datasetID, "allele_freq", "sample_size"]
    ]
    # BUG FIX: forward datasetID (uniqueness was previously always checked
    # against the "population" column)
    alleles_unique_in_study(collapsed, datasetID=datasetID)
    return collapsed
418
419
def unmeasured_alleles(AFtab, datasetID="population"):
    """Pad each dataset with zero-frequency rows for unreported alleles.

    When combining AF estimates, unreported alleles can inflate frequencies
    so AF sums to >1. Therefore we add unreported alleles with frequency zero.

    Args:
        AFtab (pd.DataFrame): Formatted allele frequency data
        datasetID (str): Unique identifier column for study

    Returns:
        pd.DataFrame: Allele frequency data with all locus alleles reported
            for each dataset

    Raises:
        AssertionError: If a dataset reports more than one sample size
            for a single locus.
    """
    padded = AFtab.copy()
    # Handle each locus independently
    for locus in padded.loci.unique():
        # Every dataset that reports this locus
        for dataset in padded[padded.loci == locus][datasetID].unique():
            # Rows for this single locus in this single dataset
            subset = padded[(padded[datasetID] == dataset) & (padded.loci == locus)]
            # A dataset must report exactly one sample size per locus
            sizes = subset.sample_size.unique()
            if len(sizes) != 1:
                raise AssertionError(
                    "dataset_sample_size must be 1, not %s" % len(sizes)
                )
            size = sizes[0]
            # Union of alleles at this locus across ALL datasets
            locus_alleles = padded[padded.loci == locus].allele.unique()
            # Zero-frequency rows for alleles this dataset did not report
            filler = pd.DataFrame(
                [
                    (allele, locus, dataset, 0, 0, size)
                    for allele in locus_alleles
                    if allele not in subset.allele.values
                ],
                columns=[
                    "allele",
                    "loci",
                    datasetID,
                    "allele_freq",
                    "carriers%",
                    "sample_size",
                ],
            )
            if not filler.empty:
                padded = pd.concat([padded, filler], ignore_index=True)
    return padded
473
474
def combineAF(
    AFtab,
    weights="2n",
    alpha=None,
    datasetID="population",
    format=True,
    ignoreG=True,
    add_unmeasured=True,
    complete=True,
    resolution=True,
    unique=True,
):
    """Combine allele frequencies from multiple studies.

    `datasetID` is the unique identifier for studies to combine.
    Allele frequencies combined using a Dirichlet distribution where each study's
    contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
    Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is
    set it is not doubled. The total concentration parameter of the Dirichlet distribution
    is the contributions from all studies plus the prior `alpha`. If `alpha` is not set
    the prior defaults to 1 observation of each allele.

    Args:
        AFtab (pd.DataFrame): Table of Allele frequency data
        weights (str, optional): Column to be weighted by allele frequency to generate
            concentration parameter of Dirichlet distribution. Defaults to '2n'.
        alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to
            None, which gives 1 observation of each allele (see `default_prior()`).
        datasetID (str, optional): Unique identifier column for study. Defaults to
            'population'.
        format (bool, optional): Run `formatAF()`. Defaults to True.
        ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`.
            Defaults to True.
        add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
            important to ensure combined allele frequencies sum to 1. See
            `add_unmeasured()`. Defaults to True.
        complete (bool, optional): Check study completeness. Uses default values for
            `incomplete_studies()`. If you are happy with your study completeness can
            be switched off with False. Defaults to True.
        resolution (bool, optional): Check that all alleles have the same resolution,
            see `check_resolution()`. Defaults to True.
        unique (bool, optional): Check that each allele appears no more than once per
            study. See `alleles_unique_in_study()`. Defaults to True.

    Returns:
        pd.DataFrame: Allele frequencies after combining estimates from all studies.
            *allele_freq* is the combined frequency estimate from the Dirichlet mean
            where the concentration is `alpha` + `c`.
            *alpha* is the prior used for the Dirichlet distribution.
            *c* is the observations used for the Dirichlet distribution.
            *sample_size* is the total sample size of all combined studies.
            *wav* is the weighted average.
    """
    # BUG FIX: `alpha` default changed from the mutable `[]` to None
    # (classic mutable-default pitfall); `if not alpha:` below treats
    # None and [] identically, so behavior is unchanged for callers.
    df = AFtab.copy()
    single_loci(df)
    if unique:
        if not alleles_unique_in_study(df, datasetID=datasetID):
            raise AssertionError("The same allele appears multiple times in a dataset")
    if complete:
        if not incomplete_studies(df, datasetID=datasetID).empty:
            raise AssertionError(
                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
            )
    if resolution:
        if not check_resolution(df):
            # BUG FIX: corrected "conains" typo in the error message
            raise AssertionError(
                "AFtab contains alleles at multiple resolutions, check check_resolution(AFtab)"
            )
    if format:
        df = formatAF(df, ignoreG)
    if add_unmeasured:
        df = unmeasured_alleles(df, datasetID)
    try:
        # Sample size doubled due to diploidy
        df["2n"] = df.sample_size * 2
    except AttributeError:
        print("column '2n' could not be created")
    # Each study's contribution to the Dirichlet concentration
    df["c"] = df.allele_freq * df[weights]
    grouped = df.groupby("allele", sort=True)
    combined = grouped.apply(
        lambda row: [
            row.name,
            row.loci.unique()[0],
            np.average(row.allele_freq, weights=row[weights]),
            row.c.sum(),
            row.sample_size.sum(),
        ],
        include_groups=False
    )
    combined = pd.DataFrame(
        combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"]
    )
    combined = combined.reset_index(drop=True)
    # Check that all alleles in a locus have the same sample size
    # after merging
    if duplicated_sample_size(combined):
        id_duplicated_allele(grouped)
    if not alpha:
        alpha = default_prior(len(combined.allele))
    combined["alpha"] = alpha
    # Calculate Dirichlet mean for each allele
    combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean()

    return combined
577
578
def default_prior(k):
    """Build the default prior: one observation of each class.

    Args:
        k (int): Number of classes in the Dirichlet distribution.

    Returns:
        list: `k` ones, to use as a uniform prior.
    """
    return [1 for _ in range(k)]
590
591
def single_loci(AFtab):
    """Check that allele frequency data covers exactly one locus.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Raises:
        AssertionError: If the 'loci' column holds anything other than
            a single unique value.
    """
    if len(AFtab.loci.unique()) != 1:
        raise AssertionError("'AFtab' must contain only 1 loci")
600
601
def alleles_unique_in_study(AFtab, datasetID="population"):
    """Are all alleles unique in each study?

    Checks that no alleles are reported more than once in a single study.
    Study is defined by `datasetID`. Repeated alleles are printed.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
        datasetID (str, optional): Unique identifier column to define study.
            Defaults to 'population'.

    Returns:
        bool: `True` on if no alleles occur more than once in any study, otherwise `False`.
    """
    # Count occurrences of each allele within each study
    counts = AFtab.copy().groupby([datasetID, "allele"]).size()
    repeats = counts[counts > 1]
    if not repeats.empty:
        print(f"Non unique alleles in study, is datasetID correct? {datasetID}")
        print(repeats)
    return repeats.empty
624
625
def duplicated_sample_size(AFtab):
    """Returns True if any loci has more than 1 unique sample size"""
    # Count distinct sample sizes per locus
    counts = AFtab.groupby("loci").sample_size.apply(lambda s: s.unique().size)
    return bool((counts != 1).any())
632
633
def id_duplicated_allele(grouped):
    """Raise for any allele whose group contains a repeated population.

    Args:
        grouped: Allele frequency data grouped by allele (pandas GroupBy).

    Raises:
        AssertionError: Listing the alleles with a duplicated population.
    """
    dup = grouped.population.apply(lambda pops: any(pops.duplicated()))
    if dup.any():
        raise AssertionError(
            f"duplicated population within allele {dup[dup].index.tolist()}"
        )
641
642
def population_coverage(p):
    """Proportion of people with at least 1 copy of this allele assuming HWE.

    Args:
        p (float): Allele frequency

    Returns:
        float: Sum of homozygotes and heterozygotes for this allele
    """
    # Hardy-Weinberg: p^2 homozygotes plus 2pq heterozygotes
    return p ** 2 + 2 * p * (1 - p)
656
657
def betaAB(alpha):
    """Calculate `a` `b` values for all composite beta distributions.

    Given the `alpha` vector defining a Dirichlet distribution calculate the `a` `b` values
    for all composite beta distributions.

    Args:
        alpha (list): Values defining a Dirichlet distribution. This will be the prior
            (for a naive distribution) or the prior + caf.c for a posterior distribution.

    Returns:
        list: List of `a` `b` values defining beta values, i.e. for each allele it is
            the number of times it was and wasn't observed.
    """
    # PERF FIX: hoist the total out of the comprehension — it was
    # re-summed for every element (O(n^2) for n alleles).
    total = sum(alpha)
    return [(a, total - a) for a in alpha]
674
675
676# def betaCI(a,b,credible_interval=0.95):
677#     """Calculat the central credible interval of a beta distribution
678
679#     Args:
680#         a (float): Beta shape parameter `a`, i.e. the number of times the allele was observed.
681#         b (float): Beta shape parameter `b`, i.e. the number of times the allele was not observed.
682#         credible_interval (float, optional): The size of the credible interval requested. Defaults to 0.95.
683
684#     Returns:
685#         tuple: Lower and upper credible interval of beta distribution.
686#     """
687#     bd = sp.stats.beta(a,b)
688#     lower_quantile = (1-credible_interval)/2
689#     upper_quantile = 1-lower_quantile
690#     lower_interval = bd.ppf(lower_quantile)
691#     upper_interval = bd.ppf(upper_quantile)
692#     return lower_interval, upper_interval
693
694# def AFci(caf, credible_interval=0.95):
695#     """Calculate credible interval for combined allele frequency table.
696#     Note that this ignores sampling error so confidence interval is too tight.
697#     Use HLAhdi.AFhdi() instead.
698
699#     Args:
700#         caf (pd.DataFrame): Table produced by combineAF()
701#         credible_interval (float, optional): The desired confidence interval. Defaults to 0.95.
702
703#     Returns:
704#         list: Lower and upper credible intervals as a list of tuples
705#     """
706#     ab = betaAB(
707#         caf.alpha + caf.c,
708#     )
709#     ci = [betaCI(a, b, credible_interval) for a,b in ab]
710#     return ci
711
712
def plot_prior(concentration, ncol=2, psteps=1000, labels=""):
    """Plot probability density function for prior values.

    Args:
        concentration (list): Vector of the prior Dirichlet concentration values.
        ncol (int, optional): Number of columns. Defaults to 2.
        psteps (int, optional): Number of points at which each marginal beta
            PDF is evaluated on [0, 1]. Defaults to 1000.
        labels (list, optional): Labels for elements of concentration in the same
            order. Defaults to "".

    Raises:
        AssertionError: If `labels` is given but differs in length from
            `concentration`.
    """
    ab = betaAB(concentration)
    pline = np.linspace(0, 1, psteps)
    nrow = math.ceil(len(concentration) / ncol)
    # BUG FIX: squeeze=False keeps `ax` 2D even for a single row or
    # column; previously `ax[-1, :]` raised IndexError whenever
    # plt.subplots squeezed the axes array to 1D (e.g. nrow == 1).
    fig, ax = plt.subplots(nrow, ncol, sharex=True, squeeze=False)
    fig.suptitle("Probability density")
    # If labels is a list nothing happens,
    # but if it's a series it converts to a list
    labels = list(labels)
    if not labels:
        labels = [""] * len(concentration)
    if not len(concentration) == len(labels):
        raise AssertionError("concentration must be same length as labels")
    for i, alpha in enumerate(concentration):
        # Marginal beta distribution for this element of the prior
        a, b = ab[i]
        bd = sp.stats.beta(a, b)
        pdf = [bd.pdf(p) for p in pline]
        ax.flat[i].plot(pline, pdf)
        ax.flat[i].set_title(labels[i])
    # Axis labels only on the outer edge of the grid
    for axi in ax[-1, :]:
        axi.set(xlabel="Allele freq")
    for axi in ax[:, 0]:
        axi.set(ylabel="PDF")
    plt.show()
745
746
def plotAF(
    caf=pd.DataFrame(),
    AFtab=pd.DataFrame(),
    cols=list(mcolors.TABLEAU_COLORS.keys()),
    datasetID="population",
    hdi=pd.DataFrame(),
    compound_mean=pd.DataFrame(),
):
    """Plot allele frequency results from `HLAfreq`.

    Plot combined allele frequencies, individual allele frequencies,
    and credible intervals on combined allele frequency estimates.
    Credible interval is only plotted if a value is given for `hdi`.
    The plotted credible interval is whatever was passed to HLAfreq_pymc.AFhdi()
    when calculating hdi.

    Args:
        caf (pd.DataFrame, optional): Combined allele frequency estimates from
            HLAfreq.combineAF. Defaults to pd.DataFrame().
        AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults
            to pd.DataFrame().
        cols (list, optional): List of colours to use for each individual dataset.
            Defaults to list(mcolors.TABLEAU_COLORS.keys()).
        datasetID (str, optional): Column used to define separate datasets. Defaults
            to "population".
        hdi (pd.DataFrame, optional): The high density interval object to plot credible
            intervals. Expected to have "allele", "lo" and "hi" columns; presumably
            produced by HLAfreq_pymc.AFhdi() — confirm against that module.
            Defaults to pd.DataFrame().
        compound_mean (pd.DataFrame, optional): The high density interval object to plot
            post_mean. Expected to have "allele" and "post_mean" columns; presumably
            produced by HLAfreq_pymc.AFhdi(). Defaults to pd.DataFrame().
    """
    # NOTE(review): `cols` has a mutable default, but it is only rebound
    # (never mutated in place) below, so the shared-default pitfall does
    # not bite here.
    # Plot allele frequency for each dataset
    if not AFtab.empty:
        # Cols must be longer than the list of alleles
        # If not, repeat the list of cols
        repeat_cols = np.ceil(len(AFtab[datasetID]) / len(cols))
        repeat_cols = int(repeat_cols)
        cols = cols * repeat_cols
        # Make a dictionary mapping datasetID to colours
        cmap = dict(zip(AFtab[datasetID].unique(), cols))
        plt.scatter(
            x=AFtab.allele_freq,
            y=AFtab.allele,
            c=[cmap[x] for x in AFtab[datasetID]],
            alpha=0.7,
            zorder=2,
        )
    # Plot combined allele frequency as hollow black circles
    if not caf.empty:
        plt.scatter(
            x=caf.allele_freq,
            y=caf.allele,
            edgecolors="black",
            facecolors="none",
            zorder=3,
        )
    # Plot high density interval as horizontal lines per allele
    if not hdi.empty:
        # assert not AFtab.empty, "AFtab is needed to calculate credible interval"
        # from HLAfreq import HLAfreq_pymc as HLAhdi
        # print("Fitting model with PyMC, may take a few seconds")
        # hdi = HLAhdi.AFhdi(
        #     AFtab=AFtab,
        #     weights=weights,
        #     datasetID=datasetID,
        #     credible_interval=credible_interval,
        #     conc_mu=conc_mu,
        #     conc_sigma=conc_sigma
        # )
        for interval in hdi.iterrows():
            # .iterrows returns a index and data as a tuple for each row
            plt.hlines(
                y=interval[1]["allele"],
                xmin=interval[1]["lo"],
                xmax=interval[1]["hi"],
                color="black",
            )
    # Plot posterior means as black tick marks
    if not compound_mean.empty:
        for row in compound_mean.iterrows():
            plt.scatter(
                y=row[1]["allele"], x=row[1]["post_mean"], color="black", marker="|"
            )
    plt.xlabel("Allele frequency")
    plt.grid(zorder=0)
    plt.show()
def simulate_population(alleles: Iterable[str], locus: str, population: str):
    """Simulate allele frequency data for a single population.

    Draws a random sample size and samples alleles uniformly with
    replacement, then reports the observed allele frequencies.

    Args:
        alleles (Iterable[str]): Alleles to sample from.
        locus (str): Locus name recorded in the 'loci' column.
        population (str): Population name recorded in the 'population' column.

    Returns:
        pd.DataFrame: Columns allele, loci, population, allele_freq, sample_size.
    """
    # NOTE(review): np.random.randint(low, high) requires len(alleles) < 50,
    # otherwise it raises ValueError — confirm callers respect this.
    pop_size = np.random.randint(len(alleles), 50)
    samples = np.random.choice(alleles, pop_size, replace=True)
    counts = pd.Series(samples).value_counts()
    # (Removed a dead no-op expression `counts.values / pop_size` whose
    # result was discarded.)
    pop = pd.DataFrame(
        {
            "allele": counts.index,
            "loci": locus,
            "population": population,
            "allele_freq": counts.values / pop_size,
            "sample_size": pop_size,
        }
    )
    return pop
def simulate_study(alleles, populations, locus):
    """Simulate allele frequency data for several populations at one locus.

    Args:
        alleles: Alleles to sample from.
        populations (int): Number of populations to simulate.
        locus (str): Locus name for all simulated populations.

    Returns:
        pd.DataFrame: Concatenated simulated data, one block per population.
    """
    frames = [
        simulate_population(alleles=alleles, locus=locus, population=f"pop_{i}")
        for i in range(populations)
    ]
    return pd.concat(frames)
def makeURL(
    country="",
    standard="s",
    locus="",
    resolution_pattern="bigger_equal_than",
    resolution=2,
    region="",
    ethnic="",
    study_type="",
    dataset_source="",
    sample_year="",
    sample_year_pattern="",
    sample_size="",
    sample_size_pattern="",
):
    """Create URL for search of allele frequency net database.

    All arguments are documented [here](https://www.allelefrequencies.net/extaccess.asp)

    Args:
        country (str, optional): Country name to retrieve records from. Defaults to "".
        standard (str, optional): Filter study quality standard to this or higher.
            {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
        locus (str, optional): The locus to return allele data for. Defaults to "".
        resolution_pattern (str, optional): Resolution comparator {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Used together with `resolution`. Defaults to "bigger_equal_than".
        resolution (int, optional): Number of fields of allele resolution. Used
            together with `resolution_pattern`. Defaults to 2.
        region (str, optional): Filter to geographic region, e.g. Asia, Australia.
            All regions listed [here](https://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        ethnic (str, optional): Filter to ethnicity, e.g. "Amerindian", "Black".
            All ethnicities listed [here](https://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        study_type (str, optional): Type of study, e.g. "Anthropology", "Blood+Donor".
            Defaults to "".
        dataset_source (str, optional): Source of data. {"Literature",
            "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
        sample_year (int, optional): Sample year to compare to, used together with
            `sample_year_pattern`. Defaults to "".
        sample_year_pattern (str, optional): Comparison pattern for `sample_year`.
            {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than',
            'bigger_equal_than'}. Defaults to "".
        sample_size (int, optional): Sample size to compare to, used together with
            `sample_size_pattern`. Defaults to "".
        sample_size_pattern (str, optional): Comparison pattern for `sample_size`.
            {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than',
            'bigger_equal_than'}. Defaults to "".

    Returns:
        str: URL to search allelefrequencies.net
    """
    base = "https://www.allelefrequencies.net/hla6006a.asp?"
    # Query parameters in the exact order expected by allelefrequencies.net
    params = [
        ("hla_locus_type", "Classical"),
        ("hla_locus", locus),
        ("hla_country", country),
        ("hla_level_pattern", resolution_pattern),
        ("hla_level", resolution),
        ("standard", standard),
        ("hla_region", region),
        ("hla_ethnic", ethnic),
        ("hla_study", study_type),
        ("hla_dataset_source", dataset_source),
        ("hla_sample_year", sample_year),
        ("hla_sample_year_pattern", sample_year_pattern),
        ("hla_sample_size", sample_size),
        ("hla_sample_size_pattern", sample_size_pattern),
    ]
    # Every parameter is followed by '&' so 'page=N' can be appended later
    return base + "".join("%s=%s&" % (key, value) for key, value in params)

Create URL for search of allele frequency net database.

All arguments are documented here

Arguments:
  • country (str, optional): Country name to retrieve records from. Defaults to "".
  • standard (str, optional): Filter study quality standard to this or higher. {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
  • locus (str, optional): The locus to return allele data for. Defaults to "".
  • resolution_pattern (str, optional): Resolution comparator {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Filter created using resolution and resolution_pattern. Defaults to "bigger_equal_than".
  • resolution (int, optional): Number of fields of resolution of allele. Filter created using resolution and resolution_pattern. Defaults to 2.
  • region (str, optional): Filter to geographic region. {Asia, Australia, Eastern Europe, ...}. All regions listed here. Defaults to "".
  • ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}. All ethnicities listed here. Defaults to "".
  • study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor", "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients", "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
  • dataset_source (str, optional): Source of data. {"Literature", "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
  • sample_year (int, optional): Sample year to compare to. Filter created using sample_year and sample_year_pattern. Defaults to "".
  • sample_year_pattern (str, optional): Pattern to compare sample year to. Filter created using sample_year and sample_year_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
  • sample_size (int, optional): Sample size to compare to. Filter created using sample_size and sample_size_pattern. Defaults to "".
  • sample_size_pattern (str, optional): Pattern to compare sample size to. Filter created using sample_size and sample_size_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
Returns:

str: URL to search allelefrequencies.net

def parseAF(bs):
    """Generate a dataframe from a given html page

    Args:
        bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page

    Returns:
        pd.DataFrame: Table of allele, allele frequency, samplesize, and population
    """
    # The results table of class `tblNormal` lives inside the div `divGenDetail`
    detail = bs.find("div", {"id": "divGenDetail"}).find("table", {"class": "tblNormal"})
    headers = [
        "line",
        "allele",
        "flag",
        "population",
        "carriers%",
        "allele_freq",
        "AF_graphic",
        "sample_size",
        "database",
        "distribution",
        "haplotype_association",
        "notes",
    ]
    cells = [
        [td.get_text(strip=True) for td in tr.find_all("td")]
        for tr in detail.find_all("tr")
    ]
    # The first row holds `th` headers, so skip it
    df = pd.DataFrame(cells[1:], columns=headers)
    # The locus is the part of the allele name before '*'
    df["loci"] = df.allele.apply(lambda a: a.split("*")[0])
    # Keep only the columns of interest
    return df[["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]]

Generate a dataframe from a given html page

Arguments:
  • bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page
Returns:

pd.DataFrame: Table of allele, allele frequency, samplesize, and population

def Npages(bs):
    """How many pages of results are there?

    Args:
        bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page

    Returns:
        int: Total number of results pages
    """
    # The navigation table carries the "page X of N" text
    navtab = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
    if not navtab:
        raise AssertionError(
            "navtab does not evaluate to True. Check URL returns results in web browser."
        )
    # Collect the cell(s) containing ' of '
    of_cells = []
    for td in navtab.find_all("td"):
        if " of " in td.text:
            of_cells.append(td.get_text(strip=True))
    # Exactly one such cell is expected
    if len(of_cells) != 1:
        raise AssertionError("divGenNavig should contain 1 of not %s" % len(of_cells))
    # The total page count follows "of "
    return int(of_cells[0].split("of ")[1])

How many pages of results are there?

Arguments:
  • bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page
Returns:

int: Total number of results pages

def formatAF(AFtab, ignoreG=True):
    """Format allele frequency table.

    Convert sample_size and allele_freq to numeric data type.
    Removes commas from sample size. Removes "(*)" from allele frequency if
    `ignoreG` is `True`. `formatAF()` is used internally by combineAF and getAFdata
    by default.

    Args:
        AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
            using `getAFdata()`.
        ignoreG (bool, optional): Treat G group alleles as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.

    Returns:
        pd.DataFrame: The formatted allele frequency data.
    """
    formatted = AFtab.copy()
    if pd.api.types.is_string_dtype(formatted.sample_size.dtype):
        # Strip thousands separators before the numeric conversion
        stripped = formatted.sample_size.str.replace(",", "")
        formatted.sample_size = pd.to_numeric(stripped)
    if pd.api.types.is_string_dtype(formatted.allele_freq.dtype):
        freqs = formatted.allele_freq
        if ignoreG:
            # Drop the literal G-group marker "(*)"
            freqs = freqs.str.replace("(*)", "", regex=False)
        formatted.allele_freq = pd.to_numeric(freqs)
    return formatted

Format allele frequency table.

Convert sample_size and allele_freq to numeric data type. Removes commas from sample size. Removes "(*)" from allele frequency if ignoreG is True. formatAF() is used internally by combineAF and getAFdata by default.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net using getAFdata().
  • ignoreG (bool, optional): Treat G group alleles as normal. See https://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.
Returns:

pd.DataFrame: The formatted allele frequency data.

def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
    """Get all allele frequency data from a search base_url.

    Iterates over all pages regardless of which page is based.

    Args:
        base_url (str): URL for base search.
        timeout (int): How long to wait to receive a response.
        format (bool): Format the downloaded data using `formatAF()`.
        ignoreG (bool): treat allele G groups as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Default = True

    Returns:
        pd.DataFrame: allele frequency data parsed into a pandas dataframe
    """

    def fetch(url):
        # Download one page and parse it; a timeout becomes a friendlier error
        try:
            return BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
        except requests.exceptions.ReadTimeout:
            raise Exception(
                "Requests timeout, try a larger `timeout` value for `getAFdata()`"
            ) from None

    # How many pages of results does the base search return?
    total = Npages(fetch(base_url))
    print("%s pages of results" % total)
    # Parse every page and collect the tables
    pages = []
    for page in range(1, total + 1):
        print(" Parsing page %s" % page, end="\r")
        pages.append(parseAF(fetch(base_url + "page=" + str(page))))
    print("Download complete")
    result = pd.concat(pages)
    if format:
        try:
            result = formatAF(result, ignoreG)
        except AttributeError:
            print("Formatting failed, non-numeric datatypes may remain.")
    return result

Get all allele frequency data from a search base_url.

Iterates over all pages regardless of which page is based.

Arguments:
  • base_url (str): URL for base search.
  • timeout (int): How long to wait to receive a response.
  • format (bool): Format the downloaded data using formatAF().
  • ignoreG (bool): treat allele G groups as normal. See https://hla.alleles.org/alleles/g_groups.html for details. Default = True
Returns:

pd.DataFrame: allele frequency data parsed into a pandas dataframe

def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Report any studies with allele freqs that don't sum to 1

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study

    Returns:
        pd.Series: Allele frequency sums indexed by (datasetID, loci) for
            studies whose total falls below `llimit` or above `ulimit`.
    """
    # Total reported allele frequency per study per locus
    poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum()
    lmask = poplocs < llimit
    # BUG FIX: was `if sum(lmask > 0):` — the comparison belongs outside sum()
    if sum(lmask) > 0:
        print(poplocs[lmask])
        print(f"{sum(lmask)} studies have total allele frequency < {llimit}")
    umask = poplocs > ulimit
    # BUG FIX: was `if sum(umask > 0):`
    if sum(umask) > 0:
        print(poplocs[umask])
        print(f"{sum(umask)} studies have total allele frequency > {ulimit}")
    incomplete = pd.concat([poplocs[lmask], poplocs[umask]])
    return incomplete

Report any studies with allele freqs that don't sum to 1

Arguments:
  • AFtab (pd.DataFrame): Dataframe containing multiple studies
  • llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
  • ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
  • datasetID (str): Unique identifier column for study
def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Returns only complete studies.

    Data is dropped if the locus for that population is not complete, i.e. doesn't
    sum to between `llimit` and `ulimit`. This prevents throwing away data if
    another loci in the population is incomplete.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study. Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
    """
    bad = incomplete_studies(
        AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID
    )
    # Keep a row only when its (dataset, locus) PAIR is not flagged incomplete.
    # Testing the pair is important so that a population is not discarded
    # entirely just because one of its loci is incomplete.
    keep = AFtab.apply(lambda row: (row[datasetID], row.loci) not in bad.index, axis=1)
    return AFtab[keep]

Returns only complete studies.

Data is dropped if the locus for that population is not complete, i.e. doesn't sum to between llimit and ulimit. This prevents throwing away data if another loci in the population is incomplete.

Arguments:
  • AFtab (pd.DataFrame): Dataframe containing multiple studies
  • llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
  • ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
  • datasetID (str): Unique identifier column for study. Defaults to 'population'.
Returns:

pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.

def check_resolution(AFtab):
    """Check if all alleles in AFtab have the same resolution.
    Will print the number of records with each resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Returns:
        bool: True only if all alleles have the same resolution, else False.
    """
    # Resolution in fields = number of ':' separators + 1
    fields = AFtab.allele.str.count(":") + 1
    by_resolution = fields.value_counts()
    if len(by_resolution) != 1:
        print(by_resolution)
        print("Multiple resolutions in AFtab. Fix with decrease_resolution()")
        return False
    return True

Check if all alleles in AFtab have the same resolution. Will print the number of records with each resolution.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data
Returns:

bool: True only if all alleles have the same resolution, else False.

def decrease_resolution(AFtab, newres, datasetID="population"):
    """Decrease allele resolution so all alleles have the same resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        newres (int): The desired number of fields for resolution.
        datasetID (str, optional): Column to use as study identifier.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with all alleles of requested resolution.
    """
    df = AFtab.copy()
    # Current resolution in fields = ':' separators + 1
    fields = 1 + df.allele.str.count(":")
    if (fields < newres).any():
        raise AssertionError(f"Some alleles have resolution below {newres} fields")
    # Truncate each allele name to the first `newres` fields
    df.allele = df.allele.str.split(":").apply(lambda parts: ":".join(parts[:newres]))
    # Truncation can make alleles identical within a study; merge them
    return collapse_reduced_alleles(df, datasetID=datasetID)

Decrease allele resolution so all alleles have the same resolution.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data.
  • newres (int): The desired number of fields for resolution.
  • datasetID (str, optional): Column to use as stud identifier. Defaults to 'population'.
Returns:

pd.DataFrame: Allele frequency data with all alleles of requested resolution.

def collapse_reduced_alleles(AFtab, datasetID="population"):
    """Merge records made identical by reduced allele resolution.

    After `decrease_resolution()` several records in one study can share the
    same allele name; their frequencies are summed, while locus and sample
    size are checked to be consistent within each merged group.

    Args:
        AFtab (pd.DataFrame): Allele frequency data (typically reduced resolution).
        datasetID (str, optional): Unique identifier column for study.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: One row per allele per study with summed allele_freq.
    """
    df = AFtab.copy()
    # Group by alleles within datasets
    grouped = df.groupby([datasetID, "allele"])
    # Sum allele freq but keep other columns
    collapsed = grouped.apply(
        lambda row: [
            sum(row.allele_freq),
            row.sample_size.unique()[0],
            row.loci.unique()[0],
            len(row.loci.unique()),
            len(row.sample_size.unique()),
        ],
        include_groups=False
    )
    collapsed = pd.DataFrame(
        collapsed.tolist(),
        index=collapsed.index,
        columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"],
    ).reset_index()
    # Within a study each all identical alleles should have the same loci and sample size
    if not all(collapsed["#loci"] == 1):
        raise AssertionError(
            "Multiple loci found for a single allele in a single population"
        )
    if not all(collapsed["#sample_sizes"] == 1):
        raise AssertionError(
            "Multiple sample_sizes found for a single allele in a single population"
        )
    # BUG FIX: select the datasetID column rather than the hard-coded
    # "population" so non-default datasetID values work.
    collapsed = collapsed[
        ["allele", "loci", datasetID, "allele_freq", "sample_size"]
    ]
    # BUG FIX: pass datasetID through instead of relying on its default
    alleles_unique_in_study(collapsed, datasetID=datasetID)
    return collapsed
def unmeasured_alleles(AFtab, datasetID="population"):
    """When combining AF estimates, unreported alleles can inflate frequencies
        so AF sums to >1. Therefore we add unreported alleles with frequency zero.

    Args:
        AFtab (pd.DataFrame): Formatted allele frequency data
        datasetID (str): Unique identifier column for study

    Returns:
        pd.DataFrame: Allele frequency data with all locus alleles reported
            for each dataset
    """
    df = AFtab.copy()
    padding_frames = []
    # Handle each locus independently
    for locus in df.loci.unique():
        at_locus = df[df.loci == locus]
        # Every allele reported at this locus by any dataset
        known_alleles = at_locus.allele.unique()
        # Visit each dataset reporting this locus
        for dataset in at_locus[datasetID].unique():
            subset = at_locus[at_locus[datasetID] == dataset]
            # All rows of one dataset at one locus must share a sample size
            sizes = subset.sample_size.unique()
            if len(sizes) != 1:
                raise AssertionError(
                    "dataset_sample_size must be 1, not %s" % len(sizes)
                )
            reported = set(subset.allele.values)
            # Zero-frequency rows for alleles this dataset did not report
            padding = [
                (allele, locus, dataset, 0, 0, sizes[0])
                for allele in known_alleles
                if allele not in reported
            ]
            if padding:
                padding_frames.append(
                    pd.DataFrame(
                        padding,
                        columns=[
                            "allele",
                            "loci",
                            datasetID,
                            "allele_freq",
                            "carriers%",
                            "sample_size",
                        ],
                    )
                )
    # Append all zero-frequency rows in one pass
    if padding_frames:
        df = pd.concat([df] + padding_frames, ignore_index=True)
    return df

When combining AF estimates, unreported alleles can inflate frequencies so AF sums to >1. Therefore we add unreported alleles with frequency zero.

Arguments:
  • AFtab (pd.DataFrame): Formatted allele frequency data
  • datasetID (str): Unique identifier column for study
Returns:

pd.DataFrame: Allele frequency data with all locus alleles reported for each dataset

def combineAF(
    AFtab,
    weights="2n",
    alpha=None,
    datasetID="population",
    format=True,
    ignoreG=True,
    add_unmeasured=True,
    complete=True,
    resolution=True,
    unique=True,
):
    """Combine allele frequencies from multiple studies.

    `datasetID` is the unique identifier for studies to combine.
    Allele frequencies combined using a Dirichlet distribution where each study's
    contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
    Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is
    set it is not doubled. The total concentration parameter of the Dirichlet distribution
    is the contributions from all studies plus the prior `alpha`. If `alpha` is not set
    the prior defaults to 1 observation of each allele.

    Args:
        AFtab (pd.DataFrame): Table of Allele frequency data
        weights (str, optional): Column to be weighted by allele frequency to generate
            concentration parameter of Dirichlet distribution. Defaults to '2n'.
        alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to
            None, which is interpreted as 1 observation of each allele.
        datasetID (str, optional): Unique identifier column for study. Defaults to
            'population'.
        format (bool, optional): Run `formatAF()`. Defaults to True.
        ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`.
            Defaults to True.
        add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
            important to ensure combined allele frequencies sum to 1. See
            `add_unmeasured()`. Defaults to True.
        complete (bool, optional): Check study completeness. Uses default values for
            `incomplete_studies()`. If you are happy with your study completeness can
            be switched off with False. Defaults to True.
        resolution (bool, optional): Check that all alleles have the same resolution,
            see `check_resolution()`. Defaults to True.
        unique (bool, optional): Check that each allele appears no more than once per
            study. See `alleles_unique_in_study()`. Defaults to True.

    Returns:
        pd.DataFrame: Allele frequencies after combining estimates from all studies.
            *allele_freq* is the combined frequency estimate from the Dirichlet mean
            where the concentration is `alpha` + `c`.
            *alpha* is the prior used for the Dirichlet distribution.
            *c* is the observations used for the Dirichlet distribution.
            *sample_size* is the total sample size of all combined studies.
            *wav* is the weighted average.
    """
    df = AFtab.copy()
    single_loci(df)
    if unique:
        if not alleles_unique_in_study(df, datasetID=datasetID):
            raise AssertionError("The same allele appears multiple times in a dataset")
    if complete:
        if not incomplete_studies(df, datasetID=datasetID).empty:
            raise AssertionError(
                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
            )
    if resolution:
        if not check_resolution(df):
            # BUG FIX: corrected typo "conains" in the error message
            raise AssertionError(
                "AFtab contains alleles at multiple resolutions, check check_resolution(AFtab)"
            )
    if format:
        df = formatAF(df, ignoreG)
    if add_unmeasured:
        df = unmeasured_alleles(df, datasetID)
    try:
        # Sample size doubled due to diploidy
        df["2n"] = df.sample_size * 2
    except AttributeError:
        print("column '2n' could not be created")
    # Each study's contribution to the Dirichlet concentration parameter
    df["c"] = df.allele_freq * df[weights]
    grouped = df.groupby("allele", sort=True)
    combined = grouped.apply(
        lambda row: [
            row.name,
            row.loci.unique()[0],
            np.average(row.allele_freq, weights=row[weights]),
            row.c.sum(),
            row.sample_size.sum(),
        ],
        include_groups=False
    )
    combined = pd.DataFrame(
        combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"]
    )
    combined = combined.reset_index(drop=True)
    # Check that all alleles in a locus have the same sample size
    # after merging
    if duplicated_sample_size(combined):
        id_duplicated_allele(grouped)
    # BUG FIX (idiom): default was a mutable argument `alpha=[]`; `None` is
    # the safe sentinel and `if not alpha` handles both cases identically.
    if not alpha:
        alpha = default_prior(len(combined.allele))
    combined["alpha"] = alpha
    # Calculate Dirichlet mean for each allele
    combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean()

    return combined

Combine allele frequencies from multiple studies.

datasetID is the unique identifier for studies to combine. Allele frequencies combined using a Dirichlet distribution where each study's contribution to the concentration parameter is $2 * sample_size * allele_frequency$. Sample size is doubled to get 2n due to diploidy. If an alternative weights is set it is not doubled. The total concentration parameter of the Dirichlet distribution is the contributions from all studies plus the prior alpha. If alpha is not set the prior defaults to 1 observation of each allele.

Arguments:
  • AFtab (pd.DataFrame): Table of Allele frequency data
  • weights (str, optional): Column to be weighted by allele frequency to generate concentration parameter of Dirichlet distribution. Defaults to '2n'.
  • alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to [].
  • datasetID (str, optional): Unique identifier column for study. Defaults to 'population'.
  • format (bool, optional): Run formatAF(). Defaults to True.
  • ignoreG (bool, optional): Treat allele G groups as normal, see formatAF(). Defaults to True.
  • add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is important to ensure combined allele frequencies sum to 1. See add_unmeasured(). Defaults to True.
  • complete (bool, optional): Check study completeness. Uses default values for incomplete_studies(). If you are happy with your study completeness can be switched off with False. Defaults to True.
  • resolution (bool, optional): Check that all alleles have the same resolution, see check_resolution(). Defaults to True.
  • unique (bool, optional): Check that each allele appears no more than once per study. See alleles_unique_in_study(). Defaults to True.
Returns:

pd.DataFrame: Allele frequencies after combining estimates from all studies. allele_freq is the combined frequency estimate from the Dirichlet mean where the concentration is alpha + c. alpha is the prior used for the Dirichlet distribution. c is the observations used for the Dirichlet distribution. sample_size is the total sample size of all combined studies. wav is the weighted average.

def default_prior(k):
    """Build the default Dirichlet prior: one observation of each class.

    Args:
        k (int): Number of classes in the Dirichlet distribution.

    Returns:
        list: List of k 1s to use as the prior concentration.
    """
    return [1] * k

Calculate a default prior, 1 observation of each class.

Arguments:
  • k (int): Number of classes in the Dirichlet distribution.
Returns:

list: List of k 1s to use as prior.

def single_loci(AFtab):
    """Check that the allele frequency data covers exactly one locus.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Raises:
        AssertionError: If `AFtab.loci` contains anything other than a
            single unique value.
    """
    loci_seen = AFtab.loci.unique()
    if len(loci_seen) != 1:
        raise AssertionError("'AFtab' must contain only 1 loci")

Check that allele frequency data is only of one locus

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data
def alleles_unique_in_study(AFtab, datasetID="population"):
    """Are all alleles unique in each study?

    Checks that no allele is reported more than once within a single study,
    where a study is defined by `datasetID`. Offending allele counts are
    printed when duplicates are found.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
        datasetID (str, optional): Unique identifier column to define study.
            Defaults to 'population'.

    Returns:
        bool: `True` if no allele occurs more than once in any study,
            otherwise `False`.
    """
    counts = AFtab.copy().groupby([datasetID, "allele"]).size()
    repeats = counts[counts > 1]
    if repeats.empty:
        return True
    # Duplicates usually mean datasetID does not uniquely identify studies
    print(f"Non unique alleles in study, is datasetID correct? {datasetID}")
    print(repeats)
    return False

Are all alleles unique in each study?

Checks that no alleles are reported more than once in a single study. Study is defined by datasetID.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data
  • datasetID (str, optional): Unique identifier column to define study. Defaults to 'population'.
Returns:

bool: True if no alleles occur more than once in any study, otherwise False.

def duplicated_sample_size(AFtab):
    """Return True if any locus has more than 1 unique sample size."""
    unique_counts = AFtab.groupby("loci").sample_size.apply(
        lambda sizes: sizes.unique().size
    )
    return any(unique_counts != 1)

Returns True if any loci has more than 1 unique sample size

def id_duplicated_allele(grouped):
    """Raise if any allele group contains a duplicated population.

    Args:
        grouped: Allele frequency data grouped by allele
            (pd.core.groupby.DataFrameGroupBy).

    Raises:
        AssertionError: Listing the alleles whose groups contain a
            duplicated population.
    """
    has_dup = grouped.population.apply(lambda pops: any(pops.duplicated()))
    if has_dup.any():
        raise AssertionError(
            f"duplicated population within allele {has_dup[has_dup].index.tolist()}"
        )

Reports the allele that has multiple sample sizes

def population_coverage(p):
    """Proportion of people with at least 1 copy of this allele assuming HWE.

    Args:
        p (float): Allele frequency

    Returns:
        float: Combined frequency of homozygotes and heterozygotes
            for this allele.
    """
    q = 1 - p
    homozygous = p * p
    heterozygous = 2 * p * q
    return homozygous + heterozygous

Proportion of people with at least 1 copy of this allele assuming HWE.

Arguments:
  • p (float): Allele frequency
Returns:

float: Sum of homozygotes and heterozygotes for this allele

def betaAB(alpha):
    """Calculate `a` `b` values for all composite beta distributions.

    Given the `alpha` vector defining a Dirichlet distribution, compute for
    each element the `(a, b)` pair of its marginal beta distribution, i.e.
    for each allele the number of times it was and wasn't observed.

    Args:
        alpha (list): Values defining a Dirichlet distribution. This will be
            the prior (for a naive distribution) or the prior + caf.c for a
            posterior distribution.

    Returns:
        list: `(a, b)` tuples defining the marginal beta distributions.
    """
    total = sum(alpha)
    return [(a, total - a) for a in alpha]

Calculate a b values for all composite beta distributions.

Given the alpha vector defining a Dirichlet distribution calculate the a b values for all composite beta distributions.

Arguments:
  • alpha (list): Values defining a Dirichlet distribution. This will be the prior (for a naive distribution) or the prior + caf.c for a posterior distribution.
Returns:

list: List of a b values defining beta values, i.e. for each allele it is the number of times it was and wasn't observed.

def plot_prior(concentration, ncol=2, psteps=1000, labels=""):
    """Plot probability density function for prior values.

    Each element of `concentration` is shown as the pdf of its marginal
    beta distribution (see betaAB()), one subplot per element.

    Args:
        concentration (list): Vector of the prior Dirichlet concentration values.
        ncol (int, optional): Number of columns. Defaults to 2.
        psteps (int, optional): Number of points in [0, 1] at which each
            beta pdf is evaluated. Defaults to 1000.
        labels (list, optional): Labels for elements of concentration in the same
            order. Defaults to "".

    Raises:
        AssertionError: If `labels` is given but is not the same length as
            `concentration`.
    """
    ab = betaAB(concentration)
    pline = np.linspace(0, 1, psteps)
    nrow = math.ceil(len(concentration) / ncol)
    # squeeze=False keeps `ax` 2-D even when nrow == 1, so the
    # row (ax[-1, :]) and column (ax[:, 0]) indexing below cannot
    # fail with an IndexError for small priors.
    fig, ax = plt.subplots(nrow, ncol, sharex=True, squeeze=False)
    fig.suptitle("Probability density")
    # If labels is a list nothing happens,
    # But if it's a series it converts to a list
    labels = list(labels)
    if not labels:
        labels = [""] * len(concentration)
    if not len(concentration) == len(labels):
        raise AssertionError("concentration must be same length as labels")
    for i, alpha in enumerate(concentration):
        a, b = ab[i]
        bd = sp.stats.beta(a, b)
        pdf = [bd.pdf(p) for p in pline]
        ax.flat[i].plot(pline, pdf)
        ax.flat[i].set_title(labels[i])
    # Axis labels only on the outer edge of the grid
    for axi in ax[-1, :]:
        axi.set(xlabel="Allele freq")
    for axi in ax[:, 0]:
        axi.set(ylabel="PDF")
    plt.show()

Plot probability density function for prior values.

Arguments:
  • concentration (list): Vector of the prior Dirichlet concentration values.
  • ncol (int, optional): Number of columns. Defaults to 2.
  • labels (list, optional): Labels for elements of concentration in the same order. Defaults to "".
def plotAF(
    caf=pd.DataFrame(),
    AFtab=pd.DataFrame(),
    cols=list(mcolors.TABLEAU_COLORS.keys()),
    datasetID="population",
    hdi=pd.DataFrame(),
    compound_mean=pd.DataFrame(),
):
    """Plot allele frequency results from `HLAfreq`.

    Plot combined allele frequencies, individual allele frequencies,
    and credible intervals on combined allele frequency estimates.
    Credible interval is only plotted if a value is given for `hdi`.
    The plotted credible interval is whatever was passed to
    HLAfreq_pymc.AFhdi() when calculating hdi.

    Note: the default arguments are shared empty DataFrames; they are
    only read, never mutated, so the sharing is harmless here.

    Args:
        caf (pd.DataFrame, optional): Combined allele frequency estimates from
            HLAfreq.combineAF. Defaults to pd.DataFrame().
        AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults
            to pd.DataFrame().
        cols (list, optional): List of colours to use for each individual dataset.
            Defaults to list(mcolors.TABLEAU_COLORS.keys()).
        datasetID (str, optional): Column used to define separate datasets. Defaults
            to "population".
        hdi (pd.DataFrame, optional): The high density interval object to plot credible
            intervals. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Defaults to pd.DataFrame().
        compound_mean (pd.DataFrame, optional): The high density interval object to plot
            post_mean. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Defaults to pd.DataFrame().
    """
    # Plot allele frequency for each dataset
    if not AFtab.empty:
        # Cols must be longer than the number of datasets plotted.
        # If not, repeat the list of cols enough times to cover them.
        repeat_cols = int(np.ceil(len(AFtab[datasetID]) / len(cols)))
        cols = cols * repeat_cols
        # Make a dictionary mapping datasetID to colours
        cmap = dict(zip(AFtab[datasetID].unique(), cols))
        plt.scatter(
            x=AFtab.allele_freq,
            y=AFtab.allele,
            c=[cmap[x] for x in AFtab[datasetID]],
            alpha=0.7,
            zorder=2,
        )
    # Plot combined allele frequency as open circles on top of the data
    if not caf.empty:
        plt.scatter(
            x=caf.allele_freq,
            y=caf.allele,
            edgecolors="black",
            facecolors="none",
            zorder=3,
        )
    # Plot high density (credible) interval as a horizontal line per allele
    if not hdi.empty:
        for _, interval in hdi.iterrows():
            plt.hlines(
                y=interval["allele"],
                xmin=interval["lo"],
                xmax=interval["hi"],
                color="black",
            )
    # Mark the posterior mean of the compound distribution
    if not compound_mean.empty:
        for _, row in compound_mean.iterrows():
            plt.scatter(y=row["allele"], x=row["post_mean"], color="black", marker="|")
    plt.xlabel("Allele frequency")
    plt.grid(zorder=0)
    plt.show()

Plot allele frequency results from HLAfreq.

Plot combined allele frequencies, individual allele frequencies, and credible intervals on combined allele frequency estimates. Credible interval is only plotted if a value is given for hdi. The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi() when calculating hdi.

Arguments:
  • caf (pd.DataFrame, optional): Combined allele frequency estimates from HLAfreq.combineAF. Defaults to pd.DataFrame().
  • AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults to pd.DataFrame().
  • cols (list, optional): List of colours to use for each individual dataset. Defaults to list(mcolors.TABLEAU_COLORS.keys()).
  • datasetID (str, optional): Column used to define separate datasets. Defaults to "population".
  • hdi (pd.DataFrame, optional): The high density interval object to plot credible intervals. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
  • compound_mean (pd.DataFrame, optional): The high density interval object to plot post_mean. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().