HLAfreq.HLAfreq

Download and combine HLA allele frequencies from multiple datasets.

Download allele frequency data from allelefrequencies.net. Allele frequencies from different populations can be combined to estimate HLA frequencies of countries or other regions such as global HLA frequencies.

  1"""
  2Download and combine HLA allele frequencies from multiple datasets.
  3
  4Download allele frequency data from
  5[allelefrequencies.net](http://www.allelefrequencies.net). Allele
  6frequencies from different populations can be combined to
  7estimate HLA frequencies of countries or other regions such as
  8global HLA frequencies.
  9"""
 10
 11from collections.abc import Iterable
 12from bs4 import BeautifulSoup
 13import requests
 14import pandas as pd
 15import numpy as np
 16import matplotlib.pyplot as plt
 17import math
 18import scipy as sp
 19import matplotlib.colors as mcolors
 20
 21
 22def simulate_population(alleles: Iterable[str], locus: str, population: str):
 23    pop_size = np.random.randint(len(alleles), 50)
 24    samples = np.random.choice(alleles, pop_size, replace=True)
 25    counts = pd.Series(samples).value_counts()
 26    counts.values / pop_size
 27    pop = pd.DataFrame(
 28        {
 29            "allele": counts.index,
 30            "loci": locus,
 31            "population": population,
 32            "allele_freq": counts.values / pop_size,
 33            "sample_size": pop_size,
 34        }
 35    )
 36    return pop
 37
 38
 39def simulate_study(alleles, populations, locus):
 40    study = []
 41    for i in range(populations):
 42        pop = simulate_population(alleles=alleles, locus=locus, population=f"pop_{i}")
 43        study.append(pop)
 44
 45    study = pd.concat(study)
 46    return study
 47
 48
 49def makeURL(
 50    country="",
 51    standard="s",
 52    locus="",
 53    resolution_pattern="bigger_equal_than",
 54    resolution=2,
 55    region="",
 56    ethnic="",
 57    study_type="",
 58    dataset_source="",
 59    sample_year="",
 60    sample_year_pattern="",
 61    sample_size="",
 62    sample_size_pattern="",
 63):
 64    """Create URL for search of allele frequency net database.
 65
 66    All arguments are documented [here](http://www.allelefrequencies.net/extaccess.asp)
 67
 68    Args:
 69        country (str, optional): Country name to retrieve records from. Defaults to "".
 70        standard (str, optional): Filter study quality standard to this or higher.
 71            {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
 72        locus (str, optional): The locus to return allele data for. Defaults to "".
 73        resolution_pattern (str, optional): Resolution comparitor {'equal', 'different',
 74            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
 75            Filter created using `resolution` and `resolution_pattern`.
 76            Defaults to "bigger_equal_than".
 77        resolution (int, optional): Number of fields of resolution of allele. Filter
 78            created using `resolution` and `resolution_pattern`. Defaults to 2.
 79        region (str, optional): Filter to geographic region. {Asia, Australia,
 80            Eastern Europe, ...}.
 81            All regions listed [here](http://www.allelefrequencies.net/pop6003a.asp).
 82            Defaults to "".
 83        ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}.
 84            All ethnicities listed [here](http://www.allelefrequencies.net/pop6003a.asp).
 85            Defaults to "".
 86        study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor",
 87            "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients",
 88            "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
 89        dataset_source (str, optional): Source of data. {"Literature",
 90            "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
 91        sample_year (int, optional): Sample year to compare to. Filter created using
 92            sample_year and sample_year_pattern. Defaults to "".
 93        sample_year_pattern (str, optional): Pattern to compare sample year to. Filter
 94            created using sample_year and sample_year_pattern. {'equal', 'different',
 95            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
 96            Defaults to "".
 97        sample_size (int, optional): Sample size to compare to. Filter created using
 98            sample_size and sample_size_pattern. Defaults to "".
 99        sample_size_pattern (str, optional): Pattern to compare sample size to. Filter
100            created using sample_size and sample_size_pattern. {'equal', 'different',
101            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
102            Defaults to "".
103
104    Returns:
105        str: URL to search allelefrequencies.net
106    """
107    base = "http://www.allelefrequencies.net/hla6006a.asp?"
108    locus_type = "hla_locus_type=Classical&"
109    hla_locus = "hla_locus=%s&" % (locus)
110    country = "hla_country=%s&" % (country)
111    region = "hla_region=%s&" % (region)
112    ethnic = "hla_ethnic=%s&" % (ethnic)
113    study_type = "hla_study=%s&" % (study_type)
114    dataset_source = "hla_dataset_source=%s&" % (dataset_source)
115    sample_year = "hla_sample_year=%s&" % (sample_year)
116    sample_year_pattern = "hla_sample_year_pattern=%s&" % (sample_year_pattern)
117    sample_size = "hla_sample_size=%s&" % (sample_size)
118    sample_size_pattern = "hla_sample_size_pattern=%s&" % (sample_size_pattern)
119    hla_level_pattern = "hla_level_pattern=%s&" % (resolution_pattern)
120    hla_level = "hla_level=%s&" % (resolution)
121    standard = "standard=%s&" % standard
122    url = (
123        base
124        + locus_type
125        + hla_locus
126        + country
127        + hla_level_pattern
128        + hla_level
129        + standard
130        + region
131        + ethnic
132        + study_type
133        + dataset_source
134        + sample_year
135        + sample_year_pattern
136        + sample_size
137        + sample_size_pattern
138    )
139    return url
140
141
def parseAF(bs):
    """Generate a dataframe from a given html page

    Args:
        bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page

    Returns:
        pd.DataFrame: Table of allele, allele frequency, samplesize, and population
    """
    # Column headers of the results table, in page order
    header = [
        "line",
        "allele",
        "flag",
        "population",
        "carriers%",
        "allele_freq",
        "AF_graphic",
        "sample_size",
        "database",
        "distribution",
        "haplotype_association",
        "notes",
    ]
    # The results table is the `tblNormal` table inside div `divGenDetail`
    detail = bs.find("div", {"id": "divGenDetail"}).find("table", {"class": "tblNormal"})
    cells = [
        [td.get_text(strip=True) for td in tr.find_all("td")]
        for tr in detail.find_all("tr")
    ]
    # The first row holds `th` headers so has no `td` cells; drop it
    df = pd.DataFrame(cells[1:], columns=header)
    # Locus is everything before the '*' in the allele name
    df["loci"] = df.allele.apply(lambda name: name.split("*")[0])
    # Keep only the columns used downstream
    return df[["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]]
181
182
def Npages(bs):
    """How many pages of results are there?

    Args:
        bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page

    Returns:
        int: Total number of results pages
    """
    # The pagination widget lives in div `divGenNavig`
    nav = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
    if not nav:
        raise AssertionError(
            "navtab does not evaluate to True. Check URL returns results in web browser."
        )
    # Exactly one cell should read like "1 of 12"
    matches = [td.get_text(strip=True) for td in nav.find_all("td") if " of " in td.text]
    if len(matches) != 1:
        raise AssertionError("divGenNavig should contain 1 of not %s" % len(matches))
    # The page total follows "of "
    return int(matches[0].split("of ")[1])
209
210
def formatAF(AFtab, ignoreG=True):
    """Format allele frequency table.

    Convert sample_size and allele_freq to numeric data type.
    Removes commas from sample size. Removes "(*)" from allele frequency if
    `ignoreG` is `True`. `formatAF()` is used internally by combineAF and getAFdata
    by default.

    Args:
        AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
            using `getAFdata()`.
        ignoreG (bool, optional): Treat G group alleles as normal.
            See http://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.

    Returns:
        pd.DataFrame: The formatted allele frequency data.
    """
    formatted = AFtab.copy()
    # Object dtype means strings, e.g. "1,300" — strip commas then convert
    sizes = formatted.sample_size
    if sizes.dtype == "O":
        formatted.sample_size = pd.to_numeric(sizes.str.replace(",", ""))
    # G group alleles are reported like "0.0510(*)"; optionally strip the marker
    freqs = formatted.allele_freq
    if freqs.dtype == "O":
        cleaned = freqs.str.replace("(*)", "", regex=False) if ignoreG else freqs
        formatted.allele_freq = pd.to_numeric(cleaned)
    return formatted
236
237
def _soupFromURL(url, timeout):
    """Fetch `url` and parse it with BeautifulSoup, raising a clear error on timeout."""
    try:
        return BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
    except requests.exceptions.ReadTimeout:
        # `from None` hides the noisy requests traceback; the message tells the
        # user the actionable fix
        raise Exception(
            "Requests timeout, try a larger `timeout` value for `getAFdata()`"
        ) from None


def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
    """Get all allele frequency data from a search base_url.

    Iterates over all pages regardless of which page is based.

    Args:
        base_url (str): URL for base search.
        timeout (int): How long to wait to receive a response.
        format (bool): Format the downloaded data using `formatAF()`.
            (Name shadows the builtin but is kept for API compatibility.)
        ignoreG (bool): treat allele G groups as normal.
            See http://hla.alleles.org/alleles/g_groups.html for details. Default = True

    Returns:
        pd.DataFrame: allele frequency data parsed into a pandas dataframe
    """
    # Get BS object from base search
    bs = _soupFromURL(base_url, timeout)
    # How many pages of results
    N = Npages(bs)
    print("%s pages of results" % N)
    # iterate over pages, parse and combine data from each
    tabs = []
    for i in range(N):
        print(" Parsing page %s" % (i + 1), end="\r")
        url = base_url + "page=" + str(i + 1)
        tabs.append(parseAF(_soupFromURL(url, timeout)))
    print("Download complete")
    tabs = pd.concat(tabs)
    if format:
        try:
            tabs = formatAF(tabs, ignoreG)
        except AttributeError:
            # Best effort: report but do not fail the whole download
            print("Formatting failed, non-numeric datatypes may remain.")
    return tabs
285
286
def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Report any studies with allele freqs that don't sum to 1

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study

    Returns:
        pd.Series: Total allele frequency of each incomplete (study, locus)
            pair, indexed by (datasetID, loci). Empty if all studies complete.
    """
    # Total reported allele frequency per study per locus
    poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum()
    lmask = poplocs < llimit
    # BUG FIX: was `if sum(lmask > 0):` — the comparison belongs outside the
    # sum. For boolean masks the two happen to coincide, but the intent is
    # "any study below the limit".
    if sum(lmask) > 0:
        print(poplocs[lmask])
        print(f"{sum(lmask)} studies have total allele frequency < {llimit}")
    umask = poplocs > ulimit
    if sum(umask) > 0:
        print(poplocs[umask])
        print(f"{sum(umask)} studies have total allele frequency > {ulimit}")
    incomplete = pd.concat([poplocs[lmask], poplocs[umask]])
    return incomplete
309
310
def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Returns only complete studies.

    Data is dropped if the locus for that population is not complete, i.e. doesn't
    sum to between `llimit` and `ulimit`. This prevents throwing away data if
    another loci in the population is incomplete.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study. Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
    """
    noncomplete = incomplete_studies(
        AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID
    )
    # A row is dropped only when its (study, locus) PAIR is incomplete, so a
    # population's other, complete loci are kept.
    bad_pairs = set(noncomplete.index)
    keep = [
        (study, locus) not in bad_pairs
        for study, locus in zip(AFtab[datasetID], AFtab.loci)
    ]
    return AFtab[keep]
341
342
def check_resolution(AFtab):
    """Check if all alleles in AFtab have the same resolution.
    Will print the number of records with each resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Returns:
        bool: True only if all alleles have the same resolution, else False.
    """
    # Number of fields = number of ':' separators + 1
    fields = AFtab.allele.str.count(":") + 1
    field_counts = fields.value_counts()
    uniform = len(field_counts) == 1
    if not uniform:
        print(field_counts)
        print("Multiple resolutions in AFtab. Fix with decrease_resolution()")
    return uniform
360
361
def decrease_resolution(AFtab, newres, datasetID="population"):
    """Decrease allele resolution so all alleles have the same resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        newres (int): The desired number of fields for resolution.
        datasetID (str, optional): Column to use as study identifier.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with all alleles of requested resolution.
    """
    reduced = AFtab.copy()
    fields = reduced.allele.str.count(":") + 1
    # Cannot add fields, only remove them
    if not all(fields >= newres):
        raise AssertionError(f"Some alleles have resolution below {newres} fields")
    # Keep only the first `newres` colon-separated fields of each allele name
    reduced.allele = reduced.allele.str.split(":").str[:newres].str.join(":")
    # Alleles made identical by truncation are merged, summing their frequencies
    return collapse_reduced_alleles(reduced, datasetID=datasetID)
382
383
def collapse_reduced_alleles(AFtab, datasetID="population"):
    """Merge rows whose alleles became identical after resolution reduction.

    Within each study, rows sharing an allele name have their frequencies
    summed; loci and sample_size must be constant within the group.

    Args:
        AFtab (pd.DataFrame): Allele frequency data (typically after
            truncating allele names in `decrease_resolution()`).
        datasetID (str, optional): Unique identifier column for study.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: One row per (study, allele) with summed allele_freq.

    Raises:
        AssertionError: If a (study, allele) group spans multiple loci or
            multiple sample sizes.
    """
    df = AFtab.copy()
    # Group by alleles within datasets
    grouped = df.groupby([datasetID, "allele"])
    # Sum allele freq but keep other columns
    collapsed = grouped.apply(
        lambda row: [
            sum(row.allele_freq),
            row.sample_size.unique()[0],
            row.loci.unique()[0],
            len(row.loci.unique()),
            len(row.sample_size.unique()),
        ]
    )
    collapsed = pd.DataFrame(
        collapsed.tolist(),
        index=collapsed.index,
        columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"],
    ).reset_index()
    # Within a study each all identical alleles should have the same loci and sample size
    if not all(collapsed["#loci"] == 1):
        raise AssertionError(
            "Multiple loci found for a single allele in a single population"
        )
    if not all(collapsed["#sample_sizes"] == 1):
        raise AssertionError(
            "Multiple sample_sizes found for a single allele in a single population"
        )
    # BUG FIX: select the datasetID column rather than hard-coded "population",
    # which raised KeyError whenever a custom datasetID was used.
    collapsed = collapsed[["allele", "loci", datasetID, "allele_freq", "sample_size"]]
    # BUG FIX: forward datasetID so the uniqueness check groups by the right column.
    alleles_unique_in_study(collapsed, datasetID=datasetID)
    return collapsed
417
418
def unmeasured_alleles(AFtab, datasetID="population"):
    """When combining AF estimates, unreported alleles can inflate frequencies
        so AF sums to >1. Therefore we add unreported alleles with frequency zero.

    Args:
        AFtab (pd.DataFrame): Formatted allele frequency data
        datasetID (str): Unique identifier column for study

    Returns:
        pd.DataFrame: Allele frequency data with all locus alleles reported
            for each dataset
    """
    df = AFtab.copy()
    filler = []
    # Handle each locus independently
    for locus in df.loci.unique():
        at_locus = df[df.loci == locus]
        # The union of alleles reported at this locus across all datasets
        locus_alleles = at_locus.allele.unique()
        # Visit every dataset that reports this locus
        for dataset in at_locus[datasetID].unique():
            reported = df[(df[datasetID] == dataset) & (df.loci == locus)]
            # A dataset must report a single sample size per locus
            sizes = reported.sample_size.unique()
            if not (len(sizes) == 1):
                raise AssertionError(
                    "dataset_sample_size must be 1, not %s" % len(sizes)
                )
            size = sizes[0]
            # Alleles seen elsewhere but absent from this dataset get zero
            # frequency and zero carriers, with the dataset's sample size
            absent = [a for a in locus_alleles if a not in reported.allele.values]
            rows = pd.DataFrame(
                [(a, locus, dataset, 0, 0, size) for a in absent],
                columns=[
                    "allele",
                    "loci",
                    datasetID,
                    "allele_freq",
                    "carriers%",
                    "sample_size",
                ],
            )
            if not rows.empty:
                filler.append(rows)
    if filler:
        df = pd.concat([df] + filler, ignore_index=True)
    return df
472
473
def combineAF(
    AFtab,
    weights="2n",
    alpha=[],
    datasetID="population",
    format=True,
    ignoreG=True,
    add_unmeasured=True,
    complete=True,
    resolution=True,
    unique=True,
):
    """Combine allele frequencies from multiple studies.

    `datasetID` is the unique identifier for studies to combine.
    Allele frequencies combined using a Dirichlet distribution where each study's
    contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
    Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is
    set it is not doubled. The total concentration parameter of the Dirichlet distribution
    is the contributions from all studies plus the prior `alpha`. If `alpha` is not set
    the prior defaults to 1 observation of each allele.

    Args:
        AFtab (pd.DataFrame): Table of Allele frequency data
        weights (str, optional): Column to be weighted by allele frequency to generate
            concentration parameter of Dirichlet distribution. Defaults to '2n'.
        alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to [].
        datasetID (str, optional): Unique identifier column for study. Defaults to
            'population'.
        format (bool, optional): Run `formatAF()`. Defaults to True.
        ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`.
            Defaults to True.
        add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
            important to ensure combined allele frequencies sum to 1. See
            `add_unmeasured()`. Defaults to True.
        complete (bool, optional): Check study completeness. Uses default values for
            `incomplete_studies()`. If you are happy with your study completeness can
            be switched off with False. Defaults to True.
        resolution (bool, optional): Check that all alleles have the same resolution,
            see `check_resolution()`. Defaults to True.
        unique (bool, optional): Check that each allele appears no more than once per
            study. See `alleles_unique_in_study()`. Defaults to True.

    Returns:
        pd.DataFrame: Allele frequencies after combining estimates from all studies.
            *allele_freq* is the combined frequency estimate from the Dirichlet mean
            where the concentration is `alpha` + `c`.
            *alpha* is the prior used for the Dirichlet distribution.
            *c* is the observations used for the Dirichlet distribution.
            *sample_size* is the total sample size of all combined studies.
            *wav* is the weighted average.
    """
    # NOTE: the mutable default `alpha=[]` is safe here because alpha is only
    # read and rebound below, never mutated in place.
    df = AFtab.copy()
    # Raises AssertionError if more than one locus is present
    single_loci(df)
    # Optional validation gates; each raises rather than silently combining bad data
    if unique:
        if not alleles_unique_in_study(df, datasetID=datasetID):
            raise AssertionError("The same allele appears multiple times in a dataset")
    if complete:
        if not incomplete_studies(df, datasetID=datasetID).empty:
            raise AssertionError(
                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
            )
    if resolution:
        if not check_resolution(df):
            raise AssertionError(
                "AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)"
            )
    if format:
        # Coerce sample_size / allele_freq to numeric types
        df = formatAF(df, ignoreG)
    if add_unmeasured:
        # Zero-frequency rows for alleles a study did not report
        df = unmeasured_alleles(df, datasetID)
    try:
        # Diploid individuals carry two alleles, so allele count is 2 * sample size
        df["2n"] = df.sample_size * 2
    except AttributeError:
        print("column '2n' could not be created")
    # Each study's contribution to the Dirichlet concentration parameter
    df["c"] = df.allele_freq * df[weights]
    grouped = df.groupby("allele", sort=True)
    # Per allele: name, locus, weighted-average frequency, total observations,
    # and total sample size across studies
    combined = grouped.apply(
        lambda row: [
            row.name,
            row.loci.unique()[0],
            np.average(row.allele_freq, weights=row[weights]),
            row.c.sum(),
            row.sample_size.sum(),
        ]
    )
    combined = pd.DataFrame(
        combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"]
    )
    combined = combined.reset_index(drop=True)
    # Check that all alleles in a locus have the same sample size
    # after merging
    if duplicated_sample_size(combined):
        id_duplicated_allele(grouped)
    if not alpha:
        # Default prior: one pseudo-observation per allele
        alpha = default_prior(len(combined.allele))
    combined["alpha"] = alpha
    # Calculate Dirichlet mean for each allele
    combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean()

    return combined
575
576
def default_prior(k):
    """Calculate a default prior, 1 observation of each class.

    Args:
        k (int): Number of classes in the Dirichlet distribution.

    Returns:
        list: List of k 1s to use as prior.
    """
    return [1 for _ in range(k)]
588
589
def single_loci(AFtab):
    """Check that allele frequency data is only of one locus

    Raises AssertionError when the `loci` column holds more than one value.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
    """
    n_loci = len(AFtab.loci.unique())
    if n_loci != 1:
        raise AssertionError("'AFtab' must contain only 1 loci")
598
599
def alleles_unique_in_study(AFtab, datasetID="population"):
    """Are all alleles unique in each study?

    Checks that no alleles are reported more than once in a single study.
    Study is defined by `datasetID`.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
        datasetID (str, optional): Unique identifier column to define study.
            Defaults to 'population'.

    Returns:
        bool: `True` on if no alleles occur more than once in any study, otherwise `False`.
    """
    counts = AFtab.copy().groupby([datasetID, "allele"]).size()
    # Any group larger than one means an allele repeats within a study
    repeats = counts[counts > 1]
    if not repeats.empty:
        print(f"Non unique alleles in study, is datasetID correct? {datasetID}")
        print(repeats)
        return False
    return True
622
623
def duplicated_sample_size(AFtab):
    """Returns True if any loci has more than 1 unique sample size"""
    n_sizes = AFtab.groupby("loci").sample_size.apply(lambda s: s.unique().size)
    return bool((n_sizes != 1).any())
630
631
def id_duplicated_allele(grouped):
    """Raise for any allele whose group contains a repeated population.

    Args:
        grouped (pd.core.groupby.DataFrameGroupBy): Allele frequency data
            grouped by allele.
    """
    dup = grouped.population.apply(lambda pops: pops.duplicated().any())
    if dup.any():
        raise AssertionError(
            f"duplicated population within allele {dup[dup].index.tolist()}"
        )
639
640
def population_coverage(p):
    """Proportion of people with at least 1 copy of this allele assuming HWE.

    Args:
        p (float): Allele frequency

    Returns:
        float: Sum of homozygotes and heterozygotes for this allele
    """
    # Complement of carrying zero copies: 1 - q^2 = p^2 + 2pq under
    # Hardy-Weinberg equilibrium
    q = 1 - p
    return 1 - q**2
654
655
def betaAB(alpha):
    """Calculate `a` `b` values for all composite beta distributions.

    Given the `alpha` vector defining a Dirichlet distribution calculate the `a` `b` values
    for all composite beta distributions.

    Args:
        alpha (list): Values defining a Dirichlet distribution. This will be the prior
            (for a naive distribution) or the prior + caf.c for a posterior distribution.

    Returns:
        list: List of `a` `b` values defining beta values, i.e. for each allele it is
            the number of times it was and wasn't observed.
    """
    # For each component, a = its concentration, b = everything else
    total = sum(alpha)
    return [(a, total - a) for a in alpha]
672
673
674# def betaCI(a,b,credible_interval=0.95):
675#     """Calculat the central credible interval of a beta distribution
676
677#     Args:
678#         a (float): Beta shape parameter `a`, i.e. the number of times the allele was observed.
679#         b (float): Beta shape parameter `b`, i.e. the number of times the allele was not observed.
680#         credible_interval (float, optional): The size of the credible interval requested. Defaults to 0.95.
681
682#     Returns:
683#         tuple: Lower and upper credible interval of beta distribution.
684#     """
685#     bd = sp.stats.beta(a,b)
686#     lower_quantile = (1-credible_interval)/2
687#     upper_quantile = 1-lower_quantile
688#     lower_interval = bd.ppf(lower_quantile)
689#     upper_interval = bd.ppf(upper_quantile)
690#     return lower_interval, upper_interval
691
692# def AFci(caf, credible_interval=0.95):
693#     """Calculate credible interval for combined allele frequency table.
694#     Note that this ignores sampling error so confidence interval is too tight.
695#     Use HLAhdi.AFhdi() instead.
696
697#     Args:
698#         caf (pd.DataFrame): Table produced by combineAF()
699#         credible_interval (float, optional): The desired confidence interval. Defaults to 0.95.
700
701#     Returns:
702#         list: Lower and upper credible intervals as a list of tuples
703#     """
704#     ab = betaAB(
705#         caf.alpha + caf.c,
706#     )
707#     ci = [betaCI(a, b, credible_interval) for a,b in ab]
708#     return ci
709
710
def plot_prior(concentration, ncol=2, psteps=1000, labels=""):
    """Plot probability density function for prior values.

    Args:
        concentration (list): Vector of the prior Dirichlet concentration values.
        ncol (int, optional): Number of columns. Defaults to 2.
        psteps (int, optional): Number of points between 0 and 1 at which each
            beta PDF is evaluated. Defaults to 1000.
        labels (list, optional): Labels for elements of concentration in the same
            order. Defaults to "".
    """
    ab = betaAB(concentration)
    pline = np.linspace(0, 1, psteps)
    nrow = math.ceil(len(concentration) / ncol)
    # BUG FIX: squeeze=False keeps `ax` a 2-D array even when nrow == 1,
    # so the ax[-1, :] / ax[:, 0] indexing below cannot raise IndexError.
    fig, ax = plt.subplots(nrow, ncol, sharex=True, squeeze=False)
    fig.suptitle("Probability density")
    # If labels is a list nothing happens,
    # But if it's a series it converts to a list
    labels = list(labels)
    if not labels:
        labels = [""] * len(concentration)
    if not len(concentration) == len(labels):
        raise AssertionError("concentration must be same length as labels")
    for i, alpha in enumerate(concentration):
        a, b = ab[i]
        bd = sp.stats.beta(a, b)
        pdf = [bd.pdf(p) for p in pline]
        ax.flat[i].plot(pline, pdf)
        ax.flat[i].set_title(labels[i])
    # Axis labels only on the bottom row and left column
    for axi in ax[-1, :]:
        axi.set(xlabel="Allele freq")
    for axi in ax[:, 0]:
        axi.set(ylabel="PDF")
    plt.show()
743
744
def plotAF(
    caf=pd.DataFrame(),
    AFtab=pd.DataFrame(),
    cols=list(mcolors.TABLEAU_COLORS.keys()),
    datasetID="population",
    hdi=pd.DataFrame(),
    compound_mean=pd.DataFrame(),
):
    """Plot allele frequency results from `HLAfreq`.

    Plot combined allele frequencies, individual allele frequencies,
    and credible intervals on combined allele frequency estimates.
    Credible interval is only plotted if a value is given for `hdi`.
    The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi()
    when calculating hdi.

    Args:
        caf (pd.DataFrame, optional): Combined allele frequency estimates from
            HLAfreq.combineAF. Defaults to pd.DataFrame().
        AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults
            to pd.DataFrame().
        cols (list, optional): List of colours to use for each individual dataset.
            Defaults to list(mcolors.TABLEAU_COLORS.keys()).
            (Safe as a mutable default: the function rebinds `cols` rather than
            mutating it in place.)
        datasetID (str, optional): Column used to define separate datasets. Defaults
            to "population".
        hdi (pd.DataFrame, optional): The high density interval object to plot credible
            intervals. Must have "allele", "lo", and "hi" columns. Produced by
            HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
        compound_mean (pd.DataFrame, optional): The high density interval object to plot
            post_mean. Must have "allele" and "post_mean" columns. Produced by
            HLAfreq.HLA_pymc.AFhdi(). May take a few seconds. Defaults to pd.DataFrame().
    """
    # Plot allele frequency for each dataset
    if not AFtab.empty:
        # Cols must be longer than the list of alleles
        # If not, repeat the list of cols
        # (repeat factor is computed from the row count, which is >= the
        # number of distinct datasets, so the zip below never runs short)
        repeat_cols = np.ceil(len(AFtab[datasetID]) / len(cols))
        repeat_cols = int(repeat_cols)
        cols = cols * repeat_cols
        # Make a dictionary mapping datasetID to colours
        cmap = dict(zip(AFtab[datasetID].unique(), cols))
        plt.scatter(
            x=AFtab.allele_freq,
            y=AFtab.allele,
            c=[cmap[x] for x in AFtab[datasetID]],
            alpha=0.7,
            zorder=2,
        )
    # Plot combined allele frequency as hollow black circles
    if not caf.empty:
        plt.scatter(
            x=caf.allele_freq,
            y=caf.allele,
            edgecolors="black",
            facecolors="none",
            zorder=3,
        )
    # Plot high density interval
    if not hdi.empty:
        # assert not AFtab.empty, "AFtab is needed to calculate credible interval"
        # from HLAfreq import HLAfreq_pymc as HLAhdi
        # print("Fitting model with PyMC, make take a few seconds")
        # hdi = HLAhdi.AFhdi(
        #     AFtab=AFtab,
        #     weights=weights,
        #     datasetID=datasetID,
        #     credible_interval=credible_interval,
        #     conc_mu=conc_mu,
        #     conc_sigma=conc_sigma
        # )
        for interval in hdi.iterrows():
            # .iterrows returns a index and data as a tuple for each row
            plt.hlines(
                y=interval[1]["allele"],
                xmin=interval[1]["lo"],
                xmax=interval[1]["hi"],
                color="black",
            )
    # Posterior means plotted as black vertical tick marks
    if not compound_mean.empty:
        for row in compound_mean.iterrows():
            plt.scatter(
                y=row[1]["allele"], x=row[1]["post_mean"], color="black", marker="|"
            )
    plt.xlabel("Allele frequency")
    plt.grid(zorder=0)
    plt.show()
def simulate_population(alleles: Iterable[str], locus: str, population: str):
    """Simulate allele frequency data for a single population.

    Draws a random population size, samples alleles with replacement,
    and reports the observed allele frequencies.

    Args:
        alleles (Iterable[str]): Alleles to sample from.
        locus (str): Locus name recorded in the output.
        population (str): Population name recorded in the output.

    Returns:
        pd.DataFrame: One row per observed allele with columns
            allele, loci, population, allele_freq, sample_size.
    """
    # Population size is at least one draw per allele. The upper bound is
    # normally 50 but grows with the allele list so that randint's range
    # is never empty when more than 50 alleles are supplied.
    pop_size = np.random.randint(len(alleles), max(50, len(alleles) + 1))
    samples = np.random.choice(alleles, pop_size, replace=True)
    counts = pd.Series(samples).value_counts()
    pop = pd.DataFrame(
        {
            "allele": counts.index,
            "loci": locus,
            "population": population,
            "allele_freq": counts.values / pop_size,
            "sample_size": pop_size,
        }
    )
    return pop
def simulate_study(alleles, populations, locus):
    """Simulate a multi-population study.

    Args:
        alleles: Alleles to sample from.
        populations (int): Number of populations to simulate.
        locus (str): Locus name recorded in the output.

    Returns:
        pd.DataFrame: Concatenated simulated data, one block per population.
    """
    # One simulated dataset per population, named pop_0, pop_1, ...
    pops = [
        simulate_population(alleles=alleles, locus=locus, population=f"pop_{i}")
        for i in range(populations)
    ]
    return pd.concat(pops)
def makeURL(
    country="",
    standard="s",
    locus="",
    resolution_pattern="bigger_equal_than",
    resolution=2,
    region="",
    ethnic="",
    study_type="",
    dataset_source="",
    sample_year="",
    sample_year_pattern="",
    sample_size="",
    sample_size_pattern="",
):
    """Create URL for search of allele frequency net database.

    All arguments are documented [here](http://www.allelefrequencies.net/extaccess.asp)

    Args:
        country (str, optional): Country name to retrieve records from. Defaults to "".
        standard (str, optional): Filter study quality standard to this or higher.
            {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
        locus (str, optional): The locus to return allele data for. Defaults to "".
        resolution_pattern (str, optional): Resolution comparator {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Filter created using `resolution` and `resolution_pattern`.
            Defaults to "bigger_equal_than".
        resolution (int, optional): Number of fields of resolution of allele. Filter
            created using `resolution` and `resolution_pattern`. Defaults to 2.
        region (str, optional): Filter to geographic region. {Asia, Australia,
            Eastern Europe, ...}.
            All regions listed [here](http://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black",
            "Caucasian", ...}.
            All ethnicities listed [here](http://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor",
            "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients",
            "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
        dataset_source (str, optional): Source of data. {"Literature",
            "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
        sample_year (int, optional): Sample year to compare to. Filter created using
            sample_year and sample_year_pattern. Defaults to "".
        sample_year_pattern (str, optional): Pattern to compare sample year to.
            {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than',
            'bigger_equal_than'}. Defaults to "".
        sample_size (int, optional): Sample size to compare to. Filter created using
            sample_size and sample_size_pattern. Defaults to "".
        sample_size_pattern (str, optional): Pattern to compare sample size to.
            {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than',
            'bigger_equal_than'}. Defaults to "".

    Returns:
        str: URL to search allelefrequencies.net
    """
    # Query parameters in the exact order the site search expects.
    params = [
        "hla_locus_type=Classical",
        f"hla_locus={locus}",
        f"hla_country={country}",
        f"hla_level_pattern={resolution_pattern}",
        f"hla_level={resolution}",
        f"standard={standard}",
        f"hla_region={region}",
        f"hla_ethnic={ethnic}",
        f"hla_study={study_type}",
        f"hla_dataset_source={dataset_source}",
        f"hla_sample_year={sample_year}",
        f"hla_sample_year_pattern={sample_year_pattern}",
        f"hla_sample_size={sample_size}",
        f"hla_sample_size_pattern={sample_size_pattern}",
    ]
    # Every parameter, including the last, is followed by '&'.
    return "http://www.allelefrequencies.net/hla6006a.asp?" + "&".join(params) + "&"

Create URL for search of allele frequency net database.

All arguments are documented here

Arguments:
  • country (str, optional): Country name to retrieve records from. Defaults to "".
  • standard (str, optional): Filter study quality standard to this or higher. {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
  • locus (str, optional): The locus to return allele data for. Defaults to "".
  • resolution_pattern (str, optional): Resolution comparator {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Filter created using resolution and resolution_pattern. Defaults to "bigger_equal_than".
  • resolution (int, optional): Number of fields of resolution of allele. Filter created using resolution and resolution_pattern. Defaults to 2.
  • region (str, optional): Filter to geographic region. {Asia, Australia, Eastern Europe, ...}. All regions listed here. Defaults to "".
  • ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}. All ethnicities listed here. Defaults to "".
  • study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor", "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients", "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
  • dataset_source (str, optional): Source of data. {"Literature", "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
  • sample_year (int, optional): Sample year to compare to. Filter created using sample_year and sample_year_pattern. Defaults to "".
  • sample_year_pattern (str, optional): Pattern to compare sample year to. Filter created using sample_year and sample_year_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
  • sample_size (int, optional): Sample size to compare to. Filter created using sample_size and sample_size_pattern. Defaults to "".
  • sample_size_pattern (str, optional): Pattern to compare sample size to. Filter created using sample_size and sample_size_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
Returns:

str: URL to search allelefrequencies.net

def parseAF(bs):
    """Parse one allelefrequencies.net results page into a dataframe.

    Args:
        bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page

    Returns:
        pd.DataFrame: Table of allele, allele frequency, samplesize, and population
    """
    # Results live in a 'tblNormal' table inside the div `divGenDetail`.
    table = bs.find("div", {"id": "divGenDetail"}).find("table", {"class": "tblNormal"})
    # Fixed column layout of the results table.
    header = [
        "line",
        "allele",
        "flag",
        "population",
        "carriers%",
        "allele_freq",
        "AF_graphic",
        "sample_size",
        "database",
        "distribution",
        "haplotype_association",
        "notes",
    ]
    body = [
        [cell.get_text(strip=True) for cell in tr.find_all("td")]
        for tr in table.find_all("tr")
    ]
    # Skip the first row; it holds `th` headers, not data.
    df = pd.DataFrame(body[1:], columns=header)
    # Locus is everything before the '*' in the allele name.
    df["loci"] = df.allele.apply(lambda a: a.split("*")[0])
    # Keep only the columns used downstream.
    return df[
        ["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]
    ]

Generate a dataframe from a given html page

Arguments:
  • bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page
Returns:

pd.DataFrame: Table of allele, allele frequency, samplesize, and population

def Npages(bs):
    """How many pages of results are there?

    Args:
        bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page

    Returns:
        int: Total number of results pages

    Raises:
        AssertionError: If the navigation table is missing or does not contain
            exactly one 'x of N' cell.
    """
    # Page navigation sits in a 'table10' table inside div `divGenNavig`.
    navtab = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
    if not navtab:
        raise AssertionError(
            "navtab does not evaluate to True. Check URL returns results in web browser."
        )
    # The cell reading 'page x of N' is the only one containing ' of '.
    pagesOfN = [
        cell.get_text(strip=True)
        for cell in navtab.find_all("td")
        if " of " in cell.text
    ]
    if len(pagesOfN) != 1:
        raise AssertionError("divGenNavig should contain 1 of not %s" % len(pagesOfN))
    # Everything after 'of ' is the total page count.
    return int(pagesOfN[0].split("of ")[1])

How many pages of results are there?

Arguments:
  • bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page
Returns:

int: Total number of results pages

def formatAF(AFtab, ignoreG=True):
    """Coerce sample_size and allele_freq columns to numeric types.

    Commas are removed from sample sizes and, when `ignoreG` is True, the
    G-group marker "(*)" is removed from allele frequencies before
    conversion. `formatAF()` is used internally by combineAF and getAFdata
    by default.

    Args:
        AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
            using `getAFdata()`.
        ignoreG (bool, optional): Treat G group alleles as normal.
            See http://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.

    Returns:
        pd.DataFrame: The formatted allele frequency data (a copy).
    """
    out = AFtab.copy()
    # Object dtype means the column still holds strings from the HTML table.
    if out.sample_size.dtype == "O":
        out.sample_size = pd.to_numeric(out.sample_size.str.replace(",", ""))
    if out.allele_freq.dtype == "O":
        if ignoreG:
            # Strip the literal "(*)" G-group marker, not a regex.
            out.allele_freq = out.allele_freq.str.replace("(*)", "", regex=False)
        out.allele_freq = pd.to_numeric(out.allele_freq)
    return out

Format allele frequency table.

Convert sample_size and allele_freq to numeric data type. Removes commas from sample size. Removes "(*)" from allele frequency if ignoreG is True. formatAF() is used internally by combineAF and getAFdata by default.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net using getAFdata().
  • ignoreG (bool, optional): Treat G group alleles as normal. See http://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.
Returns:

pd.DataFrame: The formatted allele frequency data.

def _fetch_soup(url, timeout):
    """Fetch `url` and parse it with BeautifulSoup, raising a helpful error on timeout."""
    try:
        return BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
    except requests.exceptions.ReadTimeout:
        raise Exception(
            "Requests timeout, try a larger `timeout` value for `getAFdata()`"
        ) from None


def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
    """Get all allele frequency data from a search base_url.

    Iterates over all pages of results regardless of which page is passed.

    Args:
        base_url (str): URL for base search.
        timeout (int): How long to wait to receive a response.
        format (bool): Format the downloaded data using `formatAF()`.
        ignoreG (bool): treat allele G groups as normal.
            See http://hla.alleles.org/alleles/g_groups.html for details. Default = True

    Returns:
        pd.DataFrame: allele frequency data parsed into a pandas dataframe
    """
    # Base search tells us how many pages of results to expect.
    bs = _fetch_soup(base_url, timeout)
    N = Npages(bs)
    print("%s pages of results" % N)
    # Iterate over pages, parse and combine data from each.
    tabs = []
    for i in range(N):
        print(" Parsing page %s" % (i + 1), end="\r")
        url = base_url + "page=" + str(i + 1)
        tabs.append(parseAF(_fetch_soup(url, timeout)))
    print("Download complete")
    tabs = pd.concat(tabs)
    if format:
        try:
            tabs = formatAF(tabs, ignoreG)
        except AttributeError:
            # Best effort: leave raw strings in place rather than fail the download.
            print("Formatting failed, non-numeric datatypes may remain.")
    return tabs

Get all allele frequency data from a search base_url.

Iterates over all pages of results regardless of which page is passed.

Arguments:
  • base_url (str): URL for base search.
  • timeout (int): How long to wait to receive a response.
  • format (bool): Format the downloaded data using formatAF().
  • ignoreG (bool): treat allele G groups as normal. See http://hla.alleles.org/alleles/g_groups.html for details. Default = True
Returns:

pd.DataFrame: allele frequency data parsed into a pandas dataframe

def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Report any studies with allele freqs that don't sum to 1.

    Prints every (study, locus) pair whose total allele frequency is below
    `llimit` or above `ulimit`.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study

    Returns:
        pd.Series: Total allele frequency of each incomplete (study, locus)
            pair, indexed by (datasetID, loci). Empty if all studies are complete.
    """
    # Total reported allele frequency per study per locus
    poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum()
    lmask = poplocs < llimit
    # Report studies below the lower limit
    if sum(lmask) > 0:
        print(poplocs[lmask])
        print(f"{sum(lmask)} studies have total allele frequency < {llimit}")
    umask = poplocs > ulimit
    # Report studies above the upper limit
    if sum(umask) > 0:
        print(poplocs[umask])
        print(f"{sum(umask)} studies have total allele frequency > {ulimit}")
    incomplete = pd.concat([poplocs[lmask], poplocs[umask]])
    return incomplete

Report any studies with allele freqs that don't sum to 1

Arguments:
  • AFtab (pd.DataFrame): Dataframe containing multiple studies
  • llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
  • ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
  • datasetID (str): Unique identifier column for study
def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Return only data from complete studies.

    A (study, locus) pair is dropped when its allele frequencies do not sum
    to between `llimit` and `ulimit`. Filtering per locus avoids throwing
    away a whole population just because one of its loci is incomplete.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study. Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data restricted to complete studies.
    """
    noncomplete = incomplete_studies(
        AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID
    )
    # Keep a row only when its (study, locus) PAIR is absent from the
    # incomplete index. Testing the pair (not just the study) preserves
    # complete loci of populations that are incomplete at another locus.
    keep = [
        (study, locus) not in noncomplete.index
        for study, locus in zip(AFtab[datasetID], AFtab.loci)
    ]
    mask = pd.Series(keep, index=AFtab.index, dtype=bool)
    return AFtab[mask]

Returns only complete studies.

Data is dropped if the locus for that population is not complete, i.e. doesn't sum to between llimit and ulimit. This prevents throwing away data if another loci in the population is incomplete.

Arguments:
  • AFtab (pd.DataFrame): Dataframe containing multiple studies
  • llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
  • ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
  • datasetID (str): Unique identifier column for study. Defaults to 'population'.
Returns:

pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.

def check_resolution(AFtab):
    """Check if all alleles in AFtab have the same resolution.

    Prints the number of records at each resolution when they differ.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Returns:
        bool: True only if all alleles have the same resolution, else False.
    """
    # Resolution in fields = number of ':' separators + 1
    fields = AFtab.allele.str.count(":") + 1
    per_resolution = fields.value_counts()
    if len(per_resolution) == 1:
        return True
    print(per_resolution)
    print("Multiple resolutions in AFtab. Fix with decrease_resolution()")
    return False

Check if all alleles in AFtab have the same resolution. Will print the number of records with each resolution.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data
Returns:

bool: True only if all alleles have the same resolution, else False.

def decrease_resolution(AFtab, newres, datasetID="population"):
    """Decrease allele resolution so all alleles have the same resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        newres (int): The desired number of fields for resolution.
        datasetID (str, optional): Column to use as study identifier.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with all alleles of requested
            resolution, duplicates collapsed within each study.

    Raises:
        AssertionError: If any allele already has fewer than `newres` fields.
    """
    df = AFtab.copy()
    # Resolution in fields = number of ':' separators + 1
    fields = df.allele.str.count(":") + 1
    if (fields < newres).any():
        raise AssertionError(f"Some alleles have resolution below {newres} fields")
    # Keep only the first `newres` fields of each allele name
    df.allele = df.allele.str.split(":").apply(lambda parts: ":".join(parts[:newres]))
    # Truncation can create duplicate alleles within a study; merge them
    return collapse_reduced_alleles(df, datasetID=datasetID)

Decrease allele resolution so all alleles have the same resolution.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data.
  • newres (int): The desired number of fields for resolution.
  • datasetID (str, optional): Column to use as study identifier. Defaults to 'population'.
Returns:

pd.DataFrame: Allele frequency data with all alleles of requested resolution.

def collapse_reduced_alleles(AFtab, datasetID="population"):
    """Merge rows that share an allele name within a study.

    After `decrease_resolution()` several records can map to the same
    truncated allele; their frequencies are summed. Each merged group must
    have a single locus and a single sample size.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        datasetID (str, optional): Unique identifier column for study.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: One row per (study, allele) with summed allele_freq.

    Raises:
        AssertionError: If a single allele in a single study maps to multiple
            loci or multiple sample sizes.
    """
    df = AFtab.copy()
    # Group by alleles within datasets
    grouped = df.groupby([datasetID, "allele"])
    # Sum allele freq but keep other columns
    collapsed = grouped.apply(
        lambda row: [
            sum(row.allele_freq),
            row.sample_size.unique()[0],
            row.loci.unique()[0],
            len(row.loci.unique()),
            len(row.sample_size.unique()),
        ]
    )
    collapsed = pd.DataFrame(
        collapsed.tolist(),
        index=collapsed.index,
        columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"],
    ).reset_index()
    # Within a study all identical alleles should have the same loci and sample size
    if not all(collapsed["#loci"] == 1):
        raise AssertionError(
            "Multiple loci found for a single allele in a single population"
        )
    if not all(collapsed["#sample_sizes"] == 1):
        raise AssertionError(
            "Multiple sample_sizes found for a single allele in a single population"
        )
    # Select by `datasetID` rather than a hard-coded 'population' column so
    # the function works for any study identifier column.
    collapsed = collapsed[["allele", "loci", datasetID, "allele_freq", "sample_size"]]
    alleles_unique_in_study(collapsed, datasetID=datasetID)
    return collapsed
def unmeasured_alleles(AFtab, datasetID="population"):
    """When combining AF estimates, unreported alleles can inflate frequencies
        so AF sums to >1. Therefore we add unreported alleles with frequency zero.

    Args:
        AFtab (pd.DataFrame): Formatted allele frequency data
        datasetID (str): Unique identifier column for study

    Returns:
        pd.DataFrame: Allele frequency data with all locus alleles reported
            for each dataset
    """
    padded = AFtab.copy()
    # Handle each locus independently
    for locus in padded.loci.unique():
        at_locus = padded[padded.loci == locus]
        # Every allele seen at this locus in ANY dataset
        locus_alleles = at_locus.allele.unique()
        # Pad each dataset that reports this locus
        for dataset in at_locus[datasetID].unique():
            subset = padded[(padded[datasetID] == dataset) & (padded.loci == locus)]
            # The dataset must report a single sample size
            sizes = subset.sample_size.unique()
            if len(sizes) != 1:
                raise AssertionError(
                    "dataset_sample_size must be 1, not %s" % len(sizes)
                )
            size = sizes[0]
            # Alleles of this locus absent from this dataset get zero-frequency rows
            new_rows = pd.DataFrame(
                [
                    (allele, locus, dataset, 0, 0, size)
                    for allele in locus_alleles
                    if allele not in subset.allele.values
                ],
                columns=[
                    "allele",
                    "loci",
                    datasetID,
                    "allele_freq",
                    "carriers%",
                    "sample_size",
                ],
            )
            if not new_rows.empty:
                padded = pd.concat([padded, new_rows], ignore_index=True)
    return padded

When combining AF estimates, unreported alleles can inflate frequencies so AF sums to >1. Therefore we add unreported alleles with frequency zero.

Arguments:
  • AFtab (pd.DataFrame): Formatted allele frequency data
  • datasetID (str): Unique identifier column for study
Returns:

pd.DataFrame: Allele frequency data with all locus alleles reported for each dataset

def combineAF(
    AFtab,
    weights="2n",
    alpha=None,
    datasetID="population",
    format=True,
    ignoreG=True,
    add_unmeasured=True,
    complete=True,
    resolution=True,
    unique=True,
):
    """Combine allele frequencies from multiple studies.

    `datasetID` is the unique identifier for studies to combine.
    Allele frequencies combined using a Dirichlet distribution where each study's
    contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
    Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is
    set it is not doubled. The total concentration parameter of the Dirichlet distribution
    is the contributions from all studies plus the prior `alpha`. If `alpha` is not set
    the prior defaults to 1 observation of each allele.

    Args:
        AFtab (pd.DataFrame): Table of Allele frequency data
        weights (str, optional): Column to be weighted by allele frequency to generate
            concentration parameter of Dirichlet distribution. Defaults to '2n'.
        alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to
            None, which means 1 observation of each allele (see `default_prior()`).
        datasetID (str, optional): Unique identifier column for study. Defaults to
            'population'.
        format (bool, optional): Run `formatAF()`. Defaults to True.
        ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`.
            Defaults to True.
        add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
            important to ensure combined allele frequencies sum to 1. See
            `unmeasured_alleles()`. Defaults to True.
        complete (bool, optional): Check study completeness. Uses default values for
            `incomplete_studies()`. If you are happy with your study completeness can
            be switched off with False. Defaults to True.
        resolution (bool, optional): Check that all alleles have the same resolution,
            see `check_resolution()`. Defaults to True.
        unique (bool, optional): Check that each allele appears no more than once per
            study. See `alleles_unique_in_study()`. Defaults to True.

    Returns:
        pd.DataFrame: Allele frequencies after combining estimates from all studies.
            *allele_freq* is the combined frequency estimate from the Dirichlet mean
            where the concentration is `alpha` + `c`.
            *alpha* is the prior used for the Dirichlet distribution.
            *c* is the observations used for the Dirichlet distribution.
            *sample_size* is the total sample size of all combined studies.
            *wav* is the weighted average.
    """
    df = AFtab.copy()
    single_loci(df)
    # Optional sanity checks on the input data
    if unique:
        if not alleles_unique_in_study(df, datasetID=datasetID):
            raise AssertionError("The same allele appears multiple times in a dataset")
    if complete:
        if not incomplete_studies(df, datasetID=datasetID).empty:
            raise AssertionError(
                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
            )
    if resolution:
        if not check_resolution(df):
            raise AssertionError(
                "AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)"
            )
    if format:
        df = formatAF(df, ignoreG)
    if add_unmeasured:
        df = unmeasured_alleles(df, datasetID)
    # Sample size is doubled due to diploidy
    try:
        df["2n"] = df.sample_size * 2
    except AttributeError:
        print("column '2n' could not be created")
    # Each study's contribution to the Dirichlet concentration parameter
    df["c"] = df.allele_freq * df[weights]
    grouped = df.groupby("allele", sort=True)
    combined = grouped.apply(
        lambda row: [
            row.name,
            row.loci.unique()[0],
            np.average(row.allele_freq, weights=row[weights]),
            row.c.sum(),
            row.sample_size.sum(),
        ]
    )
    combined = pd.DataFrame(
        combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"]
    )
    combined = combined.reset_index(drop=True)
    # Check that all alleles in a locus have the same sample size
    # after merging
    if duplicated_sample_size(combined):
        id_duplicated_allele(grouped)
    # `alpha=None` (rather than a mutable default) falls back to the
    # uninformative prior of one observation per allele.
    if not alpha:
        alpha = default_prior(len(combined.allele))
    combined["alpha"] = alpha
    # Calculate Dirichlet mean for each allele
    combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean()

    return combined

Combine allele frequencies from multiple studies.

datasetID is the unique identifier for studies to combine. Allele frequencies combined using a Dirichlet distribution where each study's contribution to the concentration parameter is $2 * sample_size * allele_frequency$. Sample size is doubled to get 2n due to diploidy. If an alternative weights is set it is not doubled. The total concentration parameter of the Dirichlet distribution is the contributions from all studies plus the prior alpha. If alpha is not set the prior defaults to 1 observation of each allele.

Arguments:
  • AFtab (pd.DataFrame): Table of Allele frequency data
  • weights (str, optional): Column to be weighted by allele frequency to generate concentration parameter of Dirichlet distribution. Defaults to '2n'.
  • alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to [].
  • datasetID (str, optional): Unique identifier column for study. Defaults to 'population'.
  • format (bool, optional): Run formatAF(). Defaults to True.
  • ignoreG (bool, optional): Treat allele G groups as normal, see formatAF(). Defaults to True.
  • add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is important to ensure combined allele frequencies sum to 1. See add_unmeasured(). Defaults to True.
  • complete (bool, optional): Check study completeness. Uses default values for incomplete_studies(). If you are happy with your study completeness can be switched off with False. Defaults to True.
  • resolution (bool, optional): Check that all alleles have the same resolution, see check_resolution(). Defaults to True.
  • unique (bool, optional): Check that each allele appears no more than once per study. See alleles_unique_in_study(). Defaults to True.
Returns:

pd.DataFrame: Allele frequencies after combining estimates from all studies. allele_freq is the combined frequency estimate from the Dirichlet mean where the concentration is alpha + c. alpha is the prior used for the Dirichlet distribution. c is the observations used for the Dirichlet distribution. sample_size is the total sample size of all combined studies. wav is the weighted average.

def default_prior(k):
def default_prior(k):
    """Build a flat Dirichlet prior: one pseudo-observation per class.

    Args:
        k (int): Number of classes in the Dirichlet distribution.

    Returns:
        list: k ones, usable as a Dirichlet concentration prior.
    """
    return [1 for _ in range(k)]

Calculate a default prior, 1 observation of each class.

Arguments:
  • k (int): Number of classes in the Dirichlet distribution.
Returns:

list: List of k 1s to use as prior.

def single_loci(AFtab):
def single_loci(AFtab):
    """Check that allele frequency data is only of one locus

    Args:
        AFtab (pd.DataFrame): Allele frequency data
    """
    loci_seen = AFtab.loci.unique()
    if len(loci_seen) != 1:
        raise AssertionError("'AFtab' must contain only 1 loci")

Check that allele frequency data is only of one locus

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data
def alleles_unique_in_study(AFtab, datasetID='population'):
def alleles_unique_in_study(AFtab, datasetID="population"):
    """Are all alleles unique in each study?

    Checks that no allele is reported more than once within a single
    study, where a study is defined by `datasetID`.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
        datasetID (str, optional): Unique identifier column to define study.
            Defaults to 'population'.

    Returns:
        bool: `True` if no allele occurs more than once in any study,
            otherwise `False`.
    """
    data = AFtab.copy()
    # Count how often each (study, allele) pair appears.
    counts = data.groupby([datasetID, "allele"]).size()
    repeated = counts[counts > 1]
    if repeated.empty:
        return True
    # Repeated alleles usually mean the study identifier column is wrong.
    print(f"Non unique alleles in study, is datasetID correct? {datasetID}")
    print(repeated)
    return False

Are all alleles unique in each study?

Checks that no alleles are reported more than once in a single study. Study is defined by datasetID.

Arguments:
  • AFtab (pd.DataFrame): Allele frequency data
  • datasetID (str, optional): Unique identifier column to define study. Defaults to 'population'.
Returns:

bool: True if no alleles occur more than once in any study, otherwise False.

def duplicated_sample_size(AFtab):
def duplicated_sample_size(AFtab):
    """Return True if any locus has more than one unique sample size."""
    # dropna=False so NaN counts as a distinct sample size,
    # matching len(x.unique()).
    sizes_per_locus = AFtab.groupby("loci").sample_size.nunique(dropna=False)
    return any(sizes_per_locus != 1)

Returns True if any loci has more than 1 unique sample size

def id_duplicated_allele(grouped):
def id_duplicated_allele(grouped):
    """Report alleles whose group contains a duplicated population.

    Raises AssertionError naming the offending alleles if any allele
    group lists the same population more than once.
    """
    has_dup = grouped.population.apply(lambda pops: any(pops.duplicated()))
    if has_dup.any():
        raise AssertionError(
            f"duplicated population within allele {has_dup[has_dup].index.tolist()}"
        )

Reports the allele that has multiple sample sizes

def population_coverage(p):
def population_coverage(p):
    """Proportion of people with at least 1 copy of this allele assuming HWE.

    Args:
        p (float): Allele frequency

    Returns:
        float: Sum of homozygotes and heterozygotes for this allele
    """
    # Homozygote frequency p^2 plus heterozygote frequency 2p(1-p),
    # per Hardy-Weinberg equilibrium.
    return p ** 2 + 2 * p * (1 - p)

Proportion of people with at least 1 copy of this allele assuming HWE.

Arguments:
  • p (float): Allele frequency
Returns:

float: Sum of homozygotes and heterozygotes for this allele

def betaAB(alpha):
def betaAB(alpha):
    """Calculate `a` `b` values for all composite beta distributions.

    Given the `alpha` vector defining a Dirichlet distribution, return the
    `a` `b` pair of each marginal beta distribution: for each allele, the
    number of times it was and wasn't observed.

    Args:
        alpha (list): Values defining a Dirichlet distribution. This will be
            the prior (for a naive distribution) or the prior + caf.c for a
            posterior distribution.

    Returns:
        list: List of `(a, b)` tuples defining the beta distributions.
    """
    total = sum(alpha)
    return [(value, total - value) for value in alpha]

Calculate a b values for all composite beta distributions.

Given the alpha vector defining a Dirichlet distribution calculate the a b values for all composite beta distributions.

Arguments:
  • alpha (list): Values defining a Dirichlet distribution. This will be the prior (for a naive distribution) or the prior + caf.c for a posterior distribution.
Returns:

list: List of a b values defining beta values, i.e. for each allele it is the number of times it was and wasn't observed.

def plot_prior(concentration, ncol=2, psteps=1000, labels=''):
def plot_prior(concentration, ncol=2, psteps=1000, labels=""):
    """Plot probability density function for prior values.

    Each element of `concentration` is plotted as the marginal beta
    distribution of the corresponding Dirichlet component.

    Args:
        concentration (list): Vector of the prior Dirichlet concentration values.
        ncol (int, optional): Number of columns. Defaults to 2.
        psteps (int, optional): Number of points between 0 and 1 at which the
            PDF is evaluated. Defaults to 1000.
        labels (list, optional): Labels for elements of concentration in the same
            order. Defaults to "".
    """
    ab = betaAB(concentration)
    pline = np.linspace(0, 1, psteps)
    nrow = math.ceil(len(concentration) / ncol)
    # squeeze=False keeps `ax` 2-D even when nrow == 1 or ncol == 1,
    # otherwise the ax[-1, :] / ax[:, 0] label loops below raise IndexError.
    fig, ax = plt.subplots(nrow, ncol, sharex=True, squeeze=False)
    fig.suptitle("Probability density")
    # If labels is a list nothing happens,
    # But if it's a series it converts to a list
    labels = list(labels)
    if not labels:
        labels = [""] * len(concentration)
    if not len(concentration) == len(labels):
        raise AssertionError("concentration must be same length as labels")
    for i, alpha in enumerate(concentration):
        a, b = ab[i]
        bd = sp.stats.beta(a, b)
        # scipy's pdf accepts array input, avoiding a Python-level loop.
        pdf = bd.pdf(pline)
        ax.flat[i].plot(pline, pdf)
        ax.flat[i].set_title(labels[i])
    # Blank out any unused axes in the grid.
    for j in range(len(concentration), nrow * ncol):
        ax.flat[j].set_axis_off()
    for axi in ax[-1, :]:
        axi.set(xlabel="Allele freq")
    for axi in ax[:, 0]:
        axi.set(ylabel="PDF")
    plt.show()

Plot probability density function for prior values.

Arguments:
  • concentration (list): Vector of the prior Dirichlet concentration values.
  • ncol (int, optional): Number of columns. Defaults to 2.
  • labels (list, optional): Labels for elements of concentration in the same order. Defaults to "".
def plotAF(caf=pd.DataFrame(), AFtab=pd.DataFrame(), cols=list(mcolors.TABLEAU_COLORS.keys()), datasetID='population', hdi=pd.DataFrame(), compound_mean=pd.DataFrame()):
def plotAF(
    caf=pd.DataFrame(),
    AFtab=pd.DataFrame(),
    cols=list(mcolors.TABLEAU_COLORS.keys()),
    datasetID="population",
    hdi=pd.DataFrame(),
    compound_mean=pd.DataFrame(),
):
    """Plot allele frequency results from `HLAfreq`.

    Plot combined allele frequencies, individual allele frequencies,
    and credible intervals on combined allele frequency estimates.
    Credible interval is only plotted if a value is given for `hdi`.
    The plotted credible interval is whatever was passed to
    HLAfreq_pymc.AFhdi() when calculating `hdi`.

    Note: the default DataFrame/list arguments are created once at
    definition time; they are read-only here, so sharing them across
    calls is safe.

    Args:
        caf (pd.DataFrame, optional): Combined allele frequency estimates from
            HLAfreq.combineAF. Defaults to pd.DataFrame().
        AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults
            to pd.DataFrame().
        cols (list, optional): List of colours to use for each individual dataset.
            Defaults to list(mcolors.TABLEAU_COLORS.keys()).
        datasetID (str, optional): Column used to define separate datasets. Defaults
            to "population".
        hdi (pd.DataFrame, optional): The high density interval object to plot credible
            intervals. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Defaults to
            pd.DataFrame().
        compound_mean (pd.DataFrame, optional): The high density interval object to plot
            post_mean. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Defaults to
            pd.DataFrame().
    """
    # Plot allele frequency for each dataset
    if not AFtab.empty:
        # The colour list must cover every distinct dataset;
        # repeat the palette until it does.
        repeat_cols = int(np.ceil(AFtab[datasetID].nunique() / len(cols)))
        cols = cols * repeat_cols
        # Make a dictionary mapping datasetID to colours
        cmap = dict(zip(AFtab[datasetID].unique(), cols))
        plt.scatter(
            x=AFtab.allele_freq,
            y=AFtab.allele,
            c=[cmap[x] for x in AFtab[datasetID]],
            alpha=0.7,
            zorder=2,
        )
    # Plot combined allele frequency as hollow black circles
    if not caf.empty:
        plt.scatter(
            x=caf.allele_freq,
            y=caf.allele,
            edgecolors="black",
            facecolors="none",
            zorder=3,
        )
    # Plot high density interval, precomputed by HLAfreq_pymc.AFhdi()
    if not hdi.empty:
        for _, interval in hdi.iterrows():
            plt.hlines(
                y=interval["allele"],
                xmin=interval["lo"],
                xmax=interval["hi"],
                color="black",
            )
    # Mark the posterior mean of each allele with a vertical tick
    if not compound_mean.empty:
        for _, row in compound_mean.iterrows():
            plt.scatter(
                y=row["allele"], x=row["post_mean"], color="black", marker="|"
            )
    plt.xlabel("Allele frequency")
    plt.grid(zorder=0)
    plt.show()

Plot allele frequency results from HLAfreq.

Plot combined allele frequencies, individual allele frequencies, and credible intervals on combined allele frequency estimates. Credible interval is only plotted if a value is given for hdi. The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi() when calculating hdi.

Arguments:
  • caf (pd.DataFrame, optional): Combined allele frequency estimates from HLAfreq.combineAF. Defaults to pd.DataFrame().
  • AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults to pd.DataFrame().
  • cols (list, optional): List of colours to use for each individual dataset. Defaults to list(mcolors.TABLEAU_COLORS.keys()).
  • datasetID (str, optional): Column used to define separate datasets. Defaults to "population".
  • NOTE(review): `weights` is documented here but is not a parameter of plotAF (see the signature above); weighting is applied upstream, e.g. when computing `hdi` with HLAfreq_pymc.AFhdi().
  • hdi (pd.DataFrame, optional): The high density interval object to plot credible intervals. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
  • compound_mean (pd.DataFrame, optional): The high density interval object to plot post_mean. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().