HLAfreq.HLAfreq
Download and combine HLA allele frequencies from multiple datasets.
Download allele frequency data from allelefrequencies.net. Allele frequencies from different populations can be combined to estimate HLA frequencies of countries or other regions such as global HLA frequencies.
"""
Download and combine HLA allele frequencies from multiple datasets.

Download allele frequency data from
[allelefrequencies.net](www.allelefrequencies.net). Allele
frequencies from different populations can be combined to
estimate HLA frequencies of countries or other regions such as
global HLA frequencies.
"""

from collections.abc import Iterable
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy as sp
import matplotlib.colors as mcolors


def simulate_population(alleles: Iterable[str], locus: str, population: str):
    """Simulate observed allele frequency data for a single population.

    Draws a random sample size, samples alleles with replacement, and
    reports the observed frequency of each sampled allele.

    Args:
        alleles (Iterable[str]): Alleles to sample from.
        locus (str): Locus name recorded in the 'loci' column.
        population (str): Population name recorded in the output.

    Returns:
        pd.DataFrame: One row per distinct sampled allele with columns
            allele, loci, population, allele_freq, sample_size.
    """
    # NOTE(review): np.random.randint(low, 50) requires len(alleles) < 50.
    pop_size = np.random.randint(len(alleles), 50)
    samples = np.random.choice(alleles, pop_size, replace=True)
    counts = pd.Series(samples).value_counts()
    pop = pd.DataFrame(
        {
            "allele": counts.index,
            "loci": locus,
            "population": population,
            "allele_freq": counts.values / pop_size,
            "sample_size": pop_size,
        }
    )
    return pop


def simulate_study(alleles, populations, locus):
    """Simulate a study: several populations typed at a single locus.

    Args:
        alleles: Alleles to sample from.
        populations (int): Number of populations to simulate.
        locus (str): Locus name recorded in the output.

    Returns:
        pd.DataFrame: Concatenated simulated data for all populations.
    """
    study = []
    for i in range(populations):
        pop = simulate_population(alleles=alleles, locus=locus, population=f"pop_{i}")
        study.append(pop)

    study = pd.concat(study)
    return study


def makeURL(
    country="",
    standard="s",
    locus="",
    resolution_pattern="bigger_equal_than",
    resolution=2,
    region="",
    ethnic="",
    study_type="",
    dataset_source="",
    sample_year="",
    sample_year_pattern="",
    sample_size="",
    sample_size_pattern="",
):
    """Create URL for search of allele frequency net database.

    All arguments are documented [here](https://www.allelefrequencies.net/extaccess.asp)

    Args:
        country (str, optional): Country name to retrieve records from. Defaults to "".
        standard (str, optional): Filter study quality standard to this or higher.
            {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
        locus (str, optional): The locus to return allele data for. Defaults to "".
        resolution_pattern (str, optional): Resolution comparitor {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Filter created using `resolution` and `resolution_pattern`.
            Defaults to "bigger_equal_than".
        resolution (int, optional): Number of fields of resolution of allele. Filter
            created using `resolution` and `resolution_pattern`. Defaults to 2.
        region (str, optional): Filter to geographic region. {Asia, Australia,
            Eastern Europe, ...}.
            All regions listed [here](https://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}.
            All ethnicities listed [here](https://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor",
            "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients",
            "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
        dataset_source (str, optional): Source of data. {"Literature",
            "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
        sample_year (int, optional): Sample year to compare to. Filter created using
            sample_year and sample_year_pattern. Defaults to "".
        sample_year_pattern (str, optional): Pattern to compare sample year to. Filter
            created using sample_year and sample_year_pattern. {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Defaults to "".
        sample_size (int, optional): Sample size to compare to. Filter created using
            sample_size and sample_size_pattern. Defaults to "".
        sample_size_pattern (str, optional): Pattern to compare sample size to. Filter
            created using sample_size and sample_size_pattern. {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Defaults to "".

    Returns:
        str: URL to search allelefrequencies.net
    """
    base = "https://www.allelefrequencies.net/hla6006a.asp?"
    locus_type = "hla_locus_type=Classical&"
    hla_locus = "hla_locus=%s&" % (locus)
    country = "hla_country=%s&" % (country)
    region = "hla_region=%s&" % (region)
    ethnic = "hla_ethnic=%s&" % (ethnic)
    study_type = "hla_study=%s&" % (study_type)
    dataset_source = "hla_dataset_source=%s&" % (dataset_source)
    sample_year = "hla_sample_year=%s&" % (sample_year)
    sample_year_pattern = "hla_sample_year_pattern=%s&" % (sample_year_pattern)
    sample_size = "hla_sample_size=%s&" % (sample_size)
    sample_size_pattern = "hla_sample_size_pattern=%s&" % (sample_size_pattern)
    hla_level_pattern = "hla_level_pattern=%s&" % (resolution_pattern)
    hla_level = "hla_level=%s&" % (resolution)
    standard = "standard=%s&" % standard
    url = (
        base
        + locus_type
        + hla_locus
        + country
        + hla_level_pattern
        + hla_level
        + standard
        + region
        + ethnic
        + study_type
        + dataset_source
        + sample_year
        + sample_year_pattern
        + sample_size
        + sample_size_pattern
    )
    return url


def parseAF(bs):
    """Generate a dataframe from a given html page

    Args:
        bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page

    Returns:
        pd.DataFrame: Table of allele, allele frequency, samplesize, and population
    """
    # Get the results table from the div `divGenDetail`
    tab = bs.find("div", {"id": "divGenDetail"}).find("table", {"class": "tblNormal"})
    # Get the column headers from the first row of the table
    columns = [
        "line",
        "allele",
        "flag",
        "population",
        "carriers%",
        "allele_freq",
        "AF_graphic",
        "sample_size",
        "database",
        "distribution",
        "haplotype_association",
        "notes",
    ]
    rows = []
    for row in tab.find_all("tr"):
        rows.append([td.get_text(strip=True) for td in row.find_all("td")])
    # Make dataframe of table rows
    # skip the first row as it's `th` headers
    df = pd.DataFrame(rows[1:], columns=columns)

    # Get HLA loci
    df["loci"] = df.allele.apply(lambda x: x.split("*")[0])

    # Drop unwanted columns
    df = df[["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]]
    return df


def Npages(bs):
    """How many pages of results are there?

    Args:
        bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page

    Returns:
        int: Total number of results pages
    """
    # Get the table with number of pages
    navtab = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
    if not navtab:
        raise AssertionError(
            "navtab does not evaluate to True. Check URL returns results in web browser."
        )
    # Get cell with ' of ' in
    pagesOfN = [
        td.get_text(strip=True) for td in navtab.find_all("td") if " of " in td.text
    ]
    # Check single cell returned
    if not len(pagesOfN) == 1:
        raise AssertionError("divGenNavig should contain 1 of not %s" % len(pagesOfN))
    # Get total number of pages
    N = pagesOfN[0].split("of ")[1]
    N = int(N)
    return N


def formatAF(AFtab, ignoreG=True):
    """Format allele frequency table.

    Convert sample_size and allele_freq to numeric data type.
    Removes commas from sample size. Removes "(*)" from allele frequency if
    `ignoreG` is `True`. `formatAF()` is used internally by combineAF and getAFdata
    by default.

    Args:
        AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
            using `getAFdata()`.
        ignoreG (bool, optional): Treat G group alleles as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.

    Returns:
        pd.DataFrame: The formatted allele frequency data.
    """
    df = AFtab.copy()
    if pd.api.types.is_string_dtype(df.sample_size.dtype):
        df.sample_size = pd.to_numeric(df.sample_size.str.replace(",", ""))
    if pd.api.types.is_string_dtype(df.allele_freq.dtype):
        if ignoreG:
            # regex=False: "(*)" is a literal G-group marker, not a pattern
            df.allele_freq = df.allele_freq.str.replace("(*)", "", regex=False)
        df.allele_freq = pd.to_numeric(df.allele_freq)
    return df


def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
    """Get all allele frequency data from a search base_url.

    Iterates over all pages regardless of which page is based.

    Args:
        base_url (str): URL for base search.
        timeout (int): How long to wait to receive a response.
        format (bool): Format the downloaded data using `formatAF()`.
        ignoreG (bool): treat allele G groups as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Default = True

    Returns:
        pd.DataFrame: allele frequency data parsed into a pandas dataframe
    """
    # Get BS object from base search
    try:
        bs = BeautifulSoup(requests.get(base_url, timeout=timeout).text, "html.parser")
    except requests.exceptions.ReadTimeout:
        raise Exception(
            "Requests timeout, try a larger `timeout` value for `getAFdata()`"
        ) from None
    # How many pages of results
    N = Npages(bs)
    print("%s pages of results" % N)
    # iterate over pages, parse and combine data from each
    tabs = []
    for i in range(N):
        print(" Parsing page %s" % (i + 1), end="\r")
        url = base_url + "page=" + str(i + 1)
        try:
            bs = BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
        except requests.exceptions.ReadTimeout:
            raise Exception(
                "Requests timeout, try a larger `timeout` value for `getAFdata()`"
            ) from None
        tab = parseAF(bs)
        tabs.append(tab)
    print("Download complete")
    tabs = pd.concat(tabs)
    if format:
        try:
            tabs = formatAF(tabs, ignoreG)
        except AttributeError:
            # Best-effort formatting: return raw data rather than fail
            print("Formatting failed, non-numeric datatypes may remain.")
    return tabs


def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Report any studies with allele freqs that don't sum to 1

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study

    Returns:
        pd.Series: allele_freq sums of incomplete studies, indexed by
            (datasetID, loci).
    """
    poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum()
    lmask = poplocs < llimit
    if lmask.any():
        print(poplocs[lmask])
        print(f"{sum(lmask)} studies have total allele frequency < {llimit}")
    umask = poplocs > ulimit
    if umask.any():
        print(poplocs[umask])
        print(f"{sum(umask)} studies have total allele frequency > {ulimit}")
    incomplete = pd.concat([poplocs[lmask], poplocs[umask]])
    return incomplete


def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Returns only complete studies.

    Data is dropped if the locus for that population is not complete, i.e. doesn't
    sum to between `llimit` and `ulimit`. This prevents throwing away data if
    another loci in the population is incomplete.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study. Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
    """
    noncomplete = incomplete_studies(
        AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID
    )
    # Returns False if population AND loci are in the noncomplete.index
    # AS A PAIR
    # This is important so that we don't throw away all data on a population
    # just because one loci is incomplete.
    complete_mask = AFtab.apply(
        lambda x: (x[datasetID], x.loci) not in noncomplete.index, axis=1
    )
    df = AFtab[complete_mask]
    return df


def check_resolution(AFtab):
    """Check if all alleles in AFtab have the same resolution.
    Will print the number of records with each resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Returns:
        bool: True only if all alleles have the same resolution, else False.
    """
    # Resolution in fields = number of ':' separators + 1
    resolution = 1 + AFtab.allele.str.count(":")
    resVC = resolution.value_counts()
    pass_check = len(resVC) == 1
    if not pass_check:
        print(resVC)
        print("Multiple resolutions in AFtab. Fix with decrease_resolution()")
    return pass_check


def decrease_resolution(AFtab, newres, datasetID="population"):
    """Decrease allele resolution so all alleles have the same resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        newres (int): The desired number of fields for resolution.
        datasetID (str, optional): Column to use as study identifier.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with all alleles of requested resolution.
    """
    df = AFtab.copy()
    resolution = 1 + df.allele.str.count(":")
    if not all(resolution >= newres):
        raise AssertionError(f"Some alleles have resolution below {newres} fields")
    # Truncate each allele name to the first `newres` fields
    new_allele = df.allele.str.split(":").apply(lambda x: ":".join(x[:newres]))
    df.allele = new_allele
    collapsed = collapse_reduced_alleles(df, datasetID=datasetID)
    return collapsed


def collapse_reduced_alleles(AFtab, datasetID="population"):
    """Merge rows that share an allele name after resolution reduction.

    Within each study, allele frequencies of identical alleles are summed
    while sample size and locus are retained; both must be constant for an
    allele within a study.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        datasetID (str, optional): Unique identifier column for study.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with one row per allele per study.
    """
    df = AFtab.copy()
    # Group by alleles within datasets
    grouped = df.groupby([datasetID, "allele"])
    # Sum allele freq but keep other columns
    collapsed = grouped.apply(
        lambda row: [
            sum(row.allele_freq),
            row.sample_size.unique()[0],
            row.loci.unique()[0],
            len(row.loci.unique()),
            len(row.sample_size.unique()),
        ],
        include_groups=False
    )
    collapsed = pd.DataFrame(
        collapsed.tolist(),
        index=collapsed.index,
        columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"],
    ).reset_index()
    # Within a study all identical alleles should have the same loci and sample size
    if not all(collapsed["#loci"] == 1):
        raise AssertionError(
            "Multiple loci found for a single allele in a single population"
        )
    if not all(collapsed["#sample_sizes"] == 1):
        raise AssertionError(
            "Multiple sample_sizes found for a single allele in a single population"
        )
    # Use `datasetID` rather than a hard-coded "population" column so the
    # function honours non-default study identifier columns
    collapsed = collapsed[
        ["allele", "loci", datasetID, "allele_freq", "sample_size"]
    ]
    alleles_unique_in_study(collapsed, datasetID=datasetID)
    return collapsed


def unmeasured_alleles(AFtab, datasetID="population"):
    """When combining AF estimates, unreported alleles can inflate frequencies
    so AF sums to >1. Therefore we add unreported alleles with frequency zero.

    Args:
        AFtab (pd.DataFrame): Formatted allele frequency data
        datasetID (str): Unique identifier column for study

    Returns:
        pd.DataFrame: Allele frequency data with all locus alleles reported
            for each dataset
    """
    df = AFtab.copy()
    loci = df.loci.unique()
    # Iterate over loci separately
    for locus in loci:
        # Iterate over each dataset reporting that locus
        datasets = df[df.loci == locus][datasetID].unique()
        for dataset in datasets:
            # Single locus, single dataset
            datasetAF = df[(df[datasetID] == dataset) & (df.loci == locus)]
            # What was the sample size for this data?
            dataset_sample_size = datasetAF.sample_size.unique()
            if not (len(dataset_sample_size) == 1):
                raise AssertionError(
                    "dataset_sample_size must be 1, not %s" % len(dataset_sample_size)
                )
            dataset_sample_size = dataset_sample_size[0]
            # Get all alleles for this locus (across datasets)
            ualleles = df[df.loci == locus].allele.unique()
            # Which of these alleles are not in this dataset?
            missing_alleles = [
                allele for allele in ualleles if not allele in datasetAF.allele.values
            ]
            missing_rows = [
                (al, locus, dataset, 0, 0, dataset_sample_size)
                for al in missing_alleles
            ]
            missing_rows = pd.DataFrame(
                missing_rows,
                columns=[
                    "allele",
                    "loci",
                    datasetID,
                    "allele_freq",
                    "carriers%",
                    "sample_size",
                ],
            )
            # Add them in with zero frequency
            if not missing_rows.empty:
                df = pd.concat([df, missing_rows], ignore_index=True)
    return df


def combineAF(
    AFtab,
    weights="2n",
    alpha=None,
    datasetID="population",
    format=True,
    ignoreG=True,
    add_unmeasured=True,
    complete=True,
    resolution=True,
    unique=True,
):
    """Combine allele frequencies from multiple studies.

    `datasetID` is the unique identifier for studies to combine.
    Allele frequencies combined using a Dirichlet distribution where each study's
    contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
    Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is
    set it is not doubled. The total concentration parameter of the Dirichlet distribution
    is the contributions from all studies plus the prior `alpha`. If `alpha` is not set
    the prior defaults to 1 observation of each allele.

    Args:
        AFtab (pd.DataFrame): Table of Allele frequency data
        weights (str, optional): Column to be weighted by allele frequency to generate
            concentration parameter of Dirichlet distribution. Defaults to '2n'.
        alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to
            None, meaning 1 observation of each allele.
        datasetID (str, optional): Unique identifier column for study. Defaults to
            'population'.
        format (bool, optional): Run `formatAF()`. Defaults to True.
        ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`.
            Defaults to True.
        add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
            important to ensure combined allele frequencies sum to 1. See
            `add_unmeasured()`. Defaults to True.
        complete (bool, optional): Check study completeness. Uses default values for
            `incomplete_studies()`. If you are happy with your study completeness can
            be switched off with False. Defaults to True.
        resolution (bool, optional): Check that all alleles have the same resolution,
            see `check_resolution()`. Defaults to True.
        unique (bool, optional): Check that each allele appears no more than once per
            study. See `alleles_unique_in_study()`. Defaults to True.

    Returns:
        pd.DataFrame: Allele frequencies after combining estimates from all studies.
            *allele_freq* is the combined frequency estimate from the Dirichlet mean
            where the concentration is `alpha` + `c`.
            *alpha* is the prior used for the Dirichlet distribution.
            *c* is the observations used for the Dirichlet distribution.
            *sample_size* is the total sample size of all combined studies.
            *wav* is the weighted average.
    """
    df = AFtab.copy()
    single_loci(df)
    if unique:
        if not alleles_unique_in_study(df, datasetID=datasetID):
            raise AssertionError("The same allele appears multiple times in a dataset")
    if complete:
        if not incomplete_studies(df, datasetID=datasetID).empty:
            raise AssertionError(
                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
            )
    if resolution:
        if not check_resolution(df):
            raise AssertionError(
                "AFtab contains alleles at multiple resolutions, check check_resolution(AFtab)"
            )
    if format:
        df = formatAF(df, ignoreG)
    if add_unmeasured:
        df = unmeasured_alleles(df, datasetID)
    try:
        df["2n"] = df.sample_size * 2
    except AttributeError:
        print("column '2n' could not be created")
    df["c"] = df.allele_freq * df[weights]
    grouped = df.groupby("allele", sort=True)
    combined = grouped.apply(
        lambda row: [
            row.name,
            row.loci.unique()[0],
            np.average(row.allele_freq, weights=row[weights]),
            row.c.sum(),
            row.sample_size.sum(),
        ],
        include_groups=False
    )
    combined = pd.DataFrame(
        combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"]
    )
    combined = combined.reset_index(drop=True)
    # Check that all alleles in a locus have the same sample size
    # after merging
    if duplicated_sample_size(combined):
        id_duplicated_allele(grouped)
    # `None` sentinel avoids a mutable default and the ambiguous truth value
    # of numpy arrays; empty list/array still triggers the default prior
    if alpha is None or len(alpha) == 0:
        alpha = default_prior(len(combined.allele))
    combined["alpha"] = alpha
    # Calculate Dirichlet mean for each allele
    combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean()

    return combined


def default_prior(k):
    """Calculate a default prior, 1 observation of each class.

    Args:
        k (int): Number of classes in the Dirichlet distribution.

    Returns:
        list: List of k 1s to use as prior.
    """
    alpha = [1] * k
    return alpha


def single_loci(AFtab):
    """Check that allele frequency data is only of one locus

    Args:
        AFtab (pd.DataFrame): Allele frequency data
    """
    if not len(AFtab.loci.unique()) == 1:
        raise AssertionError("'AFtab' must contain only 1 loci")


def alleles_unique_in_study(AFtab, datasetID="population"):
    """Are all alleles unique in each study?

    Checks that no alleles are reported more than once in a single study.
    Study is defined by `datasetID`.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
        datasetID (str, optional): Unique identifier column to define study.
            Defaults to 'population'.

    Returns:
        bool: `True` on if no alleles occur more than once in any study, otherwise `False`.
    """
    df = AFtab.copy()
    grouped = df.groupby([datasetID, "allele"])
    # Are allele alleles unique? i.e. do any occur multiple times in grouping?
    unique = grouped.size()[grouped.size() > 1].empty
    if not unique:
        print(f"Non unique alleles in study, is datasetID correct? {datasetID}")
        print(grouped.size()[grouped.size() > 1])
    return unique


def duplicated_sample_size(AFtab):
    """Returns True if any loci has more than 1 unique sample size"""
    locus_sample_sizes = AFtab.groupby("loci").sample_size.apply(
        lambda x: len(x.unique())
    )
    return any(locus_sample_sizes != 1)


def id_duplicated_allele(grouped):
    """Reports the allele that has multiple sample sizes"""
    duplicated_population = grouped.population.apply(lambda x: any(x.duplicated()))
    if not all(~duplicated_population):
        raise AssertionError(
            f"duplicated population within allele {duplicated_population[duplicated_population].index.tolist()}"
        )


def population_coverage(p):
    """Proportion of people with at least 1 copy of this allele assuming HWE.

    Args:
        p (float): Allele frequency

    Returns:
        float: Sum of homozygotes and heterozygotes for this allele
    """
    q = 1 - p
    homo = p**2
    hetero = 2 * p * q
    return homo + hetero


def betaAB(alpha):
    """Calculate `a` `b` values for all composite beta distributions.

    Given the `alpha` vector defining a Dirichlet distribution calculate the `a` `b` values
    for all composite beta distributions.

    Args:
        alpha (list): Values defining a Dirichlet distribution. This will be the prior
            (for a naive distribution) or the prior + caf.c for a posterior distribution.

    Returns:
        list: List of `a` `b` values defining beta values, i.e. for each allele it is
            the number of times it was and wasn't observed.
    """
    ab = [(a, sum(alpha) - a) for a in alpha]
    return ab


def plot_prior(concentration, ncol=2, psteps=1000, labels=""):
    """Plot probability density function for prior values.

    Args:
        concentration (list): Vector of the prior Dirichlet concentration values.
        ncol (int, optional): Number of columns. Defaults to 2.
        psteps (int, optional): Number of points at which each probability
            density function is evaluated. Defaults to 1000.
        labels (list, optional): Labels for elements of concentration in the same
            order. Defaults to "".
    """
    ab = betaAB(concentration)
    pline = np.linspace(0, 1, psteps)
    nrow = math.ceil(len(concentration) / ncol)
    fig, ax = plt.subplots(nrow, ncol, sharex=True)
    # plt.subplots squeezes the returned axes (a bare Axes or 1-D array when
    # nrow or ncol is 1), which breaks the 2-D indexing below; force a 2-D grid
    ax = np.asarray(ax).reshape(nrow, ncol)
    fig.suptitle("Probability density")
    # If labels is a list nothing happens,
    # But if it's a series it converts to a list
    labels = list(labels)
    if not labels:
        labels = [""] * len(concentration)
    if not len(concentration) == len(labels):
        raise AssertionError("concentration must be same length as labels")
    for i, alpha in enumerate(concentration):
        a, b = ab[i]
        bd = sp.stats.beta(a, b)
        pdf = [bd.pdf(p) for p in pline]
        ax.flat[i].plot(pline, pdf)
        ax.flat[i].set_title(labels[i])
    for axi in ax[-1, :]:
        axi.set(xlabel="Allele freq")
    for axi in ax[:, 0]:
        axi.set(ylabel="PDF")
    plt.show()


def plotAF(
    caf=pd.DataFrame(),
    AFtab=pd.DataFrame(),
    cols=list(mcolors.TABLEAU_COLORS.keys()),
    datasetID="population",
    hdi=pd.DataFrame(),
    compound_mean=pd.DataFrame(),
):
    """Plot allele frequency results from `HLAfreq`.

    Plot combined allele frequencies, individual allele frequencies,
    and credible intervals on combined allele frequency estimates.
    Credible interval is only plotted if a value is given for `hdi`.
    The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi()
    when calculating hdi.

    Args:
        caf (pd.DataFrame, optional): Combined allele frequency estimates from
            HLAfreq.combineAF. Defaults to pd.DataFrame().
        AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults
            to pd.DataFrame().
        cols (list, optional): List of colours to use for each individual dataset.
            Defaults to list(mcolors.TABLEAU_COLORS.keys()).
        datasetID (str, optional): Column used to define separate datasets. Defaults
            to "population".
        hdi (pd.DataFrame, optional): The high density interval object to plot credible
            intervals. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
        compound_mean (pd.DataFrame, optional): The high density interval object to plot
            post_mean. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
    """
    # Plot allele frequency for each dataset
    if not AFtab.empty:
        # Cols must be longer than the list of alleles
        # If not, repeat the list of cols
        repeat_cols = np.ceil(len(AFtab[datasetID]) / len(cols))
        repeat_cols = int(repeat_cols)
        cols = cols * repeat_cols
        # Make a dictionary mapping datasetID to colours
        cmap = dict(zip(AFtab[datasetID].unique(), cols))
        plt.scatter(
            x=AFtab.allele_freq,
            y=AFtab.allele,
            c=[cmap[x] for x in AFtab[datasetID]],
            alpha=0.7,
            zorder=2,
        )
    # Plot combined allele frequency
    if not caf.empty:
        plt.scatter(
            x=caf.allele_freq,
            y=caf.allele,
            edgecolors="black",
            facecolors="none",
            zorder=3,
        )
    # Plot high density interval
    if not hdi.empty:
        for interval in hdi.iterrows():
            # .iterrows returns a index and data as a tuple for each row
            plt.hlines(
                y=interval[1]["allele"],
                xmin=interval[1]["lo"],
                xmax=interval[1]["hi"],
                color="black",
            )
    if not compound_mean.empty:
        for row in compound_mean.iterrows():
            plt.scatter(
                y=row[1]["allele"], x=row[1]["post_mean"], color="black", marker="|"
            )
    plt.xlabel("Allele frequency")
    plt.grid(zorder=0)
    plt.show()
def simulate_population(alleles: Iterable[str], locus: str, population: str):
    """Simulate observed allele frequency data for a single population.

    Draws a random sample size, samples alleles with replacement, and
    reports the observed frequency of each sampled allele. The dead
    expression `counts.values / pop_size` (result discarded) has been
    removed.

    Args:
        alleles (Iterable[str]): Alleles to sample from.
        locus (str): Locus name recorded in the 'loci' column.
        population (str): Population name recorded in the output.

    Returns:
        pd.DataFrame: One row per distinct sampled allele with columns
            allele, loci, population, allele_freq, sample_size.
    """
    # NOTE(review): np.random.randint(low, 50) requires len(alleles) < 50.
    pop_size = np.random.randint(len(alleles), 50)
    samples = np.random.choice(alleles, pop_size, replace=True)
    counts = pd.Series(samples).value_counts()
    pop = pd.DataFrame(
        {
            "allele": counts.index,
            "loci": locus,
            "population": population,
            "allele_freq": counts.values / pop_size,
            "sample_size": pop_size,
        }
    )
    return pop
def makeURL(
    country="",
    standard="s",
    locus="",
    resolution_pattern="bigger_equal_than",
    resolution=2,
    region="",
    ethnic="",
    study_type="",
    dataset_source="",
    sample_year="",
    sample_year_pattern="",
    sample_size="",
    sample_size_pattern="",
):
    """Create URL for search of allele frequency net database.

    All arguments are documented [here](https://www.allelefrequencies.net/extaccess.asp)

    Args:
        country (str, optional): Country name to retrieve records from. Defaults to "".
        standard (str, optional): Filter study quality standard to this or higher.
            {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
        locus (str, optional): The locus to return allele data for. Defaults to "".
        resolution_pattern (str, optional): Resolution comparitor {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Filter created using `resolution` and `resolution_pattern`.
            Defaults to "bigger_equal_than".
        resolution (int, optional): Number of fields of resolution of allele. Filter
            created using `resolution` and `resolution_pattern`. Defaults to 2.
        region (str, optional): Filter to geographic region. {Asia, Australia,
            Eastern Europe, ...}.
            All regions listed [here](https://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}.
            All ethnicities listed [here](https://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor",
            "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients",
            "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
        dataset_source (str, optional): Source of data. {"Literature",
            "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
        sample_year (int, optional): Sample year to compare to. Filter created using
            sample_year and sample_year_pattern. Defaults to "".
        sample_year_pattern (str, optional): Pattern to compare sample year to. Filter
            created using sample_year and sample_year_pattern. {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Defaults to "".
        sample_size (int, optional): Sample size to compare to. Filter created using
            sample_size and sample_size_pattern. Defaults to "".
        sample_size_pattern (str, optional): Pattern to compare sample size to. Filter
            created using sample_size and sample_size_pattern. {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Defaults to "".

    Returns:
        str: URL to search allelefrequencies.net
    """
    # Query parameters in the exact order expected downstream; each is
    # rendered as "key=value&", including empty values.
    query_fields = [
        ("hla_locus_type", "Classical"),
        ("hla_locus", locus),
        ("hla_country", country),
        ("hla_level_pattern", resolution_pattern),
        ("hla_level", resolution),
        ("standard", standard),
        ("hla_region", region),
        ("hla_ethnic", ethnic),
        ("hla_study", study_type),
        ("hla_dataset_source", dataset_source),
        ("hla_sample_year", sample_year),
        ("hla_sample_year_pattern", sample_year_pattern),
        ("hla_sample_size", sample_size),
        ("hla_sample_size_pattern", sample_size_pattern),
    ]
    query = "".join(f"{field}={value}&" for field, value in query_fields)
    return "https://www.allelefrequencies.net/hla6006a.asp?" + query
Create URL for search of allele frequency net database.
All arguments are documented here
Arguments:
- country (str, optional): Country name to retrieve records from. Defaults to "".
- standard (str, optional): Filter study quality standard to this or higher. {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
- locus (str, optional): The locus to return allele data for. Defaults to "".
- resolution_pattern (str, optional): Resolution comparitor {'equal', 'different',
'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
Filter created using `resolution` and `resolution_pattern`. Defaults to "bigger_equal_than".
- resolution (int, optional): Number of fields of resolution of allele. Filter created using `resolution` and `resolution_pattern`. Defaults to 2.
- region (str, optional): Filter to geographic region. {Asia, Australia, Eastern Europe, ...}. All regions listed here. Defaults to "".
- ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}. All ethnicities listed here. Defaults to "".
- study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor", "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients", "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
- dataset_source (str, optional): Source of data. {"Literature", "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
- sample_year (int, optional): Sample year to compare to. Filter created using sample_year and sample_year_pattern. Defaults to "".
- sample_year_pattern (str, optional): Pattern to compare sample year to. Filter created using sample_year and sample_year_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
- sample_size (int, optional): Sample size to compare to. Filter created using sample_size and sample_size_pattern. Defaults to "".
- sample_size_pattern (str, optional): Pattern to compare sample size to. Filter created using sample_size and sample_size_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
Returns:
str: URL to search allelefrequencies.net
def parseAF(bs):
    """Generate a dataframe from a given html page

    Args:
        bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page

    Returns:
        pd.DataFrame: Table of allele, allele frequency, samplesize, and population
    """
    # Results live in a table of class `tblNormal` inside div `divGenDetail`
    detail = bs.find("div", {"id": "divGenDetail"})
    table = detail.find("table", {"class": "tblNormal"})
    # Fixed column layout of the results table
    header = [
        "line",
        "allele",
        "flag",
        "population",
        "carriers%",
        "allele_freq",
        "AF_graphic",
        "sample_size",
        "database",
        "distribution",
        "haplotype_association",
        "notes",
    ]
    body = [
        [cell.get_text(strip=True) for cell in tr.find_all("td")]
        for tr in table.find_all("tr")
    ]
    # Skip the first row as it holds `th` headers, not data
    df = pd.DataFrame(body[1:], columns=header)
    # The locus is the part of the allele name before '*'
    df["loci"] = df.allele.apply(lambda name: name.split("*")[0])
    # Keep only the columns used downstream
    return df[
        ["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]
    ]
Generate a dataframe from a given html page
Arguments:
- bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page
Returns:
pd.DataFrame: Table of allele, allele frequency, samplesize, and population
def Npages(bs):
    """How many pages of results are there?

    Args:
        bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page

    Returns:
        int: Total number of results pages
    """
    # Page navigation sits in a `table10` table inside div `divGenNavig`
    nav = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
    if not nav:
        raise AssertionError(
            "navtab does not evaluate to True. Check URL returns results in web browser."
        )
    # Exactly one cell should read "<current> of <total>"
    cells = [
        td.get_text(strip=True) for td in nav.find_all("td") if " of " in td.text
    ]
    if len(cells) != 1:
        raise AssertionError("divGenNavig should contain 1 of not %s" % len(cells))
    # The total page count follows "of "
    return int(cells[0].split("of ")[1])
How many pages of results are there?
Arguments:
- bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page
Returns:
int: Total number of results pages
def formatAF(AFtab, ignoreG=True):
    """Format allele frequency table.

    Convert sample_size and allele_freq to numeric data type.
    Removes commas from sample size. Removes "(*)" from allele frequency if
    `ignoreG` is `True`. `formatAF()` is used internally by combineAF and getAFdata
    by default.

    Args:
        AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
            using `getAFdata()`.
        ignoreG (bool, optional): Treat G group alleles as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.

    Returns:
        pd.DataFrame: The formatted allele frequency data.
    """
    out = AFtab.copy()
    # Sample sizes may arrive as strings with thousands separators, e.g. "1,000"
    if pd.api.types.is_string_dtype(out.sample_size.dtype):
        out.sample_size = pd.to_numeric(out.sample_size.str.replace(",", ""))
    # Allele frequencies may arrive as strings, optionally tagged "(*)" for G groups
    if pd.api.types.is_string_dtype(out.allele_freq.dtype):
        if ignoreG:
            out.allele_freq = out.allele_freq.str.replace("(*)", "", regex=False)
        out.allele_freq = pd.to_numeric(out.allele_freq)
    return out
Format allele frequency table.
Convert sample_size and allele_freq to numeric data type.
Removes commas from sample size. Removes "(*)" from allele frequency if
ignoreG is True. formatAF() is used internally by combineAF and getAFdata
by default.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
using `getAFdata()`.
- ignoreG (bool, optional): Treat G group alleles as normal. See https://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.
Returns:
pd.DataFrame: The formatted allele frequency data.
def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
    """Get all allele frequency data from a search base_url.

    Iterates over all pages regardless of which page is based.

    Args:
        base_url (str): URL for base search.
        timeout (int): How long to wait to receive a response.
        format (bool): Format the downloaded data using `formatAF()`.
        ignoreG (bool): treat allele G groups as normal.
            See https://hla.alleles.org/alleles/g_groups.html for details. Default = True

    Returns:
        pd.DataFrame: allele frequency data parsed into a pandas dataframe
    """

    def _fetch(url):
        # Download one page and return it parsed as BeautifulSoup.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.ReadTimeout:
            raise Exception(
                "Requests timeout, try a larger `timeout` value for `getAFdata()`"
            ) from None
        return BeautifulSoup(response.text, "html.parser")

    # Total number of result pages, read from the base search page
    total = Npages(_fetch(base_url))
    print("%s pages of results" % total)
    # Parse each page in turn and collect the tables
    pages = []
    for page in range(1, total + 1):
        print(" Parsing page %s" % page, end="\r")
        pages.append(parseAF(_fetch(base_url + "page=" + str(page))))
    print("Download complete")
    out = pd.concat(pages)
    if format:
        try:
            out = formatAF(out, ignoreG)
        except AttributeError:
            print("Formatting failed, non-numeric datatypes may remain.")
    return out
Get all allele frequency data from a search base_url.
Iterates over all pages regardless of which page is passed.
Arguments:
- base_url (str): URL for base search.
- timeout (int): How long to wait to receive a response.
- format (bool): Format the downloaded data using `formatAF()`.
- ignoreG (bool): Treat allele G groups as normal. See https://hla.alleles.org/alleles/g_groups.html for details. Default = True
Returns:
pd.DataFrame: allele frequency data parsed into a pandas dataframe
def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Report any studies with allele freqs that don't sum to 1

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study

    Returns:
        pd.Series: Allele frequency sums falling outside [llimit, ulimit],
            indexed by (datasetID, loci).
    """
    # Total allele frequency per study per locus; ~1 when the study is complete
    poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum()
    lmask = poplocs < llimit
    # BUG FIX: was `if sum(lmask > 0):` — the comparison belongs on the sum.
    # (On a boolean mask the two happen to agree, but the parenthesis was misplaced.)
    if sum(lmask) > 0:
        print(poplocs[lmask])
        print(f"{sum(lmask)} studies have total allele frequency < {llimit}")
    umask = poplocs > ulimit
    # BUG FIX: same misplaced parenthesis as above.
    if sum(umask) > 0:
        print(poplocs[umask])
        print(f"{sum(umask)} studies have total allele frequency > {ulimit}")
    incomplete = pd.concat([poplocs[lmask], poplocs[umask]])
    return incomplete
Report any studies with allele freqs that don't sum to 1
Arguments:
- AFtab (pd.DataFrame): Dataframe containing multiple studies
- llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
- ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
- datasetID (str): Unique identifier column for study
def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Returns only complete studies.

    Data is dropped if the locus for that population is not complete, i.e. doesn't
    sum to between `llimit` and `ulimit`. This prevents throwing away data if
    another loci in the population is incomplete.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study. Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
    """
    flagged = incomplete_studies(
        AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID
    )
    # Keep a row only when its (dataset, locus) PAIR is not flagged.
    # Matching on the pair matters: a population is only dropped for the
    # locus that is incomplete, not for its other loci.
    keep = AFtab.apply(
        lambda row: (row[datasetID], row.loci) not in flagged.index, axis=1
    )
    return AFtab[keep]
Returns only complete studies.
Data is dropped if the locus for that population is not complete, i.e. doesn't
sum to between llimit and ulimit. This prevents throwing away data if
another loci in the population is incomplete.
Arguments:
- AFtab (pd.DataFrame): Dataframe containing multiple studies
- llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
- ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
- datasetID (str): Unique identifier column for study. Defaults to 'population'.
Returns:
pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
def check_resolution(AFtab):
    """Check if all alleles in AFtab have the same resolution.
    Will print the number of records with each resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Returns:
        bool: True only if all alleles have the same resolution, else False.
    """
    # Resolution = number of ':'-separated fields in the allele name
    fields = 1 + AFtab.allele.str.count(":")
    field_counts = fields.value_counts()
    uniform = len(field_counts) == 1
    if not uniform:
        print(field_counts)
        print("Multiple resolutions in AFtab. Fix with decrease_resolution()")
    return uniform
Check if all alleles in AFtab have the same resolution. Will print the number of records with each resolution.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data
Returns:
bool: True only if all alleles have the same resolution, else False.
def decrease_resolution(AFtab, newres, datasetID="population"):
    """Decrease allele resolution so all alleles have the same resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        newres (int): The desired number of fields for resolution.
        datasetID (str, optional): Column to use as stud identifier.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with all alleles of requested resolution.
    """
    df = AFtab.copy()
    # Resolution can only be reduced, never increased
    fields = 1 + df.allele.str.count(":")
    if (fields < newres).any():
        raise AssertionError(f"Some alleles have resolution below {newres} fields")
    # Truncate each allele name to the first `newres` fields
    df.allele = df.allele.str.split(":").apply(
        lambda parts: ":".join(parts[:newres])
    )
    # Truncation can make alleles collide within a study; merge them
    return collapse_reduced_alleles(df, datasetID=datasetID)
Decrease allele resolution so all alleles have the same resolution.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data.
- newres (int): The desired number of fields for resolution.
- datasetID (str, optional): Column to use as stud identifier. Defaults to 'population'.
Returns:
pd.DataFrame: Allele frequency data with all alleles of requested resolution.
def collapse_reduced_alleles(AFtab, datasetID="population"):
    """Merge rows that share an allele name after resolution reduction.

    After `decrease_resolution()` several alleles may collapse to the same
    name within a study. Their frequencies are summed; locus and sample size
    are carried through and verified to be consistent.

    Args:
        AFtab (pd.DataFrame): Allele frequency data (typically after the
            allele names have been truncated).
        datasetID (str, optional): Unique identifier column for study.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with one row per allele per study.
    """
    df = AFtab.copy()
    # Group by alleles within datasets
    grouped = df.groupby([datasetID, "allele"])
    # Sum allele freq but keep other columns; also count distinct loci and
    # sample sizes so consistency can be checked below
    collapsed = grouped.apply(
        lambda row: [
            sum(row.allele_freq),
            row.sample_size.unique()[0],
            row.loci.unique()[0],
            len(row.loci.unique()),
            len(row.sample_size.unique()),
        ],
        include_groups=False
    )
    collapsed = pd.DataFrame(
        collapsed.tolist(),
        index=collapsed.index,
        columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"],
    ).reset_index()
    # Within a study all identical alleles should have the same loci and sample size
    if not all(collapsed["#loci"] == 1):
        raise AssertionError(
            "Multiple loci found for a single allele in a single population"
        )
    if not all(collapsed["#sample_sizes"] == 1):
        raise AssertionError(
            "Multiple sample_sizes found for a single allele in a single population"
        )
    # BUG FIX: select the study identifier column via `datasetID` rather than
    # the hard-coded "population", which raised KeyError for other identifiers.
    collapsed = collapsed[
        ["allele", "loci", datasetID, "allele_freq", "sample_size"]
    ]
    # BUG FIX: forward datasetID so the uniqueness check groups correctly.
    alleles_unique_in_study(collapsed, datasetID=datasetID)
    return collapsed
def unmeasured_alleles(AFtab, datasetID="population"):
    """When combining AF estimates, unreported alleles can inflate frequencies
    so AF sums to >1. Therefore we add unreported alleles with frequency zero.

    Args:
        AFtab (pd.DataFrame): Formatted allele frequency data
        datasetID (str): Unique identifier column for study

    Returns:
        pd.DataFrame: Allele frequency data with all locus alleles reported
        for each dataset
    """
    df = AFtab.copy()
    filler_cols = [
        "allele",
        "loci",
        datasetID,
        "allele_freq",
        "carriers%",
        "sample_size",
    ]
    # Handle each locus separately
    for locus in df.loci.unique():
        # Each dataset reporting this locus (evaluated once, before rows are added)
        for dataset in df[df.loci == locus][datasetID].unique():
            # Rows for this single locus in this single dataset
            subset = df[(df[datasetID] == dataset) & (df.loci == locus)]
            # A single study must report exactly one sample size
            sizes = subset.sample_size.unique()
            if not (len(sizes) == 1):
                raise AssertionError(
                    "dataset_sample_size must be 1, not %s" % len(sizes)
                )
            size = sizes[0]
            # Alleles reported anywhere at this locus but absent from this dataset
            reported = set(subset.allele.values)
            absent = [
                a for a in df[df.loci == locus].allele.unique() if a not in reported
            ]
            filler = pd.DataFrame(
                [(a, locus, dataset, 0, 0, size) for a in absent],
                columns=filler_cols,
            )
            # Record the missing alleles with zero frequency
            if not filler.empty:
                df = pd.concat([df, filler], ignore_index=True)
    return df
When combining AF estimates, unreported alleles can inflate frequencies so AF sums to >1. Therefore we add unreported alleles with frequency zero.
Arguments:
- AFtab (pd.DataFrame): Formatted allele frequency data
- datasetID (str): Unique identifier column for study
Returns:
pd.DataFrame: Allele frequency data with all locus alleles reported for each dataset
def combineAF(
    AFtab,
    weights="2n",
    alpha=[],
    datasetID="population",
    format=True,
    ignoreG=True,
    add_unmeasured=True,
    complete=True,
    resolution=True,
    unique=True,
):
    """Combine allele frequencies from multiple studies.

    `datasetID` is the unique identifier for studies to combine.
    Allele frequencies combined using a Dirichlet distribution where each study's
    contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
    Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is
    set it is not doubled. The total concentration parameter of the Dirichlet distribution
    is the contributions from all studies plus the prior `alpha`. If `alpha` is not set
    the prior defaults to 1 observation of each allele.

    Args:
        AFtab (pd.DataFrame): Table of Allele frequency data
        weights (str, optional): Column to be weighted by allele frequency to generate
            concentration parameter of Dirichlet distribution. Defaults to '2n'.
        alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to [].
            (The mutable default is safe here: it is rebound, never mutated in place.)
        datasetID (str, optional): Unique identifier column for study. Defaults to
            'population'.
        format (bool, optional): Run `formatAF()`. Defaults to True.
        ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`.
            Defaults to True.
        add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
            important to ensure combined allele frequencies sum to 1. See
            `unmeasured_alleles()`. Defaults to True.
        complete (bool, optional): Check study completeness. Uses default values for
            `incomplete_studies()`. If you are happy with your study completeness can
            be switched off with False. Defaults to True.
        resolution (bool, optional): Check that all alleles have the same resolution,
            see `check_resolution()`. Defaults to True.
        unique (bool, optional): Check that each allele appears no more than once per
            study. See `alleles_unique_in_study()`. Defaults to True.

    Returns:
        pd.DataFrame: Allele frequencies after combining estimates from all studies.
            *allele_freq* is the combined frequency estimate from the Dirichlet mean
            where the concentration is `alpha` + `c`.
            *alpha* is the prior used for the Dirichlet distribution.
            *c* is the observations used for the Dirichlet distribution.
            *sample_size* is the total sample size of all combined studies.
            *wav* is the weighted average.
    """
    df = AFtab.copy()
    # Combining across loci is not meaningful; insist on a single locus
    single_loci(df)
    # Optional input validation; each check can be disabled by its flag
    if unique:
        if not alleles_unique_in_study(df, datasetID=datasetID):
            raise AssertionError("The same allele appears multiple times in a dataset")
    if complete:
        if not incomplete_studies(df, datasetID=datasetID).empty:
            raise AssertionError(
                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
            )
    if resolution:
        if not check_resolution(df):
            raise AssertionError(
                "AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)"
            )
    if format:
        df = formatAF(df, ignoreG)
    # Add zero-frequency rows for alleles a study did not report, so that
    # combined frequencies still sum to 1
    if add_unmeasured:
        df = unmeasured_alleles(df, datasetID)
    # Diploid: each individual contributes two observations
    try:
        df["2n"] = df.sample_size * 2
    except AttributeError:
        print("column '2n' could not be created")
    # Per-row contribution to the Dirichlet concentration parameter
    df["c"] = df.allele_freq * df[weights]
    grouped = df.groupby("allele", sort=True)
    # Per allele: locus, weighted-average frequency, total concentration
    # contribution, and total sample size across studies
    combined = grouped.apply(
        lambda row: [
            row.name,
            row.loci.unique()[0],
            np.average(row.allele_freq, weights=row[weights]),
            row.c.sum(),
            row.sample_size.sum(),
        ],
        include_groups=False
    )
    combined = pd.DataFrame(
        combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"]
    )
    combined = combined.reset_index(drop=True)
    # Check that all alleles in a locus have the same sample size
    # after merging
    if duplicated_sample_size(combined):
        id_duplicated_allele(grouped)
    if not alpha:
        alpha = default_prior(len(combined.allele))
    combined["alpha"] = alpha
    # Calculate Dirichlet mean for each allele
    combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean()

    return combined
Combine allele frequencies from multiple studies.
datasetID is the unique identifier for studies to combine.
Allele frequencies combined using a Dirichlet distribution where each study's
contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
Sample size is doubled to get 2n due to diploidy. If an alternative weights is
set it is not doubled. The total concentration parameter of the Dirichlet distribution
is the contributions from all studies plus the prior alpha. If alpha is not set
the prior defaults to 1 observation of each allele.
Arguments:
- AFtab (pd.DataFrame): Table of Allele frequency data
- weights (str, optional): Column to be weighted by allele frequency to generate concentration parameter of Dirichlet distribution. Defaults to '2n'.
- alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to [].
- datasetID (str, optional): Unique identifier column for study. Defaults to 'population'.
- format (bool, optional): Run `formatAF()`. Defaults to True.
- ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`. Defaults to True.
- add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is important to ensure combined allele frequencies sum to 1. See `unmeasured_alleles()`. Defaults to True.
- complete (bool, optional): Check study completeness. Uses default values for `incomplete_studies()`. If you are happy with your study completeness it can be switched off with False. Defaults to True.
- resolution (bool, optional): Check that all alleles have the same resolution, see `check_resolution()`. Defaults to True.
- unique (bool, optional): Check that each allele appears no more than once per study. See `alleles_unique_in_study()`. Defaults to True.
Returns:
pd.DataFrame: Allele frequencies after combining estimates from all studies. *allele_freq* is the combined frequency estimate from the Dirichlet mean where the concentration is `alpha` + `c`. *alpha* is the prior used for the Dirichlet distribution. *c* is the observations used for the Dirichlet distribution. *sample_size* is the total sample size of all combined studies. *wav* is the weighted average.
def default_prior(k):
    """Calculate a default prior, 1 observation of each class.

    Args:
        k (int): Number of classes in the Dirichlet distribution.

    Returns:
        list: List of k 1s to use as prior.
    """
    # One pseudo-observation per class gives a uniform Dirichlet prior
    return [1 for _ in range(k)]
Calculate a default prior, 1 observation of each class.
Arguments:
- k (int): Number of classes in the Dirichlet distribution.
Returns:
list: List of k 1s to use as prior.
def single_loci(AFtab):
    """Check that allele frequency data is only of one locus

    Args:
        AFtab (pd.DataFrame): Allele frequency data
    """
    n_loci = len(AFtab.loci.unique())
    if n_loci != 1:
        raise AssertionError("'AFtab' must contain only 1 loci")
Check that allele frequency data is only of one locus
Arguments:
- AFtab (pd.DataFrame): Allele frequency data
def alleles_unique_in_study(AFtab, datasetID="population"):
    """Are all alleles unique in each study?

    Checks that no alleles are reported more than once in a single study.
    Study is defined by `datasetID`.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
        datasetID (str, optional): Unique identifier column to define study.
            Defaults to 'population'.

    Returns:
        bool: `True` if no alleles occur more than once in any study, otherwise `False`.
    """
    df = AFtab.copy()
    # Count occurrences of each (study, allele) pair; >1 means a duplicate
    sizes = df.groupby([datasetID, "allele"]).size()
    duplicates = sizes[sizes > 1]
    unique = duplicates.empty
    if not unique:
        print(f"Non unique alleles in study, is datasetID correct? {datasetID}")
        print(duplicates)
    return unique
Are all alleles unique in each study?
Checks that no alleles are reported more than once in a single study.
Study is defined by datasetID.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data
- datasetID (str, optional): Unique identifier column to define study. Defaults to 'population'.
Returns:
bool: `True` if no alleles occur more than once in any study, otherwise `False`.
def duplicated_sample_size(AFtab):
    """Returns True if any loci has more than 1 unique sample size"""
    # Number of distinct sample sizes reported per locus
    n_sizes = AFtab.groupby("loci").sample_size.apply(lambda s: s.unique().size)
    return any(n_sizes != 1)
Returns True if any loci has more than 1 unique sample size
def id_duplicated_allele(grouped):
    """Reports the allele that has mupltiple sample sizes"""
    # Within each allele group, flag populations that appear more than once
    duplicated_population = grouped.population.apply(
        lambda pops: any(pops.duplicated())
    )
    if any(duplicated_population):
        raise AssertionError(
            f"duplicated population within allele {duplicated_population[duplicated_population].index.tolist()}"
        )
Reports the allele that has mupltiple sample sizes
def population_coverage(p):
    """Proportion of people with at least 1 copy of this allele assuming HWE.

    Args:
        p (float): Allele frequency

    Returns:
        float: Sum of homozygotes and heterozygotes for this allele
    """
    # Hardy-Weinberg: p^2 homozygotes plus 2pq heterozygotes
    absent = 1 - p
    return p * p + 2 * p * absent
Proportion of people with at least 1 copy of this allele assuming HWE.
Arguments:
- p (float): Allele frequency
Returns:
float: Sum of homozygotes and heterozygotes for this allele
def betaAB(alpha):
    """Calculate `a` `b` values for all composite beta distributions.

    Given the `alpha` vector defining a Dirichlet distribution calculate the `a` `b` values
    for all composite beta distributions.

    Args:
        alpha (list): Values defining a Dirichlet distribution. This will be the prior
            (for a naive distribution) or the prior + caf.c for a posterior distribution.

    Returns:
        list: List of `a` `b` values defining beta values, i.e. for each allele it is
            the number of times it was and wasn't observed.
    """
    # The marginal of a Dirichlet component is Beta(a, total - a)
    total = sum(alpha)
    return [(a, total - a) for a in alpha]
Calculate a b values for all composite beta distributions.
Given the alpha vector defining a Dirichlet distribution calculate the a b values
for all composite beta distributions.
Arguments:
- alpha (list): Values defining a Dirichlet distribution. This will be the prior (for a naive distribution) or the prior + caf.c for a posterior distribution.
Returns:
list: List of `a` `b` values defining beta values, i.e. for each allele it is the number of times it was and wasn't observed.
def plot_prior(concentration, ncol=2, psteps=1000, labels=""):
    """Plot probability density function for prior values.

    Args:
        concentration (list): Vector of the prior Dirichlet concentration values.
        ncol (int, optional): Number of columns. Defaults to 2.
        psteps (int, optional): Number of points between 0 and 1 at which each
            beta PDF is evaluated. Defaults to 1000.
        labels (list, optional): Labels for elements of concentration in the same
            order. Defaults to "".
    """
    ab = betaAB(concentration)
    pline = np.linspace(0, 1, psteps)
    nrow = math.ceil(len(concentration) / ncol)
    fig, ax = plt.subplots(nrow, ncol, sharex=True)
    # BUG FIX: plt.subplots squeezes the axes array (1-D when nrow == 1,
    # a bare Axes when nrow == ncol == 1), which broke the 2-D indexing
    # below. Reshape to a guaranteed (nrow, ncol) array.
    ax = np.asarray(ax).reshape(nrow, ncol)
    fig.suptitle("Probability density")
    # If labels is a list nothing happens,
    # But if it's a series it converts to a list
    labels = list(labels)
    if not labels:
        labels = [""] * len(concentration)
    if not len(concentration) == len(labels):
        raise AssertionError("concentration must be same length as labels")
    for i, alpha in enumerate(concentration):
        # Marginal beta distribution for this concentration component
        a, b = ab[i]
        bd = sp.stats.beta(a, b)
        pdf = [bd.pdf(p) for p in pline]
        ax.flat[i].plot(pline, pdf)
        ax.flat[i].set_title(labels[i])
    # Shared axis labels: bottom row only for x, first column only for y
    for axi in ax[-1, :]:
        axi.set(xlabel="Allele freq")
    for axi in ax[:, 0]:
        axi.set(ylabel="PDF")
    plt.show()
Plot probability density function for prior values.
Arguments:
- concentration (list): Vector of the prior Dirichlet concentration values.
- ncol (int, optional): Number of columns. Defaults to 2.
- labels (list, optional): Labels for elements of concentration in the same order. Defaults to "".
def plotAF(
    caf=pd.DataFrame(),
    AFtab=pd.DataFrame(),
    cols=list(mcolors.TABLEAU_COLORS.keys()),
    datasetID="population",
    hdi=pd.DataFrame(),
    compound_mean=pd.DataFrame(),
):
    """Plot allele frequency results from `HLAfreq`.

    Plot combined allele frequencies, individual allele frequencies,
    and credible intervals on combined allele frequency estimates.
    Credible interval is only plotted if a value is given for `hdi`.
    The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi()
    when calculating hdi.

    Any combination of the optional arguments may be supplied; each
    non-empty dataframe adds one layer to the plot.

    NOTE: the `cols` default list is evaluated once at import time; it is
    not mutated here (a repeated copy is bound locally), so this is safe.

    Args:
        caf (pd.DataFrame, optional): Combined allele frequency estimates from
            HLAfreq.combineAF. Defaults to pd.DataFrame().
        AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults
            to pd.DataFrame().
        cols (list, optional): List of colours to use for each individual dataset.
            Defaults to list(mcolors.TABLEAU_COLORS.keys()).
        datasetID (str, optional): Column used to define separate datasets. Defaults
            to "population".
        hdi (pd.DataFrame, optional): The high density interval object to plot credible
            intervals. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
        compound_mean (pd.DataFrame, optional): The high density interval object to plot
            post_mean. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
    """
    # Plot allele frequency for each dataset
    if not AFtab.empty:
        # Cols must be longer than the list of alleles
        # If not, repeat the list of cols
        repeat_cols = np.ceil(len(AFtab[datasetID]) / len(cols))
        repeat_cols = int(repeat_cols)
        cols = cols * repeat_cols
        # Make a dictionary mapping datasetID to colours
        cmap = dict(zip(AFtab[datasetID].unique(), cols))
        plt.scatter(
            x=AFtab.allele_freq,
            y=AFtab.allele,
            c=[cmap[x] for x in AFtab[datasetID]],
            alpha=0.7,
            zorder=2,
        )
    # Plot combined allele frequency
    if not caf.empty:
        # Hollow black circles so individual points remain visible underneath
        plt.scatter(
            x=caf.allele_freq,
            y=caf.allele,
            edgecolors="black",
            facecolors="none",
            zorder=3,
        )
    # Plot high density interval
    if not hdi.empty:
        # assert not AFtab.empty, "AFtab is needed to calculate credible interval"
        # from HLAfreq import HLAfreq_pymc as HLAhdi
        # print("Fitting model with PyMC, make take a few seconds")
        # hdi = HLAhdi.AFhdi(
        #     AFtab=AFtab,
        #     weights=weights,
        #     datasetID=datasetID,
        #     credible_interval=credible_interval,
        #     conc_mu=conc_mu,
        #     conc_sigma=conc_sigma
        # )
        for interval in hdi.iterrows():
            # .iterrows returns a index and data as a tuple for each row
            plt.hlines(
                y=interval[1]["allele"],
                xmin=interval[1]["lo"],
                xmax=interval[1]["hi"],
                color="black",
            )
    # Mark the posterior mean for each allele as a vertical tick
    if not compound_mean.empty:
        for row in compound_mean.iterrows():
            plt.scatter(
                y=row[1]["allele"], x=row[1]["post_mean"], color="black", marker="|"
            )
    plt.xlabel("Allele frequency")
    plt.grid(zorder=0)
    plt.show()
Plot allele frequency results from HLAfreq.
Plot combined allele frequencies, individual allele frequencies,
and credible intervals on combined allele frequency estimates.
Credible interval is only plotted if a value is given for hdi.
The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi()
when calculating hdi.
Arguments:
- caf (pd.DataFrame, optional): Combined allele frequency estimates from HLAfreq.combineAF. Defaults to pd.DataFrame().
- AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults to pd.DataFrame().
- cols (list, optional): List of colours to use for each individual dataset. Defaults to list(mcolors.TABLEAU_COLORS.keys()).
- datasetID (str, optional): Column used to define separate datasets. Defaults to "population".
- Note: dataset weighting (e.g. by the '2n' column, used as the concentration parameter of the Dirichlet distribution) is applied when computing `hdi` with HLAfreq_pymc.AFhdi(), not by plotAF itself, which takes no weights argument.
- hdi (pd.DataFrame, optional): The high density interval object to plot credible intervals. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().
- compound_mean (pd.DataFrame, optional): The high density interval object to plot post_mean. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame().