HLAfreq.HLAfreq
Download and combine HLA allele frequencies from multiple datasets.
Download allele frequency data from allelefrequencies.net. Allele frequencies from different populations can be combined to estimate HLA frequencies of countries or other regions such as global HLA frequencies.
1""" 2Download and combine HLA allele frequencies from multiple datasets. 3 4Download allele frequency data from 5[allelefrequencies.net](www.allelefrequencies.net). Allele 6frequencies from different populations can be combined to 7estimate HLA frequencies of countries or other regions such as 8global HLA frequencies. 9""" 10 11from collections.abc import Iterable 12from bs4 import BeautifulSoup 13import requests 14import pandas as pd 15import numpy as np 16import matplotlib.pyplot as plt 17import math 18import scipy as sp 19import matplotlib.colors as mcolors 20 21 22def simulate_population(alleles: Iterable[str], locus: str, population: str): 23 pop_size = np.random.randint(len(alleles), 50) 24 samples = np.random.choice(alleles, pop_size, replace=True) 25 counts = pd.Series(samples).value_counts() 26 counts.values / pop_size 27 pop = pd.DataFrame( 28 { 29 "allele": counts.index, 30 "loci": locus, 31 "population": population, 32 "allele_freq": counts.values / pop_size, 33 "sample_size": pop_size, 34 } 35 ) 36 return pop 37 38 39def simulate_study(alleles, populations, locus): 40 study = [] 41 for i in range(populations): 42 pop = simulate_population(alleles=alleles, locus=locus, population=f"pop_{i}") 43 study.append(pop) 44 45 study = pd.concat(study) 46 return study 47 48 49def makeURL( 50 country="", 51 standard="s", 52 locus="", 53 resolution_pattern="bigger_equal_than", 54 resolution=2, 55 region="", 56 ethnic="", 57 study_type="", 58 dataset_source="", 59 sample_year="", 60 sample_year_pattern="", 61 sample_size="", 62 sample_size_pattern="", 63): 64 """Create URL for search of allele frequency net database. 65 66 All arguments are documented [here](http://www.allelefrequencies.net/extaccess.asp) 67 68 Args: 69 country (str, optional): Country name to retrieve records from. Defaults to "". 70 standard (str, optional): Filter study quality standard to this or higher. 71 {'g', 's', 'a'} Gold, silver, all. Defaults to 's'. 
72 locus (str, optional): The locus to return allele data for. Defaults to "". 73 resolution_pattern (str, optional): Resolution comparitor {'equal', 'different', 74 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. 75 Filter created using `resolution` and `resolution_pattern`. 76 Defaults to "bigger_equal_than". 77 resolution (int, optional): Number of fields of resolution of allele. Filter 78 created using `resolution` and `resolution_pattern`. Defaults to 2. 79 region (str, optional): Filter to geographic region. {Asia, Australia, 80 Eastern Europe, ...}. 81 All regions listed [here](http://www.allelefrequencies.net/pop6003a.asp). 82 Defaults to "". 83 ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}. 84 All ethnicities listed [here](http://www.allelefrequencies.net/pop6003a.asp). 85 Defaults to "". 86 study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor", 87 "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients", 88 "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "". 89 dataset_source (str, optional): Source of data. {"Literature", 90 "Proceedings+of+IHWs", "Unpublished"}. Defaults to "". 91 sample_year (int, optional): Sample year to compare to. Filter created using 92 sample_year and sample_year_pattern. Defaults to "". 93 sample_year_pattern (str, optional): Pattern to compare sample year to. Filter 94 created using sample_year and sample_year_pattern. {'equal', 'different', 95 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. 96 Defaults to "". 97 sample_size (int, optional): Sample size to compare to. Filter created using 98 sample_size and sample_size_pattern. Defaults to "". 99 sample_size_pattern (str, optional): Pattern to compare sample size to. Filter 100 created using sample_size and sample_size_pattern. {'equal', 'different', 101 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. 
102 Defaults to "". 103 104 Returns: 105 str: URL to search allelefrequencies.net 106 """ 107 base = "http://www.allelefrequencies.net/hla6006a.asp?" 108 locus_type = "hla_locus_type=Classical&" 109 hla_locus = "hla_locus=%s&" % (locus) 110 country = "hla_country=%s&" % (country) 111 region = "hla_region=%s&" % (region) 112 ethnic = "hla_ethnic=%s&" % (ethnic) 113 study_type = "hla_study=%s&" % (study_type) 114 dataset_source = "hla_dataset_source=%s&" % (dataset_source) 115 sample_year = "hla_sample_year=%s&" % (sample_year) 116 sample_year_pattern = "hla_sample_year_pattern=%s&" % (sample_year_pattern) 117 sample_size = "hla_sample_size=%s&" % (sample_size) 118 sample_size_pattern = "hla_sample_size_pattern=%s&" % (sample_size_pattern) 119 hla_level_pattern = "hla_level_pattern=%s&" % (resolution_pattern) 120 hla_level = "hla_level=%s&" % (resolution) 121 standard = "standard=%s&" % standard 122 url = ( 123 base 124 + locus_type 125 + hla_locus 126 + country 127 + hla_level_pattern 128 + hla_level 129 + standard 130 + region 131 + ethnic 132 + study_type 133 + dataset_source 134 + sample_year 135 + sample_year_pattern 136 + sample_size 137 + sample_size_pattern 138 ) 139 return url 140 141 142def parseAF(bs): 143 """Generate a dataframe from a given html page 144 145 Args: 146 bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page 147 148 Returns: 149 pd.DataFrame: Table of allele, allele frequency, samplesize, and population 150 """ 151 # Get the results table from the div `divGenDetail` 152 tab = bs.find("div", {"id": "divGenDetail"}).find("table", {"class": "tblNormal"}) 153 # Get the column headers from the first row of the table 154 columns = [ 155 "line", 156 "allele", 157 "flag", 158 "population", 159 "carriers%", 160 "allele_freq", 161 "AF_graphic", 162 "sample_size", 163 "database", 164 "distribution", 165 "haplotype_association", 166 "notes", 167 ] 168 rows = [] 169 for row in tab.find_all("tr"): 170 
rows.append([td.get_text(strip=True) for td in row.find_all("td")]) 171 # Make dataframe of table rows 172 # skip the first row as it's `th` headers 173 df = pd.DataFrame(rows[1:], columns=columns) 174 175 # Get HLA loci 176 df["loci"] = df.allele.apply(lambda x: x.split("*")[0]) 177 178 # Drop unwanted columns 179 df = df[["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]] 180 return df 181 182 183def Npages(bs): 184 """How many pages of results are there? 185 186 Args: 187 bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page 188 189 Returns: 190 int: Total number of results pages 191 """ 192 # Get the table with number of pages 193 navtab = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"}) 194 if not navtab: 195 raise AssertionError( 196 "navtab does not evaluate to True. Check URL returns results in web browser." 197 ) 198 # Get cell with ' of ' in 199 pagesOfN = [ 200 td.get_text(strip=True) for td in navtab.find_all("td") if " of " in td.text 201 ] 202 # Check single cell returned 203 if not len(pagesOfN) == 1: 204 raise AssertionError("divGenNavig should contain 1 of not %s" % len(pagesOfN)) 205 # Get total number of pages 206 N = pagesOfN[0].split("of ")[1] 207 N = int(N) 208 return N 209 210 211def formatAF(AFtab, ignoreG=True): 212 """Format allele frequency table. 213 214 Convert sample_size and allele_freq to numeric data type. 215 Removes commas from sample size. Removes "(*)" from allele frequency if 216 `ignoreG` is `True`. `formatAF()` is used internally by combineAF and getAFdata 217 by default. 218 219 Args: 220 AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net 221 using `getAFdata()`. 222 ignoreG (bool, optional): Treat G group alleles as normal. 223 See http://hla.alleles.org/alleles/g_groups.html for details. Defaults to True. 224 225 Returns: 226 pd.DataFrame: The formatted allele frequency data. 
227 """ 228 df = AFtab.copy() 229 if df.sample_size.dtype == "O": 230 df.sample_size = pd.to_numeric(df.sample_size.str.replace(",", "")) 231 if df.allele_freq.dtype == "O": 232 if ignoreG: 233 df.allele_freq = df.allele_freq.str.replace("(*)", "", regex=False) 234 df.allele_freq = pd.to_numeric(df.allele_freq) 235 return df 236 237 238def getAFdata(base_url, timeout=20, format=True, ignoreG=True): 239 """Get all allele frequency data from a search base_url. 240 241 Iterates over all pages regardless of which page is based. 242 243 Args: 244 base_url (str): URL for base search. 245 timeout (int): How long to wait to receive a response. 246 format (bool): Format the downloaded data using `formatAF()`. 247 ignoreG (bool): treat allele G groups as normal. 248 See http://hla.alleles.org/alleles/g_groups.html for details. Default = True 249 250 Returns: 251 pd.DataFrame: allele frequency data parsed into a pandas dataframe 252 """ 253 # Get BS object from base search 254 try: 255 bs = BeautifulSoup(requests.get(base_url, timeout=timeout).text, "html.parser") 256 except requests.exceptions.ReadTimeout as e: 257 raise Exception( 258 "Requests timeout, try a larger `timeout` value for `getAFdata()`" 259 ) from None 260 # How many pages of results 261 N = Npages(bs) 262 print("%s pages of results" % N) 263 # iterate over pages, parse and combine data from each 264 tabs = [] 265 for i in range(N): 266 # print (" Parsing page %s" %(i+1)) 267 print(" Parsing page %s" % (i + 1), end="\r") 268 url = base_url + "page=" + str(i + 1) 269 try: 270 bs = BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser") 271 except requests.exceptions.ReadTimeout as e: 272 raise Exception( 273 "Requests timeout, try a larger `timeout` value for `getAFdata()`" 274 ) from None 275 tab = parseAF(bs) 276 tabs.append(tab) 277 print("Download complete") 278 tabs = pd.concat(tabs) 279 if format: 280 try: 281 tabs = formatAF(tabs, ignoreG) 282 except AttributeError: 283 print("Formatting 
failed, non-numeric datatypes may remain.") 284 return tabs 285 286 287def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"): 288 """Report any studies with allele freqs that don't sum to 1 289 290 Args: 291 AFtab (pd.DataFrame): Dataframe containing multiple studies 292 llimit (float, optional): Lower allele_freq sum limit that counts as complete. 293 Defaults to 0.95. 294 ulimit (float, optional): Upper allele_freq sum limit that will not be reported. 295 Defaults to 1.1. 296 datasetID (str): Unique identifier column for study 297 """ 298 poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum() 299 lmask = poplocs < llimit 300 if sum(lmask > 0): 301 print(poplocs[lmask]) 302 print(f"{sum(lmask)} studies have total allele frequency < {llimit}") 303 umask = poplocs > ulimit 304 if sum(umask > 0): 305 print(poplocs[umask]) 306 print(f"{sum(umask)} studies have total allele frequency > {ulimit}") 307 incomplete = pd.concat([poplocs[lmask], poplocs[umask]]) 308 return incomplete 309 310 311def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"): 312 """Returns only complete studies. 313 314 Data is dropped if the locus for that population is not complete, i.e. doesn't 315 sum to between `llimit` and `ulimit`. This prevents throwing away data if 316 another loci in the population is incomplete. 317 318 Args: 319 AFtab (pd.DataFrame): Dataframe containing multiple studies 320 llimit (float, optional): Lower allele_freq sum limit that counts as complete. 321 Defaults to 0.95. 322 ulimit (float, optional): Upper allele_freq sum limit that will not be reported. 323 Defaults to 1.1. 324 datasetID (str): Unique identifier column for study. Defaults to 'population'. 325 326 Returns: 327 pd.DataFrame: Allele frequency data of multiple studies, but only complete studies. 
328 """ 329 noncomplete = incomplete_studies( 330 AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID 331 ) 332 # Returns False if population AND loci are in the noncomplete.index 333 # AS A PAIR 334 # This is important so that we don't throw away all data on a population 335 # just because one loci is incomplete. 336 complete_mask = AFtab.apply( 337 lambda x: (x[datasetID], x.loci) not in noncomplete.index, axis=1 338 ) 339 df = AFtab[complete_mask] 340 return df 341 342 343def check_resolution(AFtab): 344 """Check if all alleles in AFtab have the same resolution. 345 Will print the number of records with each resolution. 346 347 Args: 348 AFtab (pd.DataFrame): Allele frequency data 349 350 Returns: 351 bool: True only if all alleles have the same resolution, else False. 352 """ 353 resolution = 1 + AFtab.allele.str.count(":") 354 resVC = resolution.value_counts() 355 pass_check = len(resVC) == 1 356 if not pass_check: 357 print(resVC) 358 print("Multiple resolutions in AFtab. Fix with decrease_resolution()") 359 return pass_check 360 361 362def decrease_resolution(AFtab, newres, datasetID="population"): 363 """Decrease allele resolution so all alleles have the same resolution. 364 365 Args: 366 AFtab (pd.DataFrame): Allele frequency data. 367 newres (int): The desired number of fields for resolution. 368 datasetID (str, optional): Column to use as stud identifier. 369 Defaults to 'population'. 370 371 Returns: 372 pd.DataFrame: Allele frequency data with all alleles of requested resolution. 
373 """ 374 df = AFtab.copy() 375 resolution = 1 + df.allele.str.count(":") 376 if not all(resolution >= newres): 377 raise AssertionError(f"Some alleles have resolution below {newres} fields") 378 new_allele = df.allele.str.split(":").apply(lambda x: ":".join(x[:newres])) 379 df.allele = new_allele 380 collapsed = collapse_reduced_alleles(df, datasetID=datasetID) 381 return collapsed 382 383 384def collapse_reduced_alleles(AFtab, datasetID="population"): 385 df = AFtab.copy() 386 # Group by alleles within datasets 387 grouped = df.groupby([datasetID, "allele"]) 388 # Sum allele freq but keep other columns 389 collapsed = grouped.apply( 390 lambda row: [ 391 sum(row.allele_freq), 392 row.sample_size.unique()[0], 393 row.loci.unique()[0], 394 len(row.loci.unique()), 395 len(row.sample_size.unique()), 396 ] 397 ) 398 collapsed = pd.DataFrame( 399 collapsed.tolist(), 400 index=collapsed.index, 401 columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"], 402 ).reset_index() 403 # Within a study each all identical alleles should have the same loci and sample size 404 if not all(collapsed["#loci"] == 1): 405 raise AssertionError( 406 "Multiple loci found for a single allele in a single population" 407 ) 408 if not all(collapsed["#sample_sizes"] == 1): 409 raise AssertionError( 410 "Multiple sample_sizes found for a single allele in a single population" 411 ) 412 collapsed = collapsed[ 413 ["allele", "loci", "population", "allele_freq", "sample_size"] 414 ] 415 alleles_unique_in_study(collapsed) 416 return collapsed 417 418 419def unmeasured_alleles(AFtab, datasetID="population"): 420 """When combining AF estimates, unreported alleles can inflate frequencies 421 so AF sums to >1. Therefore we add unreported alleles with frequency zero. 
422 423 Args: 424 AFtab (pd.DataFrame): Formatted allele frequency data 425 datasetID (str): Unique identifier column for study 426 427 Returns: 428 pd.DataFrame: Allele frequency data with all locus alleles reported 429 for each dataset 430 """ 431 df = AFtab.copy() 432 loci = df.loci.unique() 433 # Iterate over loci separately 434 for locus in loci: 435 # Iterate over each dataset reporting that locus 436 datasets = df[df.loci == locus][datasetID].unique() 437 for dataset in datasets: 438 # Single locus, single dataset 439 datasetAF = df[(df[datasetID] == dataset) & (df.loci == locus)] 440 # What was the sample size for this data? 441 dataset_sample_size = datasetAF.sample_size.unique() 442 if not (len(dataset_sample_size) == 1): 443 raise AssertionError( 444 "dataset_sample_size must be 1, not %s" % len(dataset_sample_size) 445 ) 446 dataset_sample_size = dataset_sample_size[0] 447 # Get all alleles for this locus (across datasets) 448 ualleles = df[df.loci == locus].allele.unique() 449 # Which of these alleles are not in this dataset? 450 missing_alleles = [ 451 allele for allele in ualleles if not allele in datasetAF.allele.values 452 ] 453 missing_rows = [ 454 (al, locus, dataset, 0, 0, dataset_sample_size) 455 for al in missing_alleles 456 ] 457 missing_rows = pd.DataFrame( 458 missing_rows, 459 columns=[ 460 "allele", 461 "loci", 462 datasetID, 463 "allele_freq", 464 "carriers%", 465 "sample_size", 466 ], 467 ) 468 # Add them in with zero frequency 469 if not missing_rows.empty: 470 df = pd.concat([df, missing_rows], ignore_index=True) 471 return df 472 473 474def combineAF( 475 AFtab, 476 weights="2n", 477 alpha=[], 478 datasetID="population", 479 format=True, 480 ignoreG=True, 481 add_unmeasured=True, 482 complete=True, 483 resolution=True, 484 unique=True, 485): 486 """Combine allele frequencies from multiple studies. 487 488 `datasetID` is the unique identifier for studies to combine. 
489 Allele frequencies combined using a Dirichlet distribution where each study's 490 contribution to the concentration parameter is $2 * sample_size * allele_frequency$. 491 Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is 492 set it is not doubled. The total concentration parameter of the Dirichlet distribution 493 is the contributions from all studies plus the prior `alpha`. If `alpha` is not set 494 the prior defaults to 1 observation of each allele. 495 496 Args: 497 AFtab (pd.DataFrame): Table of Allele frequency data 498 weights (str, optional): Column to be weighted by allele frequency to generate 499 concentration parameter of Dirichlet distribution. Defaults to '2n'. 500 alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to []. 501 datasetID (str, optional): Unique identifier column for study. Defaults to 502 'population'. 503 format (bool, optional): Run `formatAF()`. Defaults to True. 504 ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`. 505 Defaults to True. 506 add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is 507 important to ensure combined allele frequencies sum to 1. See 508 `add_unmeasured()`. Defaults to True. 509 complete (bool, optional): Check study completeness. Uses default values for 510 `incomplete_studies()`. If you are happy with your study completeness can 511 be switched off with False. Defaults to True. 512 resolution (bool, optional): Check that all alleles have the same resolution, 513 see `check_resolution()`. Defaults to True. 514 unique (bool, optional): Check that each allele appears no more than once per 515 study. See `alleles_unique_in_study()`. Defaults to True. 516 517 Returns: 518 pd.DataFrame: Allele frequencies after combining estimates from all studies. 519 *allele_freq* is the combined frequency estimate from the Dirichlet mean 520 where the concentration is `alpha` + `c`. 
521 *alpha* is the prior used for the Dirichlet distribution. 522 *c* is the observations used for the Dirichlet distribution. 523 *sample_size* is the total sample size of all combined studies. 524 *wav* is the weighted average. 525 """ 526 df = AFtab.copy() 527 single_loci(df) 528 if unique: 529 if not alleles_unique_in_study(df, datasetID=datasetID): 530 raise AssertionError("The same allele appears multiple times in a dataset") 531 if complete: 532 if not incomplete_studies(df, datasetID=datasetID).empty: 533 raise AssertionError( 534 "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)" 535 ) 536 if resolution: 537 if not check_resolution(df): 538 raise AssertionError( 539 "AFtab conains alleles at multiple resolutions, check check_resolution(AFtab)" 540 ) 541 if format: 542 df = formatAF(df, ignoreG) 543 if add_unmeasured: 544 df = unmeasured_alleles(df, datasetID) 545 try: 546 df["2n"] = df.sample_size * 2 547 except AttributeError: 548 print("column '2n' could not be created") 549 df["c"] = df.allele_freq * df[weights] 550 grouped = df.groupby("allele", sort=True) 551 combined = grouped.apply( 552 lambda row: [ 553 row.name, 554 row.loci.unique()[0], 555 np.average(row.allele_freq, weights=row[weights]), 556 row.c.sum(), 557 row.sample_size.sum(), 558 ] 559 ) 560 combined = pd.DataFrame( 561 combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"] 562 ) 563 combined = combined.reset_index(drop=True) 564 # Check that all alleles in a locus have the same sample size 565 # after merging 566 if duplicated_sample_size(combined): 567 id_duplicated_allele(grouped) 568 if not alpha: 569 alpha = default_prior(len(combined.allele)) 570 combined["alpha"] = alpha 571 # Calculate Dirichlet mean for each allele 572 combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean() 573 574 return combined 575 576 577def default_prior(k): 578 """Calculate a default prior, 1 observation of each class. 
579 580 Args: 581 k (int): Number of classes in the Dirichlet distribution. 582 583 Returns: 584 list: List of k 1s to use as prior. 585 """ 586 alpha = [1] * k 587 return alpha 588 589 590def single_loci(AFtab): 591 """Check that allele frequency data is only of one locus 592 593 Args: 594 AFtab (pd.DataFrame): Allele frequency data 595 """ 596 if not len(AFtab.loci.unique()) == 1: 597 raise AssertionError("'AFtab' must contain only 1 loci") 598 599 600def alleles_unique_in_study(AFtab, datasetID="population"): 601 """Are all alleles unique in each study? 602 603 Checks that no alleles are reported more than once in a single study. 604 Study is defined by `datasetID`. 605 606 Args: 607 AFtab (pd.DataFrame): Allele frequency data 608 datasetID (str, optional): Unique identifier column to define study. 609 Defaults to 'population'. 610 611 Returns: 612 bool: `True` on if no alleles occur more than once in any study, otherwise `False`. 613 """ 614 df = AFtab.copy() 615 grouped = df.groupby([datasetID, "allele"]) 616 # Are allele alleles unique? i.e. do any occur multiple times in grouping? 617 unique = grouped.size()[grouped.size() > 1].empty 618 if not unique: 619 print(f"Non unique alleles in study, is datasetID correct? 
{datasetID}") 620 print(grouped.size()[grouped.size() > 1]) 621 return unique 622 623 624def duplicated_sample_size(AFtab): 625 """Returns True if any loci has more than 1 unique sample size""" 626 locus_sample_sizes = AFtab.groupby("loci").sample_size.apply( 627 lambda x: len(x.unique()) 628 ) 629 return any(locus_sample_sizes != 1) 630 631 632def id_duplicated_allele(grouped): 633 """Reports the allele that has mupltiple sample sizes""" 634 duplicated_population = grouped.population.apply(lambda x: any(x.duplicated())) 635 if not all(~duplicated_population): 636 raise AssertionError( 637 f"duplicated population within allele {duplicated_population[duplicated_population].index.tolist()}" 638 ) 639 640 641def population_coverage(p): 642 """Proportion of people with at least 1 copy of this allele assuming HWE. 643 644 Args: 645 p (float): Allele frequency 646 647 Returns: 648 float: Sum of homozygotes and heterozygotes for this allele 649 """ 650 q = 1 - p 651 homo = p**2 652 hetero = 2 * p * q 653 return homo + hetero 654 655 656def betaAB(alpha): 657 """Calculate `a` `b` values for all composite beta distributions. 658 659 Given the `alpha` vector defining a Dirichlet distribution calculate the `a` `b` values 660 for all composite beta distributions. 661 662 Args: 663 alpha (list): Values defining a Dirichlet distribution. This will be the prior 664 (for a naive distribution) or the prior + caf.c for a posterior distribution. 665 666 Returns: 667 list: List of `a` `b` values defining beta values, i.e. for each allele it is 668 the number of times it was and wasn't observed. 669 """ 670 ab = [(a, sum(alpha) - a) for a in alpha] 671 return ab 672 673 674# def betaCI(a,b,credible_interval=0.95): 675# """Calculat the central credible interval of a beta distribution 676 677# Args: 678# a (float): Beta shape parameter `a`, i.e. the number of times the allele was observed. 679# b (float): Beta shape parameter `b`, i.e. the number of times the allele was not observed. 
680# credible_interval (float, optional): The size of the credible interval requested. Defaults to 0.95. 681 682# Returns: 683# tuple: Lower and upper credible interval of beta distribution. 684# """ 685# bd = sp.stats.beta(a,b) 686# lower_quantile = (1-credible_interval)/2 687# upper_quantile = 1-lower_quantile 688# lower_interval = bd.ppf(lower_quantile) 689# upper_interval = bd.ppf(upper_quantile) 690# return lower_interval, upper_interval 691 692# def AFci(caf, credible_interval=0.95): 693# """Calculate credible interval for combined allele frequency table. 694# Note that this ignores sampling error so confidence interval is too tight. 695# Use HLAhdi.AFhdi() instead. 696 697# Args: 698# caf (pd.DataFrame): Table produced by combineAF() 699# credible_interval (float, optional): The desired confidence interval. Defaults to 0.95. 700 701# Returns: 702# list: Lower and upper credible intervals as a list of tuples 703# """ 704# ab = betaAB( 705# caf.alpha + caf.c, 706# ) 707# ci = [betaCI(a, b, credible_interval) for a,b in ab] 708# return ci 709 710 711def plot_prior(concentration, ncol=2, psteps=1000, labels=""): 712 """Plot probability density function for prior values. 713 714 Args: 715 concentration (list): Vector of the prior Dirichlet concentration values. 716 ncol (int, optional): Number of columns. Defaults to 2. 717 labels (list, optional): Labels for elements of concentration in the same 718 order. Defaults to "". 
719 """ 720 ab = betaAB(concentration) 721 pline = np.linspace(0, 1, psteps) 722 nrow = math.ceil(len(concentration) / ncol) 723 fig, ax = plt.subplots(nrow, ncol, sharex=True) 724 fig.suptitle("Probability density") 725 # If labels is a list nothing happens, 726 # But if it's a series it converts to a list 727 labels = list(labels) 728 if not labels: 729 labels = [""] * len(concentration) 730 if not len(concentration) == len(labels): 731 raise AssertionError("concentration must be same length as labels") 732 for i, alpha in enumerate(concentration): 733 a, b = ab[i] 734 bd = sp.stats.beta(a, b) 735 pdf = [bd.pdf(p) for p in pline] 736 ax.flat[i].plot(pline, pdf) 737 ax.flat[i].set_title(labels[i]) 738 for axi in ax[-1, :]: 739 axi.set(xlabel="Allele freq") 740 for axi in ax[:, 0]: 741 axi.set(ylabel="PDF") 742 plt.show() 743 744 745def plotAF( 746 caf=pd.DataFrame(), 747 AFtab=pd.DataFrame(), 748 cols=list(mcolors.TABLEAU_COLORS.keys()), 749 datasetID="population", 750 hdi=pd.DataFrame(), 751 compound_mean=pd.DataFrame(), 752): 753 """Plot allele frequency results from `HLAfreq`. 754 755 Plot combined allele frequencies, individual allele frequencies, 756 and credible intervals on combined allele frequency estimates. 757 Credible interval is only plotted if a value is given for `hdi`. 758 The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi() 759 when calculating hdi. 760 761 Args: 762 caf (pd.DataFrame, optional): Combined allele frequency estimates from 763 HLAfreq.combineAF. Defaults to pd.DataFrame(). 764 AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults 765 to pd.DataFrame(). 766 cols (list, optional): List of colours to use for each individual dataset. 767 Defaults to list(mcolors.TABLEAU_COLORS.keys()). 768 datasetID (str, optional): Column used to define separate datasets. Defaults 769 to "population". 
770 weights (str, optional): Column to be weighted by allele frequency to generate 771 concentration parameter of Dirichlet distribution. Defaults to '2n'. 772 hdi (pd.DataFrame, optional): The high density interval object to plot credible 773 intervals. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame(). 774 compound_mean (pd.DataFrame, optional): The high density interval object to plot 775 post_mean. Produced by HLAfreq.HLA_pymc.AFhdi(). Defaults to pd.DataFrame(). 776 """ 777 # Plot allele frequency for each dataset 778 if not AFtab.empty: 779 # Cols must be longer than the list of alleles 780 # If not, repeat the list of cols 781 repeat_cols = np.ceil(len(AFtab[datasetID]) / len(cols)) 782 repeat_cols = int(repeat_cols) 783 cols = cols * repeat_cols 784 # Make a dictionary mapping datasetID to colours 785 cmap = dict(zip(AFtab[datasetID].unique(), cols)) 786 plt.scatter( 787 x=AFtab.allele_freq, 788 y=AFtab.allele, 789 c=[cmap[x] for x in AFtab[datasetID]], 790 alpha=0.7, 791 zorder=2, 792 ) 793 # Plot combined allele frequency 794 if not caf.empty: 795 plt.scatter( 796 x=caf.allele_freq, 797 y=caf.allele, 798 edgecolors="black", 799 facecolors="none", 800 zorder=3, 801 ) 802 # Plot high density interval 803 if not hdi.empty: 804 # assert not AFtab.empty, "AFtab is needed to calculate credible interval" 805 # from HLAfreq import HLAfreq_pymc as HLAhdi 806 # print("Fitting model with PyMC, make take a few seconds") 807 # hdi = HLAhdi.AFhdi( 808 # AFtab=AFtab, 809 # weights=weights, 810 # datasetID=datasetID, 811 # credible_interval=credible_interval, 812 # conc_mu=conc_mu, 813 # conc_sigma=conc_sigma 814 # ) 815 for interval in hdi.iterrows(): 816 # .iterrows returns a index and data as a tuple for each row 817 plt.hlines( 818 y=interval[1]["allele"], 819 xmin=interval[1]["lo"], 820 xmax=interval[1]["hi"], 821 color="black", 822 ) 823 if not compound_mean.empty: 824 for row in compound_mean.iterrows(): 825 plt.scatter( 826 y=row[1]["allele"], 
x=row[1]["post_mean"], color="black", marker="|" 827 ) 828 plt.xlabel("Allele frequency") 829 plt.grid(zorder=0) 830 plt.show()
def simulate_population(alleles: Iterable[str], locus: str, population: str):
    """Simulate allele frequency data for a single population.

    A random population size is drawn, alleles are sampled uniformly with
    replacement, and the observed frequency of each sampled allele is
    reported. (Removed a dead no-op expression from the original.)

    Args:
        alleles (Iterable[str]): Alleles to sample from.
        locus (str): Locus name recorded on every row.
        population (str): Population name recorded on every row.

    Returns:
        pd.DataFrame: Simulated allele frequency data for one population.
    """
    # NOTE(review): np.random.randint(low, 50) requires len(alleles) < 50,
    # otherwise ValueError — confirm the intended size ceiling.
    pop_size = np.random.randint(len(alleles), 50)
    samples = np.random.choice(alleles, pop_size, replace=True)
    counts = pd.Series(samples).value_counts()
    pop = pd.DataFrame(
        {
            "allele": counts.index,
            "loci": locus,
            "population": population,
            "allele_freq": counts.values / pop_size,
            "sample_size": pop_size,
        }
    )
    return pop
def makeURL(
    country="",
    standard="s",
    locus="",
    resolution_pattern="bigger_equal_than",
    resolution=2,
    region="",
    ethnic="",
    study_type="",
    dataset_source="",
    sample_year="",
    sample_year_pattern="",
    sample_size="",
    sample_size_pattern="",
):
    """Build the search URL for the allele frequency net database.

    All arguments are documented [here](http://www.allelefrequencies.net/extaccess.asp)

    Args:
        country (str, optional): Country name to retrieve records from. Defaults to "".
        standard (str, optional): Filter study quality standard to this or higher.
            {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
        locus (str, optional): The locus to return allele data for. Defaults to "".
        resolution_pattern (str, optional): Resolution comparator {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Filter created using `resolution` and `resolution_pattern`.
            Defaults to "bigger_equal_than".
        resolution (int, optional): Number of fields of resolution of allele. Filter
            created using `resolution` and `resolution_pattern`. Defaults to 2.
        region (str, optional): Filter to geographic region. {Asia, Australia,
            Eastern Europe, ...}.
            All regions listed [here](http://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}.
            All ethnicities listed [here](http://www.allelefrequencies.net/pop6003a.asp).
            Defaults to "".
        study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor",
            "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients",
            "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
        dataset_source (str, optional): Source of data. {"Literature",
            "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
        sample_year (int, optional): Sample year to compare to. Filter created using
            sample_year and sample_year_pattern. Defaults to "".
        sample_year_pattern (str, optional): Pattern to compare sample year to. Filter
            created using sample_year and sample_year_pattern. {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Defaults to "".
        sample_size (int, optional): Sample size to compare to. Filter created using
            sample_size and sample_size_pattern. Defaults to "".
        sample_size_pattern (str, optional): Pattern to compare sample size to. Filter
            created using sample_size and sample_size_pattern. {'equal', 'different',
            'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}.
            Defaults to "".

    Returns:
        str: URL to search allelefrequencies.net
    """
    # Query parameters in the exact order the site's search endpoint expects.
    params = [
        ("hla_locus_type", "Classical"),
        ("hla_locus", locus),
        ("hla_country", country),
        ("hla_level_pattern", resolution_pattern),
        ("hla_level", resolution),
        ("standard", standard),
        ("hla_region", region),
        ("hla_ethnic", ethnic),
        ("hla_study", study_type),
        ("hla_dataset_source", dataset_source),
        ("hla_sample_year", sample_year),
        ("hla_sample_year_pattern", sample_year_pattern),
        ("hla_sample_size", sample_size),
        ("hla_sample_size_pattern", sample_size_pattern),
    ]
    # Each key=value pair is terminated by '&', matching the site's format.
    query = "".join("%s=%s&" % (key, value) for key, value in params)
    return "http://www.allelefrequencies.net/hla6006a.asp?" + query
Create URL for search of allele frequency net database.
All arguments are documented here
Arguments:
- country (str, optional): Country name to retrieve records from. Defaults to "".
- standard (str, optional): Filter study quality standard to this or higher. {'g', 's', 'a'} Gold, silver, all. Defaults to 's'.
- locus (str, optional): The locus to return allele data for. Defaults to "".
- resolution_pattern (str, optional): Resolution comparator {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Filter created using `resolution` and `resolution_pattern`. Defaults to "bigger_equal_than".
- resolution (int, optional): Number of fields of resolution of allele. Filter created using `resolution` and `resolution_pattern`. Defaults to 2.
- region (str, optional): Filter to geographic region. {Asia, Australia, Eastern Europe, ...}. All regions listed here. Defaults to "".
- ethnic (str, optional): Filter to ethnicity. {"Amerindian", "Black", "Caucasian", ...}. All ethnicities listed here. Defaults to "".
- study_type (str, optional): Type of study. {"Anthropology", "Blood+Donor", "Bone+Marrow+Registry", "Controls+for+Disease+Study", "Disease+Study+Patients", "Other", "Solid+Organd+Unrelated+Donors", "Stem+cell+donors"}. Defaults to "".
- dataset_source (str, optional): Source of data. {"Literature", "Proceedings+of+IHWs", "Unpublished"}. Defaults to "".
- sample_year (int, optional): Sample year to compare to. Filter created using sample_year and sample_year_pattern. Defaults to "".
- sample_year_pattern (str, optional): Pattern to compare sample year to. Filter created using sample_year and sample_year_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
- sample_size (int, optional): Sample size to compare to. Filter created using sample_size and sample_size_pattern. Defaults to "".
- sample_size_pattern (str, optional): Pattern to compare sample size to. Filter created using sample_size and sample_size_pattern. {'equal', 'different', 'less_than', 'bigger_than', 'less_equal_than', 'bigger_equal_than'}. Defaults to "".
Returns:
str: URL to search allelefrequencies.net
def parseAF(bs):
    """Parse one allelefrequencies.net results page into a dataframe.

    Args:
        bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page

    Returns:
        pd.DataFrame: Table of allele, allele frequency, sample size, and population
    """
    # Results live in the `tblNormal` table inside the div `divGenDetail`
    detail = bs.find("div", {"id": "divGenDetail"}).find(
        "table", {"class": "tblNormal"}
    )
    header = [
        "line",
        "allele",
        "flag",
        "population",
        "carriers%",
        "allele_freq",
        "AF_graphic",
        "sample_size",
        "database",
        "distribution",
        "haplotype_association",
        "notes",
    ]
    # Collect the text of every cell, row by row
    cells = [
        [td.get_text(strip=True) for td in tr.find_all("td")]
        for tr in detail.find_all("tr")
    ]
    # The first row holds `th` headers, so skip it
    df = pd.DataFrame(cells[1:], columns=header)
    # The locus is the part of the allele name before '*'
    df["loci"] = df.allele.apply(lambda a: a.split("*")[0])
    # Keep only the columns downstream code uses
    return df[
        ["allele", "loci", "population", "allele_freq", "carriers%", "sample_size"]
    ]
Generate a dataframe from a given html page
Arguments:
- bs (bs4.BeautifulSoup): BeautifulSoup object from allelefrequencies.net page
Returns:
pd.DataFrame: Table of allele, allele frequency, samplesize, and population
def Npages(bs):
    """Count the total number of result pages.

    Args:
        bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page

    Returns:
        int: Total number of results pages
    """
    # The navigation table contains a "page x of N" cell
    nav = bs.find("div", {"id": "divGenNavig"}).find("table", {"class": "table10"})
    if not nav:
        raise AssertionError(
            "navtab does not evaluate to True. Check URL returns results in web browser."
        )
    # Keep only the cell(s) containing ' of '
    cells = [
        td.get_text(strip=True) for td in nav.find_all("td") if " of " in td.text
    ]
    # Exactly one such cell is expected
    if len(cells) != 1:
        raise AssertionError("divGenNavig should contain 1 of not %s" % len(cells))
    # Everything after "of " is the total page count
    return int(cells[0].split("of ")[1])
How many pages of results are there?
Arguments:
- bs (bs4.BeautifulSoup): BS object of allelefrequencies.net results page
Returns:
int: Total number of results pages
def formatAF(AFtab, ignoreG=True):
    """Coerce allele frequency columns to numeric types.

    Strips thousands separators from `sample_size` and, when `ignoreG`
    is `True`, the "(*)" G-group marker from `allele_freq`, then converts
    both columns to numbers. Used internally by combineAF and getAFdata
    by default.

    Args:
        AFtab (pd.DataFrame): Allele frequency data downloaded from
            allelefrequency.net using `getAFdata()`.
        ignoreG (bool, optional): Treat G group alleles as normal.
            See http://hla.alleles.org/alleles/g_groups.html for details.
            Defaults to True.

    Returns:
        pd.DataFrame: The formatted allele frequency data.
    """
    out = AFtab.copy()
    # Sample sizes may arrive as strings like "1,000"
    if out.sample_size.dtype == "O":
        out.sample_size = pd.to_numeric(out.sample_size.str.replace(",", ""))
    # Allele frequencies may carry a "(*)" G-group suffix
    if out.allele_freq.dtype == "O":
        if ignoreG:
            out.allele_freq = out.allele_freq.str.replace("(*)", "", regex=False)
        out.allele_freq = pd.to_numeric(out.allele_freq)
    return out
Format allele frequency table.
Convert sample_size and allele_freq to numeric data type.
Removes commas from sample size. Removes "(*)" from allele frequency if `ignoreG` is `True`. `formatAF()` is used internally by combineAF and getAFdata by default.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data downloaded from allelefrequency.net
using
getAFdata()
. - ignoreG (bool, optional): Treat G group alleles as normal. See http://hla.alleles.org/alleles/g_groups.html for details. Defaults to True.
Returns:
pd.DataFrame: The formatted allele frequency data.
def getAFdata(base_url, timeout=20, format=True, ignoreG=True):
    """Get all allele frequency data from a search base_url.

    Iterates over all result pages regardless of which page `base_url`
    points at.

    Args:
        base_url (str): URL for base search.
        timeout (int): How long (seconds) to wait to receive each response.
        format (bool): Format the downloaded data using `formatAF()`.
        ignoreG (bool): treat allele G groups as normal.
            See http://hla.alleles.org/alleles/g_groups.html for details. Default = True

    Returns:
        pd.DataFrame: allele frequency data parsed into a pandas dataframe

    Raises:
        Exception: If a request exceeds `timeout` seconds.
    """
    # Get BS object from base search
    try:
        bs = BeautifulSoup(requests.get(base_url, timeout=timeout).text, "html.parser")
    except requests.exceptions.ReadTimeout as e:
        # `from None` hides the requests traceback; the message tells the
        # user the actionable fix (increase `timeout`).
        raise Exception(
            "Requests timeout, try a larger `timeout` value for `getAFdata()`"
        ) from None
    # How many pages of results
    N = Npages(bs)
    print("%s pages of results" % N)
    # iterate over pages, parse and combine data from each
    tabs = []
    for i in range(N):
        # `end="\r"` rewrites the progress message in place
        print(" Parsing page %s" % (i + 1), end="\r")
        # Page index in the site's query string is 1-based
        url = base_url + "page=" + str(i + 1)
        try:
            bs = BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
        except requests.exceptions.ReadTimeout as e:
            raise Exception(
                "Requests timeout, try a larger `timeout` value for `getAFdata()`"
            ) from None
        tab = parseAF(bs)
        tabs.append(tab)
    print("Download complete")
    tabs = pd.concat(tabs)
    if format:
        try:
            tabs = formatAF(tabs, ignoreG)
        except AttributeError:
            # Best effort: return unformatted data rather than failing
            # the whole (possibly long) download.
            print("Formatting failed, non-numeric datatypes may remain.")
    return tabs
Get all allele frequency data from a search base_url.
Iterates over all pages regardless of which page is based.
Arguments:
- base_url (str): URL for base search.
- timeout (int): How long to wait to receive a response.
- format (bool): Format the downloaded data using
formatAF()
. - ignoreG (bool): treat allele G groups as normal. See http://hla.alleles.org/alleles/g_groups.html for details. Default = True
Returns:
pd.DataFrame: allele frequency data parsed into a pandas dataframe
def incomplete_studies(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Report any studies with allele freqs that don't sum to 1.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study

    Returns:
        pd.Series: Total allele frequency of each incomplete (datasetID, loci)
            pair, indexed by that pair. Empty if all studies are complete.
    """
    # Total reported allele frequency per study per locus
    poplocs = AFtab.groupby([datasetID, "loci"]).allele_freq.sum()
    lmask = poplocs < llimit
    # BUG FIX: was `if sum(lmask > 0):` — comparing the boolean mask to 0
    # element-wise before summing. It only worked because the comparison is
    # an identity on booleans; the intent is to test the count itself.
    if sum(lmask) > 0:
        print(poplocs[lmask])
        print(f"{sum(lmask)} studies have total allele frequency < {llimit}")
    umask = poplocs > ulimit
    if sum(umask) > 0:
        print(poplocs[umask])
        print(f"{sum(umask)} studies have total allele frequency > {ulimit}")
    incomplete = pd.concat([poplocs[lmask], poplocs[umask]])
    return incomplete
Report any studies with allele freqs that don't sum to 1
Arguments:
- AFtab (pd.DataFrame): Dataframe containing multiple studies
- llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
- ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
- datasetID (str): Unique identifier column for study
def only_complete(AFtab, llimit=0.95, ulimit=1.1, datasetID="population"):
    """Return only rows belonging to complete studies.

    A (population, locus) pair is dropped when its allele frequencies do
    not sum to between `llimit` and `ulimit`. Filtering per locus avoids
    throwing away a population's other loci when only one locus is
    incomplete.

    Args:
        AFtab (pd.DataFrame): Dataframe containing multiple studies
        llimit (float, optional): Lower allele_freq sum limit that counts as complete.
            Defaults to 0.95.
        ulimit (float, optional): Upper allele_freq sum limit that will not be reported.
            Defaults to 1.1.
        datasetID (str): Unique identifier column for study. Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
    """
    bad = incomplete_studies(
        AFtab=AFtab, llimit=llimit, ulimit=ulimit, datasetID=datasetID
    )
    # Keep a row only when its (dataset, locus) PAIR is absent from the
    # incomplete index — so other loci of the same population survive.
    keep = AFtab.apply(lambda r: (r[datasetID], r.loci) not in bad.index, axis=1)
    return AFtab[keep]
Returns only complete studies.
Data is dropped if the locus for that population is not complete, i.e. doesn't
sum to between llimit
and ulimit
. This prevents throwing away data if
another loci in the population is incomplete.
Arguments:
- AFtab (pd.DataFrame): Dataframe containing multiple studies
- llimit (float, optional): Lower allele_freq sum limit that counts as complete. Defaults to 0.95.
- ulimit (float, optional): Upper allele_freq sum limit that will not be reported. Defaults to 1.1.
- datasetID (str): Unique identifier column for study. Defaults to 'population'.
Returns:
pd.DataFrame: Allele frequency data of multiple studies, but only complete studies.
def check_resolution(AFtab):
    """Check whether every allele in AFtab has the same resolution.

    Prints the number of records at each resolution when they differ.

    Args:
        AFtab (pd.DataFrame): Allele frequency data

    Returns:
        bool: True only if all alleles have the same resolution, else False.
    """
    # Resolution = number of ':'-separated fields in the allele name
    fields = AFtab.allele.str.count(":") + 1
    counts = fields.value_counts()
    uniform = len(counts) == 1
    if not uniform:
        print(counts)
        print("Multiple resolutions in AFtab. Fix with decrease_resolution()")
    return uniform
Check if all alleles in AFtab have the same resolution. Will print the number of records with each resolution.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data
Returns:
bool: True only if all alleles have the same resolution, else False.
def decrease_resolution(AFtab, newres, datasetID="population"):
    """Truncate allele names so every allele has the same resolution.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        newres (int): The desired number of fields for resolution.
        datasetID (str, optional): Column to use as study identifier.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: Allele frequency data with all alleles of requested resolution.
    """
    df = AFtab.copy()
    # Current resolution = number of ':'-separated fields
    fields = df.allele.str.count(":") + 1
    if not all(fields >= newres):
        raise AssertionError(f"Some alleles have resolution below {newres} fields")
    # Keep only the first `newres` fields of each allele name
    df.allele = df.allele.str.split(":").apply(lambda parts: ":".join(parts[:newres]))
    # Truncation can create duplicate allele names within a study; merge them
    return collapse_reduced_alleles(df, datasetID=datasetID)
Decrease allele resolution so all alleles have the same resolution.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data.
- newres (int): The desired number of fields for resolution.
- datasetID (str, optional): Column to use as stud identifier. Defaults to 'population'.
Returns:
pd.DataFrame: Allele frequency data with all alleles of requested resolution.
def collapse_reduced_alleles(AFtab, datasetID="population"):
    """Merge rows that became identical after reducing allele resolution.

    After `decrease_resolution()` several records in a study can share the
    same (truncated) allele name. Their frequencies are summed; locus and
    sample size are carried over after checking they are unique per group.

    Args:
        AFtab (pd.DataFrame): Allele frequency data.
        datasetID (str, optional): Unique identifier column for study.
            Defaults to 'population'.

    Returns:
        pd.DataFrame: One row per allele per study.
    """
    df = AFtab.copy()
    # Group by alleles within datasets
    grouped = df.groupby([datasetID, "allele"])
    # Sum allele freq but keep other columns
    collapsed = grouped.apply(
        lambda row: [
            sum(row.allele_freq),
            row.sample_size.unique()[0],
            row.loci.unique()[0],
            len(row.loci.unique()),
            len(row.sample_size.unique()),
        ]
    )
    collapsed = pd.DataFrame(
        collapsed.tolist(),
        index=collapsed.index,
        columns=["allele_freq", "sample_size", "loci", "#loci", "#sample_sizes"],
    ).reset_index()
    # Within a study all identical alleles should have the same loci and sample size
    if not all(collapsed["#loci"] == 1):
        raise AssertionError(
            "Multiple loci found for a single allele in a single population"
        )
    if not all(collapsed["#sample_sizes"] == 1):
        raise AssertionError(
            "Multiple sample_sizes found for a single allele in a single population"
        )
    # BUG FIX: select by `datasetID` instead of the hard-coded "population"
    # column, and forward `datasetID` to the uniqueness check, so non-default
    # dataset identifiers work.
    collapsed = collapsed[["allele", "loci", datasetID, "allele_freq", "sample_size"]]
    alleles_unique_in_study(collapsed, datasetID=datasetID)
    return collapsed
def unmeasured_alleles(AFtab, datasetID="population"):
    """Pad each dataset with zero-frequency rows for unreported alleles.

    When combining AF estimates, unreported alleles can inflate frequencies
    so AF sums to >1. Therefore unreported alleles are added with frequency
    zero.

    Args:
        AFtab (pd.DataFrame): Formatted allele frequency data
        datasetID (str): Unique identifier column for study

    Returns:
        pd.DataFrame: Allele frequency data with all locus alleles reported
            for each dataset
    """
    df = AFtab.copy()
    # Handle each locus independently
    for locus in df.loci.unique():
        # Every allele seen at this locus across all datasets
        locus_alleles = df[df.loci == locus].allele.unique()
        # Each dataset reporting this locus
        for dataset in df[df.loci == locus][datasetID].unique():
            # Rows for this single locus in this single dataset
            subset = df[(df[datasetID] == dataset) & (df.loci == locus)]
            # The dataset must report a single sample size
            sizes = subset.sample_size.unique()
            if not (len(sizes) == 1):
                raise AssertionError(
                    "dataset_sample_size must be 1, not %s" % len(sizes)
                )
            size = sizes[0]
            # Alleles of this locus absent from this dataset
            absent = [a for a in locus_alleles if not a in subset.allele.values]
            filler = pd.DataFrame(
                [(a, locus, dataset, 0, 0, size) for a in absent],
                columns=[
                    "allele",
                    "loci",
                    datasetID,
                    "allele_freq",
                    "carriers%",
                    "sample_size",
                ],
            )
            # Append the zero-frequency rows
            if not filler.empty:
                df = pd.concat([df, filler], ignore_index=True)
    return df
When combining AF estimates, unreported alleles can inflate frequencies so AF sums to >1. Therefore we add unreported alleles with frequency zero.
Arguments:
- AFtab (pd.DataFrame): Formatted allele frequency data
- datasetID (str): Unique identifier column for study
Returns:
pd.DataFrame: Allele frequency data with all locus alleles reported for each dataset
def combineAF(
    AFtab,
    weights="2n",
    alpha=None,
    datasetID="population",
    format=True,
    ignoreG=True,
    add_unmeasured=True,
    complete=True,
    resolution=True,
    unique=True,
):
    """Combine allele frequencies from multiple studies.

    `datasetID` is the unique identifier for studies to combine.
    Allele frequencies combined using a Dirichlet distribution where each study's
    contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
    Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is
    set it is not doubled. The total concentration parameter of the Dirichlet distribution
    is the contributions from all studies plus the prior `alpha`. If `alpha` is not set
    the prior defaults to 1 observation of each allele.

    Args:
        AFtab (pd.DataFrame): Table of Allele frequency data
        weights (str, optional): Column to be weighted by allele frequency to generate
            concentration parameter of Dirichlet distribution. Defaults to '2n'.
        alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to
            None, meaning 1 observation of each allele.
        datasetID (str, optional): Unique identifier column for study. Defaults to
            'population'.
        format (bool, optional): Run `formatAF()`. Defaults to True.
        ignoreG (bool, optional): Treat allele G groups as normal, see `formatAF()`.
            Defaults to True.
        add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
            important to ensure combined allele frequencies sum to 1. See
            `unmeasured_alleles()`. Defaults to True.
        complete (bool, optional): Check study completeness. Uses default values for
            `incomplete_studies()`. If you are happy with your study completeness can
            be switched off with False. Defaults to True.
        resolution (bool, optional): Check that all alleles have the same resolution,
            see `check_resolution()`. Defaults to True.
        unique (bool, optional): Check that each allele appears no more than once per
            study. See `alleles_unique_in_study()`. Defaults to True.

    Returns:
        pd.DataFrame: Allele frequencies after combining estimates from all studies.
            *allele_freq* is the combined frequency estimate from the Dirichlet mean
            where the concentration is `alpha` + `c`.
            *alpha* is the prior used for the Dirichlet distribution.
            *c* is the observations used for the Dirichlet distribution.
            *sample_size* is the total sample size of all combined studies.
            *wav* is the weighted average.
    """
    # NOTE: `alpha` default changed from a mutable `[]` to None; both are
    # falsy so the `if not alpha` fallback below is unchanged for callers.
    df = AFtab.copy()
    single_loci(df)
    if unique:
        if not alleles_unique_in_study(df, datasetID=datasetID):
            raise AssertionError("The same allele appears multiple times in a dataset")
    if complete:
        if not incomplete_studies(df, datasetID=datasetID).empty:
            raise AssertionError(
                "AFtab contains studies with AF that doesn't sum to 1. Check incomplete_studies(AFtab)"
            )
    if resolution:
        if not check_resolution(df):
            raise AssertionError(
                "AFtab contains alleles at multiple resolutions, check check_resolution(AFtab)"
            )
    if format:
        df = formatAF(df, ignoreG)
    if add_unmeasured:
        df = unmeasured_alleles(df, datasetID)
    try:
        # Double sample size due to diploidy
        df["2n"] = df.sample_size * 2
    except AttributeError:
        print("column '2n' could not be created")
    # Each study's contribution to the Dirichlet concentration
    df["c"] = df.allele_freq * df[weights]
    grouped = df.groupby("allele", sort=True)
    combined = grouped.apply(
        lambda row: [
            row.name,
            row.loci.unique()[0],
            np.average(row.allele_freq, weights=row[weights]),
            row.c.sum(),
            row.sample_size.sum(),
        ]
    )
    combined = pd.DataFrame(
        combined.tolist(), columns=["allele", "loci", "wav", "c", "sample_size"]
    )
    combined = combined.reset_index(drop=True)
    # Check that all alleles in a locus have the same sample size
    # after merging
    if duplicated_sample_size(combined):
        id_duplicated_allele(grouped)
    if not alpha:
        alpha = default_prior(len(combined.allele))
    combined["alpha"] = alpha
    # Calculate Dirichlet mean for each allele
    combined["allele_freq"] = sp.stats.dirichlet(combined.alpha + combined.c).mean()

    return combined
Combine allele frequencies from multiple studies.
datasetID
is the unique identifier for studies to combine.
Allele frequencies combined using a Dirichlet distribution where each study's
contribution to the concentration parameter is $2 * sample_size * allele_frequency$.
Sample size is doubled to get `2n` due to diploidy. If an alternative `weights` is set it is not doubled. The total concentration parameter of the Dirichlet distribution is the contributions from all studies plus the prior `alpha`. If `alpha` is not set the prior defaults to 1 observation of each allele.
Arguments:
- AFtab (pd.DataFrame): Table of Allele frequency data
- weights (str, optional): Column to be weighted by allele frequency to generate concentration parameter of Dirichlet distribution. Defaults to '2n'.
- alpha (list, optional): Prior to use for Dirichlet distribution. Defaults to [].
- datasetID (str, optional): Unique identifier column for study. Defaults to 'population'.
- format (bool, optional): Run
formatAF()
. Defaults to True. - ignoreG (bool, optional): Treat allele G groups as normal, see
formatAF()
. Defaults to True. - add_unmeasured (bool, optional): Add unmeasured alleles to each study. This is
important to ensure combined allele frequencies sum to 1. See
add_unmeasured()
. Defaults to True. - complete (bool, optional): Check study completeness. Uses default values for
incomplete_studies()
. If you are happy with your study completeness can be switched off with False. Defaults to True. - resolution (bool, optional): Check that all alleles have the same resolution,
see
check_resolution()
. Defaults to True. - unique (bool, optional): Check that each allele appears no more than once per
study. See
alleles_unique_in_study()
. Defaults to True.
Returns:
pd.DataFrame: Allele frequencies after combining estimates from all studies. allele_freq is the combined frequency estimate from the Dirichlet mean where the concentration is
alpha
+c
. alpha is the prior used for the Dirichlet distribution. c is the observations used for the Dirichlet distribution. sample_size is the total sample size of all combined studies. wav is the weighted average.
def default_prior(k):
    """Default prior: a single observation of each class.

    Args:
        k (int): Number of classes in the Dirichlet distribution.

    Returns:
        list: List of k 1s to use as prior.
    """
    return [1 for _ in range(k)]
Calculate a default prior, 1 observation of each class.
Arguments:
- k (int): Number of classes in the Dirichlet distribution.
Returns:
list: List of k 1s to use as prior.
def single_loci(AFtab):
    """Raise AssertionError unless the data covers exactly one locus.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
    """
    if len(AFtab.loci.unique()) != 1:
        raise AssertionError("'AFtab' must contain only 1 loci")
Check that allele frequency data is only of one locus
Arguments:
- AFtab (pd.DataFrame): Allele frequency data
def alleles_unique_in_study(AFtab, datasetID="population"):
    """Check that no allele is reported more than once per study.

    Study is defined by `datasetID`. Duplicates are printed when found.

    Args:
        AFtab (pd.DataFrame): Allele frequency data
        datasetID (str, optional): Unique identifier column to define study.
            Defaults to 'population'.

    Returns:
        bool: `True` if no alleles occur more than once in any study,
            otherwise `False`.
    """
    # Count occurrences of each allele within each study
    counts = AFtab.copy().groupby([datasetID, "allele"]).size()
    dups = counts[counts > 1]
    if dups.empty:
        return True
    print(f"Non unique alleles in study, is datasetID correct? {datasetID}")
    print(dups)
    return False
Are all alleles unique in each study?
Checks that no alleles are reported more than once in a single study.
Study is defined by datasetID
.
Arguments:
- AFtab (pd.DataFrame): Allele frequency data
- datasetID (str, optional): Unique identifier column to define study. Defaults to 'population'.
Returns:
bool: `True` if no alleles occur more than once in any study, otherwise `False`.
def duplicated_sample_size(AFtab):
    """Return True if any locus has more than 1 unique sample size."""
    # Number of distinct sample sizes per locus
    n_sizes = AFtab.groupby("loci").sample_size.nunique()
    return any(n_sizes != 1)
Returns True if any loci has more than 1 unique sample size
def id_duplicated_allele(grouped):
    """Raise for any allele whose group contains a duplicated population."""
    # True for each allele whose group repeats a population
    duplicated_population = grouped.population.apply(
        lambda pops: pops.duplicated().any()
    )
    if duplicated_population.any():
        raise AssertionError(
            f"duplicated population within allele {duplicated_population[duplicated_population].index.tolist()}"
        )
Reports the allele that has mupltiple sample sizes
def population_coverage(p):
    """Proportion of people with at least 1 copy of this allele assuming HWE.

    Args:
        p (float): Allele frequency

    Returns:
        float: Sum of homozygotes and heterozygotes for this allele
    """
    # Homozygote carriers (p^2) plus heterozygote carriers (2pq)
    return p**2 + 2 * p * (1 - p)
Proportion of people with at least 1 copy of this allele assuming HWE.
Arguments:
- p (float): Allele frequency
Returns:
float: Sum of homozygotes and heterozygotes for this allele
def betaAB(alpha):
    """Calculate `a` `b` values for all composite beta distributions.

    Given the `alpha` vector defining a Dirichlet distribution calculate the
    `a` `b` values for all composite beta distributions.

    Args:
        alpha (list): Values defining a Dirichlet distribution. This will be the prior
            (for a naive distribution) or the prior + caf.c for a posterior distribution.

    Returns:
        list: List of `a` `b` tuples defining beta values, i.e. for each allele it is
            the number of times it was and wasn't observed.
    """
    # Hoist the invariant total: the original recomputed sum(alpha) for
    # every element, making the comprehension accidentally O(n^2).
    total = sum(alpha)
    return [(a, total - a) for a in alpha]
Calculate a
b
values for all composite beta distributions.
Given the alpha
vector defining a Dirichlet distribution calculate the a
b
values
for all composite beta distributions.
Arguments:
- alpha (list): Values defining a Dirichlet distribution. This will be the prior (for a naive distribution) or the prior + caf.c for a posterior distribution.
Returns:
list: List of
a
b
values defining beta values, i.e. for each allele it is the number of times it was and wasn't observed.
def plot_prior(concentration, ncol=2, psteps=1000, labels=""):
    """Plot probability density function for prior values.

    Args:
        concentration (list): Vector of the prior Dirichlet concentration values.
        ncol (int, optional): Number of columns. Defaults to 2.
        psteps (int, optional): Number of points between 0 and 1 at which the
            density is evaluated. Defaults to 1000.
        labels (list, optional): Labels for elements of concentration in the same
            order. Defaults to "".
    """
    ab = betaAB(concentration)
    pline = np.linspace(0, 1, psteps)
    nrow = math.ceil(len(concentration) / ncol)
    # BUG FIX: squeeze=False keeps `ax` 2-dimensional even when nrow == 1,
    # so the ax[-1, :] and ax[:, 0] axis-label loops below always work.
    fig, ax = plt.subplots(nrow, ncol, sharex=True, squeeze=False)
    fig.suptitle("Probability density")
    # If labels is a list nothing happens,
    # but if it's a series it converts to a list
    labels = list(labels)
    if not labels:
        labels = [""] * len(concentration)
    if not len(concentration) == len(labels):
        raise AssertionError("concentration must be same length as labels")
    for i, alpha in enumerate(concentration):
        # Marginal of a Dirichlet component is a beta distribution
        a, b = ab[i]
        bd = sp.stats.beta(a, b)
        pdf = [bd.pdf(p) for p in pline]
        ax.flat[i].plot(pline, pdf)
        ax.flat[i].set_title(labels[i])
    for axi in ax[-1, :]:
        axi.set(xlabel="Allele freq")
    for axi in ax[:, 0]:
        axi.set(ylabel="PDF")
    plt.show()
Plot probability density function for prior values.
Arguments:
- concentration (list): Vector of the prior Dirichlet concentration values.
- ncol (int, optional): Number of columns. Defaults to 2.
- labels (list, optional): Labels for elements of concentration in the same order. Defaults to "".
def plotAF(
    caf=pd.DataFrame(),
    AFtab=pd.DataFrame(),
    cols=list(mcolors.TABLEAU_COLORS.keys()),
    datasetID="population",
    hdi=pd.DataFrame(),
    compound_mean=pd.DataFrame(),
):
    """Plot allele frequency results from `HLAfreq`.

    Plot combined allele frequencies, individual allele frequencies,
    and credible intervals on combined allele frequency estimates.
    Credible interval is only plotted if a value is given for `hdi`.
    The plotted credible interval is whatever was passed to HLAfreq_pymc.AFhdi()
    when calculating hdi.

    Args:
        caf (pd.DataFrame, optional): Combined allele frequency estimates from
            HLAfreq.combineAF. Defaults to pd.DataFrame().
        AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults
            to pd.DataFrame().
        cols (list, optional): List of colours to use for each individual dataset.
            Defaults to list(mcolors.TABLEAU_COLORS.keys()).
        datasetID (str, optional): Column used to define separate datasets. Defaults
            to "population".
        hdi (pd.DataFrame, optional): The high density interval object to plot credible
            intervals. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Expected columns
            include `allele`, `lo`, `hi`. Defaults to pd.DataFrame().
        compound_mean (pd.DataFrame, optional): The high density interval object to plot
            post_mean. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Expected columns
            include `allele`, `post_mean`. Defaults to pd.DataFrame().
    """
    # Plot allele frequency for each dataset
    if not AFtab.empty:
        # Cols must be longer than the list of alleles
        # If not, repeat the list of cols
        repeat_cols = np.ceil(len(AFtab[datasetID]) / len(cols))
        repeat_cols = int(repeat_cols)
        cols = cols * repeat_cols
        # Make a dictionary mapping datasetID to colours
        cmap = dict(zip(AFtab[datasetID].unique(), cols))
        plt.scatter(
            x=AFtab.allele_freq,
            y=AFtab.allele,
            c=[cmap[x] for x in AFtab[datasetID]],
            alpha=0.7,
            zorder=2,
        )
    # Plot combined allele frequency as hollow black circles
    if not caf.empty:
        plt.scatter(
            x=caf.allele_freq,
            y=caf.allele,
            edgecolors="black",
            facecolors="none",
            zorder=3,
        )
    # Plot high density interval as horizontal black bars
    if not hdi.empty:
        # assert not AFtab.empty, "AFtab is needed to calculate credible interval"
        # from HLAfreq import HLAfreq_pymc as HLAhdi
        # print("Fitting model with PyMC, make take a few seconds")
        # hdi = HLAhdi.AFhdi(
        #     AFtab=AFtab,
        #     weights=weights,
        #     datasetID=datasetID,
        #     credible_interval=credible_interval,
        #     conc_mu=conc_mu,
        #     conc_sigma=conc_sigma
        # )
        for interval in hdi.iterrows():
            # .iterrows returns a index and data as a tuple for each row
            plt.hlines(
                y=interval[1]["allele"],
                xmin=interval[1]["lo"],
                xmax=interval[1]["hi"],
                color="black",
            )
    # Plot posterior means as black tick marks
    if not compound_mean.empty:
        for row in compound_mean.iterrows():
            plt.scatter(
                y=row[1]["allele"], x=row[1]["post_mean"], color="black", marker="|"
            )
    plt.xlabel("Allele frequency")
    plt.grid(zorder=0)
    plt.show()
Plot allele frequency results from `HLAfreq`.
Plot combined allele frequencies, individual allele frequencies,
and credible intervals on combined allele frequency estimates.
Credible interval is only plotted if a value is given for `hdi`.
The plotted Credible interval is whatever was passed to HLAfreq_pymc.AFhdi()
when calculating hdi.
Arguments:
- caf (pd.DataFrame, optional): Combined allele frequency estimates from HLAfreq.combineAF. Defaults to pd.DataFrame().
- AFtab (pd.DataFrame, optional): Table of allele frequency data. Defaults to pd.DataFrame().
- cols (list, optional): List of colours to use for each individual dataset. Defaults to list(mcolors.TABLEAU_COLORS.keys()).
- datasetID (str, optional): Column used to define separate datasets. Defaults to "population".
- weights (str, optional): Column to be weighted by allele frequency to generate concentration parameter of Dirichlet distribution. Defaults to '2n'.
- hdi (pd.DataFrame, optional): The high density interval object to plot credible intervals. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Defaults to pd.DataFrame().
- compound_mean (pd.DataFrame, optional): The high density interval object to plot post_mean. Produced by HLAfreq.HLAfreq_pymc.AFhdi(). Defaults to pd.DataFrame().