"""Class for downloading and loading required external resources.
`lineage` uses tables and data from UCSC's Genome Browser:
* http://genome.ucsc.edu/
* http://genome.ucsc.edu/cgi-bin/hgTables
References
----------
1. Karolchik D, Hinrichs AS, Furey TS, Roskin KM, Sugnet CW, Haussler D, Kent WJ.
The UCSC Table Browser data retrieval tool. Nucleic Acids Res. 2004 Jan
1;32(Database issue):D493-6. PubMed PMID: 14681465; PubMed Central PMCID:
PMC308837. https://www.ncbi.nlm.nih.gov/pubmed/14681465
2. Tyner C, Barber GP, Casper J, Clawson H, Diekhans M, Eisenhart C, Fischer CM,
Gibson D, Gonzalez JN, Guruvadoo L, Haeussler M, Heitner S, Hinrichs AS,
Karolchik D, Lee BT, Lee CM, Nejad P, Raney BJ, Rosenbloom KR, Speir ML,
Villarreal C, Vivian J, Zweig AS, Haussler D, Kuhn RM, Kent WJ. The UCSC Genome
Browser database: 2017 update. Nucleic Acids Res. 2017 Jan 4;45(D1):D626-D634.
doi: 10.1093/nar/gkw1134. Epub 2016 Nov 29. PubMed PMID: 27899642; PubMed Central
PMCID: PMC5210591. https://www.ncbi.nlm.nih.gov/pubmed/27899642
3. International Human Genome Sequencing Consortium. Initial sequencing and
analysis of the human genome. Nature. 2001 Feb 15;409(6822):860-921.
http://dx.doi.org/10.1038/35057062
4. hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina
Learned, Donna Karolchik and Robert Kuhn, https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19
5. Yates et. al. (doi:10.1093/bioinformatics/btu613),
`<http://europepmc.org/search/?query=DOI:10.1093/bioinformatics/btu613>`_
6. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098
"""
import logging
import tarfile
import pandas as pd
from snps.resources import Resources as SNPsResources
logger = logging.getLogger(__name__)
[docs]
class Resources(SNPsResources):
"""Object used to manage resources required by `lineage`."""
[docs]
def __init__(self, resources_dir="resources"):
"""Initialize a ``Resources`` object.
Parameters
----------
resources_dir : str
name / path of resources directory
"""
super().__init__(resources_dir=resources_dir)
self._genetic_map = {}
self._genetic_map_name = ""
self._cytoBand_hg19 = pd.DataFrame()
self._knownGene_hg19 = pd.DataFrame()
self._kgXref_hg19 = pd.DataFrame()
[docs]
def get_genetic_map(self, genetic_map):
"""Get specified genetic map.
Parameters
----------
genetic_map : {'HapMap2', 'ACB', 'ASW', 'CDX', 'CEU', 'CHB', 'CHS', 'CLM', 'FIN', 'GBR', 'GIH', 'IBS', 'JPT', 'KHV', 'LWK', 'MKK', 'MXL', 'PEL', 'PUR', 'TSI', 'YRI'}
`HapMap2` corresponds to the HapMap Phase II genetic map from the
`International HapMap Project <https://www.genome.gov/10001688/international-hapmap-project/>`_
and all others correspond to the
`population-specific <https://www.internationalgenome.org/faq/which-populations-are-part-your-study/>`_
genetic maps generated from the
`1000 Genomes Project <https://www.internationalgenome.org>`_ phased OMNI data.
Returns
-------
dict
dict of pandas.DataFrame genetic maps if loading was successful, else {}
"""
if genetic_map not in [
"HapMap2",
"ACB",
"ASW",
"CDX",
"CEU",
"CHB",
"CHS",
"CLM",
"FIN",
"GBR",
"GIH",
"IBS",
"JPT",
"KHV",
"LWK",
"MKK",
"MXL",
"PEL",
"PUR",
"TSI",
"YRI",
]:
logger.warning("Invalid genetic map")
return {}
if genetic_map == "HapMap2":
return self.get_genetic_map_HapMapII_GRCh37()
else:
return self.get_genetic_map_1000G_GRCh37(genetic_map)
[docs]
def get_genetic_map_HapMapII_GRCh37(self):
"""Get International HapMap Consortium HapMap Phase II genetic map for Build 37.
[#InternationalHapMapConsortium]_ [#Auton2010]_
Returns
-------
dict
dict of pandas.DataFrame HapMapII genetic maps if loading was successful, else {}
References
----------
.. [#InternationalHapMapConsortium] "The International HapMap Consortium (2007). A second generation human
haplotype map of over 3.1 million SNPs. Nature 449: 851-861."
.. [#Auton2010] "The map was generated by lifting the HapMap Phase II genetic map from build
35 to GRCh37. The original map was generated using LDhat as described in the 2007 HapMap
paper (Nature, 18th Sept 2007). The conversion from b35 to GRCh37 was achieved using
the UCSC liftOver tool. Adam Auton, 08/12/2010"
"""
if self._genetic_map_name != "HapMap2":
self._genetic_map = self._load_genetic_map_HapMapII_GRCh37(
self._get_path_genetic_map_HapMapII_GRCh37()
)
self._genetic_map_name = "HapMap2"
return self._genetic_map
[docs]
def get_genetic_map_1000G_GRCh37(self, pop):
"""Get population-specific 1000 Genomes Project genetic map. [#Auton2013]_ [#Auton2015]_ [#Pickrell]_
Notes
-----
From `README_omni_recombination_20130507
<ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/README_omni_recombination_20130507>`_ [#Auton2013]_ :
Genetic maps generated from the 1000G phased OMNI data.
[Build 37] OMNI haplotypes were obtained from the Phase 1 dataset
(`/vol1/ftp/phase1/analysis_results/supporting/omni_haplotypes/ <ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/supporting/omni_haplotypes/>`_).
Genetic maps were generated for each population separately using LDhat
(http://ldhat.sourceforge.net/). Haplotypes were split into 2000 SNP windows
with an overlap of 200 SNPs between each window. The recombination rate was
estimated for each window independently, using a block penalty of 5 for a
total of 22.5 million iterations with a sample being taken from the MCMC
chain every 15,000 iterations. The first 7.5 million iterations were
discarded as burn in. Once rates were estimated, windows were merged by
splicing the estimates at the mid-point of the overlapping regions.
LDhat estimates the population genetic recombination rate, rho = 4Ner. In
order to convert to per-generation rates (measured in cM/Mb), the LDhat
rates were compared to pedigree-based rates from Kong et al. (2010).
Specifically, rates were binned at the 5Mb scale, and a linear regression
performed between the two datasets. The gradient of this line gives an
estimate of 4Ne, allowing the population based rates to be converted to
per-generation rates.
Returns
-------
dict
dict of pandas.DataFrame population-specific 1000 Genomes Project genetic maps if
loading was successful, else {}
References
----------
.. [#Auton2013] Adam Auton, May 7th, 2013
.. [#Auton2015] The 1000 Genomes Project Consortium., Corresponding authors.,
Auton, A. et al. A global reference for human genetic variation. Nature 526, 68–74 (2015).
https://doi.org/10.1038/nature15393
.. [#Pickrell] https://github.com/joepickrell/1000-genomes-genetic-maps
"""
if self._genetic_map_name != pop:
self._genetic_map = self._load_genetic_map_1000G_GRCh37(
self._get_path_genetic_map_1000G_GRCh37(pop)
)
self._genetic_map_name = pop
return self._genetic_map
[docs]
def get_cytoBand_hg19(self):
"""Get UCSC cytoBand table for Build 37.
Returns
-------
pandas.DataFrame
cytoBand table if loading was successful, else empty DataFrame
References
----------
1. Ryan Dale, GitHub Gist,
https://gist.github.com/daler/c98fc410282d7570efc3#file-ideograms-py
"""
if self._cytoBand_hg19.empty:
self._cytoBand_hg19 = self._load_cytoBand(self._get_path_cytoBand_hg19())
return self._cytoBand_hg19
[docs]
def get_knownGene_hg19(self):
"""Get UCSC knownGene table for Build 37.
Returns
-------
pandas.DataFrame
knownGene table if loading was successful, else empty DataFrame
"""
if self._knownGene_hg19.empty:
self._knownGene_hg19 = self._load_knownGene(self._get_path_knownGene_hg19())
return self._knownGene_hg19
[docs]
def get_kgXref_hg19(self):
"""Get UCSC kgXref table for Build 37.
Returns
-------
pandas.DataFrame
kgXref table if loading was successful, else empty DataFrame
"""
if self._kgXref_hg19.empty:
self._kgXref_hg19 = self._load_kgXref(self._get_path_kgXref_hg19())
return self._kgXref_hg19
[docs]
def create_example_datasets(self):
"""Create synthetic example datasets for lineage demonstrations.
Generates related individual pairs suitable for demonstrating
``find_shared_dna()`` and ``find_discordant_snps()`` functionality.
Creates:
- A parent-child pair (showing 100% one_chrom_shared_dna)
- A sibling pair (showing mix of one_chrom and two_chrom_shared_dna)
Returns
-------
dict
Dictionary with keys:
- 'parent': path to parent file
- 'child': path to child file
- 'sibling1': path to first sibling file
- 'sibling2': path to second sibling file
"""
from lineage.generator import SyntheticRelatedGenerator
gen = SyntheticRelatedGenerator(build=37, seed=47)
# Generate parent-child pair with a small discordant rate to simulate
# real-world genotyping errors
parent_path, child_path = gen.save_parent_child_pair(
self._resources_dir,
parent_format="23andme",
child_format="ftdna",
num_snps=900000,
discordant_rate=0.0001, # ~0.01% discordant SNPs
)
# Generate sibling pair
sib1_path, sib2_path = gen.save_sibling_pair(
self._resources_dir,
sib1_format="23andme",
sib2_format="ftdna",
num_snps=900000,
)
return {
"parent": parent_path,
"child": child_path,
"sibling1": sib1_path,
"sibling2": sib2_path,
}
[docs]
def get_all_resources(self):
"""Get / download all resources (except reference sequences) used throughout `lineage`.
Returns
-------
dict
dict of resources
"""
resources = {}
resources["genetic_map_HapMapII_GRCh37"] = (
self.get_genetic_map_HapMapII_GRCh37()
)
resources["cytoBand_hg19"] = self.get_cytoBand_hg19()
resources["knownGene_hg19"] = self.get_knownGene_hg19()
resources["kgXref_hg19"] = self.get_kgXref_hg19()
return resources
@staticmethod
def _load_genetic_map_HapMapII_GRCh37(filename):
"""Load HapMapII genetic map.
Parameters
----------
filename : str
path to compressed archive with genetic map data
Returns
-------
genetic_map : dict
dict of pandas.DataFrame genetic maps
Notes
-----
Keys of returned dict are chromosomes and values are the corresponding genetic map.
"""
genetic_map = {}
with tarfile.open(filename, "r:gz") as tar:
# http://stackoverflow.com/a/2018576
for member in tar.getmembers():
if "genetic_map" in member.name:
df = pd.read_csv(tar.extractfile(member), sep="\t")
df = df.rename(
columns={
"Position(bp)": "pos",
"Rate(cM/Mb)": "rate",
"Map(cM)": "map",
}
)
del df["Chromosome"]
start_pos = member.name.index("chr") + 3
end_pos = member.name.index(".")
genetic_map[member.name[start_pos:end_pos]] = df
# X chrom consists of X PAR regions and X non-PAR region
genetic_map["X"] = pd.concat(
[genetic_map["X_par1"], genetic_map["X"], genetic_map["X_par2"]]
)
del genetic_map["X_par1"]
del genetic_map["X_par2"]
return genetic_map
@staticmethod
def _load_genetic_map_1000G_GRCh37(filename):
"""Load 1000 Genomes Project genetic map.
Parameters
----------
filename : str
path to archive with compressed genetic map data
Returns
-------
genetic_map : dict
dict of pandas.DataFrame genetic maps
Notes
-----
Keys of returned dict are chromosomes and values are the corresponding genetic map.
"""
genetic_map = {}
with tarfile.open(filename, "r") as tar:
# http://stackoverflow.com/a/2018576
for member in tar.getmembers():
df = pd.read_csv(
tar.extractfile(member),
compression="gzip",
sep=r"\s+",
usecols=["Position(bp)", "Rate(cM/Mb)", "Map(cM)"],
)
df = df.rename(
columns={
"Position(bp)": "pos",
"Rate(cM/Mb)": "rate",
"Map(cM)": "map",
}
)
chrom = member.name.split("-")[1]
genetic_map[chrom] = df
return genetic_map
@staticmethod
def _load_cytoBand(filename):
"""Load UCSC cytoBand table.
Parameters
----------
filename : str
path to cytoBand file
Returns
-------
df : pandas.DataFrame
cytoBand table
"""
# adapted from chromosome plotting code (see get_cytoBand_hg19 Ref. 1.)
df = pd.read_csv(
filename, names=["chrom", "start", "end", "name", "gie_stain"], sep="\t"
)
df["chrom"] = df["chrom"].str[3:]
return df
@staticmethod
def _load_knownGene(filename):
"""Load UCSC knownGene table.
Parameters
----------
filename : str
path to knownGene file
Returns
-------
df : pandas.DataFrame
knownGene table
"""
df = pd.read_csv(
filename,
names=[
"name",
"chrom",
"strand",
"txStart",
"txEnd",
"cdsStart",
"cdsEnd",
"exonCount",
"exonStarts",
"exonEnds",
"proteinID",
"alignID",
],
index_col=0,
sep="\t",
)
df["chrom"] = df["chrom"].str[3:]
return df
@staticmethod
def _load_kgXref(filename):
"""Load UCSC kgXref table.
Parameters
----------
filename : str
path to kgXref file
Returns
-------
df : pandas.DataFrame
kgXref table
"""
df = pd.read_csv(
filename,
names=[
"kgID",
"mRNA",
"spID",
"spDisplayID",
"geneSymbol",
"refseq",
"protAcc",
"description",
"rfamAcc",
"tRnaName",
],
index_col=0,
sep="\t",
dtype=object,
)
return df
def _get_path_cytoBand_hg19(self):
"""Get local path to cytoBand file for hg19 / GRCh37 from UCSC, downloading if necessary.
Returns
-------
str
path to cytoBand_hg19.txt.gz
"""
return self._download_file(
"ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz",
"cytoBand_hg19.txt.gz",
)
def _get_path_genetic_map_HapMapII_GRCh37(self):
"""Get local path to HapMap Phase II genetic map for hg19 / GRCh37 (HapMapII),
downloading if necessary.
Returns
-------
str
path to genetic_map_HapMapII_GRCh37.tar.gz
"""
return self._download_file(
"ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/"
"genetic_map_HapMapII_GRCh37.tar.gz",
"genetic_map_HapMapII_GRCh37.tar.gz",
)
def _get_path_genetic_map_1000G_GRCh37(self, pop):
"""Get local path to population-specific 1000 Genomes Project genetic map,
downloading if necessary.
Returns
-------
str
path to {pop}_omni_recombination_20130507.tar
"""
filename = f"{pop}_omni_recombination_20130507.tar"
return self._download_file(
f"ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/{filename}",
filename,
)
def _get_path_knownGene_hg19(self):
"""Get local path to knownGene file for hg19 / GRCh37 from UCSC, downloading if necessary.
Returns
-------
str
path to knownGene_hg19.txt.gz
"""
return self._download_file(
"ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz",
"knownGene_hg19.txt.gz",
)
def _get_path_kgXref_hg19(self):
"""Get local path to kgXref file for hg19 / GRCh37 from UCSC, downloading if necessary.
Returns
-------
str
path to kgXref_hg19.txt.gz
"""
return self._download_file(
"ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/kgXref.txt.gz",
"kgXref_hg19.txt.gz",
)