Source code for lineage.resources

"""Class for downloading and loading required external resources.

`lineage` uses tables and data from UCSC's Genome Browser:

* http://genome.ucsc.edu/
* http://genome.ucsc.edu/cgi-bin/hgTables

References
----------
1. Karolchik D, Hinrichs AS, Furey TS, Roskin KM, Sugnet CW, Haussler D, Kent WJ.
   The UCSC Table Browser data retrieval tool. Nucleic Acids Res. 2004 Jan
   1;32(Database issue):D493-6. PubMed PMID: 14681465; PubMed Central PMCID:
   PMC308837. https://www.ncbi.nlm.nih.gov/pubmed/14681465
2. Tyner C, Barber GP, Casper J, Clawson H, Diekhans M, Eisenhart C, Fischer CM,
   Gibson D, Gonzalez JN, Guruvadoo L, Haeussler M, Heitner S, Hinrichs AS,
   Karolchik D, Lee BT, Lee CM, Nejad P, Raney BJ, Rosenbloom KR, Speir ML,
   Villarreal C, Vivian J, Zweig AS, Haussler D, Kuhn RM, Kent WJ. The UCSC Genome
   Browser database: 2017 update. Nucleic Acids Res. 2017 Jan 4;45(D1):D626-D634.
   doi: 10.1093/nar/gkw1134. Epub 2016 Nov 29. PubMed PMID: 27899642; PubMed Central
   PMCID: PMC5210591. https://www.ncbi.nlm.nih.gov/pubmed/27899642
3. International Human Genome Sequencing Consortium. Initial sequencing and
   analysis of the human genome. Nature. 2001 Feb 15;409(6822):860-921.
   http://dx.doi.org/10.1038/35057062
4. hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina
   Learned, Donna Karolchik and Robert Kuhn, https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19
5. Yates et. al. (doi:10.1093/bioinformatics/btu613),
   `<http://europepmc.org/search/?query=DOI:10.1093/bioinformatics/btu613>`_
6. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098

"""

import logging
import tarfile

import pandas as pd
from snps.resources import Resources as SNPsResources

logger = logging.getLogger(__name__)



[docs]
class Resources(SNPsResources):
    """Object used to manage resources required by `lineage`."""


[docs]
    def __init__(self, resources_dir="resources"):
        """Initialize a ``Resources`` object.

        Parameters
        ----------
        resources_dir : str
            name / path of resources directory
        """
        super().__init__(resources_dir=resources_dir)

        self._genetic_map = {}
        self._genetic_map_name = ""
        self._cytoBand_hg19 = pd.DataFrame()
        self._knownGene_hg19 = pd.DataFrame()
        self._kgXref_hg19 = pd.DataFrame()



[docs]
    def get_genetic_map(self, genetic_map):
        """Get specified genetic map.

        Parameters
        ----------
        genetic_map : {'HapMap2', 'ACB', 'ASW', 'CDX', 'CEU', 'CHB', 'CHS', 'CLM', 'FIN', 'GBR', 'GIH', 'IBS', 'JPT', 'KHV', 'LWK', 'MKK', 'MXL', 'PEL', 'PUR', 'TSI', 'YRI'}
            `HapMap2` corresponds to the HapMap Phase II genetic map from the
            `International HapMap Project <https://www.genome.gov/10001688/international-hapmap-project/>`_
            and all others correspond to the
            `population-specific <https://www.internationalgenome.org/faq/which-populations-are-part-your-study/>`_
            genetic maps generated from the
            `1000 Genomes Project <https://www.internationalgenome.org>`_ phased OMNI data.

        Returns
        -------
        dict
            dict of pandas.DataFrame genetic maps if loading was successful, else {}
        """
        if genetic_map not in [
            "HapMap2",
            "ACB",
            "ASW",
            "CDX",
            "CEU",
            "CHB",
            "CHS",
            "CLM",
            "FIN",
            "GBR",
            "GIH",
            "IBS",
            "JPT",
            "KHV",
            "LWK",
            "MKK",
            "MXL",
            "PEL",
            "PUR",
            "TSI",
            "YRI",
        ]:
            logger.warning("Invalid genetic map")
            return {}

        if genetic_map == "HapMap2":
            return self.get_genetic_map_HapMapII_GRCh37()
        else:
            return self.get_genetic_map_1000G_GRCh37(genetic_map)



[docs]
    def get_genetic_map_HapMapII_GRCh37(self):
        """Get International HapMap Consortium HapMap Phase II genetic map for Build 37.
        [#InternationalHapMapConsortium]_ [#Auton2010]_

        Returns
        -------
        dict
            dict of pandas.DataFrame HapMapII genetic maps if loading was successful, else {}

        References
        ----------
        .. [#InternationalHapMapConsortium] "The International HapMap Consortium (2007).  A second generation human
           haplotype map of over 3.1 million SNPs.  Nature 449: 851-861."
        .. [#Auton2010] "The map was generated by lifting the HapMap Phase II genetic map from build
           35 to GRCh37. The original map was generated using LDhat as described in the 2007 HapMap
           paper (Nature, 18th Sept 2007). The conversion from b35 to GRCh37 was achieved using
           the UCSC liftOver tool. Adam Auton, 08/12/2010"
        """
        if self._genetic_map_name != "HapMap2":
            self._genetic_map = self._load_genetic_map_HapMapII_GRCh37(
                self._get_path_genetic_map_HapMapII_GRCh37()
            )
            self._genetic_map_name = "HapMap2"

        return self._genetic_map



[docs]
    def get_genetic_map_1000G_GRCh37(self, pop):
        """Get population-specific 1000 Genomes Project genetic map. [#Auton2013]_ [#Auton2015]_ [#Pickrell]_

        Notes
        -----
        From `README_omni_recombination_20130507
        <ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/README_omni_recombination_20130507>`_ [#Auton2013]_ :

            Genetic maps generated from the 1000G phased OMNI data.

            [Build 37] OMNI haplotypes were obtained from the Phase 1 dataset
            (`/vol1/ftp/phase1/analysis_results/supporting/omni_haplotypes/ <ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/supporting/omni_haplotypes/>`_).

            Genetic maps were generated for each population separately using LDhat
            (http://ldhat.sourceforge.net/). Haplotypes were split into 2000 SNP windows
            with an overlap of 200 SNPs between each window. The recombination rate was
            estimated for each window independently, using a block penalty of 5 for a
            total of 22.5 million iterations with a sample being taken from the MCMC
            chain every 15,000 iterations. The first 7.5 million iterations were
            discarded as burn in. Once rates were estimated, windows were merged by
            splicing the estimates at the mid-point of the overlapping regions.

            LDhat estimates the population genetic recombination rate, rho = 4Ner. In
            order to convert to per-generation rates (measured in cM/Mb), the LDhat
            rates were compared to pedigree-based rates from Kong et al. (2010).
            Specifically, rates were binned at the 5Mb scale, and a linear regression
            performed between the two datasets. The gradient of this line gives an
            estimate of 4Ne, allowing the population based rates to be converted to
            per-generation rates.

        Returns
        -------
        dict
            dict of pandas.DataFrame population-specific 1000 Genomes Project genetic maps if
            loading was successful, else {}

        References
        ----------
        .. [#Auton2013] Adam Auton, May 7th, 2013
        .. [#Auton2015] The 1000 Genomes Project Consortium., Corresponding authors.,
           Auton, A. et al. A global reference for human genetic variation. Nature 526, 68–74 (2015).
           https://doi.org/10.1038/nature15393
        .. [#Pickrell] https://github.com/joepickrell/1000-genomes-genetic-maps
        """
        if self._genetic_map_name != pop:
            self._genetic_map = self._load_genetic_map_1000G_GRCh37(
                self._get_path_genetic_map_1000G_GRCh37(pop)
            )
            self._genetic_map_name = pop

        return self._genetic_map



[docs]
    def get_cytoBand_hg19(self):
        """Get UCSC cytoBand table for Build 37.

        Returns
        -------
        pandas.DataFrame
            cytoBand table if loading was successful, else empty DataFrame

        References
        ----------
        1. Ryan Dale, GitHub Gist,
           https://gist.github.com/daler/c98fc410282d7570efc3#file-ideograms-py
        """
        if self._cytoBand_hg19.empty:
            self._cytoBand_hg19 = self._load_cytoBand(self._get_path_cytoBand_hg19())

        return self._cytoBand_hg19



[docs]
    def get_knownGene_hg19(self):
        """Get UCSC knownGene table for Build 37.

        Returns
        -------
        pandas.DataFrame
            knownGene table if loading was successful, else empty DataFrame
        """
        if self._knownGene_hg19.empty:
            self._knownGene_hg19 = self._load_knownGene(self._get_path_knownGene_hg19())

        return self._knownGene_hg19



[docs]
    def get_kgXref_hg19(self):
        """Get UCSC kgXref table for Build 37.

        Returns
        -------
        pandas.DataFrame
            kgXref table if loading was successful, else empty DataFrame
        """
        if self._kgXref_hg19.empty:
            self._kgXref_hg19 = self._load_kgXref(self._get_path_kgXref_hg19())

        return self._kgXref_hg19



[docs]
    def create_example_datasets(self):
        """Create synthetic example datasets for lineage demonstrations.

        Generates related individual pairs suitable for demonstrating
        ``find_shared_dna()`` and ``find_discordant_snps()`` functionality.

        Creates:
        - A parent-child pair (showing 100% one_chrom_shared_dna)
        - A sibling pair (showing mix of one_chrom and two_chrom_shared_dna)

        Returns
        -------
        dict
            Dictionary with keys:
            - 'parent': path to parent file
            - 'child': path to child file
            - 'sibling1': path to first sibling file
            - 'sibling2': path to second sibling file
        """
        from lineage.generator import SyntheticRelatedGenerator

        gen = SyntheticRelatedGenerator(build=37, seed=47)

        # Generate parent-child pair with a small discordant rate to simulate
        # real-world genotyping errors
        parent_path, child_path = gen.save_parent_child_pair(
            self._resources_dir,
            parent_format="23andme",
            child_format="ftdna",
            num_snps=900000,
            discordant_rate=0.0001,  # ~0.01% discordant SNPs
        )

        # Generate sibling pair
        sib1_path, sib2_path = gen.save_sibling_pair(
            self._resources_dir,
            sib1_format="23andme",
            sib2_format="ftdna",
            num_snps=900000,
        )

        return {
            "parent": parent_path,
            "child": child_path,
            "sibling1": sib1_path,
            "sibling2": sib2_path,
        }



[docs]
    def get_all_resources(self):
        """Get / download all resources (except reference sequences) used throughout `lineage`.

        Returns
        -------
        dict
            dict of resources
        """
        resources = {}
        resources["genetic_map_HapMapII_GRCh37"] = (
            self.get_genetic_map_HapMapII_GRCh37()
        )
        resources["cytoBand_hg19"] = self.get_cytoBand_hg19()
        resources["knownGene_hg19"] = self.get_knownGene_hg19()
        resources["kgXref_hg19"] = self.get_kgXref_hg19()
        return resources


    @staticmethod
    def _load_genetic_map_HapMapII_GRCh37(filename):
        """Load HapMapII genetic map.

        Parameters
        ----------
        filename : str
            path to compressed archive with genetic map data

        Returns
        -------
        genetic_map : dict
            dict of pandas.DataFrame genetic maps

        Notes
        -----
        Keys of returned dict are chromosomes and values are the corresponding genetic map.
        """
        genetic_map = {}

        with tarfile.open(filename, "r:gz") as tar:
            # http://stackoverflow.com/a/2018576
            for member in tar.getmembers():
                if "genetic_map" in member.name:
                    df = pd.read_csv(tar.extractfile(member), sep="\t")
                    df = df.rename(
                        columns={
                            "Position(bp)": "pos",
                            "Rate(cM/Mb)": "rate",
                            "Map(cM)": "map",
                        }
                    )
                    del df["Chromosome"]
                    start_pos = member.name.index("chr") + 3
                    end_pos = member.name.index(".")
                    genetic_map[member.name[start_pos:end_pos]] = df

        # X chrom consists of X PAR regions and X non-PAR region
        genetic_map["X"] = pd.concat(
            [genetic_map["X_par1"], genetic_map["X"], genetic_map["X_par2"]]
        )
        del genetic_map["X_par1"]
        del genetic_map["X_par2"]

        return genetic_map

    @staticmethod
    def _load_genetic_map_1000G_GRCh37(filename):
        """Load 1000 Genomes Project genetic map.

        Parameters
        ----------
        filename : str
            path to archive with compressed genetic map data

        Returns
        -------
        genetic_map : dict
            dict of pandas.DataFrame genetic maps

        Notes
        -----
        Keys of returned dict are chromosomes and values are the corresponding genetic map.
        """
        genetic_map = {}

        with tarfile.open(filename, "r") as tar:
            # http://stackoverflow.com/a/2018576
            for member in tar.getmembers():
                df = pd.read_csv(
                    tar.extractfile(member),
                    compression="gzip",
                    sep=r"\s+",
                    usecols=["Position(bp)", "Rate(cM/Mb)", "Map(cM)"],
                )
                df = df.rename(
                    columns={
                        "Position(bp)": "pos",
                        "Rate(cM/Mb)": "rate",
                        "Map(cM)": "map",
                    }
                )
                chrom = member.name.split("-")[1]
                genetic_map[chrom] = df

        return genetic_map

    @staticmethod
    def _load_cytoBand(filename):
        """Load UCSC cytoBand table.

        Parameters
        ----------
        filename : str
            path to cytoBand file

        Returns
        -------
        df : pandas.DataFrame
            cytoBand table
        """
        # adapted from chromosome plotting code (see get_cytoBand_hg19 Ref. 1.)
        df = pd.read_csv(
            filename, names=["chrom", "start", "end", "name", "gie_stain"], sep="\t"
        )
        df["chrom"] = df["chrom"].str[3:]
        return df

    @staticmethod
    def _load_knownGene(filename):
        """Load UCSC knownGene table.

        Parameters
        ----------
        filename : str
            path to knownGene file

        Returns
        -------
        df : pandas.DataFrame
            knownGene table
        """
        df = pd.read_csv(
            filename,
            names=[
                "name",
                "chrom",
                "strand",
                "txStart",
                "txEnd",
                "cdsStart",
                "cdsEnd",
                "exonCount",
                "exonStarts",
                "exonEnds",
                "proteinID",
                "alignID",
            ],
            index_col=0,
            sep="\t",
        )
        df["chrom"] = df["chrom"].str[3:]
        return df

    @staticmethod
    def _load_kgXref(filename):
        """Load UCSC kgXref table.

        Parameters
        ----------
        filename : str
            path to kgXref file

        Returns
        -------
        df : pandas.DataFrame
            kgXref table
        """
        df = pd.read_csv(
            filename,
            names=[
                "kgID",
                "mRNA",
                "spID",
                "spDisplayID",
                "geneSymbol",
                "refseq",
                "protAcc",
                "description",
                "rfamAcc",
                "tRnaName",
            ],
            index_col=0,
            sep="\t",
            dtype=object,
        )
        return df

    def _get_path_cytoBand_hg19(self):
        """Get local path to cytoBand file for hg19 / GRCh37 from UCSC, downloading if necessary.

        Returns
        -------
        str
            path to cytoBand_hg19.txt.gz
        """
        return self._download_file(
            "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz",
            "cytoBand_hg19.txt.gz",
        )

    def _get_path_genetic_map_HapMapII_GRCh37(self):
        """Get local path to HapMap Phase II genetic map for hg19 / GRCh37 (HapMapII),
        downloading if necessary.

        Returns
        -------
        str
            path to genetic_map_HapMapII_GRCh37.tar.gz
        """
        return self._download_file(
            "ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/"
            "genetic_map_HapMapII_GRCh37.tar.gz",
            "genetic_map_HapMapII_GRCh37.tar.gz",
        )

    def _get_path_genetic_map_1000G_GRCh37(self, pop):
        """Get local path to population-specific 1000 Genomes Project genetic map,
        downloading if necessary.

        Returns
        -------
        str
            path to {pop}_omni_recombination_20130507.tar
        """
        filename = f"{pop}_omni_recombination_20130507.tar"
        return self._download_file(
            f"ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/{filename}",
            filename,
        )

    def _get_path_knownGene_hg19(self):
        """Get local path to knownGene file for hg19 / GRCh37 from UCSC, downloading if necessary.

        Returns
        -------
        str
            path to knownGene_hg19.txt.gz
        """
        return self._download_file(
            "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz",
            "knownGene_hg19.txt.gz",
        )

    def _get_path_kgXref_hg19(self):
        """Get local path to kgXref file for hg19 / GRCh37 from UCSC, downloading if necessary.

        Returns
        -------
        str
            path to kgXref_hg19.txt.gz
        """
        return self._download_file(
            "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/kgXref.txt.gz",
            "kgXref_hg19.txt.gz",
        )