Source code for lineage

""" lineage

tools for genetic genealogy and the analysis of consumer DNA test results

"""

"""
MIT License

Copyright (c) 2016 Andrew Riha

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

"""

import datetime
from itertools import chain, combinations
import logging
import os

import numpy as np
import pandas as pd
from snps.utils import Parallelizer, create_dir, save_df_as_csv

from lineage.individual import Individual
from lineage.resources import Resources
from lineage.visualization import plot_chromosomes

# set version string with Versioneer
from . import _version

__version__ = _version.get_versions()["version"]

logger = logging.getLogger(__name__)



[docs]
class Lineage:
    """Object used to interact with the `lineage` framework."""


[docs]
    def __init__(
        self,
        output_dir="output",
        resources_dir="resources",
        parallelize=False,
        processes=os.cpu_count(),
    ):
        """Initialize a ``Lineage`` object.

        Parameters
        ----------
        output_dir : str
            name / path of output directory
        resources_dir : str
            name / path of resources directory
        parallelize : bool
            utilize multiprocessing to speedup calculations
        processes : int
            processes to launch if multiprocessing
        """
        self._output_dir = output_dir
        self._resources_dir = resources_dir
        self._resources = Resources(resources_dir=resources_dir)
        self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)



[docs]
    def create_individual(self, name, raw_data=(), **kwargs):
        """Initialize an individual in the context of the `lineage` framework.

        Parameters
        ----------
        name : str
            name of the individual
        raw_data : str, bytes, ``SNPs`` (or list or tuple thereof)
            path(s) to file(s), bytes, or ``SNPs`` object(s) with raw genotype data
        **kwargs
            parameters to ``snps.SNPs`` and/or ``snps.SNPs.merge``

        Returns
        -------
        Individual
            ``Individual`` initialized in the context of the `lineage` framework
        """
        if "output_dir" not in kwargs:
            kwargs["output_dir"] = self._output_dir
        if "resources_dir" not in kwargs:
            kwargs["resources_dir"] = self._resources_dir

        return Individual(name, raw_data, **kwargs)



[docs]
    def download_example_datasets(self):
        """Download example datasets from `openSNP <https://opensnp.org>`_.

        Per openSNP, "the data is donated into the public domain using `CC0 1.0
        <http://creativecommons.org/publicdomain/zero/1.0/>`_."

        Returns
        -------
        paths : list of str or empty str
            paths to example datasets

        References
        ----------
        1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource
           for Personal Genomics," PLOS ONE, 9(3): e89204,
           https://doi.org/10.1371/journal.pone.0089204
        """
        paths = self._resources.download_example_datasets()

        if "" in paths:
            logger.warning("Example dataset(s) not currently available")

        return paths



[docs]
    def find_discordant_snps(
        self, individual1, individual2, individual3=None, save_output=False
    ):
        """Find discordant SNPs between two or three individuals.

        Parameters
        ----------
        individual1 : Individual
            reference individual (child if `individual2` and `individual3` are parents)
        individual2 : Individual
            comparison individual
        individual3 : Individual
            other parent if `individual1` is child and `individual2` is a parent
        save_output : bool
            specifies whether to save output to a CSV file in the output directory

        Returns
        -------
        pandas.DataFrame
            discordant SNPs and associated genetic data

        References
        ----------
        1. David Pike, "Search for Discordant SNPs in Parent-Child
           Raw Data Files," David Pike's Utilities,
           http://www.math.mun.ca/~dapike/FF23utils/pair-discord.php
        2. David Pike, "Search for Discordant SNPs when given data
           for child and both parents," David Pike's Utilities,
           http://www.math.mun.ca/~dapike/FF23utils/trio-discord.php
        """
        self._remap_snps_to_GRCh37([individual1, individual2, individual3])

        df = individual1.snps

        # remove nulls for reference individual
        df = df.loc[df["genotype"].notnull()]

        # add SNPs shared with `individual2`
        df = df.join(individual2.snps["genotype"], rsuffix="2")

        genotype1 = "genotype_" + individual1.get_var_name()
        genotype2 = "genotype_" + individual2.get_var_name()

        if individual3 is None:
            df = df.rename(columns={"genotype": genotype1, "genotype2": genotype2})

            # find discordant SNPs between reference and comparison individuals
            df = df.loc[
                df[genotype2].notnull()
                & (
                    (df[genotype1].str.len() == 1)
                    & (df[genotype2].str.len() == 1)
                    & (df[genotype1] != df[genotype2])
                )
                | (
                    (df[genotype1].str.len() == 2)
                    & (df[genotype2].str.len() == 2)
                    & (df[genotype1].str[0] != df[genotype2].str[0])
                    & (df[genotype1].str[0] != df[genotype2].str[1])
                    & (df[genotype1].str[1] != df[genotype2].str[0])
                    & (df[genotype1].str[1] != df[genotype2].str[1])
                )
            ]
            if save_output:
                save_df_as_csv(
                    df,
                    self._output_dir,
                    "discordant_snps_{}_{}_GRCh37.csv".format(
                        individual1.get_var_name(), individual2.get_var_name()
                    ),
                    comment=self._get_csv_header(),
                    prepend_info=False,
                )
        else:
            # add SNPs shared with `individual3`
            df = df.join(individual3.snps["genotype"], rsuffix="3")

            genotype3 = "genotype_" + individual3.get_var_name()

            df = df.rename(
                columns={
                    "genotype": genotype1,
                    "genotype2": genotype2,
                    "genotype3": genotype3,
                }
            )

            # find discordant SNPs between child and two parents
            df = df.loc[
                (
                    df[genotype2].notnull()
                    & (
                        (df[genotype1].str.len() == 1)
                        & (df[genotype2].str.len() == 1)
                        & (df[genotype1] != df[genotype2])
                    )
                    | (
                        (df[genotype1].str.len() == 2)
                        & (df[genotype2].str.len() == 2)
                        & (df[genotype1].str[0] != df[genotype2].str[0])
                        & (df[genotype1].str[0] != df[genotype2].str[1])
                        & (df[genotype1].str[1] != df[genotype2].str[0])
                        & (df[genotype1].str[1] != df[genotype2].str[1])
                    )
                )
                | (
                    df[genotype3].notnull()
                    & (
                        (df[genotype1].str.len() == 1)
                        & (df[genotype3].str.len() == 1)
                        & (df[genotype1] != df[genotype3])
                    )
                    | (
                        (df[genotype1].str.len() == 2)
                        & (df[genotype3].str.len() == 2)
                        & (df[genotype1].str[0] != df[genotype3].str[0])
                        & (df[genotype1].str[0] != df[genotype3].str[1])
                        & (df[genotype1].str[1] != df[genotype3].str[0])
                        & (df[genotype1].str[1] != df[genotype3].str[1])
                    )
                )
                | (
                    df[genotype2].notnull()
                    & df[genotype3].notnull()
                    & (df[genotype2].str.len() == 2)
                    & (df[genotype2].str[0] == df[genotype2].str[1])
                    & (df[genotype2] == df[genotype3])
                    & (df[genotype1] != df[genotype2])
                )
            ]

            if save_output:
                save_df_as_csv(
                    df,
                    self._output_dir,
                    "discordant_snps_{}_{}_{}_GRCh37.csv".format(
                        individual1.get_var_name(),
                        individual2.get_var_name(),
                        individual3.get_var_name(),
                    ),
                    comment=self._get_csv_header(),
                    prepend_info=False,
                )

        return df



[docs]
    def find_shared_dna(
        self,
        individuals=(),
        cM_threshold=0.75,
        snp_threshold=1100,
        shared_genes=False,
        save_output=True,
        genetic_map="HapMap2",
    ):
        """Find the shared DNA between individuals.

        Computes the genetic distance in centiMorgans (cMs) between SNPs using the specified genetic
        map. Applies thresholds to determine the shared DNA. Plots shared DNA. Optionally determines
        shared genes (i.e., genes transcribed from the shared DNA).

        All output is saved to the output directory as `CSV` or `PNG` files.

        Notes
        -----
        The code is commented throughout to help describe the algorithm and its operation.

        To summarize, the algorithm first computes the genetic distance in cMs between SNPs
        common to all individuals using the specified genetic map.

        Then, individuals are compared for whether they share one or two alleles for each SNP in
        common; in this manner, where all individuals share one chromosome, for example, there
        will be several SNPs in a row where at least one allele is shared between individuals for
        each SNP. The ``cM_threshold`` is then applied to each of these "matching segments" to
        determine whether the segment could be a potential shared DNA segment (i.e., whether each
        segment has a cM value greater than the threshold).

        The matching segments that passed the ``cM_threshold`` are then checked to see if they
        are adjacent to another matching segment, and if so, the segments are stitched together,
        and the single SNP separating the segments is flagged as potentially discrepant. (This
        means that multiple smaller matching segments passing the ``cM_threshold`` could be
        stitched, identifying the SNP between each segment as discrepant.)

        Next, the ``snp_threshold`` is applied to each segment to ensure there are enough SNPs in
        the segment and the segment is not only a few SNPs in a region with a high recombination
        rate; for each segment that passes this test, we have a segment of shared DNA, and the
        total cMs for this segment are computed.

        Finally, discrepant SNPs are checked to ensure that only SNPs internal to a shared DNA
        segment are reported as discrepant (i.e., don't report a discrepant SNP if it was part of a
        segment that didn't pass the ``snp_threshold``). Currently, no action other than reporting
        is taken on discrepant SNPs.

        Parameters
        ----------
        individuals : iterable of Individuals
        cM_threshold : float
            minimum centiMorgans for each shared DNA segment
        snp_threshold : int
            minimum SNPs for each shared DNA segment
        shared_genes : bool
            determine shared genes
        save_output : bool
            specifies whether to save output files in the output directory
        genetic_map : {'HapMap2', 'ACB', 'ASW', 'CDX', 'CEU', 'CHB', 'CHS', 'CLM', 'FIN', 'GBR', 'GIH', 'IBS', 'JPT', 'KHV', 'LWK', 'MKK', 'MXL', 'PEL', 'PUR', 'TSI', 'YRI'}
            genetic map to use for computation of shared DNA; `HapMap2` corresponds to the HapMap
            Phase II genetic map from the
            `International HapMap Project <https://www.genome.gov/10001688/international-hapmap-project/>`_
            and all others correspond to the
            `population-specific <https://www.internationalgenome.org/faq/which-populations-are-part-your-study/>`_
            genetic maps generated from the
            `1000 Genomes Project <https://www.internationalgenome.org>`_ phased OMNI data.
            Note that shared DNA is not computed on the X chromosome with the 1000 Genomes
            Project genetic maps since the X chromosome is not included in these genetic maps.

        Returns
        -------
        dict
            dict with the following items:

            one_chrom_shared_dna (pandas.DataFrame)
                segments of shared DNA on one chromosome
            two_chrom_shared_dna (pandas.DataFrame)
                segments of shared DNA on two chromosomes
            one_chrom_shared_genes (pandas.DataFrame)
                shared genes on one chromosome
            two_chrom_shared_genes (pandas.DataFrame)
                shared genes on two chromosomes
            one_chrom_discrepant_snps (pandas.Index)
                discrepant SNPs discovered while finding shared DNA on one chromosome
            two_chrom_discrepant_snps (pandas.Index)
                discrepant SNPs discovered while finding shared DNA on two chromosomes
        """
        # initialize all objects to be returned to be empty to start
        one_chrom_shared_dna = pd.DataFrame()
        two_chrom_shared_dna = pd.DataFrame()
        one_chrom_shared_genes = pd.DataFrame()
        two_chrom_shared_genes = pd.DataFrame()
        one_chrom_discrepant_snps = pd.Index([])
        two_chrom_discrepant_snps = pd.Index([])

        # ensure that all individuals have SNPs that are mapped relative to Build 37
        self._remap_snps_to_GRCh37(individuals)

        # return if there aren't enough individuals to compare
        if len(individuals) < 2:
            logger.warning("find_shared_dna requires two or more individuals...")
            return self._find_shared_dna_return_helper(
                one_chrom_shared_dna,
                two_chrom_shared_dna,
                one_chrom_shared_genes,
                two_chrom_shared_genes,
                one_chrom_discrepant_snps,
                two_chrom_discrepant_snps,
            )

        # load the specified genetic map (one genetic map for each chromosome)
        genetic_map_dfs = self._resources.get_genetic_map(genetic_map)

        if len(genetic_map_dfs) == 0:
            return self._find_shared_dna_return_helper(
                one_chrom_shared_dna,
                two_chrom_shared_dna,
                one_chrom_shared_genes,
                two_chrom_shared_genes,
                one_chrom_discrepant_snps,
                two_chrom_discrepant_snps,
            )

        # generate a list of dynamically named columns for each individual's genotype
        # (e.g., genotype0, genotype1, etc).
        cols = [f"genotype{str(i)}" for i in range(len(individuals))]

        # set the reference SNPs to compare to be that of the first individual
        df = individuals[0].snps
        df = df.rename(columns={"genotype": cols[0]})

        # build-up a dataframe of SNPs that are common to all individuals
        for i, ind in enumerate(individuals[1:]):
            # join SNPs for all individuals
            df = df.join(ind.snps["genotype"], how="inner")
            df = df.rename(columns={"genotype": cols[i + 1]})

        # set a flag for if one individual is male (i.e., only one chromosome match on the X
        # chromosome is possible in the non-PAR region)
        one_x_chrom = self._is_one_individual_male(individuals)

        # create tasks to compute the genetic distances (cMs) between each SNP on each chromosome
        tasks = []
        chroms_to_drop = []
        for chrom in df["chrom"].unique():
            if chrom not in genetic_map_dfs.keys():
                chroms_to_drop.append(chrom)
                continue

            # each task requires the genetic map for the chromosome and the positions of all SNPs
            # in common on that chromosome
            tasks.append(
                {
                    "genetic_map": genetic_map_dfs[chrom],
                    # get positions for the current chromosome
                    "snps": pd.DataFrame(df.loc[(df["chrom"] == chrom)]["pos"]),
                }
            )

        # drop chromosomes without genetic distance data (e.g., chroms MT, PAR, etc.)
        for chrom in chroms_to_drop:
            df = df.drop(df.loc[df["chrom"] == chrom].index)

        # determine the genetic distance between each SNP using the specified genetic map
        snp_distances = map(self._compute_snp_distances, tasks)
        snp_distances = pd.concat(snp_distances)

        # extract the column "cM_from_prev_snp" from the result and add that to the dataframe
        # of SNPs common to all individuals; now we have the genetic distance between each SNP
        df["cM_from_prev_snp"] = snp_distances["cM_from_prev_snp"]

        # now we apply a mask for whether all individuals match on one or two chromosomes...
        # first, set all rows for these columns to True
        df["one_chrom_match"] = True
        df["two_chrom_match"] = True
        # determine where individuals share an allele on one chromosome (i.e., set to False when
        # at least one allele doesn't match for all individuals)
        for genotype1, genotype2 in combinations(cols, 2):
            df.loc[
                ~df[genotype1].isnull()
                & ~df[genotype2].isnull()
                & (df[genotype1].str[0] != df[genotype2].str[0])
                & (df[genotype1].str[0] != df[genotype2].str[1])
                & (df[genotype1].str[1] != df[genotype2].str[0])
                & (df[genotype1].str[1] != df[genotype2].str[1]),
                "one_chrom_match",
            ] = False

        # determine where individuals share alleles on two chromosomes (i.e., set to False when
        # two alleles don't match for all individuals)
        for genotype1, genotype2 in combinations(cols, 2):
            df.loc[
                ~df[genotype1].isnull()
                & ~df[genotype2].isnull()
                & (df[genotype1] != df[genotype2])
                & ~(
                    (df[genotype1].str[0] == df[genotype2].str[1])
                    & (df[genotype1].str[1] == df[genotype2].str[0])
                ),
                "two_chrom_match",
            ] = False

        # genotype columns are no longer required for calculation
        df = df.drop(cols, axis=1)

        # find shared DNA on one chrom
        one_chrom_shared_dna, one_chrom_discrepant_snps = self._find_shared_dna_helper(
            df[["chrom", "pos", "cM_from_prev_snp", "one_chrom_match"]],
            cM_threshold,
            snp_threshold,
            one_x_chrom,
        )
        # find shared DNA on two chroms
        two_chrom_shared_dna, two_chrom_discrepant_snps = self._find_shared_dna_helper(
            df[["chrom", "pos", "cM_from_prev_snp", "two_chrom_match"]],
            cM_threshold,
            snp_threshold,
            one_x_chrom,
        )

        if shared_genes:
            one_chrom_shared_genes = self._compute_shared_genes(one_chrom_shared_dna)
            two_chrom_shared_genes = self._compute_shared_genes(two_chrom_shared_dna)

        if save_output:
            self._find_shared_dna_output_helper(
                individuals,
                cM_threshold,
                snp_threshold,
                one_chrom_shared_dna,
                two_chrom_shared_dna,
                one_chrom_shared_genes,
                two_chrom_shared_genes,
                genetic_map,
            )

        return self._find_shared_dna_return_helper(
            one_chrom_shared_dna,
            two_chrom_shared_dna,
            one_chrom_shared_genes,
            two_chrom_shared_genes,
            one_chrom_discrepant_snps,
            two_chrom_discrepant_snps,
        )


    def _find_shared_dna_helper(self, df, cM_threshold, snp_threshold, one_x_chrom):
        tasks = []

        for chrom in df["chrom"].unique():
            tasks.append(
                {
                    "df": df.loc[df["chrom"] == chrom],
                    "chrom": chrom,
                    "cM_threshold": cM_threshold,
                    "snp_threshold": snp_threshold,
                    "one_x_chrom": one_x_chrom,
                }
            )

        # compute shared DNA between individuals
        results = map(self._compute_shared_dna, tasks)

        shared_dna = []
        discrepant_snps = pd.Index([], name="rsid")
        for result in list(results):
            shared_dna.append(result["shared_dna"])
            discrepant_snps = discrepant_snps.append(result["discrepant_snps"])

        # https://stackoverflow.com/a/952952
        shared_dna = list(chain.from_iterable(shared_dna))

        return (self._convert_shared_dna_list_to_df(shared_dna), discrepant_snps)

    def _find_shared_dna_output_helper(
        self,
        individuals,
        cM_threshold,
        snp_threshold,
        one_chrom_shared_dna,
        two_chrom_shared_dna,
        one_chrom_shared_genes,
        two_chrom_shared_genes,
        genetic_map,
    ):
        def output_csv(df, file, float_format="%.2f"):
            save_df_as_csv(
                df,
                self._output_dir,
                file,
                comment=self._get_csv_header(),
                prepend_info=False,
                float_format=float_format,
            )

        cytobands = self._resources.get_cytoBand_hg19()

        individuals_filename = ""
        individuals_plot_title = ""

        for individual in individuals:
            individuals_filename += individual.get_var_name() + "_"
            individuals_plot_title += individual.name + " / "

        individuals_filename = individuals_filename[:-1]
        individuals_plot_title = individuals_plot_title[:-3]

        cM = "{:.2f}".format(cM_threshold).replace(".", "p")
        filename_details = (
            f"{individuals_filename}_{cM}cM_{snp_threshold}snps_GRCh37_{genetic_map}"
        )

        if create_dir(self._output_dir):
            plot_chromosomes(
                one_chrom_shared_dna,
                two_chrom_shared_dna,
                cytobands,
                os.path.join(
                    self._output_dir,
                    f"shared_dna_{filename_details}.png",
                ),
                f"{individuals_plot_title} shared DNA",
                37,
            )

        if len(one_chrom_shared_dna) > 0:
            output_csv(
                one_chrom_shared_dna,
                f"shared_dna_one_chrom_{filename_details}.csv",
            )

        if len(two_chrom_shared_dna) > 0:
            output_csv(
                two_chrom_shared_dna,
                f"shared_dna_two_chroms_{filename_details}.csv",
            )

        if len(one_chrom_shared_genes) > 0:
            output_csv(
                one_chrom_shared_genes,
                f"shared_genes_one_chrom_{filename_details}.csv",
                None,
            )

        if len(two_chrom_shared_genes) > 0:
            output_csv(
                two_chrom_shared_genes,
                f"shared_genes_two_chroms_{filename_details}.csv",
                None,
            )

    def _find_shared_dna_return_helper(
        self,
        one_chrom_shared_dna,
        two_chrom_shared_dna,
        one_chrom_shared_genes,
        two_chrom_shared_genes,
        one_chrom_discrepant_snps,
        two_chrom_discrepant_snps,
    ):
        return {
            "one_chrom_shared_dna": one_chrom_shared_dna,
            "two_chrom_shared_dna": two_chrom_shared_dna,
            "one_chrom_shared_genes": one_chrom_shared_genes,
            "two_chrom_shared_genes": two_chrom_shared_genes,
            "one_chrom_discrepant_snps": one_chrom_discrepant_snps,
            "two_chrom_discrepant_snps": two_chrom_discrepant_snps,
        }

    def _convert_shared_dna_list_to_df(self, shared_dna):
        df = pd.DataFrame(shared_dna, columns=["chrom", "start", "end", "cMs", "snps"])
        df.index.name = "segment"
        df.index = df.index + 1
        return df

    def _compute_shared_genes(self, shared_dna):
        knownGenes = self._resources.get_knownGene_hg19()
        kgXref = self._resources.get_kgXref_hg19()

        # http://seqanswers.com/forums/showthread.php?t=22336
        df = knownGenes.join(kgXref)

        shared_genes_dfs = []
        shared_genes = pd.DataFrame()

        for shared_segment in shared_dna.itertuples():
            # determine genes transcribed from this shared DNA segment
            temp = df.loc[
                (df["chrom"] == shared_segment.chrom)
                & (df["txStart"] >= shared_segment.start)
                & (df["txEnd"] <= shared_segment.end)
            ].copy()

            # select subset of columns
            temp = temp[
                [
                    "geneSymbol",
                    "chrom",
                    "strand",
                    "txStart",
                    "txEnd",
                    "refseq",
                    "proteinID",
                    "description",
                ]
            ]

            if len(temp) > 0:
                shared_genes_dfs.append(temp)

        if len(shared_genes_dfs) > 0:
            shared_genes = pd.concat(shared_genes_dfs, sort=True)

        return shared_genes

    def _is_one_individual_male(self, individuals):
        for ind in individuals:
            if ind.sex == "Male":
                return True
        return False

    def _compute_snp_distances(self, task):
        """Compute genetic distance in cMs between SNPs.

        Parameters
        ----------
        task : dict
            dict with `snps` to compute distance between using `genetic_map`

        Returns
        -------
        pandas.DataFrame
            genetic distances between SNPs
        """
        genetic_map = task["genetic_map"]
        temp = task["snps"]

        # merge genetic map for this chrom
        temp = pd.concat([temp, genetic_map], ignore_index=False, sort=True)

        # sort based on pos
        temp = temp.sort_values("pos")

        # fill recombination rates forward
        temp["rate"] = temp["rate"].fillna(method="ffill")

        # assume recombination rate of 0 for SNPs upstream of first defined rate
        temp["rate"] = temp["rate"].fillna(0)

        # get difference between positions
        pos_diffs = np.ediff1d(temp["pos"])

        # compute cMs between each pos based on probabilistic recombination rate
        # https://www.biostars.org/p/123539/
        cMs_match_segment = (temp["rate"] * np.r_[pos_diffs, 0] / 1e6).values

        # add back into temp
        temp["cMs"] = np.r_[0, cMs_match_segment][:-1]

        temp = temp.reset_index()

        # use null `map` values to find locations of SNPs
        snp_indices = temp.loc[temp["map"].isnull()].index

        # use SNP indices to determine boundaries over which to sum cMs
        start_snp_ix = snp_indices + 1
        end_snp_ix = np.r_[snp_indices, snp_indices[-1]][1:] + 1
        snp_boundaries = np.c_[start_snp_ix, end_snp_ix]

        # sum cMs between SNPs to get total cM distance between SNPs
        # http://stackoverflow.com/a/7471967
        c = np.r_[0, temp["cMs"].cumsum()][snp_boundaries]
        cM_from_prev_snp = c[:, 1] - c[:, 0]

        temp = temp.loc[temp["map"].isna()]

        # add back into temp
        temp["cM_from_prev_snp"] = np.r_[0, cM_from_prev_snp][:-1]

        # restore index
        temp = temp.set_index("index")

        return pd.DataFrame(temp["cM_from_prev_snp"])

    def _compute_shared_dna(self, task):
        df = task["df"]
        chrom = task["chrom"]
        cM_threshold = task["cM_threshold"]
        snp_threshold = task["snp_threshold"]
        one_x_chrom = task["one_x_chrom"]

        if "one_chrom_match" in df.keys():
            match_col = "one_chrom_match"
        else:
            match_col = "two_chrom_match"

        shared_dna = []
        discrepant_snps = pd.Index([], name="rsid")

        # set two_chrom_match in non-PAR region to False if an individual is male
        if chrom == "X" and match_col == "two_chrom_match" and one_x_chrom:
            df = df.copy()
            # https://www.ncbi.nlm.nih.gov/grc/human
            df.loc[
                (df["chrom"] == "X") & (df["pos"] > 2699520) & (df["pos"] < 154931044),
                "two_chrom_match",
            ] = False

        # get consecutive strings of Trues, for where there's a one or two chrom match between
        # individuals, depending on the task; http://stackoverflow.com/a/17151327
        a = df.loc[(df["chrom"] == chrom)][match_col].values
        a = np.r_[a, False]
        a_rshifted = np.roll(a, 1)
        starts = a & ~a_rshifted
        ends = ~a & a_rshifted
        a_starts = np.nonzero(starts)[0]
        a_starts = np.reshape(a_starts, (len(a_starts), 1))
        a_ends = np.nonzero(ends)[0]
        a_ends = np.reshape(a_ends, (len(a_ends), 1))

        # get the matching segments
        matches = np.hstack((a_starts, a_ends))

        # compute total cMs for each matching segment
        c = np.r_[0, df.loc[(df["chrom"] == chrom)]["cM_from_prev_snp"].cumsum()][
            matches
        ]
        cMs_match_segment = c[:, 1] - c[:, 0]

        # get matching segments where total cMs is greater than the threshold
        matches_passed = matches[np.where(cMs_match_segment > cM_threshold)]

        # get indices where a_end boundary is adjacent to a_start, perhaps indicating a
        # discrepant SNP
        adjacent_ix = np.where(
            np.roll(matches_passed[:, 0], -1) - matches_passed[:, 1] == 1
        )

        # if there are adjacent segments
        if len(adjacent_ix[0]) != 0:
            discrepant_snps = df.iloc[matches_passed[adjacent_ix[0], 1]].index
            matches_stitched = np.array([0, 0])
            prev = -1
            counter = 0

            # stitch together adjacent segments
            for x in matches_passed:
                if prev != -1 and prev + 1 == x[0]:
                    matches_stitched[counter, 1] = x[1]
                else:
                    matches_stitched = np.vstack((matches_stitched, x))
                    counter += 1

                prev = x[1]

            matches_passed = matches_stitched[1:]

        snp_counts = matches_passed[:, 1] - matches_passed[:, 0]

        # apply SNP count threshold for each matching segment; we now have the shared DNA
        # segments that pass the centiMorgan and SNP thresholds
        matches_passed = matches_passed[np.where(snp_counts > snp_threshold)]

        # compute total cMs for each match segment
        c = np.r_[0, df.loc[(df["chrom"] == chrom)]["cM_from_prev_snp"].cumsum()][
            matches_passed
        ]
        cMs_match_segment = c[:, 1] - c[:, 0]

        discrepant_snps_passed = pd.Index([], name="rsid")
        counter = 0
        # save matches for this chromosome
        for x in matches_passed:
            d = {
                "chrom": chrom,
                "start": df.loc[(df["chrom"] == chrom)].iloc[x[0]].pos,
                "end": df.loc[(df["chrom"] == chrom)].iloc[x[1] - 1].pos,
                "cMs": cMs_match_segment[counter],
                "snps": x[1] - x[0],
            }

            # ensure discrepant SNPs are in shared DNA segments
            for discrepant_snp in discrepant_snps:
                if d["start"] <= df.loc[discrepant_snp].pos <= d["end"]:
                    discrepant_snps_passed = discrepant_snps_passed.append(
                        df.loc[[discrepant_snp]].index
                    )

            # remove found discrepant SNPs from search on next iteration
            discrepant_snps = discrepant_snps.drop(
                discrepant_snps_passed, errors="ignore"
            )

            shared_dna.append(d)
            counter += 1
        return {"shared_dna": shared_dna, "discrepant_snps": discrepant_snps_passed}

    def _remap_snps_to_GRCh37(self, individuals):
        for ind in individuals:
            if ind is None:
                continue

            ind.remap(37)

    def _get_csv_header(self):
        return (
            f"# Generated by lineage v{__version__}; https://pypi.org/project/lineage/{os.linesep}"
            f"# Generated at {datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC{os.linesep}"
        )