Select Page
# Copyright (c) 2020 Brit Nicholson.
# All rights reserved. This program is made available under
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at
# https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt

# This is the code that was used to calculate expected percentages of X-DNA shared
# between a person and various ancestors. More information can be found here:
# http://www.dna-sci.com/2020/03/23/modeling-the-inheritance-of-x-chromosome-dna/

import numpy as np
import random
import fnmatch

def recomb_a_first(mat_anc, fun_cut_spots, pat_anc=None):
    
    mat_a = mat_anc[:,0].tolist()
    mat_b = mat_anc[:,1].tolist()
    
    if len(cut_spots) == 0:
        recomb_out = mat_a
        return recomb_out

    recomb_out = []
    for i in range(0, len(fun_cut_spots)):
        if i == 0:
            recomb_out = mat_a[0:fun_cut_spots[0]]
        if i % 2 == 1:
            recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]])
            last_a = False
        else:
            recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]])
            last_a = True
    if last_a:
        recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)])
    else:
        recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)])
    return recomb_out

def recomb_b_first(mat_anc, fun_cut_spots, pat_anc=None):
    
    mat_a = mat_anc[:,0].tolist()
    mat_b = mat_anc[:,1].tolist()
    
    if len(cut_spots) == 0:
        recomb_out = mat_b
        return recomb_out
    
    recomb_out = []
    for i in range(0, len(fun_cut_spots)):
        if i == 0:
            recomb_out = mat_b[0:fun_cut_spots[0]]
        if i % 2 == 1:
            recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]])
            last_a = True
        else:
            recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]])
            last_a = False
    if last_a:
        recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)])
    else:
        recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)])
    return recomb_out

def find_start(g):
    start_out = 2
    for g in range(2, g + 1):
        if g % 2 == 1: #odd
            start_out = start_out*2
        else: #even
            start_out = start_out*2 - 1
    return start_out

# Gen. = 2 simulates grandparents, 3 simulates great-grandparents, etc.
gen = 5

# Shouldn't pick less than 14 for genes_num if λ = 1.7. If the random Poisson variable ends up larger than
# genes_num, it will simply recombine in fewer places than the random variable designated (w/o error, surprisingly).
# In a test of 500m trials, the largest random Poisson value I got (w/ avg. = 1.7) was 13 
genes_num = 14
genes_list = list(range(0, genes_num))
spots = list(range(1, genes_num))

# λ
rand_fish = 1.7

trials = 500000

# Mat. is the same only shifted back one
mat_start_num = find_start(gen-1)
pat_start_num = find_start(gen)

# Need to name output variables in order to later add percentages for each trial run
# Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0:
m2_0_in_common_w_p, m2_0_in_common_w_m, p1_0_in_common_w_p, m2_0_in_common_w_p,\
    p1_0_in_common_w_m, m2_0_in_common_w_m = ({} for i in range(6))
for num in range(mat_start_num, pat_start_num):
    if num % 2 == 1: # odd means pat.
        m2_0_in_common_w_p[str(num) + '_' + str(gen)] = []
    else: # even means mat.
        m2_0_in_common_w_m[str(num) + '_' + str(gen)] = []
# Ones who could be anc. of both m2_0 and p1_0
for num in range(pat_start_num, 2**gen + 1):
    if num % 2 == 1: # odd means pat.
        p1_0_in_common_w_p[str(num) + '_' + str(gen)] = []
        m2_0_in_common_w_p[str(num) + '_' + str(gen)] = []
    else: # even means mat.
        p1_0_in_common_w_m[str(num) + '_' + str(gen)] = []
        m2_0_in_common_w_m[str(num) + '_' + str(gen)] = []

p, m = {}, {}
# Begin the trials 'for loop' (the crux of the model):
for i_trials in range(0, trials):

    # Make the set of anc. at gen. steps back
    for num in range(1, 2**gen + 1):
        if num % 2 == 1: # Male ancestor
            p[(num, gen)] = ['p' + str(num) + '_' + str(i) for i in genes_list]
        else: # Female ancestor. Make two columns, signifying homologues
            homol_a = ['m' + str(num) + '_a' + str(i) for i in genes_list]
            homol_b = ['m' + str(num) + '_b' + str(i) for i in genes_list]  
            m[(num, gen)] = np.column_stack((homol_a, homol_b))
    
    random.shuffle(spots)

    # Populate the rest of the tree except sibs.:
    for g in range(gen - 1, -1, -1):
        for num in range(1, 2**g + 1):

            num_of_recombs = np.random.poisson(rand_fish)
            cut_spots = sorted(spots[:num_of_recombs])
            rand = random.random() # Used for deciding which homologue goes first in alternating during recombination
            if num % 2 == 1: # Male ancestor
                if rand < 0.5:
                    p[(num, g)] = recomb_a_first(m[(num*2, g+1)], cut_spots)
                else:
                    p[(num, g)] = recomb_b_first(m[(num*2, g+1)], cut_spots)
            else: # Female ancestor
                if rand < 0.5:
                    homol_a = recomb_a_first(m[(num*2, g+1)], cut_spots)
                else:
                    homol_a = recomb_b_first(m[(num*2, g+1)], cut_spots)
                homol_b = p[(num*2-1, g+1)]
                m[(num, g)] = np.column_stack((homol_a, homol_b))
    
    # Since there won't be a 2 to make m2_0, it's hard-coded as a sib. to p1_0:
    random.shuffle(spots)
    num_of_recombs = np.random.poisson(rand_fish)
    cut_spots = sorted(spots[:num_of_recombs])
    if random.random() < 0.5:
        homol_a = recomb_a_first(m[(2, 1)], cut_spots)
    else:
        homol_a = recomb_b_first(m[(2, 1)], cut_spots)
    homol_b = p[(1, 1)]
    #s1 = np.column_stack((homol_a, homol_b)) # Checking 'in common' doesn't work if it's an array.
    m[(2, 0)] = homol_a + homol_b # Ok to concat. homologues because m2_0 has no descendants

    # Calculate percent in common with ancestors. When the final descendant is female, the
    # percentage is that of just one homologue. To find the percentage of her total X-DNA,
    # genes_num would have to be multiplied by 2, but I believe that would be misleading.
    # W/o multiplying by 2, it's comparable to amount of DNA rather than just percentage,
    # and allows comparison to male descendants' DNA amount.
    # Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0:
    for num in range(mat_start_num, pat_start_num):
        if num % 2 == 1: # odd means pat.
            m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num))
        else: # even means mat.
            m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num))
    # Ones who could be anc. of both m2_0 and p1_0:
    for num in range(pat_start_num, 2**gen + 1):
        if num % 2 == 1: # odd means pat.
            p1_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'p' + str(num) + '*')) / genes_num))
            m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num))
        else: # even means mat.
            p1_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'm' + str(num) + '*')) / genes_num))
            m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num))
# 'For loop' for trials ends here

import statistics

# combos() and kRecur() were modified from here:
# https://stackoverflow.com/questions/7074051/what-is-the-best-way-to-generate-all-possible-three-letter-strings
# which was modified by alphahmed from a solution by Abhinav Ramana on GeeksforGeeks.org.
def combos(letters, k):
    out = []
    l = len(letters)
    kRecur(letters, "", l, k, out)
    return(out)

def kRecur(letters, prfx, l, k, out):
    if k == 0:
        out = out.append(prfx)
    else:
        for i in range(l):
            newPrfx = prfx + letters[i]
            kRecur(letters, newPrfx, l, k-1, out)
            if len(out) == 2**k:
                return out

order_labels = sorted(combos(['m','p'], gen), reverse = True)

confid = 90
lo = (100 - confid) / 2
hi = 100 - lo

# Print ("pv" stands for print variable)
for num in range(mat_start_num, pat_start_num):
  if num % 2 == 1: # odd means ancestor is pat.
    pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)]
    print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
          'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
          'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
          pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
  else: # even means ancestor is mat.
    pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)]
    print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
          'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
          'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
          pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

for num in range(pat_start_num, 2**gen + 1):
  if num % 2 == 1: # odd means ancestor is pat.
    pv = p1_0_in_common_w_p[str(num) + '_' + str(gen)]
    print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
          'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
          'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
          pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
    pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)]
    print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
          'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
          'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
          pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

  else: # even means ancestor is mat.
    pv = p1_0_in_common_w_m[str(num) + '_' + str(gen)]
    print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
          'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
          'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
          pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

    pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)]
    print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
          'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
          'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
          pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')