Select Page
```# Copyright (c) 2020 Brit Nicholson.
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at

# This is the code that was used to calculate expected percentages of X-DNA shared
# between a person and various ancestors. More information can be found here:
# http://www.dna-sci.com/2020/03/23/modeling-the-inheritance-of-x-chromosome-dna/

import numpy as np
import random
import fnmatch

def recomb_a_first(mat_anc, fun_cut_spots, pat_anc=None):

mat_a = mat_anc[:,0].tolist()
mat_b = mat_anc[:,1].tolist()

if len(cut_spots) == 0:
recomb_out = mat_a
return recomb_out

recomb_out = []
for i in range(0, len(fun_cut_spots)):
if i == 0:
recomb_out = mat_a[0:fun_cut_spots[0]]
if i % 2 == 1:
recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = False
else:
recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = True
if last_a:
recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)])
else:
recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)])
return recomb_out

def recomb_b_first(mat_anc, fun_cut_spots, pat_anc=None):

mat_a = mat_anc[:,0].tolist()
mat_b = mat_anc[:,1].tolist()

if len(cut_spots) == 0:
recomb_out = mat_b
return recomb_out

recomb_out = []
for i in range(0, len(fun_cut_spots)):
if i == 0:
recomb_out = mat_b[0:fun_cut_spots[0]]
if i % 2 == 1:
recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = True
else:
recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = False
if last_a:
recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)])
else:
recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)])
return recomb_out

def find_start(g):
start_out = 2
for g in range(2, g + 1):
if g % 2 == 1: #odd
start_out = start_out*2
else: #even
start_out = start_out*2 - 1
return start_out

# Gen. = 2 simulates grandparents, 3 simulates great-grandparents, etc.
gen = 5

# Shouldn't pick less than 14 for genes_num if λ = 1.7. If the random Poisson variable ends up larger than
# genes_num, it will simply recombine in fewer places than the random variable designated (w/o error, surprisingly).
# In a test of 500m trials, the largest random Poisson value I got (w/ avg. = 1.7) was 13
genes_num = 14
genes_list = list(range(0, genes_num))
spots = list(range(1, genes_num))

# λ
rand_fish = 1.7

trials = 500000

# Mat. is the same only shifted back one
mat_start_num = find_start(gen-1)
pat_start_num = find_start(gen)

# Need to name output variables in order to later add percentages for each trial run
# Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0:
m2_0_in_common_w_p, m2_0_in_common_w_m, p1_0_in_common_w_p, m2_0_in_common_w_p,\
p1_0_in_common_w_m, m2_0_in_common_w_m = ({} for i in range(6))
for num in range(mat_start_num, pat_start_num):
if num % 2 == 1: # odd means pat.
m2_0_in_common_w_p[str(num) + '_' + str(gen)] = []
else: # even means mat.
m2_0_in_common_w_m[str(num) + '_' + str(gen)] = []
# Ones who could be anc. of both m2_0 and p1_0
for num in range(pat_start_num, 2**gen + 1):
if num % 2 == 1: # odd means pat.
p1_0_in_common_w_p[str(num) + '_' + str(gen)] = []
m2_0_in_common_w_p[str(num) + '_' + str(gen)] = []
else: # even means mat.
p1_0_in_common_w_m[str(num) + '_' + str(gen)] = []
m2_0_in_common_w_m[str(num) + '_' + str(gen)] = []

p, m = {}, {}
# Begin the trials 'for loop' (the crux of the model):
for i_trials in range(0, trials):

# Make the set of anc. at gen. steps back
for num in range(1, 2**gen + 1):
if num % 2 == 1: # Male ancestor
p[(num, gen)] = ['p' + str(num) + '_' + str(i) for i in genes_list]
else: # Female ancestor. Make two columns, signifying homologues
homol_a = ['m' + str(num) + '_a' + str(i) for i in genes_list]
homol_b = ['m' + str(num) + '_b' + str(i) for i in genes_list]
m[(num, gen)] = np.column_stack((homol_a, homol_b))

random.shuffle(spots)

# Populate the rest of the tree except sibs.:
for g in range(gen - 1, -1, -1):
for num in range(1, 2**g + 1):

num_of_recombs = np.random.poisson(rand_fish)
cut_spots = sorted(spots[:num_of_recombs])
rand = random.random() # Used for deciding which homologue goes first in alternating during recombination
if num % 2 == 1: # Male ancestor
if rand < 0.5:
p[(num, g)] = recomb_a_first(m[(num*2, g+1)], cut_spots)
else:
p[(num, g)] = recomb_b_first(m[(num*2, g+1)], cut_spots)
else: # Female ancestor
if rand < 0.5:
homol_a = recomb_a_first(m[(num*2, g+1)], cut_spots)
else:
homol_a = recomb_b_first(m[(num*2, g+1)], cut_spots)
homol_b = p[(num*2-1, g+1)]
m[(num, g)] = np.column_stack((homol_a, homol_b))

# Since there won't be a 2 to make m2_0, it's hard-coded as a sib. to p1_0:
random.shuffle(spots)
num_of_recombs = np.random.poisson(rand_fish)
cut_spots = sorted(spots[:num_of_recombs])
if random.random() < 0.5:
homol_a = recomb_a_first(m[(2, 1)], cut_spots)
else:
homol_a = recomb_b_first(m[(2, 1)], cut_spots)
homol_b = p[(1, 1)]
#s1 = np.column_stack((homol_a, homol_b)) # Checking 'in common' doesn't work if it's an array.
m[(2, 0)] = homol_a + homol_b # Ok to concat. homologues because m2_0 has no descendants

# Calculate percent in common with ancestors. When the final descendant is female, the
# percentage is that of just one homologue. To find the percentage of her total X-DNA,
# genes_num would have to be multiplied by 2, but I believe that would be misleading.
# W/o multiplying by 2, it's comparable to amount of DNA rather than just percentage,
# and allows comparison to male descendants' DNA amount.
# Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0:
for num in range(mat_start_num, pat_start_num):
if num % 2 == 1: # odd means pat.
m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num))
else: # even means mat.
m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num))
# Ones who could be anc. of both m2_0 and p1_0:
for num in range(pat_start_num, 2**gen + 1):
if num % 2 == 1: # odd means pat.
p1_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'p' + str(num) + '*')) / genes_num))
m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num))
else: # even means mat.
p1_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'm' + str(num) + '*')) / genes_num))
m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num))
# 'For loop' for trials ends here

import statistics

# combos() and kRecur() were modified from here:
# https://stackoverflow.com/questions/7074051/what-is-the-best-way-to-generate-all-possible-three-letter-strings
# which was modified by alphahmed from a solution by Abhinav Ramana on GeeksforGeeks.org.
def combos(letters, k):
out = []
l = len(letters)
kRecur(letters, "", l, k, out)
return(out)

def kRecur(letters, prfx, l, k, out):
if k == 0:
out = out.append(prfx)
else:
for i in range(l):
newPrfx = prfx + letters[i]
kRecur(letters, newPrfx, l, k-1, out)
if len(out) == 2**k:
return out

order_labels = sorted(combos(['m','p'], gen), reverse = True)

confid = 90
lo = (100 - confid) / 2
hi = 100 - lo

# Print ("pv" stands for print variable)
for num in range(mat_start_num, pat_start_num):
if num % 2 == 1: # odd means ancestor is pat.
pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
else: # even means ancestor is mat.
pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

for num in range(pat_start_num, 2**gen + 1):
if num % 2 == 1: # odd means ancestor is pat.
pv = p1_0_in_common_w_p[str(num) + '_' + str(gen)]
print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

else: # even means ancestor is mat.
pv = p1_0_in_common_w_m[str(num) + '_' + str(gen)]
print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')```