# Copyright (c) 2020 Brit Nicholson. # All rights reserved. This program is made available under # the terms of the Simple Non Code License (SNCL) v2.3.0, # which is available at # https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt # This is the code that was used to calculate expected percentages of X-DNA shared # between a person and various ancestors. More information can be found here: # http://www.dna-sci.com/2020/03/23/modeling-the-inheritance-of-x-chromosome-dna/ import numpy as np import random import fnmatch def recomb_a_first(mat_anc, fun_cut_spots, pat_anc=None): mat_a = mat_anc[:,0].tolist() mat_b = mat_anc[:,1].tolist() if len(cut_spots) == 0: recomb_out = mat_a return recomb_out recomb_out = [] for i in range(0, len(fun_cut_spots)): if i == 0: recomb_out = mat_a[0:fun_cut_spots[0]] if i % 2 == 1: recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = False else: recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = True if last_a: recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)]) else: recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)]) return recomb_out def recomb_b_first(mat_anc, fun_cut_spots, pat_anc=None): mat_a = mat_anc[:,0].tolist() mat_b = mat_anc[:,1].tolist() if len(cut_spots) == 0: recomb_out = mat_b return recomb_out recomb_out = [] for i in range(0, len(fun_cut_spots)): if i == 0: recomb_out = mat_b[0:fun_cut_spots[0]] if i % 2 == 1: recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = True else: recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = False if last_a: recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)]) else: recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)]) return recomb_out def find_start(g): start_out = 2 for g in range(2, g + 1): if g % 2 == 1: #odd start_out = start_out*2 else: #even start_out = start_out*2 - 1 return start_out # Gen. = 2 simulates grandparents, 3 simulates great-grandparents, etc. gen = 5 # Shouldn't pick less than 14 for genes_num if λ = 1.7. If the random Poisson variable ends up larger than # genes_num, it will simply recombine in fewer places than the random variable designated (w/o error, surprisingly). # In a test of 500m trials, the largest random Poisson value I got (w/ avg. = 1.7) was 13 genes_num = 14 genes_list = list(range(0, genes_num)) spots = list(range(1, genes_num)) # λ rand_fish = 1.7 trials = 500000 # Mat. is the same only shifted back one mat_start_num = find_start(gen-1) pat_start_num = find_start(gen) # Need to name output variables in order to later add percentages for each trial run # Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0: m2_0_in_common_w_p, m2_0_in_common_w_m, p1_0_in_common_w_p, m2_0_in_common_w_p,\ p1_0_in_common_w_m, m2_0_in_common_w_m = ({} for i in range(6)) for num in range(mat_start_num, pat_start_num): if num % 2 == 1: # odd means pat. m2_0_in_common_w_p[str(num) + '_' + str(gen)] = [] else: # even means mat. m2_0_in_common_w_m[str(num) + '_' + str(gen)] = [] # Ones who could be anc. of both m2_0 and p1_0 for num in range(pat_start_num, 2**gen + 1): if num % 2 == 1: # odd means pat. p1_0_in_common_w_p[str(num) + '_' + str(gen)] = [] m2_0_in_common_w_p[str(num) + '_' + str(gen)] = [] else: # even means mat. p1_0_in_common_w_m[str(num) + '_' + str(gen)] = [] m2_0_in_common_w_m[str(num) + '_' + str(gen)] = [] p, m = {}, {} # Begin the trials 'for loop' (the crux of the model): for i_trials in range(0, trials): # Make the set of anc. at gen. steps back for num in range(1, 2**gen + 1): if num % 2 == 1: # Male ancestor p[(num, gen)] = ['p' + str(num) + '_' + str(i) for i in genes_list] else: # Female ancestor. Make two columns, signifying homologues homol_a = ['m' + str(num) + '_a' + str(i) for i in genes_list] homol_b = ['m' + str(num) + '_b' + str(i) for i in genes_list] m[(num, gen)] = np.column_stack((homol_a, homol_b)) random.shuffle(spots) # Populate the rest of the tree except sibs.: for g in range(gen - 1, -1, -1): for num in range(1, 2**g + 1): num_of_recombs = np.random.poisson(rand_fish) cut_spots = sorted(spots[:num_of_recombs]) rand = random.random() # Used for deciding which homologue goes first in alternating during recombination if num % 2 == 1: # Male ancestor if rand < 0.5: p[(num, g)] = recomb_a_first(m[(num*2, g+1)], cut_spots) else: p[(num, g)] = recomb_b_first(m[(num*2, g+1)], cut_spots) else: # Female ancestor if rand < 0.5: homol_a = recomb_a_first(m[(num*2, g+1)], cut_spots) else: homol_a = recomb_b_first(m[(num*2, g+1)], cut_spots) homol_b = p[(num*2-1, g+1)] m[(num, g)] = np.column_stack((homol_a, homol_b)) # Since there won't be a 2 to make m2_0, it's hard-coded as a sib. to p1_0: random.shuffle(spots) num_of_recombs = np.random.poisson(rand_fish) cut_spots = sorted(spots[:num_of_recombs]) if random.random() < 0.5: homol_a = recomb_a_first(m[(2, 1)], cut_spots) else: homol_a = recomb_b_first(m[(2, 1)], cut_spots) homol_b = p[(1, 1)] #s1 = np.column_stack((homol_a, homol_b)) # Checking 'in common' doesn't work if it's an array. m[(2, 0)] = homol_a + homol_b # Ok to concat. homologues because m2_0 has no descendants # Calculate percent in common with ancestors. When the final descendant is female, the # percentage is that of just one homologue. To find the percentage of her total X-DNA, # genes_num would have to be multiplied by 2, but I believe that would be misleading. # W/o multiplying by 2, it's comparable to amount of DNA rather than just percentage, # and allows comparison to male descendants' DNA amount. # Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0: for num in range(mat_start_num, pat_start_num): if num % 2 == 1: # odd means pat. m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num)) else: # even means mat. m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num)) # Ones who could be anc. of both m2_0 and p1_0: for num in range(pat_start_num, 2**gen + 1): if num % 2 == 1: # odd means pat. p1_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'p' + str(num) + '*')) / genes_num)) m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num)) else: # even means mat. p1_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'm' + str(num) + '*')) / genes_num)) m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num)) # 'For loop' for trials ends here import statistics # combos() and kRecur() were modified from here: # https://stackoverflow.com/questions/7074051/what-is-the-best-way-to-generate-all-possible-three-letter-strings # which was modified by alphahmed from a solution by Abhinav Ramana on GeeksforGeeks.org. def combos(letters, k): out = [] l = len(letters) kRecur(letters, "", l, k, out) return(out) def kRecur(letters, prfx, l, k, out): if k == 0: out = out.append(prfx) else: for i in range(l): newPrfx = prfx + letters[i] kRecur(letters, newPrfx, l, k-1, out) if len(out) == 2**k: return out order_labels = sorted(combos(['m','p'], gen), reverse = True) confid = 90 lo = (100 - confid) / 2 hi = 100 - lo # Print ("pv" stands for print variable) for num in range(mat_start_num, pat_start_num): if num % 2 == 1: # odd means ancestor is pat. pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\ 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\ 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\ pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') else: # even means ancestor is mat. pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\ 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\ 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\ pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') for num in range(pat_start_num, 2**gen + 1): if num % 2 == 1: # odd means ancestor is pat. pv = p1_0_in_common_w_p[str(num) + '_' + str(gen)] print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\ 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\ 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\ pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\ 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\ 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\ pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') else: # even means ancestor is mat. pv = p1_0_in_common_w_m[str(num) + '_' + str(gen)] print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\ 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\ 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\ pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',\ 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',\ 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',\ pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
Recent Comments