# Copyright (c) 2020 Brit Nicholson. # All rights reserved. This program is made available under # the terms of the Simple Non Code License (SNCL) v2.3.0, # which is available at # https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt # Training model for determining the average number of "segments" passed from a simulated # parent to a child based on the known standard deviation between siblings. import numpy as np import random import math import pandas as pd # The below function allows a "child" to "inherit" 50% of each of his/her parents' DNA # The halved # of segments that a parent has available is passed as the 1st arg. # The labeled "DNA segments" of the first parent are passed as the 2nd arg. # An optional argument is the labeled segments of the 2nd parent. This is always used here. def recomb(fun_half_segs, first_anc, second_anc=None): # Half of the 1st parent's segments are passed out = random.sample(first_anc, fun_half_segs) if second_anc != None: # Half of the 2nd parent's segments are passed (added to the list) out.extend(random.sample(second_anc, fun_half_segs)) return out # Initialize the number of segments segs = 96.95 # Assuming std_dev is just under 0.036. # Doesn't really matter if it's + or - at first. It will correct itself. difference = -0.0001 segs_out = [] mean_out = [] std_dev_out = [] # The simulation appears to keep running with the value: 0.000001. # With one less zero it would sometimes exit the while loop # when the number of segments was around 97.5 while abs(difference) > 0.000001: if difference > 0: # If std_dev is high, the # of segs. needs to be increased # Got too much variability with 0.001, even w/ 200,401 sib_pairs segs += 0.05 else: # If std_dev is low, the # of segs. needs to be decreased segs -= 0.05 percent_same = [] # This is how many trials will be done to compair a pair of siblings sib_pairs = 200401 for i in range(0, sib_pairs): # Only the decimal portion remaining after halving the segments. # This allows #s other than even whole integer #s of segments. seg_decimal = segs/2 - int(segs/2) if np.random.random() > seg_decimal: # If the decimal is 0.75, round down the halved # of segments 25% of the time. half_segs = math.floor(segs/2) else: # If the decimal is 0.75, round up the halved # of segments 75% of the time. half_segs = math.ceil(segs/2) # Make labeled "segments" for maternal and paternal ancestors (parents here): m = list(range(0, half_segs*2)) p = list(m) m = ['m' + str(i) for i in m] p = ['p' + str(i) for i in p] # Call the function for the first sibling s1 = recomb(half_segs, m, p) # Call the function for the second sibling s2 = recomb(half_segs, m, p) # Take the unique intersection (shared segments) of the siblings in_common = list(set(s1) & set(s2)) # Calculate the fraction of segments per sibling that is shared percent_same.append(len(in_common)/(2*half_segs)) # Find the standard deviation of all of the shared fractions std_dev = np.std(percent_same) # Find if the model overshot or undershot, for future correction difference = std_dev - 0.036 segs_out.append(segs) mean_out.append(np.mean(percent_same)) std_dev_out.append(std_dev) print(segs) print(std_dev) #print(min(percent_same)) #print(max(percent_same))
Recent Comments