Select Page
```# Copyright (c) 2020 Brit Nicholson.
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at

# Training model for determining the average number of "segments" passed from a simulated
# parent to a child based on the known standard deviation between siblings.

import numpy as np
import random
import math
import pandas as pd

# The below function allows a "child" to "inherit" 50% of each of his/her parents' DNA
# The halved # of segments that a parent has available is passed as the 1st arg.
# The labeled "DNA segments" of the first parent are passed as the 2nd arg.
# An optional argument is the labeled segments of the 2nd parent. This is always used here.
def recomb(fun_half_segs, first_anc, second_anc=None):

# Half of the 1st parent's segments are passed
out = random.sample(first_anc, fun_half_segs)

if second_anc != None:
# Half of the 2nd parent's segments are passed (added to the list)
out.extend(random.sample(second_anc, fun_half_segs))
return out

# Initialize the number of segments
segs = 96.95

# Assuming std_dev is just under 0.036.
# Doesn't really matter if it's + or - at first. It will correct itself.
difference = -0.0001

segs_out = []
mean_out = []
std_dev_out = []

# The simulation appears to keep running with the value: 0.000001.
# With one less zero it would sometimes exit the while loop
# when the number of segments was around 97.5
while abs(difference) > 0.000001:

if difference > 0:
# If std_dev is high, the # of segs. needs to be increased
# Got too much variability with 0.001, even w/ 200,401 sib_pairs
segs += 0.05
else:
# If std_dev is low, the # of segs. needs to be decreased
segs -= 0.05

percent_same = []

# This is how many trials will be done to compair a pair of siblings
sib_pairs = 200401
for i in range(0, sib_pairs):

# Only the decimal portion remaining after halving the segments.
# This allows #s other than even whole integer #s of segments.
seg_decimal = segs/2 - int(segs/2)

if np.random.random() > seg_decimal:
# If the decimal is 0.75, round down the halved # of segments 25% of the time.
half_segs = math.floor(segs/2)
else:
# If the decimal is 0.75, round up the halved # of segments 75% of the time.
half_segs = math.ceil(segs/2)

# Make labeled "segments" for maternal and paternal ancestors (parents here):
m = list(range(0, half_segs*2))
p = list(m)

m = ['m' + str(i) for i in m]
p = ['p' + str(i) for i in p]

# Call the function for the first sibling
s1 = recomb(half_segs, m, p)

# Call the function for the second sibling
s2 = recomb(half_segs, m, p)

# Take the unique intersection (shared segments) of the siblings
in_common = list(set(s1) & set(s2))

# Calculate the fraction of segments per sibling that is shared
percent_same.append(len(in_common)/(2*half_segs))

# Find the standard deviation of all of the shared fractions
std_dev = np.std(percent_same)

# Find if the model overshot or undershot, for future correction
difference = std_dev - 0.036

segs_out.append(segs)
mean_out.append(np.mean(percent_same))
std_dev_out.append(std_dev)
print(segs)
print(std_dev)
#print(min(percent_same))
#print(max(percent_same))```