Select Page
# Copyright (c) 2020 Brit Nicholson.
# All rights reserved. This program is made available under
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at
# https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt

# Training model for determining the average number of "segments" passed from a simulated
# parent to a child based on the known standard deviation between siblings.

import numpy as np
import random
import math
import pandas as pd

# The below function allows a "child" to "inherit" 50% of each of his/her parents' DNA
# The halved # of segments that a parent has available is passed as the 1st arg.
# The labeled "DNA segments" of the first parent are passed as the 2nd arg.
# An optional argument is the labeled segments of the 2nd parent. This is always used here.
def recomb(fun_half_segs, first_anc, second_anc=None):

  # Half of the 1st parent's segments are passed
  out = random.sample(first_anc, fun_half_segs)

  if second_anc != None:
    # Half of the 2nd parent's segments are passed (added to the list)
    out.extend(random.sample(second_anc, fun_half_segs))
  return out

# Initialize the number of segments
segs = 96.95

# Assuming std_dev is just under 0.036.
# Doesn't really matter if it's + or - at first. It will correct itself.
difference = -0.0001 

segs_out = []
mean_out = []
std_dev_out = []

# The simulation appears to keep running with the value: 0.000001.
# With one less zero it would sometimes exit the while loop
# when the number of segments was around 97.5
while abs(difference) > 0.000001:

  if difference > 0:
    # If std_dev is high, the # of segs. needs to be increased
    # Got too much variability with 0.001, even w/ 200,401 sib_pairs
    segs += 0.05
  else:
    # If std_dev is low, the # of segs. needs to be decreased
    segs -= 0.05

  percent_same = []

  # This is how many trials will be done to compair a pair of siblings
  sib_pairs = 200401
  for i in range(0, sib_pairs):

    # Only the decimal portion remaining after halving the segments.
    # This allows #s other than even whole integer #s of segments.
    seg_decimal = segs/2 - int(segs/2)

    if np.random.random() > seg_decimal:
      # If the decimal is 0.75, round down the halved # of segments 25% of the time.
      half_segs = math.floor(segs/2)
    else:
      # If the decimal is 0.75, round up the halved # of segments 75% of the time.
      half_segs = math.ceil(segs/2)

    # Make labeled "segments" for maternal and paternal ancestors (parents here):
    m = list(range(0, half_segs*2))
    p = list(m)

    m = ['m' + str(i) for i in m]
    p = ['p' + str(i) for i in p]
  
    # Call the function for the first sibling
    s1 = recomb(half_segs, m, p)
  
    # Call the function for the second sibling
    s2 = recomb(half_segs, m, p)
  
    # Take the unique intersection (shared segments) of the siblings
    in_common = list(set(s1) & set(s2))
  
    # Calculate the fraction of segments per sibling that is shared
    percent_same.append(len(in_common)/(2*half_segs))

  # Find the standard deviation of all of the shared fractions
  std_dev = np.std(percent_same)
    
  # Find if the model overshot or undershot, for future correction
  difference = std_dev - 0.036

  segs_out.append(segs)
  mean_out.append(np.mean(percent_same))
  std_dev_out.append(std_dev)
  print(segs)
  print(std_dev)
  #print(min(percent_same))
  #print(max(percent_same))