Select Page
# Copyright (c) 2020 Brit Nicholson.
# All rights reserved. This program is made available under
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at
# https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt

# This program finds the largest distinct DNA segments you have based on your DNA matches.
# Optionally, you can run the code a second time. The goal would be that one run would find
# paternal segments and the other would find maternal segments, although, even before
# attempting such a program, one would expect the result to be far from perfect.

import csv
import pandas as pd

DATA_FILE = 'Your MyHeritage Segment-Level Matching File Name'
fields = ['Name', 'Match name', 'Chromosome', 'Start Location', 'End Location', 'Centimorgans', 'SNPs']

# Read in the data file
df_file = pd.read_csv(DATA_FILE, delimiter=',', usecols=fields, encoding='latin-1')

# Get rid of NaN values
df_file.dropna(inplace=True)

# Get rid of segments that match on less than 2300 SNPs
df_raw = df_file[df_file['SNPs'] >= 2300].copy().drop_duplicates()

# Get rid of unused columns. Match name isn’t needed for the program. It will be merged back in at the end.
df_raw.drop(['Name','Match name','SNPs'], axis=1, inplace=True)

df_raw.columns = ['Chr.', 'Start', 'End', 'cM'] # Rename columns

df_map = df_file.copy().drop_duplicates()

del df_file

df_map.drop(['Name','Centimorgans'], axis=1, inplace=True)

df_map.columns = ['Name', 'Chr.', 'Start', 'End', 'SNPs']

# Initialize the output dataframe that will print results to a file.
# This will be the same whether phase 2 is run or only phase 1.
df_out = pd.DataFrame(columns=['Chr.','Start','End','cM'])

# Loop through all 22 chromosomes
for chro_i in range(1, 23):

  df = df_raw[df_raw['Chr.'] == chro_i].copy().drop(['Chr.'], axis=1)

  # Need to sort by Start, then by cM, otherwise a smaller segment could get saved even if it's followed by a larger segment w/ the same start value (because the first segment has Overlap = NaN)
  df = df.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)

  df['MaxEnd'] = df['End'].cummax()

  # Drop segments that are fully contained w/in a previous segment.
  df.drop(df[df['MaxEnd'] > df['End']].index, inplace=True)

  df.drop('MaxEnd', axis=1, inplace=True)

  # This will be for phase 1
  df1 = df.copy()

  df1 = df1.sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True)

  # df1out = df1.loc[0][:] # creates a series
  df1out = pd.DataFrame({'Start' : [df1.loc[0][0]], 'End' : [df1.loc[0][1]], 'cM' : [df1.loc[0][2]]})
  cols = list(df1out.columns)
  a, b, c = cols.index('End'), cols.index('Start'), cols.index('cM')
  cols[b], cols[a], cols[c] = cols[a], cols[b], cols[c]
  df1out = df1out[cols]
    
  new = [False] # 0 for old, 1 for new
  # Make a column called 'New'. New rows are the only ones that can be dropped from the output.
  df1out.insert(loc=3, column='New', value=new)

  # Find non-overlapping segments for phase 1. This should probably be a function since the entire ‘for loop’ is called twice in this program.
  for i in range(1, len(df1.index)):

    # Append a new (ith) row to the ouput
    newrow = pd.DataFrame({'Start' : [df1.loc[i][0]], 'End' : [df1.loc[i][1]], 'cM' : [df1.loc[i][2]], 'New' : True})
    df1out = df1out.append(newrow.loc[0, :])
    
    # Overlap will be calculated between a segment and the previous one, so rows need to be sorted based on start values.
    df1out = df1out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)

    # Calculate the overlap between a segment and the one from the previous row
    df1out['Overlap'] = df1out['End'].shift(1) - df1out['Start']
    
    # Checks overlap for all rows in case the new row has too much overlap.
    # Maybe in the future change it to check only the new row and those before/after it, which would be faster
    if (df1out['Overlap'] > 1000000).any():

      # Drop new seg. if any rows have overlap
      df1out.drop(df1out[df1out['New'] == True].index, inplace=True)

    # Need to change any New = 1 to New = 0 before the next iteration in case a new row remains in the dataframe
    df1out.loc[:,'New'] = 0
  # End of 'for loop' to find non-overlapping segs. for a given chromosome
  
  del df1

  df1out.drop(['New', 'Overlap'], axis=1, inplace=True)
  # Need to keep df1out for phase 2
 
  #################
  # Begin Phase 2 #
  #################

  # Keep only rows that didn't make it to the output of phase 1. The following method will only work if duplicates have already been removed from both dataframes (which is true):
  df2 = pd.concat([df, df1out]).drop_duplicates(keep=False).sort_values(['cM', 'Start'],ascending=[False, True]).reset_index(drop=True)

  del df
  # Most of the rest of this code is trying to find a large cM row that overlaps a Phase 1 output row by ~50%

  # Need to delete everything except for the top ten segs. or so. using df1out (about top 10--change it once I have enough rows) for better coverage and to make it likely that the Phase 2 output is from the other homologue.
  df1_test = df1out.sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True).astype(int)

  # Use the 6 largest segs. from Phase 1 to find a good overlapping seg.
  df1_test = df1_test.iloc[0:5]

  # Ensure decent coverage from df2_test
  df1_start = df1_test['Start'].min()
  df1_end = df1_test['End'].max()

  # Could change this 7, maybe to the average number of segments (minus one) in the output of Phase 1:
  bin_num = 7 # Makes n + 1 bins

  bin_start = list(range(0, df1_end - int((df1_end - df1_start)/bin_num), int((df1_end - df1_start)/bin_num)))

  bin_end = list(range(int((df1_end - df1_start)/bin_num), df1_end, int((df1_end - df1_start)/bin_num)))

  df2_test = pd.DataFrame({'Start' : bin_start, 'End' : bin_end})
  cols = list(df2_test.columns)
  a, b = cols.index('End'), cols.index('Start')
  cols[b], cols[a] = cols[a], cols[b]
  df2_test = df2_test[cols]
  df2_test['cM'] = 0

  df2_test = pd.concat([df2, df2_test]).sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)
  # df2_test will become really large, but the next line will only keep the largest cM rows in between the ones made above w/ 0 cM
  
  # Resulting segs. are large w/ good coverage for comparing to df1_out. Have confirmed that some can overlap.
  df2_test.drop(df2_test[(df2_test['cM'] == 0) | (df2_test['cM'] <= df2_test['cM'].shift(1))].index, inplace=True)

  # Make sure df2 is sorted by cM and indices are reset
  # Might want to add back in the ~10 largest segs. just in case the above ones don't have good overlap
  df2_test = pd.concat([df2.iloc[0:20], df2_test]).drop_duplicates().reset_index(drop=True)

  df2_test_out = pd.DataFrame(columns=['Start','End','cM','Overlap'])
  for i in range(0, len(df2_test.index)):
    for j in range(0, len(df1_test.index)):
    # Ensure there is overlap:
      if (df2_test.loc[i]['Start'] < df1_test.loc[j]['End']) & (df1_test.loc[j]['Start'] < df2_test.loc[i]['End']):

        overlap = df2_test.loc[i]['End'] - df1_test.loc[j]['Start']

        if overlap < 0:
          overlap = df1_test.loc[j]['End'] - df2_test.loc[i]['Start']

        # Get the abs. fraction of overlap
        overlap = abs(df2_test.loc[i]['End'] - df2_test.loc[i]['Start'])/overlap

        if (0.25 < overlap) & (overlap < 0.75):

          newrow = pd.DataFrame({'Start' : [df2_test.loc[i]['Start']], 'End' : [df2_test.loc[i]['End']], 'cM' : [df2_test.loc[i]['cM']], 'Overlap' : overlap})

          df2_test_out = df2_test_out.append(newrow.loc[0, :])
        
          # Break the inner (j) loop, since df1_test is only non-overlapping segs.
          break

  del df1_test
  del df2_test
  
  df2_test_out['Overlap'] = abs(0.5 - df2_test_out['Overlap'])
    
  df2_test_out['Overlap'] = (df2_test_out['cM']**3)/(df2_test_out['Overlap'])

  df2out = df2_test_out[df2_test_out['Overlap'] == df2_test_out['Overlap'].max()].copy()
  df2out.drop('Overlap', axis=1, inplace=True)

  del df2_test_out

  new = [False] # 0 for old, 1 for new
  df2out.insert(loc=3, column='New', value=new)

  # Find non-overlapping segments for phase 2. This should probably be a function since the entire ‘for loop’ is called twice in this program.
  for i in range(1, len(df2.index)):

    # Append a new (ith) row to the ouput
    newrow = pd.DataFrame({'Start' : [df2.loc[i][0]], 'End' : [df2.loc[i][1]], 'cM' : [df2.loc[i][2]], 'New' : True})
    df2out = df2out.append(newrow.loc[0, :])

    # Overlap will be calculated between a segment and the previous one, so rows need to be sorted based on start values.
    df2out = df2out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)

    # Calculate the overlap between a segment and the one from the previous row
    df2out['Overlap'] = df2out['End'].shift(1) - df2out['Start']

    # Checks overlap for all rows in case the new row has too much overlap.
    # Maybe in the future change it to check only the new row and those before/after it, which would be faster
    if (df2out['Overlap'] > 1000000).any():

      # Drop new seg. if any rows have overlap
      df2out.drop(df2out[df2out['New'] == True].index, inplace=True)

    # Need to change any New = 1 to New = 0 before the next iteration in case a new row remains in the dataframe
    df2out.loc[:,'New'] = 0
    
  del df2
  # End of 'for loop' to find non-overlapping segs. for a given chromosome
  
  df1out.insert(loc=0, column='Chr.', value=chro_i)
  df1out.loc[:,'Chr.'] = chro_i
    
  df2out.drop(['New', 'Overlap'], axis=1, inplace=True)
  df2out.insert(loc=0, column='Chr.', value=chro_i)
  df2out.loc[:,'Chr.'] = chro_i

  df_out = df_out.append(df1out)
  del df1out
  df_out = df_out.append(df2out)
  del df2out

# 'For loop' ends here for chrs. 1-22

df_out = df_out.merge(df_map, how='left', on=['Chr.','Start','End'], copy=False)
cols = list(df_out.columns)
a, b, c, d, e = cols.index('Chr.'), cols.index('Start'), cols.index('End'), cols.index('Name'), cols.index('cM')
cols[a], cols[b], cols[c], cols[e], cols[d] = cols[a], cols[b], cols[c], cols[d], cols[e]
df_out = df_out[cols]
print(df_out)

# Need to drop index first or while printing
df_out.to_csv('Your MyHeritage Segment-Level Matching File Name.csv', index=False)