I’ve written two Python algorithms that I describe in this article. The algorithm below works for downloaded segment-level match files from GEDmatch. The other algorithm, found here, is for match files downloaded from MyHeritage.
# Copyright (c) 2020 Brit Nicholson. # All rights reserved. This program is made available under # the terms of the Simple Non Code License (SNCL) v2.3.0, # which is available at # https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt # This program finds the largest distinct DNA segments you have based on your DNA matches. # Optionally, you can run the code a second time. The goal would be that one run would find paternal segments and the other would find maternal segments, although, even before attempting such a program, one would expect the result to be far from perfect. # This code was developed using Python 3. import csv import pandas as pd DATA_FILE = 'Your MyHeritage Segment-Level Matching File Name.csv' fields = ['PrimaryKit','MatchedKit','chr','B37Start','B37End','Segment cM','SNPs','MatchedName','Matched Sex','MatchedEmail'] # Read in the data file df_file = pd.read_csv(DATA_FILE, delimiter=',', usecols=fields, encoding='latin-1') # Get rid of NaN values df_file.dropna(inplace=True) df_file = df_file.loc[~((df_file['SNPs'] < 200)),:] df_file['chr'] = df_file['chr'].astype(str) # Get rid of segments that match on less than 2300 SNPs df_raw = df_file.copy().drop_duplicates() # Get rid of unused columns. Match name isn’t needed for the program. It will be merged back in at the end. df_raw.drop(['PrimaryKit','MatchedKit','SNPs','MatchedName','Matched Sex','MatchedEmail'], axis=1, inplace=True) df_raw.columns = ['Chr.', 'Start', 'End', 'cM'] # Rename columns df_map = df_file.copy().drop_duplicates() del df_file df_map.drop(['PrimaryKit','Segment cM','Matched Sex'], axis=1, inplace=True) df_map.columns = ['Kit', 'Chr.', 'Start', 'End', 'SNPs', 'Name', 'Email'] # Initialize the output dataframe that will print results to a file. # This will be the same whether phase 2 is run or only phase 1. df_out = pd.DataFrame(columns=['Chr.','Start','End','cM']) chr_list = list(range(1, 23)) + ['X'] # Loop through all 23 chromosomes for chro_i in chr_list: chro_i_str = str(chro_i) df = df_raw[df_raw['Chr.'] == chro_i_str].copy().drop(['Chr.'], axis=1) # Need to sort by Start, then by cM, otherwise a smaller segment could get saved even if it's followed by a larger segment w/ the same start value (because the first segment has Overlap = NaN) df = df.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True) df['MaxEnd'] = df['End'].cummax() # Drop segments that are fully contained w/in a previous segment. df.drop(df[df['MaxEnd'] > df['End']].index, inplace=True) df.drop('MaxEnd', axis=1, inplace=True) # This will be for phase 1 df1 = df.copy() df1 = df1.sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True) # df1out = df1.loc[0][:] # creates a series df1out = pd.DataFrame({'Start' : [df1.loc[0][0]], 'End' : [df1.loc[0][1]], 'cM' : [df1.loc[0][2]]}) df1out = df1out.astype({"Start":'int', "End":'int'}) cols = list(df1out.columns) a, b, c = cols.index('End'), cols.index('Start'), cols.index('cM') cols[b], cols[a], cols[c] = cols[a], cols[b], cols[c] df1out = df1out[cols] new = [False] # 0 for old, 1 for new # Make a column called 'New'. New rows are the only ones that can be dropped from the output. df1out.insert(loc=3, column='New', value=new) # Find non-overlapping segments for phase 1. This could be a function since the entire ‘for loop’ is called twice in this program. for i in range(1, len(df1.index)): # Append a new (ith) row to the ouput newrow = pd.DataFrame({'Start' : [df1.loc[i][0]], 'End' : [df1.loc[i][1]], 'cM' : [df1.loc[i][2]], 'New' : True}) df1out = df1out.append(newrow.loc[0, :]) # Overlap will be calculated between a segment and the previous one, so rows need to be sorted based on start values. df1out = df1out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True) # Calculate the overlap between a segment and the one from the previous row df1out['Overlap'] = df1out['End'].shift(1) - df1out['Start'] # Checks overlap for all rows in case the new row has too much overlap. # Maybe in the future change it to check only the new row and those before/after it, which would be faster if (df1out['Overlap'] > 1000000).any(): # Drop new seg. if any rows have overlap df1out.drop(df1out[df1out['New'] == True].index, inplace=True) # Need to change any New = 1 to New = 0 before the next iteration in case a new row remains in the dataframe df1out.loc[:,'New'] = 0 # End of 'for loop' to find non-overlapping segs. for a given chromosome del df1 df1out.drop(['New', 'Overlap'], axis=1, inplace=True) # Need to keep df1out for phase 2 ################# # Begin Phase 2 # ################# # Keep only rows that didn't make it to the output of phase 1. The following method will only work if duplicates have already been removed from both dataframes (which is true): df2 = pd.concat([df, df1out]).drop_duplicates(keep=False).sort_values(['cM', 'Start'],ascending=[False, True]).reset_index(drop=True) del df # Most of the rest of this code is trying to find a large cM row that overlaps a Phase 1 output row by ~50% # Need to delete everything except for 6-10 segs. or so using df1out for better coverage and to make it likely that the Phase 2 output is from the other homologue. df1_test = df1out.copy().sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True) df1_test = df1_test.astype({"Start":'int', "End":'int'}) # Use the 6 largest segs. from Phase 1 to find a good overlapping seg. (may include more in the future for higher likelyhood of finding on on the opposite homologue) df1_test = df1_test.iloc[0:5] # Ensure decent coverage from df2_test df1_start = df1_test['Start'].min() df1_end = df1_test['End'].max() # Could change this 7, maybe to the average number of segments (minus one) in the output of Phase 1: bin_num = 7 # Makes n + 1 bins bin_start = list(range(0, df1_end - int((df1_end - df1_start)/bin_num), int((df1_end - df1_start)/bin_num))) bin_end = list(range(int((df1_end - df1_start)/bin_num), df1_end, int((df1_end - df1_start)/bin_num))) df2_test = pd.DataFrame({'Start' : bin_start, 'End' : bin_end}) cols = list(df2_test.columns) a, b = cols.index('End'), cols.index('Start') cols[b], cols[a] = cols[a], cols[b] df2_test = df2_test[cols] df2_test['cM'] = 0 df2_test = pd.concat([df2, df2_test]).sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True) # df2_test will become really large, but the next line will only keep the largest cM rows in between the ones made above w/ 0 cM # Resulting segs. are large w/ good coverage for comparing to df1_out. Have confirmed that some can overlap. df2_test.drop(df2_test[(df2_test['cM'] == 0) | (df2_test['cM'] <= df2_test['cM'].shift(1))].index, inplace=True) # Make sure df2 is sorted by cM and indices are reset df2_test = pd.concat([df2.iloc[0:20], df2_test]).drop_duplicates().reset_index(drop=True) df2_test_out = pd.DataFrame(columns=['Start','End','cM','Overlap']) for i in range(0, len(df2_test.index)): for j in range(0, len(df1_test.index)): # Ensure there is overlap: if (df2_test.loc[i]['Start'] < df1_test.loc[j]['End']) & (df1_test.loc[j]['Start'] < df2_test.loc[i]['End']): overlap = df2_test.loc[i]['End'] - df1_test.loc[j]['Start'] if overlap < 0: overlap = df1_test.loc[j]['End'] - df2_test.loc[i]['Start'] # Get the abs. fraction of overlap overlap = abs(df2_test.loc[i]['End'] - df2_test.loc[i]['Start'])/overlap if (0.25 < overlap) & (overlap < 0.75): newrow = pd.DataFrame({'Start' : [df2_test.loc[i]['Start']], 'End' : [df2_test.loc[i]['End']], 'cM' : [df2_test.loc[i]['cM']], 'Overlap' : overlap}) df2_test_out = df2_test_out.append(newrow.loc[0, :]) # Break the inner (j) loop, since df1_test is only non-overlapping segs. break del df1_test del df2_test df2_test_out['Overlap'] = abs(0.5 - df2_test_out['Overlap']) df2_test_out['Overlap'] = (df2_test_out['cM']**3)/(df2_test_out['Overlap']) df2out = df2_test_out[df2_test_out['Overlap'] == df2_test_out['Overlap'].max()].copy() df2out.drop('Overlap', axis=1, inplace=True) del df2_test_out new = [False] # 0 for old, 1 for new df2out.insert(loc=3, column='New', value=new) # Find non-overlapping segments for phase 2. This could be a function since the entire ‘for loop’ is called twice in this program. for i in range(1, len(df2.index)): # Append a new (ith) row to the ouput newrow = pd.DataFrame({'Start' : [df2.loc[i][0]], 'End' : [df2.loc[i][1]], 'cM' : [df2.loc[i][2]], 'New' : True}) df2out = df2out.append(newrow.loc[0, :]) # Overlap will be calculated between a segment and the previous one, so rows need to be sorted based on start values. df2out = df2out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True) # Calculate the overlap between a segment and the one from the previous row df2out['Overlap'] = df2out['End'].shift(1) - df2out['Start'] # Checks overlap for all rows in case the new row has too much overlap. # Maybe in the future change it to check only the new row and those before/after it, which would be faster if (df2out['Overlap'] > 1000000).any(): # Drop new seg. if any rows have overlap df2out.drop(df2out[df2out['New'] == True].index, inplace=True) # Need to change any New = 1 to New = 0 before the next iteration in case a new row remains in the dataframe df2out.loc[:,'New'] = 0 del df2 # End of 'for loop' to find non-overlapping segs. for a given chromosome df1out.insert(loc=0, column='Chr.', value=chro_i_str) df1out.loc[:,'Chr.'] = chro_i_str df2out.drop(['New', 'Overlap'], axis=1, inplace=True) df2out.insert(loc=0, column='Chr.', value=chro_i_str) df2out.loc[:,'Chr.'] = chro_i_str df_out = df_out.append(df1out) del df1out df_out = df_out.append(df2out) del df2out # 'For loop' ends here for chrs. 1-22 df_out = df_out.astype({"Start":'int', "End":'int'}) df_out = df_out.merge(df_map, how='left', on=['Chr.','Start','End'], copy=False) cols = list(df_out.columns) a, b, c, d, e, f, g, h = cols.index('Chr.'), cols.index('Start'), cols.index('End'), cols.index('Name'), cols.index('Kit'), cols.index('cM'), cols.index('SNPs'), cols.index('Email') cols[a], cols[b], cols[c], cols[f], cols[e], cols[g], cols[d], cols[h] = cols[a], cols[b], cols[c], cols[d], cols[e], cols[f], cols[g], cols[h] df_out = df_out[cols] print(df_out) # Need to drop index first or while printing df_out.to_csv('Your MyHeritage Distinct Segment Output File Name.csv', index=False)
Recent Comments