This script searches for all relatives in a 23 and Me account that share any chromosomome segments with a “Subject Relative”. This should make it easier to choose a starting point of deeper research into unknown relatives.

Import Data

df  <- read.csv("John_Doe_relatives_download.csv")

Clean and Process Data

df0 <- df %>%
  filter(!chromosome_number == "") %>%
  mutate(
    chromosome_number = ifelse(chromosome_number == "X",
                               "23", chromosome_number),
    percent_dna_shared = gsub("%", "", percent_dna_shared),
    chromosome_number = as.numeric(chromosome_number),
    chromosome_start_point = as.numeric(chromosome_start_point),
    chromosome_end_point = as.numeric(chromosome_end_point),
    genetic_distance = as.numeric(genetic_distance),
    num_snps = as.numeric(num_snps),
    birth_year = as.numeric(birth_year),
    percent_dna_shared =  as.numeric(percent_dna_shared),
    num_segments_shared = as.numeric(num_segments_shared)
  )

df1  <- df0 %>%
  select(display_name, chromosome_number, chromosome_start_point, chromosome_end_point)

Generate Dataframe of Subject/Relative

subject <- "John Doe" # User Input:  Enter Subject display_name
lower <- df1 %>%
  filter(display_name == subject) %>%
  select(chromosome_start_point)
upper <- df1 %>%
  filter(display_name == subject) %>%
  select(chromosome_end_point)
chromosome <- df1 %>%
  filter(display_name == subject) %>%
  select(chromosome_number)

df2 <- data.frame(
  subject,
  chromosome,
  lower,
  upper
)

head(df2)

Each observation of this subject generates a chromosome segment range as related to the the Profile Subject

Join Subject Dataframe to Main Dataframe

df3 <- left_join(df2, df1, by = 'chromosome_number', relationship = "many-to-many")

By rejoining the two datasets it is possible to see the chromosome segments that the near relative subject has in common with relatives of the Profile Subject.

Create Subject List from Nearest / Close relative

subject_list  <- df3 %>%
  select(subject,
         chromosome_start_point.x,
         chromosome_end_point.x,
         chromosome_number,
         display_name,
         chromosome_start_point.y,
         chromosome_end_point.y) %>%
  #Filter to Range
  filter(!display_name == subject) %>%
  filter(
      (chromosome_start_point.y <= chromosome_start_point.x &
      chromosome_end_point.y >= chromosome_end_point.x) |
      (chromosome_start_point.y <= chromosome_end_point.x &
      chromosome_end_point.y >= chromosome_end_point.x) |
      (chromosome_start_point.y > chromosome_start_point.x &
      chromosome_end_point.y < chromosome_end_point.x) |
         (chromosome_start_point.y > chromosome_start_point.x &
         chromosome_end_point.y < chromosome_end_point.x)
  ) %>%
  group_by(display_name) %>%
  arrange(display_name) %>%
  ungroup() %>%
  group_by(display_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  filter(count > 1) %>%
  select(display_name) %>%
  list()

subject_list

This list shows other relatives that the Subject shares segments with.

Loop Through Relative list and Generate Plot

df3 %>%
  filter(display_name %in% c(subject_list[[1]]$display_name))  %>%
  mutate(genetic_distance =  chromosome_end_point.y - chromosome_start_point.y) %>%
  arrange(genetic_distance)  %>%
  group_by(display_name) %>%
  summarize(total_distance = sum(genetic_distance) / 2) %>%
  arrange(desc(total_distance)) %>%
  View()

df3 %>%
  filter(display_name %in% c(subject_list[[1]]$display_name))  %>%
  ggplot(aes(x = chromosome_number, color = display_name)) +
  geom_linerange(aes(ymin = chromosome_start_point.y,
                     ymax = chromosome_end_point.y,
                     x = chromosome_number,
                     size = 3.5, alpha = 0.2)) +
  scale_y_continuous(labels = scales::label_number(accuracy = 1)) +
  scale_x_continuous(breaks = 1:23) +
  theme_bw() +
  labs(title = paste("Shared Chromos with ", subject))