This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:

  1. Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
  2. Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
  3. Looks at correlation between SNPS in the golden vcf and worflow vcf For viral variant callers!
#knitr::opts_knit$set(root.dir = "C:\\Users\\mk5636\\Desktop\\mad-r\\")

#af_report = "/scratch/mk5636/soggy_wright_af_data.csv"
#golden_vcf = "/scratch/mk5636/MAD-1_AF_random.vcf"

Loading Libraries

library('ggplot2')
library('plyr')
library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  3.0.4     ✔ purrr   0.3.3
## ✔ tidyr   1.0.0     ✔ dplyr   1.0.2
## ✔ readr   1.3.1     ✔ stringr 1.4.0
## ✔ tibble  3.0.4     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library('ggpubr')
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
library("MLmetrics")
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
MyColors = c("#E76F51", "#E9C369", "#2A9D8F","#2B4FA2", "#119DAC", "#672D7B", "#262366", "#BCBCBC")
ggplot2::theme_set(theme_minimal())

totalbases = 29903
detach(package:plyr, unload=TRUE)
library("plotrix") 

Making Transition/Transversion Function

TsTv = function(dataframe) {
  transitions = 0
  for (row in 1:nrow(dataframe)) {
  
   REF <- dataframe[row, "ref"] #requires dataframe to have column name "ref"
   ALT <- dataframe[row, "alt"] # requires dataframe to have column name "alt"

    if(REF == "A" & ALT == "G")
      transitions = transitions + 1
    if(REF == "G" & ALT == "A")
      transitions = transitions + 1
    if(REF == "C" & ALT == "T")
      transitions = transitions + 1
    if(REF == "T" & ALT == "C")
      transitions = transitions + 1
  }

  transversions = 0
  for (row in 1:nrow(dataframe)) {
  
    REF <- dataframe[row, "ref"]
    ALT <- dataframe[row, "alt"]
  
    if(REF == "A" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "A")
      transversions = transversions + 1
    if(REF == "C" & ALT == "A")
      transversions = transversions + 1
    if(REF == "A" & ALT == "C")
      transversions = transversions + 1
    if(REF == "G" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "G")
      transversions = transversions + 1
    if(REF == "C" & ALT == "G")
      transversions = transversions + 1
    if(REF == "G" & ALT == "C")
      transversions = transversions + 1
  }

  TsTv_df = data.frame(transitions,transversions)
  print(TsTv_df)
  TsTv_ratio = (transitions/transversions)
  print(TsTv_ratio)
  
}

Reading in Variant Data and Evaluating

AV = read.csv(af_report,header = T)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_", 
               into = c("pipe","AF","allele_freq","frac","seq_depth","tool","parameter")) %>%
  select(-c(pipe, AF, frac)) %>% 
  filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>% 
  filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
## Warning: Expected 7 pieces. Additional pieces discarded in 13052 rows [1929,
## 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
## 1943, 1944, 1945, 1946, 1947, 1948, ...].
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 74054 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * 100000)


AV_ivar = filter(AV, tool == "ivar") %>% droplevels()
AV_lofreq = filter(AV, tool == "lofreq") %>% droplevels()
AV_varscan_unfiltered = filter(AV, tool == "varscan" & parameter == "unfiltered") %>% droplevels()
#AV_varscan_unfiltered$tool = paste0(AV_varscan_unfiltered$tool,"_",AV_varscan_unfiltered$parameter)
AV_varscan_filtered = filter(AV, tool == "varscan"& parameter == "filtered") %>% droplevels()
#AV_varscan_filtered$tool = paste0(AV_varscan_filtered$tool,"_",AV_varscan_filtered$parameter)
AV_tims = filter(AV, tool == "tims") %>% droplevels()
AV_freebayes = filter(AV, tool == "freebayes") %>% droplevels()
AV_mutect = filter(AV, tool == "mutect2") %>% droplevels()
AV_haplotypecaller = filter(AV, tool == "filtered") %>% droplevels()
AV_haplotypecaller$tool = "haplotypecaller"

AV = rbind(AV_ivar,AV_varscan_filtered,AV_tims,
           AV_freebayes,AV_mutect,AV_haplotypecaller,AV_lofreq)
AV$parameter = NULL
head(AV)
##   allele_freq seq_depth tool  pos af_golden af_workflow ref alt dp coverage
## 1        0.03     1e-05 ivar  351      0.03           0   G   A  1        1
## 2        0.03     1e-05 ivar  581      0.03           0   G   A  1        1
## 3        0.03     1e-05 ivar 1002      0.03           0   A   G  1        1
## 4        0.03     1e-05 ivar 1170      0.03           0   C   T  1        1
## 5        0.03     1e-05 ivar 1180      0.03           0   A   T  1        1
## 6        0.03     1e-05 ivar 1208      0.03           0   A   G  1        1
Dp_Variation = ggplot(AV, aes(x = log10(coverage), y = log10(dp), color = tool)) +
  geom_point() + 
  facet_wrap(~tool, scales = "free_y") +
  geom_abline() +
  scale_color_manual(values = MyColors)
## Warning: namespace 'plyr' is not available and has been replaced
## by .GlobalEnv when processing object ''
print(Dp_Variation)
## Warning: Removed 7991 rows containing missing values (geom_point).

ggsave(Dp_Variation, file = "AV_Dp_Coverage.png")
## Saving 7 x 5 in image
## Warning: Removed 7991 rows containing missing values (geom_point).

Reading in Golden Data and Evaluating

viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c("label","AF"))
viral_golden$label = NULL
head(viral_golden)
##       chrom  pos ID ref alt qual filter   AF          codes
## 1 SARS-CoV2  351  .   G   A    .      . 0.04 GT:AD:DP:GQ:PL
## 2 SARS-CoV2  581  .   G   A    .      .  0.4 GT:AD:DP:GQ:PL
## 3 SARS-CoV2 1002  .   A   G    .      . 0.46 GT:AD:DP:GQ:PL
## 4 SARS-CoV2 1170  .   C   T    .      . 0.18 GT:AD:DP:GQ:PL
## 5 SARS-CoV2 1180  .   A   T    .      . 0.73 GT:AD:DP:GQ:PL
## 6 SARS-CoV2 1208  .   A   G    .      . 0.18 GT:AD:DP:GQ:PL
##                                                                                                                                                          genotype
## 1                                                                                                                                            1/1/1/1:96,4:100:1:1
## 2                                                                   1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:60,40:100:1:1
## 3                                                       1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:54,46:100:1:1
## 4                                                                                                               1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:82,18:100:1:1
## 5 1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:27,73:100:1:1
## 6                                                                                                               1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:82,18:100:1:1
dim(viral_golden)
## [1] 161  10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
  geom_point()
print(viral_golden_pos)

TsTv(viral_golden)
##   transitions transversions
## 1         114            47
## [1] 2.425532

SetAF - Separating Data into TP, FP, FN

AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()

AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
  AV_setAF_truepos$category = c("TP")
  
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)

FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(TP_All) = c("allele_freq","coverage","tool","TP")
    
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
##   allele_freq coverage            tool FP  FN TP
## 1        0.01        1       freebayes NA 161 NA
## 2        0.01        1 haplotypecaller NA 161 NA
## 3        0.01        1            ivar NA 161 NA
## 4        0.01        1          lofreq NA 161 NA
## 5        0.01        1         mutect2  1 161 NA
## 6        0.01        1            tims  2 161 NA
AllVar[is.na(AllVar)] = 0
head(AllVar)
##   allele_freq coverage            tool FP  FN TP
## 1        0.01        1       freebayes  0 161  0
## 2        0.01        1 haplotypecaller  0 161  0
## 3        0.01        1            ivar  0 161  0
## 4        0.01        1          lofreq  0 161  0
## 5        0.01        1         mutect2  1 161  0
## 6        0.01        1            tims  2 161  0
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
##   allele_freq coverage            tool FP  FN TP prec recall area
## 1        0.01        1       freebayes  0 161  0  NaN      0  NaN
## 2        0.01        1 haplotypecaller  0 161  0  NaN      0  NaN
## 3        0.01        1            ivar  0 161  0  NaN      0  NaN
## 4        0.01        1          lofreq  0 161  0  NaN      0  NaN
## 5        0.01        1         mutect2  1 161  0    0      0    0
## 6        0.01        1            tims  2 161  0    0      0    0

RandomAF - Separating Data into TP, FP, FN

AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()

AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  if (NROW(AV_randomAF_falsepos) > 0){
    AV_randomAF_falsepos$category = c("FP")
  }
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  if (NROW(AV_randomAF_falseneg) > 0){
    AV_randomAF_falseneg$category = c("FN")
  }
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
  if (NROW(AV_randomAF_truepos) > 0){
    AV_randomAF_truepos$category = c("TP")
  }
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)

rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(rTP_All) = c("allele_freq","coverage","tool","TP")

rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
##   allele_freq coverage            tool FP  FN TP
## 1      random        1       freebayes NA 154  7
## 2      random        1 haplotypecaller NA 154  7
## 3      random        1            ivar NA 161 NA
## 4      random        1          lofreq NA 161 NA
## 5      random        1         mutect2 NA 157  4
## 6      random        1            tims  4 136 25
rAllVar[is.na(rAllVar)] = 0

rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
##   allele_freq coverage            tool FP  FN TP     prec     recall       area
## 1      random        1       freebayes  0 154  7 1.000000 0.04347826 0.04347826
## 2      random        1 haplotypecaller  0 154  7 1.000000 0.04347826 0.04347826
## 3      random        1            ivar  0 161  0      NaN 0.00000000        NaN
## 4      random        1          lofreq  0 161  0      NaN 0.00000000        NaN
## 5      random        1         mutect2  0 157  4 1.000000 0.02484472 0.02484472
## 6      random        1            tims  4 136 25 0.862069 0.15527950 0.13386164

1. Precision_Recall Curves

PR_AllVarSet_freq = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color = tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~allele_freq) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_freq)
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_path).

ggsave("PR_AllVarSet_byAF.png",PR_AllVarSet_freq)
## Saving 7 x 5 in image
## Warning: Removed 38 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).
PR_AllVarSet_cover = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~coverage) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_cover)
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing missing values (geom_path).

ggsave("PR_AllVarSet_byCov.png",PR_AllVarSet_cover)
## Saving 7 x 5 in image
## Warning: Removed 38 rows containing missing values (geom_point).

## Warning: Removed 32 rows containing missing values (geom_path).
PR_AllVarRandom = ggplot(rAllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  scale_color_manual(values = MyColors)
print(PR_AllVarRandom)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_path).

ggsave("PR_AllVarRandom.png",PR_AllVarRandom)
## Saving 7 x 5 in image
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_path).
# Examining Data

AllVar_PR_zeros = filter(AllVar, prec == 0 & recall == 0)
AllVar_PR_high = filter(AllVar, prec > 0.9 & recall > 0.9)

AllVar_PR_highest = group_by(AllVar, tool, allele_freq, coverage) %>%
  arrange(area, decreasing = TRUE)
write.csv(AllVar_PR_highest, file = "AllVar_PR_Thresholds.csv")

2.1 SetAF - Proportion of Variants Found

#AV_setAF_counts = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>% tally()
#AV_setAF_counts$proportion = (AV_setAF_counts$n / nrow(viral_golden))

#AV_setAF_proportion = ggplot(AV_setAF_counts,aes(x = log10(coverage), y = proportion,
#                                                group = tool, color = tool)) +
#  geom_point() +
#  geom_line() +
#  facet_grid(~allele_freq) +
#  scale_color_manual(values = MyColors) #+
#  #theme(axis.text = element_text(size = 8),
#        #axis.title = element_text(size = 30),
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40))
#print(AV_setAF_proportion)
#ggsave("AllVarSet_Proportion.png", AV_setAF_proportion)

## Only looking at coverage between 100-1000
#AV_setAF_truepos_f = filter(AV_setAF_truepos, coverage > 99 & coverage < 1001)
#AV_setAF_counts_f = group_by(AV_setAF_truepos_f, tool, allele_freq, coverage) %>% tally()
#AV_setAF_counts_f$proportion = (AV_setAF_counts_f$n / nrow(viral_golden))
 
#AV_setAF_proportion_f = ggplot(AV_setAF_counts_f,aes(x = log10(coverage), y = proportion, group = tool, color = tool)) +
#  geom_point() +
#  geom_line() +
#  facet_grid(~allele_freq) +
#  scale_color_manual(values = MyColors) +
#  #scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000), 
#                   #labels = c("100","200","300","400","500","600","700","800","900","1000"))  +
#  theme(axis.text.x = element_text(angle = 90))
#        #axis.text = element_text(size = 8),
#        #axis.title = element_text(size = 30),
#        #strip.text = element_text(size = 15),
#print(AV_setAF_proportion_f)
#ggsave("AllVarSet_Proportion_Filtered.png", AV_setAF_proportion_f)

2.2 RandomAF - Proportion of Variants Found

AV_randomAF_counts = group_by(AV_randomAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts$proportion = (AV_randomAF_counts$n / nrow(viral_golden))

AV_randomAF_proportion = ggplot(AV_randomAF_counts, aes(x = log10(coverage), y = proportion, 
                                                        group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors) #+
  #theme(axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion)

ggsave("AllVarRandom_Proportion.png", AV_randomAF_proportion)
## Saving 7 x 5 in image
## Only looking at coverage between 100-1000
AV_randomAF_truepos_f = filter(AV_randomAF_truepos, coverage > 99 & coverage < 1001)
AV_randomAF_counts_f = group_by(AV_randomAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts_f$proportion = (AV_randomAF_counts_f$n / nrow(viral_golden))
 
AV_randomAF_proportion_f = ggplot(AV_randomAF_counts_f, aes(x = log10(coverage), y = proportion, 
                                                             group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors) #+
  #scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000), 
                   #labels = c("100","200","300","400","500","600","700","800","900","1000"))  +
  #theme(axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion_f)

ggsave("AllVarRandom_Proportion_Filtered.png", AV_randomAF_proportion_f)
## Saving 7 x 5 in image

3.1 SetAF - Correlation Between Golden and Workflow

## Only looking at true positives
#AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
#                                                color = tool, shape = category)) +
#  geom_point() +
#  scale_color_manual(values = MyColors) +
#  geom_boxplot(aes(group = allele_freq)) + 
#  facet_grid(tool~coverage) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_setAF)
#ggsave("AllVarSet_Correlation.png",AV_Corr_setAF)

##Looking at all SetAF Data
#AV_setAF_f = filter(AV_setAF, coverage > 99 & coverage < 1001)

#AV_Corr_setAF = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow, 
#                                       color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_setAF)
#ggsave("AllVarSet_Correlation_Filtered.png",AV_Corr_setAF)

#AV_Corr_setAF_Zoom = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow,
#                                            color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.01,0.15) + xlim(-0.01,0.15)
#print(AV_Corr_setAF_Zoom)
#ggsave("AllVarSet_CorrelationF_Filtered_Zoom.png", AV_Corr_setAF_Zoom)

3.2 RandomAF - Correlation Between Golden and Workflow

## Only looking at true positives
#AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
#                                                      color = tool, shape = category)) +
#  geom_point() +
#  scale_color_manual(values = MyColors) +
#  facet_grid(tool~coverage) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation.png",AV_Corr_randomAF)

##Looking at all RandomAF Data
#AV_randomAF_f = filter(AV_randomAF, coverage > 99 & coverage < 1001)

#AV_Corr_randomAF = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow, 
#                                             color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.01,1) + xlim(-0.01,1)
#print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF)

#AV_Corr_randomAF_Zoom = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow,
#                                                  color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.01,0.15) + xlim(-0.01,0.15)
#print(AV_Corr_randomAF_Zoom)
#ggsave("AllVarRandom_Correlation_Filtered_Zoom.png", AV_Corr_randomAF_Zoom)

Stats for SetAF Data

coeff_var <- function(x) {
  CV <- sd(x) / mean(x) * 100
  return(CV)
}

AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
## `summarise()` regrouping output by 'tool', 'allele_freq' (override with `.groups` argument)
colnames(AV_setAF_split) = c("tool","AF","cov","mean","sd","variance")

Mean_setAF = ggplot(AV_setAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)
## Registered S3 method overwritten by 'plyr':
##   method   from       
##   [.quoted R_GlobalEnv

ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF)
## Saving 7 x 5 in image
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
## Warning: Removed 15 rows containing missing values (geom_point).

ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF)
## Saving 7 x 5 in image
## Warning: Removed 15 rows containing missing values (geom_point).

Set Up for Stats for RandomAF Data

AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
## `summarise()` regrouping output by 'tool' (override with `.groups` argument)
colnames(AV_randomAF_split) = c("tool","cov","mean","sd","variance")

Mean_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point()
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## List of 2
##  $ text       :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 15
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
print(Mean_randomAF)

ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF)
## Saving 7 x 5 in image
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)

ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF)
## Saving 7 x 5 in image

4. Look at how each tool compares at each AF and seq-depth

#AVSet_Corr = ggplot(AV_setAF_truepos, aes(x = tool, y = af_workflow)) + 
#  geom_point() +
#  geom_boxplot() +
#  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
#  facet_grid(allele_freq~coverage, scales = "free_y")
#print(AVSet_Corr)
#ggsave("AllVarSet_Tool_Comparison.pdf", AVSet_Corr, width = 15, height = 20)
# y axes are set per row at the moment (scales = "free_y")

#AVRandom_Corr = ggplot(AV_randomAF_truepos, aes(x = tool, y = af_workflow)) + 
#  geom_point() +
#  geom_violin() +
#  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
#  facet_grid(allele_freq~coverage)
#print(AVRandom_Corr)
#ggsave("AllVarRandom_Tool_Comparison.pdf", AVRandom_Corr, width = 15, height = 20)

5.1. Looks at where the SNPS are located throughout the genome

# Set AF Data
AV_Pos = ggplot(AV_setAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos)

AV_Pos_falsepos = ggplot(AV_setAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos)

AV_Pos_falseneg = ggplot(AV_setAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg)

# Random AF Data
AV_Pos_r = ggplot(AV_randomAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_r)

AV_Pos_falsepos_r = ggplot(AV_randomAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos_r)

AV_Pos_falseneg_r = ggplot(AV_randomAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg_r)

5.2.Transitions/Transversions

#setAF_TP_pos = ggplot(AV_setAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
#  geom_point()
#print(setAF_TP_pos)

#TsTv(AV_setAF_truepos)
#counts variants from all tools/seq_depth/AF trials

#randomAF_TP_pos = ggplot(AV_randomAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
#  geom_point()
#print(randomAF_TP_pos)

#TsTv(AV_randomAF_truepos)