This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:

  1. Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
  2. Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
  3. Looks at correlation between SNPS in the golden vcf and worflow vcf For viral variant callers!

Loading Libraries

library('ggplot2')
library('plyr')
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## v purrr   0.3.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange()   masks plyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x dplyr::count()     masks plyr::count()
## x dplyr::failwith()  masks plyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x dplyr::id()        masks plyr::id()
## x dplyr::lag()       masks stats::lag()
## x dplyr::mutate()    masks plyr::mutate()
## x dplyr::rename()    masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library('ggpubr')
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
library("MLmetrics")
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
MyColors = c("#E76F51", "#E9C369", "#2A9D8F","#2B4FA2", "#119DAC", "#672D7B", "#262366", "#BCBCBC")
ggplot2::theme_set(theme_minimal())
detach(package:plyr, unload=TRUE)
library("plotrix") 

Making Transition/Transversion Function

TsTv = function(dataframe) {
  transitions = 0
  for (row in 1:nrow(dataframe)) {
  
   REF <- dataframe[row, "ref"] #requires dataframe to have column name "ref"
   ALT <- dataframe[row, "alt"] # requires dataframe to have column name "alt"

    if(REF == "A" & ALT == "G")
      transitions = transitions + 1
    if(REF == "G" & ALT == "A")
      transitions = transitions + 1
    if(REF == "C" & ALT == "T")
      transitions = transitions + 1
    if(REF == "T" & ALT == "C")
      transitions = transitions + 1
  }

  transversions = 0
  for (row in 1:nrow(dataframe)) {
  
    REF <- dataframe[row, "ref"]
    ALT <- dataframe[row, "alt"]
  
    if(REF == "A" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "A")
      transversions = transversions + 1
    if(REF == "C" & ALT == "A")
      transversions = transversions + 1
    if(REF == "A" & ALT == "C")
      transversions = transversions + 1
    if(REF == "G" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "G")
      transversions = transversions + 1
    if(REF == "C" & ALT == "G")
      transversions = transversions + 1
    if(REF == "G" & ALT == "C")
      transversions = transversions + 1
  }

  TsTv_df = data.frame(transitions,transversions)
  print(TsTv_df)
  TsTv_ratio = (transitions/transversions)
  print(TsTv_ratio)
  
}

Reading in Variant Data and Evaluating

AV = read.csv(af_report,header = T)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_", 
               into = c("pipe","AF","allele_freq","frac","seq_depth","tool","parameter")) %>%
  select(-c(pipe, AF, frac)) %>% 
  filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>% 
  filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
## Warning: Expected 7 pieces. Additional pieces discarded in 1788 rows [2863,
## 2864, 2865, 2866, 2867, 2868, 2869, 2870, 2871, 2872, 2873, 2874, 2875, 2876,
## 2877, 2878, 2879, 2880, 2881, 2882, ...].
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 9992 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * 100000)


AV_ivar = filter(AV, tool == "ivar") %>% droplevels()
AV_lofreq = filter(AV, tool == "lofreq") %>% droplevels()
AV_varscan_unfiltered = filter(AV, tool == "varscan" & parameter == "unfiltered") %>% droplevels()
#AV_varscan_unfiltered$tool = paste0(AV_varscan_unfiltered$tool,"_",AV_varscan_unfiltered$parameter)
AV_varscan_filtered = filter(AV, tool == "varscan"& parameter == "filtered") %>% droplevels()
#AV_varscan_filtered$tool = paste0(AV_varscan_filtered$tool,"_",AV_varscan_filtered$parameter)
AV_tims = filter(AV, tool == "tims") %>% droplevels()
AV_freebayes = filter(AV, tool == "freebayes") %>% droplevels()
AV_mutect = filter(AV, tool == "mutect2") %>% droplevels()
AV_haplotypecaller = filter(AV, tool == "filtered") %>% droplevels()
AV_haplotypecaller$tool = "haplotypecaller"

AV = rbind(AV_ivar,AV_varscan_filtered,AV_tims,
           AV_freebayes,AV_mutect,AV_haplotypecaller,AV_lofreq)
AV$parameter = NULL
head(AV)
##   allele_freq seq_depth tool   pos af_golden af_workflow ref alt    dp coverage
## 1        0.03         1 ivar 29884      0.03        0.00   A   G 10000    1e+05
## 2        0.03         1 ivar   595      0.03        0.03   T   A 17571    1e+05
## 3        0.03         1 ivar   967      0.03        0.03   T   C 11717    1e+05
## 4        0.03         1 ivar  1304      0.03        0.03   G   T 10177    1e+05
## 5        0.03         1 ivar  1460      0.03        0.03   A   G 10149    1e+05
## 6        0.03         1 ivar  1544      0.03        0.03   G   T 10210    1e+05
Dp_Variation = ggplot(AV, aes(x = log10(coverage), y = log10(dp), color = tool)) +
  geom_point() + 
  facet_wrap(~tool, scales = "free_y") +
  geom_abline() +
  scale_color_manual(values = MyColors)
## Warning: namespace 'plyr' is not available and has been replaced
## by .GlobalEnv when processing object ''
print(Dp_Variation)
## Warning: Removed 1541 rows containing missing values (geom_point).

ggsave(Dp_Variation, file = "AV_Dp_Coverage.png")
## Saving 7 x 5 in image
## Warning: Removed 1541 rows containing missing values (geom_point).

Reading in Golden Data and Evaluating

viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c("label","AF"))
viral_golden$label = NULL
head(viral_golden)
##       chrom  pos ID ref alt qual filter  AF          codes
## 1 SARS-CoV2  595  .   T   A    .      . 0.1 GT:AD:DP:GQ:PL
## 2 SARS-CoV2  967  .   T   C    .      . 0.1 GT:AD:DP:GQ:PL
## 3 SARS-CoV2 1304  .   G   T    .      . 0.1 GT:AD:DP:GQ:PL
## 4 SARS-CoV2 1460  .   A   G    .      . 0.1 GT:AD:DP:GQ:PL
## 5 SARS-CoV2 1544  .   G   T    .      . 0.1 GT:AD:DP:GQ:PL
## 6 SARS-CoV2 1619  .   G   C    .      . 0.1 GT:AD:DP:GQ:PL
##                            genotype
## 1 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 2 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 3 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 4 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 5 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 6 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
dim(viral_golden)
## [1] 149  10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
  geom_point()
print(viral_golden_pos)

TsTv(viral_golden)
##   transitions transversions
## 1         111            38
## [1] 2.921053

SetAF - Separating Data into TP, FP, FN

AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()

AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
  AV_setAF_truepos$category = c("TP")
  
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)

FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(TP_All) = c("allele_freq","coverage","tool","TP")
    
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
##   allele_freq coverage            tool FP  FN TP
## 1        0.02     1000       freebayes  9  65 84
## 2        0.02     1000 haplotypecaller NA  69 80
## 3        0.02     1000            ivar NA 146  3
## 4        0.02     1000          lofreq NA 120 29
## 5        0.02     1000         mutect2 NA 108 41
## 6        0.02     1000            tims NA  90 59
AllVar[is.na(AllVar)] = 0
head(AllVar)
##   allele_freq coverage            tool FP  FN TP
## 1        0.02     1000       freebayes  9  65 84
## 2        0.02     1000 haplotypecaller  0  69 80
## 3        0.02     1000            ivar  0 146  3
## 4        0.02     1000          lofreq  0 120 29
## 5        0.02     1000         mutect2  0 108 41
## 6        0.02     1000            tims  0  90 59
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
##   allele_freq coverage            tool FP  FN TP      prec     recall
## 1        0.02     1000       freebayes  9  65 84 0.9032258 0.56375839
## 2        0.02     1000 haplotypecaller  0  69 80 1.0000000 0.53691275
## 3        0.02     1000            ivar  0 146  3 1.0000000 0.02013423
## 4        0.02     1000          lofreq  0 120 29 1.0000000 0.19463087
## 5        0.02     1000         mutect2  0 108 41 1.0000000 0.27516779
## 6        0.02     1000            tims  0  90 59 1.0000000 0.39597315
##         area
## 1 0.50920113
## 2 0.53691275
## 3 0.02013423
## 4 0.19463087
## 5 0.27516779
## 6 0.39597315

RandomAF - Separating Data into TP, FP, FN

AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()

AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  if (NROW(AV_randomAF_falsepos) > 0){
    AV_randomAF_falsepos$category = c("FP")
  }
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  if (NROW(AV_randomAF_falseneg) > 0){
    AV_randomAF_falseneg$category = c("FN")
  }
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
  if (NROW(AV_randomAF_truepos) > 0){
    AV_randomAF_truepos$category = c("TP")
  }
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)

rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(rTP_All) = c("allele_freq","coverage","tool","TP")

rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
##   allele_freq coverage            tool FP FN  TP
## 1      random     1000       freebayes 14 10 139
## 2      random     1000 haplotypecaller NA  6 143
## 3      random     1000            ivar NA 19 130
## 4      random     1000          lofreq NA 11 138
## 5      random     1000         mutect2 NA 46 103
## 6      random     1000            tims NA 17 132
rAllVar[is.na(rAllVar)] = 0

rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
##   allele_freq coverage            tool FP FN  TP      prec    recall      area
## 1      random     1000       freebayes 14 10 139 0.9084967 0.9328859 0.8475238
## 2      random     1000 haplotypecaller  0  6 143 1.0000000 0.9597315 0.9597315
## 3      random     1000            ivar  0 19 130 1.0000000 0.8724832 0.8724832
## 4      random     1000          lofreq  0 11 138 1.0000000 0.9261745 0.9261745
## 5      random     1000         mutect2  0 46 103 1.0000000 0.6912752 0.6912752
## 6      random     1000            tims  0 17 132 1.0000000 0.8859060 0.8859060

1. Precision_Recall Curves

PR_AllVarSet_freq = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color = tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~allele_freq) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_freq)

ggsave("PR_AllVarSet_byAF.png",PR_AllVarSet_freq)
## Saving 7 x 5 in image
PR_AllVarSet_cover = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~coverage) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_cover)

ggsave("PR_AllVarSet_byCov.png",PR_AllVarSet_cover)
## Saving 7 x 5 in image
PR_AllVarRandom = ggplot(rAllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  scale_color_manual(values = MyColors)
print(PR_AllVarRandom)

ggsave("PR_AllVarRandom.png",PR_AllVarRandom)
## Saving 7 x 5 in image
# Examining Data

AllVar_PR_zeros = filter(AllVar, prec == 0 & recall == 0)
AllVar_PR_high = filter(AllVar, prec > 0.9 & recall > 0.9)

AllVar_PR_highest = group_by(AllVar, tool, allele_freq, coverage) %>%
  arrange(area, decreasing = TRUE)
write.csv(AllVar_PR_highest, file = "AllVar_PR_Thresholds.csv")

2.1 SetAF - Proportion of Variants Found

#AV_setAF_counts = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>% tally()
#AV_setAF_counts$proportion = (AV_setAF_counts$n / nrow(viral_golden))

#AV_setAF_proportion = ggplot(AV_setAF_counts,aes(x = log10(coverage), y = proportion,
#                                                group = tool, color = tool)) +
#  geom_point() +
#  geom_line() +
#  facet_grid(~allele_freq) +
#  scale_color_manual(values = MyColors) #+
#  #theme(axis.text = element_text(size = 8),
#        #axis.title = element_text(size = 30),
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40))
#print(AV_setAF_proportion)
#ggsave("AllVarSet_Proportion.png", AV_setAF_proportion)

## Only looking at coverage between 100-1000
#AV_setAF_truepos_f = filter(AV_setAF_truepos, coverage > 99 & coverage < 1001)
#AV_setAF_counts_f = group_by(AV_setAF_truepos_f, tool, allele_freq, coverage) %>% tally()
#AV_setAF_counts_f$proportion = (AV_setAF_counts_f$n / nrow(viral_golden))
 
#AV_setAF_proportion_f = ggplot(AV_setAF_counts_f,aes(x = log10(coverage), y = proportion, group = tool, color = tool)) +
#  geom_point() +
#  geom_line() +
#  facet_grid(~allele_freq) +
#  scale_color_manual(values = MyColors) +
#  #scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000), 
#                   #labels = c("100","200","300","400","500","600","700","800","900","1000"))  +
#  theme(axis.text.x = element_text(angle = 90))
#        #axis.text = element_text(size = 8),
#        #axis.title = element_text(size = 30),
#        #strip.text = element_text(size = 15),
#print(AV_setAF_proportion_f)
#ggsave("AllVarSet_Proportion_Filtered.png", AV_setAF_proportion_f)

2.2 RandomAF - Proportion of Variants Found

AV_randomAF_counts = group_by(AV_randomAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts$proportion = (AV_randomAF_counts$n / nrow(viral_golden))

AV_randomAF_proportion = ggplot(AV_randomAF_counts, aes(x = log10(coverage), y = proportion, 
                                                        group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors) #+
  #theme(axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion)

ggsave("AllVarRandom_Proportion.png", AV_randomAF_proportion)
## Saving 7 x 5 in image
## Only looking at coverage between 100-1000
AV_randomAF_truepos_f = filter(AV_randomAF_truepos, coverage > 99 & coverage < 1001)
AV_randomAF_counts_f = group_by(AV_randomAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts_f$proportion = (AV_randomAF_counts_f$n / nrow(viral_golden))
 
AV_randomAF_proportion_f = ggplot(AV_randomAF_counts_f, aes(x = log10(coverage), y = proportion, 
                                                             group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors) #+
  #scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000), 
                   #labels = c("100","200","300","400","500","600","700","800","900","1000"))  +
  #theme(axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion_f)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

ggsave("AllVarRandom_Proportion_Filtered.png", AV_randomAF_proportion_f)
## Saving 7 x 5 in image
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

3.1 SetAF - Correlation Between Golden and Workflow

## Only looking at true positives
#AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
#                                                color = tool, shape = category)) +
#  geom_point() +
#  scale_color_manual(values = MyColors) +
#  geom_boxplot(aes(group = allele_freq)) + 
#  facet_grid(tool~coverage) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_setAF)
#ggsave("AllVarSet_Correlation.png",AV_Corr_setAF)

##Looking at all SetAF Data
#AV_setAF_f = filter(AV_setAF, coverage > 99 & coverage < 1001)

#AV_Corr_setAF = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow, 
#                                       color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_setAF)
#ggsave("AllVarSet_Correlation_Filtered.png",AV_Corr_setAF)

#AV_Corr_setAF_Zoom = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow,
#                                            color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.01,0.15) + xlim(-0.01,0.15)
#print(AV_Corr_setAF_Zoom)
#ggsave("AllVarSet_CorrelationF_Filtered_Zoom.png", AV_Corr_setAF_Zoom)

3.2 RandomAF - Correlation Between Golden and Workflow

## Only looking at true positives
#AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
#                                                      color = tool, shape = category)) +
#  geom_point() +
#  scale_color_manual(values = MyColors) +
#  facet_grid(tool~coverage) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation.png",AV_Corr_randomAF)

##Looking at all RandomAF Data
#AV_randomAF_f = filter(AV_randomAF, coverage > 99 & coverage < 1001)

#AV_Corr_randomAF = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow, 
#                                             color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.01,1) + xlim(-0.01,1)
#print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF)

#AV_Corr_randomAF_Zoom = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow,
#                                                  color = tool, group = tool, shape = category)) +
#  geom_point(size = 3, alpha = 0.4) +
#  scale_color_manual(values = MyColors) +
#  facet_grid(~coverage) +
#  geom_smooth(method='lm', size = 1.5) +
#  #theme(axis.text = element_text(size = 8), 
#        #axis.title = element_text(size = 20), 
#        #strip.text = element_text(size = 15),
#        #axis.text.x = element_text(angle = 40)) +
#  ylim(-0.01,0.15) + xlim(-0.01,0.15)
#print(AV_Corr_randomAF_Zoom)
#ggsave("AllVarRandom_Correlation_Filtered_Zoom.png", AV_Corr_randomAF_Zoom)

Stats for SetAF Data

coeff_var <- function(x) {
  CV <- sd(x) / mean(x) * 100
  return(CV)
}

AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
## `summarise()` regrouping output by 'tool', 'allele_freq' (override with `.groups` argument)
colnames(AV_setAF_split) = c("tool","AF","cov","mean","sd","variance")

Mean_setAF = ggplot(AV_setAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)

ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF)
## Saving 7 x 5 in image
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)

ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF)
## Saving 7 x 5 in image

Set Up for Stats for RandomAF Data

AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
## `summarise()` regrouping output by 'tool' (override with `.groups` argument)
colnames(AV_randomAF_split) = c("tool","cov","mean","sd","variance")

Mean_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point()
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## List of 2
##  $ text       :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 15
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
print(Mean_randomAF)

ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF)
## Saving 7 x 5 in image
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)

ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF)
## Saving 7 x 5 in image

4. Look at how each tool compares at each AF and seq-depth

#AVSet_Corr = ggplot(AV_setAF_truepos, aes(x = tool, y = af_workflow)) + 
#  geom_point() +
#  geom_boxplot() +
#  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
#  facet_grid(allele_freq~coverage, scales = "free_y")
#print(AVSet_Corr)
#ggsave("AllVarSet_Tool_Comparison.pdf", AVSet_Corr, width = 15, height = 20)
# y axes are set per row at the moment (scales = "free_y")

#AVRandom_Corr = ggplot(AV_randomAF_truepos, aes(x = tool, y = af_workflow)) + 
#  geom_point() +
#  geom_violin() +
#  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
#  facet_grid(allele_freq~coverage)
#print(AVRandom_Corr)
#ggsave("AllVarRandom_Tool_Comparison.pdf", AVRandom_Corr, width = 15, height = 20)

5.1. Looks at where the SNPS are located throughout the genome

# Set AF Data
AV_Pos = ggplot(AV_setAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos)

AV_Pos_falsepos = ggplot(AV_setAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos)

AV_Pos_falseneg = ggplot(AV_setAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg)

# Random AF Data
AV_Pos_r = ggplot(AV_randomAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_r)

AV_Pos_falsepos_r = ggplot(AV_randomAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos_r)

AV_Pos_falseneg_r = ggplot(AV_randomAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg_r)

5.2.Transitions/Transversions

#setAF_TP_pos = ggplot(AV_setAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
#  geom_point()
#print(setAF_TP_pos)

#TsTv(AV_setAF_truepos)
#counts variants from all tools/seq_depth/AF trials

#randomAF_TP_pos = ggplot(AV_randomAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
#  geom_point()
#print(randomAF_TP_pos)

#TsTv(AV_randomAF_truepos)