This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:

  1. Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
  2. Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
  3. Looks at correlation between SNPS in the golden vcf and worflow vcf For viral variant callers!
print(TEST)
## [1] "hello"
#knitr::opts_knit$set(root.dir = "C:\\Users\\mk5636\\Desktop\\mad-r\\")

af_report = "/scratch/mk5636/soggy_wright_af_data.csv"
golden_vcf = "/scratch/mk5636/MAD-1_AF_random.vcf"

Loading Libraries

library('ggplot2')
library('plyr')
library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.1.3     ✔ purrr   0.3.3
## ✔ tidyr   1.0.0     ✔ dplyr   0.8.3
## ✔ readr   1.3.1     ✔ stringr 1.4.0
## ✔ tibble  2.1.3     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library('ggpubr')
## Error in library("ggpubr"): there is no package called 'ggpubr'
library("MLmetrics")
## Error in library("MLmetrics"): there is no package called 'MLmetrics'
MyColors = c("#E76F51", "#E9C369", "#2A9D8F","#2B4FA2", "#119DAC", "#672D7B", "#262366", "#BCBCBC")
ggplot2::theme_set(theme_minimal())

totalbases = 29903
detach(package:plyr, unload=TRUE)
library("plotrix") 
## Error in library("plotrix"): there is no package called 'plotrix'

Making Transition/Transversion Function

TsTv = function(dataframe) {
  transitions = 0
  for (row in 1:nrow(dataframe)) {
  
   REF <- dataframe[row, "ref"] #requires dataframe to have column name "ref"
   ALT <- dataframe[row, "alt"] # requires dataframe to have column name "alt"

    if(REF == "A" & ALT == "G")
      transitions = transitions + 1
    if(REF == "G" & ALT == "A")
      transitions = transitions + 1
    if(REF == "C" & ALT == "T")
      transitions = transitions + 1
    if(REF == "T" & ALT == "C")
      transitions = transitions + 1
  }

  transversions = 0
  for (row in 1:nrow(dataframe)) {
  
    REF <- dataframe[row, "ref"]
    ALT <- dataframe[row, "alt"]
  
    if(REF == "A" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "A")
      transversions = transversions + 1
    if(REF == "C" & ALT == "A")
      transversions = transversions + 1
    if(REF == "A" & ALT == "C")
      transversions = transversions + 1
    if(REF == "G" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "G")
      transversions = transversions + 1
    if(REF == "C" & ALT == "G")
      transversions = transversions + 1
    if(REF == "G" & ALT == "C")
      transversions = transversions + 1
  }

  TsTv_df = data.frame(transitions,transversions)
  print(TsTv_df)
  TsTv_ratio = (transitions/transversions)
  print(TsTv_ratio)
  
}

Reading in Variant Data and Evaluating

AV = read.csv(af_report,header = T)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_", 
               into = c("pipe","AF","allele_freq","frac","seq_depth","tool","parameter")) %>%
  select(-c(pipe, AF, frac)) %>% 
  filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>% 
  filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
## Warning: Expected 7 pieces. Additional pieces discarded in 13052 rows [1929,
## 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
## 1943, 1944, 1945, 1946, 1947, 1948, ...].
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 74054 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * 100000)


AV_ivar = filter(AV, tool == "ivar") %>% droplevels()
AV_lofreq = filter(AV, tool == "lofreq") %>% droplevels()
AV_varscan_unfiltered = filter(AV, tool == "varscan" & parameter == "unfiltered") %>% droplevels()
#AV_varscan_unfiltered$tool = paste0(AV_varscan_unfiltered$tool,"_",AV_varscan_unfiltered$parameter)
AV_varscan_filtered = filter(AV, tool == "varscan"& parameter == "filtered") %>% droplevels()
#AV_varscan_filtered$tool = paste0(AV_varscan_filtered$tool,"_",AV_varscan_filtered$parameter)
AV_tims = filter(AV, tool == "tims") %>% droplevels()
AV_freebayes = filter(AV, tool == "freebayes") %>% droplevels()
AV_mutect = filter(AV, tool == "mutect2") %>% droplevels()
AV_haplotypecaller = filter(AV, tool == "filtered") %>% droplevels()
AV_haplotypecaller$tool = "haplotypecaller"

AV = rbind(AV_ivar,AV_varscan_filtered,AV_tims,
           AV_freebayes,AV_mutect,AV_haplotypecaller,AV_lofreq)
AV$parameter = NULL
head(AV)
##   allele_freq seq_depth tool  pos af_golden af_workflow ref alt dp coverage
## 1        0.03     1e-05 ivar  351      0.03           0   G   A  1        1
## 2        0.03     1e-05 ivar  581      0.03           0   G   A  1        1
## 3        0.03     1e-05 ivar 1002      0.03           0   A   G  1        1
## 4        0.03     1e-05 ivar 1170      0.03           0   C   T  1        1
## 5        0.03     1e-05 ivar 1180      0.03           0   A   T  1        1
## 6        0.03     1e-05 ivar 1208      0.03           0   A   G  1        1
Dp_Variation = ggplot(AV, aes(x = log10(coverage), y = log10(dp), color = tool)) +
  geom_point() + 
  facet_wrap(~tool, scales = "free_y") +
  geom_abline() +
  scale_color_manual(values = MyColors)
## Warning: namespace 'plyr' is not available and has been replaced
## by .GlobalEnv when processing object ''
print(Dp_Variation)
## Warning: Removed 7991 rows containing missing values (geom_point).
plot of chunk unnamed-chunk-5

plot of chunk unnamed-chunk-5

ggsave(Dp_Variation, file = "AV_Dp_Coverage.png")
## Saving 7 x 7 in image
## Warning: Removed 7991 rows containing missing values (geom_point).

Reading in Golden Data and Evaluating

viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c("label","AF"))
viral_golden$label = NULL
head(viral_golden)
##       chrom  pos ID ref alt qual filter   AF          codes
## 1 SARS-CoV2  351  .   G   A    .      . 0.04 GT:AD:DP:GQ:PL
## 2 SARS-CoV2  581  .   G   A    .      .  0.4 GT:AD:DP:GQ:PL
## 3 SARS-CoV2 1002  .   A   G    .      . 0.46 GT:AD:DP:GQ:PL
## 4 SARS-CoV2 1170  .   C   T    .      . 0.18 GT:AD:DP:GQ:PL
## 5 SARS-CoV2 1180  .   A   T    .      . 0.73 GT:AD:DP:GQ:PL
## 6 SARS-CoV2 1208  .   A   G    .      . 0.18 GT:AD:DP:GQ:PL
##                                                                                                                                                          genotype
## 1                                                                                                                                            1/1/1/1:96,4:100:1:1
## 2                                                                   1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:60,40:100:1:1
## 3                                                       1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:54,46:100:1:1
## 4                                                                                                               1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:82,18:100:1:1
## 5 1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:27,73:100:1:1
## 6                                                                                                               1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:82,18:100:1:1
dim(viral_golden)
## [1] 161  10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
  geom_point()
print(viral_golden_pos)
plot of chunk unnamed-chunk-6

plot of chunk unnamed-chunk-6

TsTv(viral_golden)
##   transitions transversions
## 1         114            47
## [1] 2.425532

SetAF - Separating Data into TP, FP, FN

AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()

AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
  AV_setAF_truepos$category = c("TP")
  
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)

FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(TP_All) = c("allele_freq","coverage","tool","TP")
    
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
##   allele_freq coverage            tool FP  FN TP
## 1        0.01        1       freebayes NA 161 NA
## 2        0.01        1 haplotypecaller NA 161 NA
## 3        0.01        1            ivar NA 161 NA
## 4        0.01        1          lofreq NA 161 NA
## 5        0.01        1         mutect2  1 161 NA
## 6        0.01        1            tims  2 161 NA
AllVar[is.na(AllVar)] = 0
head(AllVar)
##   allele_freq coverage            tool FP  FN TP
## 1        0.01        1       freebayes  0 161  0
## 2        0.01        1 haplotypecaller  0 161  0
## 3        0.01        1            ivar  0 161  0
## 4        0.01        1          lofreq  0 161  0
## 5        0.01        1         mutect2  1 161  0
## 6        0.01        1            tims  2 161  0
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
##   allele_freq coverage            tool FP  FN TP prec recall area
## 1        0.01        1       freebayes  0 161  0  NaN      0  NaN
## 2        0.01        1 haplotypecaller  0 161  0  NaN      0  NaN
## 3        0.01        1            ivar  0 161  0  NaN      0  NaN
## 4        0.01        1          lofreq  0 161  0  NaN      0  NaN
## 5        0.01        1         mutect2  1 161  0    0      0    0
## 6        0.01        1            tims  2 161  0    0      0    0

RandomAF - Separating Data into TP, FP, FN

AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()

AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
  AV_randomAF_truepos$category = c("TP")
  
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)

rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(rTP_All) = c("allele_freq","coverage","tool","TP")

rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
##   allele_freq coverage            tool FP  FN TP
## 1      random        1       freebayes NA 154  7
## 2      random        1 haplotypecaller NA 154  7
## 3      random        1            ivar NA 161 NA
## 4      random        1          lofreq NA 161 NA
## 5      random        1         mutect2 NA 157  4
## 6      random        1            tims  4 136 25
rAllVar[is.na(rAllVar)] = 0

rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
##   allele_freq coverage            tool FP  FN TP     prec     recall       area
## 1      random        1       freebayes  0 154  7 1.000000 0.04347826 0.04347826
## 2      random        1 haplotypecaller  0 154  7 1.000000 0.04347826 0.04347826
## 3      random        1            ivar  0 161  0      NaN 0.00000000        NaN
## 4      random        1          lofreq  0 161  0      NaN 0.00000000        NaN
## 5      random        1         mutect2  0 157  4 1.000000 0.02484472 0.02484472
## 6      random        1            tims  4 136 25 0.862069 0.15527950 0.13386164

1. Precision_Recall Curves

PR_AllVarSet_freq = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color = tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~allele_freq) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_freq)
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_path).
plot of chunk unnamed-chunk-9

plot of chunk unnamed-chunk-9

ggsave("PR_AllVarSet_byAF.png",PR_AllVarSet_freq)
## Saving 7 x 7 in image
## Warning: Removed 38 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).
PR_AllVarSet_cover = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~coverage) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_cover)
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing missing values (geom_path).
plot of chunk unnamed-chunk-9

plot of chunk unnamed-chunk-9

ggsave("PR_AllVarSet_byCov.png",PR_AllVarSet_cover)
## Saving 7 x 7 in image
## Warning: Removed 38 rows containing missing values (geom_point).

## Warning: Removed 32 rows containing missing values (geom_path).
PR_AllVarRandom = ggplot(rAllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  scale_color_manual(values = MyColors)
print(PR_AllVarRandom)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_path).
plot of chunk unnamed-chunk-9

plot of chunk unnamed-chunk-9

ggsave("PR_AllVarRandom.png",PR_AllVarRandom)
## Saving 7 x 7 in image
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_path).
# Examining Data

AllVar_PR_zeros = filter(AllVar, prec == 0 & recall == 0)
AllVar_PR_high = filter(AllVar, prec > 0.9 & recall > 0.9)

AllVar_PR_highest = group_by(AllVar, tool, allele_freq, coverage) %>%
  arrange(area, decreasing = TRUE)
## Error: incorrect size (1) at position 2, expecting : 432
write.csv(AllVar_PR_highest, file = "AllVar_PR_Thresholds.csv")
## Error in is.data.frame(x): object 'AllVar_PR_highest' not found

2.1 SetAF - Proportion of Variants Found

AV_setAF_counts = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_setAF_counts$proportion = (AV_setAF_counts$n / nrow(viral_golden))

AV_setAF_proportion = ggplot(AV_setAF_counts,aes(x = log10(coverage), y = proportion,
                                                group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  facet_grid(~allele_freq) +
  scale_color_manual(values = MyColors) #+
  #theme(axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40))
print(AV_setAF_proportion)
## Registered S3 method overwritten by 'plyr':
##   method   from       
##   [.quoted R_GlobalEnv
plot of chunk unnamed-chunk-10

plot of chunk unnamed-chunk-10

ggsave("AllVarSet_Proportion.png", AV_setAF_proportion)
## Saving 7 x 7 in image
## Only looking at coverage between 100-1000
AV_setAF_truepos_f = filter(AV_setAF_truepos, coverage > 99 & coverage < 1001)
AV_setAF_counts_f = group_by(AV_setAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_setAF_counts_f$proportion = (AV_setAF_counts_f$n / nrow(viral_golden))
 
AV_setAF_proportion_f = ggplot(AV_setAF_counts_f,aes(x = log10(coverage), y = proportion,
                               group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  facet_grid(~allele_freq) +
  scale_color_manual(values = MyColors) +
  #scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000), 
                   #labels = c("100","200","300","400","500","600","700","800","900","1000"))  +
  theme(axis.text.x = element_text(angle = 90))
        #axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
print(AV_setAF_proportion_f)
plot of chunk unnamed-chunk-10

plot of chunk unnamed-chunk-10

ggsave("AllVarSet_Proportion_Filtered.png", AV_setAF_proportion_f)
## Saving 7 x 7 in image

2.2 RandomAF - Proportion of Variants Found

AV_randomAF_counts = group_by(AV_randomAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts$proportion = (AV_randomAF_counts$n / nrow(viral_golden))

AV_randomAF_proportion = ggplot(AV_randomAF_counts, aes(x = log10(coverage), y = proportion, 
                                                        group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors) #+
  #theme(axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion)
plot of chunk unnamed-chunk-11

plot of chunk unnamed-chunk-11

ggsave("AllVarRandom_Proportion.png", AV_randomAF_proportion)
## Saving 7 x 7 in image
## Only looking at coverage between 100-1000
AV_randomAF_truepos_f = filter(AV_randomAF_truepos, coverage > 99 & coverage < 1001)
AV_randomAF_counts_f = group_by(AV_randomAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts_f$proportion = (AV_randomAF_counts_f$n / nrow(viral_golden))
 
AV_randomAF_proportion_f = ggplot(AV_randomAF_counts_f, aes(x = log10(coverage), y = proportion, 
                                                             group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors) #+
  #scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000), 
                   #labels = c("100","200","300","400","500","600","700","800","900","1000"))  +
  #theme(axis.text = element_text(size = 8),
        #axis.title = element_text(size = 30),
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion_f)
plot of chunk unnamed-chunk-11

plot of chunk unnamed-chunk-11

ggsave("AllVarRandom_Proportion_Filtered.png", AV_randomAF_proportion_f)
## Saving 7 x 7 in image

3.1 SetAF - Correlation Between Golden and Workflow

## Only looking at true positives
AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
                                                color = tool, shape = category)) +
  geom_point() +
  scale_color_manual(values = MyColors) +
  geom_boxplot(aes(group = allele_freq)) + 
  facet_grid(tool~coverage) +
  #theme(axis.text = element_text(size = 8), 
        #axis.title = element_text(size = 20), 
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40)) +
  ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)
plot of chunk unnamed-chunk-12

plot of chunk unnamed-chunk-12

ggsave("AllVarSet_Correlation.png",AV_Corr_setAF)
## Saving 7 x 7 in image
##Looking at all SetAF Data
AV_setAF_f = filter(AV_setAF, coverage > 99 & coverage < 1001)

AV_Corr_setAF = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow, 
                                       color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  geom_smooth(method='lm', size = 1.5) +
  #theme(axis.text = element_text(size = 8), 
        #axis.title = element_text(size = 20), 
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40)) +
  ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)
## Warning: Removed 8 rows containing missing values (geom_smooth).
plot of chunk unnamed-chunk-12

plot of chunk unnamed-chunk-12

ggsave("AllVarSet_Correlation_Filtered.png",AV_Corr_setAF)
## Saving 7 x 7 in image
## Warning: Removed 8 rows containing missing values (geom_smooth).
AV_Corr_setAF_Zoom = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow,
                                            color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  geom_smooth(method='lm', size = 1.5) +
  #theme(axis.text = element_text(size = 8), 
        #axis.title = element_text(size = 20), 
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40)) +
  ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_setAF_Zoom)
## Warning: Removed 14645 rows containing non-finite values (stat_smooth).
## Warning: Removed 14645 rows containing missing values (geom_point).
plot of chunk unnamed-chunk-12

plot of chunk unnamed-chunk-12

ggsave("AllVarSet_CorrelationF_Filtered_Zoom.png", AV_Corr_setAF_Zoom)
## Saving 7 x 7 in image
## Warning: Removed 14645 rows containing non-finite values (stat_smooth).

## Warning: Removed 14645 rows containing missing values (geom_point).

3.2 RandomAF - Correlation Between Golden and Workflow

## Only looking at true positives
AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
                                                      color = tool, shape = category)) +
  geom_point() +
  scale_color_manual(values = MyColors) +
  facet_grid(tool~coverage) +
  #theme(axis.text = element_text(size = 8), 
        #axis.title = element_text(size = 20), 
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40)) +
  ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_randomAF)
plot of chunk unnamed-chunk-13

plot of chunk unnamed-chunk-13

ggsave("AllVarRandom_Correlation.png",AV_Corr_randomAF)
## Saving 7 x 7 in image
##Looking at all RandomAF Data
AV_randomAF_f = filter(AV_randomAF, coverage > 99 & coverage < 1001)

AV_Corr_randomAF = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow, 
                                             color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  geom_smooth(method='lm', size = 1.5) +
  #theme(axis.text = element_text(size = 8), 
        #axis.title = element_text(size = 20), 
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40)) +
  ylim(-0.01,1) + xlim(-0.01,1)
print(AV_Corr_randomAF)
plot of chunk unnamed-chunk-13

plot of chunk unnamed-chunk-13

ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF)
## Saving 7 x 7 in image
AV_Corr_randomAF_Zoom = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow,
                                                  color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  geom_smooth(method='lm', size = 1.5) +
  #theme(axis.text = element_text(size = 8), 
        #axis.title = element_text(size = 20), 
        #strip.text = element_text(size = 15),
        #axis.text.x = element_text(angle = 40)) +
  ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_randomAF_Zoom)
## Warning: Removed 3817 rows containing non-finite values (stat_smooth).
## Warning: Removed 3817 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_smooth).
plot of chunk unnamed-chunk-13

plot of chunk unnamed-chunk-13

ggsave("AllVarRandom_Correlation_Filtered_Zoom.png", AV_Corr_randomAF_Zoom)
## Saving 7 x 7 in image
## Warning: Removed 3817 rows containing non-finite values (stat_smooth).
## Warning: Removed 3817 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_smooth).

Stats for SetAF Data

coeff_var <- function(x) {
  CV <- sd(x) / mean(x) * 100
  return(CV)
}

AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
colnames(AV_setAF_split) = c("tool","AF","cov","mean","sd","variance")

Mean_setAF = ggplot(AV_setAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)
plot of chunk unnamed-chunk-14

plot of chunk unnamed-chunk-14

ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF)
## Saving 7 x 7 in image
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
## Warning: Removed 15 rows containing missing values (geom_point).
plot of chunk unnamed-chunk-14

plot of chunk unnamed-chunk-14

ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF)
## Saving 7 x 7 in image
## Warning: Removed 15 rows containing missing values (geom_point).

Set Up for Stats for RandomAF Data

AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
colnames(AV_randomAF_split) = c("tool","cov","mean","sd","variance")

Mean_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point()
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## List of 2
##  $ text       :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 15
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
print(Mean_randomAF)
plot of chunk unnamed-chunk-15

plot of chunk unnamed-chunk-15

ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF)
## Saving 7 x 7 in image
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)
plot of chunk unnamed-chunk-15

plot of chunk unnamed-chunk-15

ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF)
## Saving 7 x 7 in image

4. Look at how each tool compares at each AF and seq-depth

AVSet_Corr = ggplot(AV_setAF_truepos, aes(x = tool, y = af_workflow)) + 
  geom_point() +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  facet_grid(allele_freq~coverage, scales = "free_y")
print(AVSet_Corr)
plot of chunk unnamed-chunk-16

plot of chunk unnamed-chunk-16

ggsave("AllVarSet_Tool_Comparison.pdf", AVSet_Corr, width = 15, height = 20)
# y axes are set per row at the moment (scales = "free_y")

AVRandom_Corr = ggplot(AV_randomAF_truepos, aes(x = tool, y = af_workflow)) + 
  geom_point() +
  geom_violin() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  facet_grid(allele_freq~coverage)
print(AVRandom_Corr)
plot of chunk unnamed-chunk-16

plot of chunk unnamed-chunk-16

ggsave("AllVarRandom_Tool_Comparison.pdf", AVRandom_Corr, width = 15, height = 20)

5.1. Looks at where the SNPS are located throughout the genome

# Set AF Data
AV_Pos = ggplot(AV_setAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos)
plot of chunk unnamed-chunk-17

plot of chunk unnamed-chunk-17

AV_Pos_falsepos = ggplot(AV_setAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos)
plot of chunk unnamed-chunk-17

plot of chunk unnamed-chunk-17

AV_Pos_falseneg = ggplot(AV_setAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg)
plot of chunk unnamed-chunk-17

plot of chunk unnamed-chunk-17

# Random AF Data
AV_Pos_r = ggplot(AV_randomAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_r)
plot of chunk unnamed-chunk-17

plot of chunk unnamed-chunk-17

AV_Pos_falsepos_r = ggplot(AV_randomAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos_r)
plot of chunk unnamed-chunk-17

plot of chunk unnamed-chunk-17

AV_Pos_falseneg_r = ggplot(AV_randomAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg_r)
plot of chunk unnamed-chunk-17

plot of chunk unnamed-chunk-17

5.2.Transitions/Transversions

setAF_TP_pos = ggplot(AV_setAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
  geom_point()
print(setAF_TP_pos)
plot of chunk unnamed-chunk-18

plot of chunk unnamed-chunk-18

TsTv(AV_setAF_truepos)
##   transitions transversions
## 1       30901         13001
## [1] 2.376817
#counts variants from all tools/seq_depth/AF trials

randomAF_TP_pos = ggplot(AV_randomAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
  geom_point()
print(randomAF_TP_pos)
plot of chunk unnamed-chunk-18

plot of chunk unnamed-chunk-18

TsTv(AV_randomAF_truepos)
##   transitions transversions
## 1        4584          1875
## [1] 2.4448