This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:
- Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
- Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
- Looks at correlation between SNPS in the golden vcf and worflow vcf For viral variant callers!
Loading Libraries
library('ggplot2')
library('plyr')
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## v purrr 0.3.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library('ggpubr')
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
library("MLmetrics")
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
MyColors = c("#E76F51", "#E9C369", "#2A9D8F","#2B4FA2", "#119DAC", "#672D7B", "#262366", "#BCBCBC")
ggplot2::theme_set(theme_minimal())
detach(package:plyr, unload=TRUE)
library("plotrix")
Making Transition/Transversion Function
TsTv = function(dataframe) {
transitions = 0
for (row in 1:nrow(dataframe)) {
REF <- dataframe[row, "ref"] #requires dataframe to have column name "ref"
ALT <- dataframe[row, "alt"] # requires dataframe to have column name "alt"
if(REF == "A" & ALT == "G")
transitions = transitions + 1
if(REF == "G" & ALT == "A")
transitions = transitions + 1
if(REF == "C" & ALT == "T")
transitions = transitions + 1
if(REF == "T" & ALT == "C")
transitions = transitions + 1
}
transversions = 0
for (row in 1:nrow(dataframe)) {
REF <- dataframe[row, "ref"]
ALT <- dataframe[row, "alt"]
if(REF == "A" & ALT == "T")
transversions = transversions + 1
if(REF == "T" & ALT == "A")
transversions = transversions + 1
if(REF == "C" & ALT == "A")
transversions = transversions + 1
if(REF == "A" & ALT == "C")
transversions = transversions + 1
if(REF == "G" & ALT == "T")
transversions = transversions + 1
if(REF == "T" & ALT == "G")
transversions = transversions + 1
if(REF == "C" & ALT == "G")
transversions = transversions + 1
if(REF == "G" & ALT == "C")
transversions = transversions + 1
}
TsTv_df = data.frame(transitions,transversions)
print(TsTv_df)
TsTv_ratio = (transitions/transversions)
print(TsTv_ratio)
}
Reading in Variant Data and Evaluating
AV = read.csv(af_report,header = T)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_",
into = c("pipe","AF","allele_freq","frac","seq_depth","tool","parameter")) %>%
select(-c(pipe, AF, frac)) %>%
filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>%
filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
## Warning: Expected 7 pieces. Additional pieces discarded in 1788 rows [2863,
## 2864, 2865, 2866, 2867, 2868, 2869, 2870, 2871, 2872, 2873, 2874, 2875, 2876,
## 2877, 2878, 2879, 2880, 2881, 2882, ...].
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 9992 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * 100000)
AV_ivar = filter(AV, tool == "ivar") %>% droplevels()
AV_lofreq = filter(AV, tool == "lofreq") %>% droplevels()
AV_varscan_unfiltered = filter(AV, tool == "varscan" & parameter == "unfiltered") %>% droplevels()
#AV_varscan_unfiltered$tool = paste0(AV_varscan_unfiltered$tool,"_",AV_varscan_unfiltered$parameter)
AV_varscan_filtered = filter(AV, tool == "varscan"& parameter == "filtered") %>% droplevels()
#AV_varscan_filtered$tool = paste0(AV_varscan_filtered$tool,"_",AV_varscan_filtered$parameter)
AV_tims = filter(AV, tool == "tims") %>% droplevels()
AV_freebayes = filter(AV, tool == "freebayes") %>% droplevels()
AV_mutect = filter(AV, tool == "mutect2") %>% droplevels()
AV_haplotypecaller = filter(AV, tool == "filtered") %>% droplevels()
AV_haplotypecaller$tool = "haplotypecaller"
AV = rbind(AV_ivar,AV_varscan_filtered,AV_tims,
AV_freebayes,AV_mutect,AV_haplotypecaller,AV_lofreq)
AV$parameter = NULL
head(AV)
## allele_freq seq_depth tool pos af_golden af_workflow ref alt dp coverage
## 1 0.03 1 ivar 29884 0.03 0.00 A G 10000 1e+05
## 2 0.03 1 ivar 595 0.03 0.03 T A 17571 1e+05
## 3 0.03 1 ivar 967 0.03 0.03 T C 11717 1e+05
## 4 0.03 1 ivar 1304 0.03 0.03 G T 10177 1e+05
## 5 0.03 1 ivar 1460 0.03 0.03 A G 10149 1e+05
## 6 0.03 1 ivar 1544 0.03 0.03 G T 10210 1e+05
Dp_Variation = ggplot(AV, aes(x = log10(coverage), y = log10(dp), color = tool)) +
geom_point() +
facet_wrap(~tool, scales = "free_y") +
geom_abline() +
scale_color_manual(values = MyColors)
## Warning: namespace 'plyr' is not available and has been replaced
## by .GlobalEnv when processing object ''
print(Dp_Variation)
## Warning: Removed 1541 rows containing missing values (geom_point).

ggsave(Dp_Variation, file = "AV_Dp_Coverage.png")
## Saving 7 x 5 in image
## Warning: Removed 1541 rows containing missing values (geom_point).
Reading in Golden Data and Evaluating
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c("label","AF"))
viral_golden$label = NULL
head(viral_golden)
## chrom pos ID ref alt qual filter AF codes
## 1 SARS-CoV2 595 . T A . . 0.1 GT:AD:DP:GQ:PL
## 2 SARS-CoV2 967 . T C . . 0.1 GT:AD:DP:GQ:PL
## 3 SARS-CoV2 1304 . G T . . 0.1 GT:AD:DP:GQ:PL
## 4 SARS-CoV2 1460 . A G . . 0.1 GT:AD:DP:GQ:PL
## 5 SARS-CoV2 1544 . G T . . 0.1 GT:AD:DP:GQ:PL
## 6 SARS-CoV2 1619 . G C . . 0.1 GT:AD:DP:GQ:PL
## genotype
## 1 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 2 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 3 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 4 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 5 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
## 6 1/1/1/1/1/1/1/1/1/1:90,10:100:1:1
dim(viral_golden)
## [1] 149 10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
geom_point()
print(viral_golden_pos)

TsTv(viral_golden)
## transitions transversions
## 1 111 38
## [1] 2.921053
SetAF - Separating Data into TP, FP, FN
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()
AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
AV_setAF_truepos$category = c("TP")
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)
FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(TP_All) = c("allele_freq","coverage","tool","TP")
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
## allele_freq coverage tool FP FN TP
## 1 0.02 1000 freebayes 9 65 84
## 2 0.02 1000 haplotypecaller NA 69 80
## 3 0.02 1000 ivar NA 146 3
## 4 0.02 1000 lofreq NA 120 29
## 5 0.02 1000 mutect2 NA 108 41
## 6 0.02 1000 tims NA 90 59
AllVar[is.na(AllVar)] = 0
head(AllVar)
## allele_freq coverage tool FP FN TP
## 1 0.02 1000 freebayes 9 65 84
## 2 0.02 1000 haplotypecaller 0 69 80
## 3 0.02 1000 ivar 0 146 3
## 4 0.02 1000 lofreq 0 120 29
## 5 0.02 1000 mutect2 0 108 41
## 6 0.02 1000 tims 0 90 59
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
## allele_freq coverage tool FP FN TP prec recall
## 1 0.02 1000 freebayes 9 65 84 0.9032258 0.56375839
## 2 0.02 1000 haplotypecaller 0 69 80 1.0000000 0.53691275
## 3 0.02 1000 ivar 0 146 3 1.0000000 0.02013423
## 4 0.02 1000 lofreq 0 120 29 1.0000000 0.19463087
## 5 0.02 1000 mutect2 0 108 41 1.0000000 0.27516779
## 6 0.02 1000 tims 0 90 59 1.0000000 0.39597315
## area
## 1 0.50920113
## 2 0.53691275
## 3 0.02013423
## 4 0.19463087
## 5 0.27516779
## 6 0.39597315
RandomAF - Separating Data into TP, FP, FN
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()
AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
if (NROW(AV_randomAF_falsepos) > 0){
AV_randomAF_falsepos$category = c("FP")
}
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
if (NROW(AV_randomAF_falseneg) > 0){
AV_randomAF_falseneg$category = c("FN")
}
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
if (NROW(AV_randomAF_truepos) > 0){
AV_randomAF_truepos$category = c("TP")
}
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)
rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(rTP_All) = c("allele_freq","coverage","tool","TP")
rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
## allele_freq coverage tool FP FN TP
## 1 random 1000 freebayes 14 10 139
## 2 random 1000 haplotypecaller NA 6 143
## 3 random 1000 ivar NA 19 130
## 4 random 1000 lofreq NA 11 138
## 5 random 1000 mutect2 NA 46 103
## 6 random 1000 tims NA 17 132
rAllVar[is.na(rAllVar)] = 0
rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
## allele_freq coverage tool FP FN TP prec recall area
## 1 random 1000 freebayes 14 10 139 0.9084967 0.9328859 0.8475238
## 2 random 1000 haplotypecaller 0 6 143 1.0000000 0.9597315 0.9597315
## 3 random 1000 ivar 0 19 130 1.0000000 0.8724832 0.8724832
## 4 random 1000 lofreq 0 11 138 1.0000000 0.9261745 0.9261745
## 5 random 1000 mutect2 0 46 103 1.0000000 0.6912752 0.6912752
## 6 random 1000 tims 0 17 132 1.0000000 0.8859060 0.8859060
1. Precision_Recall Curves
PR_AllVarSet_freq = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color = tool)) +
geom_point() +
geom_line() +
facet_wrap(~allele_freq) +
scale_color_manual(values = MyColors)
print(PR_AllVarSet_freq)

ggsave("PR_AllVarSet_byAF.png",PR_AllVarSet_freq)
## Saving 7 x 5 in image
PR_AllVarSet_cover = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
geom_point() +
geom_line() +
facet_wrap(~coverage) +
scale_color_manual(values = MyColors)
print(PR_AllVarSet_cover)

ggsave("PR_AllVarSet_byCov.png",PR_AllVarSet_cover)
## Saving 7 x 5 in image
PR_AllVarRandom = ggplot(rAllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors)
print(PR_AllVarRandom)

ggsave("PR_AllVarRandom.png",PR_AllVarRandom)
## Saving 7 x 5 in image
# Examining Data
AllVar_PR_zeros = filter(AllVar, prec == 0 & recall == 0)
AllVar_PR_high = filter(AllVar, prec > 0.9 & recall > 0.9)
AllVar_PR_highest = group_by(AllVar, tool, allele_freq, coverage) %>%
arrange(area, decreasing = TRUE)
write.csv(AllVar_PR_highest, file = "AllVar_PR_Thresholds.csv")
2.1 SetAF - Proportion of Variants Found
#AV_setAF_counts = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>% tally()
#AV_setAF_counts$proportion = (AV_setAF_counts$n / nrow(viral_golden))
#AV_setAF_proportion = ggplot(AV_setAF_counts,aes(x = log10(coverage), y = proportion,
# group = tool, color = tool)) +
# geom_point() +
# geom_line() +
# facet_grid(~allele_freq) +
# scale_color_manual(values = MyColors) #+
# #theme(axis.text = element_text(size = 8),
# #axis.title = element_text(size = 30),
# #strip.text = element_text(size = 15),
# #axis.text.x = element_text(angle = 40))
#print(AV_setAF_proportion)
#ggsave("AllVarSet_Proportion.png", AV_setAF_proportion)
## Only looking at coverage between 100-1000
#AV_setAF_truepos_f = filter(AV_setAF_truepos, coverage > 99 & coverage < 1001)
#AV_setAF_counts_f = group_by(AV_setAF_truepos_f, tool, allele_freq, coverage) %>% tally()
#AV_setAF_counts_f$proportion = (AV_setAF_counts_f$n / nrow(viral_golden))
#AV_setAF_proportion_f = ggplot(AV_setAF_counts_f,aes(x = log10(coverage), y = proportion, group = tool, color = tool)) +
# geom_point() +
# geom_line() +
# facet_grid(~allele_freq) +
# scale_color_manual(values = MyColors) +
# #scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000),
# #labels = c("100","200","300","400","500","600","700","800","900","1000")) +
# theme(axis.text.x = element_text(angle = 90))
# #axis.text = element_text(size = 8),
# #axis.title = element_text(size = 30),
# #strip.text = element_text(size = 15),
#print(AV_setAF_proportion_f)
#ggsave("AllVarSet_Proportion_Filtered.png", AV_setAF_proportion_f)
2.2 RandomAF - Proportion of Variants Found
AV_randomAF_counts = group_by(AV_randomAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts$proportion = (AV_randomAF_counts$n / nrow(viral_golden))
AV_randomAF_proportion = ggplot(AV_randomAF_counts, aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors) #+
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 30),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion)

ggsave("AllVarRandom_Proportion.png", AV_randomAF_proportion)
## Saving 7 x 5 in image
## Only looking at coverage between 100-1000
AV_randomAF_truepos_f = filter(AV_randomAF_truepos, coverage > 99 & coverage < 1001)
AV_randomAF_counts_f = group_by(AV_randomAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts_f$proportion = (AV_randomAF_counts_f$n / nrow(viral_golden))
AV_randomAF_proportion_f = ggplot(AV_randomAF_counts_f, aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors) #+
#scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000),
#labels = c("100","200","300","400","500","600","700","800","900","1000")) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 30),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion_f)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

ggsave("AllVarRandom_Proportion_Filtered.png", AV_randomAF_proportion_f)
## Saving 7 x 5 in image
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
3.1 SetAF - Correlation Between Golden and Workflow
## Only looking at true positives
#AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
# color = tool, shape = category)) +
# geom_point() +
# scale_color_manual(values = MyColors) +
# geom_boxplot(aes(group = allele_freq)) +
# facet_grid(tool~coverage) +
# #theme(axis.text = element_text(size = 8),
# #axis.title = element_text(size = 20),
# #strip.text = element_text(size = 15),
# #axis.text.x = element_text(angle = 40)) +
# ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_setAF)
#ggsave("AllVarSet_Correlation.png",AV_Corr_setAF)
##Looking at all SetAF Data
#AV_setAF_f = filter(AV_setAF, coverage > 99 & coverage < 1001)
#AV_Corr_setAF = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow,
# color = tool, group = tool, shape = category)) +
# geom_point(size = 3, alpha = 0.4) +
# scale_color_manual(values = MyColors) +
# facet_grid(~coverage) +
# geom_smooth(method='lm', size = 1.5) +
# #theme(axis.text = element_text(size = 8),
# #axis.title = element_text(size = 20),
# #strip.text = element_text(size = 15),
# #axis.text.x = element_text(angle = 40)) +
# ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_setAF)
#ggsave("AllVarSet_Correlation_Filtered.png",AV_Corr_setAF)
#AV_Corr_setAF_Zoom = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow,
# color = tool, group = tool, shape = category)) +
# geom_point(size = 3, alpha = 0.4) +
# scale_color_manual(values = MyColors) +
# facet_grid(~coverage) +
# geom_smooth(method='lm', size = 1.5) +
# #theme(axis.text = element_text(size = 8),
# #axis.title = element_text(size = 20),
# #strip.text = element_text(size = 15),
# #axis.text.x = element_text(angle = 40)) +
# ylim(-0.01,0.15) + xlim(-0.01,0.15)
#print(AV_Corr_setAF_Zoom)
#ggsave("AllVarSet_CorrelationF_Filtered_Zoom.png", AV_Corr_setAF_Zoom)
3.2 RandomAF - Correlation Between Golden and Workflow
## Only looking at true positives
#AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
# color = tool, shape = category)) +
# geom_point() +
# scale_color_manual(values = MyColors) +
# facet_grid(tool~coverage) +
# #theme(axis.text = element_text(size = 8),
# #axis.title = element_text(size = 20),
# #strip.text = element_text(size = 15),
# #axis.text.x = element_text(angle = 40)) +
# ylim(-0.001,1.0) + xlim(-0.001,1.0)
#print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation.png",AV_Corr_randomAF)
##Looking at all RandomAF Data
#AV_randomAF_f = filter(AV_randomAF, coverage > 99 & coverage < 1001)
#AV_Corr_randomAF = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow,
# color = tool, group = tool, shape = category)) +
# geom_point(size = 3, alpha = 0.4) +
# scale_color_manual(values = MyColors) +
# facet_grid(~coverage) +
# geom_smooth(method='lm', size = 1.5) +
# #theme(axis.text = element_text(size = 8),
# #axis.title = element_text(size = 20),
# #strip.text = element_text(size = 15),
# #axis.text.x = element_text(angle = 40)) +
# ylim(-0.01,1) + xlim(-0.01,1)
#print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF)
#AV_Corr_randomAF_Zoom = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow,
# color = tool, group = tool, shape = category)) +
# geom_point(size = 3, alpha = 0.4) +
# scale_color_manual(values = MyColors) +
# facet_grid(~coverage) +
# geom_smooth(method='lm', size = 1.5) +
# #theme(axis.text = element_text(size = 8),
# #axis.title = element_text(size = 20),
# #strip.text = element_text(size = 15),
# #axis.text.x = element_text(angle = 40)) +
# ylim(-0.01,0.15) + xlim(-0.01,0.15)
#print(AV_Corr_randomAF_Zoom)
#ggsave("AllVarRandom_Correlation_Filtered_Zoom.png", AV_Corr_randomAF_Zoom)
Stats for SetAF Data
coeff_var <- function(x) {
CV <- sd(x) / mean(x) * 100
return(CV)
}
AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
## `summarise()` regrouping output by 'tool', 'allele_freq' (override with `.groups` argument)
colnames(AV_setAF_split) = c("tool","AF","cov","mean","sd","variance")
Mean_setAF = ggplot(AV_setAF_split, aes(x = tool, y = mean, color = cov)) +
geom_point() +
facet_grid(rows = vars(tool), cols = vars(AF)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)

ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF)
## Saving 7 x 5 in image
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = tool, y = variance, color = cov)) +
geom_point() +
facet_grid(rows = vars(tool), cols = vars(AF)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)

ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF)
## Saving 7 x 5 in image
Set Up for Stats for RandomAF Data
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
## `summarise()` regrouping output by 'tool' (override with `.groups` argument)
colnames(AV_randomAF_split) = c("tool","cov","mean","sd","variance")
Mean_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = mean, color = cov)) +
geom_point()
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## List of 2
## $ text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 15
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : num 0.5
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
print(Mean_randomAF)

ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF)
## Saving 7 x 5 in image
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = variance, color = cov)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)

ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF)
## Saving 7 x 5 in image
5.1. Looks at where the SNPS are located throughout the genome
# Set AF Data
AV_Pos = ggplot(AV_setAF_truepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos)

AV_Pos_falsepos = ggplot(AV_setAF_falsepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falsepos)

AV_Pos_falseneg = ggplot(AV_setAF_falseneg, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falseneg)

# Random AF Data
AV_Pos_r = ggplot(AV_randomAF_truepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_r)

AV_Pos_falsepos_r = ggplot(AV_randomAF_falsepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falsepos_r)

AV_Pos_falseneg_r = ggplot(AV_randomAF_falseneg, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falseneg_r)

5.2.Transitions/Transversions
#setAF_TP_pos = ggplot(AV_setAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
# geom_point()
#print(setAF_TP_pos)
#TsTv(AV_setAF_truepos)
#counts variants from all tools/seq_depth/AF trials
#randomAF_TP_pos = ggplot(AV_randomAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
# geom_point()
#print(randomAF_TP_pos)
#TsTv(AV_randomAF_truepos)