This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:
- Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
- Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
- Looks at correlation between SNPS in the golden vcf and worflow vcf For viral variant callers!
Loading Libraries
library('ggplot2')
library('tidyverse')
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✔ tibble 3.0.4 ✔ dplyr 1.0.2
✔ tidyr 1.0.0 ✔ stringr 1.4.0
✔ readr 1.3.1 ✔ forcats 0.4.0
✔ purrr 0.3.3
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library('plyr')
------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------
Attaching package: 'plyr'
The following objects are masked from 'package:dplyr':
arrange, count, desc, failwith, id, mutate, rename, summarise,
summarize
The following object is masked from 'package:purrr':
compact
library('ggpubr')
Attaching package: 'ggpubr'
The following object is masked from 'package:plyr':
mutate
library("MLmetrics")
Attaching package: 'MLmetrics'
The following object is masked from 'package:base':
Recall
MyColors = c("#E76F51", "#E9C369", "#2A9D8F","#2B4FA2", "#119DAC", "#672D7B", "#262366", "#BCBCBC")
ggplot2::theme_set(theme_minimal())
detach(package:plyr, unload=TRUE)
library("plotrix")
Making Transition/Transversion Function
TsTv = function(dataframe) {
transitions = 0
for (row in 1:nrow(dataframe)) {
REF <- dataframe[row, "ref"] #requires dataframe to have column name "ref"
ALT <- dataframe[row, "alt"] # requires dataframe to have column name "alt"
if(REF == "A" & ALT == "G")
transitions = transitions + 1
if(REF == "G" & ALT == "A")
transitions = transitions + 1
if(REF == "C" & ALT == "T")
transitions = transitions + 1
if(REF == "T" & ALT == "C")
transitions = transitions + 1
}
transversions = 0
for (row in 1:nrow(dataframe)) {
REF <- dataframe[row, "ref"]
ALT <- dataframe[row, "alt"]
if(REF == "A" & ALT == "T")
transversions = transversions + 1
if(REF == "T" & ALT == "A")
transversions = transversions + 1
if(REF == "C" & ALT == "A")
transversions = transversions + 1
if(REF == "A" & ALT == "C")
transversions = transversions + 1
if(REF == "G" & ALT == "T")
transversions = transversions + 1
if(REF == "T" & ALT == "G")
transversions = transversions + 1
if(REF == "C" & ALT == "G")
transversions = transversions + 1
if(REF == "G" & ALT == "C")
transversions = transversions + 1
}
TsTv_df = data.frame(transitions,transversions)
print(TsTv_df)
TsTv_ratio = (transitions/transversions)
print(TsTv_ratio)
}
Reading in Variant Data and Evaluating
AV = read.csv(af_report,header = T)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_",
into = c(NA,NA,"allele_freq",NA,"seq_depth","tool","parameter"))%>%
filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>%
filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
Warning: Expected 7 pieces. Additional pieces discarded in 9552 rows [5010,
5011, 5012, 5013, 5014, 5015, 5016, 5017, 5018, 5019, 5020, 5021, 5022, 5023,
5024, 5025, 5026, 5027, 5028, 5029, ...].
Warning: Expected 7 pieces. Missing pieces filled with `NA` in 44000 rows [458,
459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474,
475, 476, 477, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * 100000)
AV$parameter = NULL
head(AV)
Dp_Variation = ggplot(AV, aes(x = log10(coverage), y = log10(dp), color = tool)) +
geom_point() +
facet_wrap(~tool, scales = "free_y") +
scale_color_manual(values = MyColors)
Warning: namespace 'plyr' is not available and has been replaced
by .GlobalEnv when processing object ''
print(Dp_Variation)
Warning: Removed 7435 rows containing missing values (geom_point).

ggsave(Dp_Variation, file = "AV_Dp_Coverage.png")
Saving 7 x 5 in image
Warning: Removed 7435 rows containing missing values (geom_point).
Reading in Golden Data and Evaluating
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c("label","AF"))
viral_golden$label = NULL
head(viral_golden)
dim(viral_golden)
[1] 150 10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
geom_point()
print(viral_golden_pos)

TsTv(viral_golden)
transitions transversions
1 109 41
[1] 2.658537
SetAF - Separating Data into TP, FP, FN
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()
AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
AV_setAF_truepos$category = c("TP")
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)
FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(TP_All) = c("allele_freq","coverage","tool","TP")
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
AllVar[is.na(AllVar)] = 0
head(AllVar)
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
RandomAF - Separating Data into TP, FP, FN
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()
AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
AV_randomAF_truepos$category = c("TP")
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)
rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(rTP_All) = c("allele_freq","coverage","tool","TP")
rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
rAllVar[is.na(rAllVar)] = 0
rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
1. Precision_Recall Curves
PR_AllVarSet_freq = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color = tool)) +
geom_point() +
geom_line() +
facet_wrap(~allele_freq) +
scale_color_manual(values = MyColors)
print(PR_AllVarSet_freq)
Warning: Removed 4 rows containing missing values (geom_point).
Warning: Removed 4 row(s) containing missing values (geom_path).

ggsave("PR_AllVarSet_byAF.png",PR_AllVarSet_freq)
Saving 7 x 5 in image
Warning: Removed 4 rows containing missing values (geom_point).
Warning: Removed 4 row(s) containing missing values (geom_path).
PR_AllVarSet_cover = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
geom_point() +
geom_line() +
facet_wrap(~coverage) +
scale_color_manual(values = MyColors)
print(PR_AllVarSet_cover)
Warning: Removed 4 rows containing missing values (geom_point).
Warning: Removed 2 row(s) containing missing values (geom_path).

ggsave("PR_AllVarSet_byCov.png",PR_AllVarSet_cover)
Saving 7 x 5 in image
Warning: Removed 4 rows containing missing values (geom_point).
Warning: Removed 2 row(s) containing missing values (geom_path).
PR_AllVarRandom = ggplot(rAllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors)
print(PR_AllVarRandom)

ggsave("PR_AllVarRandom.png",PR_AllVarRandom)
Saving 7 x 5 in image
# Examining Data
AllVar_PR_zeros = filter(AllVar, prec == 0 & recall == 0)
AllVar_PR_high = filter(AllVar, prec > 0.9 & recall > 0.9)
AllVar_PR_highest = group_by(AllVar, tool, allele_freq, coverage) %>%
arrange(area, decreasing = TRUE)
write.csv(AllVar_PR_highest, file = "AllVar_PR_Thresholds.csv")
2.1 SetAF - Proportion of Variants Found
AV_setAF_counts = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_setAF_counts$proportion = (AV_setAF_counts$n / nrow(viral_golden))
AV_setAF_proportion = ggplot(AV_setAF_counts,aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
facet_grid(~allele_freq) +
scale_color_manual(values = MyColors)
print(AV_setAF_proportion)

ggsave("AllVarSet_Proportion.png", AV_setAF_proportion)
Saving 7 x 5 in image
2.2 RandomAF - Proportion of Variants Found
AV_randomAF_counts = group_by(AV_randomAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts$proportion = (AV_randomAF_counts$n / nrow(viral_golden))
AV_randomAF_proportion = ggplot(AV_randomAF_counts, aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors)
print(AV_randomAF_proportion)

ggsave("AllVarRandom_Proportion.png", AV_randomAF_proportion)
Saving 7 x 5 in image
## Only looking at coverage between 100-1000
AV_randomAF_truepos_f = filter(AV_randomAF_truepos, coverage > 99 & coverage < 1001)
AV_randomAF_counts_f = group_by(AV_randomAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts_f$proportion = (AV_randomAF_counts_f$n / nrow(viral_golden))
AV_randomAF_proportion_f = ggplot(AV_randomAF_counts_f, aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors)
print(AV_randomAF_proportion_f)

ggsave("AllVarRandom_Proportion_Filtered.png", AV_randomAF_proportion_f)
Saving 7 x 5 in image
3.1 SetAF - Correlation Between Golden and Workflow
## Only looking at true positives
AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
color = tool, shape = category)) +
geom_point() +
scale_color_manual(values = MyColors) +
geom_boxplot(aes(group = allele_freq)) +
facet_grid(tool~coverage) +
ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)

ggsave("AllVarSet_Correlation.png",AV_Corr_setAF)
Saving 7 x 5 in image
##Looking at all SetAF Data
AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
geom_smooth(method='lm', size = 1.5) +
ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 6 rows containing missing values (geom_smooth).

ggsave("AllVarSet_Correlation_Filtered.png",AV_Corr_setAF)
Saving 7 x 5 in image
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 6 rows containing missing values (geom_smooth).
AV_Corr_setAF_Zoom = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_setAF_Zoom)
Warning: Removed 22195 rows containing missing values (geom_point).

ggsave("AllVarSet_CorrelationF_Filtered_Zoom.png", AV_Corr_setAF_Zoom)
Saving 7 x 5 in image
Warning: Removed 22195 rows containing missing values (geom_point).
3.2 RandomAF - Correlation Between Golden and Workflow
## Only looking at true positives
AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
color = tool, shape = category)) +
geom_point() +
scale_color_manual(values = MyColors) +
facet_grid(tool~coverage) +
ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_randomAF)

ggsave("AllVarRandom_Correlation.png",AV_Corr_randomAF)
Saving 7 x 5 in image
##Looking at all RandomAF Data
AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
geom_smooth(method='lm', size = 1.5) +
ylim(-0.01,1) + xlim(-0.01,1)
print(AV_Corr_randomAF)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 8 rows containing missing values (geom_smooth).

ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF)
Saving 7 x 5 in image
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 8 rows containing missing values (geom_smooth).
AV_Corr_randomAF_Zoom = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
geom_smooth(method='lm', size = 1.5) +
ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_randomAF_Zoom)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 5942 rows containing non-finite values (stat_smooth).
Warning: Removed 5942 rows containing missing values (geom_point).
Warning: Removed 8 rows containing missing values (geom_smooth).

ggsave("AllVarRandom_Correlation_Filtered_Zoom.png", AV_Corr_randomAF_Zoom)
Saving 7 x 5 in image
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 5942 rows containing non-finite values (stat_smooth).
Warning: Removed 5942 rows containing missing values (geom_point).
Warning: Removed 8 rows containing missing values (geom_smooth).
Stats for SetAF Data
coeff_var <- function(x) {
CV <- sd(x) / mean(x) * 100
return(CV)
}
AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
`summarise()` regrouping output by 'tool', 'allele_freq' (override with `.groups` argument)
colnames(AV_setAF_split) = c("tool","AF","cov","mean","sd","variance")
Mean_setAF = ggplot(AV_setAF_split, aes(x = tool, y = mean, color = cov)) +
geom_point() +
facet_grid(rows = vars(tool), cols = vars(AF)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)

ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF)
Saving 7 x 5 in image
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = tool, y = variance, color = cov)) +
geom_point() +
facet_grid(rows = vars(tool), cols = vars(AF)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
Warning: Removed 3 rows containing missing values (geom_point).

ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF)
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values (geom_point).
Set Up for Stats for RandomAF Data
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
`summarise()` regrouping output by 'tool' (override with `.groups` argument)
colnames(AV_randomAF_split) = c("tool","cov","mean","sd","variance")
Mean_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = mean, color = cov)) +
geom_point()
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
List of 2
$ text :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : num 15
..$ hjust : NULL
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi FALSE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.text.x:List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : num 1
..$ vjust : num 0.5
..$ angle : num 90
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi FALSE
..- attr(*, "class")= chr [1:2] "element_text" "element"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
print(Mean_randomAF)

ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF)
Saving 7 x 5 in image
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = variance, color = cov)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)

ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF)
Saving 7 x 5 in image
5.1. Looks at where the SNPS are located throughout the genome
# Set AF Data
AV_Pos = ggplot(AV_setAF_truepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos)

AV_Pos_falsepos = ggplot(AV_setAF_falsepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falsepos)

AV_Pos_falseneg = ggplot(AV_setAF_falseneg, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falseneg)

# Random AF Data
AV_Pos_r = ggplot(AV_randomAF_truepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_r)

AV_Pos_falsepos_r = ggplot(AV_randomAF_falsepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falsepos_r)

AV_Pos_falseneg_r = ggplot(AV_randomAF_falseneg, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falseneg_r)

5.2.Transitions/Transversions
setAF_TP_pos = ggplot(AV_setAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
geom_point()
print(setAF_TP_pos)

TsTv(AV_setAF_truepos)
transitions transversions
1 34795 13141
[1] 2.64782
#counts variants from all tools/seq_depth/AF trials
randomAF_TP_pos = ggplot(AV_randomAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
geom_point()
print(randomAF_TP_pos)

TsTv(AV_randomAF_truepos)
transitions transversions
1 4961 1854
[1] 2.675836
---
title: "Variants_AllCallers"
output: html_notebook
---

##This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:
1. Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
2. Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
3. Looks at correlation between SNPS in the golden vcf and worflow vcf
For viral variant callers!


##Loading Libraries
```{r}
library('ggplot2')
library('tidyverse')
library('plyr')
library('ggpubr')
library("MLmetrics")

MyColors = c("#E76F51", "#E9C369", "#2A9D8F","#2B4FA2", "#119DAC", "#672D7B", "#262366", "#BCBCBC")
ggplot2::theme_set(theme_minimal())

```

```{r}
detach(package:plyr, unload=TRUE)
library("plotrix") 
```

##Making Transition/Transversion Function
```{r}
TsTv = function(dataframe) {
  transitions = 0
  for (row in 1:nrow(dataframe)) {
  
   REF <- dataframe[row, "ref"] #requires dataframe to have column name "ref"
   ALT <- dataframe[row, "alt"] # requires dataframe to have column name "alt"

    if(REF == "A" & ALT == "G")
      transitions = transitions + 1
    if(REF == "G" & ALT == "A")
      transitions = transitions + 1
    if(REF == "C" & ALT == "T")
      transitions = transitions + 1
    if(REF == "T" & ALT == "C")
      transitions = transitions + 1
  }

  transversions = 0
  for (row in 1:nrow(dataframe)) {
  
    REF <- dataframe[row, "ref"]
    ALT <- dataframe[row, "alt"]
  
    if(REF == "A" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "A")
      transversions = transversions + 1
    if(REF == "C" & ALT == "A")
      transversions = transversions + 1
    if(REF == "A" & ALT == "C")
      transversions = transversions + 1
    if(REF == "G" & ALT == "T")
      transversions = transversions + 1
    if(REF == "T" & ALT == "G")
      transversions = transversions + 1
    if(REF == "C" & ALT == "G")
      transversions = transversions + 1
    if(REF == "G" & ALT == "C")
      transversions = transversions + 1
  }

  TsTv_df = data.frame(transitions,transversions)
  print(TsTv_df)
  TsTv_ratio = (transitions/transversions)
  print(TsTv_ratio)
  
}
```

##Reading in Variant Data and Evaluating
```{r}
AV = read.csv(af_report,header = T)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_", 
               into = c(NA,NA,"allele_freq",NA,"seq_depth","tool","parameter"))%>% 
  filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>% 
  filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * 100000)

AV$parameter = NULL
head(AV)

Dp_Variation = ggplot(AV, aes(x = log10(coverage), y = log10(dp), color = tool)) +
  geom_point() + 
  facet_wrap(~tool, scales = "free_y") +
  scale_color_manual(values = MyColors)
print(Dp_Variation)
ggsave(Dp_Variation, file = "AV_Dp_Coverage.png")
```

##Reading in Golden Data and Evaluating
```{r}
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c("label","AF"))
viral_golden$label = NULL
head(viral_golden)
dim(viral_golden)

viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
  geom_point()
print(viral_golden_pos)

TsTv(viral_golden)
```

##SetAF - Separating Data into TP, FP, FN
```{r}
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()

AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
  AV_setAF_truepos$category = c("TP")
  
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)

FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(TP_All) = c("allele_freq","coverage","tool","TP")
    
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)

AllVar[is.na(AllVar)] = 0
head(AllVar)

AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
```

##RandomAF - Separating Data into TP, FP, FN
```{r}
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()

AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
  AV_randomAF_truepos$category = c("TP")
  
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)

rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(rTP_All) = c("allele_freq","coverage","tool","TP")

rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)

rAllVar[is.na(rAllVar)] = 0

rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
```

#1. Precision_Recall Curves
```{r}
PR_AllVarSet_freq = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color = tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~allele_freq) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_freq)
ggsave("PR_AllVarSet_byAF.png",PR_AllVarSet_freq)

PR_AllVarSet_cover = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~coverage) +
  scale_color_manual(values = MyColors)
print(PR_AllVarSet_cover)
ggsave("PR_AllVarSet_byCov.png",PR_AllVarSet_cover)

PR_AllVarRandom = ggplot(rAllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
  geom_point() + 
  geom_line() +
  scale_color_manual(values = MyColors)
print(PR_AllVarRandom)
ggsave("PR_AllVarRandom.png",PR_AllVarRandom)

# Examining Data

AllVar_PR_zeros = filter(AllVar, prec == 0 & recall == 0)
AllVar_PR_high = filter(AllVar, prec > 0.9 & recall > 0.9)

AllVar_PR_highest = group_by(AllVar, tool, allele_freq, coverage) %>%
  arrange(area, decreasing = TRUE)
write.csv(AllVar_PR_highest, file = "AllVar_PR_Thresholds.csv")
```

#2.1 SetAF - Proportion of Variants Found
```{r}
AV_setAF_counts = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_setAF_counts$proportion = (AV_setAF_counts$n / nrow(viral_golden))

AV_setAF_proportion = ggplot(AV_setAF_counts,aes(x = log10(coverage), y = proportion,
                                                group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  facet_grid(~allele_freq) +
  scale_color_manual(values = MyColors)
print(AV_setAF_proportion)
ggsave("AllVarSet_Proportion.png", AV_setAF_proportion)
```

#2.2 RandomAF - Proportion of Variants Found
```{r}
AV_randomAF_counts = group_by(AV_randomAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts$proportion = (AV_randomAF_counts$n / nrow(viral_golden))

AV_randomAF_proportion = ggplot(AV_randomAF_counts, aes(x = log10(coverage), y = proportion, 
                                                        group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors)
print(AV_randomAF_proportion)
ggsave("AllVarRandom_Proportion.png", AV_randomAF_proportion)
 
## Only looking at coverage between 100-1000
AV_randomAF_truepos_f = filter(AV_randomAF_truepos, coverage > 99 & coverage < 1001)
AV_randomAF_counts_f = group_by(AV_randomAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts_f$proportion = (AV_randomAF_counts_f$n / nrow(viral_golden))
 
AV_randomAF_proportion_f = ggplot(AV_randomAF_counts_f, aes(x = log10(coverage), y = proportion, 
                                                             group = tool, color = tool)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = MyColors)
print(AV_randomAF_proportion_f)
ggsave("AllVarRandom_Proportion_Filtered.png", AV_randomAF_proportion_f)
```

#3.1 SetAF - Correlation Between Golden and Workflow
```{r}
## Only looking at true positives
AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
                                                color = tool, shape = category)) +
  geom_point() +
  scale_color_manual(values = MyColors) +
  geom_boxplot(aes(group = allele_freq)) + 
  facet_grid(tool~coverage) +
  ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)
ggsave("AllVarSet_Correlation.png",AV_Corr_setAF)

##Looking at all SetAF Data
AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow, 
                                       color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  geom_smooth(method='lm', size = 1.5) +
  ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)
ggsave("AllVarSet_Correlation_Filtered.png",AV_Corr_setAF)

AV_Corr_setAF_Zoom = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
                                            color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_setAF_Zoom)
ggsave("AllVarSet_CorrelationF_Filtered_Zoom.png", AV_Corr_setAF_Zoom)
```

#3.2 RandomAF - Correlation Between Golden and Workflow
```{r}
## Only looking at true positives
AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
                                                      color = tool, shape = category)) +
  geom_point() +
  scale_color_manual(values = MyColors) +
  facet_grid(tool~coverage) +
  ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_randomAF)
ggsave("AllVarRandom_Correlation.png",AV_Corr_randomAF)

##Looking at all RandomAF Data
AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow, 
                                             color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  geom_smooth(method='lm', size = 1.5) +
  ylim(-0.01,1) + xlim(-0.01,1)
print(AV_Corr_randomAF)
ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF)

AV_Corr_randomAF_Zoom = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
                                                  color = tool, group = tool, shape = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(~coverage) +
  geom_smooth(method='lm', size = 1.5) +
  ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_randomAF_Zoom)
ggsave("AllVarRandom_Correlation_Filtered_Zoom.png", AV_Corr_randomAF_Zoom)
```

## Stats for SetAF Data
```{r}
coeff_var <- function(x) {
  CV <- sd(x) / mean(x) * 100
  return(CV)
}

AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
colnames(AV_setAF_split) = c("tool","AF","cov","mean","sd","variance")

Mean_setAF = ggplot(AV_setAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)
ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF)

CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  facet_grid(rows = vars(tool), cols = vars(AF)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF)
```


## Set Up for Stats for RandomAF Data
```{r}
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
  summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
colnames(AV_randomAF_split) = c("tool","cov","mean","sd","variance")

Mean_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = mean, color = cov)) + 
  geom_point()
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_randomAF)
ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF)

CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = variance, color = cov)) + 
  geom_point() +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)
ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF)
```

#4. Look at how each tool compares at each AF and seq-depth
```{r}
AVSet_Corr = ggplot(AV_setAF_truepos, aes(x = tool, y = af_workflow)) + 
  geom_point() +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  facet_grid(allele_freq~coverage, scales = "free_y")
print(AVSet_Corr)
ggsave("AllVarSet_Tool_Comparison.pdf", AVSet_Corr, width = 15, height = 20)
# y axes are set per row at the moment (scales = "free_y")

AVRandom_Corr = ggplot(AV_randomAF_truepos, aes(x = tool, y = af_workflow)) + 
  geom_point() +
  geom_violin() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  facet_grid(allele_freq~coverage)
print(AVRandom_Corr)
ggsave("AllVarRandom_Tool_Comparison.pdf", AVRandom_Corr, width = 15, height = 20)
```

#5.1. Looks at where the SNPS are located throughout the genome
```{r}
# Set AF Data
AV_Pos = ggplot(AV_setAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos)
AV_Pos_falsepos = ggplot(AV_setAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos)
AV_Pos_falseneg = ggplot(AV_setAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg)

# Random AF Data
AV_Pos_r = ggplot(AV_randomAF_truepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_r)
AV_Pos_falsepos_r = ggplot(AV_randomAF_falsepos, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falsepos_r)
AV_Pos_falseneg_r = ggplot(AV_randomAF_falseneg, aes(x = pos, y = af_workflow)) +
  geom_point()
print(AV_Pos_falseneg_r)
```

#5.2.Transitions/Transversions
```{r}
setAF_TP_pos = ggplot(AV_setAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
  geom_point()
print(setAF_TP_pos)

TsTv(AV_setAF_truepos)
#counts variants from all tools/seq_depth/AF trials

randomAF_TP_pos = ggplot(AV_randomAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
  geom_point()
print(randomAF_TP_pos)

TsTv(AV_randomAF_truepos)
```
