---
title: "R Notebook"
output: html_notebook
---
##This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:
1. Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
2. Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
3. Looks at correlation between SNPS in the golden vcf and worflow vcf
```{r}
#knitr::opts_knit$set(root.dir = "/Users/marissaknoll/Desktop/MAF/")
#run in working directory
#setwd("/Users/marissaknoll/Desktop/MAF/")
```
```{r}
#library(available)
#available("Vivaldi", browse = FALSE)
```
##Loading Libraries
```{r}
library('plyr')
library('tidyverse')
library("plotrix")
library('ggplot2')
library('ggpubr')
library("MLmetrics")
library("RColorBrewer")
MyColors = c("#e41a1c", "#377eb8", "#4daf4a","#984ea3", "#ff6600", "#ffcc33", "#f781bf", "#a65628", "#999999", "#006600", "#993333", "#cc99ff", "#663300", '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#000000')
# This is basically RColorBrewer Set1 but I darkened yellow to make it easier to see then changed orange a bit to differ more from yellow
#
SEGMENTLIST = c("H1N1_PB2","H1N1_PB1","H1N1_PA","H1N1_HA","H1N1_NP","H1N1_NA","H1N1_MP","H1N1_NS")
ggplot2::theme_set(theme_minimal())
#totalbases = 29903 # at some point should change this from being hardcoded to make this more flexible
totalbases = 13588 # hard coding for flu A, is this correct?
readsim_cov = 100000 #hardocding this for now so my code runs
```
##Reading in Variant Data and Evaluating
```{r}
AV = read.csv(af_report,header = T)
#make the segments in a specific order
AV$chrom = factor(AV$chrom, levels = SEGMENTLIST)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_",
into = c("strain","mv","AF","allele_freq","frac","seq_depth","tool","parameter","parameter2")) %>%
select(allele_freq,seq_depth,tool, parameter, chrom,pos,af_golden,af_workflow,ref,alt,dp) %>%
filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>%
filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * readsim_cov) #variable called in pipeline, corresponds to highest coverage
AV$coverage = as.character(AV$coverage)
head(AV)
variant_pos = ggplot(AV, aes(x = pos, y = tool)) +
geom_point() +
facet_grid(~chrom, scales = "free_x") + theme_minimal(base_size = 6)
print(variant_pos)
```
##Reading in Golden Data and Evaluating
```{r}
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c(NA,"AF"))
viral_golden$chrom = factor(viral_golden$chrom, levels = SEGMENTLIST)
head(viral_golden)
dim(viral_golden)
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
geom_point() +
facet_grid(~chrom, scales = "free_x")+ theme_minimal(base_size = 6)
print(viral_golden_pos)
```
##SetAF - Separating Data into TP, FP, FN
```{r}
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()
AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
AV_setAF_truepos$category = c("TP")
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)
FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(TP_All) = c("allele_freq","coverage","tool","TP")
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
AllVar[is.na(AllVar)] = 0
head(AllVar)
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
```
```{r}
# SetAF Data
ggplot(AV_setAF, aes(x = tool, fill = category)) +
geom_bar() +
facet_grid(allele_freq~coverage) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_brewer(palette = "Dark2")
# plot as percentage of FN as well
```
##RandomAF - Separating Data into TP, FP, FN
```{r}
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()
AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
AV_randomAF_truepos$category = c("TP")
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)
rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(rTP_All) = c("allele_freq","coverage","tool","TP")
rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
rAllVar[is.na(rAllVar)] = 0
rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
```
```{r}
# RandomAF Data
ggplot(AV_randomAF, aes(x = tool, fill = category)) +
geom_bar() +
facet_grid(allele_freq~coverage) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_brewer(palette = "Dark2")
# plot as percentage of FN as well
```
##SetAF - Correlation Between Golden and Workflow
```{r}
#Looking at all SetAF Data
AV_Corr_setAF <-
ggplot(AV_setAF, aes(x = af_golden,y = af_workflow, color = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(coverage~tool) +
geom_smooth(method='lm', size = 1.5) +
theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_setAF)
#ggsave(AV_Corr_setAF, file = "AllVarSet_Correlation_Filtered.png",
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
```
##RandomAF - Correlation Between Golden and Workflow
```{r}
##Looking at all RandomAF Data
AV_Corr_randomAF <-
ggplot(AV_randomAF, aes(x = af_golden,y = af_workflow, color = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(coverage~tool) +
geom_smooth(method='lm', size = 1.5) +
theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
```
## Stats for SetAF Data
```{r}
detach(package:plyr, unload=TRUE)
coeff_var <- function(x) {
CV <- sd(x) / mean(x) * 100
return(CV)
}
AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()
Mean_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = mean, color = allele_freq)) +
geom_point() +
#facet_grid(cols = vars(allele_freq)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)
#ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#visulize the effect of downsampling.
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = variance, color = allele_freq)) +
geom_point() +
#facet_grid(cols = vars(allele_freq)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
#ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#variances above 10000 are simliar
```
## Stats for RandomAF Data
```{r}
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()
Mean_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = mean, color = tool)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_randomAF)
#ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = variance, color = tool)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)
#ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
```
```{r}
ggplot(AV_setAF, aes(x = log10(dp), y = af_workflow, color = tool)) +
geom_point() +
facet_grid(rows = vars(coverage), cols = vars(allele_freq)) +
xlim(0,6)
# do this with all bamfiles, not just variants
ggplot(AV_setAF, aes(x = log10(dp), fill = tool)) +
geom_histogram()+
facet_grid(rows = vars(coverage), cols = vars(allele_freq))
```