This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:
- Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
- Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
- Looks at correlation between SNPS in the golden vcf and worflow vcf
#knitr::opts_knit$set(root.dir = "/Users/marissaknoll/Desktop/MAF/")
#run in working directory
#setwd("/Users/marissaknoll/Desktop/MAF/")
#library(available)
#available("Vivaldi", browse = FALSE)
Loading Libraries
library('plyr')
library('tidyverse')
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2 v purrr 0.3.3
v tibble 3.0.4 v dplyr 1.0.2
v tidyr 1.0.0 v stringr 1.4.0
v readr 1.3.1 v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::arrange() masks plyr::arrange()
x purrr::compact() masks plyr::compact()
x dplyr::count() masks plyr::count()
x dplyr::failwith() masks plyr::failwith()
x dplyr::filter() masks stats::filter()
x dplyr::id() masks plyr::id()
x dplyr::lag() masks stats::lag()
x dplyr::mutate() masks plyr::mutate()
x dplyr::rename() masks plyr::rename()
x dplyr::summarise() masks plyr::summarise()
x dplyr::summarize() masks plyr::summarize()
library("plotrix")
library('ggplot2')
library('ggpubr')
Attaching package: 'ggpubr'
The following object is masked from 'package:plyr':
mutate
library("MLmetrics")
Attaching package: 'MLmetrics'
The following object is masked from 'package:base':
Recall
library("RColorBrewer")
MyColors = c("#e41a1c", "#377eb8", "#4daf4a","#984ea3", "#ff6600", "#ffcc33", "#f781bf", "#a65628", "#999999", "#006600", "#993333", "#cc99ff", "#663300")
# This is basically RColorBrewer Set1 but I darkened yellow to make it easier to see then changed orange a bit to differ more from yellow
#
SEGMENTLIST = c("H1N1_PB2","H1N1_PB1","H1N1_PA","H1N1_HA","H1N1_NP","H1N1_NA","H1N1_MP","H1N1_NS")
ggplot2::theme_set(theme_minimal())
totalbases = 29903 # at some point should change this from being hardcoded to make this more flexible
readsim_cov = 100000 #hardocding this for now so my code runs
Reading in Variant Data and Evaluating
AV = read.csv(af_report,header = T)
#make the segments in a specific order
AV$chrom = factor(AV$chrom, levels = SEGMENTLIST)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_",
into = c("strain","AF","allele_freq","frac","seq_depth","tool","parameter","parameter2")) %>%
select(allele_freq,seq_depth,tool, parameter, chrom,pos,af_golden,af_workflow,ref,alt,dp) %>%
filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>%
filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
Warning: Expected 8 pieces. Missing pieces filled with `NA` in 128704 rows [1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * readsim_cov) #variable called in pipeline, corresponds to highest coverage
AV$coverage = as.character(AV$coverage)
head(AV)
variant_pos = ggplot(AV, aes(x = pos, y = tool)) +
geom_point() +
facet_grid(~chrom, scales = "free_x") + theme_minimal(base_size = 6)
print(variant_pos)

Reading in Golden Data and Evaluating
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c(NA,"AF"))
viral_golden$chrom = factor(viral_golden$chrom, levels = SEGMENTLIST)
head(viral_golden)
dim(viral_golden)
[1] 143 10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
geom_point() +
facet_grid(~chrom, scales = "free_x")+ theme_minimal(base_size = 6)
print(viral_golden_pos)

SetAF - Separating Data into TP, FP, FN
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()
AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
AV_setAF_truepos$category = c("TP")
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)
FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(TP_All) = c("allele_freq","coverage","tool","TP")
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
AllVar[is.na(AllVar)] = 0
head(AllVar)
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
# SetAF Data
ggplot(AV_setAF, aes(x = tool, fill = category)) +
geom_bar() +
facet_grid(allele_freq~coverage) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_brewer(palette = "Dark2")

# plot as percentage of FN as well
RandomAF - Separating Data into TP, FP, FN
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()
AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
AV_randomAF_truepos$category = c("TP")
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)
rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(rTP_All) = c("allele_freq","coverage","tool","TP")
rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
rAllVar[is.na(rAllVar)] = 0
rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
# RandomAF Data
ggplot(AV_randomAF, aes(x = tool, fill = category)) +
geom_bar() +
facet_grid(allele_freq~coverage) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_brewer(palette = "Dark2")

# plot as percentage of FN as well
SetAF - Correlation Between Golden and Workflow
#Looking at all SetAF Data
AV_Corr_setAF <-
ggplot(AV_setAF, aes(x = af_golden,y = af_workflow, color = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(coverage~tool) +
geom_smooth(method='lm', size = 1.5) +
theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_setAF)
`geom_smooth()` using formula 'y ~ x'

#ggsave(AV_Corr_setAF, file = "AllVarSet_Correlation_Filtered.png",
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
RandomAF - Correlation Between Golden and Workflow
##Looking at all RandomAF Data
AV_Corr_randomAF <-
ggplot(AV_randomAF, aes(x = af_golden,y = af_workflow, color = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(coverage~tool) +
geom_smooth(method='lm', size = 1.5) +
theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_randomAF)
`geom_smooth()` using formula 'y ~ x'
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf

#ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
Stats for SetAF Data
detach(package:plyr, unload=TRUE)
coeff_var <- function(x) {
CV <- sd(x) / mean(x) * 100
return(CV)
}
AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()
`summarise()` regrouping output by 'tool', 'allele_freq' (override with `.groups` argument)
Mean_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = mean, color = allele_freq)) +
geom_point() +
#facet_grid(cols = vars(allele_freq)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)

#ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#visulize the effect of downsampling.
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = variance, color = allele_freq)) +
geom_point() +
#facet_grid(cols = vars(allele_freq)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
Warning: Removed 24 rows containing missing values (geom_point).

#ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#variances above 10000 are simliar
Stats for RandomAF Data
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()
`summarise()` regrouping output by 'tool' (override with `.groups` argument)
Mean_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = mean, color = tool)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_randomAF)

#ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = variance, color = tool)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)
Warning: Removed 4 rows containing missing values (geom_point).

#ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
ggplot(AV_setAF, aes(x = log10(dp), y = af_workflow, color = tool)) +
geom_point() +
facet_grid(rows = vars(coverage), cols = vars(allele_freq)) +
xlim(0,6)
Warning: Removed 6147 rows containing missing values (geom_point).

# do this with all bamfiles, not just variants
ggplot(AV_setAF, aes(x = log10(dp), fill = tool)) +
geom_histogram()+
facet_grid(rows = vars(coverage), cols = vars(allele_freq))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 6147 rows containing non-finite values (stat_bin).

---
title: "R Notebook"
output: html_notebook
---

##This notebook takes the output for the MAF Pipeline (mycsvfile.csv) and does:
1. Calculates true pos, false pos, and false neg and makes a precision recall curve for all tools
2. Finds total numbers of variants found by tools at different allele frequencies and sequencing depths
3. Looks at correlation between SNPS in the golden vcf and worflow vcf

```{r}
#knitr::opts_knit$set(root.dir = "/Users/marissaknoll/Desktop/MAF/")
#run in working directory
#setwd("/Users/marissaknoll/Desktop/MAF/") 
```

```{r}
#library(available)
#available("Vivaldi", browse = FALSE)
```

##Loading Libraries
```{r}
library('plyr')
library('tidyverse')
library("plotrix") 
library('ggplot2')
library('ggpubr')
library("MLmetrics")
library("RColorBrewer")
        
MyColors = c("#e41a1c", "#377eb8", "#4daf4a","#984ea3", "#ff6600", "#ffcc33", "#f781bf", "#a65628", "#999999", "#006600", "#993333", "#cc99ff", "#663300")
# This is basically RColorBrewer Set1 but I darkened yellow to make it easier to see then changed orange a bit to differ more from yellow
#
SEGMENTLIST = c("H1N1_PB2","H1N1_PB1","H1N1_PA","H1N1_HA","H1N1_NP","H1N1_NA","H1N1_MP","H1N1_NS")
ggplot2::theme_set(theme_minimal())

totalbases = 29903 # at some point should change this from being hardcoded to make this more flexible
readsim_cov = 100000 #hardocding this for now so my code runs
```

##Reading in Variant Data and Evaluating
```{r}
AV = read.csv(af_report,header = T)
#make the segments in a specific order
AV$chrom = factor(AV$chrom, levels = SEGMENTLIST)

AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_", 
               into = c("strain","AF","allele_freq","frac","seq_depth","tool","parameter","parameter2")) %>%
  select(allele_freq,seq_depth,tool, parameter, chrom,pos,af_golden,af_workflow,ref,alt,dp) %>%
  filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>% 
  filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * readsim_cov) #variable called in pipeline, corresponds to highest coverage
AV$coverage = as.character(AV$coverage)
head(AV)

variant_pos = ggplot(AV, aes(x = pos, y = tool)) +
  geom_point() +
  facet_grid(~chrom, scales = "free_x") + theme_minimal(base_size = 6)
print(variant_pos)

```

##Reading in Golden Data and Evaluating
```{r}
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c(NA,"AF"))
viral_golden$chrom = factor(viral_golden$chrom, levels = SEGMENTLIST)
head(viral_golden)
dim(viral_golden)


viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
  geom_point() +
  facet_grid(~chrom, scales = "free_x")+ theme_minimal(base_size = 6)
print(viral_golden_pos)
```

##SetAF - Separating Data into TP, FP, FN
```{r}
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()

AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
  AV_setAF_truepos$category = c("TP")
  
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)

FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(TP_All) = c("allele_freq","coverage","tool","TP")
    
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)

AllVar[is.na(AllVar)] = 0
head(AllVar)

AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
```
```{r}
# SetAF Data
ggplot(AV_setAF, aes(x = tool, fill = category)) +
  geom_bar() +
  facet_grid(allele_freq~coverage) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_fill_brewer(palette = "Dark2")
# plot as percentage of FN as well
```

##RandomAF - Separating Data into TP, FP, FN
```{r}
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()

AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
  AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
  AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
  AV_randomAF_truepos$category = c("TP")
  
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)

rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
  colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
  colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
  colnames(rTP_All) = c("allele_freq","coverage","tool","TP")

rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)

rAllVar[is.na(rAllVar)] = 0

rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
```
```{r}
# RandomAF Data
ggplot(AV_randomAF, aes(x = tool, fill = category)) +
  geom_bar() +
  facet_grid(allele_freq~coverage) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_fill_brewer(palette = "Dark2")
# plot as percentage of FN as well
```

##SetAF - Correlation Between Golden and Workflow
```{r}
#Looking at all SetAF Data
AV_Corr_setAF <- 
  ggplot(AV_setAF, aes(x = af_golden,y = af_workflow, color = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(coverage~tool) + 
  geom_smooth(method='lm', size = 1.5) +
  theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_setAF)
#ggsave(AV_Corr_setAF, file = "AllVarSet_Correlation_Filtered.png",
#       path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")

```

##RandomAF - Correlation Between Golden and Workflow
```{r}
##Looking at all RandomAF Data
  AV_Corr_randomAF <- 
  ggplot(AV_randomAF, aes(x = af_golden,y = af_workflow, color = category)) +
  geom_point(size = 3, alpha = 0.4) +
  scale_color_manual(values = MyColors) +
  facet_grid(coverage~tool) + 
  geom_smooth(method='lm', size = 1.5) +
  theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_randomAF)
#ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF,
#       path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
```



## Stats for SetAF Data
```{r}
detach(package:plyr, unload=TRUE)
coeff_var <- function(x) {
  CV <- sd(x) / mean(x) * 100
  return(CV)
}

AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
  summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()
Mean_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = mean, color = allele_freq)) + 
  geom_point() +
  #facet_grid(cols = vars(allele_freq)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)
#ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF,
#       path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#visulize the effect of downsampling. 
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = variance, color = allele_freq)) + 
  geom_point() +
  #facet_grid(cols = vars(allele_freq)) +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
#ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF,
#       path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#variances above 10000 are simliar

```
## Stats for RandomAF Data
```{r}
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
  summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()

Mean_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = mean, color = tool)) + 
  geom_point() +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_randomAF)
#ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF,
#       path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")

CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = variance, color = tool)) + 
  geom_point() +
  theme(text = element_text(size = 15),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)
#ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF,
#       path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
```

```{r}
ggplot(AV_setAF, aes(x = log10(dp), y = af_workflow, color = tool)) +
  geom_point() +
  facet_grid(rows = vars(coverage), cols = vars(allele_freq)) +
  xlim(0,6)

# do this with all bamfiles, not just variants
ggplot(AV_setAF, aes(x = log10(dp), fill = tool)) +
  geom_histogram()+
  facet_grid(rows = vars(coverage), cols = vars(allele_freq))

```
