#knitr::opts_knit$set(root.dir = "/Users/marissaknoll/Desktop/MAF/")
#run in working directory
#setwd("/Users/marissaknoll/Desktop/MAF/")
#library(available)
#available("Vivaldi", browse = FALSE)
library('plyr')
library('tidyverse')
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2 v purrr 0.3.3
v tibble 3.0.4 v dplyr 1.0.2
v tidyr 1.0.0 v stringr 1.4.0
v readr 1.3.1 v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::arrange() masks plyr::arrange()
x purrr::compact() masks plyr::compact()
x dplyr::count() masks plyr::count()
x dplyr::failwith() masks plyr::failwith()
x dplyr::filter() masks stats::filter()
x dplyr::id() masks plyr::id()
x dplyr::lag() masks stats::lag()
x dplyr::mutate() masks plyr::mutate()
x dplyr::rename() masks plyr::rename()
x dplyr::summarise() masks plyr::summarise()
x dplyr::summarize() masks plyr::summarize()
library("plotrix")
library('ggplot2')
library('ggpubr')
Attaching package: 'ggpubr'
The following object is masked from 'package:plyr':
mutate
library("MLmetrics")
Attaching package: 'MLmetrics'
The following object is masked from 'package:base':
Recall
library("RColorBrewer")
MyColors = c("#e41a1c", "#377eb8", "#4daf4a","#984ea3", "#ff6600", "#ffcc33", "#f781bf", "#a65628", "#999999", "#006600", "#993333", "#cc99ff", "#663300")
# This is basically RColorBrewer Set1 but I darkened yellow to make it easier to see then changed orange a bit to differ more from yellow
#
SEGMENTLIST = c("H1N1_PB2","H1N1_PB1","H1N1_PA","H1N1_HA","H1N1_NP","H1N1_NA","H1N1_MP","H1N1_NS")
ggplot2::theme_set(theme_minimal())
totalbases = 29903 # at some point should change this from being hardcoded to make this more flexible
readsim_cov = 100000 #hardocding this for now so my code runs
AV = read.csv(af_report,header = T)
#make the segments in a specific order
AV$chrom = factor(AV$chrom, levels = SEGMENTLIST)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_",
into = c("strain","AF","allele_freq","frac","seq_depth","tool","parameter","parameter2")) %>%
select(allele_freq,seq_depth,tool, parameter, chrom,pos,af_golden,af_workflow,ref,alt,dp) %>%
filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>%
filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
Warning: Expected 8 pieces. Missing pieces filled with `NA` in 65999 rows [1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * readsim_cov) #variable called in pipeline, corresponds to highest coverage
AV$coverage = as.character(AV$coverage)
head(AV)
allele_freq <chr> | seq_depth <dbl> | tool <chr> | parameter <chr> | chrom <fct> | pos <int> | af_golden <dbl> | af_workflow <dbl> | ref <fct> | ||
---|---|---|---|---|---|---|---|---|---|---|
1 | 0.03 | 0.005 | ivar | NA | H1N1_HA | 1644 | 0.03 | 0 | G | |
2 | 0.03 | 0.005 | ivar | NA | H1N1_MP | 862 | 0.03 | 0 | T | |
3 | 0.03 | 0.005 | ivar | NA | H1N1_MP | 875 | 0.03 | 0 | C | |
4 | 0.03 | 0.005 | ivar | NA | H1N1_NA | 1289 | 0.03 | 0 | G | |
5 | 0.03 | 0.005 | ivar | NA | H1N1_NA | 1371 | 0.03 | 0 | G | |
6 | 0.03 | 0.005 | ivar | NA | H1N1_NP | 1392 | 0.03 | 0 | C |
variant_pos = ggplot(AV, aes(x = pos, y = tool)) +
geom_point() +
facet_grid(~chrom, scales = "free_x") + theme_minimal(base_size = 6)
print(variant_pos)
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c(NA,"AF"))
viral_golden$chrom = factor(viral_golden$chrom, levels = SEGMENTLIST)
head(viral_golden)
chrom <fct> | pos <int> | ID <chr> | ref <chr> | alt <chr> | qual <chr> | filter <chr> | AF <chr> | codes <chr> | ||
---|---|---|---|---|---|---|---|---|---|---|
1 | H1N1_HA | 382 | . | T | C | . | . | 0.03 | GT:AD:DP:GQ:PL | |
2 | H1N1_HA | 439 | . | A | C | . | . | 0.03 | GT:AD:DP:GQ:PL | |
3 | H1N1_HA | 930 | . | G | A | . | . | 0.03 | GT:AD:DP:GQ:PL | |
4 | H1N1_HA | 976 | . | A | G | . | . | 0.03 | GT:AD:DP:GQ:PL | |
5 | H1N1_HA | 1055 | . | G | A | . | . | 0.03 | GT:AD:DP:GQ:PL | |
6 | H1N1_HA | 1070 | . | G | A | . | . | 0.03 | GT:AD:DP:GQ:PL |
dim(viral_golden)
[1] 111 10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
geom_point() +
facet_grid(~chrom, scales = "free_x")+ theme_minimal(base_size = 6)
print(viral_golden_pos)
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()
AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
AV_setAF_truepos$category = c("TP")
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)
FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(TP_All) = c("allele_freq","coverage","tool","TP")
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
allele_freq <chr> | coverage <chr> | tool <chr> | FP <int> | FN <int> | TP <int> | |
---|---|---|---|---|---|---|
1 | 0.01 | 100 | filtered | NA | 87 | 24 |
2 | 0.01 | 100 | freebayes-default | 85 | 81 | 30 |
3 | 0.01 | 100 | freebayes-pooled-continuous | 85 | 81 | 30 |
4 | 0.01 | 100 | ivar | NA | 111 | NA |
5 | 0.01 | 100 | lofreq | NA | 111 | NA |
6 | 0.01 | 100 | mutect2-default | NA | 106 | 5 |
AllVar[is.na(AllVar)] = 0
head(AllVar)
allele_freq <chr> | coverage <chr> | tool <chr> | FP <dbl> | FN <dbl> | TP <dbl> | |
---|---|---|---|---|---|---|
1 | 0.01 | 100 | filtered | 0 | 87 | 24 |
2 | 0.01 | 100 | freebayes-default | 85 | 81 | 30 |
3 | 0.01 | 100 | freebayes-pooled-continuous | 85 | 81 | 30 |
4 | 0.01 | 100 | ivar | 0 | 111 | 0 |
5 | 0.01 | 100 | lofreq | 0 | 111 | 0 |
6 | 0.01 | 100 | mutect2-default | 0 | 106 | 5 |
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
allele_freq <chr> | coverage <chr> | tool <chr> | FP <dbl> | FN <dbl> | TP <dbl> | prec <dbl> | recall <dbl> | ||
---|---|---|---|---|---|---|---|---|---|
1 | 0.01 | 100 | filtered | 0 | 87 | 24 | 1.0000000 | 0.21621622 | |
2 | 0.01 | 100 | freebayes-default | 85 | 81 | 30 | 0.2608696 | 0.27027027 | |
3 | 0.01 | 100 | freebayes-pooled-continuous | 85 | 81 | 30 | 0.2608696 | 0.27027027 | |
4 | 0.01 | 100 | ivar | 0 | 111 | 0 | NaN | 0.00000000 | |
5 | 0.01 | 100 | lofreq | 0 | 111 | 0 | NaN | 0.00000000 | |
6 | 0.01 | 100 | mutect2-default | 0 | 106 | 5 | 1.0000000 | 0.04504505 |
# SetAF Data
ggplot(AV_setAF, aes(x = tool, fill = category)) +
geom_bar() +
facet_grid(allele_freq~coverage) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_brewer(palette = "Dark2")
# plot as percentage of FN as well
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()
AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
AV_randomAF_truepos$category = c("TP")
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)
rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(rTP_All) = c("allele_freq","coverage","tool","TP")
rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
allele_freq <chr> | coverage <chr> | tool <chr> | FP <int> | FN <int> | TP <int> | |
---|---|---|---|---|---|---|
1 | random | 100 | filtered | 3 | 4 | 107 |
2 | random | 100 | freebayes-default | 80 | 10 | 101 |
3 | random | 100 | freebayes-pooled-continuous | 80 | 10 | 101 |
4 | random | 100 | ivar | NA | 22 | 89 |
5 | random | 100 | lofreq | NA | 13 | 98 |
6 | random | 100 | mutect2-default | NA | 56 | 55 |
rAllVar[is.na(rAllVar)] = 0
rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
allele_freq <chr> | coverage <chr> | tool <chr> | FP <dbl> | FN <dbl> | TP <dbl> | prec <dbl> | recall <dbl> | ||
---|---|---|---|---|---|---|---|---|---|
1 | random | 100 | filtered | 3 | 4 | 107 | 0.9727273 | 0.9639640 | |
2 | random | 100 | freebayes-default | 80 | 10 | 101 | 0.5580110 | 0.9099099 | |
3 | random | 100 | freebayes-pooled-continuous | 80 | 10 | 101 | 0.5580110 | 0.9099099 | |
4 | random | 100 | ivar | 0 | 22 | 89 | 1.0000000 | 0.8018018 | |
5 | random | 100 | lofreq | 0 | 13 | 98 | 1.0000000 | 0.8828829 | |
6 | random | 100 | mutect2-default | 0 | 56 | 55 | 1.0000000 | 0.4954955 |
# RandomAF Data
ggplot(AV_randomAF, aes(x = tool, fill = category)) +
geom_bar() +
facet_grid(allele_freq~coverage) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_brewer(palette = "Dark2")
# plot as percentage of FN as well
#Looking at all SetAF Data
AV_Corr_setAF <-
ggplot(AV_setAF, aes(x = af_golden,y = af_workflow, color = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(coverage~tool) +
geom_smooth(method='lm', size = 1.5) +
theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_setAF)
`geom_smooth()` using formula 'y ~ x'
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
#ggsave(AV_Corr_setAF, file = "AllVarSet_Correlation_Filtered.png",
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
##Looking at all RandomAF Data
AV_Corr_randomAF <-
ggplot(AV_randomAF, aes(x = af_golden,y = af_workflow, color = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(coverage~tool) +
geom_smooth(method='lm', size = 1.5) +
theme(plot.title = element_text(hjust = 0.5))+ theme_minimal(base_size = 6)
print(AV_Corr_randomAF)
`geom_smooth()` using formula 'y ~ x'
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
Inf
#ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
detach(package:plyr, unload=TRUE)
coeff_var <- function(x) {
CV <- sd(x) / mean(x) * 100
return(CV)
}
AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()
`summarise()` regrouping output by 'tool', 'allele_freq' (override with `.groups` argument)
Mean_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = mean, color = allele_freq)) +
geom_point() +
#facet_grid(cols = vars(allele_freq)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)
#ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#visulize the effect of downsampling.
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = coverage, y = variance, color = allele_freq)) +
geom_point() +
#facet_grid(cols = vars(allele_freq)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
Warning: Removed 3 rows containing missing values (geom_point).
#ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
#variances above 10000 are simliar
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
summarize(mean = mean(af_workflow), sd = sd(af_workflow), variance = coeff_var(af_workflow)) %>% droplevels()
`summarise()` regrouping output by 'tool' (override with `.groups` argument)
Mean_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = mean, color = tool)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_randomAF)
#ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = coverage, y = variance, color = tool)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)
#ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF,
# path = "/Users/marissaknoll/Desktop/MAF/H1N1_Images")
ggplot(AV_setAF, aes(x = log10(dp), y = af_workflow, color = tool)) +
geom_point() +
facet_grid(rows = vars(coverage), cols = vars(allele_freq)) +
xlim(0,6)
Warning: Removed 3210 rows containing missing values (geom_point).
# do this with all bamfiles, not just variants
ggplot(AV_setAF, aes(x = log10(dp), fill = tool)) +
geom_histogram()+
facet_grid(rows = vars(coverage), cols = vars(allele_freq))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 3210 rows containing non-finite values (stat_bin).