print(TEST)
## [1] "hello"
#knitr::opts_knit$set(root.dir = "C:\\Users\\mk5636\\Desktop\\mad-r\\")
af_report = "/scratch/mk5636/soggy_wright_af_data.csv"
golden_vcf = "/scratch/mk5636/MAD-1_AF_random.vcf"
library('ggplot2')
library('plyr')
library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.3
## ✔ tidyr 1.0.0 ✔ dplyr 0.8.3
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ tibble 2.1.3 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library('ggpubr')
## Error in library("ggpubr"): there is no package called 'ggpubr'
library("MLmetrics")
## Error in library("MLmetrics"): there is no package called 'MLmetrics'
MyColors = c("#E76F51", "#E9C369", "#2A9D8F","#2B4FA2", "#119DAC", "#672D7B", "#262366", "#BCBCBC")
ggplot2::theme_set(theme_minimal())
totalbases = 29903
detach(package:plyr, unload=TRUE)
library("plotrix")
## Error in library("plotrix"): there is no package called 'plotrix'
TsTv = function(dataframe) {
transitions = 0
for (row in 1:nrow(dataframe)) {
REF <- dataframe[row, "ref"] #requires dataframe to have column name "ref"
ALT <- dataframe[row, "alt"] # requires dataframe to have column name "alt"
if(REF == "A" & ALT == "G")
transitions = transitions + 1
if(REF == "G" & ALT == "A")
transitions = transitions + 1
if(REF == "C" & ALT == "T")
transitions = transitions + 1
if(REF == "T" & ALT == "C")
transitions = transitions + 1
}
transversions = 0
for (row in 1:nrow(dataframe)) {
REF <- dataframe[row, "ref"]
ALT <- dataframe[row, "alt"]
if(REF == "A" & ALT == "T")
transversions = transversions + 1
if(REF == "T" & ALT == "A")
transversions = transversions + 1
if(REF == "C" & ALT == "A")
transversions = transversions + 1
if(REF == "A" & ALT == "C")
transversions = transversions + 1
if(REF == "G" & ALT == "T")
transversions = transversions + 1
if(REF == "T" & ALT == "G")
transversions = transversions + 1
if(REF == "C" & ALT == "G")
transversions = transversions + 1
if(REF == "G" & ALT == "C")
transversions = transversions + 1
}
TsTv_df = data.frame(transitions,transversions)
print(TsTv_df)
TsTv_ratio = (transitions/transversions)
print(TsTv_ratio)
}
AV = read.csv(af_report,header = T)
AV$sample_id = as.character(AV$sample_id)
AV$dp = as.numeric(AV$dp)
AV = separate(AV, sample_id, sep = "_",
into = c("pipe","AF","allele_freq","frac","seq_depth","tool","parameter")) %>%
select(-c(pipe, AF, frac)) %>%
filter(ref == "A" | ref == "C" | ref == "T" | ref == "G") %>%
filter(alt == "A" | alt == "C" | alt == "T" | alt == "G") %>% droplevels()
## Warning: Expected 7 pieces. Additional pieces discarded in 13052 rows [1929,
## 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
## 1943, 1944, 1945, 1946, 1947, 1948, ...].
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 74054 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
AV$seq_depth = as.numeric(AV$seq_depth)
AV$coverage = (AV$seq_depth * 100000)
AV_ivar = filter(AV, tool == "ivar") %>% droplevels()
AV_lofreq = filter(AV, tool == "lofreq") %>% droplevels()
AV_varscan_unfiltered = filter(AV, tool == "varscan" & parameter == "unfiltered") %>% droplevels()
#AV_varscan_unfiltered$tool = paste0(AV_varscan_unfiltered$tool,"_",AV_varscan_unfiltered$parameter)
AV_varscan_filtered = filter(AV, tool == "varscan"& parameter == "filtered") %>% droplevels()
#AV_varscan_filtered$tool = paste0(AV_varscan_filtered$tool,"_",AV_varscan_filtered$parameter)
AV_tims = filter(AV, tool == "tims") %>% droplevels()
AV_freebayes = filter(AV, tool == "freebayes") %>% droplevels()
AV_mutect = filter(AV, tool == "mutect2") %>% droplevels()
AV_haplotypecaller = filter(AV, tool == "filtered") %>% droplevels()
AV_haplotypecaller$tool = "haplotypecaller"
AV = rbind(AV_ivar,AV_varscan_filtered,AV_tims,
AV_freebayes,AV_mutect,AV_haplotypecaller,AV_lofreq)
AV$parameter = NULL
head(AV)
## allele_freq seq_depth tool pos af_golden af_workflow ref alt dp coverage
## 1 0.03 1e-05 ivar 351 0.03 0 G A 1 1
## 2 0.03 1e-05 ivar 581 0.03 0 G A 1 1
## 3 0.03 1e-05 ivar 1002 0.03 0 A G 1 1
## 4 0.03 1e-05 ivar 1170 0.03 0 C T 1 1
## 5 0.03 1e-05 ivar 1180 0.03 0 A T 1 1
## 6 0.03 1e-05 ivar 1208 0.03 0 A G 1 1
Dp_Variation = ggplot(AV, aes(x = log10(coverage), y = log10(dp), color = tool)) +
geom_point() +
facet_wrap(~tool, scales = "free_y") +
geom_abline() +
scale_color_manual(values = MyColors)
## Warning: namespace 'plyr' is not available and has been replaced
## by .GlobalEnv when processing object ''
print(Dp_Variation)
## Warning: Removed 7991 rows containing missing values (geom_point).
plot of chunk unnamed-chunk-5
ggsave(Dp_Variation, file = "AV_Dp_Coverage.png")
## Saving 7 x 7 in image
## Warning: Removed 7991 rows containing missing values (geom_point).
viral_golden = read.table(golden_vcf, stringsAsFactors = F)
colnames(viral_golden) = c("chrom","pos","ID","ref","alt","qual","filter","info","codes","genotype")
viral_golden = separate(viral_golden, info, sep = "=", into = c("label","AF"))
viral_golden$label = NULL
head(viral_golden)
## chrom pos ID ref alt qual filter AF codes
## 1 SARS-CoV2 351 . G A . . 0.04 GT:AD:DP:GQ:PL
## 2 SARS-CoV2 581 . G A . . 0.4 GT:AD:DP:GQ:PL
## 3 SARS-CoV2 1002 . A G . . 0.46 GT:AD:DP:GQ:PL
## 4 SARS-CoV2 1170 . C T . . 0.18 GT:AD:DP:GQ:PL
## 5 SARS-CoV2 1180 . A T . . 0.73 GT:AD:DP:GQ:PL
## 6 SARS-CoV2 1208 . A G . . 0.18 GT:AD:DP:GQ:PL
## genotype
## 1 1/1/1/1:96,4:100:1:1
## 2 1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:60,40:100:1:1
## 3 1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:54,46:100:1:1
## 4 1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:82,18:100:1:1
## 5 1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:27,73:100:1:1
## 6 1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1:82,18:100:1:1
dim(viral_golden)
## [1] 161 10
viral_golden_pos = ggplot(viral_golden, aes(x = pos, y = ref, color = alt)) +
geom_point()
print(viral_golden_pos)
plot of chunk unnamed-chunk-6
TsTv(viral_golden)
## transitions transversions
## 1 114 47
## [1] 2.425532
AV_setAF = filter(AV, allele_freq != "random") %>% droplevels()
AV_setAF_falsepos = filter(AV_setAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_setAF_falsepos$category = c("FP")
AV_setAF_falseneg = filter(AV_setAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_setAF_falseneg$category = c("FN")
AV_setAF_truepos = filter(AV_setAF, af_golden != 0 & af_workflow != 0)
AV_setAF_truepos$category = c("TP")
AV_setAF = rbind(AV_setAF_truepos, AV_setAF_falseneg, AV_setAF_falsepos)
FP_All = group_by(AV_setAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(FP_All) = c("allele_freq","coverage","tool","FP")
FN_All = group_by(AV_setAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(FN_All) = c("allele_freq","coverage","tool","FN")
TP_All = group_by(AV_setAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(TP_All) = c("allele_freq","coverage","tool","TP")
AllVar = merge(FP_All,FN_All, by=c("allele_freq","coverage","tool"),all=T)
AllVar = merge(AllVar,TP_All, by=c("allele_freq","coverage","tool"), all=T)
head(AllVar)
## allele_freq coverage tool FP FN TP
## 1 0.01 1 freebayes NA 161 NA
## 2 0.01 1 haplotypecaller NA 161 NA
## 3 0.01 1 ivar NA 161 NA
## 4 0.01 1 lofreq NA 161 NA
## 5 0.01 1 mutect2 1 161 NA
## 6 0.01 1 tims 2 161 NA
AllVar[is.na(AllVar)] = 0
head(AllVar)
## allele_freq coverage tool FP FN TP
## 1 0.01 1 freebayes 0 161 0
## 2 0.01 1 haplotypecaller 0 161 0
## 3 0.01 1 ivar 0 161 0
## 4 0.01 1 lofreq 0 161 0
## 5 0.01 1 mutect2 1 161 0
## 6 0.01 1 tims 2 161 0
AllVar$prec = (AllVar$TP)/(AllVar$TP + AllVar$FP)
AllVar$recall = (AllVar$TP)/(AllVar$TP + AllVar$FN)
AllVar$area = AllVar$prec * AllVar$recall
head(AllVar)
## allele_freq coverage tool FP FN TP prec recall area
## 1 0.01 1 freebayes 0 161 0 NaN 0 NaN
## 2 0.01 1 haplotypecaller 0 161 0 NaN 0 NaN
## 3 0.01 1 ivar 0 161 0 NaN 0 NaN
## 4 0.01 1 lofreq 0 161 0 NaN 0 NaN
## 5 0.01 1 mutect2 1 161 0 0 0 0
## 6 0.01 1 tims 2 161 0 0 0 0
AV_randomAF = filter(AV, allele_freq == "random") %>% droplevels()
AV_randomAF_falsepos = filter(AV_randomAF, af_golden == 0 & af_workflow != 0) %>% droplevels()
AV_randomAF_falsepos$category = c("FP")
AV_randomAF_falseneg = filter(AV_randomAF, af_workflow == 0 & af_golden != 0) %>% droplevels()
AV_randomAF_falseneg$category = c("FN")
AV_randomAF_truepos = filter(AV_randomAF, af_golden != 0 & af_workflow != 0)
AV_randomAF_truepos$category = c("TP")
AV_randomAF = rbind(AV_randomAF_truepos, AV_randomAF_falseneg, AV_randomAF_falsepos)
rFP_All = group_by(AV_randomAF_falsepos, allele_freq,coverage,tool) %>% tally()
colnames(rFP_All) = c("allele_freq","coverage","tool","FP")
rFN_All = group_by(AV_randomAF_falseneg, allele_freq,coverage,tool) %>% tally()
colnames(rFN_All) = c("allele_freq","coverage","tool","FN")
rTP_All = group_by(AV_randomAF_truepos, allele_freq,coverage,tool) %>% tally()
colnames(rTP_All) = c("allele_freq","coverage","tool","TP")
rAllVar = merge(rFP_All,rFN_All, by=c("allele_freq","coverage","tool"),all=T)
rAllVar = merge(rAllVar, rTP_All, by=c("allele_freq","coverage","tool"), all=T)
head(rAllVar)
## allele_freq coverage tool FP FN TP
## 1 random 1 freebayes NA 154 7
## 2 random 1 haplotypecaller NA 154 7
## 3 random 1 ivar NA 161 NA
## 4 random 1 lofreq NA 161 NA
## 5 random 1 mutect2 NA 157 4
## 6 random 1 tims 4 136 25
rAllVar[is.na(rAllVar)] = 0
rAllVar$prec = (rAllVar$TP)/(rAllVar$TP + rAllVar$FP)
rAllVar$recall = (rAllVar$TP)/(rAllVar$TP + rAllVar$FN)
rAllVar$area = rAllVar$prec * rAllVar$recall
head(rAllVar)
## allele_freq coverage tool FP FN TP prec recall area
## 1 random 1 freebayes 0 154 7 1.000000 0.04347826 0.04347826
## 2 random 1 haplotypecaller 0 154 7 1.000000 0.04347826 0.04347826
## 3 random 1 ivar 0 161 0 NaN 0.00000000 NaN
## 4 random 1 lofreq 0 161 0 NaN 0.00000000 NaN
## 5 random 1 mutect2 0 157 4 1.000000 0.02484472 0.02484472
## 6 random 1 tims 4 136 25 0.862069 0.15527950 0.13386164
PR_AllVarSet_freq = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color = tool)) +
geom_point() +
geom_line() +
facet_wrap(~allele_freq) +
scale_color_manual(values = MyColors)
print(PR_AllVarSet_freq)
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_path).
plot of chunk unnamed-chunk-9
ggsave("PR_AllVarSet_byAF.png",PR_AllVarSet_freq)
## Saving 7 x 7 in image
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_path).
PR_AllVarSet_cover = ggplot(AllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
geom_point() +
geom_line() +
facet_wrap(~coverage) +
scale_color_manual(values = MyColors)
print(PR_AllVarSet_cover)
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing missing values (geom_path).
plot of chunk unnamed-chunk-9
ggsave("PR_AllVarSet_byCov.png",PR_AllVarSet_cover)
## Saving 7 x 7 in image
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing missing values (geom_path).
PR_AllVarRandom = ggplot(rAllVar,aes(x=recall, y=prec, group = tool, color=tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors)
print(PR_AllVarRandom)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_path).
plot of chunk unnamed-chunk-9
ggsave("PR_AllVarRandom.png",PR_AllVarRandom)
## Saving 7 x 7 in image
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_path).
# Examining Data
AllVar_PR_zeros = filter(AllVar, prec == 0 & recall == 0)
AllVar_PR_high = filter(AllVar, prec > 0.9 & recall > 0.9)
AllVar_PR_highest = group_by(AllVar, tool, allele_freq, coverage) %>%
arrange(area, decreasing = TRUE)
## Error: incorrect size (1) at position 2, expecting : 432
write.csv(AllVar_PR_highest, file = "AllVar_PR_Thresholds.csv")
## Error in is.data.frame(x): object 'AllVar_PR_highest' not found
AV_setAF_counts = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_setAF_counts$proportion = (AV_setAF_counts$n / nrow(viral_golden))
AV_setAF_proportion = ggplot(AV_setAF_counts,aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
facet_grid(~allele_freq) +
scale_color_manual(values = MyColors) #+
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 30),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40))
print(AV_setAF_proportion)
## Registered S3 method overwritten by 'plyr':
## method from
## [.quoted R_GlobalEnv
plot of chunk unnamed-chunk-10
ggsave("AllVarSet_Proportion.png", AV_setAF_proportion)
## Saving 7 x 7 in image
## Only looking at coverage between 100-1000
AV_setAF_truepos_f = filter(AV_setAF_truepos, coverage > 99 & coverage < 1001)
AV_setAF_counts_f = group_by(AV_setAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_setAF_counts_f$proportion = (AV_setAF_counts_f$n / nrow(viral_golden))
AV_setAF_proportion_f = ggplot(AV_setAF_counts_f,aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
facet_grid(~allele_freq) +
scale_color_manual(values = MyColors) +
#scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000),
#labels = c("100","200","300","400","500","600","700","800","900","1000")) +
theme(axis.text.x = element_text(angle = 90))
#axis.text = element_text(size = 8),
#axis.title = element_text(size = 30),
#strip.text = element_text(size = 15),
print(AV_setAF_proportion_f)
plot of chunk unnamed-chunk-10
ggsave("AllVarSet_Proportion_Filtered.png", AV_setAF_proportion_f)
## Saving 7 x 7 in image
AV_randomAF_counts = group_by(AV_randomAF_truepos, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts$proportion = (AV_randomAF_counts$n / nrow(viral_golden))
AV_randomAF_proportion = ggplot(AV_randomAF_counts, aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors) #+
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 30),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion)
plot of chunk unnamed-chunk-11
ggsave("AllVarRandom_Proportion.png", AV_randomAF_proportion)
## Saving 7 x 7 in image
## Only looking at coverage between 100-1000
AV_randomAF_truepos_f = filter(AV_randomAF_truepos, coverage > 99 & coverage < 1001)
AV_randomAF_counts_f = group_by(AV_randomAF_truepos_f, tool, allele_freq, coverage) %>% tally()
AV_randomAF_counts_f$proportion = (AV_randomAF_counts_f$n / nrow(viral_golden))
AV_randomAF_proportion_f = ggplot(AV_randomAF_counts_f, aes(x = log10(coverage), y = proportion,
group = tool, color = tool)) +
geom_point() +
geom_line() +
scale_color_manual(values = MyColors) #+
#scale_x_continuous(breaks = c(100,200,300,400,500,600,700,800,900,1000),
#labels = c("100","200","300","400","500","600","700","800","900","1000")) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 30),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40))
print(AV_randomAF_proportion_f)
plot of chunk unnamed-chunk-11
ggsave("AllVarRandom_Proportion_Filtered.png", AV_randomAF_proportion_f)
## Saving 7 x 7 in image
## Only looking at true positives
AV_Corr_setAF = ggplot(AV_setAF, aes(x = af_golden, y = af_workflow,
color = tool, shape = category)) +
geom_point() +
scale_color_manual(values = MyColors) +
geom_boxplot(aes(group = allele_freq)) +
facet_grid(tool~coverage) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 20),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40)) +
ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)
plot of chunk unnamed-chunk-12
ggsave("AllVarSet_Correlation.png",AV_Corr_setAF)
## Saving 7 x 7 in image
##Looking at all SetAF Data
AV_setAF_f = filter(AV_setAF, coverage > 99 & coverage < 1001)
AV_Corr_setAF = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
geom_smooth(method='lm', size = 1.5) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 20),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40)) +
ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_setAF)
## Warning: Removed 8 rows containing missing values (geom_smooth).
plot of chunk unnamed-chunk-12
ggsave("AllVarSet_Correlation_Filtered.png",AV_Corr_setAF)
## Saving 7 x 7 in image
## Warning: Removed 8 rows containing missing values (geom_smooth).
AV_Corr_setAF_Zoom = ggplot(AV_setAF_f, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
geom_smooth(method='lm', size = 1.5) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 20),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40)) +
ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_setAF_Zoom)
## Warning: Removed 14645 rows containing non-finite values (stat_smooth).
## Warning: Removed 14645 rows containing missing values (geom_point).
plot of chunk unnamed-chunk-12
ggsave("AllVarSet_CorrelationF_Filtered_Zoom.png", AV_Corr_setAF_Zoom)
## Saving 7 x 7 in image
## Warning: Removed 14645 rows containing non-finite values (stat_smooth).
## Warning: Removed 14645 rows containing missing values (geom_point).
## Only looking at true positives
AV_Corr_randomAF = ggplot(AV_randomAF, aes(x = af_golden, y = af_workflow,
color = tool, shape = category)) +
geom_point() +
scale_color_manual(values = MyColors) +
facet_grid(tool~coverage) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 20),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40)) +
ylim(-0.001,1.0) + xlim(-0.001,1.0)
print(AV_Corr_randomAF)
plot of chunk unnamed-chunk-13
ggsave("AllVarRandom_Correlation.png",AV_Corr_randomAF)
## Saving 7 x 7 in image
##Looking at all RandomAF Data
AV_randomAF_f = filter(AV_randomAF, coverage > 99 & coverage < 1001)
AV_Corr_randomAF = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
geom_smooth(method='lm', size = 1.5) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 20),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40)) +
ylim(-0.01,1) + xlim(-0.01,1)
print(AV_Corr_randomAF)
plot of chunk unnamed-chunk-13
ggsave("AllVarRandom_Correlation_Filtered.png",AV_Corr_randomAF)
## Saving 7 x 7 in image
AV_Corr_randomAF_Zoom = ggplot(AV_randomAF_f, aes(x = af_golden, y = af_workflow,
color = tool, group = tool, shape = category)) +
geom_point(size = 3, alpha = 0.4) +
scale_color_manual(values = MyColors) +
facet_grid(~coverage) +
geom_smooth(method='lm', size = 1.5) +
#theme(axis.text = element_text(size = 8),
#axis.title = element_text(size = 20),
#strip.text = element_text(size = 15),
#axis.text.x = element_text(angle = 40)) +
ylim(-0.01,0.15) + xlim(-0.01,0.15)
print(AV_Corr_randomAF_Zoom)
## Warning: Removed 3817 rows containing non-finite values (stat_smooth).
## Warning: Removed 3817 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_smooth).
plot of chunk unnamed-chunk-13
ggsave("AllVarRandom_Correlation_Filtered_Zoom.png", AV_Corr_randomAF_Zoom)
## Saving 7 x 7 in image
## Warning: Removed 3817 rows containing non-finite values (stat_smooth).
## Warning: Removed 3817 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_smooth).
coeff_var <- function(x) {
CV <- sd(x) / mean(x) * 100
return(CV)
}
AV_setAF_split = group_by(AV_setAF_truepos, tool, allele_freq, coverage) %>%
summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
colnames(AV_setAF_split) = c("tool","AF","cov","mean","sd","variance")
Mean_setAF = ggplot(AV_setAF_split, aes(x = tool, y = mean, color = cov)) +
geom_point() +
facet_grid(rows = vars(tool), cols = vars(AF)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(Mean_setAF)
plot of chunk unnamed-chunk-14
ggsave("SetAF_Mean_AlleleFrequency.png",Mean_setAF)
## Saving 7 x 7 in image
CoeffVar_setAF = ggplot(AV_setAF_split, aes(x = tool, y = variance, color = cov)) +
geom_point() +
facet_grid(rows = vars(tool), cols = vars(AF)) +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_setAF)
## Warning: Removed 15 rows containing missing values (geom_point).
plot of chunk unnamed-chunk-14
ggsave("SetAF_CoeffVar_AlleleFrequency.png",CoeffVar_setAF)
## Saving 7 x 7 in image
## Warning: Removed 15 rows containing missing values (geom_point).
AV_randomAF_split = group_by(AV_randomAF_truepos, tool, coverage) %>%
summarize(mean(af_workflow), sd(af_workflow), coeff_var(af_workflow)) %>% droplevels()
colnames(AV_randomAF_split) = c("tool","cov","mean","sd","variance")
Mean_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = mean, color = cov)) +
geom_point()
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## List of 2
## $ text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 15
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : num 0.5
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
print(Mean_randomAF)
plot of chunk unnamed-chunk-15
ggsave("RandomAF_Mean_AlleleFrequency.png",Mean_randomAF)
## Saving 7 x 7 in image
CoeffVar_randomAF = ggplot(AV_randomAF_split, aes(x = tool, y = variance, color = cov)) +
geom_point() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(CoeffVar_randomAF)
plot of chunk unnamed-chunk-15
ggsave("RandomAF_CoeffVar_AlleleFrequency.png",CoeffVar_randomAF)
## Saving 7 x 7 in image
AVSet_Corr = ggplot(AV_setAF_truepos, aes(x = tool, y = af_workflow)) +
geom_point() +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
facet_grid(allele_freq~coverage, scales = "free_y")
print(AVSet_Corr)
plot of chunk unnamed-chunk-16
ggsave("AllVarSet_Tool_Comparison.pdf", AVSet_Corr, width = 15, height = 20)
# y axes are set per row at the moment (scales = "free_y")
AVRandom_Corr = ggplot(AV_randomAF_truepos, aes(x = tool, y = af_workflow)) +
geom_point() +
geom_violin() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
facet_grid(allele_freq~coverage)
print(AVRandom_Corr)
plot of chunk unnamed-chunk-16
ggsave("AllVarRandom_Tool_Comparison.pdf", AVRandom_Corr, width = 15, height = 20)
# Set AF Data
AV_Pos = ggplot(AV_setAF_truepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos)
plot of chunk unnamed-chunk-17
AV_Pos_falsepos = ggplot(AV_setAF_falsepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falsepos)
plot of chunk unnamed-chunk-17
AV_Pos_falseneg = ggplot(AV_setAF_falseneg, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falseneg)
plot of chunk unnamed-chunk-17
# Random AF Data
AV_Pos_r = ggplot(AV_randomAF_truepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_r)
plot of chunk unnamed-chunk-17
AV_Pos_falsepos_r = ggplot(AV_randomAF_falsepos, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falsepos_r)
plot of chunk unnamed-chunk-17
AV_Pos_falseneg_r = ggplot(AV_randomAF_falseneg, aes(x = pos, y = af_workflow)) +
geom_point()
print(AV_Pos_falseneg_r)
plot of chunk unnamed-chunk-17
setAF_TP_pos = ggplot(AV_setAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
geom_point()
print(setAF_TP_pos)
plot of chunk unnamed-chunk-18
TsTv(AV_setAF_truepos)
## transitions transversions
## 1 30901 13001
## [1] 2.376817
#counts variants from all tools/seq_depth/AF trials
randomAF_TP_pos = ggplot(AV_randomAF_truepos, aes(x = pos, y = ref, color = tool, shape = alt)) +
geom_point()
print(randomAF_TP_pos)
plot of chunk unnamed-chunk-18
TsTv(AV_randomAF_truepos)
## transitions transversions
## 1 4584 1875
## [1] 2.4448