#' ArrangeData_gt
#'
#' Reads in a directory of VCF files and converts them into a single dataframe.
#'
#' Each file is read with `read.vcfR()`, converted with `vcfR2tidy()`, and its
#' fixed fields are merged with the genotype fields (`gt_AD`, `gt_DP`) by
#' `ChromKey` and `POS`. Only rows whose `REF` and `ALT` are both members of
#' `ntlist` are kept; every other row (e.g. multiple alt alleles) is dropped.
#' For the kept rows `gt_AD` is split into `REF_COUNT` and `ALT_COUNT`, and
#' allele frequencies are computed as count / `gt_DP`.
#'
#' @name ArrangeData_gt
#' @param vardir Directory path containing vcf files. The search pattern is
#'   built as `paste0(vardir, "*.vcf")`, so the path must end with a path
#'   separator (e.g. `"variants/"`).
#' @param ntlist Nucleotides (default `c("A", "G", "T", "C")`) used to keep
#'   only rows with a single reference and a single alternative allele.
#' @param annotated Whether the vcf files have been annotated using snpeff,
#'   "yes" or "no" (default "yes"). When "yes" the `ANN` column is also
#'   selected; any other value is treated as "no".
#' @return A large dataframe containing information for all input vcf files:
#'   the selected fixed fields, `gt_DP`, `sample`, `REF_COUNT`, `ALT_COUNT`,
#'   `REF_FREQ`, `ALT_FREQ`, `ALT_TYPE` ("minor" when `ALT_FREQ < 0.50`,
#'   otherwise "major"), `majorfreq`, `minorfreq`, `major` and `minor`.
#'   Duplicated rows are removed. An empty `data.frame()` is returned (with a
#'   warning) when no vcf file is found or no file contributes any SNP.
#' @export
#' @examples
#' \dontrun{
#' df = ArrangeData_gt(vardir, ntlist = c('A','G','T','C'), annotated = 'yes')
#' }
ArrangeData_gt <- function(vardir, ntlist = c("A", "G", "T", "C"), annotated = "yes") {
  pattern <- paste0(vardir, "*.vcf")
  filelist <- Sys.glob(pattern)
  message("Length of input files: ", length(filelist))
  message("Input vcf files include: ", paste(filelist, collapse = ", "))

  # Nothing to read: return early instead of failing later in mutate(),
  # which would reference columns that do not exist on an empty data frame.
  if (length(filelist) == 0) {
    warning(
      "No vcf files found matching: ", paste(pattern, collapse = ", "),
      call. = FALSE
    )
    return(data.frame())
  }

  # One slot per input file; combined once after the loop rather than
  # growing a data frame with rbind() on every iteration.
  per_sample <- vector("list", length(filelist))

  for (i in seq_along(filelist)) {
    filename <- filelist[[i]]
    # Sample name is the file name without directory or extension.
    samplename <- basename(tools::file_path_sans_ext(filename))
    message("Sample name is: ", samplename)

    vcf_all <- read.vcfR(file = filename) # read in vcf file using vcfR
    vcf_tidy <- vcfR2tidy(vcf_all) # change into a tidy dataframe

    # $fix contains the INFO fields
    if (annotated == "yes") {
      vcf_fix <- vcf_tidy$fix %>%
        select("ChromKey", "CHROM", "POS", "ID", "REF", "ALT", "ANN")
    } else {
      vcf_fix <- vcf_tidy$fix %>%
        select("ChromKey", "CHROM", "POS", "ID", "REF", "ALT")
    }

    # $gt contains the genotype information. grab info we want
    vcf_gt <- vcf_tidy$gt %>%
      select("ChromKey", "POS", "gt_AD", "gt_DP")

    vcf_total <- merge(vcf_fix, vcf_gt, by = c("ChromKey", "POS"), all = TRUE)

    # if variants exist
    if (nrow(vcf_total) == 0) {
      message("No variant data: ", samplename)
      next
    }

    vcf_total$sample <- samplename

    # Keep only rows with one ref and one alt allele. separate() into exactly
    # two count columns relies on this filter having removed multi-allelic
    # rows first.
    # TODO: rows with multiple alt alleles (REF in ntlist, ALT not in ntlist)
    # are currently dropped; handle them here if they are needed later.
    snp_df <- vcf_total %>%
      filter(REF %in% ntlist & ALT %in% ntlist) %>%
      separate(gt_AD, c("REF_COUNT", "ALT_COUNT"), sep = "[,]")

    if (nrow(snp_df) == 0) {
      message("No snps for sample: ", samplename)
      next
    }

    # change to numeric to calc freq
    snp_df$REF_COUNT <- as.numeric(as.character(snp_df$REF_COUNT))
    snp_df$ALT_COUNT <- as.numeric(as.character(snp_df$ALT_COUNT))
    snp_df$REF_FREQ <- snp_df$REF_COUNT / snp_df$gt_DP
    snp_df$ALT_FREQ <- snp_df$ALT_COUNT / snp_df$gt_DP
    snp_df$ALT_TYPE <- ifelse(snp_df$ALT_FREQ < 0.50, "minor", "major")

    per_sample[[i]] <- snp_df
  }

  # Drop files that contributed nothing (their slot is still NULL).
  per_sample <- Filter(Negate(is.null), per_sample)
  if (length(per_sample) == 0) {
    warning(
      "No SNPs found in any input vcf file; returning an empty data frame",
      call. = FALSE
    )
    return(data.frame())
  }

  all_files <- do.call(rbind, per_sample)

  # rearranging the df: express each site as major/minor allele and frequency
  all_files <- all_files %>%
    mutate(
      majorfreq = ifelse(ALT_TYPE == "major", ALT_FREQ, REF_FREQ),
      minorfreq = ifelse(ALT_TYPE == "minor", ALT_FREQ, REF_FREQ),
      major = ifelse(ALT_TYPE == "major", ALT, REF),
      minor = ifelse(ALT_TYPE == "minor", ALT, REF)
    )

  all_files <- all_files[!duplicated(all_files), ] %>%
    droplevels()

  return(all_files)
}