#!/usr/bin/env Rscript rm(list = ls()) ## ------------------------------------------------------------------------ require("optparse") ## ------------------------------------------------------------------------ suppressPackageStartupMessages(library("optparse")) ## ------------------------------------------------------------------------ prepare_faa_fna_PathoFact_orf <- function(aa_file_path = NULL, # aa_file_path = "~/Desktop/03.chicken.faa" fasta_file_path = NULL, # fasta_file_path = "~/Desktop/01.chicken.fasta" output_dir = "~/Desktop/", length_fil = 1000, pat = "[[:space:]].*", sample_name = "Sample"){ #######-------------------------- Loading required Packges require(Biostrings) require(tidyverse) require(Biostrings) #######-------------------------- Read faa aa_file_path %>% Biostrings::readAAStringSet() -> sequences #https://support.bioconductor.org/p/87210/ #######-------------------------- Simplify headers names(sequences) <- sub(pattern = pat, "", names(sequences)) # https://stackoverflow.com/questions/9319242/remove-everything-after-space-in-string #######-------------------------- get number of sequences names(sequences) %>% length() -> num_faa_seq #######-------------------------- # sequences[width(sequences) >= length_fil,] -> sequences #######-------------------------- Read contigs fna fasta_file_path %>% Biostrings::readDNAStringSet() -> fna_contigs names(fna_contigs) %>% length() -> num_fna_contigs #######-------------------------- Filter contigs fna fna_contigs[width(fna_contigs) >= length_fil,] -> fna_contigs names(fna_contigs) %>% length() -> num_fna_contigs_filt #######-------------------------- Get contigs header data.frame(names(fna_contigs)) -> df_fasta rownames(df_fasta) <- NULL #######-------------------------- Get faa header data.frame(names(sequences)) -> df rownames(df) <- NULL #######-------------------------- Simplify header faa and filter names based on contig kept after length filtering df %>% mutate(CONTIG = sub("(.*?_.*?)_.*", "\\1", names.sequences.)) %>% dplyr::rename("ORF" = "names.sequences.") %>% rownames_to_column("id") %>% select(CONTIG, ORF) %>% filter(CONTIG %in% df_fasta$names.fna_contigs.) -> df #######-------------------------- # fna_contigs[df$CONTIG] %>% fna_contigs %>% Biostrings::writeXStringSet(paste0(output_dir,"/", sample_name, ".fna"), append=FALSE, compress=FALSE, compression_level=NA, format="fasta") sequences[df$ORF] %>% Biostrings::writeXStringSet(paste0(output_dir,"/", sample_name, ".faa"), append=FALSE, compress=FALSE, compression_level=NA, format="fasta") df %>% add_count(CONTIG) %>% distinct(CONTIG, .keep_all = TRUE) %>% arrange(-n) %>% write_tsv(paste0(output_dir,"/", sample_name, "_report.tsv")) colnames(df) <- NULL write_tsv(x = df , col_names = FALSE, file = paste0(output_dir,"/", sample_name, ".contig")) #######-------------------------- # system2("gsed ", # args = c("-i ",paste(c( 's/\*//g' ))), # args = c(" ",paste0(output_dir,"/orf.faa")), # # ) system2("gsed", args = c("-i 's/*//g' ", paste0(output_dir,"/", sample_name, ".faa")) ) } # prepare_faa_PathoFact_orf(aa_file_path = "~/Desktop/03.SqueezeHuman.faa", # fasta_file_path = "~/Desktop/01.SqueezeHuman.fasta", # length_fil = 10000) ## ------------------------------------------------------------------------ option_list = list( make_option(c("-f", "--aa_file_path"), type="character", default = NULL, help="Path of the input aminoacid fasta of the orfs", metavar="character"), make_option(c("-g", "--contig_fasta_file_path"), type="character", default = NULL, help="Path of the input contig fasta file", metavar="character"), make_option(c("-o","--output_dir"), type="character", default = "~/Desktop/", help="Name of the output directory - created before running the analysis", metavar="character"), make_option(c("-s","--sample_name"), type="character", default = "Sample", help="sample_name for the output files", metavar="character"), make_option(c("-l","--length_fil"), type="numeric", default= 2000, help="Length filtering of contig fasta file", metavar="character"), make_option(c("--pat"), type="character", default= "[[:space:]].*", help="Default regex pattern for faa header cleangin", metavar="character") ) opt_parser = OptionParser(option_list = option_list); opt = parse_args(opt_parser) ## ------------------------------------------------------------------------ # parse_args(opt_parser, args = c("--help")) ## ------------------------------------------------------------------------ # # prepare_faa_fna_PathoFact_orf(aa_file_path = "~/Desktop/03.chicken.faa", # fasta_file_path = "~/Desktop/01.chicken.fasta", # sample_name = "chicken1_l1000") prepare_faa_fna_PathoFact_orf(aa_file_path = opt$aa_file_path, fasta_file_path = opt$contig_fasta_file_path, output_dir = opt$output_dir, length_fil = opt$length_fil, pat = opt$pat, sample_name = opt$sample_name) ## ------------------------------------------------------------------------ print(sessionInfo()) ## ------------------------------------------------------------------------ print(Sys.time()) ## ------------------------------------------------------------------------