#Configuration file

#general
library_file = src/starpa/data/libraries.txt		#Relative location of file with library names (each name on separate line)
genome = tests/data/genome/NC_000913.2.fna		#Relative location of the genome file (in fasta format)
samtools_call = samtools		#Call for samtools command [eg. samtools]
samtools_threads = 1		#Thread (cores) used by samtools [integer, eg. 1]
bedtools_call = bedtools		#Call for bedtools command [eg. ~/bin/bedtools2.26/bin/bedtools]
featureCounts_call = featureCounts		#Call for featureCounts command (Release 1.6.1+) [eg. featureCounts]
cd_hit_est_call = cd-hit-est		#Call for CD-HIT-EST command [eg. ~/bin/cd-hit-est]
CPUs = 2		#Number of libraries to be analysed in parallel manner [integer, eg. 2]
min_length = 18		#Minimum length of reads, shorter reads will be discarded [integer, eg. 18]
max_length = 300		#Maximum length of reads, longer reads will be discarded [integer, eg. 300]
non_overlap = 2		#Number of bases allowed to not overlap between read and processing product
					#It can be up to around square root of the 'min_length' parameter (minimal length of the allowed reads)
min_pp_reads = 10	#Minimum number of reads matching processing product to be considered
#Important only for tasks "trim","sma_sort","pseudoSE":
paired = True		#Are reads in paired-end format [True or False]


#trim
#Trims reads using cutadapt
##Name format of input_file: "library" + "suffix" (eg. "_1.fq") -> library_1.fq
trim_call = cutadapt		#Call for cutadapt (eg. ~/.local/bin/cutadapt)
trim_min_qual = 30		#3' end trimming quality threshold [integer, eg. 30]
trim_overlap = 1	#Minimum match between adapter and read's 3' end. In case of paired-end reads it can be set to 1 as the paired match will
					#cover short trimmings happening by chance.
trim_adapter_for = tests/data/adapters/adapter1.fasta		#Relative path of the 5' adapter file (in fasta format)
trim_adapter_rev = tests/data/adapters/adapter2.fasta		#Relative path of the 3' adapter file (in fasta format)
trim_threads = 1		#Thread (cores) used by cutadapt, currently works only with 1 [integer, eg. 1]
trim_quality_base = 64		#Phred quality encoding of input data of task "trim" [64 if Phred+64 or 33 if Phred+33]
trim_input_file_suffix_for = _1.fq		#Suffix for forward input file names as name format above [eg. _1.fq]
trim_input_file_suffix_rev = _2.fq		#Suffix for reverse input file names as name format above [eg. _2.fq]
										#They should be "_1.fq" and "_2.fq" if input data is coming from STARPA pipeline
##Important only in non paired (single-end) mode  [paired = False]:
trim_input_file_suffix_SE = _1.fq		#Suffix for input file names as name format above [Default: .fq]
										#It should be "_1.fq" if input data is coming from STARPA pipeline
										
										
#align
#Aligns reads using bowtie2
##Name format of input_file: "library" + "suffix" (eg. "_1_trim.fq") -> library_1_trim.fq
align_call = bowtie2		#Call for bowtie2 command [~/bin/bowtie2-2.3.4.1-linux-x86_64/bowtie2]
align_index_call = bowtie2-build		#Call for bowtie2-build command [~/bin/bowtie2-2.3.4.1-linux-x86_64/bowtie2-build]
align_threads = 1		#Threads(cores) used by bowtie2 [integer, eg. 1]
align_quality_base = 64		#Phred quality encoding of input data of task "align" [64 if Phred+64 or 33 if Phred+33]
align_sensitive = False		#Alignment in sensitive mode (two step process) [True or False]
align_input_file_suffix_for = _1_trim.fq		#Suffix for forward input file names as name format above [Default: _1_trim.fq]
align_input_file_suffix_rev = _2_trim.fq		#Suffix for reverse input file names as name format above [Default: _2_trim.fq]
												#They should be "_1_trim.fq" and "_2_trim.fq" if input data is coming from STARPA pipeline
##Important only in non paired (single-end) mode  [paired = False]:
align_input_file_suffix_SE = _trim.fq		#Suffix for input file names as name format above [Default: _trim.fq]
											#It should be "_trim.fq" if input data is coming from STARPA pipeline


#sort
#Removes unaligned reads and discards mappings with lower quality
##Name format of input_file: "library" + ".sam" -> library.sam 


#pseudoSE
#Converts PE reads to pseudo SE, removes reads with too many mismatches or too many mappings.
#If SE reads are used as input: removes reads with too many mismatches or too many mappings.
#If the read pairs (in case of PE) are not overlapping then the gap between reads is filled with genomic sequence.
##Name format of input_file: "library" + "suffix" (eg. "_sort.sam") -> library_sort.sam 
pseudoSE_max_mappings = 100		#Number of maximum mappings [integer, eg. 100]
pseudoSE_oligoA = True		#Are reads with extragenomic A-s at the 3' end and having too many mismatches included?
							#They might be easily discarded if the number of allowed mismatches is lower than the length of the oligoA tail.
							#In bacteria RNAs are often oligoadenylated [True or False]
pseudoSE_quality_base = 33		#Phred quality encoding of input data for task "pseudoSE" [64 if Phred+64 or 33 if Phred+33]
								#If input for task "pseudoSE" originates from task "align" the initially Phred+64 data is converted to Phred+33.
pseudoSE_allowed_mismatch = 2		#Maximum number of mismatches allowed [integer, eg. 2]
pseudoSE_mismatch_precentage = 0		#Maximum percentage of mismatches allowed in sequenced region [integer: 0-100]
										#If the read pairs (in case of PE) are not overlapping then the mismatches are calculated 
										#only for the region of reads.
#pseudoSE_max_read_length = 49		#Maximum length of read from pair [integer, eg. 49]
pseudoSE_input_file_suffix = _sort.sam		#Suffix for input file names as name format above [eg. _sort.sam]
											#It should be "_sort.sam" if input data is coming from STARPA pipeline

											
#identify
#Identifies processing products via Flaimapper2 which are filtered by read number estimated by flaimapper (50% of min_pp_reads)
#Then reads are quantified by BEDtools intersect.
##Name format of input_file: "library" + "suffix" (eg. "_pseudoSE.sam") -> library_pseudoSE.sam 
identify_flaimapper_parameters = tests/data/flaimapper_parameters/parameters.dev-2-100-2.txt		#Relative location of Flaimapper2 parameter file
#identify_overlap = 30		#
identify_flaimapper_call = flaimapper		#Call for Flaimapper2 command
identify_input_file_suffix = _pseudoSE.sam		#Suffix for input file names as name format above [eg. _pseudoSE.sam]
												#It should be "_pseudoSE.sam" if input data is coming from STARPA pipeline
identify_split_step = 10		#Read length intervals for sam file fragmentation which will be analysed separately by Flaimapper2.
								#This allows identification of reads with overlapping start or end positions.

								
#cluster
#Filters processing products by the number of reads they match and relative coverage
#(combined coverage o
#Clusters processing products by overlap and sequence identity if they
#belong to the same metacontig classes
##Name format of input_file: "library" + "suffix" (eg. "_pp_counted.BED") -> library_pp_counted.BED
cluster_min_contig_length = 18		#Minimum contig length, could be equal with minimal read length.
cluster_min_contig_cov = 1		#Coverage threshold of genomic positions to be included in a contig.
cluster_min_contig_reads = 10		#Minimum number of reads in contig, should be equal with min_pp_reads,
									#otherwise too many non-informative contigs will be saved.
cluster_min_contig_length_meta = 18		#Minimum contig length for contigs used to create metacontigs, could be equal with minimal read length
cluster_min_contig_cov_meta = 5		#Coverage threshold of genomic positions to be included in a contig used to create metacontigs,
									# can/should be set bigger than 1. Then contigs connected by few reads will be separated
									#allowing adjusting clustering via metacontigs.
									#Most efficient clustering via metacontigs depends on the nature of the library but good starting points
									#are around 100% to 50% of the value of parameter "min_pp_reads".
cluster_min_contig_reads_meta = 10		#Minimum number of reads in contig used to create metacontigs, should be equal with min_pp_reads,
										#otherwise too many non-informative contigs will be used
##Relative coverage of pp (average coverage of each position of the whole processing product)
cluster_rel_cov_list = [0.25,0.1]		#Minimum relative coverage. In form: [x] or [x,y,...]. [eg [0.25,0.1]]
										#First value is minimum relative coverage of processing products from minimum length until
										#the first size (excluded) in cluster_rel_cov_size_range. If cluster_rel_cov_size_range is [] (no numbers given)
										#then it is minimal relative coverage for all processing products.
										#Following numbers (if there) represent minimal relative coverage for processing products for next 
										#size range (first (included) and second (excluded) number of cluster_rel_cov_size_range). And so forth.
										#For example:
										#cluster_rel_cov_list = [0.25,0.1]
										#cluster_rel_cov_size_range = [120]
										#read size_range -> minimum relative coverage
										#min_length(18)-119 -> 0.25
										#120-max_length(300) -> 0.1
cluster_rel_cov_size_range = [120]		#Size range for minimum relative coverage. In form: [], [x] or [x,y,...] and has to be shorter by 1
										#compared with cluster_rel_cov_list and numbers have to be in ascending order. [eg. [120]]
cluster_input_file_suffix = _pp_counted.BED		#Suffix for pp counted BED file: "library1" + "suffix" (eg. "_pp_counted.BED") -> library1_pp_counted.BED
												#It should be "_pp_counted.BED" if input data is coming from STARPA pipeline
cluster_wig = True		#Are wig files created [True or False]
cluster_contig_data = True		#Are sam and fasta for contigs created [True or False]
##important when "cluster" is first task
cluster_pseudoSE_location = ../pseudoSE/pseudoSE_info		#Location of folder with task "pseudoSE" info files relative to input folder

										
#quantify
#Quantifies processing products in all libraries and collects and calculates various parameters.
##Name format of input_file: "library" + "suffix" (eg. "__pp_counted.BED") -> library__pp_counted.BED
##important if "quantify" or "cluster" is first task:
quantify_sam_file_location = ../pseudoSE		#Location of a folder (relative to input folder) with aligned SE or pseudoSE reads in SAM format
												#- output of task "pseudoSE" 
												#(Reads require NH tag to describe the number of reported alignments.)
quantify_sam_file_suffix = _pseudoSE.sam		#Suffix for SAM file; "library" + "suffix" (eg. "_pseudoSE.sam") -> library_pseudoSE.sam
quantify_annotation_file = tests/data/annotation/sequence.gff3		#Annotation file in GFF, GFF3 format.

##parameters for annotation:
quantify_keys_to_skip = [Src, mobile_element, misc_feature, STS]		#Annotation elements which are skipped
quantify_single_line_elements = [repeat_region, telomere, rep_origin, centromere]		#Annotation elements which are given as single line. Those are included 
quantify_ncRNA_types = [antisense_RNA, ncRNA, RNase_P_RNA,tmRNA, SRP_RNA, RNase_MRP_RNA, miscRNA,tmRNA_pseudogene,tRNA_pseudogene,ncRNA_pseudogene,telomerase_RNA]		#RNA types grouped as ncRNAs
quantify_non_groupped_biotypes = [tRNA, rRNA, repeat_region]		#Annotation elements not grouped
##RNA groups annotated separately:
#tRNA,rRNA,snRNA,snoRNA,