# Creating a file with largest scaffolds

To save time on the analyses we will be working only on chromosomes/scaffolds of at least 5 Mb (≥5,000,000 bp).

#### Select for >5MB scaffolds

```bash
# First awk: print "<full header line><TAB><sequence length>" for every FASTA record.
# Second awk: keep records whose last field (the length) meets the threshold
# and print only the first word of the header (the sequence ID).
awk '
  /^>/ {
    if (NR > 1) print c          # flush the length of the previous record
    c = 0
    printf "%s\t", substr($0, 2) # "%s" so a "%" in a header is not treated as a format
    next
  }
  { c += length($0) }
  END { print c }
' /vol/storage/swarmGenomics/golden_eagle/acreference.fna \
  | awk '$NF >= 5000000 { print $1 }' \
  > /vol/storage/swarmGenomics/golden_eagle/target_len.lst
```

You may alter the code to select for scaffolds of different size. Currently scaffolds ≥5,000,000 bp are selected. To change the threshold, update the number in the second awk command:

```bash
awk '$NF >= NEW_SIZE { print $1 }'
```

For example, to select scaffolds ≥10 MB, replace 5000000 with 10000000.

#### VCF Filtering and Conversion to FASTQ

```bash
# Locations used throughout this section
workdir=/vol/storage/swarmGenomics/golden_eagle
bcftools=/vol/storage/bcftools-1.19/bcftools

# Make a directory for the scaffolds vcf files
# (-p: do not fail if the directory already exists)
mkdir -p "$workdir/vcf" && cd "$workdir/vcf"

# Select larger scaffolds
while IFS= read -r chr; do
  tabix -h "$workdir/output.vcf.gz" "$chr" > "$workdir/vcf/$chr.vcf"
done < "$workdir/target_len.lst"

# Filtering
# Iterate over the scaffold list rather than "*", so that logs, FASTA/FASTQ
# files or already-filtered VCFs left in this directory are never re-filtered.
while IFS= read -r chr; do
  vcftools --vcf "$chr.vcf" \
    --minDP 5 --min-alleles 2 --max-alleles 2 \
    --recode --recode-INFO-all \
    --out "${chr}_filtered"
done < "$workdir/target_len.lst"

# VCF to fq

# 1) Compress all per-scaffold VCFs
for file in *_filtered.recode.vcf; do
  bgzip -@ 10 "$file"
done

# 2) Index all gzipped VCFs
# The compressed VCFs live in $workdir/vcf (the current directory), so the
# index must be built there for `bcftools consensus` to find it.
for file in *_filtered.recode.vcf.gz; do
  "$bcftools" index "$workdir/vcf/$file"
done

# 3) Build per-scaffold consensus FASTQ
# Run in a subshell so that `pipefail` does not leak into the interactive
# session; with pipefail a failure of `bcftools consensus` is reported even
# though `seqtk` is the last command of the pipeline.
(
  set -o pipefail
  while IFS= read -r chr; do
    echo "Processing $chr..."

    # extract this scaffold from the reference
    samtools faidx "$workdir/acreference.fna" "$chr" > "${chr}.fa" \
      || { echo "FAILED faidx: $chr" >&2; continue; }

    # apply VCF variants to build consensus, then convert to FASTQ with dummy qualities
    # NOTE(review): without -I/--iupac-codes (or -H), bcftools consensus applies
    # ALT alleles and does not encode heterozygous genotypes as IUPAC codes.
    # Confirm this is what is wanted for a file named "diploid.fq".
    "$bcftools" consensus \
      -f "${chr}.fa" \
      "${chr}_filtered.recode.vcf.gz" \
      | seqtk seq -F 'I' - > "${chr}.fq" \
      || {
        echo "FAILED consensus: $chr" >&2
        rm -f -- "${chr}.fq"   # do not let a partial FASTQ reach the final file
      }
  done < "$workdir/target_len.lst"
)

# Concatenate and compress the final FASTQ, writing it straight to the
# working directory. Writing outside this directory keeps the output from
# being matched by the *.fq glob on a re-run.
cat -- *.fq | gzip > "$workdir/diploid.fq.gz"
```