# Creating a file with the largest scaffolds
To save time in the analyses, we will work only with chromosomes/scaffolds of at least 5 MB (≥5,000,000 bp).

#### Select scaffolds of ≥5MB

```
# First awk: for every FASTA record print "<header without '>'><TAB><total sequence length>".
# The header is passed to printf as an argument ("%s"), never as the format string,
# so a '%' inside a header cannot corrupt the output.
# Second awk: keep the first word of the header (the sequence name) for every
# record whose length (last field) is at least 5,000,000 bp.
awk '
    /^>/ { if (NR > 1) print c; c = 0; printf "%s\t", substr($0, 2); next }
         { c += length($0) }
    END  { print c }
' acreference.fna | awk '$(NF) >= 5000000  {print $1}' > /vol/storage/swarmGenomics/golden_eagle/target_len.lst
```
You may alter the code to select scaffolds of a different size.
Currently, scaffolds ≥5,000,000 bp are selected. To change the threshold, update the number in the second awk command:
```
# NEW_SIZE is a placeholder: replace it with the minimum scaffold length in bp.
awk '$(NF) >= NEW_SIZE {print $1}'
```
For example, to select scaffolds ≥10 MB, replace `5000000` with `10000000`.

#### VCF Filtering and Conversion to FASTQ
```
# Make a directory for the per-scaffold VCF files and move into it.
# -p keeps the step re-runnable when the directory already exists; cd is only
# attempted when the directory exists or was created successfully.
mkdir -p /vol/storage/swarmGenomics/golden_eagle/vcf &&
    cd /vol/storage/swarmGenomics/golden_eagle/vcf

# Select larger scaffolds: write one VCF (header included via -h) for every
# scaffold named in target_len.lst.
# tabix region queries require output.vcf.gz to be bgzip-compressed and indexed.
while IFS= read -r chr; do
    tabix -h /vol/storage/swarmGenomics/golden_eagle/output.vcf.gz "$chr" \
        > "/vol/storage/swarmGenomics/golden_eagle/vcf/${chr}.vcf" \
        || echo "FAILED tabix: $chr"
done < /vol/storage/swarmGenomics/golden_eagle/target_len.lst

# Filtering: for each per-scaffold VCF keep genotypes with depth >= 5 and
# biallelic sites only; vcftools writes <scaffold>_filtered.recode.vcf.
# Only *.vcf files are processed (not logs or other outputs in this directory),
# and already-filtered files left over from an earlier run are skipped.
for file in *.vcf; do
    case "$file" in
        *_filtered.recode.vcf) continue ;;
    esac
    vcftools --vcf "$file" --minDP 5 --min-alleles 2 --max-alleles 2 \
        --recode --recode-INFO-all --out "${file%.vcf}_filtered"
done

# VCF to fq
# Step 1: bgzip every filtered per-scaffold VCF using 10 threads; the
# following step expects the resulting *_filtered.recode.vcf.gz files.
for filtered_vcf in *_filtered.recode.vcf; do bgzip -@ 10 "$filtered_vcf"; done

# 2) Index all gzipped VCFs
# The glob matches files in the current directory
# (/vol/storage/swarmGenomics/golden_eagle/vcf), so each file is indexed by its
# relative name rather than by a path in the parent directory.
for file in *_filtered.recode.vcf.gz; do
    /vol/storage/bcftools-1.19/bcftools index "$file" || echo "FAILED index: $file"
done

# 3) Build per-scaffold consensus FASTQ
# For every scaffold in target_len.lst: extract its reference sequence, apply
# the filtered variants, and write <scaffold>.fq with a constant quality of 'I'.
# NOTE(review): bcftools consensus is called without --iupac-codes/--haplotype;
# if heterozygous sites must be kept as IUPAC codes in the "diploid" output,
# confirm whether -I should be added.
while IFS= read -r chr; do
    echo "Processing $chr..."

    # extract this scaffold from the reference
    samtools faidx /vol/storage/swarmGenomics/golden_eagle/acreference.fna "$chr" > "${chr}.fa" || { echo "FAILED faidx: $chr"; continue; }

    # apply VCF variants to build consensus, then convert to FASTQ with dummy qualities.
    # The pipeline runs in a subshell with pipefail (bash) so that a bcftools
    # failure is reported too, not only a seqtk failure.
    (
        set -o pipefail
        /vol/storage/bcftools-1.19/bcftools consensus \
            -f "${chr}.fa" \
            "${chr}_filtered.recode.vcf.gz" \
        | seqtk seq -F 'I' - > "${chr}.fq"
    ) || {
        echo "FAILED consensus: $chr"
        # drop the incomplete FASTQ so it cannot end up in the combined file
        rm -f -- "${chr}.fq"
    }
done < /vol/storage/swarmGenomics/golden_eagle/target_len.lst

# Concatenate and compress the final FASTQ
# Only the scaffolds named in target_len.lst are combined (in list order), so a
# leftover diploid.fq or any unrelated *.fq in this directory is never included.
# Scaffolds without a usable FASTQ are reported on stderr and skipped.
while IFS= read -r chr; do
    if [ -s "${chr}.fq" ]; then
        cat "${chr}.fq"
    else
        echo "SKIPPED (missing or empty): ${chr}.fq" >&2
    fi
done < /vol/storage/swarmGenomics/golden_eagle/target_len.lst > diploid.fq

# Compress, then move the file to the working directory only if compression succeeded
gzip diploid.fq &&
    mv diploid.fq.gz /vol/storage/swarmGenomics/golden_eagle/
```