#!/usr/bin/perl -w use strict; use Getopt::Long; use IPC::System::Simple qw(system); my ($vcf, $sample_panel, $user_input_pop_string, $region, $out_dir, $user_out_file, $tabix, $no_tabix, $help, %sample_pop_hash,#keys are sample IDs and values the population for that sample %pops,#keys of this hash are the populations in the sample panel file @selected_pops,#processed list of selected pops @pop_cols,#record columns belonging to the selected populations $vcf_cmd,#command to retrieve VCF ); &GetOptions( 'vcf=s' => \$vcf, 'sample_panel=s' => \$sample_panel, 'pop=s' => \$user_input_pop_string, 'region=s' => \$region, 'out_dir:s' => \$out_dir, 'out_file:s' => \$user_out_file, 'tabix=s' => \$tabix, 'no_tabix!' => \$no_tabix, 'help!' => \$help, ); if ($help) { exec('perldoc', $0); } check_and_process_input(); #input and output files open(VCF, $vcf_cmd." |") or die "Failed to open $vcf_cmd $!\n"; my $outfile; if($user_out_file){ $outfile = $user_out_file; } else{ $outfile = "$out_dir/afc.".($region =~ s/:/./r).".proc$$.tsv"; } open(OUT, ">", $outfile) or die "Cannot open $outfile.\n"; #print header print OUT "#CHROM\tPOS\tID\tREF\tALT\t"; foreach my $sp (@selected_pops){ print OUT $sp."_TOTAL_CNT\t".$sp."_ALT_CNT\t".$sp."_FRQ\t"; } print OUT "ALL_TOTAL_CNT\tALL_ALT_CNT\tALL_FRQ\n"; #process VCF my $line; LINE: while(){ chomp; $line = $_; #ignore general header lines next LINE if($line =~ /^##/); #process column header line to note which samples are in which columns if($line =~ /^#[Cc]/){ my @cols = split "\t", $line; #for each sample in header SAMPLE: for(my $i=9;$i) { chomp; my $sample_line = $_; my ($sample, $pop_in_panel) = split(/\t/, $sample_line); #add unless it looks like the header $sample_pop_hash{$sample} = $pop_in_panel unless $sample eq "sample"; $pops{$pop_in_panel} = 1 unless $sample eq "sample"; } } =pod =head1 NAME calculate_allele_frq_from_vcf_file.pl =head1 SYNOPSIS This script takes a VCF file, a matching sample panel file, a chromosomal region, population names, it then calculates population-wide allele frequency for sites within the chromosomal region defined. When no population is specified, allele fequences will be calcuated for all populations in the VCF files, one at a time. =head1 Dependency To run this script, you need to install the following software tabix: http://sourceforge.net/projects/samtools/files/tabix/ vcftools: http://sourceforge.net/projects/vcftools/files/ =head1 Options -vcf Input VCF file that contains genotype data for each samples; this file must be bgzipped and tabix indexed; if '-no_tabix' is used, the vcf file can be uncompressed and un-indexed. -sample_panel A tab deliminated file lists mapping between sample and population (see example below) -pop Populations of interest; separated by ",". This field can be null -region chromosome region of interest, format is chr_number:start_end (optional). -out_dir Where temporary file and final output files should be written to. Default is current direcotry. -tabix A path to tabix executable -vcftools_dir A path to the vcftools base directory that contains vcftools executable and perl libraries -no_tabix If the input VCF files is a pre-sliced VCF file containing a small number of sites, this option can be used so the vcf file doesn't have to be tabix indexed or bgzipped. This is to speed up run time for the web application. -help Print this page when specified =head1 EXAMPLE lines from a sample panel file. Only the first 2 columns are essential. HG00096 GBR EUR HG00097 GBR EUR HG00099 GBR EUR HG00100 GBR EUR =head1 OUTPUT The allele frequency of an user-specified population for sites within the user-specified chromosomal region is written to a file. The headers of the output file are: CHR: Chromosome POS: Start position of the variant ID: Identification of the variant REF: Reference allele ALT: Alternative allele TOTAL_CNT: Total number of alleles in samples of the chosen population(s) ALT_CNT: Number of alternative alleles observed in samples of the chosen populations(s) FRQ: Ratio of ALT_CNT to TOTAL_CNT =head1 EXAMPLE perl $ZHENG_RP/bin/calculate_allele_frq_from_vcf.pl \ -vcf /nfs/1000g-archive/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr22.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz \ -out_dir . \ -sample_panel /nfs/1000g-archive/vol1/ftp/phase1/analysis_results/integrated_call_sets/integrated_call_samples.20101123.ALL.panel \ -region 22:17000000-17005000 \ -pop CEU,FIN \ OR (when the input VCF file is pre-sliced small size file) perl $ZHENG_RP/bin/calculate_allele_frq_from_vcf.pl \ -vcf ALL.chr22_17000000_17005000.test.vcf \ -out_dir ~ \ -sample_panel /nfs/1000g-archive/vol1/ftp/phase1/analysis_results/integrated_call_sets/integrated_call_samples.20101123.ALL.panel \ -pop CEU,FIN \ -no_tabix