#!/usr/bin/env perl # # Plot the output of "bcftools stats -s -" # # The MIT License # # Copyright (C) 2012-2025 Genome Research Ltd. # # Author: Petr Danecek # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. # Dependencies: # - matplotlib # http://matplotlib.sourceforge.net # - LaTex/xcolor.sty # Download .sty.gz LaTeX class style from http://www.ukern.de/tex/xcolor.html, # unpack and install system-wide or place elsewhere and make available by # setting the TEXINPUTS environment variable (note the colon) # export TEXINPUTS=some/dir: # The list of the recognised path can be obtained from `kpsepath tex` # # use strict; use warnings; use Carp; use Storable qw(dclone); my $opts = parse_params(); parse_vcfstats($opts); merge_vcfstats($opts) if @{$$opts{vcfstats}} > 1; chdir($$opts{dir}); if ( $$opts{make_plots} ) { init_plots($opts); plot_venn_bars($opts); plot_counts_by_AF($opts); plot_overlap_by_AF($opts); plot_concordance_by_AF($opts); plot_concordance_by_sample($opts); for my $id (file_ids($opts)) { plot_tstv_by_AF($opts,$id); plot_tstv_by_QUAL($opts,$id); plot_tstv_by_usr($opts,$id); plot_indel_distribution($opts,$id); plot_indel_vaf_distribution($opts,$id); plot_substitutions($opts,$id); plot_vaf($opts,$id,'snv'); plot_vaf($opts,$id,'indel'); plot_per_sample_stats($opts,$id); plot_DP($opts,$id); plot_hwe($opts,$id); plot_vaf25_by_sample($opts,$id,'snv'); plot_vaf25_by_sample($opts,$id,'indel'); } plot($opts); } create_pdf($opts) unless !$$opts{make_pdf}; exit; #-------------------------------- sub usage { print STDERR "About: Plots the output of \"bcftools stats\"\n", "Usage: plot-vcfstats [OPTIONS] -p outdir file.chk ...\n", "Options:\n", " -m, --merge Merge vcfstats files to STDOUT, skip plotting.\n", " -p, --prefix Output directory.\n", " -P, --no-PDF Skip the PDF creation step.\n", " -r, --rasterize Rasterize PDF images for fast rendering, the default and opposite of -v.\n", " -s, --sample-names Use sample names for xticks rather than numeric IDs.\n", " -t, --title Identify files by these titles in plots. Can be given multiple times.\n", " -T, --main-title Main title for the PDF.\n", " -v, --vectors Generate vector graphics for PDF images, the opposite of -r\n", " -h, -?, --help This help message.\n", "\n", "Example:\n", " # Generate the stats\n", " bcftools stats -s - > file.vchk\n", "\n", " # Plot the stats\n", " plot-vcfstats -p outdir file.vchk\n", "\n", " # The final looks can be customized by editing the generated\n", " # 'outdir/plot.py' script and re-running manually\n", " cd outdir && python3 plot.py && pdflatex summary.tex\n", "\n"; } sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } usage(); exit 1; } sub parse_params { $0 =~ s{^.+/}{}; my $opts = { usr_plots => [], use_sample_names => 0, verbose => 1, make_pdf => 1, make_plots => 1, merge => 0, args => join(' ',$0,@ARGV), img_fmt => 'png', img_width => 11/2.54, img_height => 10/2.54, id2col => [ 'orange', 'red', 'darkgreen' ], tex => { slide3v => { height1 => '7cm', height2 => '7cm', height3 => '4.5cm' }, slide3h => { width1 => '15cm', width2 => '10cm', width3 => '8cm' }, }, # for file version sanity check sections => [ { id=>'ID', header=>'Definition of sets', exp=>"# ID\t[2]id\t[3]tab-separated file names" }, { id=>'SN', header=>'SN, Summary numbers', exp=>"# SN\t[2]id\t[3]key\t[4]value" }, { id=>'TSTV', header=>'# TSTV, transition/transversions:', exp=>"# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)" }, { id=>'SiS', header=>'Sis, Singleton stats', exp=>"# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable" }, { id=>'AF', header=>'AF, Stats by non-reference allele frequency', exp=>"# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable" }, { id=>'IDD', header=>'IDD, InDel distribution', exp=>"# IDD\t[2]id\t[3]length (deletions negative)\t[4]number of sites\t[5]number of genotypes\t[6]mean VAF" }, { id=>'ST', header=>'ST, Substitution types', exp=>"# ST\t[2]id\t[3]type\t[4]count" }, { id=>'GCsAF', header=>'GCsAF, Genotype concordance by non-reference allele frequency (SNPs)', exp=>"# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes" }, { id=>'GCiAF', header=>'GCiAF, Genotype concordance by non-reference allele frequency (indels)', exp=>"# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes" }, { id=>'NRDs', header=>'Non-Reference Discordance (NRD), SNPs', exp=>"# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance" }, { id=>'NRDi', header=>'Non-Reference Discordance (NRD), indels', exp=>"# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance" }, { id=>'GCsS', header=>'GCsS, Genotype concordance by sample (SNPs)', exp=>"# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared" }, { id=>'GCiS', header=>'GCiS, Genotype concordance by sample (indels)', exp=>"# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared" }, { id=>'PSC', header=>'PSC, Per-sample counts', exp=>"# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing" }, { id=>'PSI', header=>'PSI, Per-sample Indels', exp=>"# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nInsHets\t[9]nDelHets\t[10]nInsAltHoms\t[11]nDelAltHoms" }, { id=>'DP', header=>'DP, Depth distribution', exp=>"# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%)\t[6]number of sites\t[7]fraction of sites (%)" }, { id=>'FS', header=>'FS, Indel frameshifts', exp=>"# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)" }, { id=>'ICS', header=>'ICS, Indel context summary', exp=>"# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio" }, { id=>'ICL', header=>'ICL, Indel context by length', exp=>"# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio" }, { id=>'QUAL', header=>'QUAL, Stats by quality', exp=>"# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels" }, { id=>'HWE', header=>'HWE', exp=>"# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile", }, { id=>'VAF', header=>'VAF', exp=>"# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution", }, ], SN_keys=>[ 'number of samples:', 'number of records:', 'number of no-ALTs:', 'number of SNPs:', 'number of MNPs:', 'number of indels:', 'number of others:', 'number of multiallelic sites:', 'number of multiallelic SNP sites:', ], }; for my $sec (@{$$opts{sections}}) { $$opts{exp}{$$sec{id}} = $$sec{exp}; $$opts{id2sec}{$$sec{id}} = $sec; } while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '--no-plots' ) { $$opts{make_plots}=0; next; } if ( $arg eq '-P' || $arg eq '--no-PDF' ) { $$opts{make_pdf}=0; next; } if ( $arg eq '-r' || $arg eq '--rasterize' ) { $$opts{img_fmt} = 'png'; next; } if ( $arg eq '-v' || $arg eq '--vectors' ) { $$opts{img_fmt} = 'pdf'; next; } if ( $arg eq '-m' || $arg eq '--merge' ) { $$opts{make_plots}=0; $$opts{make_pdf}=0; $$opts{merge}=1; next; } if ( $arg eq '-s' || $arg eq '--sample-names' ) { $$opts{use_sample_names}=1; next; } if ( $arg eq '-t' || $arg eq '--title' ) { push @{$$opts{titles}},shift(@ARGV); next; } if ( $arg eq '-T' || $arg eq '--main-title' ) { $$opts{main_title} = shift(@ARGV); next; } if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { usage(); exit 0; } if ( -e $arg ) { push @{$$opts{vcfstats}},$arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{vcfstats}) ) { error(); } if ( !exists($$opts{prefix}) ) { if ( !$$opts{merge} ) { error("Expected -p parameter.\n") } $$opts{prefix} = '.'; } elsif ( $$opts{merge} ) { error("Only one of -p or -m should be given.\n"); } if ( $$opts{merge} && @{$$opts{vcfstats}} < 2 ) { error("Nothing to merge\n") } $$opts{dir} = $$opts{prefix}; $$opts{logfile} = "plot-vcfstats.log"; if ( !-d $$opts{dir} ) { `mkdir -p $$opts{dir}`; } `> $$opts{dir}/$$opts{logfile}` unless $$opts{merge}; return $opts; } sub plot { my ($opts) = @_; if ( !exists($$opts{plt_fh}) ) { return; } close($$opts{plt_fh}) or error("close $$opts{plt_fh}"); my $cmd = "python3 $$opts{plt_file}"; print STDERR "Plotting graphs: $cmd\n" unless !$$opts{verbose}; system($cmd); if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } } sub parse_vcfstats { my ($opts) = @_; for (my $i=0; $i<@{$$opts{vcfstats}}; $i++) { parse_vcfstats1($opts,$i); } # Check sanity if ( !exists($$opts{dat}{ID}{0}) ) { error("Sanity check failed: no stats found by vcfstats??\n"); } # Set titles my %file2title; my %title2file; if ( exists($$opts{titles}) ) { for (my $i=0; $i<@{$$opts{titles}}; $i++) { if ( !exists($$opts{dat}{ID}{$i}) ) { next; } $file2title{$$opts{dat}{ID}{$i}[0][0]} = $$opts{titles}[$i]; $title2file{$$opts{titles}[$i]} = $$opts{dat}{ID}{$i}[0][0]; } } for my $id (file_ids($opts)) { if ( @{$$opts{dat}{ID}{$id}[0]}>1 ) { next; } # shared stats of two files my $file = $$opts{dat}{ID}{$id}[0][0]; if ( !exists($file2title{$file}) ) # create short title { my $bname = $file; $bname =~ s{^.*/}{}; $bname =~ s{\.vcf\.gz$}{}i; if ( length($bname) > 5 ) { $bname = substr($bname,0,5); } my $i = 0; my $title = $bname; while ( exists($title2file{$title}) ) { $title = $bname.chr(66+$i); $i++; } $file2title{$file} = $title; $title2file{$title} = $file; } } for my $id (file_ids($opts)) { my @titles; for my $file (@{$$opts{dat}{ID}{$id}[0]}) { push @titles, $file2title{$file} if $file2title{$file}; } $$opts{title}{$id} = join(' + ',@titles); } # mapping from file names to list of IDs for my $id (file_ids($opts)) { my @files; for my $file (@{$$opts{dat}{ID}{$id}[0]}) { push @{$$opts{file2ids}{$file}}, $id; } } # check sanity of the file merge: were the correct files merged? if ( exists($$opts{coalesced_files}) && $$opts{verbose} ) { print STDERR "The vcfstats outputs have been merged as follows:\n"; my %printed; for my $id (keys %{$$opts{coalesced_files}}) { for (my $i=0; $i<@{$$opts{coalesced_files}{$id}}; $i++) { for (my $j=0; $j<@{$$opts{coalesced_files}{$id}[$i]}; $j++) { if ( exists($printed{$$opts{dat}{ID}{$id}[$i][$j]}) ) { next; } print STDERR "\t$$opts{dat}{ID}{$id}[$i][$j]\n"; for my $file (keys %{$$opts{coalesced_files}{$id}[$i][$j]}) { my $n = $$opts{coalesced_files}{$id}[$i][$j]{$file}; print STDERR "\t\t$file", ($n>1 ? "\t..\t${n}x" : ''),"\n"; } $printed{$$opts{dat}{ID}{$id}[$i][$j]} = 1; } } } } } sub add_to_values { my ($dst,$src,$cmp) = @_; my $id = 0; my $is = 0; while ($is<@$src) { while ( $id<@$dst && &$cmp($$src[$is][0],$$dst[$id][0])>0 ) { $id++; } if ( $id<@$dst && !&$cmp($$src[$is][0],$$dst[$id][0]) ) { for (my $j=1; $j<@{$$src[$is]}; $j++) { if ( $$dst[$id][$j] eq '.' or $$src[$is][$j] eq '.' ) { next; } $$dst[$id][$j] += $$src[$is][$j]; } } else { splice(@$dst,$id,0,$$src[$is]); } $is++; } } sub add_to_sample_values { my ($dst,$src) = @_; my %id2i; for (my $i=0; $i<@$dst; $i++) { $id2i{$$dst[$i][0]} = $i; } for (my $i=0; $i<@$src; $i++) { if ( !exists($id2i{$$src[$i][0]}) ) { error("Whoops, no such dst sample: $$src[$i][0]\n"); } my $di = $id2i{$$src[$i][0]}; for (my $j=1; $j<@{$$src[$i]}; $j++) { $$dst[$di][$j] += $$src[$i][$j]; } } } sub merge_PSC { my ($a,$b,$n) = @_; for (my $i=0; $i<@$a; $i++) { $$a[$i][7] *= $n; } # average DP add_to_sample_values($a,$b); for (my $i=0; $i<@$a; $i++) { $$a[$i][7] /= $n+1; } } sub merge_PSI { my ($a,$b,$n) = @_; add_to_sample_values($a,$b); for (my $i=0; $i<@$b; $i++) { $$a[$i][4] = sprintf "%.2f", ($$a[$i][1]+$$a[$i][2] ? $$a[$i][2]/($$a[$i][1]+$$a[$i][2]) : 0); } } sub rglob { my ($a,$b) = @_; if ( $a eq $b ) { return $a; } $a =~ s/\*//; my $la = length($a); my $lb = length($b); my $i = 0; while ( $i<$la && $i<$lb && substr($a,$i,1) eq substr($b,$i,1) ) { $i++; } $la--; $lb--; while ( $la>$i && $lb>$i && substr($a,$la,1) eq substr($b,$lb,1) ) { $la--; $lb--; } $la = $la==$i && $lb==$i ? 1 : $la-$i; substr($a,$i,$la,'*'); return $a; } sub merge_id { # merge id filenames my ($opts,$dst,$src,$id) = @_; for (my $i=0; $i<@{$$src{$id}}; $i++) { for (my $j=0; $j<@{$$src{$id}[$i]}; $j++) { my $gname = rglob($$dst{$id}[$i][$j], $$src{$id}[$i][$j]); $$dst{$id}[$i][$j] = $gname; $$opts{coalesced_files}{$id}[$i][$j]{$$src{$id}[$i][$j]}++; } } } sub add_to_avg { my ($dst,$src,$n) = @_; for (my $i=0; $i<@$src; $i++) { if ( ref($$dst[$i]) eq 'ARRAY' ) { for (my $j=0; $j<@{$$dst[$i]}; $j++) { $$dst[$i][$j] = ($n*$$dst[$i][$j]+$$src[$i][$j])/($n+1); } } else { $$dst[$i] = ($n*$$dst[$i]+$$src[$i])/($n+1); } } } sub cmp_str($$) { my ($a,$b) = @_; return $a cmp $b; } sub cmp_num($$) { my ($a,$b) = @_; return $a <=> $b; } sub cmp_num_op($$) { # numeric compare with operators # Cases like <3, >500 make it complicated my ($a,$b) = @_; my $xa = '='; my $xb = '='; if ( $a=~/^(\D+)/ ) { $xa = $1; $a = $'; } if ( $b=~/^(\D+)/ ) { $xb = $1; $b = $'; } if ( $a==$b ) { return $xa cmp $xb; } $a <=> $b; } sub merge_dp { my ($a,$b) = @_; add_to_values($a,$b,\&cmp_num_op); # recalculate fraction of GTs and fraction of sites, cannot be simply summed my $gsum = 0; # genotype sum my $ssum = 0; # site sum for (my $i=0; $i<@$a; $i++) { $gsum += $$a[$i][1]; if (@{$$a[$i]}>3) { $ssum += $$a[$i][3]; } else{ push @{$$a[$i]}, (0,0); # older stats files will not have last 2 columns for (number of sites, fraction of sites), so fill in as zero } } for (my $i=0; $i<@$a; $i++) { $$a[$i][2] = $gsum ? $$a[$i][1]*100./$gsum : 0; $$a[$i][4] = $ssum ? $$a[$i][3]*100./$ssum : 0; } } sub merge_GCsS { my ($a,$b,$n) = @_; # average the non-ref discordance rate for (my $i=0; $i<@$a; $i++) { $$a[$i][1] *= $n; } add_to_sample_values($a,$b); for (my $i=0; $i<@$a; $i++) { $$a[$i][1] /= $n+1; } } sub merge_FS { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=0; $j<3; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][3] = sprintf "%.2f", ($$a[$i][0] + $$a[$i][1]) ? $$a[$i][1]/($$a[$i][0] + $$a[$i][1]) : 0; for (my $j=4; $j<7; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][7] = sprintf "%.2f", ($$a[$i][4] + $$a[$i][5]) ? $$a[$i][5]/($$a[$i][4] + $$a[$i][5]) : 0; } } sub merge_ICS { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=0; $j<3; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][3] = sprintf "%.4f", ($$a[$i][0] + $$a[$i][1]) ? $$a[$i][0]/($$a[$i][0] + $$a[$i][1]) : 0; } } sub merge_ICL { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=1; $j<5; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][5] = sprintf "%.4f", ($$a[$i][2] + $$a[$i][4]) ? ($$a[$i][1] + $$a[$i][3])/($$a[$i][1] + $$a[$i][2] + $$a[$i][3] + $$a[$i][4]) : 0; } } sub merge_TSTV { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=0; $j<2; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][2] = sprintf "%.2f", $$a[$i][1] ? $$a[$i][0]/$$a[$i][1] : 0; for (my $j=3; $j<5; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][5] = sprintf "%.2f", $$a[$i][4] ? $$a[$i][3]/$$a[$i][4] : 0; } } sub merge_GCsAF { my ($a,$b) = @_; # recalculate r2 for (my $i=0; $i<@$a; $i++) { $$a[$i][7] *= $$a[$i][8]; } for (my $i=0; $i<@$b; $i++) { $$b[$i][7] *= $$b[$i][8]; } add_to_values($a,$b,\&cmp_num_op); for (my $i=0; $i<@$a; $i++) { $$a[$i][7] /= $$a[$i][8]; } } sub merge_HWE { my ($dst,$src) = @_; # The fields are: # [0] - AF of the 1st ALT # [1] - number of observations # [2] - 25th percentile # [3] - median # [4] - 75th percentile # # It is impossible to do the proper merge without source data, hence # an approximation: take average of the percentiles weighted by the # number of observations my $id = 0; my $is = 0; while ($is<@$src) { while ( $id<@$dst && $$src[$is][0] > $$dst[$id][0] ) { $id++; } if ( $id<@$dst && $$src[$is][0] == $$dst[$id][0] ) { my $sw = $$src[$is][1]+$$dst[$id][1] ? $$src[$is][1]/($$src[$is][1]+$$dst[$id][1]) : 0; my $dw = $$src[$is][1]+$$dst[$id][1] ? $$dst[$id][1]/($$src[$is][1]+$$dst[$id][1]) : 0; for (my $j=2; $j<@{$$src[$is]}; $j++) { if ( $$dst[$id][$j] eq '.' or $$src[$is][$j] eq '.' ) { next; } $$dst[$id][$j] = $dw*$$dst[$id][$j] + $sw*$$src[$is][$j]; } $$dst[$id][1] += $sw*$$src[$is][1]; } else { splice(@$dst,$id,0,$$src[$is]); } $is++; } } sub parse_vcfstats1 { my ($opts,$i) = @_; my $file = $$opts{vcfstats}[$i]; print STDERR "Parsing bcftools stats output: $file\n" unless !$$opts{verbose}; open(my $fh,'<',$file) or error("$file: $!"); my $line = <$fh>; if ( !$line or !($line=~/^# This file was produced by \S*/) ) { error("Sanity check failed: was this file generated by bcftools stats?"); } my %dat; while ($line=<$fh>) { $line =~ s/\s*$//; if ( $line=~/^#\s+(\S+)\t/ ) { $$opts{def_line}{$1} = $line; next; } if ( $line=~/^#/ ) { next; } my @items = split(/\t/,$line); if ( $items[0] eq 'ID' && $items[2] eq '' ) { $items[2] = 'STDIN'; } if ( $items[0] eq 'SN' ) { $dat{$items[1]}{$items[2]} = splice(@items,3); next; } push @{$dat{$items[0]}{$items[1]}}, [splice(@items,2)]; } close($fh); for my $a (keys %dat) { if ( !exists($$opts{dat}{$a}) ) { $$opts{dat}{$a} = $dat{$a}; next; } # first vcfstats file for my $b (keys %{$dat{$a}}) { # Merging multiple vcfstats files. Honestly, this is quite hacky. if ( !exists($$opts{dat}{$a}{$b}) ) { $$opts{dat}{$a}{$b} = $dat{$a}{$b}; next; } # copy all, first occurrence if ( $a eq 'ID' ) { merge_id($opts,$$opts{dat}{$a},$dat{$a},$b); } elsif ( ref($dat{$a}{$b}) ne 'ARRAY' ) { $$opts{dat}{$a}{$b} += $dat{$a}{$b} unless $b eq 'number of samples:'; } # SN, Summary numbers, do not sum sample counts elsif ( $a eq 'NRDs' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'NRDi' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'DP' ) { merge_dp($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'GCsS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'GCiS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'GCsAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'GCiAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'HWE' ) { merge_HWE($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'ST' ) { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_str); } elsif ( $a eq 'PSC') { merge_PSC($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'PSI') { merge_PSI($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'IDD') { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num); } elsif ( $a eq 'FS') { merge_FS($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'ICS') { merge_ICS($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'ICL') { merge_ICL($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'TSTV') { merge_TSTV($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'DBG' ) { next; } else { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num_op); } # SiS AF IDD } } } sub file_ids { my ($opts) = @_; my $id = 0; my @out; while ( exists($$opts{dat}{ID}) && exists($$opts{dat}{ID}{$id}) ) { push @out, $id++; } return @out; } sub tprint { my ($fh,$text) = @_; my @lines = split(/\n/,$text); my $ntab; for my $line (@lines) { if ( !defined $ntab ) { if ( $line=~/^\s*$/ ) { print $fh $line; next; } $ntab = ( $line=~/^\s+/ ) ? length($&) : 0; } if ( $line=~/^\s{$ntab}/ ) { substr($line,0,$ntab,''); } print $fh $line."\n"; } } sub init_plots { my ($opts) = @_; $$opts{plt_file} = "plot.py"; my $titles = "# Title abbreviations:\n"; for my $id (file_ids($opts)) { $titles .= "# \t $id .. $$opts{title}{$id} .. $$opts{dat}{ID}{$id}[0][0]\n"; } $titles .= "#"; open(my $fh,'>',$$opts{plt_file}) or error("$$opts{plt_file}: $!"); tprint $fh, qq[ # This file was produced by plot-vcfstats, the command line was: # $$opts{args} # # Edit as necessary and recreate the plots by running # python3 $$opts{plt_file} # $titles img_fmt = '$$opts{img_fmt}' # Use logarithmic X axis for allele frequency plots af_xlog = 0 # Plots to generate, set to 0 to disable plot_venn_snps = 1 plot_venn_indels = 1 plot_tstv_by_sample = 1 plot_hethom_by_sample = 1 plot_snps_by_sample = 1 plot_indels_by_sample = 1 plot_singletons_by_sample = 1 plot_depth_by_sample = 1 plot_SNP_count_by_af = 1 plot_Indel_count_by_af = 1 plot_SNP_overlap_by_af = 1 plot_Indel_overlap_by_af = 1 plot_dp_dist = 1 plot_hwe = 1 plot_concordance_by_af = 1 plot_r2_by_af = 1 plot_discordance_by_sample = 1 plot_tstv_by_af = 1 plot_indel_dist = 1 plot_indel_vaf = 1 plot_tstv_by_qual = 1 plot_tstv_by_usr = 1 plot_substitutions = 1 plot_vaf_snv = 1 plot_vaf_indel = 1 plot_vaf25_snv = 1 plot_vaf25_indel = 1 # Set to 1 to use sample names for xticks instead of numeric sequential IDs # and adjust margins and font properties if necessary sample_names = $$opts{use_sample_names} sample_margins = {'right':0.98, 'left':0.07, 'bottom':0.2} sample_font = {'rotation':45, 'ha':'right', 'fontsize':8} if sample_names==0: sample_margins=(); sample_font=(); #------------------------------------------------- import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt import csv csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE) import numpy def smooth(x,window_len=11,window='hanning'): if x.ndim != 1: raise ValueError("The function 'smooth' only accepts 1 dimension arrays.") if x.size < window_len: return x if window_len<3: return x if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']: raise ValueError("Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'") s = numpy.r_[x[window_len-1:0:-1],x,x[-1:-window_len:-1]] if window == 'flat': # moving average w = numpy.ones(window_len,'d') else: w = eval('numpy.'+window+'(window_len)') y = numpy.convolve(w/w.sum(),s,mode='valid') return y[(window_len//2-1):-(window_len//2)] ]; $$opts{plt_fh} = $fh; } sub percentile { my ($p,@vals) = @_; my $N = 0; for my $val (@vals) { $N += $val; } my $n = $p*($N+1)/100.; my $k = int($n); my $d = $n-$k; if ( $k<=0 ) { return 0; } if ( $k>=$N ) { return scalar @vals-1; } my $cnt; for (my $i=0; $i<@vals; $i++) { $cnt += $vals[$i]; if ( $cnt>=$k ) { return $i; } } error("FIXME: this should not happen [percentile]\n"); } sub get_values { my ($opts,$id,$key,$i,$j) = @_; if ( !exists($$opts{dat}{$key}) ) { return (); } if ( !exists($$opts{dat}{$key}{$id}) ) { return (); } my $fields_ok = 1; if ( !exists($$opts{exp}{$key}) && !($key=~/^USR:/) ) { error("todo: sanity check for $key\n"); } if ( exists($$opts{def_line}{$key}) && !($key=~/^USR:/) && $$opts{def_line}{$key} ne $$opts{exp}{$key} && !$$opts{def_line_warned}{$key} ) { warn("Warning: Possible version mismatch, the definition line differs\n\texpected: $$opts{exp}{$key}\n\tfound: $$opts{def_line}{$key}\n"); $$opts{def_line_warned}{$key} = 1; } if ( defined $i ) { if ( defined $j ) { return $$opts{dat}{$key}{$id}[$i][$j]; } return (@{$$opts{dat}{$key}{$id}[$i]}); } return (@{$$opts{dat}{$key}{$id}}); } sub get_value { my ($opts,$id,$key) = @_; if ( !exists($$opts{dat}{$id}) ) { return undef; } if ( !exists($$opts{dat}{$id}{$key}) ) { return undef} return $$opts{dat}{$id}{$key}; } sub plot_venn_bars { my ($opts) = @_; my @ids = file_ids($opts); if ( @ids != 3 ) { return; } my (@snps,@indels,@tstv,@snp_titles,@indel_titles); for my $id (0..2) { push @snps, get_value($opts,$id,'number of SNPs:'); push @indels, get_value($opts,$id,'number of indels:'); push @tstv, sprintf("%.2f",get_values($opts,$id,'TSTV',0,5)); push @snp_titles, "$$opts{title}{$id}\\nts/tv $tstv[$id]\\n" .bignum($snps[$id]); my @fs = get_values($opts,$id,'FS'); my $fs = @fs ? "frm $fs[0][3]\\n" : ''; push @indel_titles, "$$opts{title}{$id}\\n$fs" .bignum($indels[$id]); } my $fh = $$opts{plt_fh}; tprint $fh, qq[ if plot_venn_snps: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.bar([1,2,3],[$snps[0],$snps[2],$snps[1]],align='center',color='$$opts{id2col}[0]',width=0.3) ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') ax1.set_xlim(0.5,3.5) plt.xticks([1,2,3],('$snp_titles[0]','$snp_titles[2]','$snp_titles[1]')) plt.title('Number of SNPs') plt.subplots_adjust(right=0.95,bottom=0.15) plt.savefig('venn_bars.snps.png') if img_fmt != 'png': plt.savefig('venn_bars.snps.' + img_fmt) plt.close() if plot_venn_indels: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.bar([1,2,3],[$indels[0],$indels[2],$indels[1]],align='center',color='$$opts{id2col}[1]',width=0.3) ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') ax1.set_xlim(0.5,3.5) plt.xticks([1,2,3],('$indel_titles[0]','$indel_titles[2]','$indel_titles[1]')) plt.title('Number of indels') plt.subplots_adjust(right=0.95,bottom=0.15) plt.savefig('venn_bars.indels.png') if img_fmt != 'png': plt.savefig('venn_bars.indels.' + img_fmt) plt.close() ]; } sub plot_per_sample_stats { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'PSC'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "tstv_by_sample.$id"; my $img2 = "hets_by_sample.$id"; my $img3 = "snps_by_sample.$id"; my $img4 = "indels_by_sample.$id"; my $img5 = "singletons_by_sample.$id"; my $img6 = "dp_by_sample.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Sample ID\t[2]ts/tv\t[3]het/hom\t[4]nSNPs\t[5]nIndels\t[6]Average depth\t[7]nSingletons\t[8]Sample name\n"; for (my $i=0; $i<@vals; $i++) { my $tstv = $vals[$i][5] ? $vals[$i][4]/$vals[$i][5] : 0; my $hethom = $vals[$i][2] ? $vals[$i][3]/$vals[$i][2] : 0; printf $tfh "%d\t%f\t%f\t%d\t%d\t%f\t%d\t%s\n", $i, $tstv, $hethom, $vals[$i][4]+$vals[$i][5], $vals[$i][6], $vals[$i][7], $vals[$i][8], $vals[$i][0]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append([float(x) for x in row[:7]] + [row[7]]) if plot_tstv_by_sample: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[1] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') ax1.set_ylabel('Ts/Tv') ax1.set_ylim(min(float(row[1]) for row in dat)-0.1,max(float(row[1]) for row in dat)+0.1) if sample_names: plt.xticks([int(row[0]) for row in dat],[row[7] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() if plot_hethom_by_sample: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[2] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') ax1.set_ylabel('nHet(RA) / nHom(AA)') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') if sample_names: plt.xticks([int(row[0]) for row in dat],[row[7] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') plt.title('$$opts{title}{$id}') plt.savefig('$img2.png') if img_fmt != 'png': plt.savefig('$img2.' + img_fmt) plt.close() if plot_snps_by_sample: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[3] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') ax1.set_ylabel('Number of SNPs') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') if sample_names: plt.xticks([int(row[0]) for row in dat],[row[7] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') plt.title('$$opts{title}{$id}') plt.savefig('$img3.png') if img_fmt != 'png': plt.savefig('$img3.' + img_fmt) plt.close() if plot_indels_by_sample: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[4] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') ax1.set_ylabel('Number of indels') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') if sample_names: plt.xticks([int(row[0]) for row in dat],[row[7] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') plt.title('$$opts{title}{$id}') plt.savefig('$img4.png') if img_fmt != 'png': plt.savefig('$img4.' + img_fmt) plt.close() if plot_singletons_by_sample: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[6] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') ax1.set_ylabel('Number of singletons') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') if sample_names: plt.xticks([int(row[0]) for row in dat],[row[7] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') plt.title('$$opts{title}{$id}') plt.savefig('$img5.png') if img_fmt != 'png': plt.savefig('$img5.' + img_fmt) plt.close() if plot_depth_by_sample: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[5] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') ax1.set_ylabel('Average depth') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') if sample_names: plt.xticks([int(row[0]) for row in dat],[row[7] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') plt.title('$$opts{title}{$id}') plt.savefig('$img6.png') if img_fmt != 'png': plt.savefig('$img6.' + img_fmt) plt.close() "; } sub plot_DP { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'DP'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "depth.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Depth\t[2]Cumulative number of genotypes\t[3]Number of genotypes\n"; my $sum = 0; for (my $i=0; $i<@vals; $i++) { if ( $sum>99. ) { last; } if ( !($vals[$i][0]=~/^\d+$/) ) { next; } # DP ">500" case $sum += $vals[$i][2]; printf $tfh "%d\t%f\t%f\n", $vals[$i][0], $sum, $vals[$i][2]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append([float(x) for x in row]) if plot_dp_dist: fig = plt.figure(figsize=($$opts{img_width}*1.2,$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[2] for row in dat], '-^', color='k') ax1.set_ylabel('Number of genotypes [%]',color='k') ax1.set_xlabel('Depth') ax2 = ax1.twinx() ax2.plot([row[0] for row in dat], [row[1] for row in dat], '-o', color='$$opts{id2col}[$id]') ax2.set_ylabel('Cumulative number of genotypes [%]',color='$$opts{id2col}[$id]') for tl in ax2.get_yticklabels(): tl.set_color('$$opts{id2col}[$id]') plt.subplots_adjust(left=0.2,bottom=0.15,right=0.8) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_hwe { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'HWE'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "hwe.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele Frequency\t[2]Depth\t[3]Number of hets (median)\t[4]Number of hets (25-75th percentile)\n"; for (my $i=0; $i<@vals; $i++) { if ( !$vals[$i][1] ) { next; } print $tfh join("\t", @{$vals[$i]}), "\n"; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append(row) if plot_hwe and len(dat)>1: x = [float(row[0]) for row in dat] y1 = smooth(numpy.array([float(row[2]) for row in dat]),40,'hanning') y2 = smooth(numpy.array([float(row[3]) for row in dat]),40,'hanning') y3 = smooth(numpy.array([float(row[4]) for row in dat]),40,'hanning') dp = smooth(numpy.array([float(row[1]) for row in dat]),40,'hanning') hwe = [] for af in x: hwe.append(2*af*(1-af)) fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) plots = ax1.plot(x,hwe,'--',color='#ff9900',label='Expected (HWE)') plots += ax1.plot(x,y2,color='#ff9900',label='Median') plots += ax1.plot(x,y3,color='#ffe0b2',label='25-75th percentile') ax1.fill_between(x,y1,y3, facecolor='#ffeacc',edgecolor='#ffe0b2') ax1.set_ylabel('Fraction of hets',color='#ff9900') ax1.set_xlabel('Allele frequency') for tl in ax1.get_yticklabels(): tl.set_color('#ff9900') ax2 = ax1.twinx() plots += ax2.plot(x,dp, 'k', label='Number of sites') ax2.set_ylabel('Number of sites') ax2.set_yscale('log') if af_xlog: ax1.set_xscale('log') if af_xlog: ax2.set_xscale('log') labels = [l.get_label() for l in plots] plt.legend(plots,labels,numpoints=1,markerscale=2,loc='center',prop={'size':9},frameon=False) plt.subplots_adjust(left=0.15,bottom=0.15,right=0.86) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_tstv_by_AF { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'AF'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "tstv_by_af.$id"; my $vals = rebin_values(\@vals,8,0); open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele frequency\t[2]Number of sites\t[3]ts/tv\n"; for (my $i=0; $i<@$vals; $i++) { if ( $$vals[$i][2] + $$vals[$i][3] == 0 ) { next; } printf $tfh "%f\t%d\t%f\n", $$vals[$i][0], $$vals[$i][2] + $$vals[$i][3], $$vals[$i][3] ? $$vals[$i][2]/$$vals[$i][3]: 0; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append([float(x) for x in row]) if plot_tstv_by_af and len(dat)>2: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[1] for row in dat], '-o',color='k',mec='k',markersize=3) ax1.set_ylabel('Number of sites',color='k') ax1.set_yscale('log') #ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') for tl in ax1.get_yticklabels(): tl.set_color('k') ax1.set_xlabel('Non-ref allele frequency') ax2 = ax1.twinx() ax2.plot([row[0] for row in dat], [row[2] for row in dat], '-o',color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]',markersize=3) ax2.set_ylabel('Ts/Tv',color='$$opts{id2col}[$id]') ax2.set_ylim(0,0.5+max(3,max(row[2] for row in dat))) ax1.set_xlim(0,1) for tl in ax2.get_yticklabels(): tl.set_color('$$opts{id2col}[$id]') plt.subplots_adjust(right=0.88,left=0.15,bottom=0.11) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } # Sort calls by quality and plot ts/tv in bins of reasonable size (1%); # Plot both cumulative ts/tv (as if quality threshold was applied at that point) # and per bin. sub plot_tstv_by_QUAL { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'QUAL'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "tstv_by_qual.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Quality\t[2]Number of sites\t[3]Marginal Ts/Tv\n"; my @dat = (); my $ntot = 0; for my $val (@vals) { if ( $$val[0] eq '.' ) { next; } push @dat, [ $$val[0], $$val[2], $$val[3] ]; # qual, nts, ntv $ntot += $$val[2] + $$val[3]; } my @sdat = sort { $$b[0] <=> $$a[0] } @dat; push @sdat, [-1]; my $dn = $ntot*0.01; my $qprev = $sdat[0][0]; my $nout = 0; my $nts = 0; my $ntv = 0; my $nts_tot = 0; my $ntv_tot = 0; for my $rec (@sdat) # sorted in descendent order, biggest qualities come first { if ( $$rec[0]==-1 or $nts+$ntv > $dn ) { if ( $ntv ) { printf $tfh "$qprev\t%d\t%f\t%f\n", $nts+$ntv+$nout,$nts/$ntv,$nts_tot/$ntv_tot; } if ( $$rec[0]==-1 ) { last; } $nout += $nts+$ntv; $nts = 0; $ntv = 0; $qprev = $$rec[0]; } $nts += $$rec[1]; $ntv += $$rec[2]; $nts_tot += $$rec[1]; $ntv_tot += $$rec[2]; } close($tfh) or error("close $img.dat"); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append([float(x) for x in row]) if plot_tstv_by_qual and len(dat)>2: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.plot([row[1] for row in dat], [row[3] for row in dat], '-', ms=1, mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]', label='Cumulative ts/tv') ax1.plot([row[1] for row in dat], [row[2] for row in dat], '--', ms=1, mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]', label='Per 1% bins') ax1.set_ylabel('Ts/Tv',fontsize=10) ax1.set_xlabel('Number of sites\\n(sorted by QUAL, descending)',fontsize=10) ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='x') ax1.set_ylim(min(2,min(row[2] for row in dat))-0.3,0.3+max(2.2,max(row[2] for row in dat))) plt.legend(numpoints=1,markerscale=2,loc='best',prop={'size':9},frameon=False) plt.subplots_adjust(right=0.88,left=0.15,bottom=0.15) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_tstv_by_usr { my ($opts,$id) = @_; for my $key (keys %{$$opts{dat}}) { if ( !($key=~/^USR:/) ) { next; } plot_tstv_by_usr1($opts,$id,$key); } } sub plot_tstv_by_usr1 { my ($opts,$id,$name) = @_; my @vals = get_values($opts,$id,$name); if ( !@vals ) { return; } my $fname = $name; $fname =~ s{/}{_}g; push @{$$opts{usr_plots}},{fname=>"tstv_by_$fname",uname=>$name}; my $fh = $$opts{plt_fh}; my $img = "tstv_by_$fname.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]$name\t[2]Number of sites\t[3]Marginal Ts/Tv\n"; for my $rec (@vals) { my ($val,$nts,$ntv,undef) = @$rec; if ( $nts+$ntv==0 ) { next; } printf $tfh "$val\t%d\t%f\n", $nts+$ntv,$ntv?$nts/$ntv:0; } close($tfh) or error("close $img.dat"); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append([float(x) for x in row]) if plot_tstv_by_usr and len(dat)>2: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) ax2 = ax1.twinx() plots = ax1.plot([row[0] for row in dat], [row[2] for row in dat], 'o', mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]') plots += ax2.plot([row[0] for row in dat], [row[1] for row in dat], 'o', mec='grey', color='grey') ax1.set_ylabel('Ts/Tv',fontsize=10) ax2.set_ylabel('Number of sites',fontsize=10) ax1.set_xlabel('$name',fontsize=10) ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='x') ax2.ticklabel_format(style='sci', scilimits=(-2,2), axis='y') ax1.set_ylim(min(2,min(row[2] for row in dat))-0.3,0.3+max(2.2,max(row[2] for row in dat))) labels = ['ts/tv','Number of sites'] plt.legend(plots,labels,numpoints=1,loc='best',prop={'size':9},frameon=False) plt.subplots_adjust(right=0.88,left=0.15,bottom=0.15) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub rebin_values { my ($vals,$bin_size,$col,%args) = @_; my %avg = exists($args{avg}) ? map {$_=>1} @{$args{avg}} : (); my $prev = $$vals[0][$col]; my $iout = 0; my $nsum = 0; my (@dat,@out); for (my $i=0; $i<@$vals; $i++) { for (my $icol=0; $icol<@{$$vals[$i]}; $icol++) { if ( $icol==$col ) { next; } $dat[$icol] += $$vals[$i][$icol]; } $nsum++; if ( $i+1<@$vals && $$vals[$i][$col] - $prev < $bin_size ) { next; } $dat[$col] = $prev; for (my $icol=0; $icol<@{$$vals[$i]}; $icol++) { $out[$iout][$icol] = $dat[$icol] ? $dat[$icol] : 0; if ( $avg{$icol} && $nsum ) { $out[$iout][$icol] /= $nsum; } } $nsum = 0; @dat = (); $iout++; $prev = $$vals[$i][$col]; } return \@out; } sub plot_concordance_by_AF { my ($opts) = @_; my @vals = get_values($opts,2,'GCsAF'); if ( !@vals ) { return; } # create a local copy and prepare r2 for rebinning @vals = @{ dclone(\@vals) }; for (my $i=0; $i<@vals; $i++) { $vals[$i][7] *= $vals[$i][8]; } my $vals = rebin_values(\@vals,0.01,0); my $fh = $$opts{plt_fh}; my $img = "gts_by_af"; my $img2 = "r2_by_af"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele Frequency\t[2]RR concordance\t[3]RA concordance\t[4]AA concordance\t[5]nRR\t[6]nRA\t[7]nAA\t[8]R^2\t[9]Number of genotypes\n"; for (my $i=0; $i<@$vals; $i++) { printf $tfh "%f\t%f\t%f\t%f\t%d\t%d\t%d\t%f\t%d\n", $$vals[$i][0], $$vals[$i][1]+$$vals[$i][4] ? $$vals[$i][1]/($$vals[$i][1]+$$vals[$i][4]) : 1, $$vals[$i][2]+$$vals[$i][5] ? $$vals[$i][2]/($$vals[$i][2]+$$vals[$i][5]) : 1, $$vals[$i][3]+$$vals[$i][6] ? $$vals[$i][3]/($$vals[$i][3]+$$vals[$i][6]) : 1, $$vals[$i][1]+$$vals[$i][4], $$vals[$i][2]+$$vals[$i][5], $$vals[$i][3]+$$vals[$i][6], $$vals[$i][8] ? $$vals[$i][7]/$$vals[$i][8] : 1, $$vals[$i][8]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append(row) if plot_concordance_by_af and len(dat)>1: fig = plt.figure(figsize=($$opts{img_width}*1.2,$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[1] for row in dat],'.',color='$$opts{id2col}[1]',label='Hom RR') ax1.plot([row[0] for row in dat], [row[2] for row in dat],'.',color='$$opts{id2col}[0]',label='Het RA') ax1.plot([row[0] for row in dat], [row[3] for row in dat],'.',color='k',label='Hom AA') ax1.set_xlabel('Non-ref allele frequency') ax1.set_ylabel('Concordance') leg = ax1.legend(title='Concordance:',numpoints=1,markerscale=2,loc='best',prop={'size':9}) leg.draw_frame(False) plt.setp(leg.get_title(),fontsize=9) ax2 = ax1.twinx() ax2.plot([row[0] for row in dat], [row[4] for row in dat],color='$$opts{id2col}[1]') ax2.plot([row[0] for row in dat], [row[5] for row in dat],color='$$opts{id2col}[0]') ax2.plot([row[0] for row in dat], [row[6] for row in dat],color='k') ax2.set_ylabel('Number of genotypes') ax2.set_yscale('log') if af_xlog: ax1.set_xscale('log') if af_xlog: ax2.set_xscale('log') plt.subplots_adjust(left=0.15,right=0.83,bottom=0.11) plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() if plot_r2_by_af and len(dat)>1: fig = plt.figure(figsize=($$opts{img_width}*1.3,$$opts{img_height})) ax1 = fig.add_subplot(111) ax2 = ax1.twinx() ax1.set_zorder(ax2.get_zorder()+1) ax1.patch.set_visible(False) ax2.plot([row[0] for row in dat], [row[8] for row in dat], '-o', color='r',mec='r',markersize=3) ax1.plot([row[0] for row in dat], [row[7] for row in dat], '-^', color='k',markersize=3) for tl in ax2.get_yticklabels(): tl.set_color('r') ax2.set_ylabel('Number of genotypes', color='r') ax2.set_yscale('log') if af_xlog: ax1.set_xscale('log') if af_xlog: ax2.set_xscale('log') ax1.set_ylabel('Aggregate allelic R\$^2\$', color='k') ax1.set_xlabel('Non-ref allele frequency') plt.subplots_adjust(left=0.19,right=0.83,bottom=0.11) plt.savefig('$img2.png') if img_fmt != 'png': plt.savefig('$img2.' + img_fmt) plt.close() "; } sub plot_concordance_by_sample { my ($opts) = @_; my @vals = get_values($opts,2,'GCsS'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "gts_by_sample"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Sample ID\t[2]Discordance\t[3]Sample Name\n"; for (my $i=0; $i<@vals; $i++) { printf $tfh "%d\t%f\t%s\n", $i, $vals[$i][1], $vals[$i][0]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append(row) if plot_discordance_by_sample: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[1] for row in dat],'.',color='orange') ax1.set_ylabel('Non-ref discordance') ax1.set_ylim(0,) if sample_names: plt.xticks([int(row[0]) for row in dat],[row[2] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_counts_by_AF { my ($opts) = @_; plot_counts_by_AF_col($opts,1,'SNP'); plot_counts_by_AF_col($opts,4,'Indel'); } sub plot_counts_by_AF_col { my ($opts,$col,$title) = @_; my $fh = $$opts{plt_fh}; my $img = "counts_by_af.".lc($title)."s"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]id\t[2]Nonref Allele Frequency\t[3]Number of sites\n"; for my $id (file_ids($opts)) { my @tmp = get_values($opts,$id,'AF'); my $vals = rebin_values(\@tmp,1,0); for my $val (@$vals) { if ( !$$val[$col] ) { next; } print $tfh "$id\t$$val[0]\t$$val[$col]\n"; } } close($tfh); tprint $fh, " dat = {} with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] == '#': continue id = int(row[0]) if id not in dat: dat[id] = [] dat[id].append([float(row[1]),float(row[2])]) if plot_${title}_count_by_af: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.set_ylabel('Number of sites') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') ax1.set_yscale('log') if af_xlog: ax1.set_xscale('log') ax1.set_xlabel('Non-reference allele frequency') ax1.set_xlim(-0.05,1.05) has_data = 0 "; for my $id (file_ids($opts)) { tprint $fh, " if $id in dat and len(dat[$id])>2: ax1.plot([row[0] for row in dat[$id]], [row[1] for row in dat[$id]], '-o',markersize=3, color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]',label='$$opts{title}{$id}') has_data = 1 "; } tprint $fh, " if has_data: ax1.legend(numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False) plt.title('$title count by AF') plt.subplots_adjust(bottom=0.2,left=0.1,right=0.95) plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_overlap_by_AF { my ($opts) = @_; plot_overlap_by_AF_col($opts,1,'SNP'); plot_overlap_by_AF_col($opts,4,'Indel'); } sub plot_overlap_by_AF_col { my ($opts,$col,$title) = @_; my @ids = file_ids($opts); if ( @ids != 3 ) { return; } my ($ia,$ib,$iab); for (my $i=0; $i<@ids; $i++) { if ( @{$$opts{dat}{ID}{$ids[$i]}[0]}>1 ) { $iab = $i; next; } if ( !defined $ia ) { $ia = $i; next; } $ib = $i; } my $fh = $$opts{plt_fh}; my $img = "overlap_by_af.".lc($title)."s"; my @has_vals; my @vals_a = get_values($opts,$ia,'AF'); my @vals_b = get_values($opts,$ib,'AF'); my @vals_ab = get_values($opts,$iab,'AF'); my (%afs,%af_a,%af_ab); for my $val (@vals_a) { $afs{$$val[0]} = $$val[$col]; $af_a{$$val[0]} = $$val[$col]; } for my $val (@vals_ab) { $afs{$$val[0]} = $$val[$col]; $af_ab{$$val[0]} = $$val[$col]; } open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele frequency\t[2]Fraction of sites from $$opts{title}{$ids[$ia]} also in $$opts{title}{$ids[$ib]}\t[3]Number of sites\n"; for my $af (sort { $a<=>$b } keys %afs) { my $a = exists($af_a{$af}) ? $af_a{$af} : 0; my $ab = exists($af_ab{$af}) ? $af_ab{$af} : 0; my $yval = ($a+$ab) ? $ab * 100. / ($a + $ab) : 0; print $tfh "$af\t$yval\t" .($a+$ab). "\n"; } close($tfh) or error("close $img.dat"); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append(row) if plot_${title}_overlap_by_af and len(dat)>1: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[1] for row in dat],'-o',markersize=3, color='$$opts{id2col}[1]',mec='$$opts{id2col}[1]') ax1.set_ylabel('Fraction found in $$opts{title}{$ib} [%]') ax1.set_xscale('log') ax1.set_xlabel('Non-reference allele frequency in $$opts{title}{$ia}') ax1.set_xlim(0,1.01) plt.title('$title overlap by AF') plt.subplots_adjust(bottom=0.2,left=0.1,right=0.95) plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_indel_distribution { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'IDD'); if ( !@vals ) { return; } # Set xlim to show 99% of indels but ignore outliers my @tmp; for my $id (file_ids($opts)) { my @v = get_values($opts,$id,'IDD'); for my $v (@v) { $tmp[ abs($$v[0]) ] += $$v[1]; } } my $n; for my $t (@tmp) { $n += $t ? $t : 0; } my ($sum,$xlim); for ($xlim=0; $xlim<@tmp; $xlim++) { $sum += $tmp[$xlim] ? $tmp[$xlim] : 0; if ( $sum/$n >= 0.99 ) { last; } } if ( $xlim<20 ) { $xlim=20; } my $fh = $$opts{plt_fh}; my $img = "indels.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Indel length\t[2]Count\n"; for my $val (@vals) { print $tfh "$$val[0]\t$$val[1]\n"; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append([float(x) for x in row]) if plot_indel_dist and len(dat)>0: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.bar([row[0]-0.5 for row in dat], [row[1] for row in dat], color='$$opts{id2col}[0]')# , edgecolor='$$opts{id2col}[0]') ax1.set_xlabel('InDel Length') ax1.set_ylabel('Count') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') ax1.set_xlim(-$xlim,$xlim) plt.subplots_adjust(bottom=0.17) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_indel_vaf_distribution { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'IDD'); if ( !@vals ) { return; } my ($xmin,$xmax); my @dat = (); for my $val (@vals) { if ( $$val[2] eq '0' or $$val[3] eq '.' ) { next; } if ( !defined $xmin or $$val[0]<$xmin ) { $xmin = $$val[0]; } if ( !defined $xmax or $$val[0]>$xmax ) { $xmax = $$val[0]; } push @dat, "$$val[0]\t$$val[3]\n"; } if ( !@dat ) { return; } my $img = "indel_vaf.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Indel length\t[2]Mean VAF\n"; print $tfh @dat; close($tfh); my $fh = $$opts{plt_fh}; tprint $fh, " dat = [] with open('$img.dat', 'r') as f: reader = csv.reader(f, 'tab') for row in reader: if row[0][0] != '#': dat.append([float(x) for x in row]) if plot_indel_vaf and len(dat)>0: fig = plt.figure(figsize=($$opts{img_width}*2,$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([$xmin,$xmax], [0.5,0.5],color='#c5c5c5') ax1.plot([row[0] for row in dat], [row[1] for row in dat],'.-',color='$$opts{id2col}[0]')# , edgecolor='$$opts{id2col}[0]') ax1.set_xlabel('Size of deletion (negative) or insertion (positive)') ax1.set_ylabel('Fraction of alt allele') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') plt.subplots_adjust(bottom=0.2) plt.title('$$opts{title}{$id}') ax1.spines['top'].set_visible(False) ax1.spines['right'].set_visible(False) ax1.get_xaxis().tick_bottom() ax1.get_yaxis().tick_left() ax1.spines['bottom'].set_color('grey') ax1.spines['left'].set_color('grey') mpl.rcParams['text.color'] = '555555' ax1.patch.set_visible(False) plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_substitutions { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'ST'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "substitutions.$id"; tprint $fh, " dat = [ "; for (my $i=0; $i<@vals; $i++) { my $val=$vals[$i]; tprint $fh, "\t[$i,'$$val[0]',$$val[1]],\n"; } tprint $fh, "]\n"; tprint $fh, " if plot_substitutions: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ts = [ 'A>G','G>A','C>T','T>C' ] nts = 0 ntv = 0 for x in dat: if x[1] in ts: nts += 1 else: ntv += 1 n = 12 col = list(range(n)) ecol = list(range(n)) for i in range(n): col[i] = '#ffce84' ecol[i] = '#f5c781' col[1] = col[5] = col[6] = col[10] = '#ff9900' ecol[1] = ecol[5] = ecol[6] = ecol[10] = '#ef8f00' ax1 = fig.add_subplot(111) ax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col, edgecolor=ecol) ax1.set_ylabel('Count') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') ax1.spines['top'].set_visible(False) ax1.spines['right'].set_visible(False) ax1.get_xaxis().tick_bottom() ax1.get_yaxis().tick_left() ax1.spines['bottom'].set_color('grey') ax1.spines['left'].set_color('grey') mpl.rcParams['text.color'] = '555555' ax1.patch.set_visible(False) ax1.set_xlim(-0.5,n-0.5) plt.xticks([row[0] for row in dat],[row[1] for row in dat],rotation=45) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_vaf { my ($opts,$id,$type) = @_; my @vals = get_values($opts,$id,'VAF'); if ( !@vals ) { return; } my @sum = (); for my $row (@vals) { my @cnt = split(/,/,$type eq 'snv' ? $$row[1] : $$row[2]); for (my $i=0; $i<@cnt; $i++) { $sum[$i] += $cnt[$i]; } } my $fh = $$opts{plt_fh}; my $img = "vaf.$type.$id"; tprint $fh, " dat = [ "; for (my $i=0; $i<@sum; $i++) { tprint $fh, "\t[".1.0*$i/(scalar @sum).",$sum[$i]],\n"; } tprint $fh, "]\n"; tprint $fh, " if plot_vaf_$type: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) wd = 0.7 # fraction of dx distance min_dx = None for i in range(len(dat)-1): if min_dx==None or min_dx > abs(dat[i+1][0]-dat[i][0]): min_dx = abs(dat[i+1][0]-dat[i][0]) if min_dx==None: min_dx = 1 wd = min_dx*wd ax1.bar([x[0] for x in dat],[x[1] for x in dat],wd) #,**plt_args) ax1.set_ylabel('Count') ax1.set_xlabel('Variant Allele Frequency') ax1.ticklabel_format(style='sci', scilimits=(-2,2), axis='y') ax1.spines['top'].set_visible(False) ax1.spines['right'].set_visible(False) ax1.get_xaxis().tick_bottom() ax1.get_yaxis().tick_left() ax1.spines['bottom'].set_color('grey') ax1.spines['left'].set_color('grey') mpl.rcParams['text.color'] = '555555' ax1.patch.set_visible(False) plt.subplots_adjust(right=0.95,bottom=0.15) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub plot_vaf25_by_sample { my ($opts,$id,$type) = @_; my @vals = get_values($opts,$id,'VAF'); if ( !@vals ) { return; } my @vaf = (); for my $row (@vals) { my @cnt = split(/,/,$type eq 'snv' ? $$row[1] : $$row[2]); my $sum = 0; my $sum25 = 0; for (my $i=0; $i<@cnt; $i++) { if ( 1.0*$i/(scalar @cnt) < 0.25 ) { $sum25 += $cnt[$i] } $sum += $cnt[$i]; } push @vaf, $sum ? $sum25/$sum : 0; } my $fh = $$opts{plt_fh}; my $img = "vaf25.$type.$id"; tprint $fh, " dat = [ "; for (my $i=0; $i<@vaf; $i++) { tprint $fh, "\t[$i,$vaf[$i],'$vals[$i][0]'],\n"; } tprint $fh, "]\n"; tprint $fh, " if plot_vaf25_$type: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[1] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') ax1.set_ylabel('nVAF<0.25') ax1.set_ylim(-0.1,1.1) if sample_names: plt.xticks([int(row[0]) for row in dat],[row[2] for row in dat],**sample_font) plt.subplots_adjust(**sample_margins) else: plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) ax1.set_xlabel('Sample ID') ax1.spines['top'].set_visible(False) ax1.spines['right'].set_visible(False) ax1.get_xaxis().tick_bottom() ax1.get_yaxis().tick_left() ax1.spines['bottom'].set_color('grey') ax1.spines['left'].set_color('grey') mpl.rcParams['text.color'] = '555555' ax1.patch.set_visible(False) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if img_fmt != 'png': plt.savefig('$img.' + img_fmt) plt.close() "; } sub singletons { my ($opts,$id) = @_; my @si_vals = get_values($opts,$id,'SiS'); my $si_snps = $si_vals[0][1]; my $si_indels = $si_vals[0][4]; my $si_irc = sprintf "%.3f", $si_vals[0][6] ? $si_vals[0][5]/($si_vals[0][5]+$si_vals[0][6]) : 0; my $si_tstv = sprintf "%.2f", $si_vals[0][3] ? $si_vals[0][2]/$si_vals[0][3] : 0; my @all_vals = get_values($opts,$id,'AF'); my $nsnps = 0; my $nindels = 0; for my $val (@all_vals) { $nsnps += $$val[1]; $nindels += $$val[4]; } $si_snps = sprintf "%.1f", $nsnps ? $si_snps*100./$nsnps : 0; $si_indels = sprintf "%.1f", $nindels ? $si_indels*100./$nindels : 0; return { snps=>$si_snps, indels=>$si_indels, tstv=>$si_tstv, irc=>$si_irc }; } sub calc_3n_n3n { my (@vals) = @_; my $n3 = 0; my $nn3 = 0; for my $val (@vals) { if ( !($$val[0]%3) ) { $n3++; } else { $nn3++; } } if ( !$nn3 ) { return '-'; } return sprintf("%.2f", $n3/$nn3); } sub fmt_slide3v { my ($opts, $image, $title) = @_; my $n = 0; for my $id (0..2) { if ( -e "$image.$id.$$opts{img_fmt}" ) { $n++; } } if ( !$n ) { return ''; } my $h = $$opts{tex}{slide3v}{"height$n"}; my $slide = q[\vbox{]; for my $id (0..2) { if ( !-e "$image.$id.$$opts{img_fmt}" ) { next; } $slide .= qq[\\centerline{\\includegraphics[height=$h]{\{$image.$id\}.$$opts{img_fmt}}}]; } $slide .= '}'; return qq[ % $title % \\hslide{$title}{$slide} ]; } sub fmt_slide3h { my ($opts, $image, $title) = @_; my $n = 0; for my $id (0..2) { if ( -e "$image.$id.$$opts{img_fmt}" ) { $n++; } } if ( !$n ) { return ''; } my $w = $$opts{tex}{slide3h}{"width$n"}; my $slide = ''; for my $id (0..2) { if ( !-e "$image.$id.$$opts{img_fmt}" ) { next; } $slide .= qq[\\includegraphics[width=$w]{\{$image.$id\}.$$opts{img_fmt}}]; } return qq[ % $title % \\hslide{$title}{$slide} ]; } sub bignum { my ($num) = @_; if ( !defined $num ) { return ''; } if ( !($num=~/^\d+$/) ) { return $num; } my $len = length($num); my $out; for (my $i=0; $i<$len; $i++) { $out .= substr($num,$i,1); if ( $i+1<$len && !(($len-$i-1)%3) ) { $out .= ','; } } return $out; } sub create_pdf { my ($opts) = @_; chdir($$opts{dir}); my @ids = file_ids($opts); my $width = "25.4cm"; # todo: move all this to $$opts{tex} my $height = "19cm"; my $height1 = "13cm"; my $width1 = "23cm"; my $width2 = @ids==3 ? "10.5cm" : "10.5cm"; my $width3 = @ids==3 ? "8cm" : "15cm"; my $args = { width3=>$width3, n=>scalar @ids }; my $engine = ''; system('command -v pdflatex >/dev/null'); if ($? == 0) { $engine = 'pdflatex'; } else { system('command -v tectonic >/dev/null'); if ($? == 0) { $engine = 'tectonic'; } } if (! $engine) { error('Neither pdflatex or tectonic were found in your PATH, impossible to create a PDF') } # Check that xcolor is available my @has_xcolor = $engine eq 'tectonic' || `kpsewhich xcolor.sty`; if ( !@has_xcolor ) { warn("Note: The xcolor.sty package not available, black and white tables only...\n\n"); } my $tex_file = "summary.tex"; my $pdf_file = "summary.pdf"; open(my $tex,'>',$tex_file) or error("$tex_file: $!"); tprint $tex, qq[ % This file was produced by plot-vcfstats, the command line was: % $$opts{args} % % Edit as necessary and recreate the PDF by running % $engine $tex_file % % Slides style and dimensions % \\nonstopmode \\documentclass[17pt]{memoir} \\setstocksize{$height}{$width} \\settrimmedsize{\\stockheight}{\\stockwidth}{*} \\settrims{0pt}{0pt} \\setlrmarginsandblock{1cm}{*}{*} \\setulmarginsandblock{1.5cm}{*}{*} \\setheadfoot{1mm}{1cm} \\setlength{\\parskip}{0pt} \\setheaderspaces{*}{1mm}{*} \\setmarginnotes{1mm}{1mm}{1mm} \\checkandfixthelayout[fixed] \\usepackage{charter} % font \\pagestyle{plain} \\makeevenfoot{plain}{}{}{\\thepage} \\makeoddfoot{plain}{}{}{\\thepage} \\usepackage{graphicx} % For colored tables. If xcolor.sty is not available on your system, % download xcolor.sty.gz LaTeX class style from % http://www.ukern.de/tex/xcolor.html % Unpack and install system-wide or place elsewhere and make available by % setting the TEXINPUTS environment variable (note the colon) % export TEXINPUTS=some/dir: % The list of the recognised path can be obtained by running `kpsepath tex` % \\usepackage{multirow} \\setlength{\\tabcolsep}{0.6em} \\renewcommand{\\arraystretch}{1.2} ]; if ( @has_xcolor ) { tprint $tex, '\usepackage[table]{xcolor}'; } else { tprint $tex, qq[ \\newcommand{\\definecolor}[3]{} \\newcommand{\\columncolor}[1]{} \\newcommand{\\rowcolors}[4]{} \\newcommand{\\arrayrulecolor}[1]{} ]; } tprint $tex, qq[ \\definecolor{hcol1}{rgb}{1,0.6,0} \\definecolor{hcol2}{rgb}{1,0.68,0.2} \\definecolor{row1}{rgb}{1,0.88,0.7} \\definecolor{row2}{rgb}{1,0.92,0.8} % #FFEBCC \\setlength{\\arrayrulewidth}{1.5pt} % Slide headings \\newcommand*{\\head}[1]{{\\Large\\centerline{#1}\\vskip0.5em}} % Slide definition \\newcommand*{\\hslide}[2]{% \\head{#1}% \\begin{vplace}[0.5]\\centerline{#2}\\end{vplace}\\newpage} \\newcommand{\\pdf}[2]{\\IfFileExists{#2}{\\includegraphics[#1]{#2}}{}} % The actual slides \\begin{document} ]; # Table with summary numbers my $slide .= q[ \begin{minipage}{\textwidth}\centering \small \rowcolors*{3}{row2}{row1} \arrayrulecolor{black} \begin{tabular}{l | r r r | r r | r | r} \multicolumn{1}{>{\columncolor{hcol1}}l|}{} & \multicolumn{3}{>{\columncolor{hcol1}}c|}{SNPs} & \multicolumn{2}{>{\columncolor{hcol1}}c|}{indels} & \multicolumn{1}{>{\columncolor{hcol1}}c|}{MNPs} & \multicolumn{1}{>{\columncolor{hcol1}}c}{others} \\\\ % \multicolumn{1}{>{\columncolor{hcol2}}l|}{Callset} & \multicolumn{1}{>{\columncolor{hcol2}}c}{n} & \multicolumn{1}{>{\columncolor{hcol2}}c }{ts/tv} & \multicolumn{1}{>{\columncolor{hcol2}}c|}{\\footnotesize(1st ALT)} & \multicolumn{1}{>{\columncolor{hcol2}}c}{n} & \multicolumn{1}{>{\columncolor{hcol2}}c}{frm$^*$} & \multicolumn{1}{>{\columncolor{hcol2}}c|}{} & \multicolumn{1}{>{\columncolor{hcol2}}c}{} \\\\ \hline ]; my %tex_titles; for my $id (@ids) { my $snps = get_value($opts,$id,'number of SNPs:'); my $indels = get_value($opts,$id,'number of indels:'); my $mnps = get_value($opts,$id,'number of MNPs:'); my $others = get_value($opts,$id,'number of others:'); my $tstv = sprintf "%.2f",get_values($opts,$id,'TSTV',0,2); my $tstv1 = sprintf "%.2f",get_values($opts,$id,'TSTV',0,5); my @frsh = get_values($opts,$id,'FS'); my $frsh = @frsh ? $frsh[0][3] : '--'; my @rc = get_values($opts,$id,'ICS'); my $title = $$opts{title}{$id}; $title =~ s/_/\\_/g; $title =~ s/^\s*\*/\$*\$/; # leading asterisks is eaten by TeX $tex_titles{$id} = $title; $slide .= qq[ $title & ] . bignum($snps) . qq[ & $tstv & $tstv1 & ] . bignum($indels) . qq[ & $frsh & ] . bignum($mnps) . ' & ' . bignum($others) . qq[ \\\\ \n]; } $slide .= q[% \multicolumn{8}{r}{$^*$ frameshift ratio: out/(out+in)} \\\\ \end{tabular} \\\\ \vspace{1em} \begin{tabular}{l | r r r | r r} \multicolumn{1}{>{\columncolor{hcol1}}l|}{} & \multicolumn{3}{>{\columncolor{hcol1}}c|}{singletons {\footnotesize(AC=1)}} & \multicolumn{2}{>{\columncolor{hcol1}}c}{multiallelic} \\\\ % \multicolumn{1}{>{\columncolor{hcol2}}l|}{Callset} & \multicolumn{1}{>{\columncolor{hcol2}}c}{SNPs} & \multicolumn{1}{>{\columncolor{hcol2}}c}{ts/tv} & \multicolumn{1}{>{\columncolor{hcol2}}c}{indels} & \multicolumn{1}{>{\columncolor{hcol2}}c}{sites} & \multicolumn{1}{>{\columncolor{hcol2}}c}{SNPs} \\\\ \hline ]; for my $id (@ids) { my $snps = get_value($opts,$id,'number of SNPs:'); my $s = singletons($opts,$id); my $mals = get_value($opts,$id,'number of multiallelic sites:'); my $msnps = get_value($opts,$id,'number of multiallelic SNP sites:'); my $title = $tex_titles{$id}; $slide .= qq[ $title & $$s{snps}\\% & $$s{tstv} & $$s{indels}\\% & ] . bignum($mals) . ' &' . bignum($msnps) . qq[ \\\\ \n]; } $slide .= q[ \\end{tabular} \\vspace{2em} \\begin{itemize}[-] \\setlength{\\itemsep}{0pt} ]; for my $id (@ids) { my $fname = $$opts{dat}{ID}{$id}[0][0]; if ( $$opts{title}{$id} =~ / \+ / ) { next; } $fname =~ s/.{80}/$&\\\\\\hskip2em /g; $fname =~ s/_/\\_/g; $slide .= qq[\\item $tex_titles{$id} .. \\texttt{\\footnotesize $fname}\n]; } $slide .= q[\\end{itemize}\\end{minipage}]; my $title = exists($$opts{main_title}) ? $$opts{main_title} : 'Summary Numbers'; tprint $tex, qq[ % Table with summary numbers % \\hslide{$title}{$slide} ]; # Venn bars if ( @ids==3 ) { tprint $tex, qq[% % Venn numbers % \\hslide{Total counts}{% \\includegraphics[width=$width2]{{venn_bars.snps}.$$opts{img_fmt}}% \\includegraphics[width=$width2]{{venn_bars.indels}.$$opts{img_fmt}} } ]; } tprint $tex, fmt_slide3v($opts, "tstv_by_sample", 'Ts/Tv by sample'); tprint $tex, fmt_slide3v($opts, "vaf25.snv", 'Fraction of SNVs with VAF$<$25\% by sample'); tprint $tex, fmt_slide3v($opts, "vaf25.indel", 'Fraction of indels with VAF$<$25\% by sample'); tprint $tex, fmt_slide3v($opts, "hets_by_sample", 'Hets vs non-ref Homs by sample'); tprint $tex, fmt_slide3v($opts, "singletons_by_sample", 'Singletons by sample {\normalsize(hets and homs)}'); tprint $tex, fmt_slide3v($opts, "dp_by_sample", 'Average depth by sample'); tprint $tex, fmt_slide3v($opts, "snps_by_sample", 'Number of SNPs by sample'); tprint $tex, fmt_slide3v($opts, "indels_by_sample", 'Number of indels by sample'); if ( scalar get_values($opts,2,'GCsS') ) { tprint $tex, qq[ % Genotype discordance by sample % \\hslide{Genotype discordance by sample}{\\pdf{width=$width1}{gts_by_sample.$$opts{img_fmt}}} ]; } if ( scalar get_values($opts,2,'GCsAF') ) { my @vals = get_values($opts,2,'NRDs'); my $nrd = sprintf "%.2f", $vals[0][0]; my $rr = sprintf "%.2f", $vals[0][1]; my $ra = sprintf "%.2f", $vals[0][2]; my $aa = sprintf "%.2f", $vals[0][3]; my $nsamples = get_value($opts,2,'number of samples:'); my $table = qq[% {\\small \\rowcolors*{1}{row2}{row1}\\arrayrulecolor{black} \\begin{tabular}{c | c | c | c } \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/REF} & \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/ALT} & \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{ALT/ALT} & \\multicolumn{1}{>{\\columncolor{hcol1}}c}{NRDs} \\\\ \\hline $rr\\% & $ra\\% & $aa\\% & $nrd\\% \\\\ \\end{tabular}}]; tprint $tex, qq[ % Genotype discordance by AF % \\head{Genotype discordance by AF}\\begin{vplace}[0.7]\\centerline{$table}% \\centerline{\\pdf{height=$height1}{gts_by_af.$$opts{img_fmt}}}\\end{vplace} \\newpage % dosage r2 by AF % \\hslide{Allelic R\$^2\$ by AF}{\\pdf{height=$height1}{r2_by_af.$$opts{img_fmt}}} ]; } if ( -e "counts_by_af.snps.$$opts{img_fmt}" && -e "counts_by_af.indels.$$opts{img_fmt}" ) { tprint $tex, qq[ % SNP and indel counts by AF % \\hslide{}{\\vbox{\\noindent\\includegraphics[width=$width1]{{counts_by_af.snps}.$$opts{img_fmt}}\\\\% \\noindent\\includegraphics[width=$width1]{{counts_by_af.indels}.$$opts{img_fmt}}}} ]; } if ( -e "overlap_by_af.snps.$$opts{img_fmt}" && -e "overlap_by_af.indels.$$opts{img_fmt}" ) { tprint $tex, qq[ % SNP and indel overlap by AF % \\hslide{}{\\vbox{\\noindent\\includegraphics[width=$width1]{{overlap_by_af.snps}.$$opts{img_fmt}}\\\\% \\noindent\\includegraphics[width=$width1]{{overlap_by_af.indels}.$$opts{img_fmt}}}} ]; } tprint $tex, fmt_slide3h($opts, "tstv_by_af", 'Ts/Tv by AF'); tprint $tex, fmt_slide3h($opts, "tstv_by_qual", 'Ts/Tv stratified by QUAL'); for my $plot (@{$$opts{usr_plots}}) { my $uname = $$plot{uname}; my $fname = $$plot{fname}; tprint $tex, fmt_slide3h($opts, $fname, 'Ts/Tv stratified by '.$uname); } tprint $tex, fmt_slide3h($opts, "indels", 'Indel distribution'); tprint $tex, fmt_slide3v($opts, "indel_vaf", 'Fraction of alternate indel allele'); tprint $tex, fmt_slide3h($opts, "depth", 'Depth distribution'); tprint $tex, fmt_slide3h($opts, "hwe", 'Number of HETs by AF'); tprint $tex, fmt_slide3h($opts, "substitutions", 'Substitution types'); tprint $tex, fmt_slide3h($opts, "vaf.snv", 'SNV Variant Allele Frequency'); tprint $tex, fmt_slide3h($opts, "vaf.indel", 'Indel Variant Allele Frequency'); #tprint $tex, fmt_slide3h($opts, "irc_by_af", 'Indel Repeat Consistency by AF'); #tprint $tex, fmt_slide3h($opts, "irc_by_rlen", 'Indel Consistency by Repeat Type'); tprint $tex, "\n\n\\end{document}\n"; close($tex); $tex_file =~ s{^.+/}{}; my $cmd = "$engine $tex_file >$$opts{logfile} 2>&1"; print STDERR "Creating PDF: $cmd\n" unless !$$opts{verbose}; system($cmd); if ( $? ) { error("The command exited with non-zero status, please consult the output of $engine: $$opts{dir}/$$opts{logfile}\n\n"); } print STDERR "Finished: $$opts{dir}/$pdf_file\n" unless !$$opts{verbose}; } sub merge_vcfstats { my ($opts) = @_; my $fh = *STDOUT; if ( !$$opts{merge} ) { open($fh,'>',"merge.chk") or error("merge.chk: $!\n"); } print $fh "# This file was produced by plot-vcfstats, the command line was:\n# $$opts{args}\n#\n"; for my $sec (@{$$opts{sections}}) { my $sid = $$sec{id}; if ( !exists($$opts{dat}{$sid}) ) { next; } print $fh "# $$sec{header}\n$$sec{exp}\n"; for my $id (sort keys %{$$opts{dat}{$sid}}) { for my $rec (@{$$opts{dat}{$sid}{$id}}) { print $fh "$sid\t$id\t", join("\t",@$rec), "\n"; } } if ( $sid eq 'ID' ) { print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}\n"; # output summary numbers here for my $id (keys %{$$opts{dat}}) { if ( exists($$opts{exp}{$id}) ) { next; } for my $key (@{$$opts{SN_keys}}) { next unless exists $$opts{dat}{$id}{$key}; print $fh "SN\t$id\t$key\t$$opts{dat}{$id}{$key}\n"; } } } } }