#!/usr/bin/env perl # # The MIT License # # Copyright (c) 2024-2025 Genome Research Ltd. # # Authors: # Petr Danecek # Based on code by Jakub Genci, https://github.com/GenciJakub/BcThesis # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. use strict; use warnings; use Data::Dumper; my $opts = parse_params(); process_data($opts); generate_html($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { print "@msg\n"; } print "About: HTML/JavaScript visualization of homozygosity rate and bcftools/roh output\n", "Usage: roh-viz [OPTIONS]\n", "Options:\n", " -c --compress 0|1 Compress the embedded data [1]\n", " -e --embed-d3 0|1 Embed JS libraries for offline rendering [0]\n", " -i, --RoH-file FILE Output of bcftools/roh\n", " -l, --min-length NUM Mimimum length of ROH [1e6]\n", " -o, --output FILE HTML output file\n", " -r, --regions LIST List of chromosomes/regions\n", " -s, --samples LIST List of samples to show\n", " -v, --VCF-file FILE VCF file to determine homozygosity rate\n", " -h, -?, --help This help message\n", "Example:\n", " bcftools roh --AF-dflt 0.5 -G 30 -Or -o roh.txt file.bcf\n", " roh-viz -r roh.txt -v file.bcf -o output.html\n", "\n"; exit -1; } sub parse_params { my $opts = { min_length => 1e6, bcftools => 'bcftools', outdir => '.', d3_url => 'https://d3js.org/d3.v7.min.js', pako_url => 'https://cdnjs.cloudflare.com/ajax/libs/pako/2.0.4/pako.min.js', compress => 1, embed_d3 => 0, bin_size => 100_000, # bin size cmd => join(' ',$0,@ARGV), }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-e' or $arg eq '--embed-d3' ) { $$opts{embed_d3}=shift(@ARGV); next; } if ( $arg eq '-c' or $arg eq '--compress' ) { $$opts{compress}=shift(@ARGV); next; } if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions}=shift(@ARGV); next } if ( $arg eq '-s' || $arg eq '--samples' ) { $$opts{samples}=shift(@ARGV); next } if ( $arg eq '-i' || $arg eq '--RoH-file' ) { $$opts{roh_file}=shift(@ARGV); next } if ( $arg eq '-v' || $arg eq '--VCF-file' ) { $$opts{vcf_file}=shift(@ARGV); next } if ( $arg eq '-l' || $arg eq '--min-length' ) { $$opts{min_length}=shift(@ARGV); next } if ( $arg eq '-o' || $arg eq '--output' ) { $$opts{outfile}=shift(@ARGV); next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{roh_file}) ) { error("Missing the -r option.\n"); } if ( !exists($$opts{vcf_file}) ) { error("Missing the -v option.\n"); } return $opts; } sub cmd { my ($cmd,%args) = @_; if ( $args{verbose} ) { print STDERR $cmd,"\n"; } # Why not to use backticks? Perl calls /bin/sh, which is often bash. To get the correct # status of failing pipes, it must be called with the pipefail option. my $kid_io; my $pid = open($kid_io, "-|"); if ( !defined $pid ) { error("Cannot fork: $!"); } my @out; if ($pid) { # parent @out = <$kid_io>; close($kid_io); } else { # child exec('bash', '-o','pipefail','-c', $cmd) or error("Failed to run the command [bash -o pipefail -c $cmd]: $!"); } if ( exists($args{exit_on_error}) && !$args{exit_on_error} ) { return @out; } my $exit_status = $?; my $status = exists($args{require_status}) ? $args{require_status} : 0; if ( $status ne $exit_status ) { my $msg; if ( $? & 0xff ) { $msg = "The command died with signal ".($? & 0xff); } else { $msg = "The command exited with status ".($? >> 8)." (expected $status)"; } $msg .= ":\n\t$cmd\n\n"; if ( @out ) { $msg .= join('',@out,"\n\n"); } error($msg); } return @out; } sub format_number_with_commas { my ($number) = @_; $number =~ s/(?<=\d)(?=(\d{3})+$)/,/g; return $number; } sub process_data { my ($opts) = @_; print STDERR "Parsing $$opts{outdir}/roh.txt\n"; open(my $in,"zless $$opts{roh_file} |") or error("zless $$opts{roh_file}: $!"); while (my $line=<$in>) { chomp($line); if ( $line=~/^#/ ) { if ( $line=~/The command line was:\s+/ ) { $$opts{roh_cmd} = $'; } next; } my @col = split(/\t/,$line); if ( $col[0] ne 'RG' ) { next; } # RG [2]Sample [3]Chromosome [4]Start [5]End [6]Length (bp) [7]Number of markers [8]Quality (average fwd-bwd phred score) my (undef,$smpl,$chr,$beg,$end,undef,$nsnp,$qual) = (@col); if ( $end - $beg >= $$opts{min_length} ) { push @{$$opts{roh}{$chr}{$smpl}},"$beg,$end,$nsnp,$qual"; } } close($in) or error("close failed: zless $$opts{in_file}"); my $bin_size = $$opts{bin_size}; my %chr_len = (); my %dat = (); my %smpl = (); my $smpl = exists($$opts{samples}) ? "-s $$opts{samples}" : ''; my $regs = exists($$opts{regions}) ? "-r $$opts{regions}" : ''; my $cmd = qq[$$opts{bcftools} query $$opts{vcf_file} -f'[%CHROM\\t%POS\\t%SAMPLE\\t%GT\\n]' -i'GT="alt"' $smpl $regs]; print STDERR "Parsing $$opts{vcf_file}\n"; print STDERR " $cmd\n"; open($in,"$cmd |") or error("$cmd: $!"); while (my $line=<$in>) { chomp($line); my ($chr,$pos,$smpl,$gt) = split(/\t/,$line); my %gt = (); for my $x (split(m{[|/]},$gt)) { $gt{$x}=1; } # split by the phase symbol, eg 0/1 and 0|1 my $is_hom = scalar %gt == 1 ? 1 : 0; if ( !exists($dat{$chr}) ) { print STDERR "."; } my $bin = int($pos/$bin_size)*$bin_size + $bin_size/2; $dat{$chr}{$smpl}{$bin}{$is_hom}++; if ( !defined $chr_len{$chr} or $pos > $chr_len{$chr} ) { $chr_len{$chr} = $pos; } $smpl{$smpl} = 1; } close($in) or error("close failed: $cmd"); print STDERR "\n"; $$opts{dat} = \%dat; $$opts{chr} = [sort {$chr_len{$b}<=>$chr_len{$a}} keys %chr_len]; for my $chr (keys %chr_len) { my $chr_id = $chr; $chr_id =~ s/\./_/g; if ( !($chr_id=~/^chr/i) ) { $chr_id = "chr$chr_id"; } $$opts{chr_id}{$chr} = $chr_id; } $$opts{smpl} = [sort keys %smpl]; $$opts{chr_len} = \%chr_len; } sub generate_html { my ($opts) = @_; my $d3js = qq[]; my $pakojs = $$opts{compress} ? qq[] : ''; if ( $$opts{embed_d3} ) { $d3js = ''; $pakojs = $$opts{compress} ? '' : ''; } my $fh = \*STDOUT; if ( exists($$opts{outfile}) ) { open($fh,'>',$$opts{outfile}) or error("$$opts{outfile}: $!"); } print {$fh} << "EOT"; $d3js $pakojs EOT # HTML part my @cmd = qq[This file was produced with]; if ( exists($$opts{roh_cmd}) ) { push @cmd,qq[$$opts{roh_cmd}]; } if ( exists($$opts{cmd}) ) { push @cmd,qq[$$opts{cmd}]; } my $cmd = join('
',@cmd); my $bin_size = format_number_with_commas($$opts{bin_size}); print {$fh} << "EOT";
Bin size $bin_size bp
EOT for my $chr (@{$$opts{chr}}) { print $fh qq[
\n]; } print {$fh} << "EOT"; $cmd EOT if ( exists($$opts{outfile}) ) { close($fh) or error("close failed: $$opts{outfile}"); } } sub embed_js { my ($opts,$file) = @_; if ( $file=~m{^http}i ) { if ( !($file=~m{([^/]+)$}) ) { error("Could not parse the file name: $file"); } my $dat; my $name = $1; if ( -e $name ) { $dat = join('',`cat $name`); } else { my $tmp = "$$opts{outfile}.rmme"; cmd(qq[wget -O $tmp $file]); $dat = join('',`cat $tmp`); unlink($tmp); } return $dat; } return join('',`cat $file`); }