=head1 LICENSE Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute Copyright [2016-2025] EMBL-European Bioinformatics Institute Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. =head1 CONTACT Ensembl <http://www.ensembl.org/info/about/contact/index.html> =cut =head1 NAME Paralogues =head1 SYNOPSIS mv Paralogues.pm ~/.vep/Plugins # Find paralogue regions of all input variants using Ensembl paralogue annotation # (automatically created if not in current directory) and fetch variants within # those regions from VEP cache and whose clinical significance partially # matches 'pathogenic' ./vep -i variations.vcf --cache --plugin Paralogues # Find paralogue regions of input variants using Ensembl paralogue annotation # (automatically created if not in current directory) and fetch variants within # those regions from a custom VCF file (regardless of their clinical significance) ./vep -i variations.vcf --cache --plugin Paralogues,vcf=/path/to/file.vcf,clnsig=ignore # Same using a custom VCF file but filtering for 'pathogenic' variants ./vep -i variations.vcf --cache --plugin Paralogues,vcf=/path/to/file.vcf,clnsig_col=CLNSIG # Same but output different fields ./vep -i variations.vcf --cache --plugin Paralogues,vcf=/path/to/file.vcf.gz,clnsig_col=CLNSIG,fields=identifier:alleles:CLNSIG:CLNVI:GENEINFO # Use a file with regions matched to paralogue variants -- fastest method; # download 'matches' files from https://ftp.ensembl.org/pub/current_variation/Paralogues ./vep -i variations.vcf --cache --plugin Paralogues,matches=Paralogues.pm_homo_sapiens_113_GRCh38_clinvar_20240107.tsv.gz,clnsig=ignore # Same using a 'matches' file but filtering for 'pathogenic' variants (default) ./vep -i variations.vcf --cache --plugin Paralogues,matches=Paralogues.pm_homo_sapiens_113_GRCh38_clinvar_20240107.tsv.gz # Fetch all Ensembl variants in paralogue proteins using only the Ensembl API # (requires database access) ./vep -i variations.vcf --database --plugin Paralogues,mode=remote,clnsig=ignore =head1 DESCRIPTION A VEP plugin that fetches variants overlapping the genomic coordinates of amino acids aligned between paralogue proteins. This is useful to predict the pathogenicity of variants in paralogue positions. This plugin can determine paralogue regions for a variant based on: 1. Pre-computed matches between genomic regions and paralogue variants. For this approach, either download the file calculated using ClinVar variants and respective TBI from https://ftp.ensembl.org/pub/current_variation/Paralogues or create such matches file yourself. Details on how to create such 'matches' file can be found below. 2. Ensembl paralogue annotation. These versatile annotations can look up paralogue regions for all variants from any species with Ensembl paralogues, but take longer to process. After retrieving the paralogue regions, this plugin fetches variants overlapping those regions from one of the following sources (by this order): 1. Custom VCF via the 'vcf' parameter 2. VEP cache (in cache/offline mode) 3. Ensembl API (in database mode) To create a 'matches' file based on a custom set of variants, run VEP using `--plugin Paralogues,regions=1,min_perc_cov=0,min_perc_pos=0,clnsig=ignore` and the `--vcf` option. Afterwards, process the output of the VEP command: `perl -e "use Paralogues; Paralogues::prepare_matches_file('variant_effect_output.txt')"` Options are passed to the plugin as key=value pairs: matches : Tabix-indexed TSV file with pre-computed matches between genomic regions and paralogue variants (fastest method); this option is incompatible with the `paralogues` and `vcf` options dir : Directory with paralogue annotation (the annotation is created in this folder if the paralogue annotation files do not exist) paralogues : Tabix-indexed TSV file with paralogue annotation (if the file does not exist, the annotation is automatically created); if set to 'remote', the annotation is fetched but not stored vcf : Tabix-indexed VCF file to fetch variant information (if not used, variants are fetched from VEP cache in cache/offline mode or Ensembl API in database mode) fields : Colon-separated list of information from paralogue variants to output (default: 'identifier:alleles:clinical_significance'); keyword 'all' can be used to print all fields; available fields include 'identifier', 'chromosome', 'start', 'alleles', 'perc_cov', 'perc_pos', and 'clinical_significance' (if `clnsig_col` is defined for custom VCF); additional fields are available depending on variant source: - VEP cache: 'end' and 'strand' - Ensembl API: 'end', 'strand', 'source', 'consequence' and 'gene_symbol' - Custom VCF: 'quality', 'filter' and name of INFO fields - Matches file: check column names in file header clnsig : Clinical significance term to filter variants (default: 'pathogenic'); use 'ignore' to fetch all paralogue variants, regardless of clinical significance clnsig_match : Type of match when filtering variants based on option `clnsig`: 'partial' (default), 'exact' or 'regex' clnsig_col : Column name containing clinical significance in custom VCF (required with `vcf` option and if `clnsig` is not 'ignore') min_perc_cov : Minimum alignment percentage of the peptide associated with the input variant (default: 0) min_perc_pos : Minimum percentage of positivity (similarity) between both homologues (default: 50) regions : Boolean value to return regions used to look up paralogue variants (default: 1) The tabix utility must be installed in your path to read the paralogue annotation, the custom VCF file and the matches file. =cut package Paralogues; @EXPORT_OK = qw(&process_data); use strict; use warnings; use List::Util qw(any); use File::Basename; use Compress::Zlib; use File::Spec; use Bio::SimpleAlign; use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); use Bio::EnsEMBL::Variation::VariationFeature; use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); use Bio::EnsEMBL::IO::Parser::VCF4Tabix; use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); our @MATCHES_FIELDS = ( 'identifier', 'chromosome', 'start', 'end', 'alleles', 'clinical_significance', 'perc_cov', 'perc_pos', ); our @VCF_FIELDS = ( 'identifier', 'chromosome', 'start', 'alleles', 'quality', 'filter', 'clinical_significance', 'perc_cov', 'perc_pos', ); our @CACHE_FIELDS = ( 'identifier', 'chromosome', 'start', 'end', 'strand', 'alleles', 'clinical_significance', 'perc_cov', 'perc_pos', ); our @API_FIELDS = ( @CACHE_FIELDS, 'source', 'consequence', 'gene_symbol', ); our @MATCHES_HEADER = qw/chr start end feature perc_cov perc_pos var_id var_chr var_start var_end var_ref var_alt var_feature/; ## FETCH VARIANTS -------------------------------------------------------------- =head2 _create_vf Arg[1] : hash Description: Create variant feature from a variant hash Returntype : Bio::EnsEMBL::Variation::VariationFeature Status : Experimental =cut sub _create_vf { my ($self, $var) = @_; my $slice = Bio::EnsEMBL::Slice->new( -seq_region_name => $var->{chr}, -start => $var->{start}, -end => $var->{end}, -strand => $var->{strand}, ); my $vf = Bio::EnsEMBL::Variation::VariationFeature->new( -variation_name => $var->{variation_name}, -seq_region_name => $var->{chr}, -start => $var->{start}, -end => $var->{end}, -slice => $slice, -strand => $var->{strand}, -allele_string => $var->{allele_string}, -is_somatic => $var->{somatic}, -clinical_significance => $var->{clin_sig} ? [split /,/, $var->{clin_sig}] : [] ); return $vf; } =head2 _fetch_cache_vars Arg[1] : $seq_region_name Arg[2] : $start Arg[3] : $end Description: Fetch variant features within a given genomic region from VEP cache; supports indexed (faster) and non-indexed (slower) cache Returntype : arrayref of Bio::EnsEMBL::Variation::VariationFeature Status : Experimental =cut sub _fetch_cache_vars { my ($self, $chr, $start, $end) = @_; my ($as, $variants); if (defined($as = $self->_get_AnnotationSource('VariationTabix'))) { # code based on AnnotationSource::Cache::VariationTabix my $source_chr = $as->get_source_chr_name($chr); my $tabix_obj = $as->_get_tabix_obj($source_chr); return unless $tabix_obj; my $iter = $tabix_obj->query(sprintf("%s:%i-%i", $source_chr, $start - 1, $end + 1)); return unless $iter; while(my $line = $iter->next) { chomp $line; my $var = $as->parse_variation($line); push @$variants, $self->_create_vf($var); } } elsif (defined($as = $self->_get_AnnotationSource('Variation'))) { warn("Using non-indexed VEP cache is slow; for optimal performance, please use indexed VEP cache\n") unless $self->{skip_slow_warning}; $self->{skip_slow_warning} = 1; # code based on AnnotationSource::Cache::Variation and AnnotationSource my $cache_region_size = $as->{cache_region_size}; my ($source_chr, $min, $max, $seen, @regions) = $as->get_regions_from_coords( $chr, $start, $end, undef, $cache_region_size, $as->up_down_size()); for my $region (@regions) { my ($c, $s) = @$region; my $file = $as->get_dump_file_name( $c, ($s * $cache_region_size) + 1, ($s + 1) * $cache_region_size ); next unless -e $file; my $gz = gzopen($file, 'rb'); my $line; while($gz->gzreadline($line)) { chomp $line; # ignore non-overlapping variants my $var = $as->parse_variation($line); $var->{chr} = $c; next unless overlap($start, $end, $var->{start}, $var->{end}); push @$variants, $self->_create_vf($var); } } } else { die "ERROR: could not get variants from VEP cache"; } return $variants; } =head2 _fetch_database_vars Arg[1] : $seq_region_name Arg[2] : $start Arg[3] : $end Description: Fetch variant features within a given genomic region from Ensembl database (requires database connection) Returntype : arrayref of Bio::EnsEMBL::Variation::VariationFeature Status : Experimental =cut sub _fetch_database_vars { my ($self, $chr, $start, $end) = @_; my $config = $self->{config}; my $reg = $config->{reg}; my $species = $config->{species}; $self->{slice_adaptor} ||= $reg->get_adaptor($species, 'core', 'slice'); $self->{vf_adaptor} ||= $reg->get_adaptor($species, 'variation', 'variationfeature'); my $slice = $self->{slice_adaptor}->fetch_by_region('chromosome', $chr, $start, $end); next unless defined $slice; return $self->{vf_adaptor}->fetch_all_by_Slice($slice); } =head2 _fetch_database_vars Arg[1] : $seq_region_name Arg[2] : $start Arg[3] : $end Description: Fetch variant features within a given genomic region from: - VEP cache (indexed or non-indexed) if --cache is enabled - Ensembl database if --database is enabled Throws error if --cache and --database are not enabled Returntype : arrayref of Bio::EnsEMBL::Variation::VariationFeature Status : Experimental =cut sub fetch_variants { my ($self, $chr, $start, $end) = @_; my $variants; if ($self->config->{cache}) { $variants = $self->_fetch_cache_vars($chr, $start, $end); } elsif ($self->config->{database}) { $variants = $self->_fetch_database_vars($chr, $start, $end); } else { die("ERROR: cannot fetch variants from cache (no cache available?) neither from Ensembl API (database mode must be enabled)"); } return $variants; } ## GENERATE PARALOGUE ANNOTATION ----------------------------------------------- sub _prepare_filename { my $self = shift; my $config = $self->{config}; # Prepare file name based on species, database version and assembly my $pkg = __PACKAGE__.'.pm'; my $species = $config->{species}; my $version = $config->{db_version} || 'Bio::EnsEMBL::Registry'->software_version; my @name = ($pkg, $species, $version); if( $species eq 'homo_sapiens' || $species eq 'human'){ my $assembly = $config->{assembly} || $config->{human_assembly}; die "specify assembly using --assembly [assembly]\n" unless defined $assembly; push(@name, $assembly) if defined $assembly; } return join("_", @name) . ".tsv.gz"; } sub _get_homology_adaptor { my $self = shift; my $config = $self->{config}; my $reg = $config->{reg}; if (!defined $self->{ha}) { $self->{ha} = $reg->get_adaptor( "multi", "compara", "homology" ); } if (!defined $self->{ha}) { # reconnect to DB without species param if ($config->{host}) { $reg->load_registry_from_db( -host => $config->{host}, -user => $config->{user}, -pass => $config->{password}, -port => $config->{port}, -db_version => $config->{db_version}, -no_cache => $config->{no_slice_cache}, ); } $self->{ha} = $reg->get_adaptor( "multi", "compara", "homology" ); } return $self->{ha}; } sub _get_method_link_species_set_id { my $ha = shift; my $species = shift; my $type = shift; # Get ID corresponding to species and paralogues my @query = qq/ SELECT method_link_species_set_id FROM method_link_species_set JOIN method_link USING (method_link_id) JOIN species_set ss USING (species_set_id) JOIN genome_db gdb ON ss.genome_db_id = gdb.genome_db_id WHERE gdb.name = "$species" AND type = "$type"; /; my $sth = $ha->db->dbc->prepare(@query, { mysql_use_result => 1 }); $sth->execute(); my $id = @{$sth->fetchrow_arrayref}[0]; $sth->finish(); return $id; } sub _write_paralogue_annotation { my ($sth, $file) = @_; # Open lock my $lock = "$file\.lock"; open LOCK, ">$lock" or die "ERROR: $file not found and cannot write to lock file $lock\n"; print LOCK "1\n"; close LOCK; open OUT, " | bgzip -c > $file" or die "ERROR: cannot write to file $file\n"; # write header my @header = qw(homology_id chr start end strand stable_id version perc_cov perc_id perc_pos cigar_line paralogue_chr paralogue_start paralogue_end paralogue_strand paralogue_stable_id paralogue_version paralogue_cigar_line); print OUT "#", join("\t", @header), "\n"; while (my $line = $sth->fetchrow_arrayref()) { print OUT join("\t", @$line), "\n"; } close OUT; unlink($lock); return $file; } sub _generate_paralogue_annotation { my $self = shift; my $file = shift; my $config = $self->{config}; my $species = $config->{species}; my $reg = $config->{reg}; die("ERROR: Cannot generate paralogue annotation in offline mode\n") if $config->{offline}; die("ERROR: Cannot generate paralogue annotation in REST mode\n") if $config->{rest}; print "### Paralogues plugin: Querying Ensembl compara database (this may take a few minutes)\n" unless $config->{quiet}; my $mlss_id = _get_method_link_species_set_id($self->_get_homology_adaptor, $species, 'ENSEMBL_PARALOGUES'); # Create paralogue annotation my @query = qq/ SELECT hm.homology_id, df.name AS chr, sm.dnafrag_start AS start, sm.dnafrag_end AS end, sm.dnafrag_strand AS strand, sm.stable_id, sm.version, hm.perc_cov, hm.perc_id, hm.perc_pos, hm.cigar_line, df2.name AS paralogue_chr, sm2.dnafrag_start AS paralogue_start, sm2.dnafrag_end AS paralogue_end, sm2.dnafrag_strand AS paralogue_strand, sm2.stable_id AS paralogue_id, sm2.version AS paralogue_version, hm2.cigar_line AS paralogue_cigar_line -- Reference proteins FROM homology_member hm JOIN homology h USING (homology_id) JOIN seq_member sm USING (seq_member_id) JOIN dnafrag df USING (dnafrag_id) -- Paralogue proteins JOIN homology_member hm2 ON hm.homology_id = hm2.homology_id JOIN seq_member sm2 ON hm2.seq_member_id = sm2.seq_member_id JOIN dnafrag df2 ON sm2.dnafrag_id = df2.dnafrag_id WHERE method_link_species_set_id = ${mlss_id} AND sm.source_name = 'ENSEMBLPEP' AND sm2.stable_id != sm.stable_id ORDER BY df.name, sm.dnafrag_start, sm.dnafrag_end; /; my $sth = $self->{ha}->db->dbc->prepare(@query, { mysql_use_result => 1}); $sth->execute(); print "### Paralogues plugin: Writing to file\n" unless $config->{quiet}; _write_paralogue_annotation($sth, $file); $sth->finish(); print "### Paralogues plugin: Creating tabix index\n" unless $config->{quiet}; system "tabix -s2 -b3 -e4 $file" and die "ERROR: tabix index creation failed\n"; print "### Paralogues plugin: file ready at $file\n" unless $config->{quiet}; return 1; } sub _get_database_homologies { my $self = shift; my $transcript = shift; my $config = $self->{config}; my $species = $config->{species}; my $reg = $config->{reg}; $self->{ga} ||= $reg->get_adaptor($species, 'core', 'gene'); my $gene = $self->{ga}->fetch_by_stable_id( $transcript->{_gene_stable_id} ); my $homologies = $self->_get_homology_adaptor->fetch_all_by_Gene( $gene, -METHOD_LINK_TYPE => 'ENSEMBL_PARALOGUES', -TARGET_SPECIES => $species); return $homologies; } ## RETRIEVE PARALOGUES FROM ANNOTATION ----------------------------------------- sub _compose_alignment_from_cigar { my $seq = shift; my $cigar = shift; die "Unsupported characters found in CIGAR line: $cigar\n" unless $cigar =~ /^[0-9MD]+$/; my $aln_str = $seq; my $index = 0; while ($cigar =~ /(\d*)([MD])/g) { my $num = $1 || 1; my $letter = $2; substr($aln_str, $index, 0) = '-' x $num if $letter =~ /D/; $index += $num; } return $aln_str; } sub _create_SimpleAlign { my ($self, $homology_id, $protein, $ref_cigar, $paralogue, $par_cigar, $perc_cov, $perc_pos) = @_; my $par = $paralogue->translation; my $par_id = $par->stable_id; my $par_seq = $par->seq; my $ref_id = $protein->stable_id; my $ref_seq = $protein->seq; my $aln = Bio::SimpleAlign->new(); $aln->id($homology_id); $aln->add_seq(Bio::LocatableSeq->new( -SEQ => _compose_alignment_from_cigar($ref_seq, $ref_cigar), -ALPHABET => 'protein', -START => 1, -END => length($ref_seq), -ID => $ref_id, -STRAND => 0 )); $aln->add_seq(Bio::LocatableSeq->new( -SEQ => _compose_alignment_from_cigar($par_seq, $par_cigar), -ALPHABET => 'protein', -START => 1, -END => length($par_seq), -ID => $par_id, -STRAND => 0 )); # add alignment stats $aln->{_stats}->{ref_perc_cov} = $perc_cov; $aln->{_stats}->{ref_perc_pos} = $perc_pos; # add paralogue information to retrieve from cache later on my $key = $par_id . '_info'; $aln->{$key}->{_chr} = $paralogue->seq_region_name; $aln->{$key}->{_start} = $par->genomic_start; $aln->{$key}->{_strand} = $paralogue->strand; return $aln; } sub _get_paralogues { my ($self, $vf, $translation) = @_; my $var_chr = $vf->seq_region_name || $vf->{chr}; my $var_start = $vf->start - 2; my $var_end = $vf->end; # get translation info my $translation_id = $translation->stable_id; my $translation_seq = $translation->seq; # get paralogues for this variant region my $file = $self->{paralogues}; my @data = @{$self->get_data($var_chr, $var_start, $var_end, $file)}; my $paralogues = []; for (@data) { my ( $homology_id, $chr, $start, $end, $strand, $protein_id, $version, $perc_cov, $perc_id, $perc_pos, $cigar, $para_chr, $para_start, $para_end, $para_strand, $para_id, $para_version, $para_cigar ) = split /\t/, $_; next unless $translation_id eq $protein_id; next unless $perc_cov >= $self->{min_perc_cov}; next unless $perc_pos >= $self->{min_perc_pos}; my $paralogue = $self->_get_transcript_from_translation( $para_id, $para_chr, $para_start, $para_end); my $aln = $self->_create_SimpleAlign($homology_id, $translation, $cigar, $paralogue, $para_cigar, $perc_cov, $perc_pos); push @$paralogues, $aln; } return $paralogues; } sub _get_paralogue_coords { my $self = shift; my $tva = shift; my $aln = shift; my $translation_id = $tva->transcript->translation->stable_id; my $translation_start = $tva->base_variation_feature_overlap->translation_start; # avoid cases where $translation_start is located in stop codon return unless $translation_start <= $tva->transcript->translation->length; # identify paralogue protein my @proteins = keys %{ $aln->{'_start_end_lists'} }; my $paralogue; if ($translation_id eq $proteins[0]) { $paralogue = $proteins[1]; } elsif ($translation_id eq $proteins[1]) { $paralogue = $proteins[0]; } else { return; } # get genomic coordinates for aligned residue in paralogue my $col = $aln->column_from_residue_number($translation_id, $translation_start); my $coords = $aln->get_seq_by_id($paralogue)->location_from_column($col); return unless defined $coords and $coords->location_type eq 'EXACT'; my $tr_info = $aln->{$paralogue . '_info'}; my $tr_chr = $tr_info->{_chr} if defined $tr_info; my $tr_genomic_start = $tr_info->{_start} if defined $tr_info; my $tr_strand = $tr_info->{_strand} if defined $tr_info; my $para_tr = $self->_get_transcript_from_translation( $paralogue, $tr_chr, $tr_genomic_start, $tr_strand); my ($para_coords) = $para_tr->pep2genomic($coords->start, $coords->start); my $chr = $para_tr->seq_region_name; my $start = $para_coords->start; my $end = $para_coords->end; my $transcript_id = $para_tr->stable_id || ''; return ($chr, $start, $end, $transcript_id); } ## GET DATA FROM ANNOTATION ---------------------------------------------------- sub _get_config { my $self = shift; if (!defined $self->{config_obj}) { $self->{config_obj} = Bio::EnsEMBL::VEP::Config->new( $self->{config} ); } return $self->{config_obj}; } sub _get_AnnotationSource { my $self = shift; my $filter = shift; my %match = ( 'Transcript' => 'Bio::EnsEMBL::VEP::AnnotationSource::Cache::Transcript', 'Variation' => 'Bio::EnsEMBL::VEP::AnnotationSource::Cache::Variation', 'VariationTabix' => 'Bio::EnsEMBL::VEP::AnnotationSource::Cache::VariationTabix', ); if (!defined $self->{asa}) { # Cache all annotation sources my $cfg = $self->_get_config; $cfg->{_params}->{check_existing} = 1; # enable to fetch variants from VEP cache $self->{asa} = Bio::EnsEMBL::VEP::AnnotationSourceAdaptor->new({config => $cfg}); } my $asa = $self->{asa}; if (defined $asa && $asa->can('get_all_from_cache')) { for (@{$self->{asa}->get_all_from_cache}) { return $_ if ref $_ eq $match{$filter}; } } return undef; } sub _get_transcript_from_translation { my $self = shift; my $protein_id = shift; my $chr = shift; my $start = shift; my $strand = shift; die "No protein identifier given\n" unless defined $protein_id; return $self->{_cache}->{$protein_id} if $self->{_cache}->{$protein_id}; # try to get transcript from cache if enabled if ($self->{config}->{cache} && defined $chr && defined $start && defined $strand) { my $as = $self->_get_AnnotationSource('Transcript'); die "ERROR: could not get transcripts from VEP cache" unless defined $as; my (@regions, $seen, $min_max, $min, $max); my $cache_region_size = $as->{cache_region_size}; my $up_down_size = defined $as->{up_down_size} ? $as->{up_down_size} : $as->up_down_size; ($chr, $min, $max, $seen, @regions) = $as->get_regions_from_coords( $chr, $start, $start, $min_max, $cache_region_size, $up_down_size, $seen); foreach my $region (@regions) { my ($chr, $range) = @$region; my $file = $as->get_dump_file_name( $chr, ($range * $cache_region_size) + 1, ($range + 1) * $cache_region_size ); my @features = @{ $as->deserialized_obj_to_features( $as->deserialize_from_file($file) ) } if -e $file; for my $transcript (@features) { my $translation = $transcript->translation; if ($translation && $translation->stable_id eq $protein_id) { $self->{_cache}->{$protein_id} = $transcript; return $transcript; } } } } # get transcript from database if not returned yet if ($self->{config}->{offline}) { die "Translation $protein_id not cached; avoid using --offline to allow to connect to database\n"; } my $config = $self->_get_config; my $species = $config->species; my $reg = $config->registry; $self->{ta} ||= $reg->get_adaptor($species, 'core', 'translation'); my $transcript = $self->{ta}->fetch_by_stable_id($protein_id)->transcript; $self->{_cache}->{$protein_id} = $transcript; return $transcript; } ## GET PARALOGUE VARIANTS BASED ON MATCHES OR ANNOTATION ----------------------- sub _get_paralogue_vars_from_matches { my ($self, $tva) = @_; my $vf = $tva->variation_feature; my $allele = $tva->base_variation_feature->alt_alleles; # get transcript for this feature (skip if not available) return {} unless $tva->can('transcript'); my $feat = $tva->transcript->stable_id; my $file = $self->{matches}; my $chr = $vf->{chr}; my ($start, $end) = $vf->{start} < $vf->{end} ? ($vf->{start}, $vf->{end}) : ($vf->{end}, $vf->{start}); # get paralogue variants for this region my @data = @{$self->get_data($chr, $start, $end, $file)}; my $all_results = {}; foreach (@data) { next unless $feat eq $_->{feature}; my $var = $_->{var}; $all_results = $self->_prepare_paralogue_vars_output( $var->{chr}, $var->{start}, $var->{end}, $var->{feature}, $_->{perc_cov}, $_->{perc_pos}, [ $var ], $all_results); } return $all_results; } sub _get_paralogue_vars_from_annotation { my ($self, $tva) = @_; my $vf = $tva->variation_feature; my $homologies = []; if ($self->{remote}) { $homologies = $self->_get_database_homologies($tva->transcript); } else { my $translation = $tva->transcript->translation; $homologies = $self->{_cache_homologies}->{$translation->stable_id} ||= $self->_get_paralogues($vf, $translation); } return {} unless @$homologies; my $all_results = {}; for my $aln (@$homologies) { my ($perc_cov, $perc_pos); if ($aln->isa('Bio::EnsEMBL::Compara::Homology')) { my $ref = $aln->get_all_Members->[0]; $perc_cov = $ref->perc_cov; $perc_pos = $ref->perc_pos; $aln = $aln->get_SimpleAlign; } elsif ($aln->isa('Bio::SimpleAlign')) { $perc_cov = $aln->{_stats}->{ref_perc_cov}; $perc_pos = $aln->{_stats}->{ref_perc_pos}; } else { next; } my ($chr, $start, $end, $transcript_id) = $self->_get_paralogue_coords($tva, $aln); next unless defined $chr and defined $start and defined $end; my $variants; if (defined $self->{vcf}) { # get variants from custom VCF file $variants = $self->get_data($chr, $start, $end, $self->{vcf}); } else { # get Ensembl variants from mapped genomic coordinates $variants = $self->fetch_variants($chr, $start, $end); } $all_results = $self->_prepare_paralogue_vars_output( $chr, $start, $end, $transcript_id, $perc_cov, $perc_pos, $variants, $all_results); } return $all_results; } sub _prepare_paralogue_vars_output { my ($self, $chr, $start, $end, $transcript_id, $perc_cov, $perc_pos, $variants, $all_results) = @_; my $is_rest = $self->{config}->{output_format} eq 'json' || $self->{config}->{rest}; foreach my $var (@$variants) { # check clinical significance (if set) my $cln_sig = $var->{clinical_significance}; $cln_sig = [ $cln_sig ] if defined $self->{vcf} or defined $self->{matches}; next unless $self->_is_clinically_significant($cln_sig); my $FUN = (defined $self->{matches} or defined $self->{vcf}) ? '_prepare_vcf_info' : '_summarise_vf'; my $res = $self->$FUN($var, $perc_cov, $perc_pos); if ($is_rest) { my %res_hash; @res_hash{ @{$self->{fields}} } = @{$res}; $res = \%res_hash; } else { $res = join(':', @{$res}); } $all_results = $self->_join_results($all_results, { PARALOGUE_VARIANTS => $res }); } if ($self->{regions}) { my @keys = qw/chromosome start end transcript_id perc_cov perc_pos/; my @values = ($chr, $start, $end, $transcript_id, $perc_cov, $perc_pos); my $regions; if ($is_rest) { my %regions_hash; @regions_hash{@keys} = @values; $regions = \%regions_hash; } else { $regions = join(':', @values); } $all_results = $self->_join_results($all_results, { PARALOGUE_REGIONS => $regions }); } return $all_results; } ## PLUGIN ---------------------------------------------------------------------- sub _get_valid_fields { my $selected = shift; my $available = shift; # return all available fields when using 'all' return $available if $selected eq 'all'; my @fields = split(/:/, $selected); # check if the selected fields exist my @valid; my @invalid; for my $field (@fields) { if ( grep { $_ eq $field } @$available ) { push(@valid, $field); } else { push(@invalid, $field); } } die "ERROR: all fields given are invalid. Available fields are:\n" . join(", ", @$available)."\n" unless @valid; warn "Paralogues plugin: WARNING: the following fields are not valid and were ignored: ", join(", ", @invalid), "\n" if @invalid; return \@valid; } sub _validate_matches_file { my ($self, $params) = @_; die "ERROR: options 'matches' and 'paralogues' are incompatible\n" if defined $params->{paralogues}; die "ERROR: options 'matches' and 'vcf' are incompatible\n" if defined $params->{vcf}; die "ERROR: 'matches=$self->{matches}': file not found\n" unless -e $self->{matches}; die "ERROR: 'matches=$self->{matches}': respective TBI file not found\n" unless -e $self->{matches} . ".tbi" || -e $self->{matches} . ".csi"; return $self; } sub new { my $class = shift; my $self = $class->SUPER::new(@_); $self->expand_left(0); $self->expand_right(0); $self->get_user_params(); my $params = $self->params_to_hash(); my $config = $self->{config}; # Thresholds for minimum percentage of homology similarity and coverage $self->{min_perc_cov} = defined $params->{min_perc_cov} ? $params->{min_perc_cov} : 0; $self->{min_perc_pos} = defined $params->{min_perc_pos} ? $params->{min_perc_pos} : 50; $self->{regions} = defined $params->{regions} ? $params->{regions} : 1; # Prepare clinical significance parameters my $no_clnsig = defined $params->{clnsig} && $params->{clnsig} eq 'ignore'; $self->{clnsig_term} = $params->{clnsig} || 'pathogenic' unless $no_clnsig; if (defined $self->{clnsig_term}) { $self->{clnsig_match} = $params->{clnsig_match} || 'partial'; die "ERROR: clnsig_match only accepts 'exact', 'partial' or 'regex'\n" unless grep { $self->{clnsig_match} eq $_ } ('exact', 'partial', 'regex'); } # Check information to retrieve from paralogue variants my $vcf = $params->{vcf}; my @fields= ('identifier', 'alleles', 'clinical_significance'); # default if (defined $params->{matches}) { # File with matches between variants and their paralogues regions my $file = $params->{matches}; $self->{matches} = $file; $self->{clnsig_col} ||= 'CLNSIG'; $self->add_file($file); $self->_validate_matches_file($params); # Read column names from matches file $self->{matches_col} = `tabix -H ${file}` or die $!; chomp $self->{matches_col}; $self->{matches_col} = [ split /\t/, $self->{matches_col} ]; # Remove basic columns from user-selectable columns for my $elem (@MATCHES_HEADER) { $elem = '#' . $elem if $elem eq 'chr'; # Add hash to first column name $self->{matches_col} = [ grep { $elem ne $_ } @{ $self->{matches_col} } ]; } my $valid_fields = [ @MATCHES_FIELDS, @{$self->{matches_col}} ]; @fields = @{ _get_valid_fields($params->{fields}, $valid_fields) } if defined $params->{fields};; } elsif (defined $vcf) { $self->{vcf} = $vcf; $self->add_file($vcf); # get INFO fields names from VCF my $vcf_file = Bio::EnsEMBL::IO::Parser::VCF4Tabix->open($vcf); my $info = $vcf_file->get_metadata_by_pragma('INFO'); my $info_ids = [ map { $_->{ID} } @$info ]; @fields = @{ _get_valid_fields($params->{fields}, [@VCF_FIELDS, @$info_ids]) } if defined $params->{fields}; # check if clinical significance column exists if (defined $self->{clnsig_term} || defined $params->{clnsig_col}) { $self->{clnsig_col} = $params->{clnsig_col}; die "ERROR: clnsig_col must be set when using a custom VCF unless clnsig=ignore\n" unless defined $self->{clnsig_col} or $no_clnsig; my $filename = basename $vcf; die "ERROR: clnsig_col $self->{clnsig_col} not found in $filename. Available INFO fields are:\n" . join(", ", @$info_ids)."\n" unless grep { $self->{clnsig_col} eq $_ } @$info_ids; } # warn if trying to return clinical significance when the user does not input its VCF field warn("WARNING: clnsig_col not defined; clinical significance of paralogue variants will be empty\n") if !defined $self->{clnsig_col} && grep { /^clinical_significance$/ } @fields; } elsif (defined $params->{fields}) { # valid fields differ if using cache or database my @VAR_FIELDS = $self->{config}->{cache} ? @CACHE_FIELDS : @API_FIELDS; @fields = @{ _get_valid_fields($params->{fields}, \@VAR_FIELDS) }; } $self->{fields} = \@fields; return $self if $self->{matches}; # Check if paralogue annotation should be downloaded $self->{remote} = defined $params->{paralogues} && $params->{paralogues} eq 'remote'; return $self if $self->{remote}; # Generate paralogue annotation my $annot = defined $params->{paralogues} ? $params->{paralogues} : $self->_prepare_filename; my $dir = $params->{dir}; if (defined $dir) { die "ERROR: directory $dir not found" unless -e -d $dir; $annot = File::Spec->catfile( $dir, $annot ); } $self->_generate_paralogue_annotation($annot) unless (-e $annot || -e "$annot.lock"); $self->{paralogues} = $annot; $self->add_file($self->{paralogues}); return $self; } sub feature_types { return ['Feature']; } sub get_header_info { my $self = shift; my $source; if (defined $self->{matches}) { $source = basename $self->{matches}; } elsif (defined $self->{vcf}) { $source = basename $self->{vcf}; } elsif ($self->{config}->{cache}) { $source = 'VEP cache'; } elsif ($self->{config}->{database}) { $source = 'Ensembl API'; } my $fields = join(':', @{ $self->{fields} }); my $description = "Variants from $source in paralogue locations (colon-separated fields: $fields)"; my $header = { PARALOGUE_VARIANTS => $description }; $header->{PARALOGUE_REGIONS} = 'Genomic location of paralogue regions (colon-separated fields: chromosome:start:end:transcript_id:perc_cov:perc_pos)' if $self->{regions}; return $header; } sub _obj_in_arrayref { # Returns whether $obj is equal to any element of $arrayref my ($arrayref, $obj) = @_; for my $elem (@$arrayref) { if (ref $elem eq 'HASH') { next unless ref $obj eq 'HASH'; # Compare if keys are the same length between the two objects my @keys_obj = sort keys %$obj; my @keys_k = sort keys %$elem; next unless $#keys_k eq $#keys_obj; # Compare keys and values between two hashes my $sum = 0; my $total = 0; for (@keys_obj) { # sum equal values $sum += $obj->{$_} eq $elem->{$_} ? 1 : 0; $total++; } return 1 if $sum eq $total; } else { return 1 if $elem eq $obj; } } return 0; } sub _join_results { my $self = shift; my $all_results = shift; my $res = shift; # Create array of results per key for (keys %$res) { $all_results->{$_} = [] unless defined $all_results->{$_}; if (not _obj_in_arrayref($all_results->{$_}, $res->{$_})) { # Avoid storing duplicate elements push(@{ $all_results->{$_} }, $res->{$_}) } } return $all_results; } sub _summarise_vf { my ($self, $vf, $perc_cov, $perc_pos) = @_; my @var; my $clnsig; for my $field (@{ $self->{fields} }) { my $info; if ($field eq 'identifier') { $info = $vf->name; } elsif ($field eq 'chromosome') { $info = $vf->seq_region_name; } elsif ($field eq 'start') { $info = $vf->seq_region_start; } elsif ($field eq 'end') { $info = $vf->seq_region_end; } elsif ($field eq 'strand') { $info = $vf->strand; } elsif ($field eq 'alleles') { $info = $vf->allele_string; } elsif ($field eq 'clinical_significance') { $info = $clnsig ||= join('/', @{$vf->get_all_clinical_significance_states}); } elsif ($field eq 'source') { $info = $vf->source_name; } elsif ($field eq 'consequence') { $info = $vf->display_consequence; } elsif ($field eq 'perc_cov') { $info = $perc_cov; } elsif ($field eq 'perc_pos') { $info = $perc_pos; } elsif ($field eq 'gene_symbol') { my @symbols; for (@{ $vf->{slice}->get_all_Genes }) { push(@symbols, $_->display_xref->display_id) if $_->display_xref; } $info = join('/', @symbols); } push @var, $info || ''; } return \@var; } sub _is_clinically_significant { my ($self, $clnsig) = @_; # if no filter is set, avoid filtering my $filter = $self->{clnsig_term}; return 1 unless defined $filter; return 0 unless any { defined } @$clnsig; my $res; my $match = $self->{clnsig_match}; if ($match eq 'partial') { $filter = "\Q$filter\E"; } elsif ($match eq 'exact') { $filter = "^\Q$filter\E\$"; } elsif ($match eq 'regex') { # no need to do anything } else { return 0; } return grep /$filter/i, @$clnsig; } sub _prepare_vcf_info { my ($self, $data, $perc_cov, $perc_pos)= @_; $data->{perc_cov} = $perc_cov; $data->{perc_pos} = $perc_pos; # prepare data from selected fields my @res; for my $field (@{ $self->{fields} }) { my $value = defined $data->{$field} ? $data->{$field} : ''; $value =~ s/,/ /g; $value =~ s/[:|]/_/g; push @res, $value; } return \@res; } sub run { my ($self, $tva) = @_; # Quick checks to save time my $vf = $tva->variation_feature; my $translation_start = $tva->base_variation_feature_overlap->translation_start; my $translation_end = $tva->base_variation_feature_overlap->translation_end; return {} unless defined $translation_start and defined $translation_end and $translation_start == $translation_end; return $self->{matches} ? $self->_get_paralogue_vars_from_matches($tva) : $self->_get_paralogue_vars_from_annotation($tva); } sub _parse_matches { my ($self, $line, $file) = @_; my ($chrom, $start, $end, $feature, $perc_cov, $perc_pos, $var_id, $var_chr, $var_start, $var_end, $var_ref, $var_alt, $var_feature, @INFO) = split /\t/, $line; my %data = ( 'chromosome' => $chrom, 'start' => $start, 'end' => $end, 'feature' => $feature, 'perc_cov' => $perc_cov, 'perc_pos' => $perc_pos, 'var' => { 'chr' => $var_chr, 'chromosome' => $var_chr, 'start' => $var_start, 'end' => $var_end, 'identifier' => $var_id, 'alleles' => $var_ref . "/" . $var_alt, 'feature' => $var_feature } ); # add all ClinVar INFO fields to variant object $data{var}{ $_ } = shift @INFO for @{$self->{matches_col}}; # Save clinical significance my $clnsig_col = $self->{clnsig_col}; $data{var}{clinical_significance} = $data{var}{$clnsig_col} if $clnsig_col; return \%data; } sub _parse_vcf { my ($self, $line) = @_; my ($chrom, $start, $id, $ref, $alt, $qual, $filter, $info) = split /\t/, $line; # VCF-like adjustment of mismatched substitutions for comparison with VEP if(length($alt) != length($ref)) { my $first_ref = substr($ref, 0, 1); my $first_alt = substr($alt, 0, 1); if ($first_ref eq $first_alt) { $start++; $ref = substr($ref, 1); $alt = substr($alt, 1); $ref ||= '-'; $alt ||= '-'; } } # fetch data from INFO fields my %data = ( 'chromosome' => $chrom, 'start' => $start, 'identifier' => $id, 'alleles' => $ref.'/'.$alt, 'ref' => $ref, 'alt' => $alt, 'quality' => $qual, 'filter' => $filter, ); for my $field ( split /;/, $info ) { my ($key, $value) = split /=/, $field; $data{$key} = $value; } my $clnsig_col = $self->{clnsig_col}; $data{clinical_significance} = $data{$clnsig_col} if $clnsig_col; return \%data; } sub parse_data { my ($self, $line, $file) = @_; if (defined $self->{paralogues} && $file eq $self->{paralogues}) { return $line; } elsif (defined $self->{matches} && $file eq $self->{matches}) { return $self->_parse_matches($line, $file); } else { return $self->_parse_vcf($line); } } sub get_start { return $_[1]->{start}; } sub get_end { return $_[1]->{end}; } sub prepare_matches_file { # Prepare annotation containing matches between genomic regions and paralogue # variant location; the file is then sorted, bgzipped and tabixed # Use this function on VEP output with Paralogues plugin data: # perl -e "use Paralogues; Paralogues::prepare_matches_file('variant_effect_output.txt')" my $results_file = shift; my $outfile = shift || dirname($results_file) . '/' . 'paralogues_matches.tsv'; $results_file = File::Spec->rel2abs($results_file); die "File not found: $results_file\n" unless -e $results_file; my $gz = gzopen($results_file, 'r') or die "Not able to open file: $results_file\n"; open(OUT, '>', $outfile) or die $!; my ($info_fields, $csq_cols); my $has_header = 0; my $progress = 0; # Split each region into new rows while($gz->gzreadline($_) > 0) { chomp; # read INFO columns if ($_ =~ /^##INFO=<ID=(.*?),/) { if ($1 eq 'CSQ') { $_ =~ /.*Format: (.*)">/; $csq_cols = [ split /\|/, $1 ]; } else { push @$info_fields, $1; } } next if $_ =~ /^#/; die "Could not find INFO attribute for CSQ; check if VCF header is okay\n" unless $csq_cols; print("Processing line ", $progress, "...\n") if $progress++ % 100000 == 0; if (!$has_header) { print OUT "#", join("\t", @MATCHES_HEADER, @$info_fields), "\n"; $has_header = 1; } my ($chr, $start, $id, $ref, $alt, $qual, $filter, $info) = split "\t", $_; $info = { map { split /=/ } split(/;/, $info) }; my $csq = $info->{'CSQ'}; for my $each_csq (split /,/, $csq) { my %res; @res{@$csq_cols} = split /\|/, $each_csq; if (defined $res{PARALOGUE_REGIONS} and $res{PARALOGUE_REGIONS} ne '-') { for my $region (split /&/, $res{PARALOGUE_REGIONS}) { $region =~ s/:/\t/g; my $end = $start + length($alt) - 1; my $data = join("\t", $region, $id, $chr, $start, $end, $ref, $alt, $res{Feature}, map { $_ ne "CSQ" and defined $info->{$_} ? $info->{$_} : "NA" } @$info_fields ); print OUT $data, "\n"; } } } } print "Sorting file...\n"; system "cat ${outfile} | (sed -u 1q; sort -k1,1V -k2,2n -k3,3n -k7,7V -k8,8n -k9,9n) > ${outfile}_sorted && rm ${outfile}" and die $!; my $outfile_gz = $outfile . '.gz'; print "Compressing file with bgzip...\n"; system "bgzip -c ${outfile}_sorted > $outfile_gz && rm ${outfile}_sorted" and die $!; print "Indexing file with tabix...\n"; system "tabix -s1 -b2 -e3 $outfile_gz" and die $!; $gz->gzclose(); close OUT; } 1;