| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Bio::Palantir::Roles::Fillable; | 
| 2 |  |  |  |  |  |  | # ABSTRACT: Fillable Moose role for the construction of DomainPlus object arrays and Exploratory methods | 
| 3 |  |  |  |  |  |  | $Bio::Palantir::Roles::Fillable::VERSION = '0.200700'; | 
| 4 | 1 |  |  | 1 |  | 593 | use Moose::Role; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 9 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 | 1 |  |  | 1 |  | 4733 | use autodie; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 8 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 | 1 |  |  | 1 |  | 4553 | use Const::Fast; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 8 |  | 
| 9 | 1 |  |  | 1 |  | 949 | use File::ShareDir qw(dist_dir); | 
|  | 1 |  |  |  |  | 18934 |  | 
|  | 1 |  |  |  |  | 44 |  | 
| 10 | 1 |  |  | 1 |  | 7 | use File::Temp; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 63 |  | 
| 11 |  |  |  |  |  |  |  | 
| 12 | 1 |  |  | 1 |  | 6 | use aliased 'Bio::FastParsers::Hmmer::DomTable'; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 7 |  | 
| 13 | 1 |  |  | 1 |  | 190273 | use aliased 'Bio::Palantir::Refiner::DomainPlus'; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 6 |  | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | const my $DATA_PATH => dist_dir('Bio-Palantir') . '/'; | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | # pHMM database sources: | 
| 19 |  |  |  |  |  |  | # | 
| 20 |  |  |  |  |  |  | # Weber T, Blin K, Duddela S, et al. antiSMASH 3.0-a comprehensive resource for | 
| 21 |  |  |  |  |  |  | # the genome mining of biosynthetic gene clusters. Nucleic Acids Res. | 
| 22 |  |  |  |  |  |  | # 2015;43(W1):W237âW243. doi:10.1093/nar/gkv437 | 
| 23 |  |  |  |  |  |  | # | 
| 24 |  |  |  |  |  |  | # Khayatt, B. I., Overmars, L., Siezen, R. J., & Francke, C. (2013). | 
| 25 |  |  |  |  |  |  | # Classification of the Adenylation and Acyl-Transferase Activity of NRPS and | 
| 26 |  |  |  |  |  |  | # PKS Systems Using Ensembles of Substrate Specific Hidden Markov Models. | 
| 27 |  |  |  |  |  |  | # PLoS ONE, 8(4). http://doi.org/10.1371/journal.pone.0062136 | 
| 28 |  |  |  |  |  |  | # | 
| 29 |  |  |  |  |  |  | # Bushley, K. E., & Turgeon, B. G. (2010). Phylogenomics reveals subfamilies of | 
| 30 |  |  |  |  |  |  | # fungal nonribosomal peptide synthetases and their evolutionary relationships. | 
| 31 |  |  |  |  |  |  | # BMC Evolutionary Biology, 10(1), 26. http://doi.org/10.1186/1471-2148-10-26 | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | # biological data | 
| 34 |  |  |  |  |  |  | my %ADJUSTMENT_FOR = (      # length column useless: this information can be obtained by OO variable | 
| 35 |  |  |  |  |  |  | # for antismash activity | 
| 36 |  |  |  |  |  |  | A                    => { start => 20, end => 100, length => 418 }, | 
| 37 |  |  |  |  |  |  | Red                  => { start => 0,  end => 0  , length => 242 }, # equivalent of TD | 
| 38 |  |  |  |  |  |  | # NRPS | 
| 39 |  |  |  |  |  |  | Condensation         => { start => 0,  end => 150, length => 300 }, | 
| 40 |  |  |  |  |  |  | Condensation_LCL     => { start => 0,  end => 154, length => 296 }, | 
| 41 |  |  |  |  |  |  | Condensation_DCL     => { start => 0,  end => 150, length => 300 }, | 
| 42 |  |  |  |  |  |  | Condensation_Starter => { start => 0,  end => 150, length => 300 }, | 
| 43 |  |  |  |  |  |  | Condensation_Dual    => { start => 0,  end => 157, length => 293 }, | 
| 44 |  |  |  |  |  |  | Epimerization        => { start => 0,  end => 133, length => 317 }, | 
| 45 |  |  |  |  |  |  | Heterocyclization    => { start => 0,  end => 150, length => 300 }, | 
| 46 |  |  |  |  |  |  | X                    => { start => 0,  end => 147, length => 303 }, | 
| 47 |  |  |  |  |  |  | Cglyc                => { start => 0,  end => 151, length => 299 }, | 
| 48 |  |  |  |  |  |  | Condensation_fungi   => { start => 0,  end => 251, length => 209 }, # où faut-il ajuster ? | 
| 49 |  |  |  |  |  |  | 'AMP-binding'        => { start => 27, end => 66,  length => 418 }, | 
| 50 |  |  |  |  |  |  | 'AMP-binding_fungi'  => { start => 4,  end => 0  , length => 507 }, | 
| 51 |  |  |  |  |  |  | ACPS                 => { start => 0,  end => 0  , length => 78  }, | 
| 52 |  |  |  |  |  |  | Aminotran_1_2        => { start => 0,  end => 0  , length => 363 }, | 
| 53 |  |  |  |  |  |  | Aminotran_3          => { start => 0,  end => 0  , length => 339 }, | 
| 54 |  |  |  |  |  |  | Aminotran_4          => { start => 0,  end => 0  , length => 232 }, | 
| 55 |  |  |  |  |  |  | Aminotran_5          => { start => 0,  end => 0  , length => 371 }, | 
| 56 |  |  |  |  |  |  | 'A-OX'               => { start => 0,  end => -250,length => 780 }, # it overlaps following domains | 
| 57 |  |  |  |  |  |  | B                    => { start => 0,  end => 0  , length => 365 }, | 
| 58 |  |  |  |  |  |  | cMT                  => { start => 0,  end => 0  , length => 230 }, | 
| 59 |  |  |  |  |  |  | ECH                  => { start => 0,  end => 0  , length => 245 }, | 
| 60 |  |  |  |  |  |  | Epimerization        => { start => 0,  end => 0  , length => 317 }, | 
| 61 |  |  |  |  |  |  | F                    => { start => 0,  end => 0  , length => 127 }, | 
| 62 |  |  |  |  |  |  | Fkbh                 => { start => 0,  end => 0  , length => 142 }, | 
| 63 |  |  |  |  |  |  | GNAT                 => { start => 0,  end => 0  , length => 139 }, | 
| 64 |  |  |  |  |  |  | Hal                  => { start => 0,  end => 0  , length => 222 }, | 
| 65 |  |  |  |  |  |  | Heterocyclization    => { start => 0,  end => 0  , length => 300 }, | 
| 66 |  |  |  |  |  |  | NAD_binding_4        => { start => 0,  end => 0  , length => 249 }, | 
| 67 |  |  |  |  |  |  | nMT                  => { start => 0,  end => 0  , length => 244 }, | 
| 68 |  |  |  |  |  |  | 'NRPS-COM_Cterm'     => { start => 0,  end => 0  , length => 21  }, | 
| 69 |  |  |  |  |  |  | 'NRPS-COM_Nterm'     => { start => 0,  end => 0  , length => 33  }, | 
| 70 |  |  |  |  |  |  | oMT                  => { start => 0,  end => 0  , length => 280 }, | 
| 71 |  |  |  |  |  |  | PCP                  => { start => 15, end => 0  , length => 70  }, | 
| 72 |  |  |  |  |  |  | 'PP-binding'         => { start => 15, end => 0  , length => 70  }, | 
| 73 |  |  |  |  |  |  | PCP_fungi            => { start => 0,  end => 0  , length => 86  }, | 
| 74 |  |  |  |  |  |  | PS                   => { start => 0,  end => 0  , length => 214 }, | 
| 75 |  |  |  |  |  |  | TD                   => { start => 0,  end => 0  , length => 242 }, | 
| 76 |  |  |  |  |  |  | Thioesterase         => { start => 0,  end => 0  , length => 229 }, | 
| 77 |  |  |  |  |  |  | # PKS | 
| 78 |  |  |  |  |  |  | PKS_KS               => { start => 8,  end => 0, length => 426   }, | 
| 79 |  |  |  |  |  |  | PKS_KR               => { start => 0,  end => 0, length => 185   }, | 
| 80 |  |  |  |  |  |  | PKS_ER               => { start => 0,  end => 0, length => 313   }, | 
| 81 |  |  |  |  |  |  | PKS_DH               => { start => 0,  end => 0, length => 166   }, | 
| 82 |  |  |  |  |  |  | PKS_DH2              => { start => 0,  end => 0, length => 233   }, | 
| 83 |  |  |  |  |  |  | PKS_DHt              => { start => 0,  end => 0, length => 236   }, | 
| 84 |  |  |  |  |  |  | PKS_AT               => { start => 114, end => 22, length => 298 }, | 
| 85 |  |  |  |  |  |  | Polyketide_cyc       => { start => 0,  end => 0, length => 130   }, | 
| 86 |  |  |  |  |  |  | Polyketide_cyc2      => { start => 0,  end => 0, length => 139   }, | 
| 87 |  |  |  |  |  |  | PKS_Docking_Nterm    => { start => 0,  end => 0, length => 28    }, | 
| 88 |  |  |  |  |  |  | PKS_Docking_Cterm    => { start => 0,  end => 0, length => 73 }, | 
| 89 |  |  |  |  |  |  | CAL_domain           => { start => 0,  end => 0, length => 465 }, | 
| 90 |  |  |  |  |  |  | 'Trans-AT_docking'   => { start => 0,  end => 0, length => 155 }, | 
| 91 |  |  |  |  |  |  | ACP                  => { start => 0,  end => 0, length => 73 }, | 
| 92 |  |  |  |  |  |  | ACP_beta             => { start => 0,  end => 0, length => 67 }, | 
| 93 |  |  |  |  |  |  | ); | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  | # public methods | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | sub detect_domains {                        ## no critic (RequireArgUnpacking) | 
| 99 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 100 |  |  |  |  |  |  |  | 
| 101 | 0 |  |  |  |  |  | my ($seq, $gene_pos, $gap_coords, $from_seq) = @_; #TODO switch to hash | 
| 102 |  |  |  |  |  |  |  | 
| 103 | 0 |  |  |  |  |  | my %hit_for = %{ $self->_parse_generic_domains($seq) }; | 
|  | 0 |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | # build DomainPlus objects | 
| 106 | 0 |  |  |  |  |  | my @domains; | 
| 107 | 0 |  |  |  |  |  | for my $hit (keys %hit_for) { | 
| 108 |  |  |  |  |  |  | push @domains, DomainPlus->new( | 
| 109 | 0 |  |  |  |  |  | map { $_ => $hit_for{$hit}{$_} } keys %{ $hit_for{$hit} } | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 110 |  |  |  |  |  |  | ); | 
| 111 |  |  |  |  |  |  | } | 
| 112 |  |  |  |  |  |  |  | 
| 113 | 0 |  |  |  |  |  | $_->_set_protein_sequence($seq) for @domains; | 
| 114 |  |  |  |  |  |  |  | 
| 115 | 0 |  |  |  |  |  | $self->_use_hit_information($_, $gene_pos) for @domains; | 
| 116 |  |  |  |  |  |  |  | 
| 117 | 0 |  |  |  |  |  | $self->_elongate_coordinates(\@domains, $gap_coords); | 
| 118 | 0 |  |  |  |  |  | @domains = $self->_handle_overlaps($from_seq, @domains); | 
| 119 | 0 |  |  |  |  |  | $self->_refine_coordinates(@domains); | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | # subtype the domains | 
| 122 | 0 |  |  |  |  |  | $self->_get_domain_subtype($_) for @domains; | 
| 123 |  |  |  |  |  |  |  | 
| 124 | 0 |  |  |  |  |  | return (@domains); | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | # private methods | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | sub _elongate_coordinates { | 
| 130 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 131 |  |  |  |  |  |  |  | 
| 132 | 0 |  |  |  |  |  | my $domains = shift; | 
| 133 | 0 |  |  |  |  |  | my $gap_coords = shift; | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | # adjustment of domain coordinates to get plausible domain sizes | 
| 136 | 0 |  |  |  |  |  | for my $domain (@{ $domains }) { | 
|  | 0 |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | # begin at profile start even if does not match the profile, and adjust the coords depending of the domain nature | 
| 139 | 0 |  |  |  |  |  | my $start    = $domain->begin - $domain->hmm_from; # complete the domain if the phmm didnt match entirely the sequence | 
| 140 | 0 |  |  |  |  |  | my $end      = $domain->end   + ($domain->tlen - $domain->hmm_to); | 
| 141 |  |  |  |  |  |  |  | 
| 142 | 0 | 0 |  |  |  |  | if ($ADJUSTMENT_FOR{$domain->target_name}) { | 
| 143 | 0 |  | 0 |  |  |  | $start -= $ADJUSTMENT_FOR{$domain->target_name}{start} // 0; | 
| 144 | 0 |  | 0 |  |  |  | $end   += $ADJUSTMENT_FOR{$domain->target_name}{end} // 0; | 
| 145 |  |  |  |  |  |  | } | 
| 146 |  |  |  |  |  |  |  | 
| 147 |  |  |  |  |  |  | # handle truncated domains | 
| 148 |  |  |  |  |  |  |  | 
| 149 |  |  |  |  |  |  | # check strain orientation | 
| 150 | 0 |  |  |  |  |  | my ($gene_begin, $gene_end); | 
| 151 | 0 | 0 |  |  |  |  | if ($self->gene_begin > $self->gene_end) { | 
| 152 | 0 |  |  |  |  |  | $gene_begin = $self->gene_end; | 
| 153 | 0 |  |  |  |  |  | $gene_end   = $self->gene_begin; | 
| 154 |  |  |  |  |  |  | } | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | else { | 
| 157 | 0 |  |  |  |  |  | $gene_begin = $self->gene_begin; | 
| 158 | 0 |  |  |  |  |  | $gene_end   = $self->gene_end; | 
| 159 |  |  |  |  |  |  | } | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | # standard elongation checks | 
| 162 | 0 | 0 |  |  |  |  | unless ($gap_coords) { | 
| 163 | 0 | 0 |  |  |  |  | $start = 1 if $start <= 0;       # as we do not use -1 position before substr() | 
| 164 | 0 | 0 |  |  |  |  | $end = $gene_end if $end > $gene_end; | 
| 165 |  |  |  |  |  |  | } | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | # gap filling elongation check | 
| 168 |  |  |  |  |  |  | else { | 
| 169 | 0 | 0 |  |  |  |  | $start = $gap_coords->[0] if $start < $gap_coords->[0]; | 
| 170 | 0 | 0 |  |  |  |  | $end   = ($gap_coords->[1] - 1) if $end >= $gap_coords->[1]; | 
| 171 |  |  |  |  |  |  | } | 
| 172 |  |  |  |  |  |  |  | 
| 173 | 0 |  |  |  |  |  | my $size = $end - $start + 1; | 
| 174 |  |  |  |  |  |  |  | 
| 175 | 0 |  |  |  |  |  | $domain->_set_begin($start); | 
| 176 | 0 |  |  |  |  |  | $domain->_set_end($end); | 
| 177 | 0 |  |  |  |  |  | $domain->_set_size($size); | 
| 178 | 0 |  |  |  |  |  | $domain->_set_coordinates( [$start, $end] ); | 
| 179 |  |  |  |  |  |  | } | 
| 180 |  |  |  |  |  |  |  | 
| 181 | 0 |  |  |  |  |  | return; | 
| 182 |  |  |  |  |  |  | } | 
| 183 |  |  |  |  |  |  |  | 
| 184 |  |  |  |  |  |  | sub _handle_overlaps {                      ## no critic (RequireArgUnpacking) | 
| 185 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 186 |  |  |  |  |  |  |  | 
| 187 | 0 |  | 0 |  |  |  | my $from_seq = shift // 0; | 
| 188 | 0 |  |  |  |  |  | my @domains = @_; | 
| 189 |  |  |  |  |  |  | my @sorted_domains | 
| 190 | 0 | 0 |  |  |  |  | = sort { $b->score <=> $a->{score} || $a->begin <=> $b->begin } | 
|  | 0 |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | @domains | 
| 192 |  |  |  |  |  |  | ; | 
| 193 |  |  |  |  |  |  |  | 
| 194 | 0 |  |  |  |  |  | my %deja_vu; | 
| 195 |  |  |  |  |  |  | my @non_overlapping_domains; | 
| 196 |  |  |  |  |  |  |  | 
| 197 |  |  |  |  |  |  | REF: | 
| 198 | 0 |  |  |  |  |  | for my $ref_hit ( 0..(@sorted_domains - 1) ) { | 
| 199 |  |  |  |  |  |  |  | 
| 200 | 0 | 0 |  |  |  |  | next REF if $deja_vu{$ref_hit}; | 
| 201 |  |  |  |  |  |  |  | 
| 202 | 0 |  |  |  |  |  | my ($x1, $y1) = map { $sorted_domains[$ref_hit]->$_ } qw(begin end); | 
|  | 0 |  |  |  |  |  |  | 
| 203 | 0 |  |  |  |  |  | $deja_vu{$ref_hit} = 1; | 
| 204 |  |  |  |  |  |  |  | 
| 205 | 0 |  |  |  |  |  | my @overlaps = $ref_hit; | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | CMP: | 
| 208 | 0 |  |  |  |  |  | for my $cmp_hit ( 0..(@sorted_domains - 1) ) { | 
| 209 |  |  |  |  |  |  |  | 
| 210 | 0 | 0 |  |  |  |  | next CMP if $deja_vu{$cmp_hit}; | 
| 211 |  |  |  |  |  |  |  | 
| 212 | 0 |  |  |  |  |  | my ($x2, $y2) = map { $sorted_domains[$cmp_hit]->$_ } qw(begin end); | 
|  | 0 |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  |  | 
| 214 | 0 | 0 | 0 |  |  |  | unless ($x2 > $y1 || $y2 < $x1) {   ## no critic (ProhibitNegativeExpressionsInUnlessAndUntilConditions) | 
| 215 |  |  |  |  |  |  |  | 
| 216 | 0 |  |  |  |  |  | push @overlaps, $cmp_hit; | 
| 217 |  |  |  |  |  |  |  | 
| 218 | 0 | 0 |  |  |  |  | if ($from_seq == 1) { | 
| 219 | 0 | 0 |  |  |  |  | $deja_vu{$cmp_hit} = 1 | 
| 220 |  |  |  |  |  |  | if $sorted_domains[$cmp_hit]->class | 
| 221 |  |  |  |  |  |  | eq $sorted_domains[$ref_hit]->class | 
| 222 |  |  |  |  |  |  | ;     # avoid superposition of domain of the same class | 
| 223 |  |  |  |  |  |  | } | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | else { | 
| 226 | 0 |  |  |  |  |  | $deja_vu{$cmp_hit} = 1; | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  | } | 
| 229 |  |  |  |  |  |  | } | 
| 230 |  |  |  |  |  |  |  | 
| 231 | 0 |  |  |  |  |  | push @non_overlapping_domains, $sorted_domains[ $overlaps[0]  ]; | 
| 232 |  |  |  |  |  |  | } | 
| 233 |  |  |  |  |  |  |  | 
| 234 | 0 |  |  |  |  |  | return(@non_overlapping_domains); | 
| 235 |  |  |  |  |  |  | } | 
| 236 |  |  |  |  |  |  |  | 
| 237 |  |  |  |  |  |  | sub _refine_coordinates {                   ## no critic (RequireArgUnpacking) | 
| 238 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 239 |  |  |  |  |  |  |  | 
| 240 | 0 |  |  |  |  |  | my @domains = @_; | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  | # final refinment of coordinates | 
| 243 | 0 |  |  |  |  |  | my @sorted_domains = sort { $a->begin <=> $b->begin } @domains; | 
|  | 0 |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  |  | 
| 245 | 0 |  |  |  |  |  | for my $i (0..(scalar @sorted_domains - 2)) { | 
| 246 |  |  |  |  |  |  |  | 
| 247 | 0 | 0 |  |  |  |  | if ($sorted_domains[$i]->end >= $sorted_domains[$i + 1]->begin) {        # si le hit recouvre la fin du hit précént | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | my $adjust_start | 
| 250 |  |  |  |  |  |  | = $ADJUSTMENT_FOR{ $sorted_domains[$i + 1]->target_name }{start} | 
| 251 | 0 |  | 0 |  |  |  | // 0 | 
| 252 |  |  |  |  |  |  | ; | 
| 253 |  |  |  |  |  |  |  | 
| 254 | 0 | 0 |  |  |  |  | if ($adjust_start > 0) { # si ajustement start domaine suivant | 
| 255 |  |  |  |  |  |  |  | 
| 256 | 0 | 0 |  |  |  |  | if ( ($sorted_domains[$i + 1]->begin + $adjust_start) | 
| 257 |  |  |  |  |  |  | > $sorted_domains[$i]->end) { # si conserver une partie de l'ajustement est possible | 
| 258 |  |  |  |  |  |  | # et si la partie qui empiète le hit précédent est uen région ajoutée artificiellement --> réduire la taille de la région ajoutée (jusqu'au pHMM tout au plus | 
| 259 |  |  |  |  |  |  | # il vaut mieux commencer par éliminer les parties ajoutées en N-ter d'abord, car elles sont assez limitées. | 
| 260 | 0 |  |  |  |  |  | $sorted_domains[$i + 1]->_set_begin( | 
| 261 |  |  |  |  |  |  | $sorted_domains[$i]->end + 1); | 
| 262 |  |  |  |  |  |  | } | 
| 263 |  |  |  |  |  |  |  | 
| 264 |  |  |  |  |  |  | else { # sinon déduire ajustement start sans endommager les coordonées initiales du domaine | 
| 265 | 0 |  |  |  |  |  | $sorted_domains[$i + 1]->_set_begin( | 
| 266 |  |  |  |  |  |  | $sorted_domains[$i + 1]->begin + $adjust_start); | 
| 267 |  |  |  |  |  |  | } | 
| 268 |  |  |  |  |  |  | } | 
| 269 |  |  |  |  |  |  | } | 
| 270 |  |  |  |  |  |  |  | 
| 271 | 0 | 0 |  |  |  |  | if ($sorted_domains[$i]->end >= $sorted_domains[$i + 1]->begin) {  # si cela ne résoud pas le problème, on peut réduire la taille de la région ajoutée C-ter du domaine précédent (jusqu'au pHMM tout au plus | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | my $adjust_end | 
| 274 |  |  |  |  |  |  | = $ADJUSTMENT_FOR{ $sorted_domains[$i]->target_name }{end} | 
| 275 | 0 |  | 0 |  |  |  | // 0 | 
| 276 |  |  |  |  |  |  | ; | 
| 277 |  |  |  |  |  |  |  | 
| 278 | 0 | 0 |  |  |  |  | if ($adjust_end > 0) {  # si ajustement end | 
| 279 |  |  |  |  |  |  |  | 
| 280 | 0 | 0 |  |  |  |  | if ( ($sorted_domains[$i]->end - $adjust_end) | 
| 281 |  |  |  |  |  |  | < $sorted_domains[$i + 1]->begin) { # si conserver une partie de l'ajustement est possible | 
| 282 |  |  |  |  |  |  |  | 
| 283 | 0 |  |  |  |  |  | $sorted_domains[$i]->_set_end( | 
| 284 |  |  |  |  |  |  | $sorted_domains[$i + 1]->begin - 1); | 
| 285 |  |  |  |  |  |  | } | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | else { # sinon déduire ajustement end sans endommager les coordonées initiales du domaine | 
| 288 |  |  |  |  |  |  |  | 
| 289 | 0 |  |  |  |  |  | $sorted_domains[$i]->_set_end($sorted_domains[$i]->end - $adjust_end); | 
| 290 |  |  |  |  |  |  | } | 
| 291 |  |  |  |  |  |  | } | 
| 292 |  |  |  |  |  |  | # ne rien faire si les parties qui s'empiètent font partie du pHMM --> comment pourrait-on les distinguer ? | 
| 293 |  |  |  |  |  |  | } | 
| 294 |  |  |  |  |  |  | } | 
| 295 |  |  |  |  |  |  |  | 
| 296 |  |  |  |  |  |  | # attribute refined sequence | 
| 297 | 0 |  |  |  |  |  | for my $domain (@domains) { | 
| 298 | 0 |  |  |  |  |  | $domain->_set_size($domain->end - $domain->begin + 1); | 
| 299 |  |  |  |  |  |  |  | 
| 300 | 0 |  |  |  |  |  | my ($seq) = $self->protein_sequence;   # list context | 
| 301 | 0 |  |  |  |  |  | $domain->_set_protein_sequence( | 
| 302 |  |  |  |  |  |  | substr($seq, $domain->begin - 1, $domain->size) ); | 
| 303 |  |  |  |  |  |  | } | 
| 304 |  |  |  |  |  |  |  | 
| 305 | 0 |  |  |  |  |  | return; | 
| 306 |  |  |  |  |  |  | } | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  | sub _get_domain_subtype { | 
| 309 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 310 |  |  |  |  |  |  |  | 
| 311 | 0 |  |  |  |  |  | my $domain = shift; | 
| 312 |  |  |  |  |  |  |  | 
| 313 |  |  |  |  |  |  | # search for A & AT substrate specificity and C & KS subtypes | 
| 314 | 0 |  |  |  |  |  | my %hmmdb_for = ( | 
| 315 |  |  |  |  |  |  | A  => $DATA_PATH . 'A_specificity.hmm', | 
| 316 |  |  |  |  |  |  | AT => $DATA_PATH . 'AT_specificity.hmm', | 
| 317 |  |  |  |  |  |  | C  => $DATA_PATH . 'C_subtypes.hmm', | 
| 318 |  |  |  |  |  |  | KS => $DATA_PATH . 'KS_subtypes.hmm', | 
| 319 |  |  |  |  |  |  | ); | 
| 320 |  |  |  |  |  |  |  | 
| 321 | 0 | 0 |  |  |  |  | unless ($hmmdb_for{$domain->symbol}) { | 
| 322 |  |  |  |  |  |  |  | 
| 323 | 0 | 0 |  |  |  |  | if ($domain->function =~ m/MT | ^Amt$ | Aminotran/xms) { | 
| 324 | 0 |  |  |  |  |  | $domain->_set_subtype($domain->target_name); | 
| 325 |  |  |  |  |  |  | } | 
| 326 |  |  |  |  |  |  |  | 
| 327 |  |  |  |  |  |  | else { | 
| 328 | 0 |  |  |  |  |  | $domain->_set_subtype('NULL'); | 
| 329 |  |  |  |  |  |  | } | 
| 330 |  |  |  |  |  |  |  | 
| 331 | 0 |  |  |  |  |  | $domain->_set_subtype_evalue('NULL'); | 
| 332 | 0 |  |  |  |  |  | $domain->_set_subtype_score('NULL'); | 
| 333 | 0 |  |  |  |  |  | return; | 
| 334 |  |  |  |  |  |  | } | 
| 335 |  |  |  |  |  |  |  | 
| 336 | 0 |  |  |  |  |  | my $seq   = $domain->protein_sequence; | 
| 337 |  |  |  |  |  |  |  | 
| 338 | 0 |  |  |  |  |  | my $hmmdb = $hmmdb_for{$domain->symbol}; | 
| 339 | 0 |  |  |  |  |  | my $tbout = $self->_do_hmmscan($seq, $hmmdb); | 
| 340 |  |  |  |  |  |  |  | 
| 341 |  |  |  |  |  |  | # parse domtblout hmmer report | 
| 342 | 0 |  |  |  |  |  | my $report = DomTable->new( file => $tbout->filename ); | 
| 343 |  |  |  |  |  |  |  | 
| 344 | 0 |  |  |  |  |  | my %subtype_for; | 
| 345 |  |  |  |  |  |  | my $i; | 
| 346 |  |  |  |  |  |  |  | 
| 347 | 0 |  |  |  |  |  | while (my $hit = $report->next_hit) { | 
| 348 |  |  |  |  |  |  |  | 
| 349 | 0 |  |  |  |  |  | $i++; | 
| 350 |  |  |  |  |  |  |  | 
| 351 |  |  |  |  |  |  | $subtype_for{$hit->target_name} = {     # avoid doublons | 
| 352 | 0 |  |  |  |  |  | map { $_ => $hit->$_ } | 
|  | 0 |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | qw(target_name ali_from ali_to hmm_from hmm_to tlen) | 
| 354 |  |  |  |  |  |  | }; | 
| 355 |  |  |  |  |  |  |  | 
| 356 | 0 |  |  |  |  |  | $subtype_for{$hit->target_name}{evalue} = $hit->i_evalue; | 
| 357 | 0 |  |  |  |  |  | $subtype_for{$hit->target_name}{score}  = $hit->dom_score; | 
| 358 |  |  |  |  |  |  | } | 
| 359 |  |  |  |  |  |  |  | 
| 360 | 0 | 0 |  |  |  |  | unless (values %subtype_for) { | 
| 361 | 0 |  |  |  |  |  | $domain->_set_subtype('na'); | 
| 362 | 0 |  |  |  |  |  | $domain->_set_subtype_evalue('na'); | 
| 363 | 0 |  |  |  |  |  | $domain->_set_subtype_score('na'); | 
| 364 | 0 |  |  |  |  |  | return; | 
| 365 |  |  |  |  |  |  | } | 
| 366 |  |  |  |  |  |  |  | 
| 367 |  |  |  |  |  |  | #TODO handle multiple hits for the same pHMM -> give many times the same value | 
| 368 |  |  |  |  |  |  | my @sorted_hits | 
| 369 | 0 |  |  |  |  |  | = sort { $subtype_for{$b}{score} <=> $subtype_for{$a}{score} } | 
|  | 0 |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | keys %subtype_for | 
| 371 |  |  |  |  |  |  | ; | 
| 372 |  |  |  |  |  |  |  | 
| 373 |  |  |  |  |  |  | # get best scoring subtype but also the closest ones (>= 95% than the best score) | 
| 374 | 0 |  |  |  |  |  | my $score_ref = $subtype_for{$sorted_hits[0]}{score}; | 
| 375 |  |  |  |  |  |  | my @keys | 
| 376 | 0 |  |  |  |  |  | = grep { $subtype_for{$_}{score} >= ($score_ref / 100) * 95 } | 
|  | 0 |  |  |  |  |  |  | 
| 377 |  |  |  |  |  |  | @sorted_hits | 
| 378 |  |  |  |  |  |  | ; | 
| 379 |  |  |  |  |  |  |  | 
| 380 | 0 |  |  |  |  |  | my @values  = map { $subtype_for{$_}{target_name} } @keys; | 
|  | 0 |  |  |  |  |  |  | 
| 381 | 0 |  |  |  |  |  | my @evalues = map { $subtype_for{$_}{evalue} } @keys; | 
|  | 0 |  |  |  |  |  |  | 
| 382 | 0 |  |  |  |  |  | my @scores  = map { $subtype_for{$_}{score} } @keys; | 
|  | 0 |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  |  | 
| 384 | 0 |  |  |  |  |  | $domain->_set_subtype( (join '/', @values)); | 
| 385 | 0 |  |  |  |  |  | $domain->_set_subtype_evalue( (join '/', @evalues)); | 
| 386 | 0 |  |  |  |  |  | $domain->_set_subtype_score( (join '/', @scores)); | 
| 387 |  |  |  |  |  |  |  | 
| 388 | 0 |  |  |  |  |  | return; | 
| 389 |  |  |  |  |  |  | } | 
| 390 |  |  |  |  |  |  |  | 
| 391 |  |  |  |  |  |  | # sub _get_docking_domains { | 
| 392 |  |  |  |  |  |  | #     my $self = shift; | 
| 393 |  |  |  |  |  |  | # | 
| 394 |  |  |  |  |  |  | #     my $seq = shift; | 
| 395 |  |  |  |  |  |  | #     my $hmmdb = $DATA_PATH . 'docking.hmm'; | 
| 396 |  |  |  |  |  |  | # | 
| 397 |  |  |  |  |  |  | #     my $tbout = $self->_do_hmmscan($seq, $hmmdb); | 
| 398 |  |  |  |  |  |  | # | 
| 399 |  |  |  |  |  |  | #     # parse domtblout hmmer report | 
| 400 |  |  |  |  |  |  | #     my $report = DomTable->new( file => $tbout->filename ); | 
| 401 |  |  |  |  |  |  | # | 
| 402 |  |  |  |  |  |  | #     my %hit_for; | 
| 403 |  |  |  |  |  |  | #     my $i; | 
| 404 |  |  |  |  |  |  | # | 
| 405 |  |  |  |  |  |  | #     while (my $hit = $report->next_hit) { | 
| 406 |  |  |  |  |  |  | # | 
| 407 |  |  |  |  |  |  | #         if ($ARGV{'--evalue-threshold'}) { | 
| 408 |  |  |  |  |  |  | #             next if $hit->evalue > $ARGV{'--evalue-threshold'}; | 
| 409 |  |  |  |  |  |  | #         } | 
| 410 |  |  |  |  |  |  | #         $i++; | 
| 411 |  |  |  |  |  |  | # | 
| 412 |  |  |  |  |  |  | #         $hit_for{ 'hit_' . $i } = { | 
| 413 |  |  |  |  |  |  | #             map { $_ => $hit->$_ } qw(target_name ali_from ali_to hmm_from hmm_to tlen score evalue ) | 
| 414 |  |  |  |  |  |  | #         }; | 
| 415 |  |  |  |  |  |  | #     } | 
| 416 |  |  |  |  |  |  | # | 
| 417 |  |  |  |  |  |  | #     return \%hit_for; | 
| 418 |  |  |  |  |  |  | # } | 
| 419 |  |  |  |  |  |  |  | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | ## no critic (ProhibitUnusedPrivateSubroutines) | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | sub _get_domain_features { | 
| 424 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 425 |  |  |  |  |  |  |  | 
| 426 | 0 |  |  |  |  |  | my $domain = shift; | 
| 427 | 0 |  | 0 |  |  |  | my $gene_pos = shift // 0; | 
| 428 |  |  |  |  |  |  |  | 
| 429 | 0 |  |  |  |  |  | my $query = $domain->protein_sequence; | 
| 430 |  |  |  |  |  |  |  | 
| 431 | 0 |  |  |  |  |  | my %domain_for = %{ $self->_parse_generic_domains($query) }; | 
|  | 0 |  |  |  |  |  |  | 
| 432 |  |  |  |  |  |  |  | 
| 433 | 0 | 0 |  |  |  |  | unless (%domain_for) { | 
| 434 | 0 |  |  |  |  |  | $domain->_set_function('to_remove'); | 
| 435 | 0 |  |  |  |  |  | return; | 
| 436 |  |  |  |  |  |  | } | 
| 437 |  |  |  |  |  |  |  | 
| 438 |  |  |  |  |  |  | my $best_hit | 
| 439 | 0 |  |  |  |  |  | = (sort { $domain_for{$b}{score} <=> $domain_for{$a}{score} } | 
|  | 0 |  |  |  |  |  |  | 
| 440 |  |  |  |  |  |  | keys %domain_for)[0] | 
| 441 |  |  |  |  |  |  | ; | 
| 442 |  |  |  |  |  |  |  | 
| 443 | 0 |  |  |  |  |  | for my $feature (keys %{ $domain_for{$best_hit} }) { | 
|  | 0 |  |  |  |  |  |  | 
| 444 | 0 |  |  |  |  |  | my $_set_attr = '_set_' . $feature; | 
| 445 | 0 |  |  |  |  |  | $domain->$_set_attr( $domain_for{$best_hit}{$feature} ); | 
| 446 |  |  |  |  |  |  | } | 
| 447 |  |  |  |  |  |  |  | 
| 448 | 0 |  |  |  |  |  | $self->_use_hit_information($domain, $gene_pos); | 
| 449 | 0 |  |  |  |  |  | return; | 
| 450 |  |  |  |  |  |  | } | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | ## use critic | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  |  | 
| 455 |  |  |  |  |  |  | sub _use_hit_information { | 
| 456 | 0 |  |  | 0 |  |  | my $self     = shift; | 
| 457 | 0 |  |  |  |  |  | my $domain   = shift; | 
| 458 | 0 |  |  |  |  |  | my $gene_pos = shift; | 
| 459 |  |  |  |  |  |  |  | 
| 460 | 0 |  |  |  |  |  | $domain->_set_begin($gene_pos + $domain->ali_from); | 
| 461 | 0 |  |  |  |  |  | $domain->_set_end($gene_pos + $domain->ali_to); | 
| 462 |  |  |  |  |  |  |  | 
| 463 | 0 |  |  |  |  |  | $domain->_set_phmm_name($domain->target_name); | 
| 464 |  |  |  |  |  |  |  | 
| 465 |  |  |  |  |  |  |  | 
| 466 | 0 | 0 | 0 |  |  |  | if (! $domain->function || $domain->function =~ m/NA/xmsi) { | 
| 467 | 0 |  |  |  |  |  | $domain->_set_function($domain->target_name) | 
| 468 |  |  |  |  |  |  | } | 
| 469 |  |  |  |  |  |  |  | 
| 470 | 0 |  |  |  |  |  | return; | 
| 471 |  |  |  |  |  |  | } | 
| 472 |  |  |  |  |  |  |  | 
| 473 |  |  |  |  |  |  |  | 
| 474 |  |  |  |  |  |  | sub _parse_generic_domains { | 
| 475 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 476 |  |  |  |  |  |  |  | 
| 477 | 0 |  |  |  |  |  | my $seq = shift; | 
| 478 |  |  |  |  |  |  |  | 
| 479 | 0 |  |  |  |  |  | my $hmmdb = $DATA_PATH . 'generic_domains.hmm'; | 
| 480 |  |  |  |  |  |  |  | 
| 481 | 0 |  |  |  |  |  | my $tbout = $self->_do_hmmscan($seq, $hmmdb); | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  | # parsing of domtblout hmmscan report | 
| 484 | 0 |  |  |  |  |  | my $report = DomTable->new( file => $tbout->filename ); | 
| 485 |  |  |  |  |  |  |  | 
| 486 | 0 |  |  |  |  |  | my $ug = Data::UUID->new; | 
| 487 | 0 |  |  |  |  |  | my %hit_for; | 
| 488 | 0 |  |  |  |  |  | my $evalue_threshold = 10e-3; | 
| 489 | 0 |  |  |  |  |  | my $i = 1; | 
| 490 |  |  |  |  |  |  |  | 
| 491 |  |  |  |  |  |  | HIT: | 
| 492 | 0 |  |  |  |  |  | while (my $hit = $report->next_hit) { | 
| 493 |  |  |  |  |  |  |  | 
| 494 | 0 | 0 |  |  |  |  | next HIT if $hit->evalue > $evalue_threshold; | 
| 495 |  |  |  |  |  |  |  | 
| 496 | 0 |  |  |  |  |  | my $ui = $ug->create_str; | 
| 497 |  |  |  |  |  |  |  | 
| 498 |  |  |  |  |  |  | $hit_for{$ui} = { | 
| 499 | 0 |  |  |  |  |  | map { $_ => $hit->$_ } | 
|  | 0 |  |  |  |  |  |  | 
| 500 |  |  |  |  |  |  | qw(query_name target_name ali_from ali_to hmm_from hmm_to tlen qlen) | 
| 501 |  |  |  |  |  |  | }; | 
| 502 |  |  |  |  |  |  |  | 
| 503 | 0 |  |  |  |  |  | $hit_for{$ui}{score}  = $hit->dom_score; | 
| 504 | 0 |  |  |  |  |  | $hit_for{$ui}{evalue} = $hit->i_evalue;     # i-value = independent evalue ('evalue' return cumulative evalues for the sequence, and c-evalue may be deleted because potentially e-value) | 
| 505 | 0 |  |  |  |  |  | $i++; | 
| 506 |  |  |  |  |  |  | } | 
| 507 |  |  |  |  |  |  |  | 
| 508 | 0 |  |  |  |  |  | return \%hit_for; | 
| 509 |  |  |  |  |  |  | } | 
| 510 |  |  |  |  |  |  |  | 
| 511 |  |  |  |  |  |  | sub _do_hmmscan { | 
| 512 | 0 |  |  | 0 |  |  | my $self  = shift; | 
| 513 | 0 |  |  |  |  |  | my $seq   = shift; | 
| 514 | 0 |  |  |  |  |  | my $hmmdb = shift; | 
| 515 |  |  |  |  |  |  |  | 
| 516 | 0 |  |  |  |  |  | my $query = File::Temp->new( | 
| 517 |  |  |  |  |  |  | template => 'tempfile_XXXXXXXXXXXXXXX', | 
| 518 |  |  |  |  |  |  | suffix => '.faa', | 
| 519 |  |  |  |  |  |  | unlock => 1 | 
| 520 |  |  |  |  |  |  | ); | 
| 521 |  |  |  |  |  |  |  | 
| 522 | 0 |  |  |  |  |  | print $query '>query' . "\n" . $seq; | 
| 523 |  |  |  |  |  |  |  | 
| 524 | 0 |  |  |  |  |  | my $pgm = 'hmmscan'; | 
| 525 |  |  |  |  |  |  |  | 
| 526 | 0 |  |  |  |  |  | my $cpu_n = 1; | 
| 527 | 0 |  |  |  |  |  | my $tbout = File::Temp->new(suffix => '_domtblout.tsv'); | 
| 528 | 0 |  |  |  |  |  | my $opt = ' --domtblout ' . $tbout . ' --cpu ' . $cpu_n; | 
| 529 | 0 |  |  |  |  |  | my $log = File::Temp->new(suffix => '_hmmscan.log', unlock => 1); | 
| 530 |  |  |  |  |  |  |  | 
| 531 | 0 |  |  |  |  |  | my $cmd = "$pgm $opt $hmmdb $query > $log"; | 
| 532 | 0 |  |  |  |  |  | system $cmd; | 
| 533 |  |  |  |  |  |  |  | 
| 534 | 0 |  |  |  |  |  | return $tbout; | 
| 535 |  |  |  |  |  |  | } | 
| 536 |  |  |  |  |  |  |  | 
| 537 | 1 |  |  | 1 |  | 2491 | no Moose::Role; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 6 |  | 
| 538 |  |  |  |  |  |  | 1; | 
| 539 |  |  |  |  |  |  |  | 
| 540 |  |  |  |  |  |  | __END__ | 
| 541 |  |  |  |  |  |  |  | 
| 542 |  |  |  |  |  |  | =pod | 
| 543 |  |  |  |  |  |  |  | 
| 544 |  |  |  |  |  |  | =head1 NAME | 
| 545 |  |  |  |  |  |  |  | 
| 546 |  |  |  |  |  |  | Bio::Palantir::Roles::Fillable - Fillable Moose role for the construction of DomainPlus object arrays and Exploratory methods | 
| 547 |  |  |  |  |  |  |  | 
| 548 |  |  |  |  |  |  | =head1 VERSION | 
| 549 |  |  |  |  |  |  |  | 
| 550 |  |  |  |  |  |  | version 0.200700 | 
| 551 |  |  |  |  |  |  |  | 
| 552 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 553 |  |  |  |  |  |  |  | 
| 554 |  |  |  |  |  |  | # TODO | 
| 555 |  |  |  |  |  |  |  | 
| 556 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 557 |  |  |  |  |  |  |  | 
| 558 |  |  |  |  |  |  | # TODO | 
| 559 |  |  |  |  |  |  |  | 
| 560 |  |  |  |  |  |  | =head1 AUTHOR | 
| 561 |  |  |  |  |  |  |  | 
| 562 |  |  |  |  |  |  | Loic MEUNIER <lmeunier@uliege.be> | 
| 563 |  |  |  |  |  |  |  | 
| 564 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 565 |  |  |  |  |  |  |  | 
| 566 |  |  |  |  |  |  | This software is copyright (c) 2019 by University of Liege / Unit of Eukaryotic Phylogenomics / Loic MEUNIER and Denis BAURAIN. | 
| 567 |  |  |  |  |  |  |  | 
| 568 |  |  |  |  |  |  | This is free software; you can redistribute it and/or modify it under | 
| 569 |  |  |  |  |  |  | the same terms as the Perl 5 programming language system itself. | 
| 570 |  |  |  |  |  |  |  | 
| 571 |  |  |  |  |  |  | =cut |