File Coverage

blib/lib/Bio/MUST/Drivers/Exonerate/Aligned.pm

Criterion	Covered	Total	%
statement	42	159	26.4
branch	0	22	0.0
condition	0	3	0.0
subroutine	14	19	73.6
pod	0	1	0.0
total	56	204	27.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Drivers::Exonerate::Aligned;
2							# ABSTRACT: Bio::MUST driver for running the exonerate alignment program
3							$Bio::MUST::Drivers::Exonerate::Aligned::VERSION = '0.193030';
4	5			5		33	use Moose;
	5					11
	5					40
5	5			5		36425	use namespace::autoclean;
	5					12
	5					49
6
7	5			5		439	use autodie;
	5					10
	5					44
8	5			5		28437	use feature qw(say switch);
	5					10
	5					517
9	5			5		2989	use experimental qw(smartmatch); # to suppress warnings about 'when'
	5					6042
	5					26
10
11							# use Smart::Comments '###';
12
13	5			5		365	use Carp;
	5					10
	5					267
14	5			5		33	use Const::Fast;
	5					21
	5					68
15	5			5		384	use IPC::System::Simple qw(system);
	5					10
	5					222
16	5			5		27	use Module::Runtime qw(use_module);
	5					10
	5					42
17	5			5		348	use Path::Class qw(file);
	5					9
	5					302
18
19	5			5		32	use Bio::MUST::Core;
	5					11
	5					1377
20	5			5		42	use Bio::MUST::Core::Constants qw(:gaps);
	5					10
	5					765
21	5			5		32	use aliased 'Bio::MUST::Core::Ali';
	5					16
	5					52
22	5			5		1132	use aliased 'Bio::MUST::Core::Seq';
	5					11
	5					19
23
24
25							has 'dna_seq' => (
26							is => 'ro',
27							isa => 'Bio::MUST::Core::Seq',
28							required => 1,
29							);
30
31							has 'pep_seq' => (
32							is => 'ro',
33							isa => 'Bio::MUST::Core::Seq',
34							required => 1,
35							);
36
37							# TODO: homogeneize with main class
38							has 'code' => (
39							is => 'ro',
40							isa => 'Str',
41							default => '1',
42							);
43
44							has 'model' => (
45							is => 'ro',
46							isa => 'Str',
47							init_arg => undef,
48							writer => '_set_model',
49							);
50
51							has $_ => (
52							is => 'ro',
53							isa => 'Num',
54							init_arg => undef,
55							writer => '_set_' . $_,
56							) for qw(
57							query_start query_end
58							target_start target_end
59							score
60							);
61
62							has $_ . '_seq' => (
63							is => 'ro',
64							isa => 'Bio::MUST::Core::Seq',
65							init_arg => undef,
66							writer => '_set_' . $_,
67							) for qw(query target spliced);
68
69
70							# TODO: subclass this to avoid code repetition with main class
71							# TODO: handle reverse complement
72
73							sub BUILD {
74	0			0	0		my $self = shift;
75
76							# provision executable
77	0						my $app = use_module('Bio::MUST::Provision::Exonerate')->new;
78	0						$app->meet();
79
80							# build temp Ali file for input DNA seq
81	0						my $dna = Ali->new(
82							seqs => [ $self->dna_seq ],
83							guessing => 0,
84							);
85	0						my $dnafile = $dna->temp_fasta;
86
87							# build temp Ali file for input PEP seq
88	0						my $pep = Ali->new(
89							seqs => [ $self->pep_seq ],
90							guessing => 0,
91							);
92	0						my $pepfile = $pep->temp_fasta;
93
94							# execute exonerate
95	0						my $outfile = $self->_exonerate($dnafile, $pepfile);
96
97							# parse outfile and populate seqs
98	0						$self->_parse_outfile($outfile);
99
100							# check that everything ran fine
101	0	0	0				unless ( $self->query_seq->seq_len ) {
		0
102	0						carp '[BMD] Warning: exonerate could not align seqs;'
103							. ' returning empty seqs!';
104							### dnafile: join q{}, "\n", file($dnafile)->slurp
105							### pepfile: join q{}, "\n", file($pepfile)->slurp
106							}
107	0						elsif ( $self->query_seq->seq_len != $self->target_seq->seq_len ) {
108	0						carp '[BMD] Warning: query and target seqs not of same length!';
109							### dnafile: join q{}, "\n", file($dnafile)->slurp
110							### pepfile: join q{}, "\n", file($pepfile)->slurp
111							}
112							elsif ( $self->spliced_seq->seq_len != 3 * $self->target_seq->seq_len ) {
113							carp '[BMD] Warning: DNA and protein target seqs not of same length!';
114							### dnafile: join q{}, "\n", file($dnafile)->slurp
115							### pepfile: join q{}, "\n", file($pepfile)->slurp
116							}
117
118							# unlink temp files
119	0						file($_)->remove for ($dnafile, $pepfile, $outfile);
120
121	0						return;
122							}
123
124
125							const my %ARGS_FOR => (
126							'lc' => 'protein2genome',
127							'bf' => 'protein2genome:bestfit --exhaustive',
128							);
129
130							const my $NEW_HSP => qr{C4 \s Alignment:}xms;
131
132							sub _exonerate {
133	0			0			my $self = shift;
134	0						my $dnafile = shift;
135	0						my $pepfile = shift;
136
137							# first try with heuristic 'protein2genome' model
138							# if it yields > 1 HSPs try with exhaustive 'protein2genome:bestfit' model
139							# it this better model fails then return first HSP from lesser model
140
141	0						my $pgm = 'exonerate';
142	0						my $code = $self->code;
143
144	0						my $return = q{};
145	0						my $remove = q{};
146
147	0						my %outfile_for;
148							my $hsp_n;
149
150							MODEL:
151	0						for my $model ( qw(lc bf) ) {
152
153	0						$remove = $return; # lesser model failed (if any)
154	0						$return = $model; # try better model
155
156							# create exonerate command
157	0						my $outfile = $dnafile . ".exo.$model.out";
158	0						$outfile_for{$model} = $outfile;
159							my $cmd = qq{$pgm --showvulgar no --showalignment yes}
160							. ' --alignmentwidth 100000' # needed for robust splicing
161	0						. " --verbose 0 --geneticcode $code --model " . $ARGS_FOR{$model}
162							. " --target $dnafile --query $pepfile > $outfile 2> /dev/null"
163							;
164							#### $cmd
165
166							# Note: we ask for a single-line alignment to avoid the difficult
167							# parsing of linebreak-interrupted runs of spaces within introns
168
169							# try to robustly execute exonerate
170	0						my $ret_code = system( [ 0, 1, 127, 139 ], $cmd);
171	0	0					if ($ret_code == 127) {
172	0						carp "[BMD] Warning: cannot execute $pgm command;"
173							. ' returning nothing!';
174	0						return; # This will likely crash calling code but that's OK.
175							}
176	0	0					if ($ret_code == 139) {
177	0						carp "[BMD] Warning: $pgm crashed; skipping model: $model!";
178							# do nothing more to leave loop with accurate $hsp_n
179							}
180	0	0					if ($ret_code == 1) {
181	0						carp "[BMD] Warning: $pgm crashed because of a bad nt seq!";
182	0						return $outfile_for{$model};
183							}
184							# TODO: try to bypass shell (need for absolute path to executable then)
185
186							# check number of HSPs in outfile
187	0						my @lines = file($outfile)->slurp;
188	0						$hsp_n = grep { m/$NEW_HSP/xms } @lines;
	0
189
190							# stop here if only one; otherwise possibly try better model
191	0	0					last MODEL if $hsp_n == 1;
192							}
193
194							# if no HSP then better model was tried and finally failed
195							# thus switch back to lesser model (with > 1 HSPs)
196	0	0					if ($hsp_n == 0) {
197	0						carp "[BMD] Warning: cannot get only one HSP from $pgm;"
198							. ' returning first one!';
199	0						($return, $remove) = ($remove, $return);
200							}
201
202							#### $remove
203							#### $return
204
205							# delete unused file (if any) and return outfile
206	0						file( $outfile_for{ $remove} )->remove;
207	0						$self->_set_model( $return);
208
209	0						return $outfile_for{$return};
210							}
211
212
213							# Raw score: 1215
214							# Query range: 0 -> 247
215							# Target range: 1429 -> 2184
216
217							# 43 : ysGlyIleValLysGluIleIle<->AspProGlyArgGlyAlaProLeuAlaArgValVal : 61
218							# \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|
219							# ysGlyIleValLysGluIleIleHisAspProGlyArgGlyAlaProLeuAlaArgValVal
220							# 1554 : AGGGCATAGTGAAAGAAATCATACACGACCCAGGAAGAGGCGCTCCCTTAGCCAGAGTTGTT : 1613
221							#
222							# ...
223							#
224							# 145 : sThrArgValArgLeuProSerGlySerLysLysValLeuSerSerThrAsnArg-AlaVal : 164
225							# \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|#.!!\|\|\|
226							# sThrArgValArgLeuProSerGlySerLysLysValLeuSerSerThrAsnArg#UnkVal
227							# 1863 : AACGCGCGTGCGTCTTCCGTCAGGATCGAAAAAGGTGCTATCGTCGACGAATCGAGNCCGTG : 1923
228							#
229							# ...
230							#
231							# 177 : gGlnArgLysAlaHisLeuIleGluIleGlnValAsnGlyGlyThrValAlaGlnLysValAsp : 197
232							# \|\|\|\|!:!\|\|\|\|\|\|\|\|\|\|\|\|!!:\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|:!!\|\|\|\|\|\| !!\|\|\|\|\|\|\|\|\|
233							# gGlnLysLysAlaHisLeuMetGluIleGlnValAsnGlyGlySerValAla***LysValAsp
234							# 321 : GCAAAAGAAGGCCCACCTTATGGAAATCCAAGTCAATGGTGGATCCGTTGCGTAAAAGGTCGAC : 383
235							#
236							# ...
237							#
238							# 91 : rgLysHisProTrpHisValIleArgIleAsnLysMetLeuSerCysAlaGlyAlaAspArg{L : 111
239							# \|\|:!!\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|:!!\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|{\|
240							# rgGlnHisProTrpHisValLeuArgIleAsnLysMetLeuSerCysAlaGlyAlaAspArg{L
241							# 257 : GACAACATCCATGGCACGTTCTTAGAATCAACAAGATGCTTTCCTGCGCAGGTGCCGATAGA{C : 319
242							#
243							# 112 : e} >>>> Target Intron 1 >>>> {u}GlnGlnGlyMetArgGlyAlaPheGlyLys : 121
244							# \|} 86 bp {\|} \|\|\|\|\|\|\|\|\| !\|\|\|\|\|\|\|\|\|\|\|\|
245							# e}++ --{u}UnkUnkGlyMetArgHisAlaPheGlyLys
246							# 320 : T}gt.........................nn{N}NNNNNCGGTATGAGACATGCTTTCGGAAAG : 435
247							#
248							# ...
249							#
250							# 126 : SerGlnAlaLeuAspValIleHisGluTyrPheLys : 137
251							# !!!! !\|\|\|\|\|\|!!: !!:!! ! !\|\|\|
252							# ThrProAlaLeuGluPheValUnkUnkValHisLys
253							# 689 : ACGCCAGCTCTGGAGTTCGTAAKGRGAGTACATAAG : 652
254
255							# regexes for parsing outfile
256							const my $LEFT_NUM => qr{-?\d+\s+:}xms;
257							const my $RIGHT_NUM => qr{:\s+-?\d+}xms;
258							const my $SEQ_STR => qr{[A-Za-z0-9\.\#\*\+\-\<\>\{\}\ ]+}xms;
259
260							sub _parse_outfile {
261	0			0			my $self = shift;
262	0						my $outfile = shift;
263
264							# read output file (human-readable alignment)
265	0						open my $out, '<', $outfile;
266							#### $outfile
267
268	0						my $query_seq = q{};
269	0						my $target_seq = q{};
270	0						my $spliced_seq = q{};
271
272	0						my $hsp_n = 0;
273	0						my $in_block = 0;
274
275							LINE:
276	0						while (my $line = <$out>) {
277	0						chomp $line;
278							##### $line
279
280	0						given ($line) {
281
282							# header lines
283	0						when (/$NEW_HSP/xms) {
284	0						$hsp_n++;
285	0	0					last LINE if $hsp_n > 1; # process only first HSP
286							}
287	0						when (/^ \s*Raw \s score:\s (\d+) /xms) {
288	0						$self->_set_score( $1 );
289							}
290	0						when (/^ \s*Query \s range:\s (\d+) \s->\s (\d+) /xms) {
291	0						$self->_set_query_start( $1 + 1 );
292	0						$self->_set_query_end( $2);
293							} # Note: coordinates are converted to BLAST/GFF standard
294	0						when (/^ \s*Target \s range:\s (\d+) \s->\s (\d+) /xms) {
295	0						$self->_set_target_start( $1 + 1 );
296	0						$self->_set_target_end( $2);
297							} # Note: coordinates are converted to BLAST/GFF standard
298
299							# alignment lines
300	0						when (/^ \s* $LEFT_NUM \s ($SEQ_STR) \s $RIGHT_NUM \s* $/xms) {
301	0	0					unless ($in_block) {
302							# this is query line
303	0						$query_seq .= $1;
304	0						$in_block = 1;
305	0						<$out>; # skip midline (may be hard to parse)
306							}
307							else {
308							# this is DNA
309	0						$spliced_seq .= $1;
310	0						$in_block = 0;
311							}
312							}
313	0						when (/^ \s+ ($SEQ_STR) \s* $/xms) {
314							# this is likely target line
315	0	0					$target_seq .= $1 if $in_block;
316							}
317							}
318							}
319
320							# remove introns and frameshifts in all seqs simultaneously
321	0						($query_seq, $target_seq, $spliced_seq)
322							= _splice_seqs($query_seq, $target_seq, $spliced_seq);
323
324							# store aligned pep seqs
325	0						$self->_set_query(
326							Seq->new(
327							seq_id => $self->pep_seq->full_id,
328							seq => _canonize_seq(1, $query_seq)
329							)
330							);
331	0						$self->_set_target(
332							Seq->new(
333							seq_id => $self->dna_seq->full_id,
334							seq => _canonize_seq(1, $target_seq)
335							)
336							);
337	0						$self->_set_spliced(
338							Seq->new(
339							seq_id => $self->dna_seq->full_id,
340							seq => _canonize_seq(0, $spliced_seq)
341							)
342							);
343
344	0						return;
345							}
346
347
348							# regexes for splicing seqs
349							const my $ANGLES => qr{(?:<\|>){4}}xms;
350							const my $TARGET => qr{Target \s Intron \s \d+}xms;
351							const my $INTRON => qr{\s{2} $ANGLES \s $TARGET \s $ANGLES \s{2}}xms;
352
353							sub _splice_seqs {
354	0			0			my $query_seq = shift;
355	0						my $target_seq = shift;
356	0						my $spliced_seq = shift;
357							#### $query_seq
358							#### $target_seq
359							#### $spliced_seq
360
361							# preserve framshifts in target_seq
362							# Note: this is needed to avoid splicing out real frameshifts below
363	0						$target_seq =~ s/\#{2}\-{3}/##Unk/xmsg; # 1-nt deletion frameshifts
364	0						$target_seq =~ s/\#{3} /Unk/xmsg; # ???
365
366							# store intron pos and 'lengths' in query_seq
367	0						my @introns;
368	0						while ($query_seq =~ m/($INTRON)/xmsg) {
369	0						my $len = length $1;
370	0						my $pos = pos($query_seq) - $len;
371	0						push @introns, [ $pos, $len ];
372							}
373
374							# store frameshift pos and 'lengths' in target_seq
375	0						while ($target_seq =~ m/(\#+)/xmsg) {
376	0						my $len = length $1;
377	0						my $pos = pos($target_seq) - $len;
378	0						push @introns, [ $pos, $len ];
379							}
380
381							# splice out introns starting from seq ends
382							# @introns
383	0						for my $intron (sort { $b->[0] <=> $a->[0] } @introns) {
	0
384	0						my ($pos, $len) = @{$intron};
	0
385	0						my $query_in = substr $query_seq, $pos, $len, q{};
386	0						my $target_in = substr $target_seq, $pos, $len, q{};
387	0						my $spliced_in = substr $spliced_seq, $pos, $len, q{};
388
389							#### $query_in
390							#### $target_in
391							#### $spliced_in
392							}
393
394							#### $query_seq
395							#### $target_seq
396							#### $spliced_seq
397
398	0						return ($query_seq, $target_seq, $spliced_seq);
399							}
400
401
402							const my %SHORT_FOR => (
403							Ala => 'A',
404							Asx => 'B',
405							Cys => 'C',
406							Asp => 'D',
407							Glu => 'E',
408							Phe => 'F',
409							Gly => 'G',
410							His => 'H',
411							Ile => 'I',
412							Lys => 'K',
413							Leu => 'L',
414							Met => 'M',
415							Asn => 'N',
416							Pro => 'P',
417							Gln => 'Q',
418							Arg => 'R',
419							Ser => 'S',
420							Thr => 'T',
421							Val => 'V',
422							Trp => 'W',
423							Xaa => 'X',
424							Tyr => 'Y',
425							Glx => 'Z',
426							Unk => $FRAMESHIFT,
427							'---' => '-',
428							);
429
430							sub _canonize_seq {
431	0			0			my $prot = shift;
432	0						my $seq3 = shift;
433							#### $seq3
434
435							# example of 1-nt deletion
436							# ArgGlyAsnAla--GlyGlyGlnHisHisHisArgIle...LysPhePheThrArgArgAlaGlu--GluLysIleLys
437							#
438							# ArgGlyAsnAla##---GlyGlnHisHisHisArgIle...LysPhePheThrArgArgAlaGlu##---LysIleLys
439							# CGTGGTAATGCTGT---GGTCAGCACCACCACAGAATT...AAGTTCTTCACACGCCGTGCTGAGGA---AAGATCAAG
440
441							# ensure no residual frameshift
442	0						$seq3 =~ tr/\{\}//d; # spliced codons
443	0						$seq3 =~ s/\<\-\>/---/xmsg; # insertion
444	0						$seq3 =~ s/\*{3} /Unk/xmsg; # STOP
445							#### $seq3
446
447	0	0					return $seq3 unless $prot;
448
449							# 'translate' 3-letter AAs to 1-letter AAs
450	0						my $seq1 = q{};
451	0						for (my $i = 0; $i < length $seq3; $i += 3) {
452	0						my $triplet = substr $seq3, $i, 3;
453	0						$seq1 .= $SHORT_FOR{ $triplet };
454							}
455							#### $seq1
456
457	0						return $seq1;
458							}
459
460							__PACKAGE__->meta->make_immutable;
461							1;
462
463							__END__
464
465							=pod
466
467							=head1 NAME
468
469							Bio::MUST::Drivers::Exonerate::Aligned - Bio::MUST driver for running the exonerate alignment program
470
471							=head1 VERSION
472
473							version 0.193030
474
475							=head1 SYNOPSIS
476
477							# TODO
478
479							=head1 DESCRIPTION
480
481							# TODO
482
483							=head1 AUTHOR
484
485							Denis BAURAIN <denis.baurain@uliege.be>
486
487							=head1 COPYRIGHT AND LICENSE
488
489							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
490
491							This is free software; you can redistribute it and/or modify it under
492							the same terms as the Perl 5 programming language system itself.
493
494							=cut