File Coverage

Bio/AlignIO/phylip.pm
Criterion Covered Total %
statement 169 202 83.6
branch 62 98 63.2
condition 15 33 45.4
subroutine 16 17 94.1
pod 10 10 100.0
total 272 360 75.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::AlignIO::phylip
3             #
4             # Copyright Heikki Lehvaslaiho
5             #
6              
7             =head1 NAME
8              
9             Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
10              
11             =head1 SYNOPSIS
12              
13             Do not use this module directly. Use it via the Bio::AlignIO class.
14              
15             This example shows how to write to phylip format:
16              
17             use Bio::AlignIO;
18             use Bio::SimpleAlign;
19              
20             # Use -idlength to set the name length to something other than
21             # the default 10 if you need longer ids.
22             my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
23             -fh => \*STDOUT,
24             -idlength => 30);
25             # Convert data from one format to another
26             my $gcgstream = Bio::AlignIO->new(-format => 'msf',
27             -file => 't/data/cysprot1a.msf');
28              
29             while( my $aln = $gcgstream->next_aln ) {
30             $phylipstream->write_aln($aln);
31             }
32              
33             # With phylip sequential format format
34             $phylipstream->interleaved(0);
35             # Or initialize the object like this
36             $phylipstream = Bio::AlignIO->new(-interleaved => 0,
37             -format => 'phylip',
38             -fh => \*STDOUT,
39             -idlength => 20 );
40             $gcgstream = Bio::AlignIO->new(-format => 'msf',
41             -file => 't/data/cysprot1a.msf');
42              
43             while( my $aln = $gcgstream->next_aln ) {
44             $phylipstream->write_aln($aln);
45             }
46              
47             This example shows how to read phylip format:
48              
49             my $in = Bio::AlignIO->new(
50             -file => $inFile,
51             -format => 'phylip',
52             -interleaved => 0,
53             -longid => 1
54             );
55              
56             my $out = Bio::AlignIO->new(
57             -file => ">$outFile",
58             -format => 'fasta'
59             );
60              
61             while ( my $aln = $in->next_aln() ) {
62             $out->write_aln($aln);
63             }
64              
65             The -longid argument is required if the input phylip format file
66             has ids with lengths greater then 10 characters.
67              
68             =head1 DESCRIPTION
69              
70             This object can transform Bio::SimpleAlign objects to and from PHYLIP
71             format. By default it works with the interleaved format. By specifying
72             the flag -interleaved =E 0 in the initialization the module can
73             read or write data in sequential format.
74              
75             Reading phylip format with long IDs up to 50 characters is supported by
76             the flag -longid =E1. ID strings can be surrounded by single quotes.
77             They are mandatory only if the IDs contain spaces.
78              
79             =head1 FEEDBACK
80              
81             =head2 Support
82              
83             Please direct usage questions or support issues to the mailing list:
84              
85             I
86              
87             rather than to the module maintainer directly. Many experienced and
88             reponsive experts will be able look at the problem and quickly
89             address it. Please include a thorough description of the problem
90             with code and data examples if at all possible.
91              
92             =head2 Reporting Bugs
93              
94             Report bugs to the Bioperl bug tracking system to help us keep track
95             the bugs and their resolution. Bug reports can be submitted via the
96             web:
97              
98             https://github.com/bioperl/bioperl-live/issues
99              
100             =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
101              
102             Email: heikki at ebi.ac.uk
103             Email: jason at bioperl.org
104              
105             =head1 APPENDIX
106              
107             The rest of the documentation details each of the object
108             methods. Internal methods are usually preceded with a _
109              
110             =cut
111              
112             # Let the code begin...
113              
114             package Bio::AlignIO::phylip;
115 4     4   463 use vars qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
  4         7  
  4         223  
116 4     4   18 use strict;
  4         7  
  4         84  
117              
118 4     4   421 use Bio::SimpleAlign;
  4         8  
  4         127  
119 4     4   1296 use POSIX; # for the rounding call
  4         19970  
  4         16  
120              
121 4     4   8586 use base qw(Bio::AlignIO);
  4         6  
  4         671  
122              
123             BEGIN {
124 4     4   12 $DEFAULTIDLENGTH = 10;
125 4         8 $DEFAULTLINELEN = 60;
126 4         5888 $DEFAULTTAGLEN = 10;
127             }
128              
129             =head2 new
130              
131             Title : new
132             Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
133             -file => '>file',
134             -idlength => 10,
135             -idlinebreak => 1);
136             Function: Initialize a new L reader or writer
137             Returns : L object
138             Args : [specific for writing of phylip format files]
139             -idlength => integer - length of the id (will pad w/
140             spaces if needed) when writing phylip
141             -interleaved => boolean - whether interleaved
142             or sequential format required
143             -line_length => integer of how long a sequence lines should be
144             -idlinebreak => insert a line break after the sequence id
145             so that sequence starts on the next line
146             -flag_SI => whether or not write a "S" or "I" just after
147             the num.seq. and line len., in the first line
148             -tag_length => integer of how long the tags have to be in
149             each line between the space separator. set it
150             to 0 to have 1 tag only.
151             -wrap_sequential => boolean for whether or not sequential
152             format should be broken up or a single line
153             default is false (single line)
154             -longid => boolean to read arbitrary long IDs (default is false)
155              
156             =cut
157              
158             sub _initialize {
159 14     14   31 my ( $self, @args ) = @_;
160 14         49 $self->SUPER::_initialize(@args);
161              
162 14         58 my ( $interleave, $linelen, $idlinebreak,
163             $idlength, $flag_SI, $tag_length, $ws, $longid )
164             = $self->_rearrange(
165             [ qw(INTERLEAVED
166             LINE_LENGTH
167             IDLINEBREAK
168             IDLENGTH
169             FLAG_SI
170             TAG_LENGTH
171             WRAP_SEQUENTIAL
172             LONGID)
173             ],
174             @args
175             );
176 14 50       52 $self->interleaved( $interleave ? 1 : 0 ) if defined $interleave;
    100          
177 14   33     64 $self->idlength( $idlength || $DEFAULTIDLENGTH );
178 14 50       29 $self->id_linebreak(1) if ($idlinebreak);
179 14 50 33     34 $self->line_length($linelen) if defined $linelen && $linelen > 0;
180 14 50       23 $self->flag_SI(1) if ($flag_SI);
181 14 50 33     65 $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN );
182 14 50       40 $self->wrap_sequential( $ws ? 1 : 0 );
183 14 100       38 $self->longid( $longid ? 1 : 0 );
184 14         21 1;
185             }
186              
187             =head2 next_aln
188              
189             Title : next_aln
190             Usage : $aln = $stream->next_aln()
191             Function: returns the next alignment in the stream.
192             Throws an exception if trying to read in PHYLIP
193             sequential format.
194             Returns : L object
195             Args :
196              
197             =cut
198              
199             sub next_aln {
200 10     10 1 54 my $self = shift;
201 10         14 my $entry;
202 10         17 my ($seqcount, $residuecount, %hash, $name,
203             $str, @names, $seqname, $start,
204             $end, $count, $seq
205             );
206              
207 10         65 my $aln = Bio::SimpleAlign->new( -source => 'phylip' );
208              
209             # First, parse up through the header.
210             # If we see a non-blank line that isn't the seqcount and residuecount line
211             # then bail out of next_aln (return)
212 10         53 while ( $entry = $self->_readline ) {
213 10 50       69 if ( $entry =~ /^\s?$/ ) {
    50          
214 0         0 next;
215             } elsif ( $entry =~ /\s*(\d+)\s+(\d+)/ ) {
216 10         37 ( $seqcount, $residuecount ) = ( $1, $2 );
217 10         16 last;
218             } else {
219 0         0 $self->warn(
220             "Failed to parse PHYLIP: Did not see a sequence count and residue count."
221             );
222 0         0 return;
223             }
224             }
225 10 50 33     41 return unless $seqcount and $residuecount;
226              
227             # First alignment section. We expect to see a name and (part of) a sequence.
228 10         19 my $idlen = $self->idlength;
229 10         16 $count = 0;
230              
231 10         23 while ( $entry = $self->_readline ) {
232 74 100       179 if ( $entry =~ /^\s?$/ ) { # eat the newlines
233 4         8 next;
234             }
235              
236             # Names can be in a few different formats:
237             # 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
238             # 2. they can be hacked with a long ID, as passed in with the flag -longid.
239             # 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
240 70 100       110 if ( $self->longid() ) { # 2 or 3
241 12 100       24 if ( $entry =~ /^'(.+)'\s+(.+)$/ ) { # 3. name has single quotes.
242 1         2 $name = $1;
243 1         2 $str = $2;
244             } else { # 2. name does not have single quotes, so should not have spaces.
245             # therefore, the first part of the line is the name and the rest is the seq.
246             # make sure that the line does not lead with extra spaces.
247 11         16 $entry =~ s/^\s+//;
248 11         38 ( $name, $str ) = split( /\s+/, $entry, 2 );
249             }
250             } else { # 1. traditional phylip.
251 58         127 $entry =~ /^(.{1,10})\s(.+)$/;
252 58         83 $name = $1;
253 58         77 $str = $2;
254 58         128 $name =~ s/\s+$//; # eat any trailing spaces
255 58         98 $name =~ s/\s+/_/g;
256             }
257 70         97 push @names, $name;
258              
259             #clean sequence of spaces:
260 70         1166 $str =~ s/\s+//g;
261              
262             # are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
263 70 100       97 if ( ( $self->interleaved ) == 0 ) {
264 4         10 while ( length($str) < $residuecount ) {
265 20         28 $entry = $self->_readline;
266 20         28 $str .= $entry;
267 20         92 $str =~ s/\s+//g;
268 20 50       58 if ( $entry =~ /^\s*$/ ) { # we ran into a newline before we got a complete sequence: bail!
269 0         0 $self->warn(
270             "Failed to parse PHYLIP: Sequence $name was shorter than expected: "
271             . length($str)
272             . " instead of $residuecount." );
273 0         0 last;
274             }
275             }
276             }
277 70         128 $hash{$count} = $str;
278              
279 70         67 $count++;
280              
281             # if we've read as many seqs as we're supposed to, move on.
282 70 100       136 if ( $count == $seqcount ) {
283 10         15 last;
284             }
285             }
286              
287             # if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
288 10 100       17 if ( $self->interleaved ) {
289 9         36 while ( length( $hash{ $seqcount - 1 } ) < $residuecount ) {
290 106         101 $count = 0;
291 106         156 while ( $entry = $self->_readline ) {
292 427 100       780 if ( $entry =~ /^\s*$/ ) { # eat newlines
293 106 50       148 if ( $count != 0 ) { # there was a newline at an unexpected place!
294 0         0 $self->warn(
295             "Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount."
296             );
297 0         0 return;
298             }
299 106         149 next;
300             } else { # start taking in chunks
301 321         1980 $entry =~ s/\s//g;
302 321         462 $hash{$count} .= $entry;
303 321         277 $count++;
304             }
305 321 100       499 if ( $count >= $seqcount ) { # we've read all of the sequences for this chunk, so move on.
306 106         190 last;
307             }
308             }
309             }
310             }
311 10 50       31 if ( ( scalar @names ) != $seqcount ) {
312 0         0 $self->warn(
313             "Failed to parse PHYLIP: Did not see the correct number of seqs: saw "
314             . scalar(@names)
315             . ", expected $seqcount." );
316 0         0 return;
317             }
318 10         27 for ( $count = 0; $count < $seqcount; $count++ ) {
319 70         124 $str = $hash{$count};
320 70         105 my $seqname = $names[$count];
321 70 50       105 if ( length($str) != $residuecount ) {
322 0         0 $self->warn(
323             "Failed to parse PHYLIP: Sequence $seqname was the wrong length: "
324             . length($str)
325             . " instead of $residuecount." );
326             }
327             $seq = Bio::LocatableSeq->new(
328 70         192 '-seq' => $hash{$count},
329             '-display_id' => $seqname
330             );
331 70         176 $aln->add_seq($seq);
332             }
333 10         66 return $aln;
334             }
335              
336             =head2 write_aln
337              
338             Title : write_aln
339             Usage : $stream->write_aln(@aln)
340             Function: writes the $aln object into the stream in phylip format
341             Returns : 1 for success and 0 for error
342             Args : L object
343              
344             =cut
345              
346             sub write_aln {
347 4     4 1 21 my ( $self, @aln ) = @_;
348 4         7 my $count = 0;
349 4         8 my $wrapped = 0;
350 4         6 my $maxname;
351 4         12 my $width = $self->line_length();
352 4         10 my ($length, $date, $name, $seq, $miss,
353             $pad, %hash, @arr, $tempcount, $index,
354             $idlength, $flag_SI, $line_length, $tag_length
355             );
356              
357 4         6 foreach my $aln (@aln) {
358 4 50 33     29 if ( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
359 0         0 $self->warn(
360             "Must provide a Bio::Align::AlignI object when calling write_aln"
361             );
362 0         0 next;
363             }
364 4 50       16 $self->throw("All sequences in the alignment must be the same length")
365             unless $aln->is_flush(1);
366              
367 4         15 $flag_SI = $self->flag_SI();
368 4         12 $aln->set_displayname_flat(); # plain
369 4         12 $length = $aln->length();
370 4 50       14 if ($flag_SI) {
371 0 0       0 if ( $self->interleaved() ) {
372 0         0 $self->_print(
373             sprintf(
374             " %s %s I\n", $aln->num_sequences, $aln->length
375             )
376             );
377             } else {
378 0         0 $self->_print(
379             sprintf(
380             " %s %s S\n", $aln->num_sequences, $aln->length
381             )
382             );
383             }
384             } else {
385 4         114 $self->_print(
386             sprintf( " %s %s\n", $aln->num_sequences, $aln->length ) );
387             }
388              
389 4         11 $idlength = $self->idlength();
390 4         11 $line_length = $self->line_length();
391 4         11 $tag_length = $self->tag_length();
392 4         11 foreach $seq ( $aln->each_seq() ) {
393 16         29 $name = $aln->displayname( $seq->get_nse );
394 16 50       29 if ( $self->longid ) {
395 0 0       0 $self->warn(
396             "The length of the name is over 50 chars long [$name]")
397             if length($name) > 50;
398 0         0 $name = "'$name' ";
399             } else {
400 16 100       46 $name = substr( $name, 0, $idlength )
401             if length($name) > $idlength;
402 16         42 $name = sprintf( "%-" . $idlength . "s", $name );
403 16 50       26 if ( $self->interleaved() ) {
    0          
404 16         28 $name .= ' ';
405             } elsif ( $self->id_linebreak ) {
406 0         0 $name .= "\n";
407             }
408             }
409              
410             #phylip needs dashes not dots
411 16         31 my $seq = $seq->seq();
412 16         28 $seq =~ s/\./-/g;
413 16         29 $hash{$name} = $seq;
414 16         27 push( @arr, $name );
415             }
416              
417 4 50       10 if ( $self->interleaved() ) {
418 4         8 my $numtags;
419 4 50       9 if ( $tag_length <= $line_length ) {
420 4         47 $numtags = floor( $line_length / $tag_length );
421 4         11 $line_length = $tag_length * $numtags;
422             } else {
423 0         0 $numtags = 1;
424             }
425 4         12 while ( $count < $length ) {
426              
427             # there is another block to go!
428 11         16 foreach $name (@arr) {
429 58         63 my $dispname = $name;
430 58 100       101 $dispname = '' if $wrapped;
431 58         174 $self->_print(
432             sprintf( "%" . ( $idlength + 3 ) . "s", $dispname ) );
433 58         56 $tempcount = $count;
434 58         57 $index = 0;
435 58 100       125 $self->debug("residue count: $count\n")
436             if ( $count % 100000 == 0 );
437 58   100     136 while (( $tempcount + $tag_length < $length )
438             && ( $index < $numtags ) ) {
439             $self->_print(
440             sprintf(
441             "%s ",
442             substr(
443 286         919 $hash{$name}, $tempcount, $tag_length
444             )
445             )
446             );
447 286         321 $tempcount += $tag_length;
448 286         610 $index++;
449             }
450              
451             # last
452 58 100       79 if ( $index < $numtags ) {
453              
454             # space to print!
455             $self->_print(
456             sprintf( "%s",
457 16         53 substr( $hash{$name}, $tempcount ) )
458             );
459 16         19 $tempcount += $tag_length;
460             }
461 58         92 $self->_print("\n");
462             }
463 11         31 $self->_print("\n");
464 11         14 $count = $tempcount;
465 11         23 $wrapped = 1;
466             }
467             } else {
468 0         0 foreach $name (@arr) {
469 0         0 my $dispname = $name;
470 0         0 my $line = sprintf( "%s%s\n", $dispname, $hash{$name} );
471 0 0       0 if ( $self->wrap_sequential ) {
472 0         0 $line =~ s/(.{1,$width})/$1\n/g;
473             }
474 0         0 $self->_print($line);
475             }
476             }
477             }
478 4 50 33     13 $self->flush if $self->_flush_on_write && defined $self->_fh;
479 4         20 return 1;
480             }
481              
482             =head2 interleaved
483              
484             Title : interleaved
485             Usage : my $interleaved = $obj->interleaved
486             Function: Get/Set Interleaved status
487             Returns : boolean
488             Args : boolean
489              
490              
491             =cut
492              
493             sub interleaved {
494 101     101 1 122 my ( $self, $value ) = @_;
495 101 100       147 if ( defined $value ) {
496 1 50       5 if ($value) {
497 0         0 $self->{'_interleaved'} = 1
498             } else {
499 1         2 $self->{'_interleaved'} = 0
500             }
501             }
502 101 100       225 return 1 unless defined $self->{'_interleaved'};
503 6         11 return $self->{'_interleaved'};
504             }
505              
506             =head2 flag_SI
507              
508             Title : flag_SI
509             Usage : my $flag = $obj->flag_SI
510             Function: Get/Set if the Sequential/Interleaved flag has to be shown
511             after the number of sequences and sequence length
512             Example :
513             Returns : boolean
514             Args : boolean
515              
516              
517             =cut
518              
519             sub flag_SI {
520 4     4 1 8 my ( $self, $value ) = @_;
521 4         8 my $previous = $self->{'_flag_SI'};
522 4 50       10 if ( defined $value ) {
523 0         0 $self->{'_flag_SI'} = $value;
524             }
525 4         7 return $previous;
526             }
527              
528             =head2 idlength
529              
530             Title : idlength
531             Usage : my $idlength = $obj->idlength
532             Function: Get/Set value of id length
533             Returns : string
534             Args : string
535              
536              
537             =cut
538              
539             sub idlength {
540 28     28 1 44 my ( $self, $value ) = @_;
541 28 100       48 if ( defined $value ) {
542 14         26 $self->{'_idlength'} = $value;
543             }
544 28         37 return $self->{'_idlength'};
545             }
546              
547             =head2 line_length
548              
549             Title : line_length
550             Usage : $obj->line_length($newval)
551             Function:
552             Returns : value of line_length
553             Args : newvalue (optional)
554              
555              
556             =cut
557              
558             sub line_length {
559 8     8 1 13 my ( $self, $value ) = @_;
560 8 50       16 if ( defined $value ) {
561 0         0 $self->{'_line_length'} = $value;
562             }
563 8   33     24 return $self->{'_line_length'} || $DEFAULTLINELEN;
564              
565             }
566              
567             =head2 tag_length
568              
569             Title : tag_length
570             Usage : $obj->tag_length($newval)
571             Function:
572             Example : my $tag_length = $obj->tag_length
573             Returns : value of the length for each space-separated tag in a line
574             Args : newvalue (optional) - set to zero to have one tag per line
575              
576              
577             =cut
578              
579             sub tag_length {
580 18     18 1 29 my ( $self, $value ) = @_;
581 18 50       33 if ( defined $value ) {
582 0         0 $self->{'_tag_length'} = $value;
583             }
584 18   66     42 return $self->{'_tag_length'} || $DEFAULTTAGLEN;
585             }
586              
587             =head2 id_linebreak
588              
589             Title : id_linebreak
590             Usage : $obj->id_linebreak($newval)
591             Function:
592             Returns : value of id_linebreak
593             Args : newvalue (optional)
594              
595              
596             =cut
597              
598             sub id_linebreak {
599 0     0 1 0 my ( $self, $value ) = @_;
600 0 0       0 if ( defined $value ) {
601 0         0 $self->{'_id_linebreak'} = $value;
602             }
603 0   0     0 return $self->{'_id_linebreak'} || 0;
604             }
605              
606             =head2 wrap_sequential
607              
608             Title : wrap_sequential
609             Usage : $obj->wrap_sequential($newval)
610             Function:
611             Returns : value of wrap_sequential
612             Args : newvalue (optional)
613              
614              
615             =cut
616              
617             sub wrap_sequential {
618 14     14 1 26 my ( $self, $value ) = @_;
619 14 50       27 if ( defined $value ) {
620 14         22 $self->{'_wrap_sequential'} = $value;
621             }
622 14   50     30 return $self->{'_wrap_sequential'} || 0;
623             }
624              
625             =head2 longid
626              
627             Title : longid
628             Usage : $obj->longid($newval)
629             Function:
630             Returns : value of longid
631             Args : newvalue (optional)
632              
633              
634             =cut
635              
636             sub longid {
637 100     100 1 122 my ( $self, $value ) = @_;
638 100 100       138 if ( defined $value ) {
639 14         27 $self->{'_longid'} = $value;
640             }
641 100   100     246 return $self->{'_longid'} || 0;
642             }
643              
644             1;