File Coverage

Bio/AlignIO.pm
Criterion Covered Total %
statement 91 101 90.1
branch 42 60 70.0
condition 11 25 44.0
subroutine 18 20 90.0
pod 7 7 100.0
total 169 213 79.3


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::AlignIO
3             #
4             # based on the Bio::SeqIO module
5             # by Ewan Birney
6             # and Lincoln Stein
7             #
8             # Copyright Peter Schattner
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # History
13             # September, 2000 AlignIO written by Peter Schattner
14              
15             # POD documentation - main docs before the code
16              
17             =head1 NAME
18              
19             Bio::AlignIO - Handler for AlignIO Formats
20              
21             =head1 SYNOPSIS
22              
23             use Bio::AlignIO;
24              
25             $inputfilename = "testaln.fasta";
26             $in = Bio::AlignIO->new(-file => $inputfilename ,
27             -format => 'fasta');
28             $out = Bio::AlignIO->new(-file => ">out.aln.pfam" ,
29             -format => 'pfam');
30              
31             while ( my $aln = $in->next_aln() ) {
32             $out->write_aln($aln);
33             }
34              
35             # OR
36              
37             use Bio::AlignIO;
38              
39             open MYIN, '<', 'testaln.fasta' or die "Could not read file 'testaln.fasta': $!\n";
40             $in = Bio::AlignIO->newFh(-fh => \*MYIN,
41             -format => 'fasta');
42             open my $MYOUT, '>', 'testaln.pfam' or die "Could not write file 'testaln.pfam': $!\n";
43             $out = Bio::AlignIO->newFh(-fh => $MYOUT,
44             -format => 'pfam');
45              
46             # World's smallest Fasta<->pfam format converter:
47             print $out $_ while <$in>;
48              
49             =head1 DESCRIPTION
50              
51             L<Bio::AlignIO> is a handler module for the formats in the AlignIO set,
52             for example, L<Bio::AlignIO::fasta>. It is the officially sanctioned way
53             of getting at the alignment objects. The resulting alignment is a
54             L<Bio::Align::AlignI>-compliant object.
55              
56             The idea is that you request an object for a particular format.
57             All the objects have a notion of an internal file that is read
58             from or written to. A particular AlignIO object instance is configured
59             for either input or output, you can think of it as a stream object.
60              
61             Each object has functions:
62              
63             $stream->next_aln();
64              
65             And:
66              
67             $stream->write_aln($aln);
68              
69             Also:
70              
71             $stream->type() # returns 'INPUT' or 'OUTPUT'
72              
73             As an added bonus, you can recover a filehandle that is tied to the
74             AlignIO object, allowing you to use the standard E<lt>E<gt> and print
75             operations to read and write alignment objects:
76              
77             use Bio::AlignIO;
78              
79             # read from standard input
80             $stream = Bio::AlignIO->newFh(-format => 'Fasta');
81              
82             while ( $aln = <$stream> ) {
83             # do something with $aln
84             }
85              
86             And:
87              
88             print $stream $aln; # when stream is in output mode
89              
90             L<Bio::AlignIO> is patterned on the L<Bio::SeqIO> module and shares
91             most of its features. One significant difference is that
92             L<Bio::AlignIO> usually handles IO for only a single alignment at a time,
93             whereas L<Bio::SeqIO> handles IO for multiple sequences in a single stream.
94             The principal reason for this is that whereas simultaneously handling
95             multiple sequences is a common requirement, simultaneous handling of
96             multiple alignments is not. The only current exception is format
97             C<bl2seq> which parses results of the BLAST C<bl2seq> program and which
98             may produce several alignment pairs. This set of alignment pairs can
99             be read using multiple calls to L<next_aln>.
100              
101             =head1 CONSTRUCTORS
102              
103             =head2 Bio::AlignIO-E<gt>new()
104              
105             $seqIO = Bio::AlignIO->new(-file => 'filename', -format=>$format);
106             $seqIO = Bio::AlignIO->new(-fh => \*FILEHANDLE, -format=>$format);
107             $seqIO = Bio::AlignIO->new(-format => $format);
108             $seqIO = Bio::AlignIO->new(-fh => \*STDOUT, -format => $format);
109              
110             The L<new> class method constructs a new L<Bio::AlignIO> object.
111             The returned object can be used to retrieve or print alignment
112             objects. L<new> accepts the following parameters:
113              
114             =over 4
115              
116             =item -file
117              
118             A file path to be opened for reading or writing. The usual Perl
119             conventions apply:
120              
121             'file' # open file for reading
122             '>file' # open file for writing
123             '>>file' # open file for appending
124             '+<file' # open file read/write
125             'command |' # open a pipe from the command
126             '| command' # open a pipe to the command
127              
128             =item -fh
129              
130             You may provide new() with a previously-opened filehandle. For
131             example, to read from STDIN:
132              
133             $seqIO = Bio::AlignIO->new(-fh => \*STDIN);
134              
135             Note that you must pass filehandles as references to globs.
136              
137             If neither a filehandle nor a filename is specified, then the module
138             will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt>
139             semantics.
140              
141             =item -format
142              
143             Specify the format of the file. Supported formats include:
144              
145             bl2seq Bl2seq Blast output
146             clustalw clustalw (.aln) format
147             emboss EMBOSS water and needle format
148             fasta FASTA format
149             maf Multiple Alignment Format
150             mase mase (seaview) format
151             mega MEGA format
152             meme MEME format
153             msf msf (GCG) format
154             nexus Swofford et al NEXUS format
155             pfam Pfam sequence alignment format
156             phylip Felsenstein PHYLIP format
157             prodom prodom (protein domain) format
158             psi PSI-BLAST format
159             selex selex (hmmer) format
160             stockholm stockholm format
161              
162             Currently only those formats which were implemented in L<Bio::SimpleAlign>
163             have been incorporated into L<Bio::AlignIO>. Specifically, C<mase>, C<stockholm>
164             and C<prodom> have only been implemented for input. See the specific module
165             (e.g. L<Bio::AlignIO::prodom>) for notes on supported versions.
166              
167             If no format is specified and a filename is given, then the module
168             will attempt to deduce it from the filename suffix. If this is unsuccessful,
169             C<fasta> format is assumed.
170              
171             The format name is case insensitive; C<FASTA>, C<Fasta> and C<fasta> are
172             all treated equivalently.
173              
174             =back
175              
176             =head2 Bio::AlignIO-E<gt>newFh()
177              
178             $fh = Bio::AlignIO->newFh(-fh => \*FILEHANDLE, -format=>$format);
179             # read from STDIN or use @ARGV:
180             $fh = Bio::AlignIO->newFh(-format => $format);
181              
182             This constructor behaves like L<new>, but returns a tied filehandle
183             rather than a L<Bio::AlignIO> object. You can read sequences from this
184             object using the familiar E<lt>E<gt> operator, and write to it using
185             L<print>. The usual array and $_ semantics work. For example, you can
186             read all sequence objects into an array like this:
187              
188             @sequences = <$fh>;
189              
190             Other operations, such as read(), sysread(), write(), close(), and printf()
191             are not supported.
192              
193             =over 1
194              
195             =item -flush
196              
197             By default, all files (or filehandles) opened for writing alignments
198             will be flushed after each write_aln() making the file immediately
199             usable. If you do not need this facility and would like to marginally
200             improve the efficiency of writing multiple sequences to the same file
201             (or filehandle), pass the -flush option '0' or any other value that
202             evaluates as defined but false:
203              
204             my $clustal = Bio::AlignIO->new( -file => "<prot.aln",
205             -format => "clustalw" );
206             my $msf = Bio::AlignIO->new(-file => ">prot.msf",
207             -format => "msf",
208             -flush => 0 ); # go as fast as we can!
209             while($seq = $clustal->next_aln) { $msf->write_aln($seq) }
210              
211             =back
212              
213             =head1 OBJECT METHODS
214              
215             See below for more detailed summaries. The main methods are:
216              
217             =head2 $alignment = $AlignIO-E<gt>next_aln()
218              
219             Fetch an alignment from a formatted file.
220              
221             =head2 $AlignIO-E<gt>write_aln($aln)
222              
223             Write the specified alignment to a file.
224              
225             =head2 TIEHANDLE(), READLINE(), PRINT()
226              
227             These provide the tie interface. See L<perltie> for more details.
228              
229             =head1 FEEDBACK
230              
231             =head2 Mailing Lists
232              
233             User feedback is an integral part of the evolution of this and other
234             Bioperl modules. Send your comments and suggestions preferably to one
235             of the Bioperl mailing lists. Your participation is much appreciated.
236              
237             bioperl-l@bioperl.org - General discussion
238             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
239              
240             =head2 Support
241              
242             Please direct usage questions or support issues to the mailing list:
243              
244             I<bioperl-l@bioperl.org>
245              
246             rather than to the module maintainer directly. Many experienced and
247             responsive experts will be able to look at the problem and quickly
248             address it. Please include a thorough description of the problem
249             with code and data examples if at all possible.
250              
251             =head2 Reporting Bugs
252              
253             Report bugs to the Bioperl bug tracking system to help us keep track
254             of the bugs and their resolution. Bug reports can be submitted via the
255             web:
256              
257             https://github.com/bioperl/bioperl-live/issues
258              
259             =head1 AUTHOR - Peter Schattner
260              
261             Email: schattner@alum.mit.edu
262              
263             =head1 CONTRIBUTORS
264              
265             Jason Stajich, jason@bioperl.org
266              
267             =head1 APPENDIX
268              
269             The rest of the documentation details each of the object
270             methods. Internal methods are usually preceded with a _
271              
272             =cut
273              
274             # 'Let the code begin...
275              
276             package Bio::AlignIO;
277              
278 35     35   10193 use strict;
  35         58  
  35         823  
279              
280 35     35   7699 use Bio::Seq;
  35         151  
  35         707  
281 35     35   5448 use Bio::LocatableSeq;
  35         48  
  35         647  
282 35     35   12691 use Bio::SimpleAlign;
  35         53  
  35         986  
283 35     35   10521 use Bio::Tools::GuessSeqFormat;
  35         63  
  35         1063  
284 35     35   161 use base qw(Bio::Root::Root Bio::Root::IO);
  35         40  
  35         30696  
285              
286             =head2 new
287              
288             Title : new
289             Usage : $stream = Bio::AlignIO->new(-file => $filename,
290             -format => 'Format')
291             Function: Returns a new seqstream
292             Returns : A Bio::AlignIO::Handler initialised with
293             the appropriate format
294             Args : -file => $filename
295             -format => format
296             -fh => filehandle to attach to
297             -displayname_flat => 1 [optional]
298             to force the displayname to not show start/end
299             information
300              
301             =cut
302              
# Factory constructor.
#
# When invoked on a class whose name matches Bio::AlignIO::<something>, the
# object is built through SUPER::new and then set up with _initialize().
# When invoked on any other class (normally Bio::AlignIO itself), the format
# is determined, the matching Bio::AlignIO::<format> module is loaded and the
# call is forwarded to that class's new() with the original arguments.
#
# Args    : named parameters. -format, -file and -fh are inspected here
#           (parameter names are matched case-insensitively); the complete,
#           unmodified list is passed on.
# Returns : the new stream object, or nothing when the format module could
#           not be loaded.
# Throws  : when no format was given and none could be determined.
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # or do we want to call SUPER on an object if $caller is an
    # object?
    if ( $class =~ /Bio::AlignIO::(\S+)/ ) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;    # lowercase keys

    # An explicit -format wins; otherwise try the file name suffix.
    my $format = $param{'-format'}
        || $class->_guess_format( $param{-file} || $ARGV[0] );

    # The suffix was not conclusive: let GuessSeqFormat inspect the content.
    # Each branch is only entered when its parameter is true, so no further
    # fallback value is needed (or meaningful) inside it.
    unless ($format) {
        if ( $param{-file} ) {
            $format = Bio::Tools::GuessSeqFormat->new( -file => $param{-file} )->guess;
        }
        elsif ( $param{-fh} ) {
            $format = Bio::Tools::GuessSeqFormat->new( -fh => $param{-fh} )->guess;
        }
    }

    # Normalize capitalization to lower case; an undetermined format becomes
    # the empty string so the error message below never interpolates undef.
    $format = defined $format ? lc $format : '';
    $class->throw("Unknown format given or could not determine it [$format]")
        unless $format;

    return unless ( $class->_load_format_module($format) );
    return "Bio::AlignIO::$format"->new(@args);
}
335              
336              
337             =head2 newFh
338              
339             Title : newFh
340             Usage : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
341             Function: does a new() followed by an fh()
342             Example : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
343             $sequence = <$fh>; # read a sequence object
344             print $fh $sequence; # write a sequence object
345             Returns : filehandle tied to the Bio::AlignIO::Fh class
346             Args :
347              
348             =cut
349              
# Convenience constructor: builds a stream object with new() and returns
# whatever that object's fh() method produces. Returns nothing when new()
# yields a false value.
sub newFh {
    my ($class, @ctor_args) = @_;

    my $stream = $class->new(@ctor_args);
    return unless $stream;

    return $stream->fh;
}
355              
356             =head2 fh
357              
358             Title : fh
359             Usage : $obj->fh
360             Function:
361             Example : $fh = $obj->fh; # make a tied filehandle
362             $sequence = <$fh>; # read a sequence object
363             print $fh $sequence; # write a sequence object
364             Returns : filehandle tied to the Bio::AlignIO::Fh class
365             Args :
366              
367             =cut
368              
369              
# Creates a fresh anonymous glob and ties it to the invocant's class, passing
# the invocant itself to TIEHANDLE. The returned glob reference can then be
# used as a filehandle whose reads and prints are routed through the tie
# methods of that class.
#
# Returns : a reference to the tied glob.
sub fh {
    my $self  = shift;
    my $class = ref($self) || $self;

    # gensym lives in the core Symbol module; load it here instead of
    # relying on some other module having loaded it already.
    require Symbol;
    my $s = Symbol::gensym();

    tie $$s, $class, $self;
    return $s;
}
377              
378              
379             =head2 format
380              
381             Title : format
382             Usage : $format = $stream->format()
383             Function: Get the alignment format
384             Returns : alignment format
385             Args : none
386              
387             =cut
388              
389             # format() method inherited from Bio::Root::IO
390              
391              
392             # _initialize is where the heavy stuff will happen when new is called
393              
# Applies the constructor arguments to a freshly built object: extracts
# DISPLAYNAME_FLAT, ALPHABET and WIDTH via _rearrange, forwards them to the
# matching accessors (alphabet() is always called; the other two only when a
# value was supplied) and finally hands the full argument list to
# _initialize_io. Always returns 1.
sub _initialize {
    my $self = shift;
    my @args = @_;

    my ( $flat, $alphabet, $width ) =
        $self->_rearrange( [ 'DISPLAYNAME_FLAT', 'ALPHABET', 'WIDTH' ], @args );

    if ( defined $flat ) {
        $self->force_displayname_flat($flat);
    }
    $self->alphabet($alphabet);
    if ( defined $width ) {
        $self->width($width);
    }
    $self->_initialize_io(@args);

    return 1;
}
404              
405             =head2 _load_format_module
406              
407             Title : _load_format_module
408             Usage : *INTERNAL AlignIO stuff*
409             Function: Loads up (like use) a module at run time on demand
410             Example :
411             Returns :
412             Args :
413              
414             =cut
415              
# Loads the module Bio::AlignIO::<format> at run time through _load_module.
#
# Args    : $format - format name, appended to "Bio::AlignIO::" to form the
#           module name.
# Returns : 1 when loading raised no exception; nothing when it did, after
#           printing a diagnostic (including the exception text) to STDERR.
sub _load_format_module {
    my ( $self, $format ) = @_;
    my $module = "Bio::AlignIO::" . $format;

    # Only an exception counts as failure; the value returned by
    # _load_module is not consulted.
    eval {
        $self->_load_module($module);
    };
    if ($@) {
        print STDERR <<END;
$self: $format cannot be found
Exception $@
For more information about the AlignIO system please see the AlignIO docs.
This includes ways of checking for formats at compile time, not run time
END
        return;
    }
    return 1;
}
436              
437             =head2 next_aln
438              
439             Title : next_aln
440             Usage : $aln = $stream->next_aln
441             Function: reads the next $aln object from the stream
442             Returns : a Bio::Align::AlignI compliant object
443             Args :
444              
445             =cut
446              
# Placeholder reader: this implementation only raises an error through
# throw(), stating that a generic Bio::AlignIO object cannot be read from.
sub next_aln {
    my $self = shift;
    return $self->throw("Sorry, you cannot read from a generic Bio::AlignIO object.");
}
451              
452             =head2 write_aln
453              
454             Title : write_aln
455             Usage : $stream->write_aln($aln)
456             Function: writes the $aln object into the stream
457             Returns : 1 for success and 0 for error
458             Args : a Bio::Align::AlignI compliant object
459              
460             =cut
461              
# Placeholder writer: this implementation only raises an error through
# throw(), stating that a generic Bio::AlignIO object cannot be written to.
sub write_aln {
    my $self = shift;
    return $self->throw("Sorry, you cannot write to a generic Bio::AlignIO object.");
}
466              
467             =head2 _guess_format
468              
469             Title : _guess_format
470             Usage : $obj->_guess_format($filename)
471             Function:
472             Example :
473             Returns : guessed format of filename (lower case)
474             Args :
475              
476             =cut
477              
# Guesses the alignment format from a file name suffix.
#
# Args    : $filename - name to inspect.
# Returns : the lower-case format name of the first matching rule; the empty
#           string when no rule matches; nothing when no (or a false) file
#           name is given.
#
# The file name is kept in a lexical rather than the global $_, so a caller
# that is iterating with $_ does not have its current element overwritten.
# Every suffix is matched case-insensitively and anchored to the end of the
# name, so ".maf" no longer matches in the middle of a name such as
# "x.mafft.phy".
sub _guess_format {
    my ( $class, $filename ) = @_;
    return unless $filename;

    # Ordered list: the first rule that matches decides the format.
    my @suffix_rules = (
        [ clustalw  => qr/\.aln$/i ],
        [ emboss    => qr/\.(water|needle)$/i ],
        [ metafasta => qr/\.metafasta$/i ],
        [ fasta     => qr/\.(fasta|fast|seq|fa|fsa|nt|aa)$/i ],
        [ maf       => qr/\.maf$/i ],
        [ mega      => qr/\.(meg|mega)$/i ],
        [ meme      => qr/\.meme$/i ],
        [ msf       => qr/\.(msf|pileup|gcg)$/i ],
        [ nexus     => qr/\.(nexus|nex)$/i ],
        [ pfam      => qr/\.(pfam|pfm)$/i ],
        [ phylip    => qr/\.(phylip|phlp|phyl|phy|ph)$/i ],
        [ psi       => qr/\.psi$/i ],
        [ stockholm => qr/\.stk$/i ],
        [ selex     => qr/\.(selex|slx|selx|slex|sx)$/i ],
        [ xmfa      => qr/\.xmfa$/i ],
    );

    for my $rule (@suffix_rules) {
        my ( $format, $pattern ) = @$rule;
        return $format if $filename =~ $pattern;
    }

    # Explicit false value for "no suffix recognised".
    return '';
}
497              
# Destructor: calls close() on the object being destroyed.
sub DESTROY {
    my ($self) = @_;
    $self->close();
}
502              
# Tie constructor: wraps the supplied object in a hash under the key
# 'alignio' and blesses that hash into the given class.
sub TIEHANDLE {
    my ( $class, $stream ) = @_;
    my %wrapper = ( 'alignio' => $stream );
    return bless \%wrapper, $class;
}
507              
# Tie read hook. In list context it keeps calling next_aln() on the wrapped
# object until a false value comes back and returns everything collected.
# Otherwise it returns a single next_aln() result, or undef when that result
# is false.
sub READLINE {
    my $self   = shift;
    my $stream = $self->{'alignio'};

    if (wantarray) {
        my @alns;
        while ( my $aln = $stream->next_aln() ) {
            push @alns, $aln;
        }
        return @alns;
    }

    my $aln = $stream->next_aln();
    return $aln ? $aln : undef;
}
515              
# Tie write hook: forwards every printed argument to write_aln() on the
# wrapped object and returns its result.
sub PRINT {
    my ( $self, @alns ) = @_;
    return $self->{'alignio'}->write_aln(@alns);
}
520              
521              
522             =head2 force_displayname_flat
523              
524             Title : force_displayname_flat
525             Usage : $obj->force_displayname_flat($newval)
526             Function:
527             Example :
528             Returns : value of force_displayname_flat (a scalar)
529             Args : on set, new value (a scalar or undef, optional)
530              
531              
532             =cut
533              
# Get/set accessor for the '_force_displayname_flat' slot.
# With an argument (even undef) the slot is set and the new value returned;
# without one the stored value is returned, or 0 when it is false.
sub force_displayname_flat {
    my ( $self, @new ) = @_;

    if (@new) {
        return $self->{'_force_displayname_flat'} = $new[0];
    }
    return $self->{'_force_displayname_flat'} || 0;
}
539              
540             =head2 alphabet
541              
542             Title : alphabet
543             Usage : $obj->alphabet($newval)
544             Function: Get/Set alphabet for purpose of passing to Bio::LocatableSeq creation
545             Example : $obj->alphabet('dna');
546             Returns : value of alphabet (a scalar)
547             Args : on set, new value (a scalar or undef, optional)
548              
549              
550             =cut
551              
# Get/set accessor for the '_alphabet' slot.
# A defined argument must be one of 'rna', 'protein' or 'dna'; anything else
# is reported through throw(). An undefined (or missing) argument leaves the
# slot untouched. The stored value is returned in every case.
sub alphabet {
    my ( $self, $value ) = @_;

    if ( defined $value ) {
        my %is_valid = map { $_ => 1 } qw(rna protein dna);
        $self->throw("Invalid alphabet $value") unless $is_valid{$value};
        $self->{'_alphabet'} = $value;
    }

    return $self->{'_alphabet'};
}
561              
562              
563             1;