File Coverage

blib/lib/Bio/NexmlIO.pm
Criterion Covered Total %
statement 21 171 12.2
branch 0 40 0.0
condition 0 3 0.0
subroutine 7 21 33.3
pod 12 12 100.0
total 40 247 16.1


line stmt bran cond sub pod time code
1             # $Id: Nexml.pm 15889 2009-07-29 13:35:29Z chmille4 $
2             # BioPerl module for Bio::NexmlIO
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Chase Miller
7             #
8             # Copyright Chase Miller
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # _history
13             # June 16, 2009 Largely rewritten by Chase Miller
14              
15             # POD documentation - main docs before the code
16              
17             =head1 NAME
18              
19             Bio::NexmlIO - stream handler for NeXML documents
20              
21             =head1 SYNOPSIS
22              
23             #Instantiate a Bio::Nexml object and link it to a file
24             my $in_nexml = Bio::Nexml->new(-file => 'nexml_doc.xml', -format => 'Nexml');
25              
26             #Read in some data
27             my $bptree1 = $in_nexml->next_tree();
28             my $bpaln1 = $in_nexml->next_aln();
29             my $bpseq1 = $in_nexml->next_seq();
30              
31             #Use/manipulate data
32             ...
33              
34             #Write data to nexml file
35             my $out_nexml = Bio::Nexml->new(-file => '>new_nexml_doc.xml', -format => 'Nexml');
36             $out_nexml->to_xml();
37            
38              
39              
40             =head1 DESCRIPTION
41              
42             Bio::NexmlIO is an I/O handler for a NeXML document. A NeXML document can
43             represent three different data types: simple sequences, alignments,
44             and trees. NexmlIO has four main methods next_tree, next_seq,
45             next_aln, and write. NexmlIO returns bioperl seq, tree, and aln
46             objects which can be manipulated then passed to the write method of a
47             new NexmlIO instance to allow the creation of a NeXML document.
48              
49             Each bioperl object contains all the information necessary to recreate
50             a Bio::Phylo::Taxa object, so each time a bioperl object is converted
51             to a biophylo object, the bioperl object is checked to see if its
52             associated taxa has already been created (against a hash using the
53             NexmlIO_ID and Taxa_ID to create a unique string). If not, it is
54             created; if so, that taxa object is used to link the Bio::Phylo tree
55             or matrix.
56              
57             For more information on the NeXML format, see L.
58              
59             =head1 FEEDBACK
60              
61             =head2 Mailing Lists
62              
63             User feedback is an integral part of the evolution of this and other
64             Bioperl modules. Send your comments and suggestions preferably to one
65             of the Bioperl mailing lists.
66              
67             Your participation is much appreciated.
68              
69             bioperl-l@bioperl.org - General discussion
70             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71              
72             =head2 Support
73              
74             Please direct usage questions or support issues to the mailing list:
75              
76             I
77              
78             rather than to the module maintainer directly. Many experienced and
79             reponsive experts will be able look at the problem and quickly
80             address it. Please include a thorough description of the problem
81             with code and data examples if at all possible.
82              
83             =head2 Reporting Bugs
84              
85             Report bugs to the Bioperl bug tracking system to help us keep track
86             the bugs and their resolution. Bug reports can be submitted via the
87             web:
88              
89             https://github.com/bioperl/bioperl-live/issues
90              
91             =head1 AUTHOR - Chase Miller
92              
93             Email chmille4@gmail.com
94              
95             =head1 CONTRIBUTORS
96              
97             Mark A. Jensen, maj -at- fortinbras -dot- com
98              
99             =head1 APPENDIX
100              
101             The rest of the documentation details each of the object
102             methods. Internal methods are usually preceded with a _
103              
104             =cut
105              
106             # Let the code begin...
107              
108              
109             package Bio::NexmlIO;
110 1     1   644 use strict;
  1         2  
  1         25  
111             #TODO Change this
112 1     1   377 use lib '..';
  1         442  
  1         4  
113              
114 1     1   536 use Bio::SeqIO::nexml;
  1         3  
  1         31  
115 1     1   792 use Bio::AlignIO::nexml;
  1         4  
  1         31  
116 1     1   636 use Bio::TreeIO::nexml;
  1         2  
  1         27  
117 1     1   4 use Bio::Nexml::Factory;
  1         1  
  1         14  
118              
119 1     1   3 use base qw(Bio::Root::IO);
  1         1  
  1         1106  
120              
121             my $nexml_fac = Bio::Nexml::Factory->new();
122              
123             =head1 CONSTRUCTOR
124              
125             =head2 new
126              
127             Title : new
128             Usage : my $in_nexmlIO = Bio::NexmlIO->new(-file => 'data.nexml.xml');
129             Function: Creates a L object linked to a stream
130             Returns : a L object
131             Args : file name
132            
133             See L
134              
135             =cut
136              
137             sub new {
138 0     0 1   my($class,@args) = @_;
139 0           my $self = $class->SUPER::new(@args);
140            
141 0           my %params = @args;
142 0           my $file_string = $params{'-file'};
143            
144             #create unique ID by creating a scalar and using the memory address
145 0           my $ID = bless \(my $dummy), "UniqueID";
146 0           ($self->{'_ID'}) = sprintf("%s",\$ID) =~ /(0x[0-9a-fA-F]+)/;
147            
148 0 0         unless ($file_string =~ m/^\>/) {
149             # Only pass filename if filehandle is not available,
150             # or "Bio::Phylo" will create a new filehandle that ends
151             # out of scope and can't be closed directly, leaving 2 open
152             # filehandles for the same file (so file can't be deleted)
153 0           my $file_arg;
154             my $file_value;
155 0 0 0       if ( exists $self->{'_filehandle'}
156             and defined $self->{'_filehandle'}
157             ) {
158 0           $file_arg = '-handle';
159 0           $file_value = $self->{'_filehandle'};
160             }
161             else {
162 0           $file_arg = '-file';
163 0           $file_value = $self->{'_file'};
164             }
165              
166 0           $self->{'_doc'} = Bio::Phylo::IO->parse($file_arg => $file_value,,
167             '-format' => 'nexml',
168             '-as_project' => '1');
169             }
170            
171 0           return $self;
172             }
173              
174             =head2 doc
175              
176             Title : doc
177             Usage : my $nexml_doc = $in_nexmlIO->doc();
178             Function: returns a L object that contains all the Bio::Phylo data objects parsed from the stream
179             Returns : a L object
180             Args : none
181              
182             =cut
183              
184             sub doc {
185 0     0 1   my $self = shift;
186 0           return $self->{'_doc'};
187             }
188              
189             # Takes the Bio::Phylo::Project object and creats BioPerl trees, alns, and seqs from it
190             sub _parse {
191 0     0     my ($self) = @_;
192            
193 0           $self->{'_treeiter'} = 0;
194 0           $self->{'_seqiter'} = 0;
195 0           $self->{'_alniter'} = 0;
196            
197 0           $self->{_trees} = $nexml_fac->create_bperl_tree($self);
198 0           $self->{_alns} = $nexml_fac->create_bperl_aln($self);
199 0           $self->{_seqs} = $nexml_fac->create_bperl_seq($self);
200 0           my $taxa_array = $self->doc->get_taxa();
201            
202 0           $self->{'_parsed'} = 1; #success
203             }
204              
205             =head1 ITERATORS
206              
207             =head2 next_tree
208              
209             Title : next_tree
210             Usage : $tree = $stream->next_tree
211             Function: Reads the next tree object from the stream and returns it.
212             Returns : a L object
213             Args : none
214              
215             See L, L
216              
217             =cut
218              
219             sub next_tree {
220 0     0 1   my $self = shift;
221 0 0         $self->_parse unless $self->{'_parsed'};
222              
223 0           return $self->{'_trees'}->[ $self->{'_treeiter'}++ ];
224             }
225              
226             =head2 next_seq
227              
228             Title : next_seq
229             Usage : $seq = $stream->next_seq
230             Function: Reads the next seq object from the stream and returns it.
231             Returns : a L object
232             Args : none
233              
234             See L, L
235              
236             =cut
237              
238             sub next_seq {
239 0     0 1   my $self = shift;
240 0 0         unless ( $self->{'_parsed'} ) {
241 0           $self->_parse;
242             }
243 0           return $self->{'_seqs'}->[ $self->{'_seqiter'}++ ];
244             }
245              
246             =head2 next_aln
247              
248             Title : next_aln
249             Usage : $aln = $stream->next_aln
250             Function: Reads the next aln object from the stream and returns it.
251             Returns : a L object
252             Args : none
253              
254             See L, L
255              
256             =cut
257              
258             sub next_aln {
259 0     0 1   my $self = shift;
260 0 0         unless ( $self->{'_parsed'} ) {
261 0           $self->_parse;
262             }
263 0           return $self->{'_alns'}->[ $self->{'_alniter'}++ ];
264             }
265              
266             sub _rewind {
267 0     0     my $self = shift;
268 0           my $elt = shift;
269 0 0         $self->{"_${elt}iter"} = 0 if defined $self->{"_${elt}iter"};
270 0           return 1;
271             }
272              
273             =head2 rewind_seq
274              
275             Title : rewind_seq
276             Usage : $stream->rewind_seq
277             Function: Resets the stream for seqs
278             Returns : none
279             Args : none
280              
281             See L, L
282              
283             =cut
284              
285 0     0 1   sub rewind_seq { shift->_rewind('seq'); }
286              
287             =head2 rewind_aln
288              
289             Title : rewind_aln
290             Usage : $stream->rewind_aln
291             Function: Resets the stream for alns
292             Returns : none
293             Args : none
294              
295             See L, L
296              
297             =cut
298              
299 0     0 1   sub rewind_aln { shift->_rewind('aln'); }
300              
301             =head2 rewind_tree
302              
303             Title : rewind_tree
304             Usage : $stream->rewind_tree
305             Function: Resets the stream for trees
306             Returns : none
307             Args : none
308              
309             See L, L
310              
311             =cut
312              
313 0     0 1   sub rewind_tree { shift->_rewind('tree'); }
314              
315             =head2 write
316              
317             Title : write
318             Usage : $stream->write(-alns => $alns,-seqs => $seqs,-trees => $trees)
319             Function: converts BioPerl seq, tree, and aln objects into Bio::Phylo
320             seq, tree, and aln objects, constructs a Bio::Phylo::Project
321             object made up of the newly created Bio::Phylo objects, and
322             writes the Bio::Phylo:Project object to the stream as a valid
323             nexml document
324             Returns : none
325             Args : \@L, \@L, \@L
326              
327             See L, L, L, L
328              
329             =cut
330              
331             sub write {
332 0     0 1   my ($self, @args) = @_;
333            
334 0           my %params = @args;
335            
336 0           my ($trees, $alns, $seqs) = @params{qw( -trees -alns -seqs )};
337 0           my %taxa_hash = ();
338 0           my %seq_matrices = ();
339              
340 0           my $proj_doc = Bio::Phylo::Factory->create_project();
341            
342             #convert trees to bio::Phylo objects
343 0           my $forest = Bio::Phylo::Factory->create_forest();
344 0           my @forests;
345             my @taxa_array;
346 0           my $ent;
347 0           my $taxa_o;
348 0           my $phylo_tree_o;
349            
350 0           foreach my $tree (@$trees) {
351 0           my $nexml_id = $tree->get_tag_values('_NexmlIO_ID');
352 0           $taxa_o = undef;
353 0 0         if ( defined $taxa_hash{$nexml_id} ) {
354 0           $taxa_o = $taxa_hash{$nexml_id};
355             }
356             else {
357 0           ($taxa_o) = $nexml_fac->create_bphylo_taxa($tree);
358 0 0         $forest->set_taxa($taxa_o) if defined $taxa_o;
359 0           $taxa_hash{$nexml_id} = $taxa_o;
360             }
361            
362 0           ($phylo_tree_o) = $nexml_fac->create_bphylo_tree($tree, $taxa_o);
363            
364 0           $forest->insert($phylo_tree_o);
365             }
366              
367             #convert matrices to Bio::Phylo objects
368 0           my $matrices = Bio::Phylo::Matrices->new();
369 0           my $phylo_matrix_o;
370            
371 0           foreach my $aln (@$alns)
372             {
373 0           $taxa_o = undef;
374 0 0         if (defined $taxa_hash{ $aln->{_Nexml_ID} }) {
375 0           $taxa_o = $taxa_hash{$aln->{_Nexml_ID}};
376             }
377             else {
378 0           ($taxa_o) = $nexml_fac->create_bphylo_taxa($aln);
379 0           $taxa_hash{$aln->{_Nexml_ID}} = $taxa_o;
380             }
381            
382 0           ($phylo_matrix_o) = $nexml_fac->create_bphylo_aln($aln, $taxa_o);
383            
384 0 0         $phylo_matrix_o->set_taxa($taxa_o) if defined $taxa_o;
385 0           $matrices->insert($phylo_matrix_o);
386             }
387            
388 0           my $seq_matrix_o;
389             my $datum;
390             #convert sequences to Bio::Phylo objects
391 0           foreach my $seq (@$seqs)
392             {
393 0           $taxa_o = undef;
394             #check if this Bio::Phylo::Taxa obj has already been created
395 0 0         if (defined $taxa_hash{ $seq->{_Nexml_ID} }) {
396 0           $taxa_o = $taxa_hash{$seq->{_Nexml_ID}};
397             }
398             else {
399 0           ($taxa_o) = $nexml_fac->create_bphylo_taxa($seq);
400 0           $taxa_hash{$seq->{_Nexml_ID}} = $taxa_o;
401             }
402 0           $datum = $nexml_fac->create_bphylo_seq($seq, $taxa_o);
403             #check if this Bio::Phylo::Matrices::Matrix obj has already been created
404 0 0         if (defined $seq_matrices{ $seq->{_Nexml_matrix_ID} }) {
405 0           $seq_matrix_o = $seq_matrices{$seq->{_Nexml_matrix_ID}};
406 0           my $taxon_name = $datum->get_taxon()->get_name();
407 0           $datum->unset_taxon();
408 0           $seq_matrix_o->insert($datum);
409 0           $datum->set_taxon($seq_matrix_o->get_taxa()->get_by_name($taxon_name));
410             }
411             else {
412 0           $seq_matrix_o = Bio::Phylo::Factory->create_matrix('-type' => $datum->moltype);
413 0           $seq_matrices{$seq->{_Nexml_matrix_ID}} = $seq_matrix_o;
414 0 0         $seq_matrix_o->set_taxa($taxa_o) if defined $taxa_o;
415 0           $seq_matrix_o->insert($datum);
416            
417             #get matrix label
418 0           my $feat = ($seq->get_SeqFeatures())[0];
419 0 0         my $matrix_label = ($feat->get_tag_values('matrix_label'))[0] if $feat->has_tag('id');
420 0           $seq_matrix_o->set_name($matrix_label);
421            
422 0           $matrices->insert($seq_matrix_o);
423             }
424             }
425            
426             #Add matrices and forest objects to project object which represents a complete nexml document
427 0 0         if($forest->first) {
428 0           $proj_doc->insert($forest);
429             }
430 0           while(my $curr_matrix = $matrices->next) {
431 0           $proj_doc->insert($curr_matrix);
432             }
433            
434             #write nexml document to stream
435 0           my $ret = $self->_print($proj_doc->to_xml(-compact=>1));
436 0           $self->flush;
437 0           return($ret);
438             }
439              
440             =head2 extract_seqs
441              
442             Title : extract_seqs
443             Usage : $nexmlIO->extract_seqs(-file => ">$outfile", -format => $format)
444             Function: converts BioPerl seqs stored in the NexmlIO object into the provided
445             format and writes it to the provided file. Uses L to do
446             the conversion and writing.
447             Returns : none
448             Args : file to write to, format to be converted to
449              
450             See L, L
451              
452             =cut
453              
454             sub extract_seqs {
455 0     0 1   my $self = shift;
456 0 0         unless ( $self->{'_parsed'} ) {
457 0           $self->_parse;
458             }
459            
460 0           my %params = @_;
461 0           my $remove_spaces = 0;
462 0           my $ret = 0;
463 0           my ($format, $file) = @params{qw( -format -file)};
464            
465 0           for ($format) {
466 0 0         /^fasta$/i && do {
467             # this is ok, flag so that the nexmlid gets converted;
468 0           $remove_spaces = 1;
469 0           last;
470             };
471             # default
472 0           do {
473 0           $self->throw("Format '$format' not yet supported for extraction");
474             };
475             }
476            
477 0           my $seqIO = Bio::SeqIO->new(-format => $format, -file => $file);
478 0           my $seqs = $self->{_seqs};
479 0           foreach my $seq (@$seqs) {
480 0 0         if ($remove_spaces) {
481 0           my $id = $seq->id;
482 0           $id =~ s/ /_/;
483 0           $seq->id($id);
484             }
485 0           $ret = $seqIO->write_seq($seq);
486             }
487 0           return $ret;
488             }
489              
490             =head2 extract_alns
491              
492             Title : extract_alns
493             Usage : $nexmlIO->extract_alns(-file => ">$outfile", -format => $format)
494             Function: converts BioPerl alns stored in the NexmlIO object into the provided
495             format and writes it to the provided file. Uses L to do
496             the conversion and writing.
497             Returns : none
498             Args : file to write to, format to be converted to
499              
500             See L, L
501              
502             =cut
503              
504             sub extract_alns {
505 0     0 1   my $self = shift;
506 0 0         unless ( $self->{'_parsed'} ) {
507 0           $self->_parse;
508             }
509            
510 0           my $ret = 0;
511 0           my %params = @_;
512 0           my ($format, $file) = @params{qw( -format -file)};
513            
514 0           my $alignIO = Bio::AlignIO->new(-format => $format, -file => $file);
515 0           my $alns = $self->{_alns};
516 0           foreach my $aln (@$alns) {
517 0           $ret = $alignIO->write_aln($aln);
518             }
519 0           return $ret;
520             }
521              
522             =head2 extract_trees
523              
524             Title : extract_trees
525             Usage : $nexmlIO->extract_trees(-file => ">$outfile", -format => $format)
526             Function: converts BioPerl trees stored in the NexmlIO object into the provided
527             format and writes it to the provided file. Uses L to do
528             the conversion and writing.
529             Returns : none
530             Args : file to write to, format to be converted to
531              
532             See L, L
533              
534             =cut
535              
536             sub extract_trees {
537 0     0 1   my $self = shift;
538 0 0         unless ( $self->{'_parsed'} ) {
539 0           $self->_parse;
540             }
541            
542 0           my $ret = 0;
543 0           my %params = @_;
544 0           my ($format, $file) = @params{qw( -format -file)};
545            
546 0           my $treeIO = Bio::TreeIO->new(-format => $format, -file => $file);
547 0           my $trees = $self->{_trees};
548 0           foreach my $tree (@$trees) {
549 0           $treeIO->write_tree($tree);
550 0           $ret = 1;
551             }
552 0           return $ret;
553             }
554              
555             1;