File Coverage

lib/Bio/Phylo/Forest/DBTree.pm
Criterion Covered Total %
statement 41 93 44.0
branch 3 12 25.0
condition 0 2 0.0
subroutine 14 23 60.8
pod 9 9 100.0
total 67 139 48.2


line stmt bran cond sub pod time code
1             package Bio::Phylo::Forest::DBTree;
2 2     2   6515 use strict;
  2         4  
  2         50  
3 2     2   8 use warnings;
  2         2  
  2         44  
4 2     2   2022 use DBI;
  2         24791  
  2         112  
5 2     2   713 use Bio::Phylo::Factory;
  2         15289  
  2         10  
6 2     2   63 use Bio::Phylo::Util::Exceptions 'throw';
  2         3  
  2         76  
7 2     2   9 use base 'DBIx::Class::Schema';
  2         4  
  2         801  
8 2     2   88229 use base 'Bio::Phylo::Forest::Tree';
  2         7  
  2         900  
9              
10             __PACKAGE__->load_namespaces;
11              
12             my $SINGLETON;
13             my $DBH;
14             my $fac = Bio::Phylo::Factory->new;
15 2     2   133500 use version 0.77; our $VERSION = qv("v0.1.2");
  2         2721  
  2         17  
16              
17             =head1 NAME
18              
19             Bio::Phylo::Forest::DBTree - Phylogenetic database as a tree object
20              
21             =head1 SYNOPSIS
22              
23             use Bio::Phylo::Forest::DBTree;
24            
25             # connect to the Green Genes tree
26             my $file = 'gg_13_5_otus_99_annotated.db';
27             my $dbtree = Bio::Phylo::Forest::DBTree->connect($file);
28              
29             # $dbtree can be used as a Bio::Phylo::Forest::Tree object,
30             # and the node objects that are returned can be used as
31             # Bio::Phylo::Forest::Node objects
32             my $root = $dbtree->get_root;
33              
34             =head1 DESCRIPTION
35              
36             This package provides the functionality to handle very large phylogenies (examples: the
37             NCBI taxonomy, the Green Genes tree) as if they are L tree objects, with all
38             the possibilities for traversal, computation, serialization, and visualization, but stored
39             in a SQLite database. These databases are single files, so that they can be easily shared.
40             Some useful database files are available here:
41             https://figshare.com/account/home#/projects/18808
42              
43             To make new tree databases, a number of scripts are provided with the distribution of this
44             package:
45              
46             =over
47              
48             =item * C Loads a very large Newick tree into a database.
49              
50             =item * C Loads the NCBI taxonomy dump into a database.
51              
52             =item * C Loads a tree in the format of L
53             into a database.
54              
55             =back
56              
57             As an example of interacting with a database tree, the script C can be
58             used to extract subtrees from a database.
59              
60             =head1 DATABASE METHODS
61              
62             The following methods deal with the database as a whole: creating a new database,
63             connecting to an existing one, persisting a tree in a database and extracting one as a
64             mutable, in-memory object.
65              
66             =head2 create()
67              
68             Creates a SQLite database file in the provided location. Usage:
69              
70             use Bio::Phylo::Forest::DBTree;
71            
72             # second argument is optional
73             Bio::Phylo::Forest::DBTree->create( $file, '/opt/local/bin/sqlite3' );
74              
75             The first argument is the location where the database file is going to be created. The
76             second argument is optional, and provides the location of the C executable that
77             is used to create the database. By default, the C is simply found on the
78             C<$PATH>, but if it is installed in a non-standard location that location can be provided
79             here. The database schema that is created corresponds to the following SQL statements:
80              
81             create table node(
82             id int not null,
83             parent int,
84             left int,
85             right int,
86             name varchar(20),
87             length float,
88             height float,
89             primary key(id)
90             );
91             create index parent_idx on node(parent);
92             create index left_idx on node(left);
93             create index right_idx on node(right);
94             create index name_idx on node(name);
95              
96             =cut
97              
98             sub create {
99 0     0 1 0 my $class = shift;
100 0         0 my $file = shift;
101 0   0     0 my $sqlite3 = shift || 'sqlite3';
102 0         0 my $command = do { local $/; };
  0         0  
  0         0  
103 0 0       0 system("echo '$command' | sqlite3 '$file'") == 0 or die 'Create failed!';
104             }
105              
106             =head2 connect()
107              
108             Connects to a SQLite database file, returns the connection as a
109             C object. Usage:
110              
111             use Bio::Phylo::Forest::DBTree;
112             my $dbtree = Bio::Phylo::Forest::DBTree->connect($file);
113              
114             The argument is a file name. If the file exists, a L database handle to that
115             file is returned. If the file does not exist, a new database is created in that location,
116             and subsequently the handle to that newly created database is returned. The creation of
117             the database is handled by the C method (see below).
118              
119             =cut
120              
121             sub connect {
122 2     2 1 534 my $class = shift;
123 2         5 my $file = shift;
124 2 100       10 if ( not $SINGLETON ) {
125            
126             # create if not exist
127 1 50       37 if ( not -e $file ) {
128 0         0 $class->create($file);
129             }
130            
131             # fuck it, let's just hardcode it here - Yeehaw!
132 1         3 my $dsn = "dbi:SQLite:dbname=$file";
133 1         8 $DBH = DBI->connect($dsn,'','');
134 1         8473 $DBH->{'RaiseError'} = 1;
135 1     1   17 $SINGLETON = $class->SUPER::connect( sub { $DBH } );
  1         140071  
136             }
137 2         55779 return $SINGLETON;
138             }
139              
140             =head2 persist()
141              
142             Persist a phylogenetic tree object (a subclass of L) into a
143             newly created database file. Usage:
144              
145             use Bio::Phylo::Forest::DBTree;
146             my $dbtree = Bio::Phylo::Forest::DBTree->persist(
147             -file => $file,
148             -tree => $tree,
149             );
150              
151             This method first create a database at the location specified by C<$file> by making a call
152             to the C method. Subsequently, the C<$tree> object is traversed from root to
153             tips and inserted in the newly created database. Finally, the handle to this database is
154             returned, i.e. a C object.
155              
156             =cut
157              
158             sub persist {
159 0     0 1 0 my ( $class, %args ) = @_;
160            
161             # need a file argument to write to
162 0 0       0 if ( not $args{'-file'} ) {
163 0         0 throw 'BadArgs' => "Need -file argument!";
164             }
165            
166             # need a tree argument to persis
167 0 0       0 if ( not $args{'-tree'} ) {
168 0         0 throw 'BadArgs' => "Need -tree argument!";
169             }
170            
171             # create a new database, prepare statement handler
172 0         0 $class->create( $args{'-file'} );
173 0         0 my $dsn = 'dbi:SQLite:dbname=' . $args{'-file'};
174 0         0 my $dbh = DBI->connect($dsn,'','');
175 0         0 $dbh->{'RaiseError'} = 1;
176 0     0   0 my $db = $class->SUPER::connect( sub { $dbh } );
  0         0  
177 0         0 my $sth = $dbh->prepare("insert into node values(?,?,?,?)");
178            
179             # start traversing
180 0         0 my $counter = 2;
181 0         0 my %idmap;
182             $args{'-tree'}->visit_depth_first(
183             '-pre' => sub {
184 0     0   0 my $node = shift;
185 0         0 my $id = $node->get_id;
186 0         0 $idmap{$id} = $counter++;
187            
188             # get the parent id, or "1" if root
189 0         0 my $parent_id;
190 0 0       0 if ( my $parent = $node->get_parent ) {
191 0         0 my $pid = $parent->get_id;
192 0         0 $parent_id = $idmap{$pid};
193             }
194             else {
195 0         0 $parent_id = 1;
196             }
197            
198             # do the insertion
199             $sth->execute(
200 0         0 $idmap{$id}, # primary key
201             $parent_id, # self-joining foreign key
202             undef, # not indexed yet
203             undef, # not indexed yet
204             $node->get_internal_name, # node label or taxon name
205             $node->get_branch_length, # branch length
206             undef # not computed yet
207             );
208             }
209 0         0 );
210 0         0 my $i = 0;
211 0         0 $db->get_root->_index(\$i,0);
212 0         0 return $db;
213             }
214              
215             =head2 extract()
216              
217             Extracts a tree from a database. The returned tree is an in-memory object. Hence, this is
218             an expensive operation that is best avoided as much as possible. Usage:
219              
220             my $tree = $dbtree->extract;
221              
222             =cut
223              
224             sub extract {
225 0     0 1 0 my $self = shift;
226 0         0 my $tree = $fac->create_tree;
227 0         0 my $root = $self->get_root;
228 0         0 _clone_mutable(
229             $fac->create_node(
230             '-name' => $root->get_name,
231             '-branch_length' => $root->get_branch_length,
232             ),
233             $root,
234             $tree
235             );
236 0         0 return $tree;
237             }
238              
239             {
240 2     2   1046 no warnings 'recursion';
  2         5  
  2         501  
241             sub _clone_mutable {
242 0     0   0 my ( $parent, $template, $tree ) = @_;
243 0         0 $tree->insert($parent);
244 0         0 for my $child ( @{ $template->get_children } ) {
  0         0  
245 0         0 _clone_mutable(
246             $fac->create_node(
247             '-name' => $child->get_name,
248             '-branch_length' => $child->get_branch_length,
249             '-parent' => $parent,
250             ),
251             $child,
252             $tree
253             );
254             }
255             }
256             }
257              
258             =head2 dbh()
259              
260             Returns the underlying handle through which SQL statements can be executed directly on the
261             database. This is a L object. Usage:
262              
263             my $dbh = $dbtree->dbh;
264              
265             =cut
266              
267 0     0 1 0 sub dbh { $DBH }
268              
269             =head1 TREE METHODS
270              
271             The following methods are implemented here to override methods of the same name in the
272             L hierarchy so that the tree database is accessed more efficiently than
273             otherwise would be the case.
274              
275             =head2 get_root()
276              
277             Returns the root of the tree, i.e. a L object,
278             which is a subclass of L. Usage:
279              
280             my $root = $dbtree->get_root;
281              
282             =cut
283              
284             sub get_root {
285             shift->_rs->search(
286 1     1 1 533 { 'parent' => 1 },
287             {
288             'order_by' => 'id',
289             'rows' => 1,
290             }
291             )->single
292             }
293              
294             =head2 get_id()
295              
296             Returns a dummy ID, an integer. Usage:
297              
298             my $id = $dbtree->get_id;
299              
300             =cut
301              
302 0     0 1 0 sub get_id { 0 }
303              
304             =head2 get_by_name()
305              
306             Returns the first node object that has the provided name. Usage:
307              
308             my $node = $dbtree->get_by_name( 'Homo sapiens' );
309              
310             =cut
311              
312             sub get_by_name {
313 6     6 1 48553 my ( $self, $name ) = @_;
314 6         20 return $self->_rs->search({ 'name' => $name })->single;
315             }
316              
317             =head2 visit()
318              
319             Given a code reference, visits all the nodes in the tree and executes the code on the
320             focal node. Usage:
321              
322             $dbtree->visit(sub{
323             my $node = shift;
324             print $node->name, "\n";
325             });
326              
327             =cut
328              
329             sub visit {
330 0     0 1 0 my ( $self, $code ) = @_;
331 0         0 my $rs = $self->_rs;
332 0         0 while( my $node = $rs->next ) {
333 0         0 $code->($node);
334             }
335 0         0 return $self;
336             }
337              
338 7     7   34 sub _rs { shift->resultset('Node') }
339              
340             1;
341              
342             __DATA__