File Coverage

Bio/DB/GFF/Aggregator.pm
Criterion Covered Total %
statement 116 124 93.5
branch 39 52 75.0
condition 8 18 44.4
subroutine 18 21 85.7
pod 14 17 82.3
total 195 232 84.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Bio::DB::GFF::Aggregator -- Aggregate GFF groups into composite features
4              
5             =head1 SYNOPSIS
6              
7             use Bio::DB::GFF;
8              
9             my $agg1 = Bio::DB::GFF::Aggregator->new(-method => 'cistron',
10             -main_method => 'locus',
11             -sub_parts => ['allele','variant']
12             );
13              
14             my $agg2 = Bio::DB::GFF::Aggregator->new(-method => 'splice_group',
15             -sub_parts => 'transcript');
16              
17             my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql',
18             -aggregator => [$agg1,$agg2],
19             -dsn => 'dbi:mysql:elegans42',
20             );
21              
22              
23             =head1 DESCRIPTION
24              
25             Bio::DB::GFF::Aggregator is used to aggregate GFF groups into
26             composite features. Each composite feature has a "main part", the
27             top-level feature, and a series of zero or more subparts, retrieved
28             with the sub_SeqFeature() method. The aggregator class is designed to
29             be subclassable, allowing a variety of GFF feature types to be
30             supported.
31              
32             The base Bio::DB::GFF::Aggregator class is generic, and can be used to
33             create specific instances to be passed to the -aggregator argument of
34             Bio::DB::GFF-E<gt>new() call. The various subclasses of
35             Bio::DB::GFF::Aggregator are tuned for specific common feature types
36             such as clones, gapped alignments and transcripts.
37              
38             Instances of Bio::DB::GFF::Aggregator have three attributes:
39              
40             =over 3
41              
42             =item *
43              
44             method
45              
46             This is the GFF method field of the composite feature as a whole. For
47             example, "transcript" may be used for a composite feature created by
48             aggregating individual intron, exon and UTR features.
49              
50             =item *
51              
52             main method
53              
54             Sometimes GFF groups are organized hierarchically, with one feature
55             logically containing another. For example, in the C. elegans schema,
56             methods of type "Sequence:curated" correspond to regions covered by
57             curated genes. There can be zero or one main methods.
58              
59             =item *
60              
61             subparts
62              
63             This is a list of one or more methods that correspond to the component
64             features of the aggregates. For example, in the C. elegans database,
65             the subparts of transcript are "intron", "exon" and "CDS".
66              
67             =back
68              
69             Aggregators have two main methods that can be overridden in
70             subclasses:
71              
72             =over 4
73              
74             =item *
75              
76             disaggregate()
77              
78             This method is called by the Adaptor object prior to fetching a list
79             of features. The method is passed an associative array containing the
80             [method,source] pairs that the user has requested, and it returns a
81             list of raw features that it would like the adaptor to fetch.
82              
83             =item *
84              
85             aggregate()
86              
87             This method is called by the Adaptor object after it has fetched
88             features. The method is passed a list of raw features and is expected
89             to add its composite features to the list.
90              
91             =back
92              
93             The disaggregate() and aggregate() methods provided by the base
94             Aggregator class should be sufficient for many applications. In this
95             case, it suffices for subclasses to override the following methods:
96              
97             =over 4
98              
99             =item *
100              
101             method()
102              
103             Return the default method for the composite feature as a whole.
104              
105             =item *
106              
107             main_name()
108              
109             Return the default main method name.
110              
111             =item *
112              
113             part_names()
114              
115             Return a list of subpart method names.
116              
117             =back
118              
119             Provided that method() and part_names() are overridden (and optionally
120             main_name() as well), then the bare name of the aggregator subclass
121             can be passed to the -aggregator argument of Bio::DB::GFF-E<gt>new(). For example,
122             this is a small subclass that will aggregate features of type "allele"
123             and "polymorphism" into an aggregate named "mutant":
124              
125             package Bio::DB::GFF::Aggregator::mutant;
126              
127             use strict;
128             use Bio::DB::GFF::Aggregator;
129              
130             use base qw(Bio::DB::GFF::Aggregator);
131              
132             sub method { 'mutant' }
133              
134             sub part_names {
135             return qw(allele polymorphism);
136             }
137              
138             1;
139              
140             Once installed, this aggregator can be passed to Bio::DB::GFF-E<gt>new()
141             by name like so:
142              
143             my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql',
144             -aggregator => 'mutant',
145             -dsn => 'dbi:mysql:elegans42',
146             );
147              
148             =head1 API
149              
150             The remainder of this document describes the public and private
151             methods implemented by this module.
152              
153             =cut
154              
155             package Bio::DB::GFF::Aggregator;
156              
157 3     3   15 use strict;
  3         6  
  3         69  
158 3     3   12 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
  3         3  
  3         108  
159 3     3   12 use Bio::DB::GFF::Feature;
  3         3  
  3         51  
160              
161 3     3   12 use base qw(Bio::Root::Root);
  3         3  
  3         2829  
162              
163             my $ALWAYS_TRUE = sub { 1 };
164              
165             =head2 new
166              
167             Title : new
168             Usage : $a = Bio::DB::GFF::Aggregator->new(@args)
169             Function: create a new aggregator
170             Returns : a Bio::DB::GFF::Aggregator object
171             Args : see below
172             Status : Public
173              
174             This is the constructor for Bio::DB::GFF::Aggregator. Named arguments
175             are as follows:
176              
177             -method the method for the composite feature
178              
179             -main_method the top-level raw feature, if any
180              
181             -sub_parts the list of raw features that will form the subparts
182             of the composite feature (array reference or scalar)
183              
184             =cut
185              
186             sub new {
187 20     20 1 89 my $class = shift;
188 20         120 my ($method,$main,$sub_parts,$whole_object) = rearrange(['METHOD',
189             ['MAIN_PART','MAIN_METHOD'],
190             ['SUB_METHODS','SUB_PARTS'],
191             'WHOLE_OBJECT'
192             ],@_);
193 20         155 return bless {
194             method => $method,
195             main_method => $main,
196             sub_parts => $sub_parts,
197             require_whole_object => $whole_object,
198             },$class;
199             }
200              
201             =head2 disaggregate
202              
203             Title : disaggregate
204             Usage : $a->disaggregate($types,$factory)
205             Function: disaggregate type list into components
206             Returns : a true value if this aggregator should be called to reaggregate
207             Args : see below
208             Status : Public
209              
210             This method is called to disaggregate a list of types into the set of
211             low-level features to be retrieved from the GFF database. The list of
212             types is passed as an array reference containing a series of
213             [method,source] pairs. This method synthesizes a new set of
214             [method,source] pairs, and appends them to the list of requested
215             types, changing the list in situ.
216              
217             Arguments:
218              
219             $types reference to an array of [method,source] pairs
220              
221             $factory reference to the Adaptor object that is calling
222             this method
223              
224             Note that the API allows disaggregate() to remove types from the type
225             list. This feature is probably not desirable and may be deprecated in
226             the future.
227              
228             =cut
229              
230             # this is called at the beginning to turn the pseudo-type
231             # into its component feature types
232             sub disaggregate {
233 218     218 1 237 my $self = shift;
234 218         188 my $types = shift;
235 218         203 my $factory = shift;
236              
237 218         333 my $sub_features = $factory->parse_types($self->get_part_names);
238 218         437 my $main_feature = $factory->parse_types($self->get_main_name);
239              
240 218 100       317 if (@$types) {
241 85         85 my (@synthetic_types,@unchanged);
242 85         122 foreach (@$types) {
243 270         342 my ($method,$source) = @$_;
244 270 100       355 if (lc $method eq lc $self->get_method) { # e.g. "transcript"
245 20   33     37 push @synthetic_types,map { [$_->[0],$_->[1] || $source] } @$sub_features,@$main_feature;
  120         326  
246             }
247             else {
248 250         351 push @unchanged,$_;
249             }
250             }
251             # remember what we're searching for
252 85         154 $self->components(\@synthetic_types);
253 85         142 $self->passthru(\@unchanged);
254 85         149 @$types = (@unchanged,@synthetic_types);
255             }
256              
257             # we get here when no search types are listed
258             else {
259 133         183 my @stypes = map { [$_->[0],$_->[1]] } @$sub_features,@$main_feature;
  1202         1720  
260 133         318 $self->components(\@stypes);
261 133         222 $self->passthru(undef);
262             }
263              
264 218         288 return $self->component_count > 0;
265             }
266              
267              
268             =head2 aggregate
269              
270             Title : aggregate
271             Usage : $features = $a->aggregate($features,$factory)
272             Function: aggregate a feature list into composite features
273             Returns : an array reference containing modified features
274             Args : see below
275             Status : Public
276              
277             This method is called to aggregate a list of raw GFF features into the
278             set of composite features. The method is passed an array reference to
279             a set of Bio::DB::GFF::Feature objects. It runs through the list,
280             creating new composite features when appropriate. The method result
281             is an array reference containing the composite features.
282              
283             Arguments:
284              
285             $features reference to an array of Bio::DB::GFF::Feature objects
286              
287             $factory reference to the Adaptor object that is calling
288             this method
289              
290             NOTE: The reason that the function result contains the raw features as
291             well as the aggregated ones is to allow queries like this one:
292              
293             @features = $segment->features('exon','transcript:curated');
294              
295             Assuming that "transcript" is the name of an aggregated feature and
296             that "exon" is one of its components, we do not want the transcript
297             aggregator to remove features of type "exon" because the user asked
298             for them explicitly.
299              
300             =cut
301              
302             sub aggregate {
303 168     168 1 202 my $self = shift;
304 168         183 my $features = shift;
305 168         154 my $factory = shift;
306              
307 168         256 my $main_method = $self->get_main_name;
308 168 50       308 my $matchsub = $self->match_sub($factory) or return;
309 168         330 my $strictmatch = $self->strict_match();
310 168         242 my $passthru = $self->passthru_sub($factory);
311              
312 168         197 my (%aggregates,@result);
313 168         209 for my $feature (@$features) {
314              
315 592 100 66     1001 if ($feature->group && $matchsub->($feature)) {
316 280 50       576 my $key = $strictmatch->{lc $feature->method,lc $feature->source}
317             ? join ($;,$feature->group,$feature->refseq,$feature->source)
318             : join ($;,$feature->group,$feature->refseq);
319 280 100 66     703 if ($main_method && lc $feature->method eq lc $main_method) {
320 81   33     341 $aggregates{$key}{base} ||= $feature->clone;
321             } else {
322 199         206 push @{$aggregates{$key}{subparts}},$feature;
  199         431  
323             }
324 280 50 33     929 push @result,$feature if $passthru && $passthru->($feature);
325              
326             } else {
327 312         636 push @result,$feature;
328             }
329             }
330              
331             # aggregate components
332 168         314 my $pseudo_method = $self->get_method;
333 168         244 my $require_whole_object = $self->require_whole_object;
334 168         299 foreach (keys %aggregates) {
335 107 50 33     182 if ($require_whole_object && $self->components) {
336 0 0       0 next unless $aggregates{$_}{base}; # && $aggregates{$_}{subparts};
337             }
338 107         130 my $base = $aggregates{$_}{base};
339 107 100       171 unless ($base) { # no base, so create one
340 26         43 my $first = $aggregates{$_}{subparts}[0];
341 26         65 $base = $first->clone; # to inherit parent coordinate system, etc
342 26         70 $base->score(undef);
343 26         60 $base->phase(undef);
344             }
345 107         344 $base->method($pseudo_method);
346 107         144 $base->add_subfeature($_) foreach @{$aggregates{$_}{subparts}};
  107         317  
347 107         298 $base->adjust_bounds;
348 107         225 $base->compound(1); # set the compound flag
349 107         171 push @result,$base;
350             }
351 168         622 @$features = @result;
352             }
353              
354              
355             =head2 method
356              
357             Title : method
358             Usage : $string = $a->method
359             Function: get the method type for the composite feature
360             Returns : a string
361             Args : none
362             Status : Protected
363              
364             This method is called to get the method to be assigned to the
365             composite feature once it is aggregated. It is called if the user did
366             not explicitly supply a -method argument when the aggregator was
367             created.
368              
369             This is the method that should be overridden in aggregator subclasses.
370              
371             =cut
372              
373             # default method - override in subclasses
374             sub method {
375 0     0 1 0 my $self = shift;
376 0         0 $self->{method};
377             }
378              
379             =head2 main_name
380              
381             Title : main_name
382             Usage : $string = $a->main_name
383             Function: get the method type for the "main" component of the feature
384             Returns : a string
385             Args : none
386             Status : Protected
387              
388             This method is called to get the method of the "main component" of the
389             composite feature. It is called if the user did not explicitly supply
390             a -main_method argument when the aggregator was created.
391              
392             This is the method that should be overridden in aggregator subclasses.
393              
394             =cut
395              
396             # no default main method
397             sub main_name {
398 0     0 1 0 my $self = shift;
399 0         0 return;
400             }
401              
402             =head2 part_names
403              
404             Title : part_names
405             Usage : @methods = $a->part_names
406             Function: get the methods for the various non-main components of the feature
407             Returns : a list of strings
408             Args : none
409             Status : Protected
410              
411             This method is called to get the list of methods of the subparts of the
412             composite feature. It is called if the user did not explicitly supply
413             a -sub_parts argument when the aggregator was created.
414              
415             This is the method that should be overridden in aggregator subclasses.
416              
417             =cut
418              
419             # no default part names
420             sub part_names {
421 0     0 1 0 my $self = shift;
422 0         0 return;
423             }
424              
425             =head2 require_whole_object
426              
427             Title : require_whole_object
428             Usage : $bool = $a->require_whole_object
429             Function: see below
430             Returns : a boolean flag
431             Args : an optional new value for the flag (the previous value is returned)
432             Status : Internal
433              
434             This method returns true if the aggregator should refuse to aggregate
435             an object unless both its main part and its subparts are present.
436              
437             =cut
438              
439             sub require_whole_object {
440 168     168 1 167 my $self = shift;
441 168         183 my $d = $self->{require_whole_object};
442 168 50       265 $self->{require_whole_object} = shift if @_;
443 168         201 $d;
444             }
445              
446             =head2 match_sub
447              
448             Title : match_sub
449             Usage : $coderef = $a->match_sub($factory)
450             Function: generate a code reference that will match desired features
451             Returns : a code reference
452             Args : see below
453             Status : Internal
454              
455             This method is used internally to generate a code sub that will
456             quickly filter out the raw features that we're interested in
457             aggregating. The returned sub accepts a Feature and returns true if
458             we should aggregate it, false otherwise.
459              
460             =cut
461              
462             #' make emacs happy
463              
464             sub match_sub {
465 168     168 1 187 my $self = shift;
466 168         153 my $factory = shift;
467 168 50       200 my $types_to_aggregate = $self->components() or return; # saved from disaggregate call
468 168 50       243 return unless @$types_to_aggregate;
469 168         361 return $factory->make_match_sub($types_to_aggregate);
470             }
471              
472             =head2 strict_match
473              
474             Title : strict_match
475             Usage : $strict = $a->strict_match
476             Function: generate a hashref that indicates which subfeatures
477             need to be tested strictly for matching sources before
478             aggregating
479             Returns : a hash ref
480             Status : Internal
481              
482             =cut
483              
484             sub strict_match {
485 168     168 1 167 my $self = shift;
486 168         204 my $types_to_aggregate = $self->components();
487 168         165 my %strict;
488 168         212 for my $t (@$types_to_aggregate) {
489 1432 50       1735 $strict{lc $t->[0],lc $t->[1]}++ if defined $t->[1];
490             }
491 168         234 \%strict;
492             }
493              
494             sub passthru_sub {
495 168     168 0 173 my $self = shift;
496 168         161 my $factory = shift;
497 168 100       206 my $passthru = $self->passthru() or return;
498 20 50       54 return unless @$passthru;
499 0         0 return $factory->make_match_sub($passthru);
500             }
501              
502             =head2 components
503              
504             Title : components
505             Usage : @array= $a->components([$components])
506             Function: get/set stored list of parsed raw feature types
507             Returns : an array in list context, an array ref in scalar context
508             Args : new arrayref of feature types
509             Status : Internal
510              
511             This method is used internally to remember the parsed list of raw
512             features that we will aggregate. The need for this subroutine is
513             seen when a user requests a composite feature of type
514             "clone:cosmid". This generates a list of components in which the
515             source is appended to the method, like "clone_left_end:cosmid" and
516             "clone_right_end:cosmid". components() stores this information for
517             later use.
518              
519             =cut
520              
521             sub components {
522 772     772 1 702 my $self = shift;
523 772         745 my $d = $self->{components};
524 772 100       1090 $self->{components} = shift if @_;
525 772 100       1046 return unless ref $d;
526 752 100       1461 return wantarray ? @$d : $d;
527             }
528              
529             sub component_count {
530 218     218 0 266 my @c = shift->components;
531 218         1048 scalar @c;
532             }
533              
534             sub passthru {
535 386     386 0 366 my $self = shift;
536 386         386 my $d = $self->{passthru};
537 386 100       580 $self->{passthru} = shift if @_;
538 386 100       709 return unless ref $d;
539 96 50       218 return wantarray ? @$d : $d;
540             }
541              
542             sub clone {
543 15     15 1 18 my $self = shift;
544 15         17 my %new = %{$self};
  15         61  
545 15         89 return bless \%new,ref($self);
546             }
547              
548             =head2 get_part_names
549              
550             Title : get_part_names
551             Usage : @array = $a->get_part_names
552             Function: get list of sub-parts for this type of feature
553             Returns : an array
554             Args : none
555             Status : Internal
556              
557             This method is used internally to fetch the list of feature types that
558             form the components of the composite feature. Type names in the
559             format "method:source" are recognized, as are "method" and
560             Bio::DB::GFF::Typename objects as well. It checks instance variables
561             first, and if not defined calls the part_names() method.
562              
563             =cut
564              
565             sub get_part_names {
566 218     218 1 209 my $self = shift;
567 218 100       330 if ($self->{sub_parts}) {
568 46 50       85 return ref $self->{sub_parts} ? @{$self->{sub_parts}} : $self->{sub_parts};
  46         137  
569             } else {
570 172         354 return $self->part_names;
571             }
572             }
573              
574             =head2 get_main_name
575              
576             Title : get_main_name
577             Usage : $string = $a->get_main_name
578             Function: get the "main" method type for this feature
579             Returns : a string
580             Args : none
581             Status : Internal
582              
583             This method is used internally to fetch the type of the "main part" of
584             the feature. It checks instance variables first, and if not defined
585             calls the main_name() method.
586              
587             =cut
588              
589             sub get_main_name {
590 386     386 1 381 my $self = shift;
591 386 100       708 return $self->{main_method} if defined $self->{main_method};
592 309         526 return $self->main_name;
593             }
594              
595             =head2 get_method
596              
597             Title : get_method
598             Usage : $string = $a->get_method
599             Function: get the method type for the composite feature
600             Returns : a string
601             Args : none
602             Status : Internal
603              
604             This method is used internally to fetch the type of the method that
605             will be assigned to the composite feature once it is synthesized.
606              
607             =cut
608              
609             sub get_method {
610 458     458 1 411 my $self = shift;
611 458 100       817 return $self->{method} if defined $self->{method};
612 302         491 return $self->method;
613             }
614              
615             1;
616              
617             =head1 BUGS
618              
619             None known yet.
620              
621             =head1 SEE ALSO
622              
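623             L<Bio::DB::GFF>,
624             L<Bio::DB::GFF::Aggregator::alignment>,
625             L<Bio::DB::GFF::Aggregator::clone>,
626             L<Bio::DB::GFF::Aggregator::coding>,
627             L<Bio::DB::GFF::Aggregator::match>,
628             L<Bio::DB::GFF::Aggregator::processed_transcript>,
629             L<Bio::DB::GFF::Aggregator::transcript>,
630             L<Bio::DB::GFF::Aggregator::none>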
631              
632             =head1 AUTHOR
633              
634             Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
635              
636             Copyright (c) 2001 Cold Spring Harbor Laboratory.
637              
638             This library is free software; you can redistribute it and/or modify
639             it under the same terms as Perl itself.
640              
641             =cut
642