File Coverage

Bio/DB/GFF/Aggregator.pm
Criterion Covered Total %
statement 116 124 93.5
branch 39 52 75.0
condition 8 18 44.4
subroutine 18 21 85.7
pod 14 17 82.3
total 195 232 84.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Bio::DB::GFF::Aggregator -- Aggregate GFF groups into composite features
4              
5             =head1 SYNOPSIS
6              
7             use Bio::DB::GFF;
8              
9             my $agg1 = Bio::DB::GFF::Aggregator->new(-method => 'cistron',
10             -main_method => 'locus',
11             -sub_parts => ['allele','variant']
12             );
13              
14             my $agg2 = Bio::DB::GFF::Aggregator->new(-method => 'splice_group',
15             -sub_parts => 'transcript');
16              
17             my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql',
18             -aggregator => [$agg1,$agg2],
19             -dsn => 'dbi:mysql:elegans42',
20             );
21              
22              
23             =head1 DESCRIPTION
24              
25             Bio::DB::GFF::Aggregator is used to aggregate GFF groups into
26             composite features. Each composite feature has a "main part", the
27             top-level feature, and a series of zero or more subparts, retrieved
28             with the sub_SeqFeature() method. The aggregator class is designed to
29             be subclassable, allowing a variety of GFF feature types to be
30             supported.
31              
32             The base Bio::DB::GFF::Aggregator class is generic, and can be used to
33             create specific instances to be passed to the -aggregator argument of
34             Bio::DB::GFF-E<gt>new() call. The various subclasses of
35             Bio::DB::GFF::Aggregator are tuned for specific common feature types
36             such as clones, gapped alignments and transcripts.
37              
38             Instances of Bio::DB::GFF::Aggregator have three attributes:
39              
40             =over 3
41              
42             =item *
43              
44             method
45              
46             This is the GFF method field of the composite feature as a whole. For
47             example, "transcript" may be used for a composite feature created by
48             aggregating individual intron, exon and UTR features.
49              
50             =item *
51              
52             main method
53              
54             Sometimes GFF groups are organized hierarchically, with one feature
55             logically containing another. For example, in the C. elegans schema,
56             methods of type "Sequence:curated" correspond to regions covered by
57             curated genes. There can be zero or one main methods.
58              
59             =item *
60              
61             subparts
62              
63             This is a list of one or more methods that correspond to the component
64             features of the aggregates. For example, in the C. elegans database,
65             the subparts of transcript are "intron", "exon" and "CDS".
66              
67             =back
68              
69             Aggregators have two main methods that can be overridden in
70             subclasses:
71              
72             =over 4
73              
74             =item *
75              
76             disaggregate()
77              
78             This method is called by the Adaptor object prior to fetching a list
79             of features. The method is passed an associative array containing the
80             [method,source] pairs that the user has requested, and it returns a
81             list of raw features that it would like the adaptor to fetch.
82              
83             =item *
84              
85             aggregate()
86              
87             This method is called by the Adaptor object after it has fetched
88             features. The method is passed a list of raw features and is expected
89             to add its composite features to the list.
90              
91             =back
92              
93             The disaggregate() and aggregate() methods provided by the base
94             Aggregator class should be sufficient for many applications. In this
95             case, it suffices for subclasses to override the following methods:
96              
97             =over 4
98              
99             =item *
100              
101             method()
102              
103             Return the default method for the composite feature as a whole.
104              
105             =item *
106              
107             main_name()
108              
109             Return the default main method name.
110              
111             =item *
112              
113             part_names()
114              
115             Return a list of subpart method names.
116              
117             =back
118              
119             Provided that method() and part_names() are overridden (and optionally
120             main_name() as well), then the bare name of the aggregator subclass
121             can be passed to the -aggregator of Bio::DB::GFF-E<gt>new(). For example,
122             this is a small subclass that will aggregate features of type "allele"
123             and "polymorphism" into an aggregate named "mutant":
124              
125             package Bio::DB::GFF::Aggregator::mutant;
126              
127             use strict;
128             use Bio::DB::GFF::Aggregator;
129              
130             use base qw(Bio::DB::GFF::Aggregator);
131              
132             sub method { 'mutant' }
133              
134             sub part_names {
135             return qw(allele polymorphism);
136             }
137              
138             1;
139              
140             Once installed, this aggregator can be passed to Bio::DB::GFF-E<gt>new()
141             by name like so:
142              
143             my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql',
144             -aggregator => 'mutant',
145             -dsn => 'dbi:mysql:elegans42',
146             );
147              
148             =head1 API
149              
150             The remainder of this document describes the public and private
151             methods implemented by this module.
152              
153             =cut
154              
155             package Bio::DB::GFF::Aggregator;
156              
157 3     3   12 use strict;
  3         3  
  3         69  
158 3     3   9 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
  3         6  
  3         105  
159 3     3   9 use Bio::DB::GFF::Feature;
  3         3  
  3         48  
160              
161 3     3   9 use base qw(Bio::Root::Root);
  3         3  
  3         5604  
162              
163             my $ALWAYS_TRUE = sub { 1 };
164              
165             =head2 new
166              
167             Title : new
168             Usage : $a = Bio::DB::GFF::Aggregator->new(@args)
169             Function: create a new aggregator
170             Returns : a Bio::DB::GFF::Aggregator object
171             Args : see below
172             Status : Public
173              
174             This is the constructor for Bio::DB::GFF::Aggregator. Named arguments
175             are as follows:
176              
177             -method the method for the composite feature
178              
179             -main_method the top-level raw feature, if any
180              
181             -sub_parts the list of raw features that will form the subparts
182             of the composite feature (array reference or scalar)
183              
184             =cut
185              
186             sub new {
187 20     20 1 79 my $class = shift;
188 20         109 my ($method,$main,$sub_parts,$whole_object) = rearrange(['METHOD',
189             ['MAIN_PART','MAIN_METHOD'],
190             ['SUB_METHODS','SUB_PARTS'],
191             'WHOLE_OBJECT'
192             ],@_);
193 20         144 return bless {
194             method => $method,
195             main_method => $main,
196             sub_parts => $sub_parts,
197             require_whole_object => $whole_object,
198             },$class;
199             }
200              
201             =head2 disaggregate
202              
203             Title : disaggregate
204             Usage : $a->disaggregate($types,$factory)
205             Function: disaggregate type list into components
206             Returns : a true value if this aggregator should be called to reaggregate
207             Args : see below
208             Status : Public
209              
210             This method is called to disaggregate a list of types into the set of
211             low-level features to be retrieved from the GFF database. The list of
212             types is passed as an array reference containing a series of
213             [method,source] pairs. This method synthesizes a new set of
214             [method,source] pairs, and appends them to the list of requested
215             types, changing the list in situ.
216              
217             Arguments:
218              
219             $types reference to an array of [method,source] pairs
220              
221             $factory reference to the Adaptor object that is calling
222             this method
223              
224             Note that the API allows disaggregate() to remove types from the type
225             list. This feature is probably not desirable and may be deprecated in
226             the future.
227              
228             =cut
229              
230             # this is called at the beginning to turn the pseudo-type
231             # into its component feature types
232             sub disaggregate {
233 218     218 1 174 my $self = shift;
234 218         167 my $types = shift;
235 218         161 my $factory = shift;
236              
237 218         321 my $sub_features = $factory->parse_types($self->get_part_names);
238 218         375 my $main_feature = $factory->parse_types($self->get_main_name);
239              
240 218 100       317 if (@$types) {
241 85         65 my (@synthetic_types,@unchanged);
242 85         105 foreach (@$types) {
243 270         295 my ($method,$source) = @$_;
244 270 100       332 if (lc $method eq lc $self->get_method) { # e.g. "transcript"
245 20   33     31 push @synthetic_types,map { [$_->[0],$_->[1] || $source] } @$sub_features,@$main_feature;
  120         377  
246             }
247             else {
248 250         311 push @unchanged,$_;
249             }
250             }
251             # remember what we're searching for
252 85         121 $self->components(\@synthetic_types);
253 85         114 $self->passthru(\@unchanged);
254 85         158 @$types = (@unchanged,@synthetic_types);
255             }
256              
257             # we get here when no search types are listed
258             else {
259 133         130 my @stypes = map { [$_->[0],$_->[1]] } @$sub_features,@$main_feature;
  1202         1338  
260 133         323 $self->components(\@stypes);
261 133         198 $self->passthru(undef);
262             }
263              
264 218         304 return $self->component_count > 0;
265             }
266              
267              
268             =head2 aggregate
269              
270             Title : aggregate
271             Usage : $features = $a->aggregate($features,$factory)
272             Function: aggregate a feature list into composite features
273             Returns : an array reference containing modified features
274             Args : see below
275             Status : Public
276              
277             This method is called to aggregate a list of raw GFF features into the
278             set of composite features. The method is called an array reference to
279             a set of Bio::DB::GFF::Feature objects. It runs through the list,
280             creating new composite features when appropriate. The method result
281             is an array reference containing the composite features.
282              
283             Arguments:
284              
285             $features reference to an array of Bio::DB::GFF::Feature objects
286              
287             $factory reference to the Adaptor object that is calling
288             this method
289              
290             NOTE: The reason that the function result contains the raw features as
291             well as the aggregated ones is to allow queries like this one:
292              
293             @features = $segment->features('exon','transcript:curated');
294              
295             Assuming that "transcript" is the name of an aggregated feature and
296             that "exon" is one of its components, we do not want the transcript
297             aggregator to remove features of type "exon" because the user asked
298             for them explicitly.
299              
300             =cut
301              
302             sub aggregate {
303 168     168 1 128 my $self = shift;
304 168         137 my $features = shift;
305 168         115 my $factory = shift;
306              
307 168         195 my $main_method = $self->get_main_name;
308 168 50       262 my $matchsub = $self->match_sub($factory) or return;
309 168         333 my $strictmatch = $self->strict_match();
310 168         278 my $passthru = $self->passthru_sub($factory);
311              
312 168         128 my (%aggregates,@result);
313 168         185 for my $feature (@$features) {
314              
315 592 100 66     881 if ($feature->group && $matchsub->($feature)) {
316 280 50       464 my $key = $strictmatch->{lc $feature->method,lc $feature->source}
317             ? join ($;,$feature->group,$feature->refseq,$feature->source)
318             : join ($;,$feature->group,$feature->refseq);
319 280 100 66     715 if ($main_method && lc $feature->method eq lc $main_method) {
320 81   33     346 $aggregates{$key}{base} ||= $feature->clone;
321             } else {
322 199         160 push @{$aggregates{$key}{subparts}},$feature;
  199         438  
323             }
324 280 50 33     928 push @result,$feature if $passthru && $passthru->($feature);
325              
326             } else {
327 312         533 push @result,$feature;
328             }
329             }
330              
331             # aggregate components
332 168         333 my $pseudo_method = $self->get_method;
333 168         319 my $require_whole_object = $self->require_whole_object;
334 168         337 foreach (keys %aggregates) {
335 107 50 33     228 if ($require_whole_object && $self->components) {
336 0 0       0 next unless $aggregates{$_}{base}; # && $aggregates{$_}{subparts};
337             }
338 107         117 my $base = $aggregates{$_}{base};
339 107 100       162 unless ($base) { # no base, so create one
340 26         33 my $first = $aggregates{$_}{subparts}[0];
341 26         55 $base = $first->clone; # to inherit parent coordinate system, etc
342 26         72 $base->score(undef);
343 26         48 $base->phase(undef);
344             }
345 107         278 $base->method($pseudo_method);
346 107         83 $base->add_subfeature($_) foreach @{$aggregates{$_}{subparts}};
  107         356  
347 107         233 $base->adjust_bounds;
348 107         192 $base->compound(1); # set the compound flag
349 107         154 push @result,$base;
350             }
351 168         647 @$features = @result;
352             }
353              
354              
355             =head2 method
356              
357             Title : method
358             Usage : $string = $a->method
359             Function: get the method type for the composite feature
360             Returns : a string
361             Args : none
362             Status : Protected
363              
364             This method is called to get the method to be assigned to the
365             composite feature once it is aggregated. It is called if the user did
366             not explicitly supply a -method argument when the aggregator was
367             created.
368              
369             This is the method that should be overridden in aggregator subclasses.
370              
371             =cut
372              
373             # default method - override in subclasses
374             sub method {
375 0     0 1 0 my $self = shift;
376 0         0 $self->{method};
377             }
378              
379             =head2 main_name
380              
381             Title : main_name
382             Usage : $string = $a->main_name
383             Function: get the method type for the "main" component of the feature
384             Returns : a string
385             Args : none
386             Status : Protected
387              
388             This method is called to get the method of the "main component" of the
389             composite feature. It is called if the user did not explicitly supply
390             a -main-method argument when the aggregator was created.
391              
392             This is the method that should be overridden in aggregator subclasses.
393              
394             =cut
395              
396             # no default main method
397             sub main_name {
398 0     0 1 0 my $self = shift;
399 0         0 return;
400             }
401              
402             =head2 part_names
403              
404             Title : part_names
405             Usage : @methods = $a->part_names
406             Function: get the methods for the non-main various components of the feature
407             Returns : a list of strings
408             Args : none
409             Status : Protected
410              
411             This method is called to get the list of methods of the "main component" of the
412             composite feature. It is called if the user did not explicitly supply
413             a -main-method argument when the aggregator was created.
414              
415             This is the method that should be overridden in aggregator subclasses.
416              
417             =cut
418              
419             # no default part names
420             sub part_names {
421 0     0 1 0 my $self = shift;
422 0         0 return;
423             }
424              
425             =head2 require_whole_object
426              
427             Title : require_whole_object
428             Usage : $bool = $a->require_whole_object
429             Function: see below
430             Returns : a boolean flag
431             Args : none
432             Status : Internal
433              
434             This method returns true if the aggregator should refuse to aggregate
435             an object unless both its main part and its subparts are present.
436              
437             =cut
438              
439             sub require_whole_object {
440 168     168 1 113 my $self = shift;
441 168         149 my $d = $self->{require_whole_object};
442 168 50       239 $self->{require_whole_object} = shift if @_;
443 168         161 $d;
444             }
445              
446             =head2 match_sub
447              
448             Title : match_sub
449             Usage : $coderef = $a->match_sub($factory)
450             Function: generate a code reference that will match desired features
451             Returns : a code reference
452             Args : see below
453             Status : Internal
454              
455             This method is used internally to generate a code sub that will
456             quickly filter out the raw features that we're interested in
457             aggregating. The returned sub accepts a Feature and returns true if
458             we should aggregate it, false otherwise.
459              
460             =cut
461              
462             #' make emacs happy
463              
464             sub match_sub {
465 168     168 1 148 my $self = shift;
466 168         139 my $factory = shift;
467 168 50       220 my $types_to_aggregate = $self->components() or return; # saved from disaggregate call
468 168 50       273 return unless @$types_to_aggregate;
469 168         309 return $factory->make_match_sub($types_to_aggregate);
470             }
471              
472             =head2 strict_match
473              
474             Title : strict_match
475             Usage : $strict = $a->strict_match
476             Function: generate a hashref that indicates which subfeatures
477             need to be tested strictly for matching sources before
478             aggregating
479             Returns : a hash ref
480             Status : Internal
481              
482             =cut
483              
484             sub strict_match {
485 168     168 1 160 my $self = shift;
486 168         230 my $types_to_aggregate = $self->components();
487 168         143 my %strict;
488 168         202 for my $t (@$types_to_aggregate) {
489 1432 50       1680 $strict{lc $t->[0],lc $t->[1]}++ if defined $t->[1];
490             }
491 168         206 \%strict;
492             }
493              
494             sub passthru_sub {
495 168     168 0 150 my $self = shift;
496 168         121 my $factory = shift;
497 168 100       226 my $passthru = $self->passthru() or return;
498 20 50       58 return unless @$passthru;
499 0         0 return $factory->make_match_sub($passthru);
500             }
501              
502             =head2 components
503              
504             Title : components
505             Usage : @array= $a->components([$components])
506             Function: get/set stored list of parsed raw feature types
507             Returns : an array in list context, an array ref in scalar context
508             Args : new arrayref of feature types
509             Status : Internal
510              
511             This method is used internally to remember the parsed list of raw
512             features that we will aggregate. The need for this subroutine is
513             seen when a user requests a composite feature of type
514             "clone:cosmid". This generates a list of components in which the
515             source is appended to the method, like "clone_left_end:cosmid" and
516             "clone_right_end:cosmid". components() stores this information for
517             later use.
518              
519             =cut
520              
521             sub components {
522 772     772 1 489 my $self = shift;
523 772         605 my $d = $self->{components};
524 772 100       1090 $self->{components} = shift if @_;
525 772 100       1895 return unless ref $d;
526 752 100       1617 return wantarray ? @$d : $d;
527             }
528              
529             sub component_count {
530 218     218 0 250 my @c = shift->components;
531 218         1032 scalar @c;
532             }
533              
534             sub passthru {
535 386     386 0 288 my $self = shift;
536 386         294 my $d = $self->{passthru};
537 386 100       539 $self->{passthru} = shift if @_;
538 386 100       759 return unless ref $d;
539 96 50       232 return wantarray ? @$d : $d;
540             }
541              
542             sub clone {
543 15     15 1 16 my $self = shift;
544 15         12 my %new = %{$self};
  15         76  
545 15         86 return bless \%new,ref($self);
546             }
547              
548             =head2 get_part_names
549              
550             Title : get_part_names
551             Usage : @array = $a->get_part_names
552             Function: get list of sub-parts for this type of feature
553             Returns : an array
554             Args : none
555             Status : Internal
556              
557             This method is used internally to fetch the list of feature types that
558             form the components of the composite feature. Type names in the
559             format "method:source" are recognized, as are "method" and
560             Bio::DB::GFF::Typename objects as well. It checks instance variables
561             first, and if not defined calls the part_names() method.
562              
563             =cut
564              
565             sub get_part_names {
566 218     218 1 171 my $self = shift;
567 218 100       351 if ($self->{sub_parts}) {
568 46 50       72 return ref $self->{sub_parts} ? @{$self->{sub_parts}} : $self->{sub_parts};
  46         103  
569             } else {
570 172         403 return $self->part_names;
571             }
572             }
573              
574             =head2 get_main_name
575              
576             Title : get_main_name
577             Usage : $string = $a->get_main_name
578             Function: get the "main" method type for this feature
579             Returns : a string
580             Args : none
581             Status : Internal
582              
583             This method is used internally to fetch the type of the "main part" of
584             the feature. It checks instance variables first, and if not defined
585             calls the main_name() method.
586              
587             =cut
588              
589             sub get_main_name {
590 386     386 1 289 my $self = shift;
591 386 100       670 return $self->{main_method} if defined $self->{main_method};
592 309         525 return $self->main_name;
593             }
594              
595             =head2 get_method
596              
597             Title : get_method
598             Usage : $string = $a->get_method
599             Function: get the method type for the composite feature
600             Returns : a string
601             Args : none
602             Status : Internal
603              
604             This method is used internally to fetch the type of the method that
605             will be assigned to the composite feature once it is synthesized.
606              
607             =cut
608              
609             sub get_method {
610 458     458 1 314 my $self = shift;
611 458 100       850 return $self->{method} if defined $self->{method};
612 302         602 return $self->method;
613             }
614              
615             1;
616              
617             =head1 BUGS
618              
619             None known yet.
620              
621             =head1 SEE ALSO
622              
623             L,
624             L,
625             L,
626             L,
627             L,
628             L,
629             L,
630             L
631              
632             =head1 AUTHOR
633              
634             Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
635              
636             Copyright (c) 2001 Cold Spring Harbor Laboratory.
637              
638             This library is free software; you can redistribute it and/or modify
639             it under the same terms as Perl itself.
640              
641             =cut
642