File Coverage

Bio/DB/GFF.pm
Criterion Covered Total %
statement 511 710 71.9
branch 198 362 54.7
condition 75 135 55.5
subroutine 73 119 61.3
pod 67 79 84.8
total 924 1405 65.7


line stmt bran cond sub pod time code
1              
2             =head1 NAME
3              
4             Bio::DB::GFF -- Storage and retrieval of sequence annotation data
5              
6             =head1 SYNOPSIS
7              
8             use Bio::DB::GFF;
9              
10             # Open the sequence database
11             my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt',
12             -dsn => 'dbi:mysql:elegans');
13              
14             # fetch a 1 megabase segment of sequence starting at landmark "ZK909"
15             my $segment = $db->segment('ZK909', 1 => 1000000);
16              
17             # pull out all transcript features
18             my @transcripts = $segment->features('transcript');
19              
20             # for each transcript, total the length of the introns
21             my %totals;
22             for my $t (@transcripts) {
23             my @introns = $t->Intron;
24             $totals{$t->name} += $_->length foreach @introns;
25             }
26              
27             # Sort the exons of the first transcript by position
28             my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon;
29              
30             # Get a region 1000 bp upstream of first exon
31             my $upstream = $exons[0]->subseq(-1000,0);
32              
33             # get its DNA
34             my $dna = $upstream->seq;
35              
36             # and get all curated polymorphisms inside it
37             @polymorphisms = $upstream->contained_features('polymorphism:curated');
38              
39             # get all feature types in the database
40             my @types = $db->types;
41              
42             # count all feature types in the segment
43             my %type_counts = $segment->types(-enumerate=>1);
44              
45             # get an iterator on all curated features of type 'exon' or 'intron'
46             my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']);
47              
48             while (my $s = $iterator->next_seq) {
49             print $s,"\n";
50             }
51              
52             # find all transcripts annotated as having function 'kinase'
53             my $iterator = $db->get_seq_stream(-type=>'transcript',
54             -attributes=>{Function=>'kinase'});
55             while (my $s = $iterator->next_seq) {
56             print $s,"\n";
57             }
58              
59             =head1 DESCRIPTION
60              
61             Bio::DB::GFF provides fast indexed access to a sequence annotation
62             database. It supports multiple database types (ACeDB, relational),
63             and multiple schemas through a system of adaptors and aggregators.
64              
65             The following operations are supported by this module:
66              
67             - retrieving a segment of sequence based on the ID of a landmark
68             - retrieving the DNA from that segment
69             - finding all annotations that overlap with the segment
70             - finding all annotations that are completely contained within the
71             segment
72             - retrieving all annotations of a particular type, either within a
73             segment, or globally
74             - conversion from absolute to relative coordinates and back again,
75             using any arbitrary landmark for the relative coordinates
76             - using a sequence segment to create new segments based on relative
77             offsets
78              
79             The data model used by Bio::DB::GFF is compatible with the GFF flat
80             file format (L<http://www.sequenceontology.org/gff3.shtml>). The module
81             can load a set of GFF files into the database, and serves objects that
82             have methods corresponding to GFF fields.
83              
84             The objects returned by Bio::DB::GFF are compatible with the
85             SeqFeatureI interface, allowing their use by the Bio::Graphics and
86             Bio::DAS modules.
87              
88             =head2 Auxiliary Scripts
89              
90             The bioperl distribution includes several scripts that make it easier
91             to work with Bio::DB::GFF databases. They are located in the scripts
92             directory under a subdirectory named Bio::DB::GFF:
93              
94             =over 4
95              
96             =item *
97              
98             bp_load_gff.pl
99              
100             This script will load a Bio::DB::GFF database from a flat GFF file of
101             sequence annotations. Only the relational database version of
102             Bio::DB::GFF is supported. It can be used to create the database from
103             scratch, as well as to incrementally load new data.
104              
105             This script takes a --fasta argument to load raw DNA into the database
106             as well. However, GFF databases do not require access to the raw DNA
107             for most of their functionality.
108              
109             bp_load_gff.pl also has a --upgrade option, which will perform a
110             non-destructive upgrade of older schemas to newer ones.
111              
112             =item *
113              
114             bp_bulk_load_gff.pl
115              
116             This script will populate a Bio::DB::GFF database from a flat GFF file
117             of sequence annotations. Only the MySQL database version of
118             Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in
119             order to accelerate loading considerably; however, it can only be used
120             for the initial load, and not for updates.
121              
122             This script takes a --fasta argument to load raw DNA into the database
123             as well. However, GFF databases do not require access to the raw DNA
124             for most of their functionality.
125              
126             =item *
127              
128             bp_fast_load_gff.pl
129              
130             This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe
131             tricks to allow for incremental updates. It only supports the MySQL
132             database version of Bio::DB::GFF and is guaranteed not to work on
133             non-Unix platforms.
134              
135             Arguments are the same as bp_load_gff.pl
136              
137             =item *
138              
139             gadfly_to_gff.pl
140              
141             This script will convert the GFF-like format used by the Berkeley
142             Drosophila Sequencing project into a format suitable for use with this
143             module.
144              
145             =item *
146              
147             sgd_to_gff.pl
148              
149             This script will convert the tab-delimited feature files used by the
150             Saccharomyces Genome Database into a format suitable for use with this
151             module.
152              
153             =back
154              
155             =head2 GFF Fundamentals
156              
157             The GFF format is a flat tab-delimited file, each line of which
158             corresponds to an annotation, or feature. Each line has nine columns
159             and looks like this:
160              
161             Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7"
162              
163             The 9 columns are as follows:
164              
165             =over 4
166              
167             =item 1.
168              
169             reference sequence
170              
171             This is the ID of the sequence that is used to establish the
172             coordinate system of the annotation. In the example above, the
173             reference sequence is "Chr1".
174              
175             =item 2.
176              
177             source
178              
179             The source of the annotation. This field describes how the annotation
180             was derived. In the example above, the source is "curated" to
181             indicate that the feature is the result of human curation. The names
182             and versions of software programs are often used for the source field,
183             as in "tRNAScan-SE/1.2".
184              
185             =item 3.
186              
187             method
188              
189             The annotation method. This field describes the type of the
190             annotation, such as "CDS". Together the method and source describe
191             the annotation type.
192              
193             =item 4.
194              
195             start position
196              
197             The start of the annotation relative to the reference sequence.
198              
199             =item 5.
200              
201             stop position
202              
203             The stop of the annotation relative to the reference sequence. Start
204             is always less than or equal to stop.
205              
206             =item 6.
207              
208             score
209              
210             For annotations that are associated with a numeric score (for example,
211             a sequence similarity), this field describes the score. The score
212             units are completely unspecified, but for sequence similarities, it is
213             typically percent identity. Annotations that don't have a score can
214             use "."
215              
216             =item 7.
217              
218             strand
219              
220             For those annotations which are strand-specific, this field is the
221             strand on which the annotation resides. It is "+" for the forward
222             strand, "-" for the reverse strand, or "." for annotations that are
223             not stranded.
224              
225             =item 8.
226              
227             phase
228              
229             For annotations that are linked to proteins, this field describes the
230             phase of the annotation on the codons. It is a number from 0 to 2, or
231             "." for features that have no phase.
232              
233             =item 9.
234              
235             group
236              
237             GFF provides a simple way of generating annotation hierarchies ("is
238             composed of" relationships) by providing a group field. The group
239             field contains the class and ID of an annotation which is the logical
240             parent of the current one. In the example given above, the group is
241             the Transcript named "R119.7".
242              
243             The group field is also used to store information about the target of
244             sequence similarity hits, and miscellaneous notes. See the next
245             section for a description of how to describe similarity targets.
246              
247             The format of the group fields is "Class ID" with a single space (not
248             a tab) separating the class from the ID. It is VERY IMPORTANT to
249             follow this format, or grouping will not work properly.
250              
251             =back
252              
253             The sequences used to establish the coordinate system for annotations
254             can correspond to sequenced clones, clone fragments, contigs or
255             super-contigs. Thus, this module can be used throughout the lifecycle
256             of a sequencing project.
257              
258             In addition to a group ID, the GFF format allows annotations to have a
259             group class. For example, in the ACeDB representation, RNA
260             interference experiments have a class of "RNAi" and an ID that is
261             unique among the RNAi experiments. Since not all databases support
262             this notion, the class is optional in all calls to this module, and
263             defaults to "Sequence" when not provided.
264              
265             Double-quotes are sometimes used in GFF files around components of the
266             group field. Strictly, this is only necessary if the group name or
267             class contains whitespace.
268              
269             =head2 Making GFF files work with this module
270              
271             Some annotations do not need to be individually named. For example,
272             it is probably not useful to assign a unique name to each ALU repeat
273             in a vertebrate genome. Others, such as predicted genes, correspond
274             to named biological objects; you probably want to be able to fetch the
275             positions of these objects by referring to them by name.
276              
277             To accommodate named annotations, the GFF format places the object
278             class and name in the group field. The name identifies the object,
279             and the class prevents similarly-named objects, for example clones and
280             sequences, from colliding.
281              
282             A named object is shown in the following excerpt from a GFF file:
283              
284             Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2
285              
286             This object is a predicted transcript named Y95B8A.2. In this case,
287             the group field is used to identify the class and name of the object,
288             even though no other annotation belongs to that group.
289              
290             It now becomes possible to retrieve the region of the genome covered
291             by transcript Y95B8A.2 using the segment() method:
292              
293             $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2');
294              
295             It is not necessary for the annotation's method to correspond to the
296             object class, although this is commonly the case.
297              
298             As explained above, each annotation in a GFF file refers to a
299             reference sequence. It is important that each reference sequence also
300             be identified by a line in the GFF file. This allows the Bio::DB::GFF
301             module to determine the length and class of the reference sequence,
302             and makes it possible to do relative arithmetic.
303              
304             For example, if "Chr1" is used as a reference sequence, then it should
305             have an entry in the GFF file similar to this one:
306              
307             Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1
308              
309             This indicates that the reference sequence named "Chr1" has length
310             14972282 bp, method "chromosome" and source "assembly". In addition,
311             as indicated by the group field, Chr1 has class "Sequence" and name
312             "Chr1".
313              
314             The object class "Sequence" is used by default when the class is not
315             specified in the segment() call. This allows you to use a shortcut
316             form of the segment() method:
317              
318             $segment = $db->segment('Chr1'); # whole chromosome
319             $segment = $db->segment('Chr1',1=>1000); # first 1000 bp
320              
321             For your convenience, if, during loading a GFF file, Bio::DB::GFF
322             encounters a line like the following:
323              
324             ##sequence-region Chr1 1 14972282
325              
326             It will automatically generate the following entry:
327              
328             Chr1 reference Component 1 14972282 . + . Sequence Chr1
329              
330             This is sufficient to use Chr1 as a reference point.
331             The ##sequence-region line is frequently found in the GFF files
332             distributed by annotation groups.
333              
334             =head2 Specifying the group tag
335              
336             A frequent problem with GFF files is the problem of distinguishing
337             which of the several tag/value pairs in the 9th column is the grouping
338             pair. Ordinarily the first tag will be used for grouping, but some
339             GFF manipulating tools do not preserve the order of attributes. To
340             eliminate this ambiguity, this module provides two ways of explicitly
341             specifying which tag to group on:
342              
343             =over 4
344              
345             =item *
346              
347             Using -preferred_groups
348              
349             When you create a Bio::DB::GFF object, pass it a -preferred_groups=E<gt>
350             argument. This specifies a tag that will be used for grouping. You
351             can pass an array reference to specify a list of such tags.
352              
353             =item *
354              
355             In the GFF header
356              
357             The GFF file itself can specify which tags are to be used for
358             grouping. Insert a comment like the following:
359              
360             ##group-tags Accession Locus
361              
362             This says to use the Accession tag for grouping. If it is not
363             available, use the Locus tag. If neither tag is available, use the
364             first pair to appear.
365              
366             =back
367              
368             These options only apply when B<loading> a GFF file into the database,
369             and have no effect on existing databases.
370              
371             The group-tags comment in the GFF file will *override* the preferred
372             groups set when you create the Bio::DB::GFF object.
373              
374             For backward compatibility, the tags Sequence and Transcript are
375             always treated as grouping tags unless preferred_groups are specified.
376             The "Target" tag is always used for grouping regardless of the
377             preferred_groups() setting, and the tags "tstart", "tend" and "Note"
378             cannot be used for grouping. These are historical artefacts coming
379             from various interpretations of GFF2, and cannot be changed.
380              
381             =head2 Sequence alignments
382              
383             There are two cases in which an annotation indicates the relationship
384             between two sequences. The first case is a similarity hit, where the
385             annotation indicates an alignment. The second case is a map assembly,
386             in which the annotation indicates that a portion of a larger sequence
387             is built up from one or more smaller ones.
388              
389             Both cases are indicated by using the B<Target> tag in the group
390             field. For example, a typical similarity hit will look like this:
391              
392             Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544
393              
394             The group field contains the Target tag, followed by an identifier for
395             the biological object referred to. The GFF format uses the notation
396             I<Class>:I<Name> for the biological object, and even though this is
397             stylistically inconsistent, that's the way it's done. The object
398             identifier is followed by two integers indicating the start and stop
399             of the alignment on the target sequence.
400              
401             Unlike the main start and stop columns, it is possible for the target
402             start to be greater than the target end. The previous example
403             indicates that the section of Chr1 from 76,953 to 77,108 aligns to
404             the protein SW:ABL_DROME starting at position 493 and extending to
405             position 544.
406              
407             A similar notation is used for sequence assembly information as shown
408             in this example:
409              
410             Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826
411             LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742
412              
413             This indicates that the region between bases 10922906 and 11177731 of
414             Chr1 is composed of LINK_H06O01 from bp 1 to bp 254826. The region
415             of LINK_H06O01 between 32386 and 64122 is, in turn, composed of the
416             bases 6 to 31742 of cosmid F49B2.
417              
418             =head2 Attributes
419              
420             While not intended to serve as a general-purpose sequence database
421             (see bioperl-db for that), GFF allows you to tag features with
422             arbitrary attributes. Attributes appear in the Group field following
423             the initial class/name pair. For example:
424              
425             Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3
426              
427             This line tags the feature named Transcript Y95B8A.2 as being "Gene"
428             named sma-3 and having the Alias "sma3". Features having these
429             attributes can be looked up using the fetch_feature_by_attribute() method.
430              
431             Two attributes have special meaning: "Note" is for backward
432             compatibility and is used for unstructured text remarks. "Alias" is
433             considered as a synonym for the feature name and will be consulted
434             when looking up a feature by its name.
435              
436             =head2 Adaptors and Aggregators
437              
438             This module uses a system of adaptors and aggregators in order to make
439             it adaptable to use with a variety of databases.
440              
441             =over 4
442              
443             =item *
444              
445             Adaptors
446              
447             The core of the module handles the user API, annotation coordinate
448             arithmetic, and other common issues. The details of fetching
449             information from databases is handled by an adaptor, which is
450             specified during Bio::DB::GFF construction. The adaptor encapsulates
451             database-specific information such as the schema, user authentication
452             and access methods.
453              
454             There are currently five adaptors recommended for general use:
455              
456             Adaptor Name Description
457             ------------ -----------
458              
459             memory A simple in-memory database suitable for testing
460             and small data sets.
461              
462             berkeleydb An indexed file database based on the DB_File module,
463             suitable for medium-sized read-only data sets.
464              
465             dbi::mysql An interface to a schema implemented in the Mysql
466             relational database management system.
467              
468             dbi::oracle An interface to a schema implemented in the Oracle
469             relational database management system.
470              
471             dbi::pg An interface to a schema implemented in the PostgreSQL
472             relational database management system.
473              
474             Check the Bio/DB/GFF/Adaptor directory and subdirectories for other,
475             more specialized adaptors, as well as experimental ones.
476              
477             =item *
478              
479             Aggregators
480              
481             The GFF format uses a "group" field to indicate aggregation properties
482             of individual features. For example, a set of exons and introns may
483             share a common transcript group, and multiple transcripts may share
484             the same gene group.
485              
486             Aggregators are small modules that use the group information to
487             rebuild the hierarchy. When a Bio::DB::GFF object is created, you
488             indicate that it use a set of one or more aggregators. Each
489             aggregator provides a new composite annotation type. Before the
490             database query is generated each aggregator is called to
491             "disaggregate" its annotation type into a list of component types
492             contained in the database. After the query is generated, each
493             aggregator is called again in order to build composite annotations
494             from the returned components.
495              
496             For example, during disaggregation, the standard
497             "processed_transcript" aggregator generates a list of component
498             feature types including "UTR", "CDS", and "polyA_site". Later, it
499             aggregates these features into a set of annotations of type
500             "processed_transcript".
501              
502             During aggregation, the list of aggregators is called in reverse
503             order. This allows aggregators to collaborate to create multi-level
504             structures: the transcript aggregator assembles transcripts from
505             introns and exons; the gene aggregator then assembles genes from sets
506             of transcripts.
507              
508             Three default aggregators are provided:
509              
510             transcript assembles transcripts from features of type
511             exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA
512             clone assembles clones from Clone_left_end, Clone_right_end
513             and Sequence features.
514             alignment assembles gapped alignments from features of type
515             "similarity".
516              
517             In addition, this module provides the optional "wormbase_gene"
518             aggregator, which accommodates the WormBase representation of genes.
519             This aggregator aggregates features of method "exon", "CDS", "5'UTR",
520             "3'UTR", "polyA" and "TSS" into a single object. It also expects to
521             find a single feature of type "Sequence" that spans the entire gene.
522              
523             The existing aggregators are easily customized.
524              
525             Note that aggregation will not occur unless you specifically request
526             the aggregation type. For example, this call:
527              
528             @features = $segment->features('alignment');
529              
530             will generate an array of aggregated alignment features. However,
531             this call:
532              
533             @features = $segment->features();
534              
535             will return a list of unaggregated similarity segments.
536              
537             For more information, see the manual pages for
538             Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone,
539             etc.
540              
541             =back
542              
543             =head2 Loading GFF3 Files
544              
545             This module will accept GFF3 files, as described at
546             http://song.sourceforge.net/gff3.shtml. However, the implementation
547             has some limitations.
548              
549             =over 4
550              
551             =item GFF version string is required
552              
553             The GFF file B<must> contain the version comment:
554              
555             ##gff-version 3
556              
557             Unless this version string is present at the top of the GFF file, the
558             loader will attempt to parse the file in GFF2 format, with
559             less-than-desirable results.
560              
561             =item Only one level of nesting allowed
562              
563             A major restriction is that Bio::DB::GFF only allows one level of
564             nesting of features. For nesting, the Target tag will be used
565             preferentially followed by the ID tag, followed by the Parent tag.
566             This means that if genes are represented like this:
567              
568             XXXX XXXX gene XXXX XXXX XXXX ID=myGene
569             XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene
570             XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript
571             XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript
572              
573             Then there will be one group called myGene containing the "gene"
574             feature and one group called myTranscript containing the mRNA, and two
575             exons.
576              
577             You can work around this restriction to some extent by using the Alias
578             attribute literally:
579              
580             XXXX XXXX gene XXXX XXXX XXXX ID=myGene
581             XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene;Alias=myGene
582             XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene
583             XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene
584              
585             This limitation will be corrected in the next version of Bio::DB::GFF.
586              
587             =back
588              
589             =head1 API
590              
591             The following is the API for Bio::DB::GFF.
592              
593             =cut
594              
595             package Bio::DB::GFF;
596              
597 3     3   2127 use strict;
  3         3  
  3         69  
598              
599 3     3   663 use IO::File;
  3         2196  
  3         267  
600 3     3   18 use File::Glob ':glob';
  3         3  
  3         456  
601 3     3   702 use Bio::DB::GFF::Util::Rearrange;
  3         6  
  3         132  
602 3     3   882 use Bio::DB::GFF::RelSegment;
  3         6  
  3         99  
603 3     3   15 use Bio::DB::GFF::Feature;
  3         3  
  3         48  
604 3     3   834 use Bio::DB::GFF::Aggregator;
  3         6  
  3         78  
605              
606 3     3   15 use base qw(Bio::Root::Root Bio::DasI);
  3         3  
  3         882  
607              
608             my %valid_range_types = (overlaps => 1,
609             contains => 1,
610             contained_in => 1);
611              
612             =head1 Querying GFF Databases
613              
614             =head2 new
615              
616             Title : new
617             Usage : my $db = Bio::DB::GFF->new(@args);
618             Function: create a new Bio::DB::GFF object
619             Returns : new Bio::DB::GFF object
620             Args : lists of adaptors and aggregators
621             Status : Public
622              
623             These are the arguments:
624              
625             -adaptor Name of the adaptor module to use. If none
626             provided, defaults to "dbi::mysqlopt".
627              
628             -aggregator Array reference to a list of aggregators
629             to apply to the database. If none provided,
630             defaults to ['processed_transcript','alignment'].
631              
632             -preferred_groups When interpreting the 9th column of a GFF2 file,
633             the indicated group names will have preference over
634             other attributes, even if they do not come first in
635             the list of attributes. This can be a scalar value
636             or an array reference.
637              
638             Any other named argument pairs are passed to
639             the adaptor for processing.
640              
641             The adaptor argument must correspond to a module contained within the
642             Bio::DB::GFF::Adaptor namespace. For example, the
643             Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying
644             'dbi::mysql'. By Perl convention, the adaptor names are lower case
645             because they are loaded at run time.
646              
647             The aggregator array may contain a list of aggregator names, a list of
648             initialized aggregator objects, or a string in the form
649             "aggregator_name{subpart1,subpart2,subpart3/main_method}" (the
650             "/main_method" part is optional, but if present a feature with the
651             main_method must be present in order for aggregation to occur). For
652             example, if you wish to change the components aggregated by the
653             transcript aggregator, you could pass it to the GFF constructor this
654             way:
655              
656             my $transcript =
657             Bio::DB::GFF::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr
658             polyA spliced_leader)]);
659              
660             my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment'],
661             -adaptor => 'dbi::mysql',
662             -dsn => 'dbi:mysql:elegans42');
663              
664             Alternatively, you could create an entirely new transcript aggregator
665             this way:
666              
667             my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}';
668             my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment'],
669             -adaptor => 'dbi::mysql',
670             -dsn => 'dbi:mysql:elegans42');
671              
672             See L<Bio::DB::GFF::Aggregator> for more details.
673              
674             The B<-preferred_groups> argument is used to change the default
675             processing of the 9th column of GFF version 2 files. By default, the
676             first tag/value pair is used to establish the group class and name.
677             If you pass -preferred_groups a scalar, the parser will look for a tag
678             of the indicated type and use it as the group even if it is not first
679             in the file. If you pass this argument a list of group classes as an
680             array ref, then the list will establish the precedence for searching.
681              
682             The commonly used 'dbi::mysql' adaptor recognizes the following
683             adaptor-specific arguments:
684              
685             Argument Description
686             -------- -----------
687              
688             -dsn the DBI data source, e.g. 'dbi:mysql:ens0040'
689             If a partial name is given, such as "ens0040", the
690             "dbi:mysql:" prefix will be added automatically.
691              
692             -user username for authentication
693              
694             -pass the password for authentication
695              
696             -refclass landmark Class; defaults to "Sequence"
697              
698              
699             The commonly used 'dbi::mysqlopt' adaptor also recognizes the following
700             arguments.
701              
702             Argument Description
703             -------- -----------
704              
705             -fasta path to a directory containing FASTA files for the DNA
706             contained in this database (e.g. "/usr/local/share/fasta")
707              
708             -acedb an acedb URL to use when converting features into ACEDB
709             objects (e.g. sace://localhost:2005)
710              
711             =cut
712              
713             #'
714              
715             sub new {
716 5     5 1 90122 my $package = shift;
717 5         18 my ($adaptor,$aggregators,$args,$refclass,$preferred_groups);
718              
719 5 50       21 if (@_ == 1) { # special case, default to dbi::mysqlopt
720 0         0 $adaptor = 'dbi::mysqlopt';
721 0         0 $args = {DSN => shift};
722             } else {
723 5         50 ($adaptor,$aggregators,$refclass,$preferred_groups,$args) = rearrange([
724             [qw(ADAPTOR FACTORY)],
725             [qw(AGGREGATOR AGGREGATORS)],
726             'REFCLASS',
727             'PREFERRED_GROUPS'
728             ],@_);
729             }
730              
731 5   50     22 $adaptor ||= 'dbi::mysqlopt';
732 5         18 my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E";
733 5 100       72 unless ($class->can('new')) {
734 3 50       192 eval "require $class;1;" or $package->throw("Unable to load $adaptor adaptor: $@");
735             }
736              
737             # this hack saves the memory adaptor, which loads the GFF file in new()
738 5 50       25 $args->{PREFERRED_GROUPS} = $preferred_groups if defined $preferred_groups;
739              
740 5         26 my $self = $class->new($args);
741              
742             # handle preferred groups
743 5 50       13 $self->preferred_groups($preferred_groups) if defined $preferred_groups;
744 5   50     66 $self->default_class($refclass || 'Sequence');
745              
746             # handle the aggregators.
747             # aggregators are responsible for creating complex multi-part features
748             # from the GFF "group" field. If none are provided, then we provide a
749             # list of the two used in WormBase.
750             # Each aggregator can be a scalar or a ref. In the former case
751             # it is treated as a class name to call new() on. In the latter
752             # the aggregator is treated as a ready made object.
753 5 50       19 $aggregators = $self->default_aggregators unless defined $aggregators;
754 5 50       35 my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators;
755 5         18 for my $a (@a) {
756 10         44 $self->add_aggregator($a);
757             }
758              
759             # default settings go here.....
760 5         40 $self->automerge(1); # set automerge to true
761              
762 5         15 $self;
763             }
764              
765              
766             =head2 types
767              
768             Title : types
769             Usage : $db->types(@args)
770             Function: return list of feature types in range or database
771             Returns : a list of Bio::DB::GFF::Typename objects
772             Args : see below
773             Status : public
774              
775             This routine returns a list of feature types known to the database.
776             The list can be database-wide or restricted to a region. It is also
777             possible to find out how many times each feature occurs.
778              
779             For range queries, it is usually more convenient to create a
780             Bio::DB::GFF::Segment object, and then invoke its types() method.
781              
782             Arguments are as follows:
783              
784             -ref ID of reference sequence
785             -class class of reference sequence
786             -start start of segment
787             -stop stop of segment
788             -enumerate if true, count the features
789              
790             The returned value will be a list of Bio::DB::GFF::Typename objects,
791             which if evaluated in a string context will return the feature type in
792             "method:source" format. This object class also has method() and
793             source() methods for retrieving the like-named fields.
794              
795             If -enumerate is true, then the function returns a hash (not a hash
796             reference) in which the keys are type names in "method:source" format
797             and the values are the number of times each feature appears in the
798             database or segment.
799              
800             The argument -end is a synonym for -stop, and -count is a synonym for
801             -enumerate.
802              
803             =cut
804              
805             sub types {
806 25     25 1 1283 my $self = shift;
807 25         146 my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([
808             [qw(REF REFSEQ)],
809             qw(START),
810             [qw(STOP END)],
811             [qw(ENUMERATE COUNT)],
812             [qw(CLASS SEQCLASS)],
813             [qw(TYPE TYPES)],
814             ],@_);
815 25 50       91 $types = $self->parse_types($types) if defined $types;
816 25         78 $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types);
817             }
818              
819             =head2 classes
820              
821             Title : classes
822             Usage : $db->classes
823             Function: return list of landmark classes in database
824             Returns : a list of classes
825             Args : none
826             Status : public
827              
828             This routine returns the list of reference classes known to the
829             database, or empty if classes are not used by the database. Classes
830             are distinct from types, being essentially qualifiers on the reference
831             namespaces.
832              
833             =cut
834              
835             sub classes {
836 0     0 1 0 my $self = shift;
837 0         0 return ();
838             }
839              
840             =head2 segment
841              
842             Title : segment
843             Usage : $db->segment(@args);
844             Function: create a segment object
845             Returns : segment object(s)
846             Args : numerous, see below
847             Status : public
848              
849             This method generates a segment object, which is a Perl object
850             subclassed from Bio::DB::GFF::Segment. The segment can be used to
851             find overlapping features and the raw DNA.
852              
853             When making the segment() call, you specify the ID of a sequence
854             landmark (e.g. an accession number, a clone or contig), and a
855             positional range relative to the landmark. If no range is specified,
856             then the entire extent of the landmark is used to generate the
857             segment.
858              
859             You may also provide the ID of a "reference" sequence, which will set
860             the coordinate system and orientation used for all features contained
861             within the segment. The reference sequence can be changed later. If
862             no reference sequence is provided, then the coordinate system is based
863             on the landmark.
864              
865             Arguments:
866              
867             -name ID of the landmark sequence.
868              
869             -class Database object class for the landmark sequence.
870             "Sequence" assumed if not specified. This is
871             irrelevant for databases which do not recognize
872             object classes.
873              
874             -start Start of the segment relative to landmark. Positions
875             follow standard 1-based sequence rules. If not specified,
876             defaults to the beginning of the landmark.
877              
878             -end Stop of the segment relative to the landmark. If not specified,
879             defaults to the end of the landmark.
880              
881             -stop Same as -end.
882              
883             -offset For those who prefer 0-based indexing, the offset specifies the
884             position of the new segment relative to the start of the landmark.
885              
886             -length For those who prefer 0-based indexing, the length specifies the
887             length of the new segment.
888              
889             -refseq Specifies the ID of the reference landmark used to establish the
890             coordinate system for the newly-created segment.
891              
892             -refclass Specifies the class of the reference landmark, for those databases
893             that distinguish different object classes. Defaults to "Sequence".
894              
895             -absolute
896             Return features in absolute coordinates rather than relative to the
897             parent segment.
898              
899             -nocheck Don't check the database for the coordinates and length of this
900             feature. Construct a segment using the indicated name as the
901             reference, a start coordinate of 1, an undefined end coordinate,
902             and a strand of +1.
903              
904             -force Same as -nocheck.
905              
906             -seq,-sequence,-sourceseq Aliases for -name.
907              
908             -begin,-end Aliases for -start and -stop
909              
910             -off,-len Aliases for -offset and -length
911              
912             -seqclass Alias for -class
913              
914             Here's an example to explain how this works:
915              
916             my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql');
917              
918             If successful, $db will now hold the database accessor object. We now
919             try to fetch the fragment of sequence whose ID is A0000182 and class
920             is "Accession."
921              
922             my $segment = $db->segment(-name=>'A0000182',-class=>'Accession');
923              
924             If successful, $segment now holds the entire segment corresponding to
925             this accession number. By default, the sequence is used as its own
926             reference sequence, so its first base will be 1 and its last base will
927             be the length of the accession.
928              
929             Assuming that this sequence belongs to a longer stretch of DNA, say a
930             contig, we can fetch this information like so:
931              
932             my $sourceseq = $segment->sourceseq;
933              
934             and find the start and stop on the source like this:
935              
936             my $start = $segment->abs_start;
937             my $stop = $segment->abs_stop;
938              
939             If we had another segment, say $s2, which is on the same contiguous
940             piece of DNA, we can pass that to the refseq() method in order to
941             establish it as the coordinate reference point:
942              
943             $segment->refseq($s2);
944              
945             Now calling start() will return the start of the segment relative to
946             the beginning of $s2, accounting for differences in strandedness:
947              
948             my $rel_start = $segment->start;
949              
950             IMPORTANT NOTE: This method can be used to return the segment spanned
951             by an arbitrary named annotation. However, if the annotation appears
952             at multiple locations on the genome, for example an EST that maps to
953             multiple locations, then, provided that all locations reside on the
954             same physical segment, the method will return a segment that spans the
955             minimum and maximum positions. If the reference sequence occupies
956             ranges on different physical segments, then it returns them all in an
957             array context, and raises a "multiple segment exception" in
958             a scalar context.
959              
960             =cut
961              
962             #'
963              
964             sub segment {
965 203     203 1 12952 my $self = shift;
966 203         407 my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self,
967             $self->setup_segment_args(@_));
968 203         392 foreach (@segments) {
969 195 50       331 $_->absolute(1) if $self->absolute;
970             }
971              
972 203         314 $self->_multiple_return_args(@segments);
973             }
974              
975             sub _multiple_return_args {
976 203     203   200 my $self = shift;
977 203         262 my @args = @_;
978 203 100       418 if (@args == 0) {
    50          
    0          
979 8         21 return;
980             } elsif (@args == 1) {
981 195         611 return $args[0];
982             } elsif (wantarray) { # more than one reference sequence
983 0         0 return @args;
984             } else {
985 0         0 $self->error($args[0]->name,
986             " has more than one reference sequence in database. Please call in a list context to retrieve them all.");
987 0         0 $self->throw('multiple segment exception');
988 0         0 return;
989             }
990              
991             }
992              
993             # backward compatibility -- don't use!
994             # (deliberately undocumented too)
995             sub abs_segment {
996 0     0 0 0 my $self = shift;
997 0         0 return $self->segment($self->setup_segment_args(@_),-absolute=>1);
998             }
999              
1000             sub setup_segment_args {
1001 224     224 0 250 my $self = shift;
1002 224 100 100     1247 return @_ if defined $_[0] && $_[0] =~ /^-/;
1003 156 100       486 return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3;
1004 71 100       222 return (-class=>$_[0],-name=>$_[1]) if @_ == 2;
1005 36 100       256 return (-name=>$_[0]) if @_ == 1;
1006             }
1007              
1008             =head2 features
1009              
1010             Title : features
1011             Usage : $db->features(@args)
1012             Function: get all features, possibly filtered by type
1013             Returns : a list of Bio::DB::GFF::Feature objects
1014             Args : see below
1015             Status : public
1016              
1017             This routine will retrieve features in the database regardless of
1018             position. It can be used to return all features, or a subset based on
1019             their method and source.
1020              
1021             Arguments are as follows:
1022              
1023             -types List of feature types to return. Argument is an array
1024             reference containing strings of the format "method:source"
1025              
1026             -merge Whether to apply aggregators to the generated features.
1027              
1028             -rare Turn on optimizations suitable for a relatively rare feature type,
1029             where it makes more sense to filter by feature type first,
1030             and then by position.
1031              
1032             -attributes A hash reference containing attributes to match.
1033              
1034             -iterator Whether to return an iterator across the features.
1035              
1036             -binsize A true value will create a set of artificial features whose
1037             start and stop positions indicate bins of the given size, and
1038             whose scores are the number of features in the bin. The
1039             class and method of the feature will be set to "bin",
1040             its source to "method:source", and its group to "bin:method:source".
1041             This is a handy way of generating histograms of feature density.
1042              
1043             If -iterator is true, then the method returns a single scalar value
1044             consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
1045             on this object to fetch each of the features in turn. If iterator is
1046             false or absent, then all the features are returned as a list.
1047              
1048             Currently aggregation is disabled when iterating over a series of
1049             features.
1050              
1051             Types are indicated using the nomenclature "method:source". Either of
1052             these fields can be omitted, in which case a wildcard is used for the
1053             missing field. Type names without the colon (e.g. "exon") are
1054             interpreted as the method name and a source wild card. Regular
1055             expressions are allowed in either field, as in: "similarity:BLAST.*".
1056              
1057             The -attributes argument is a hashref containing one or more attributes
1058             to match against:
1059              
1060             -attributes => { Gene => 'abc-1',
1061             Note => 'confirmed' }
1062              
1063             Attribute matching is simple string matching, and multiple attributes
1064             are ANDed together.
1065              
1066             =cut
1067              
1068             sub features {
1069 15     15 1 2412 my $self = shift;
1070 15         34 my ($types,$automerge,$sparse,$iterator,$refseq,$start,$end,$other);
1071 15 100 66     107 if (defined $_[0] &&
1072             $_[0] =~ /^-/) {
1073 10         63 ($types,$automerge,$sparse,$iterator,
1074             $refseq,$start,$end,
1075             $other) = rearrange([
1076             [qw(TYPE TYPES)],
1077             [qw(MERGE AUTOMERGE)],
1078             [qw(RARE SPARSE)],
1079             'ITERATOR',
1080             [qw(REFSEQ SEQ_ID)],
1081             'START',
1082             [qw(STOP END)],
1083             ],@_);
1084             } else {
1085 5         40 $types = \@_;
1086             }
1087              
1088             # for whole database retrievals, we probably don't want to automerge!
1089 15 100       117 $automerge = $self->automerge unless defined $automerge;
1090 15   50     136 $other ||= {};
1091 15 50       274 $self->_features({
1092             rangetype => $refseq ? 'overlaps' : 'contains',
1093             types => $types,
1094             refseq => $refseq,
1095             start => $start,
1096             stop => $end,
1097             },
1098             { sparse => $sparse,
1099             automerge => $automerge,
1100             iterator =>$iterator,
1101             %$other,
1102             }
1103             );
1104             }
1105              
1106             =head2 get_seq_stream
1107              
1108             Title : get_seq_stream
1109             Usage : my $seqio = $self->get_seq_stream(@args)
1110             Function: Performs a query and returns an iterator over it
1111             Returns : a Bio::SeqIO stream capable of producing sequence
1112             Args : As in features()
1113             Status : public
1114              
1115             This routine takes the same arguments as features(), but returns a
1116             Bio::SeqIO::Stream-compliant object. Use it like this:
1117              
1118             $stream = $db->get_seq_stream('exon');
1119             while (my $exon = $stream->next_seq) {
1120             print $exon,"\n";
1121             }
1122              
1123             NOTE: This is also called get_feature_stream(), since that's what it
1124             really does.
1125              
1126             =cut
1127              
1128             sub get_seq_stream {
1129 0     0 1 0 my $self = shift;
1130 0 0 0     0 my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1)
1131             : (-types=>\@_,-iterator=>1);
1132 0         0 $self->features(@args);
1133             }
1134              
1135             *get_feature_stream = \&get_seq_stream;
1136              
1137             =head2 get_feature_by_name
1138              
1139             Title : get_feature_by_name
1140             Usage : $db->get_feature_by_name($class => $name)
1141             Function: fetch features by their name
1142             Returns : a list of Bio::DB::GFF::Feature objects
1143             Args : the class and name of the desired feature
1144             Status : public
1145              
1146             This method can be used to fetch a named feature from the database.
1147             GFF annotations are named using the group class and name fields, so
1148             for features that belong to a group of size one, this method can be
1149             used to retrieve that group (and is equivalent to the segment()
1150             method). Any Alias attributes are also searched for matching names.
1151              
1152             An alternative syntax allows you to search for features by name within
1153             a circumscribed region:
1154              
1155             @f = $db->get_feature_by_name(-class => $class,-name=>$name,
1156             -ref => $sequence_name,
1157             -start => $start,
1158             -end => $end);
1159              
1160             This method may return zero, one, or several Bio::DB::GFF::Feature
1161             objects.
1162              
1163             Aggregation is performed on features as usual.
1164              
1165             NOTE: At various times, this function was called fetch_group(),
1166             fetch_feature(), fetch_feature_by_name() and segments(). These names
1167             are preserved for backward compatibility.
1168              
1169             =cut
1170              
1171             sub get_feature_by_name {
1172 16     16 1 30 my $self = shift;
1173 16         31 my ($gclass,$gname,$automerge,$ref,$start,$end);
1174 16 50       40 if (@_ == 1) {
1175 0         0 $gclass = $self->default_class;
1176 0         0 $gname = shift;
1177             } else {
1178 16         108 ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE',
1179             ['REF','REFSEQ'],
1180             'START',['STOP','END']
1181             ],@_);
1182 16   33     48 $gclass ||= $self->default_class;
1183             }
1184 16 50       52 $automerge = $self->automerge unless defined $automerge;
1185              
1186             # we need to refactor this... It's repeated code (see below)...
1187 16         19 my @aggregators;
1188 16 50       32 if ($automerge) {
1189 16         40 for my $a ($self->aggregators) {
1190 33 50       114 push @aggregators,$a if $a->disaggregate([],$self);
1191             }
1192             }
1193              
1194 16         18 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1195 16         21 my $features = [];
1196 16     74   70 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
  74         162  
1197 16 50       43 my $location = [$ref,$start,$end] if defined $ref;
1198 16         57 $self->_feature_by_name($gclass,$gname,$location,$callback);
1199              
1200 16 50       39 warn "aggregating...\n" if $self->debug;
1201 16         31 foreach my $a (@aggregators) { # last aggregator gets first shot
1202 33 50       92 $a->aggregate($features,$self) or next;
1203             }
1204              
1205 16         113 @$features;
1206             }
1207              
1208             # horrible indecision regarding proper names!
1209             *fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name;
1210             *segments = \&segment;
1211              
1212             =head2 get_feature_by_target
1213              
1214             Title : get_feature_by_target
1215             Usage : $db->get_feature_by_target($class => $name)
1216             Function: fetch features by their similarity target
1217             Returns : a list of Bio::DB::GFF::Feature objects
1218             Args : the class and name of the desired feature
1219             Status : public
1220              
1221             This method can be used to fetch a named feature from the database
1222             based on its similarity hit.
1223              
1224             =cut
1225              
1226             sub get_feature_by_target {
1227 0     0 1 0 shift->get_feature_by_name(@_);
1228             }
1229              
1230             =head2 get_feature_by_attribute
1231              
1232             Title : get_feature_by_attribute
1233             Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2)
1234             Function: fetch segments by combinations of attribute values
1235             Returns : a list of Bio::DB::GFF::Feature objects
1236             Args : a list (or hash reference) of attribute => value pairs
1237             Status : public
1238              
1239             This method can be used to fetch a set of features from the database.
1240             Attributes are a list of name=>value pairs. They will be logically
1241             ANDed together.
1242              
1243             =cut
1244              
1245             sub get_feature_by_attribute {
1246 5     5 1 2262 my $self = shift;
1247 5 50       23 my %attributes = ref($_[0]) ? %{$_[0]} : @_;
  0         0  
1248              
1249             # we need to refactor this... It's repeated code (see above)...
1250 5         8 my @aggregators;
1251 5 50       10 if ($self->automerge) {
1252 5         10 for my $a ($self->aggregators) {
1253 10 50       26 unshift @aggregators,$a if $a->disaggregate([],$self);
1254             }
1255             }
1256              
1257 5         7 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1258 5         8 my $features = [];
1259 5     10   23 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
  10         20  
1260 5         20 $self->_feature_by_attribute(\%attributes,$callback);
1261              
1262 5 50       17 warn "aggregating...\n" if $self->debug;
1263 5         12 foreach my $a (@aggregators) { # last aggregator gets first shot
1264 10 50       29 $a->aggregate($features,$self) or next;
1265             }
1266              
1267 5         84 @$features;
1268             }
1269              
1270             # more indecision...
1271             *fetch_feature_by_attribute = \&get_feature_by_attribute;
1272              
1273             =head2 get_feature_by_id
1274              
1275             Title : get_feature_by_id
1276             Usage : $db->get_feature_by_id($id)
1277             Function: fetch segments by feature ID
1278             Returns : a Bio::DB::GFF::Feature object
1279             Args : the feature ID
1280             Status : public
1281              
1282             This method can be used to fetch a feature from the database using its
1283             ID. Not all GFF databases support IDs, so be careful with this.
1284              
1285             =cut
1286              
1287             sub get_feature_by_id {
1288 5     5 1 10 my $self = shift;
1289 5 50       16 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1290 5         7 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1291 5         8 my $features = [];
1292 5     5   27 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
  5         15  
1293 5         28 $self->_feature_by_id($id,'feature',$callback);
1294 5 50       33 return wantarray ? @$features : $features->[0];
1295             }
1296             *fetch_feature_by_id = \&get_feature_by_id;
1297              
1298             =head2 get_feature_by_gid
1299              
1300             Title : get_feature_by_gid
1301             Usage : $db->get_feature_by_gid($id)
1302             Function: fetch segments by group ID
1303             Returns : a Bio::DB::GFF::Feature object
1304             Args : the group ID
1305             Status : public
1306              
1307             This method can be used to fetch a feature from the database using its
1308             group ID. Not all GFF databases support IDs, so be careful with this.
1309              
1310             The group ID is often more interesting than the feature ID, since
1311             groups can be complex objects containing subobjects.
1312              
1313             =cut
1314              
1315             sub get_feature_by_gid {
1316 0     0 1 0 my $self = shift;
1317 0 0       0 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1318 0         0 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1319 0         0 my $features = [];
1320 0     0   0 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
  0         0  
1321 0         0 $self->_feature_by_id($id,'group',$callback);
1322 0 0       0 return wantarray ? @$features : $features->[0];
1323             }
1324             *fetch_feature_by_gid = \&get_feature_by_gid;
1325              
1326             =head2 delete_fattribute_to_features
1327              
1328             Title : delete_fattribute_to_features
1329             Usage : $db->delete_fattribute_to_features(@ids_or_features)
1330             Function: delete one or more fattribute_to_features
1331             Returns : count of fattribute_to_features deleted
1332             Args : list of features or feature ids
1333             Status : public
1334              
1335             Pass this method a list of numeric feature ids or a set of features.
1336             It will attempt to remove the fattribute_to_features rows of those features
1337             from the database and return a count of the rows removed.
1338              
1339             NOTE: This method is also called delete_fattribute_to_feature(). Also see
1340             delete_groups() and delete_features().
1341              
1342             =cut
1343              
1344             *delete_fattribute_to_feature = \&delete_fattribute_to_features;
1345              
1346             sub delete_fattribute_to_features {
1347 0     0 1 0 my $self = shift;
1348 0         0 my @features_or_ids = @_;
1349 0 0       0 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
  0         0  
1350 0 0       0 return unless @ids;
1351 0         0 $self->_delete_fattribute_to_features(@ids);
1352             }
1353              
1354             =head2 delete_features
1355              
1356             Title : delete_features
1357             Usage : $db->delete_features(@ids_or_features)
1358             Function: delete one or more features
1359             Returns : count of features deleted
1360             Args : list of features or feature ids
1361             Status : public
1362              
1363             Pass this method a list of numeric feature ids or a set of features.
1364             It will attempt to remove the features from the database and return a
1365             count of the features removed.
1366              
1367             NOTE: This method is also called delete_feature(). Also see
1368             delete_groups().
1369              
1370             =cut
1371              
1372             *delete_feature = \&delete_features;
1373              
1374             sub delete_features {
1375 20     20 1 30 my $self = shift;
1376 20         44 my @features_or_ids = @_;
1377 20 50       25 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
  75         203  
1378 20 50       38 return unless @ids;
1379 20         55 $self->_delete_features(@ids);
1380             }
1381              
1382             =head2 delete_groups
1383              
1384             Title : delete_groups
1385             Usage : $db->delete_groups(@ids_or_features)
1386             Function: delete one or more feature groups
1387             Returns : count of features deleted
1388             Args : list of features or feature group ids
1389             Status : public
1390              
1391             Pass this method a list of numeric group ids or a set of features. It
1392             will attempt to recursively remove the features and ALL members of
1393             their group from the database. It returns a count of the number of
1394             features (not groups) removed.
1395              
1396             NOTE: This method is also called delete_group(). Also see
1397             delete_features().
1398              
1399             =cut
1400              
1401             *delete_group = \&delete_groupss;
1402              
1403             sub delete_groups {
1404 5     5 1 7 my $self = shift;
1405 5         13 my @features_or_ids = @_;
1406 5 50       13 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids;
  25         46  
1407 5 50       15 return unless @ids;
1408 5         21 $self->_delete_groups(@ids);
1409             }
1410              
1411             =head2 delete
1412              
1413             Title : delete
1414             Usage : $db->delete(@args)
1415             Function: delete features
1416             Returns : count of features deleted -- if available
1417             Args : numerous, see below
1418             Status : public
1419              
1420             This method deletes all features that overlap the specified region or
1421             are of a particular type. If no arguments are provided and the -force
1422             argument is true, then deletes ALL features.
1423              
1424             Arguments:
1425              
1426             -name ID of the landmark sequence.
1427              
1428             -ref ID of the landmark sequence (synonym for -name).
1429              
1430             -class Database object class for the landmark sequence.
1431             "Sequence" assumed if not specified. This is
1432             irrelevant for databases which do not recognize
1433             object classes.
1434              
1435             -start Start of the segment relative to landmark. Positions
1436             follow standard 1-based sequence rules. If not specified,
1437             defaults to the beginning of the landmark.
1438              
1439             -end Stop of the segment relative to the landmark. If not specified,
1440             defaults to the end of the landmark.
1441              
1442             -offset Zero-based addressing
1443              
1444             -length Length of region
1445              
1446             -type,-types Either a single scalar type to be deleted, or a
1447             reference to an array of types.
1448              
1449             -force Force operation to be performed even if it would delete
1450             entire feature table.
1451              
1452             -range_type Control the range type of the deletion. One of "overlaps" (default),
1453             "contains" or "contained_in".
1454              
1455             Examples:
1456              
1457             $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats
1458             $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000
1459             $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3
1460              
1461             The short form of this call, as described in segment() is also allowed:
1462              
1463             $db->delete("chr3",1=>1000);
1464             $db->delete("chr3");
1465              
1466             IMPORTANT NOTE: This method only deletes features. It does *NOT*
1467             delete the names of groups that contain the deleted features. Group
1468             IDs will be reused if you later load a feature with the same group
1469             name as one that was previously deleted.
1470              
1471             NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the
1472             result code from the SQL DELETE operation. Some dbd drivers return the
1473             count of rows deleted, while others return 0E0. Caveat emptor.
1474              
1475             =cut
1476              
1477             sub delete {
1478 21     21 1 14055215 my $self = shift;
1479 21         60 my @args = $self->setup_segment_args(@_);
1480 21         149 my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) =
1481             rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET',
1482             'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args);
1483 21 50       75 $offset = 0 unless defined $offset;
1484 21 50       45 $start = $offset+1 unless defined $start;
1485 21 50 33     81 $end = $start+$length-1 if !defined $end and $length;
1486 21   66     80 $class ||= $self->default_class;
1487              
1488 21         68 my $types = $self->parse_types($type); # parse out list of types
1489              
1490 21   50     72 $range_type ||= 'overlaps';
1491             $self->throw("range type must be one of {".
1492             join(',',keys %valid_range_types).
1493             "}\n")
1494 21 50       66 unless $valid_range_types{lc $range_type};
1495              
1496              
1497 21         23 my @segments;
1498 21 100 100     86 if (defined $name && $name ne '') {
1499 10         20 my @args = (-name=>$name,-class=>$class);
1500 10 50       31 push @args,(-start=>$start) if defined $start;
1501 10 50       21 push @args,(-end =>$end) if defined $end;
1502 10         17 @segments = $self->segment(@args);
1503 10 100       40 return unless @segments;
1504             }
1505 16         114 $self->_delete({segments => \@segments,
1506             types => $types,
1507             range_type => $range_type,
1508             force => $force}
1509             );
1510             }
1511              
1512             =head2 absolute
1513              
1514             Title : absolute
1515             Usage : $abs = $db->absolute([$abs]);
1516             Function: gets/sets absolute mode
1517             Returns : current setting of absolute mode boolean
1518             Args : new setting for absolute mode boolean
1519             Status : public
1520              
1521             $db->absolute(1) will turn on absolute mode for the entire database.
1522             All segments retrieved will use absolute coordinates by default,
1523             rather than relative coordinates. You can still set them to use
1524             relative coordinates by calling $segment->absolute(0).
1525              
1526             Note that this is not the same as calling abs_segment(); it continues
1527             to allow you to look up groups that are not used directly as reference
1528             sequences.
1529              
1530             =cut
1531              
1532             sub absolute {
1533 195     195 1 196 my $self = shift;
1534 195         234 my $d = $self->{absolute};
1535 195 50       310 $self->{absolute} = shift if @_;
1536 195         357 $d;
1537             }
1538              
1539             =head2 strict_bounds_checking
1540              
1541             Title : strict_bounds_checking
1542             Usage : $flag = $db->strict_bounds_checking([$flag])
1543             Function: gets/sets strict bounds checking
1544             Returns : current setting of bounds checking flag
1545             Args : new setting for bounds checking flag
1546             Status : public
1547              
1548             This flag enables extra checks for segment requests that go beyond the
1549             ends of their reference sequences. If bounds checking is enabled,
1550             then retrieved segments will be truncated to their physical length,
1551             and their truncated() methods will return true.
1552              
1553             If the flag is off (the default), then the module will return segments
1554             that appear to extend beyond their physical boundaries. Requests for
1555             features beyond the end of the segment will, however, return empty.
1556              
1557             =cut
1558              
1559             sub strict_bounds_checking {
1560 170     170 1 217 my $self = shift;
1561 170         215 my $d = $self->{strict};
1562 170 100       253 $self->{strict} = shift if @_;
1563 170         402 $d;
1564             }
1565              
1566             =head2 get_Seq_by_id
1567              
1568             Title : get_Seq_by_id
1569             Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
1570             Function: Gets a Bio::Seq object by its name
1571             Returns : a Bio::Seq object
1572             Args : the id (as a string) of a sequence
1573             Throws : "id does not exist" exception
1574              
1575             NOTE: Bio::DB::RandomAccessI compliant method
1576              
1577             =cut
1578              
1579             sub get_Seq_by_id {
1580 0     0 1 0 my $self = shift;
1581 0         0 $self->get_feature_by_name(@_);
1582             }
1583              
1584              
1585             =head2 get_Seq_by_accession
1586              
1587             Title : get_Seq_by_accession
1588             Usage : $seq = $db->get_Seq_by_accession('AL12234')
1589             Function: Gets a Bio::Seq object by its accession
1590             Returns : a Bio::Seq object
1591             Args : the id (as a string) of a sequence
1592             Throws : "id does not exist" exception
1593              
1594             NOTE: Bio::DB::RandomAccessI compliant method
1595              
1596             =cut
1597              
1598             sub get_Seq_by_accession {
1599 0     0 1 0 my $self = shift;
1600 0         0 $self->get_feature_by_name(@_);
1601             }
1602              
1603             =head2 get_Seq_by_acc
1604              
1605             Title : get_Seq_by_acc
1606             Usage : $seq = $db->get_Seq_by_acc('X77802');
1607             Function: Gets a Bio::Seq object by accession number
1608             Returns : A Bio::Seq object
1609             Args : accession number (as a string)
1610             Throws : "acc does not exist" exception
1611              
1612             NOTE: Bio::DB::RandomAccessI compliant method
1613              
1614             =cut
1615              
1616             sub get_Seq_by_acc {
1617 0     0 1 0 my $self = shift;
1618 0         0 $self->get_feature_by_name(@_);
1619             }
1620              
1621             =head2 get_Stream_by_name
1622              
1623             Title : get_Stream_by_name
1624             Usage : $seq = $db->get_Stream_by_name(@ids);
1625             Function: Retrieves a stream of Seq objects given their names
1626             Returns : a Bio::SeqIO stream object
1627             Args : an array of unique ids/accession numbers, or
1628             an array reference
1629              
1630             NOTE: get_Stream_by_batch() is an alias for get_Stream_by_id(), not for this method.
1631              
1632             =cut
1633              
1634             sub get_Stream_by_name {
1635 0     0 1 0 my $self = shift;
1636 0         0 my @ids = @_;
1637 0 0       0 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1638 0         0 Bio::DB::GFF::ID_Iterator->new($self,$id,'name');
1639             }
1640              
1641             =head2 get_Stream_by_id
1642              
1643             Title : get_Stream_by_id
1644             Usage : $seq = $db->get_Stream_by_id(@ids);
1645             Function: Retrieves a stream of Seq objects given their ids
1646             Returns : a Bio::SeqIO stream object
1647             Args : an array of unique ids/accession numbers, or
1648             an array reference
1649              
1650             NOTE: This is also called get_Stream_by_batch()
1651              
1652             =cut
1653              
1654             sub get_Stream_by_id {
1655 0     0 1 0 my $self = shift;
1656 0         0 my @ids = @_;
1657 0 0       0 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1658 0         0 Bio::DB::GFF::ID_Iterator->new($self,$id,'feature');
1659             }
1660              
1661             =head2 get_Stream_by_batch ()
1662              
1663             Title : get_Stream_by_batch
1664             Usage : $seq = $db->get_Stream_by_batch(@ids);
1665             Function: Retrieves a stream of Seq objects given their ids
1666             Returns : a Bio::SeqIO stream object
1667             Args : an array of unique ids/accession numbers, or
1668             an array reference
1669              
1670             NOTE: This is the same as get_Stream_by_id().
1671              
1672             =cut
1673              
1674             *get_Stream_by_batch = \&get_Stream_by_id;
1675              
1676              
1677             =head2 get_Stream_by_group ()
1678              
1679             Bioperl compatibility.
1680              
1681             =cut
1682              
1683             sub get_Stream_by_group {
1684 0     0 1 0 my $self = shift;
1685 0         0 my @ids = @_;
1686 0 0       0 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1687 0         0 Bio::DB::GFF::ID_Iterator->new($self,$id,'group');
1688             }
1689              
1690             =head2 all_seqfeatures
1691              
1692             Title : all_seqfeatures
1693             Usage : @features = $db->all_seqfeatures(@args)
1694             Function: fetch all the features in the database
1695             Returns : an array of features, or an iterator
1696             Args : See below
1697             Status : public
1698              
1699             This is equivalent to calling $db->features() without any types, and
1700             will return all the features in the database. The -merge and
1701             -iterator arguments are recognized, and behave the same as described
1702             for features().
1703              
1704             =cut
1705              
1706             sub all_seqfeatures {
1707 0     0 1 0 my $self = shift;
1708 0         0 my ($automerge,$iterator)= rearrange([
1709             [qw(MERGE AUTOMERGE)],
1710             'ITERATOR'
1711             ],@_);
1712 0         0 my @args;
1713 0 0       0 push @args,(-merge=>$automerge) if defined $automerge;
1714 0 0       0 push @args,(-iterator=>$iterator) if defined $iterator;
1715 0         0 $self->features(@args);
1716             }
1717              
1718             =head1 Creating and Loading GFF Databases
1719              
1720             =head2 initialize
1721              
1722             Title : initialize
1723             Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2);
1724             Function: initialize a GFF database
1725             Returns : true if initialization successful
1726             Args : a set of named parameters
1727             Status : Public
1728              
1729             This method can be used to initialize an empty database. It takes the following
1730             named arguments:
1731              
1732             -erase A boolean value. If true the database will be wiped clean if it
1733             already contains data.
1734              
1735             Other named arguments may be recognized by subclasses. They become database
1736             meta values that control various settable options.
1737              
1738             As a shortcut (and for backward compatibility) a single true argument
1739             is the same as initialize(-erase=>1).
1740              
1741             =cut
1742              
1743             sub initialize {
1744 5     5 1 5029 my $self = shift;
1745              
1746 5         39 my ($erase,$meta) = rearrange(['ERASE'],@_);
1747 5   50     30 $meta ||= {};
1748              
1749             # initialize (possibly erasing)
1750 5 50       28 return unless $self->do_initialize($erase);
1751 5         24 my @default = $self->default_meta_values;
1752              
1753             # this is an awkward way of uppercasing the
1754             # even-numbered values (necessary for case-insensitive SQL databases)
1755 5         15 for (my $i=0; $i<@default; $i++) {
1756 0 0       0 $default[$i] = uc $default[$i] if !($i % 2);
1757             }
1758              
1759 5         15 my %values = (@default,%$meta);
1760 5         13 foreach (keys %values) {
1761 0         0 $self->meta($_ => $values{$_});
1762             }
1763 5         20 1;
1764             }
1765              
1766              
1767             =head2 load_gff
1768              
1769             Title : load_gff
1770             Usage : $db->load_gff($file|$directory|$filehandle [,$verbose]);
1771             Function: load GFF data into database
1772             Returns : count of records loaded
1773             Args : a directory, a file, a list of files,
1774             or a filehandle
1775             Status : Public
1776              
1777             This method takes a single overloaded argument, which can be any of:
1778              
1779             =over 4
1780              
1781             =item *
1782              
1783             a scalar corresponding to a GFF file on the system
1784              
1785             A pathname to a local GFF file. Any files ending with the .gz, .Z, or
1786             .bz2 suffixes will be transparently decompressed with the appropriate
1787             command-line utility.
1788              
1789             =item *
1790              
1791             an array reference containing a list of GFF files on the system
1792              
1793             For example ['/home/gff/gff1.gz','/home/gff/gff2.gz']
1794              
1795             =item *
1796              
1797             directory path
1798              
1799             The indicated directory will be searched for all files ending in the
1800             suffixes .gff, .gff.gz, .gff.Z or .gff.bz2.
1801              
1802             =item *
1803              
1804             filehandle
1805              
1806             An open filehandle from which to read the GFF data. Tied filehandles
1807             now work as well.
1808              
1809             =item *
1810              
1811             a pipe expression
1812              
1813             A pipe expression will also work. For example, a GFF file on a remote
1814             web server can be loaded with an expression like this:
1815              
1816             $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |");
1817              
1818             =back
1819              
1820             The optional second argument, if true, will turn on verbose status
1821             reports that indicate the progress.
1822              
1823             If successful, the method will return the number of GFF lines
1824             successfully loaded.
1825              
1826             NOTE: this method used to be called load(), but has been changed. The
1827             old method name is also recognized.
1828              
1829             =cut
1830              
1831             sub load_gff {
1832 5     5 1 10 my $self = shift;
1833 5   50     28 my $file_or_directory = shift || '.';
1834 5         5 my $verbose = shift;
1835              
1836 5         19 local $self->{__verbose__} = $verbose;
1837 5 50 33     22 return $self->do_load_gff($file_or_directory) if ref($file_or_directory)
1838             && tied *$file_or_directory;
1839              
1840 5         12 my $tied_stdin = tied(*STDIN);
1841 5 50       167 open my $SAVEIN, "<&STDIN" unless $tied_stdin;
1842 5 50       33 local @ARGV = $self->setup_argv($file_or_directory,'gff','gff3') or return; # to play tricks with reader
1843 5         27 my $result = $self->do_load_gff('ARGV');
1844 5 50       121 open STDIN, '<', $SAVEIN unless $tied_stdin; # restore STDIN
1845 5         47 return $result;
1846             }
1847              
1848             *load = \&load_gff;
1849              
1850             =head2 load_gff_file
1851              
1852             Title : load_gff_file
1853             Usage : $db->load_gff_file($file [,$verbose]);
1854             Function: load GFF data into database
1855             Returns : count of records loaded
1856             Args : a path to a file
1857             Status : Public
1858              
1859             This is provided as an alternative to load_gff. It doesn't munge
1860             STDIN or play tricks with ARGV.
1861              
1862             =cut
1863              
1864             sub load_gff_file {
1865 0     0 1 0 my $self = shift;
1866 0         0 my $file = shift;
1867 0         0 my $verbose = shift;
1868 0 0       0 my $fh = IO::File->new($file) or return;
1869 0         0 return $self->do_load_gff($fh);
1870             }
1871              
1872             =head2 load_fasta
1873              
1874             Title : load_fasta
1875             Usage : $db->load_fasta($file|$directory|$filehandle);
1876             Function: load FASTA data into database
1877             Returns : count of records loaded
1878             Args : a directory, a file, a list of files,
1879             or a filehandle
1880             Status : Public
1881              
1882             This method takes a single overloaded argument, which can be any of:
1883              
1884             =over 4
1885              
1886             =item *
1887              
1888             scalar corresponding to a FASTA file on the system
1889              
1890             A pathname to a local FASTA file. Any files ending with the .gz, .Z, or
1891             .bz2 suffixes will be transparently decompressed with the appropriate
1892             command-line utility.
1893              
1894             =item *
1895              
1896             array reference containing a list of FASTA files on the
1897             system
1898              
1899             For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz']
1900              
1901             =item *
1902              
1903             path to a directory
1904              
1905             The indicated directory will be searched for all files ending in the
1906             suffixes .fa, .fa.gz, .fa.Z or .fa.bz2.
1907              
1908             =item *
1909              
1910             filehandle
1911              
1912             An open filehandle from which to read the FASTA data.
1913              
1914             =item *
1915              
1916             pipe expression
1917              
1918             A pipe expression will also work. For example, a FASTA file on a remote
1919             web server can be loaded with an expression like this:
1920              
1921             $db->load_fasta("lynx -dump -source http://stein.cshl.org/fasta_test.fa |");
1922              
1923             =back
1924              
1925             =cut
1926              
1927             sub load_fasta {
1928 5     5 1 13 my $self = shift;
1929 5   50     15 my $file_or_directory = shift || '.';
1930 5         8 my $verbose = shift;
1931              
1932 5         10 local $self->{__verbose__} = $verbose;
1933 5 50 33     17 return $self->load_sequence($file_or_directory) if ref($file_or_directory)
1934             && tied *$file_or_directory;
1935              
1936 5         10 my $tied = tied(*STDIN);
1937 5 50       78 open my $SAVEIN, "<&STDIN" unless $tied;
1938 5 50       26 local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader
1939 5         35 my $result = $self->load_sequence('ARGV');
1940 5 50       156 open STDIN, '<', $SAVEIN unless $tied; # restore STDIN
1941 5         87 return $result;
1942             }
1943              
1944              
1945             =head2 load_fasta_file
1946              
1947             Title : load_fasta_file
1948             Usage : $db->load_fasta_file($file [,$verbose]);
1949             Function: load FASTA data into database
1950             Returns : count of records loaded
1951             Args : a path to a file
1952             Status : Public
1953              
1954             This is provided as an alternative to load_fasta. It doesn't munge
1955             STDIN or play tricks with ARGV.
1956              
1957             =cut
1958              
1959             sub load_fasta_file {
1960 0     0 1 0 my $self = shift;
1961 0         0 my $file = shift;
1962 0         0 my $verbose = shift;
1963 0 0       0 my $fh = IO::File->new($file) or return;
1964 0         0 return $self->do_load_fasta($fh);
1965             }
1966              
1967              
1968             =head2 load_sequence_string
1969              
1970             Title : load_sequence_string
1971             Usage : $db->load_sequence_string($id,$dna)
1972             Function: load a single DNA entry
1973             Returns : true if successfully loaded
1974             Args : a sequence ID and a raw sequence string (DNA, RNA, protein)
1975             Status : Public
1976              
1977             =cut
1978              
1979             sub load_sequence_string {
1980 0     0 1 0 my $self = shift;
1981 0         0 my ($acc,$seq) = @_;
1982 0         0 my $offset = 0;
1983 0 0       0 $self->insert_sequence_chunk($acc,\$offset,\$seq) or return;
1984 0 0       0 $self->insert_sequence($acc,$offset,$seq) or return;
1985 0         0 1;
1986             }
1987              
1988             sub setup_argv {
1989 10     10 0 18 my $self = shift;
1990 10         15 my $file_or_directory = shift;
1991 10         43 my @suffixes = @_;
1992 3     3   21 no strict 'refs'; # so that we can call fileno() on the argument
  3         3  
  3         12540  
1993              
1994 10         18 my @argv;
1995              
1996 10 100       193 if (-d $file_or_directory) {
    50          
    50          
1997             # Because glob() is broken with long file names that contain spaces
1998 5 50 33     55 $file_or_directory = Win32::GetShortPathName($file_or_directory)
1999             if $^O =~ /^MSWin/i && eval 'use Win32; 1';
2000 5         16 @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes;
  15         3097  
2001             }elsif (my $fd = fileno($file_or_directory)) {
2002 0 0       0 open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN");
2003 0         0 @argv = '-';
2004             } elsif (ref $file_or_directory) {
2005 0         0 @argv = @$file_or_directory;
2006             } else {
2007 5         12 @argv = $file_or_directory;
2008             }
2009              
2010 10         28 foreach (@argv) {
2011 45 50       374 if (/\.gz$/) {
    50          
    50          
2012 0         0 $_ = "gunzip -c $_ |";
2013             } elsif (/\.Z$/) {
2014 0         0 $_ = "uncompress -c $_ |";
2015             } elsif (/\.bz2$/) {
2016 0         0 $_ = "bunzip2 -c $_ |";
2017             }
2018             }
2019 10         53 @argv;
2020             }
2021              
2022             =head2 lock_on_load
2023              
2024             Title : lock_on_load
2025             Usage : $lock = $db->lock_on_load([$lock])
2026             Function: set write locking during load
2027             Returns : current value of lock-on-load flag
2028             Args : new value of lock-on-load-flag
2029             Status : Public
2030              
2031             This method is honored by some of the adaptors. If the value is true,
2032             the tables used by the GFF modules will be locked for writing during
2033             loads and inaccessible to other processes.
2034              
2035             =cut
2036              
2037             sub lock_on_load {
2038 0     0 1 0 my $self = shift;
2039 0         0 my $d = $self->{lock};
2040 0 0       0 $self->{lock} = shift if @_;
2041 0         0 $d;
2042             }
2043              
2044             =head2 meta
2045              
2046             Title : meta
2047             Usage : $value = $db->meta($name [,$newval])
2048             Function: get or set a meta variable
2049             Returns : a string
2050             Args : meta variable name and optionally value
2051             Status : abstract
2052              
2053             Get or set a named metavalue for the database. Metavalues can be
2054             used for database-specific settings.
2055              
2056             By default, this method does nothing!
2057              
2058             =cut
2059              
2060             sub meta {
2061 0     0 1 0 my $self = shift;
2062 0         0 my ($name,$value) = @_;
2063 0         0 return;
2064             }
2065              
2066             =head2 default_meta_values
2067              
2068             Title : default_meta_values
2069             Usage : %values = $db->default_meta_values
2070             Function: return the default meta values for the database
2071             Returns : a list of tag=>value pairs
2072             Args : none
2073             Status : protected
2074              
2075             This method returns a list of tag=>value pairs that contain default
2076             meta information about the database. It is invoked by initialize() to
2077             write out the default meta values. The base class version returns an
2078             empty list.
2079              
2080             For things to work properly, meta value names must be UPPERCASE.
2081              
2082             =cut
2083              
2084             sub default_meta_values {
2085 5     5 1 10 my $self = shift;
2086 5         10 return ();
2087             }
2088              
2089              
2090             =head2 error
2091              
2092             Title : error
2093             Usage : $db->error( [$new error] );
2094             Function: read or set error message
2095             Returns : error message
2096             Args : an optional argument to set the error message
2097             Status : Public
2098              
2099             This method can be used to retrieve the last error message. Errors
2100             are not reset to empty by successful calls, so contents are only valid
2101             immediately after an error condition has been detected.
2102              
2103             =cut
2104              
2105             sub error {
2106 8     8 1 12 my $self = shift;
2107 8         18 my $g = $self->{error};
2108 8 50       41 $self->{error} = join '',@_ if @_;
2109 8         13 $g;
2110             }
2111              
2112             =head2 debug
2113              
2114             Title : debug
2115             Usage : $db->debug( [$flag] );
2116             Function: read or set debug flag
2117             Returns : current value of debug flag
2118             Args : new debug flag (optional)
2119             Status : Public
2120              
2121             This method can be used to turn on debug messages. The exact nature
2122             of those messages depends on the adaptor in use.
2123              
2124             =cut
2125              
2126             sub debug {
2127 216     216 1 2596 my $self = shift;
2128 216         253 my $g = $self->{debug};
2129 216 100       919 $self->{debug} = shift if @_;
2130 216         398 $g;
2131             }
2132              
2133              
2134             =head2 automerge
2135              
2136             Title : automerge
2137             Usage : $db->automerge( [$new automerge] );
2138             Function: get or set automerge value
2139             Returns : current value (boolean)
2140             Args : an optional argument to set the automerge value
2141             Status : Public
2142              
2143             By default, this module will use the aggregators to merge groups into
2144             single composite objects. This default can be changed to false by
2145             calling automerge(0).
2146              
2147             =cut
2148              
2149             sub automerge {
2150 111     111 1 210 my $self = shift;
2151 111         163 my $g = $self->{automerge};
2152 111 100       229 $self->{automerge} = shift if @_;
2153 111         163 $g;
2154             }
2155              
2156             =head2 attributes
2157              
2158             Title : attributes
2159             Usage : @attributes = $db->attributes($id,$name)
2160             Function: get the "attributes" on a particular feature
2161             Returns : an array of string
2162             Args : feature ID and attribute name (both optional)
2163             Status : public
2164              
2165             Some GFF version 2 files use the groups column to store a series of
2166             attribute/value pairs. In this interpretation of GFF, the first such
2167             pair is treated as the primary group for the feature; subsequent pairs
2168             are treated as attributes. Two attributes have special meaning:
2169             "Note" is for backward compatibility and is used for unstructured text
2170             remarks. "Alias" is considered as a synonym for the feature name.
2171              
2172             If no name is provided, then attributes() returns a flattened hash, of
2173             attribute=>value pairs. This lets you do:
2174              
2175             %attributes = $db->attributes($id);
2176              
2177             If no arguments are provided, attributes() will return the list of
2178             all attribute names:
2179              
2180             @attribute_names = $db->attributes();
2181              
2182             Normally, however, attributes() will be called by the feature:
2183              
2184             @notes = $feature->attributes('Note');
2185              
2186             In a scalar context, attributes() returns the first value of the
2187             attribute if a tag is present, otherwise a hash reference in which the
2188             keys are attribute names and the values are anonymous arrays
2189             containing the values.
2190              
2191             =cut
2192              
2193             sub attributes {
2194 20     20 1 27 my $self = shift;
2195 20         63 my ($id,$tag) = @_;
2196 20 50       69 my @result = $self->do_attributes(@_) or return;
2197 20 100       80 return @result if wantarray;
2198              
2199             # what to do in a scalar context
2200 10 100       30 return $result[0] if $tag;
2201 5         8 my %result;
2202 5         23 while (my($key,$value) = splice(@result,0,2)) {
2203 15         15 push @{$result{$key}},$value;
  15         57  
2204             }
2205 5         25 return \%result;
2206             }
2207              
2208             =head2 fast_queries
2209              
2210             Title : fast_queries
2211             Usage : $flag = $db->fast_queries([$flag])
2212             Function: turn on and off the "fast queries" option
2213             Returns : a boolean
2214             Args : a boolean flag (optional)
2215             Status : public
2216              
2217             The mysql database driver (and possibly others) support a "fast" query
2218             mode that caches results on the server side. This makes queries come
2219             back faster, particularly when creating iterators. The downside is
2220             that while iterating, new queries will die with a "command synch"
2221             error. This method turns the feature on and off.
2222              
2223             For databases that do not support a fast query, this method has no
2224             effect.
2225              
2226             =cut
2227              
2228             # override this method in order to set the mysql_use_result attribute, which is an obscure
2229             # but extremely powerful optimization for both performance and memory.
2230             sub fast_queries {
2231 0     0 1 0 my $self = shift;
2232 0         0 my $d = $self->{fast_queries};
2233 0 0       0 $self->{fast_queries} = shift if @_;
2234 0         0 $d;
2235             }
2236              
2237             =head2 add_aggregator
2238              
2239             Title : add_aggregator
2240             Usage : $db->add_aggregator($aggregator)
2241             Function: add an aggregator to the list
2242             Returns : nothing
2243             Args : an aggregator
2244             Status : public
2245              
2246             This method will append an aggregator to the end of the list of
2247             registered aggregators. Three different argument types are accepted:
2248              
2249             1) a Bio::DB::GFF::Aggregator object -- will be added
2250             2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}"
2251             -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method
2252             part is optional).
2253             3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator
2254             subclass, where the token corresponds to the subclass name.
2255              
2256             =cut
2257              
2258             sub add_aggregator {
2259 20     20 1 58 my $self = shift;
2260 20         25 my $aggregator = shift;
2261 20   100     79 my $list = $self->{aggregators} ||= [];
2262 20 100       83 if (ref $aggregator) { # an object
    50          
2263 5         15 @$list = grep {$_->get_method ne $aggregator->get_method} @$list;
  10         28  
2264 5         13 push @$list,$aggregator;
2265             }
2266              
2267             elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) {
2268 0         0 my($agg_name,$subparts,$mainpart) = ($1,$2,$3);
2269 0         0 my @subparts = split /,\s*/,$subparts;
2270 0         0 my @args = (-method => $agg_name,
2271             -sub_parts => \@subparts);
2272 0 0       0 if ($mainpart) {
2273 0         0 push @args,(-main_method => $mainpart,
2274             -whole_object => 1);
2275             }
2276 0 0       0 warn "making an aggregator with (@args), subparts = @subparts" if $self->debug;
2277 0         0 push @$list,Bio::DB::GFF::Aggregator->new(@args);
2278             }
2279              
2280             else {
2281 15         57 my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E";
2282 15 50       771 eval "require $class; 1" or $self->throw("Unable to load $aggregator aggregator: $@");
2283 15         120 push @$list,$class->new();
2284             }
2285             }
2286              
2287             =head2 aggregators
2288              
2289             Title : aggregators
2290             Usage : $db->aggregators([@new_aggregators]);
2291             Function: retrieve list of aggregators
2292             Returns : list of aggregators
2293             Args : a list of aggregators to set (optional)
2294             Status : public
2295              
2296             This method will get or set the list of aggregators assigned to
2297             the database. If 1 or more arguments are passed, the existing
2298             set will be cleared.
2299              
2300             =cut
2301              
2302             sub aggregators {
2303 106     106 1 120 my $self = shift;
2304 106         113 my $d = $self->{aggregators};
2305 106 50       176 if (@_) {
2306 0         0 $self->clear_aggregators;
2307 0         0 $self->add_aggregator($_) foreach @_;
2308             }
2309 106 50       155 return unless $d;
2310 106         248 return @$d;
2311             }
2312              
2313             =head2 clear_aggregators
2314              
2315             Title : clear_aggregators
2316             Usage : $db->clear_aggregators
2317             Function: clears list of aggregators
2318             Returns : nothing
2319             Args : none
2320             Status : public
2321              
2322             This method will clear the aggregators stored in the database object.
2323             Use aggregators() or add_aggregator() to add some back.
2324              
2325             =cut
2326              
2327 5     5 1 26 sub clear_aggregators { shift->{aggregators} = [] }
2328              
2329             =head2 preferred_groups
2330              
2331             Title : preferred_groups
2332             Usage : $db->preferred_groups([$group_name_or_arrayref])
2333             Function: get/set list of groups for altering GFF2 parsing
2334             Returns : a list of classes
2335             Args : new list (scalar or array ref)
2336             Status : public
2337              
2338             =cut
2339              
2340             sub preferred_groups {
2341 15     15 1 55 my $self = shift;
2342 15         23 my $d = $self->{preferred_groups};
2343 15 100       33 if (@_) {
2344 5 50       12 my @v = map {ref($_) eq 'ARRAY' ? @$_ : $_} @_;
  5         33  
2345 5         15 $self->{preferred_groups} = \@v;
2346 5         21 delete $self->{preferred_groups_hash};
2347             }
2348 15 100       38 return unless $d;
2349 10         37 return @$d;
2350             }
2351              
2352             sub _preferred_groups_hash {
2353 144     144   154 my $self = shift;
2354 144         131 my $gff3 = shift;
2355 144 100       296 return $self->{preferred_groups_hash} if exists $self->{preferred_groups_hash};
2356 5         7 my $count = 0;
2357              
2358 5         13 my @preferred = $self->preferred_groups;
2359              
2360             # defaults
2361 5 50       12 if (!@preferred) {
2362 0 0 0     0 @preferred = $gff3 || $self->{load_data}{gff3_flag} ? qw(Target Parent ID) : qw(Target Sequence Transcript);
2363             }
2364              
2365 5         20 my %preferred = map {lc($_) => @preferred-$count++} @preferred;
  15         62  
2366 5         29 return $self->{preferred_groups_hash} = \%preferred;
2367             }
2368              
2369             =head1 Methods for use by Subclasses
2370              
2371             The following methods are chiefly of interest to subclasses and are
2372             not intended for use by end programmers.
2373              
2374             =head2 abscoords
2375              
2376             Title : abscoords
2377             Usage : $db->abscoords($name,$class,$refseq)
2378             Function: finds position of a landmark in reference coordinates
2379             Returns : ($ref,$class,$start,$stop,$strand)
2380             Args : name and class of landmark
2381             Status : public
2382              
2383             This method is called by Bio::DB::GFF::RelSegment to obtain the
2384             absolute coordinates of a sequence landmark. The arguments are the
2385             name and class of the landmark. If successful, abscoords() returns
2386             the ID of the reference sequence, its class, its start and stop
2387             positions, and the orientation of the reference sequence's coordinate
2388             system ("+" for forward strand, "-" for reverse strand).
2389              
2390             If $refseq is present in the argument list, it forces the query to
2391             search for the landmark in a particular reference sequence.
2392              
2393             =cut
2394              
2395             sub abscoords {
2396 193     193 1 212 my $self = shift;
2397 193         285 my ($name,$class,$refseq) = @_;
2398 193   33     264 $class ||= $self->{default_class};
2399 193         545 $self->get_abscoords($name,$class,$refseq);
2400             }
2401              
2402             =head1 Protected API
2403              
2404             The following methods are not intended for public consumption, but are
2405             intended to be overridden/implemented by adaptors.
2406              
2407             =head2 default_aggregators
2408              
2409             Title : default_aggregators
2410             Usage : $db->default_aggregators;
2411             Function: retrieve list of aggregators
2412             Returns : array reference containing list of aggregator names
2413             Args : none
2414             Status : protected
2415              
2416             This method (which is intended to be overridden by adaptors) returns a
2417             list of standard aggregators to be applied when no aggregators are
2418             specified in the constructor.
2419              
2420             =cut
2421              
2422             sub default_aggregators {
2423 0     0 1 0 my $self = shift;
2424 0         0 return ['processed_transcript','alignment'];
2425             }
2426              
2427             =head2 do_load_gff
2428              
2429             Title : do_load_gff
2430             Usage : $db->do_load_gff($handle)
2431             Function: load a GFF input stream
2432             Returns : number of features loaded
2433             Args : A filehandle.
2434             Status : protected
2435              
2436             This method is called to load a GFF data stream. The method will read
2437             GFF features from the supplied filehandle and load them into the
2438             database. On exit the method must return the number of features loaded.
2439              
2440             Note that the method is responsible for parsing the GFF lines. This
2441             is to allow for differences in the interpretation of the "group"
2442             field, which are legion.
2443              
2444             You probably want to use load_gff() instead. It is more flexible
2445             about the arguments it accepts.
2446              
2447             =cut
2448              
2449             sub do_load_gff {
2450 5     5 1 8 my $self = shift;
2451 5         143 my $io_handle = shift;
2452              
2453             local $self->{load_data} = {
2454 5 50 33     85 lineend => (-t STDERR && !$ENV{EMACS} ? "\r" : "\n"),
2455             count => 0
2456             };
2457              
2458 5         34 $self->setup_load();
2459 5         21 my $mode = 'gff';
2460              
2461 5         201 while (<$io_handle>) {
2462 214         259 chomp;
2463 214 50       388 if ($mode eq 'gff') {
    0          
2464 214 50       320 if (/^>/) { # Sequence coming
2465 0         0 $mode = 'fasta';
2466 0         0 $self->_load_sequence_start;
2467 0         0 $self->_load_sequence_line($_);
2468             } else {
2469 214         304 $self->_load_gff_line($_);
2470             }
2471             }
2472             elsif ($mode eq 'fasta') {
2473 0 0       0 if (/^##|\t/) { # Back to GFF mode
2474 0         0 $self->_load_sequence_finish;
2475 0         0 $mode = 'gff';
2476 0         0 $self->_load_gff_line($_);
2477             } else {
2478 0         0 $self->_load_sequence_line($_);
2479             }
2480             }
2481             }
2482 5         20 $self->finish_load();
2483 5         24 $self->_load_sequence_finish;
2484              
2485 5         15 return $self->{load_data}{count};
2486             }
2487              
2488             sub _load_gff_line {
2489 214     214   277 my $self = shift;
2490 214         251 my $line = shift;
2491 214         259 my $lineend = $self->{load_data}{lineend};
2492              
2493 214 100       366 $self->{load_data}{gff3_flag}++ if $line =~ /^\#\#\s*gff-version\s+3/;
2494              
2495 214 100 100     462 if (defined $self->{load_data}{gff3_flag} and !defined $self->{load_data}{gff3_warning}) {
2496 2         32 $self->print_gff3_warning();
2497 2         10 $self->{load_data}{gff3_warning}=1;
2498             }
2499              
2500 214 50       299 $self->preferred_groups(split(/\s+/,$1)) if $line =~ /^\#\#\s*group-tags?\s+(.+)/;
2501              
2502 214 100       344 if ($line =~ /^\#\#\s*sequence-region\s+(\S+)\s+(-?\d+)\s+(-?\d+)/i) { # header line
2503 10         114 $self->load_gff_line(
2504             {
2505             ref => $1,
2506             class => 'Sequence',
2507             source => 'reference',
2508             method => 'Component',
2509             start => $2,
2510             stop => $3,
2511             score => undef,
2512             strand => undef,
2513             phase => undef,
2514             gclass => 'Sequence',
2515             gname => $1,
2516             tstart => undef,
2517             tstop => undef,
2518             attributes => [],
2519             }
2520             );
2521 10         51 return $self->{load_data}{count}++;
2522             }
2523              
2524 204 100       312 return if /^#/;
2525              
2526 190         740 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t",$line;
2527 190 50 66     852 return unless defined($ref) && defined($method) && defined($start) && defined($stop);
      66        
      33        
2528 165         246 foreach (\$score,\$strand,\$phase) {
2529 495 100       709 undef $$_ if $$_ eq '.';
2530             }
2531              
2532 165         397 my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{load_data}{gff3_flag});
2533              
2534             # no standard way in the GFF file to denote the class of the reference sequence -- drat!
2535             # so we invoke the factory to do it
2536 165         349 my $class = $self->refclass($ref);
2537              
2538             # call subclass to do the dirty work
2539 165 50       353 if ($start > $stop) {
2540 0         0 ($start,$stop) = ($stop,$start);
2541 0 0       0 if ($strand eq '+') {
    0          
2542 0         0 $strand = '-';
2543             } elsif ($strand eq '-') {
2544 0         0 $strand = '+';
2545             }
2546             }
2547             # GFF2/3 transition stuff
2548 165 50       329 $gclass = [$gclass] unless ref $gclass;
2549 165 50       299 $gname = [$gname] unless ref $gname;
2550 165         329 for (my $i=0; $i<@$gname;$i++) {
2551 165         1369 $self->load_gff_line({ref => $ref,
2552             class => $class,
2553             source => $source,
2554             method => $method,
2555             start => $start,
2556             stop => $stop,
2557             score => $score,
2558             strand => $strand,
2559             phase => $phase,
2560             gclass => $gclass->[$i],
2561             gname => $gname->[$i],
2562             tstart => $tstart,
2563             tstop => $tstop,
2564             attributes => $attributes}
2565             );
2566 165         862 $self->{load_data}{count}++;
2567             }
2568             }
2569              
2570             sub _load_sequence_start {
2571 5     5   16 my $self = shift;
2572 5         12 my $ld = $self->{load_data};
2573 5         14 undef $ld->{id};
2574 5         15 $ld->{offset} = 0;
2575 5         12 $ld->{seq} = '';
2576             }
2577             sub _load_sequence_finish {
2578 10     10   22 my $self = shift;
2579 10         20 my $ld = $self->{load_data};
2580 10 100       38 $self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id};
2581             }
2582              
2583             sub _load_sequence_line {
2584 32950     32950   27547 my $self = shift;
2585 32950         27776 my $line = shift;
2586 32950         27441 my $ld = $self->{load_data};
2587 32950         27646 my $lineend = $ld->{lineend};
2588              
2589 32950 100       41398 if (/^>(\S+)/) {
2590 2620 100       6208 $self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id};
2591 2620         3811 $ld->{id} = $1;
2592 2620         2292 $ld->{offset} = 0;
2593 2620         2234 $ld->{seq} = '';
2594 2620         2308 $ld->{count}++;
2595 2620 50 33     8399 print STDERR $ld->{count}," sequences loaded$lineend" if $self->{__verbose__} && $ld->{count} % 1000 == 0;
2596             } else {
2597 30330         29425 $ld->{seq} .= $_;
2598 30330         37471 $self->insert_sequence_chunk($ld->{id},\$ld->{offset},\$ld->{seq});
2599             }
2600              
2601             }
2602              
2603             =head2 load_sequence
2604              
2605             Title : load_sequence
2606             Usage : $db->load_sequence($handle)
2607             Function: load a FASTA data stream
2608             Returns : number of sequences
2609             Args : a filehandle to the FASTA file
2610             Status : protected
2611              
2612             You probably want to use load_fasta() instead.
2613              
2614             =cut
2615              
2616             # note - there is some repeated code here
2617             sub load_sequence {
2618 5     5 1 13 my $self = shift;
2619 5         7 my $io_handle = shift;
2620              
2621             local $self->{load_data} = {
2622 5 50 33     46 lineend => (-t STDERR && !$ENV{EMACS} ? "\r" : "\n"),
2623             count => 0
2624             };
2625              
2626 5         23 $self->_load_sequence_start;
2627 5         152 while (<$io_handle>) {
2628 32950         30254 chomp;
2629 32950         32778 $self->_load_sequence_line($_);
2630             }
2631 5         26 $self->_load_sequence_finish;
2632 5         28 return $self->{load_data}{count};
2633             }
2634              
2635             sub insert_sequence_chunk {
2636 30330     30330 0 24507 my $self = shift;
2637 30330         32438 my ($id,$offsetp,$seqp) = @_;
2638 30330 50       28906 if (my $cs = $self->dna_chunk_size) {
2639 0         0 while (length($$seqp) >= $cs) {
2640 0         0 my $chunk = substr($$seqp,0,$cs);
2641 0         0 $self->insert_sequence($id,$$offsetp,$chunk);
2642 0         0 $$offsetp += length($chunk);
2643 0         0 substr($$seqp,0,$cs) = '';
2644             }
2645             }
2646 30330         61558 return 1; # the calling routine may expect success or failure
2647             }
2648              
2649             # used to store big pieces of DNA in itty bitty pieces
2650             sub dna_chunk_size {
2651 30330     30330 0 35971 return 0;
2652             }
2653              
2654             sub insert_sequence {
2655 0     0 0 0 my $self = shift;
2656 0         0 my($id,$offset,$seq) = @_;
2657 0         0 $self->throw('insert_sequence(): must be defined in subclass');
2658             }
2659              
2660             # This is the default class for reference points. Defaults to Sequence.
2661             sub default_class {
2662 335     335 0 351 my $self = shift;
2663 335 50       577 return 'Sequence' unless ref $self;
2664 335         503 my $d = $self->{default_class};
2665 335 100       503 $self->{default_class} = shift if @_;
2666 335         614 $d;
2667             }
2668              
2669             # gets name of the reference sequence, and returns its class
2670             # currently just calls default_class
2671             sub refclass {
2672 165     165 1 171 my $self = shift;
2673 165         149 my $name = shift;
2674 165         214 return $self->default_class;
2675             }
2676              
2677             =head2 setup_load
2678              
2679             Title : setup_load
2680             Usage : $db->setup_load
2681             Function: called before load_gff_line()
2682             Returns : void
2683             Args : none
2684             Status : abstract
2685              
2686             This method gives subclasses a chance to do any
2687             schema-specific initialization prior to loading a set of GFF records.
2688             Subclasses may override it; the default implementation does nothing.
2689              
2690             =cut
2691              
2692       0 1   sub setup_load {
2693             # default, do nothing
2694             }
2695              
2696             =head2 finish_load
2697              
2698             Title : finish_load
2699             Usage : $db->finish_load
2700             Function: called after load_gff_line()
2701             Returns : number of records loaded
2702             Args : none
2703             Status : abstract
2704              
2705             This method gives subclasses a chance to do any schema-specific
2706             cleanup after loading a set of GFF records.
2707              
2708             =cut
2709              
2710       0 1   sub finish_load {
2711             # default, do nothing
2712             }
2713              
2714             =head2 load_gff_line
2715              
2716             Title : load_gff_line
2717             Usage : $db->load_gff_line(@args)
2718             Function: called to load one parsed line of GFF
2719             Returns : true if successfully inserted
2720             Args : see below
2721             Status : abstract
2722              
2723             This abstract method is called once per line of the GFF and passed a
2724             hashref containing parsed GFF fields. The fields are:
2725              
2726             {ref => $ref,
2727             class => $class,
2728             source => $source,
2729             method => $method,
2730             start => $start,
2731             stop => $stop,
2732             score => $score,
2733             strand => $strand,
2734             phase => $phase,
2735             gclass => $gclass,
2736             gname => $gname,
2737             tstart => $tstart,
2738             tstop => $tstop,
2739             attributes => $attributes}
2740              
2741             =cut
2742              
2743             sub load_gff_line {
2744 0     0 1 0 shift->throw("load_gff_line(): must be implemented by an adaptor");
2745             }
2746              
2747              
2748             =head2 do_initialize
2749              
2750             Title : do_initialize
2751             Usage : $db->do_initialize([$erase])
2752             Function: initialize and possibly erase database
2753             Returns : true if successful
2754             Args : optional erase flag
2755             Status : protected
2756              
2757             This method implements the initialize() method described above, and
2758             takes the same arguments.
2759              
2760             =cut
2761              
2762             sub do_initialize {
2763 0     0 1 0 shift->throw('do_initialize(): must be implemented by an adaptor');
2764             }
2765              
2766             =head2 dna
2767              
2768             Title : dna
2769             Usage : $db->dna($id,$start,$stop,$class)
2770             Function: return the raw DNA string for a segment
2771             Returns : a raw DNA string
2772             Args : id of the sequence, start and stop positions, and its class
2773             Status : public
2774              
2775             This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA
2776             sequence.
2777              
2778             Arguments: -name sequence name
2779             -start start position
2780             -stop stop position
2781             -class sequence class
2782              
2783             If start and stop are both undef, then the entire DNA is retrieved.
2784             So to fetch the whole dna, call like this:
2785              
2786             $db->dna($name_of_sequence);
2787              
2788             or like this:
2789              
2790             $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence);
2791              
2792             NOTE: you will probably prefer to create a Segment and then invoke its
2793             dna() method.
2794              
2795             =cut
2796              
2797             # call to return the DNA string for the indicated region
2798             # real work is done by get_dna()
2799             sub dna {
2800 120     120 1 133 my $self = shift;
2801 120         410 my ($id,$start,$stop,$class) = rearrange([
2802             [qw(NAME ID REF REFSEQ)],
2803             qw(START),
2804             [qw(STOP END)],
2805             'CLASS',
2806             ],@_);
2807             # return unless defined $start && defined $stop;
2808 120         415 $self->get_dna($id,$start,$stop,$class);
2809             }
2810              
2811 0     0 0 0 sub fetch_sequence { shift->dna(@_) }
2812              
2813             sub features_in_range {
2814 85     85 0 89 my $self = shift;
2815 85         430 my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) =
2816             rearrange([
2817             [qw(RANGE_TYPE)],
2818             [qw(REF REFSEQ)],
2819             qw(CLASS),
2820             qw(START),
2821             [qw(STOP END)],
2822             [qw(TYPE TYPES)],
2823             qw(PARENT),
2824             [qw(RARE SPARSE)],
2825             [qw(MERGE AUTOMERGE)],
2826             'ITERATOR'
2827             ],@_);
2828 85   100     401 $other ||= {};
2829             # $automerge = $types && $self->automerge unless defined $automerge;
2830 85 100       218 $automerge = $self->automerge unless defined $automerge;
2831             $self->throw("range type must be one of {".
2832             join(',',keys %valid_range_types).
2833             "}\n")
2834 85 50       207 unless $valid_range_types{lc $range_type};
2835 85         543 $self->_features({
2836             rangetype => lc $range_type,
2837             refseq => $refseq,
2838             refclass => $class,
2839             start => $start,
2840             stop => $stop,
2841             types => $types },
2842             {
2843             sparse => $sparse,
2844             automerge => $automerge,
2845             iterator => $iterator,
2846             %$other,
2847             },
2848             $parent);
2849             }
2850              
2851             =head2 get_dna
2852              
2853             Title : get_dna
2854             Usage : $db->get_dna($id,$start,$stop,$class)
2855             Function: get DNA for indicated segment
2856             Returns : the dna string
2857             Args : sequence ID, start, stop and class
2858             Status : protected
2859              
2860             If start E<gt> stop and the sequence is nucleotide, then this method
2861             should return the reverse complement. The sequence class may be
2862             ignored by those databases that do not recognize different object
2863             types.
2864              
2865             =cut
2866              
2867             sub get_dna {
2868 0     0 1 0 my $self = shift;
2869 0         0 my ($id,$start,$stop,$class,) = @_;
2870 0         0 $self->throw("get_dna() must be implemented by an adaptor");
2871             }
2872              
2873             =head2 get_features
2874              
2875             Title : get_features
2876             Usage : $db->get_features($search,$options,$callback)
2877             Function: get list of features for a region
2878             Returns : count of number of features retrieved
2879             Args : see below
2880             Status : protected
2881              
2882             The first argument is a hash reference containing search criteria for
2883             retrieving features. It contains the following keys:
2884              
2885             rangetype One of "overlaps", "contains" or "contained_in". Indicates
2886             the type of range query requested.
2887              
2888             refseq ID of the landmark that establishes the absolute
2889             coordinate system.
2890              
2891             refclass Class of this landmark. Can be ignored by implementations
2892             that don't recognize such distinctions.
2893              
2894             start Start of the range, inclusive.
2895              
2896             stop Stop of the range, inclusive.
2897              
2898             types Array reference containing the list of annotation types
2899             to fetch from the database. Each annotation type is an
2900             array reference consisting of [source,method].
2901              
2902             The second argument is a hash reference containing certain options
2903             that affect the way information is retrieved:
2904              
2905             sort_by_group
2906             A flag. If true, means that the returned features should be
2907             sorted by the group that they're in.
2908              
2909             sparse A flag. If true, means that the expected density of the
2910             features is such that it will be more efficient to search
2911             by type rather than by range. If it is taking a long
2912             time to fetch features, give this a try.
2913              
2914             binsize A true value will create a set of artificial features whose
2915             start and stop positions indicate bins of the given size, and
2916             whose scores are the number of features in the bin. The
2917             class of the feature will be set to "bin", and its name to
2918             "method:source". This is a handy way of generating histograms
2919             of feature density.
2920              
2921             The third argument, the $callback, is a code reference to which
2922             retrieved features are passed. It is described in more detail below.
2923              
2924             This routine is responsible for getting arrays of GFF data out of the
2925             database and passing them to the callback subroutine. The callback
2926             does the work of constructing a Bio::DB::GFF::Feature object out of
2927             that data. The callback expects a list of 13 fields:
2928              
2929             $refseq The reference sequence
2930             $start feature start
2931             $stop feature stop
2932             $source feature source
2933             $method feature method
2934             $score feature score
2935             $strand feature strand
2936             $phase feature phase
2937             $groupclass group class (may be undef)
2938             $groupname group ID (may be undef)
2939             $tstart target start for similarity hits (may be undef)
2940             $tstop target stop for similarity hits (may be undef)
2941             $feature_id A unique feature ID (may be undef)
2942              
2943             These fields are in the same order as the raw GFF file, with the
2944             exception that the group column has been parsed into group class and
2945             group name fields.
2946              
2947             The feature ID, if provided, is a unique identifier of the feature
2948             line. The module does not depend on this ID in any way, but it is
2949             available via Bio::DB::GFF-E<gt>id() if wanted. In the dbi::mysql and
2950             dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb
2951             adaptor it is not used.
2952              
2953             =cut
2954              
2955             =head2 feature_summary(), coverage_array()
2956              
2957             The DBI adaptors provide methods for rapidly fetching coverage
2958             statistics across a region of interest. Please see
2959             L<Bio::DB::GFF::Adaptor::dbi> for more information about these
2960             methods.
2961              
2962             =cut
2963              
2964             sub get_features{
2965 0     0 1 0 my $self = shift;
2966 0         0 my ($search,$options,$callback) = @_;
2967 0         0 $self->throw("get_features() must be implemented by an adaptor");
2968             }
2969              
2970              
2971             =head2 _feature_by_name
2972              
2973             Title : _feature_by_name
2974             Usage : $db->_feature_by_name($class,$name,$location,$callback)
2975             Function: get a list of features by name and class
2976             Returns : count of number of features retrieved
2977             Args : name of feature, class of feature, and a callback
2978             Status : abstract
2979              
2980             This method is used internally. The callback arguments are the same
2981             as those used by make_feature(). This method must be overridden by
2982             subclasses.
2983              
2984             =cut
2985              
2986             sub _feature_by_name {
2987 0     0   0 my $self = shift;
2988 0         0 my ($class,$name,$location,$callback) = @_;
2989 0         0 $self->throw("_feature_by_name() must be implemented by an adaptor");
2990             }
2991              
2992             sub _feature_by_attribute {
2993 0     0   0 my $self = shift;
2994 0         0 my ($attributes,$callback) = @_;
2995 0         0 $self->throw("_feature_by_name() must be implemented by an adaptor");
2996             }
2997              
2998             =head2 _feature_by_id
2999              
3000             Title : _feature_by_id
3001             Usage : $db->_feature_by_id($ids,$type,$callback)
3002             Function: get features based on their feature or group IDs
3003             Returns : count of number of features retrieved
3004             Args : arrayref to feature IDs to fetch
3005             Status : abstract
3006              
3007             This method is used internally to fetch features either by their ID or
3008             their group ID. $ids is an arrayref containing a list of IDs, $type is
3009             one of "feature" or "group", and $callback is a callback. The
3010             callback arguments are the same as those used by make_feature(). This
3011             method must be overridden by subclasses.
3012              
3013             =cut
3014              
3015             sub _feature_by_id {
3016 0     0   0 my $self = shift;
3017 0         0 my ($ids,$type,$callback) = @_;
3018 0         0 $self->throw("_feature_by_id() must be implemented by an adaptor");
3019             }
3020              
3021             =head2 overlapping_features
3022              
3023             Title : overlapping_features
3024             Usage : $db->overlapping_features(@args)
3025             Function: get features that overlap the indicated range
3026             Returns : a list of Bio::DB::GFF::Feature objects
3027             Args : see below
3028             Status : public
3029              
3030             This method is invoked by Bio::DB::GFF::Segment-E<gt>features() to find
3031             the list of features that overlap a given range. It is generally
3032             preferable to create the Segment first, and then fetch the features.
3033              
3034             This method takes a set of named arguments:
3035              
3036             -refseq ID of the reference sequence
3037             -class Class of the reference sequence
3038             -start Start of the desired range in refseq coordinates
3039             -stop Stop of the desired range in refseq coordinates
3040             -types List of feature types to return. Argument is an array
3041             reference containing strings of the format "method:source"
3042             -parent A parent Bio::DB::GFF::Segment object, used to create
3043             relative coordinates in the generated features.
3044             -rare Turn on an optimization suitable for a relatively rare feature type,
3045             where it will be faster to filter by feature type first
3046             and then by position, rather than vice versa.
3047             -merge Whether to apply aggregators to the generated features.
3048             -iterator Whether to return an iterator across the features.
3049              
3050             If -iterator is true, then the method returns a single scalar value
3051             consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
3052             on this object to fetch each of the features in turn. If iterator is
3053             false or absent, then all the features are returned as a list.
3054              
3055             Currently aggregation is disabled when iterating over a series of
3056             features.
3057              
3058             Types are indicated using the nomenclature "method:source". Either of
3059             these fields can be omitted, in which case a wildcard is used for the
3060             missing field. Type names without the colon (e.g. "exon") are
3061             interpreted as the method name and a source wild card. Regular
3062             expressions are allowed in either field, as in: "similarity:BLAST.*".
3063              
3064             =cut
3065              
3066             # call to return the features that overlap the named region
3067             # real work is done by get_features
3068             sub overlapping_features {
3069 85     85 1 114 my $self = shift;
3070 85         219 $self->features_in_range(-range_type=>'overlaps',@_);
3071             }
3072              
3073             =head2 contained_features
3074              
3075             Title : contained_features
3076             Usage : $db->contained_features(@args)
3077             Function: get features that are contained within the indicated range
3078             Returns : a list of Bio::DB::GFF::Feature objects
3079             Args : see overlapping_features()
3080             Status : public
3081              
3082             This call is similar to overlapping_features(), except that it only
3083             retrieves features whose end points are completely contained within
3084             the specified range.
3085              
3086             Generally you will want to fetch a Bio::DB::GFF::Segment object and
3087             call its contained_features() method rather than call this directly.
3088              
3089             =cut
3090              
3091             # The same, except that it only returns features that are completely contained within the
3092             # range (much faster usually)
3093             sub contained_features {
3094 0     0 1 0 my $self = shift;
3095 0         0 $self->features_in_range(-range_type=>'contains',@_);
3096             }
3097              
3098             =head2 contained_in
3099              
3100             Title : contained_in
3101             Usage : @features = $s->contained_in(@args)
3102             Function: get features that contain this segment
3103             Returns : a list of Bio::DB::GFF::Feature objects
3104             Args : see features()
3105             Status : Public
3106              
3107             This is identical in behavior to features() except that it returns
3108             only those features that completely contain the segment.
3109              
3110             =cut
3111              
3112             sub contained_in {
3113 0     0 1 0 my $self = shift;
3114 0         0 $self->features_in_range(-range_type=>'contained_in',@_);
3115             }
3116              
3117             =head2 get_abscoords
3118              
3119             Title : get_abscoords
3120             Usage : $db->get_abscoords($name,$class,$refseq)
3121             Function: get the absolute coordinates of sequence with name & class
3122             Returns : ($absref,$absstart,$absstop,$absstrand)
3123             Args : name and class of the landmark
3124             Status : protected
3125              
3126             Given the name and class of a genomic landmark, this function returns
3127             a four-element array consisting of:
3128              
3129             $absref the ID of the reference sequence that contains this landmark
3130             $absstart the position at which the landmark starts
3131             $absstop the position at which the landmark stops
3132             $absstrand the strand of the landmark, relative to the reference sequence
3133              
3134             If $refseq is provided, the function searches only within the
3135             specified reference sequence.
3136              
3137             =cut
3138              
3139             sub get_abscoords {
3140 0     0 1 0 my $self = shift;
3141 0         0 my ($name,$class,$refseq) = @_;
3142 0         0 $self->throw("get_abscoords() must be implemented by an adaptor");
3143             }
3144              
3145             =head2 get_types
3146              
3147             Title : get_types
3148             Usage : $db->get_types($absref,$class,$start,$stop,$count)
3149             Function: get list of all feature types on the indicated segment
3150             Returns : list or hash of Bio::DB::GFF::Typename objects
3151             Args : see below
3152             Status : protected
3153              
3154             Arguments are:
3155              
3156             $absref the ID of the reference sequence
3157             $class the class of the reference sequence
3158             $start the position to start counting
3159             $stop the position to end counting
3160             $count a boolean indicating whether to count the number
3161             of occurrences of each feature type
3162              
3163             If $count is true, then a hash is returned. The keys of the hash are
3164             feature type names in the format "method:source" and the values are
3165             the number of times a feature of this type overlaps the indicated
3166             segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename
3167             objects. If $start or $stop are undef, then all features on the
3168             indicated segment are enumerated. If $absref is undef, then the call
3169             returns all feature types in the database.
3170              
3171             =cut
3172              
3173             sub get_types {
3174 0     0 1 0 my $self = shift;
3175 0         0 my ($refseq,$class,$start,$stop,$count,$types) = @_;
3176 0         0 $self->throw("get_types() must be implemented by an adaptor");
3177             }
3178              
3179             =head2 make_feature
3180              
3181             Title : make_feature
3182             Usage : $db->make_feature(@args)
3183             Function: Create a Bio::DB::GFF::Feature object from string data
3184             Returns : a Bio::DB::GFF::Feature object
3185             Args : see below
3186             Status : internal
3187              
3188             This takes 14 arguments (really!):
3189              
3190             $parent A Bio::DB::GFF::RelSegment object
3191             $group_hash A hashref containing unique list of GFF groups
3192             $refname The name of the reference sequence for this feature
3193             $refclass The class of the reference sequence for this feature
3194             $start Start of feature
3195             $stop Stop of feature
3196             $source Feature source field
3197             $method Feature method field
3198             $score Feature score field
3199             $strand Feature strand
3200             $phase Feature phase
3201             $group_class Class of feature group
3202             $group_name Name of feature group
3203             $tstart For homologies, start of hit on target
3204             $tstop Stop of hit on target
3205              
3206             The $parent argument, if present, is used to establish relative
3207             coordinates in the resulting Bio::DB::Feature object. This allows one
3208             feature to generate a list of other features that are relative to its
3209             coordinate system (for example, finding the coordinates of the second
3210             exon relative to the coordinates of the first).
3211              
3212             The $group_hash allows the group_class/group_name strings to be turned
3213             into rich database objects via the make_object() method (see above).
3214             Because these objects may be expensive to create, $group_hash is used
3215             to uniquefy them. The index of this hash is the composite key
3216             {$group_class,$group_name,$tstart,$tstop}. Values are whatever object
3217             is returned by the make_object() method.
3218              
3219             The remainder of the fields are taken from the GFF line, with the
3220             exception that "Target" features, which contain information about the
3221             target of a homology search, are parsed into their components.
3222              
3223             =cut
3224              
3225             # This call is responsible for turning a line of GFF into a
3226             # feature object.
3227             # The $parent argument is a Bio::DB::GFF::Segment object and is used
3228             # to establish the coordinate system for the new feature.
3229             # The $group_hash argument is an hash ref that holds previously-
3230             # generated group objects.
3231             # Other arguments are taken right out of the GFF table.
3232             sub make_feature {
3233 844     844 1 896 my $self = shift;
3234 844         1734 my ($parent,$group_hash, # these arguments provided by generic mechanisms
3235             $srcseq, # the rest is provided by adaptor
3236             $start,$stop,
3237             $source,$method,
3238             $score,$strand,$phase,
3239             $group_class,$group_name,
3240             $tstart,$tstop,
3241             $db_id,$group_id) = @_;
3242              
3243 844 100       1123 return unless $srcseq; # return undef if called with no arguments. This behavior is used for
3244             # on-the-fly aggregation.
3245              
3246 829         666 my $group; # undefined
3247 829 50 33     2042 if (defined $group_class && defined $group_name) {
3248 829   100     1699 $tstart ||= '';
3249 829   100     1580 $tstop ||= '';
3250 829 100       984 if ($group_hash) {
3251 559   66     1755 $group = $group_hash->{$group_class,$group_name,$tstart,$tstop}
3252             ||= $self->make_object($group_class,$group_name,$tstart,$tstop);
3253             } else {
3254 270         355 $group = $self->make_object($group_class,$group_name,$tstart,$tstop);
3255             }
3256             }
3257              
3258             # fix for some broken GFF files
3259             # unfortunately - has undesired side effects
3260             # if (defined $tstart && defined $tstop && !defined $strand) {
3261             # $strand = $tstart <= $tstop ? '+' : '-';
3262             # }
3263              
3264 829 100       1261 if (ref $parent) { # note that the src sequence is ignored
3265 505         1071 return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop,
3266             $method,$source,
3267             $score,$strand,$phase,
3268             $group,$db_id,$group_id,
3269             $tstart,$tstop);
3270             } else {
3271 324         682 return Bio::DB::GFF::Feature->new($self,$srcseq,
3272             $start,$stop,
3273             $method,$source,
3274             $score,$strand,$phase,
3275             $group,$db_id,$group_id,
3276             $tstart,$tstop);
3277             }
3278             }
3279              
3280             sub make_aggregated_feature {
3281 15     15 0 23 my $self = shift;
3282 15         30 my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3);
3283 15         33 my $feature = $self->make_feature($parent,undef,@_);
3284 15 50 66     29 return [$feature] if $feature && !$feature->group;
3285              
3286             # if we have accumulated features and either:
3287             # (1) make_feature() returned undef, indicated very end or
3288             # (2) the current group is different from the previous one
3289              
3290 15         48 local $^W = 0; # irritating uninitialized value warning in next statement
3291 15 50 66     64 if (@$accumulated_features &&
      100        
3292             (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) {
3293 10         20 foreach my $a (@$aggregators) { # last aggregator gets first shot
3294 30 50       63 $a->aggregate($accumulated_features,$self) or next;
3295             }
3296 10         20 my @result = @$accumulated_features;
3297 10 100       18 @$accumulated_features = $feature ? ($feature) : ();
3298 10 50       32 return unless @result;
3299 10         41 return \@result ;
3300             }
3301 5         10 push @$accumulated_features,$feature;
3302 5         13 return;
3303             }
3304              
3305             =head2 make_match_sub
3306              
3307             Title : make_match_sub
3308             Usage : $db->make_match_sub($types)
3309             Function: creates a subroutine used for filtering features
3310             Returns : a code reference
3311             Args : a list of parsed type names
3312             Status : protected
3313              
3314             This method is used internally to generate a code subroutine that will
3315             accept or reject a feature based on its method and source. It takes
3316             an array of parsed type names in the format returned by parse_types(),
3317             and generates an anonymous subroutine. The subroutine takes a single
3318             Bio::DB::GFF::Feature object and returns true if the feature matches
3319             one of the desired feature types, and false otherwise.
3320              
3321             =cut
3322              
3323             # a subroutine that matches features indicated by list of types
3324             sub make_match_sub {
3325 168     168 1 171 my $self = shift;
3326 168         186 my $types = shift;
3327              
3328 168 50 33 0   490 return sub { 1 } unless ref $types && @$types;
  0         0  
3329              
3330 168         153 my @expr;
3331 168         211 for my $type (@$types) {
3332 1432         1657 my ($method,$source) = @$type;
3333 1432 50       1770 $method = $method ? "\\Q$method\\E" : ".*";
3334 1432 50       1469 $source = $source ? ":\\Q$source\\E" : "(?::.+)?";
3335 1432         2150 push @expr,"${method}${source}";
3336             }
3337 168         367 my $expr = join '|',@expr;
3338 168 100       703 return $self->{match_subs}{$expr} if $self->{match_subs}{$expr};
3339              
3340 15         42 my $sub =<<END;
3341             sub {
3342             my \$feature = shift or return;
3343             return \$feature->type =~ /^($expr)\$/i;
3344             }
3345             END
3346 15 50       28 warn "match sub: $sub\n" if $self->debug;
3347 15         25 undef $@;
3348 15         2799 my $compiled_sub = eval $sub;
3349 15 50       42 $self->throw($@) if $@;
3350 15         81 return $self->{match_subs}{$expr} = $compiled_sub;
3351             }
3352              
3353             =head2 make_object
3354              
3355             Title : make_object
3356             Usage : $db->make_object($class,$name,$start,$stop)
3357             Function: creates a feature object
3358             Returns : a feature object
3359             Args : see below
3360             Status : protected
3361              
3362             This method is called to make an object from the GFF "group" field.
3363             By default, all Target groups are turned into Bio::DB::GFF::Homol
3364             objects, and everything else becomes a Bio::DB::GFF::Featname.
3365             However, adaptors are free to override this method to generate more
3366             interesting objects, such as true BioPerl objects, or Acedb objects.
3367              
3368             Arguments are:
3369              
3370             $name database ID for object
3371             $class class of object
3372             $start for similarities, start of match inside object
3373             $stop for similarities, stop of match inside object
3374              
3375             =cut
3376              
3377             # abstract call to turn a feature into an object, given its class and name
3378             sub make_object {
3379 594     594 1 582 my $self = shift;
3380 594         837 my ($class,$name,$start,$stop) = @_;
3381 594 100 66     1614 return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop)
3382             if defined $start and length $start;
3383 434         1031 return Bio::DB::GFF::Featname->new($class,$name);
3384             }
3385              
3386              
3387             =head2 do_attributes
3388              
3389             Title : do_attributes
3390             Usage : $db->do_attributes($id [,$tag]);
3391             Function: internal method to retrieve attributes given an id and tag
3392             Returns : a list of Bio::DB::GFF::Feature objects
3393             Args : a feature id and a attribute tag (optional)
3394             Status : protected
3395              
3396             This method is overridden by subclasses in order to return a list of
3397             attributes. If called with a tag, returns the value of attributes of
3398             that tag type. If called without a tag, returns a flattened array of
3399             (tag=E<gt>value) pairs. A particular tag can be present multiple times.
3400              
3401             =cut
3402              
3403             sub do_attributes {
3404 0     0 1 0 my $self = shift;
3405 0         0 my ($id,$tag) = @_;
3406 0         0 return ();
3407             }
3408              
3409             =head2 clone
3410              
3411             The clone() method should be used when you want to pass the
3412             Bio::DB::GFF object to a child process across a fork(). The child must
3413             call clone() before making any queries.
3414              
3415             The default behavior is to do nothing, but adaptors that use the DBI
3416             interface may need to implement this in order to avoid database handle
3417             errors. See the dbi adaptor for an example.
3418              
3419             =cut
3420              
3421       2 1   sub clone { }
3422              
3423              
3424             =head1 Internal Methods
3425              
3426             The following methods are internal to Bio::DB::GFF and are not
3427             guaranteed to remain the same.
3428              
3429             =head2 _features
3430              
3431             Title : _features
3432             Usage : $db->_features($search,$options,$parent)
3433             Function: internal method
3434             Returns : a list of Bio::DB::GFF::Feature objects
3435             Args : see below
3436             Status : internal
3437              
3438             This is an internal method that is called by overlapping_features(),
3439             contained_features() and features() to create features based on a
3440             parent segment's coordinate system. It takes three arguments, a
3441             search options hashref, an options hashref, and a parent segment.
3442              
3443             The search hashref contains the following keys:
3444              
3445             rangetype One of "overlaps", "contains" or "contained_in". Indicates
3446             the type of range query requested.
3447             refseq reference sequence ID
3448             refclass reference sequence class
3449             start start of range
3450             stop stop of range
3451             types arrayref containing list of types in "method:source" form
3452              
3453             The options hashref contains zero or more of the following keys:
3454              
3455             sparse turn on optimizations for a rare feature
3456             automerge if true, invoke aggregators to merge features
3457             iterator if true, return an iterator
3458              
3459             The $parent argument is a scalar object containing a
3460             Bio::DB::GFF::RelSegment object or descendent.
3461              
3462             =cut
3463              
3464             #'
3465              
3466             sub _features {
3467 100     100   129 my $self = shift;
3468 100         148 my ($search,$options,$parent) = @_;
3469 0         0 (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)})
  0         0  
3470 100 50 66     285 if defined($search->{start}) && $search->{start} > $search->{stop};
3471 100 50       183 $search->{refseq} = $search->{seq_id} if exists $search->{seq_id};
3472              
3473 100         290 my $types = $self->parse_types($search->{types}); # parse out list of types
3474 100         167 my @aggregated_types = @$types; # keep a copy
3475              
3476             # allow the aggregators to operate on the original
3477 100         82 my @aggregators;
3478 100 100       186 if ($options->{automerge}) {
3479 85         199 for my $a ($self->aggregators) {
3480 175 100       299 $a = $a->clone if $options->{iterator};
3481 175 100       424 unshift @aggregators,$a
3482             if $a->disaggregate(\@aggregated_types,$self);
3483             }
3484             }
3485              
3486 100 100       204 if ($options->{iterator}) {
3487 15         18 my @accumulated_features;
3488 15     15   48 my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) }
3489 15 100   270   75 : sub { [$self->make_feature($parent,undef,@_)] };
  270         497  
3490             return $self->get_features_iterator({ %$search,
3491             types => \@aggregated_types },
3492             { %$options,
3493             sort_by_group => $options->{automerge} },
3494 15         118 $callback
3495             );
3496             }
3497              
3498 85         106 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
3499 85         98 my $features = [];
3500              
3501 85     470   383 my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) };
  470         1023  
3502 85         587 $self->get_features({ %$search,
3503             types => \@aggregated_types },
3504             $options,
3505             $callback);
3506              
3507 85 100       274 if ($options->{automerge}) {
3508 80 50       167 warn "aggregating...\n" if $self->debug;
3509 80         135 foreach my $a (@aggregators) { # last aggregator gets first shot
3510 95 50       146 warn "Aggregator $a:\n" if $self->debug;
3511 95         256 $a->aggregate($features,$self);
3512             }
3513             }
3514              
3515 85         979 @$features;
3516             }
3517              
3518             =head2 get_features_iterator
3519              
3520             Title : get_features_iterator
3521             Usage : $db->get_features_iterator($search,$options,$callback)
3522             Function: get an iterator on a features query
3523             Returns : a Bio::SeqIO object
3524             Args : as per get_features()
3525             Status : Public
3526              
3527             This method takes the same arguments as get_features(), but returns an
3528             iterator that can be used to fetch features sequentially, as per
3529             Bio::SeqIO.
3530              
3531             Internally, this method is simply a front end to range_query().
3532             The latter method constructs and executes the query, returning a
3533             statement handle. This routine passes the statement handle to the
3534             constructor for the iterator, along with the callback.
3535              
3536             =cut
3537              
3538             sub get_features_iterator {
3539 0     0 1 0 my $self = shift;
3540 0         0 my ($search,$options,$callback) = @_;
3541 0         0 $self->throw('feature iteration is not implemented in this adaptor');
3542             }
3543              
3544             =head2 split_group
3545              
3546             Title : split_group
3547             Usage : $db->split_group($group_field,$gff3_flag)
3548             Function: parse GFF group field
3549             Returns : ($gclass,$gname,$tstart,$tstop,$attributes)
3550             Args : the gff group column and a flag indicating gff3 compatibility
3551             Status : internal
3552              
3553             This is a method that is called by load_gff_line to parse out the
3554             contents of one or more group fields. It returns the class of the
3555             group, its name, the start and stop of the target, if any, and an
3556             array reference containing any attributes that were stuck into the
3557             group field, in [attribute_name,attribute_value] format.
3558              
3559             =cut
3560              
3561             sub split_group {
3562 165     165 1 207 my $self = shift;
3563 165         296 my ($group,$gff3) = @_;
3564 165 100       231 if ($gff3) {
3565 66         170 my @groups = split /[;&]/,$group; # so easy!
3566 66         118 return $self->_split_gff3_group(@groups);
3567             } else {
3568             # handle group parsing
3569             # protect embedded semicolons in the group; there must be faster/more elegant way
3570             # to do this.
3571 99         150 $group =~ s/\\;/$;/g;
3572 99         201 while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 }
  0         0  
3573 99         261 my @groups = split(/\s*;\s*/,$group);
3574 99         141 foreach (@groups) { s/$;/;/g }
  138         288  
3575 99         183 return $self->_split_gff2_group(@groups);
3576             }
3577             }
3578              
3579             =head2 _split_gff2_group
3580              
3581             This is an internal method called by split_group().
3582              
3583             =cut
3584              
3585             # this has gotten quite nasty due to transition from GFF2 to GFF2.5
3586             # (artemis) to GFF3.
3587              
3588             sub _split_gff2_group {
3589 99     99   111 my $self = shift;
3590 99         159 my @groups = @_;
3591 99         87 my $target_found;
3592              
3593 99         102 my ($gclass,$gname,$tstart,$tstop,@attributes,@notes);
3594              
3595 99         108 for (@groups) {
3596              
3597 138         510 my ($tag,$value) = /^(\S+)(?:\s+(.+))?/;
3598 138 50       243 $value = '' unless defined $value;
3599 138 100       249 if ($value =~ /^\"(.+)\"$/) { #remove quotes
3600 24         54 $value = $1;
3601             }
3602 138         177 $value =~ s/\\t/\t/g;
3603 138         135 $value =~ s/\\r/\r/g;
3604 138         192 $value =~ s/\s+$//;
3605              
3606             # Any additional groups become part of the attributes hash
3607             # For historical reasons, the tag "Note" is treated as an
3608             # attribute, even if it is the only group.
3609 138   50     192 $tag ||= '';
3610 138 50 33     702 if ($tag eq 'tstart' && $target_found) {
    50 33        
    100 66        
    100          
    50          
3611 0         0 $tstart = $value;
3612             }
3613              
3614             elsif ($tag eq 'tend' && $target_found) {
3615 0         0 $tstop = $value;
3616             }
3617              
3618             elsif (ucfirst $tag eq 'Note') {
3619 12         33 push @notes, [$tag => $value];
3620             }
3621              
3622             elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) { # major disagreement in implementors of GFF2 here
3623 21         39 $target_found++;
3624 21         57 ($gclass,$gname) = ($1,$2);
3625 21         90 ($tstart,$tstop) = / (\d+) (\d+)/;
3626             }
3627              
3628             elsif (!defined($value)) {
3629 0         0 push @notes, [Note => $tag]; # e.g. "Confirmed_by_EST"
3630             }
3631              
3632             else {
3633 105         312 push @attributes, [$tag => $value];
3634             }
3635             }
3636              
3637             # group assignment
3638 99 100 33     288 if (@attributes && !($gclass && $gname) ) {
      66        
3639              
3640 78 50       195 my $preferred = ref($self) ? $self->_preferred_groups_hash : {};
3641              
3642 78         195 for my $pair (@attributes) {
3643 105         159 my ($c,$n) = @$pair;
3644             ($gclass,$gname) = ($c,$n)
3645             if !$gclass # pick up first one
3646             ||
3647 105 100 100     357 ($preferred->{lc $gclass}||0) < ($preferred->{lc $c}||0); # pick up higher priority one
      100        
      100        
3648             }
3649              
3650 78         111 @attributes = grep {$gclass ne $_->[0]} @attributes;
  105         252  
3651             }
3652              
3653 99         114 push @attributes, @notes;
3654              
3655 99         291 return ($gclass,$gname,$tstart,$tstop,\@attributes);
3656             }
3657              
3658              
3659             =head2 gff3_name_munging
3660              
3661             Title : gff3_name_munging
3662             Usage : $db->gff3_name_munging($boolean)
3663             Function: get/set gff3_name_munging flag
3664             Returns : $current value of flag
3665             Args : new value of flag (optional)
3666             Status : utility
3667              
3668             If this is set to true (default false), then features identified in
3669             gff3 files with an ID in the format foo:bar will be parsed so that
3670             "foo" is the class and "bar" is the name. This is mostly for backward
3671             compatibility with GFF2.
3672              
3673             =cut
3674              
3675             sub gff3_name_munging {
3676 71     71 1 101 my $self = shift;
3677 71         77 my $d = $self->{gff3_name_munging};
3678 71 100       122 $self->{gff3_name_munging} = shift if @_;
3679 71         96 $d;
3680             }
3681              
3682             =head2 _split_gff3_group
3683              
3684             This is called internally from split_group().
3685              
3686             =cut
3687              
3688             sub _split_gff3_group {
3689 66     66   72 my $self = shift;
3690 66         130 my @groups = @_;
3691 66         100 my $dc = $self->default_class;
3692 66         78 my (%id,@attributes);
3693              
3694 66         86 for my $group (@groups) {
3695 92         194 my ($tag,$value) = split /=/,$group;
3696 92         160 $tag = unescape($tag);
3697 92         154 my @values = map {unescape($_)} split /,/,$value;
  92         120  
3698              
3699             # GFF2 traditionally did not distinguish between a feature's name
3700             # and the group it belonged to. This code is a transition between
3701             # gff2 and the new parent/ID dichotomy in gff3.
3702 92 50 66     362 if ($tag eq 'Parent') {
    100          
    100          
    50          
3703 0         0 my (@names,@classes);
3704 0         0 for (@values) {
3705 0         0 my ($name,$class) = $self->_gff3_name_munging($_,$dc);
3706 0         0 push @names,$name;
3707 0         0 push @classes,$class;
3708             }
3709 0 0       0 $id{$tag} = @names > 1 ? [\@names,\@classes] : [$names[0],$classes[0]];
3710             }
3711             elsif ($tag eq 'ID' || $tag eq 'Name') {
3712 52         88 $id{$tag} = [$self->_gff3_name_munging(shift(@values),$dc)];
3713             }
3714             elsif ($tag eq 'Target') {
3715 14         68 my ($gname,$tstart,$tstop) = split /\s+/,shift @values;
3716 14         34 $id{$tag} = [$self->_gff3_name_munging($gname,$dc),$tstart,$tstop];
3717             }
3718             elsif ($tag =~ /synonym/i) {
3719 0         0 $tag = 'Alias';
3720             }
3721 92         202 push @attributes,[$tag=>$_] foreach @values;
3722             }
3723              
3724 66         138 my $priorities = $self->_preferred_groups_hash(1);
3725 66         78 my ($gclass,$gname,$tstart,$tstop);
3726 66         140 for my $preferred (sort {$priorities->{lc $b}<=>$priorities->{lc $a}}
  0         0  
3727             keys %id) {
3728 66 50       106 unless (defined $gname) {
3729 66         64 ($gname,$gclass,$tstart,$tstop) = @{$id{$preferred}};
  66         140  
3730             }
3731             }
3732              
3733             # set null gclass to empty string to preserve compatibility with
3734             # programs that expect a defined gclass if no gname
3735 66 50 50     142 $gclass ||= '' if defined $gname;
3736              
3737 66         250 return ($gclass,$gname,$tstart,$tstop,\@attributes);
3738             }
3739              
3740             # accommodation for wormbase style of class:name naming
3741             sub _gff3_name_munging {
3742 66     66   70 my $self = shift;
3743 66         76 my ($name,$default_class) = @_;
3744 66 50       88 return ($name,$default_class) unless $self->gff3_name_munging;
3745              
3746 66 50       224 if ($name =~ /^(\w+):(.+)/) {
3747 66         266 return ($2,$1);
3748             } else {
3749 0         0 return ($name,$default_class);
3750             }
3751             }
3752              
3753             =head2 _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features()
3754              
3755             Title : _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features()
3756             Usage : $count = $db->_delete_features(@feature_ids)
3757             $count = $db->_delete_groups(@group_ids)
3758             $count = $db->_delete(\%delete_spec)
3759             $count = $db->_delete_fattribute_to_features(@feature_ids)
3760             Function: low-level feature/group deleter
3761             Returns : count of groups removed
3762             Args : list of feature or group ids to remove, or (for _delete) a hashref of deletion options
3763             Status : for implementation by subclasses
3764              
3765             These methods need to be implemented in adaptors. For _delete_features,
3766             _delete_groups and _delete_fattribute_to_features, the arguments are a list of
3767             feature or group IDs to remove. For _delete(), the argument is a hashref with
3768             the three keys 'segments', 'types' and 'force'. The first contains an arrayref
3769             of Bio::DB::GFF::RelSegment objects to delete (all FEATURES within the segment
3770             are deleted). The second contains an arrayref of [method,source] feature types
3771             to delete. The two are ANDed together. If 'force' has a true value, this
3772             forces the operation to continue even if it would delete all features.
3773              
3774             =cut
3775              
3776             sub _delete_features {
3777 0     0   0 my $self = shift;
3778 0         0 my @feature_ids = @_;
3779 0         0 $self->throw('_delete_features is not implemented in this adaptor');
3780             }
3781              
3782             sub _delete_groups {
3783 5     5   10 my $self = shift;
3784 5         8 my @group_ids = @_;
3785 5         24 $self->throw('_delete_groups is not implemented in this adaptor');
3786             }
3787              
3788             sub _delete {
3789 0     0   0 my $self = shift;
3790 0         0 my $delete_options = shift;
3791 0         0 $self->throw('_delete is not implemented in this adaptor');
3792             }
3793              
3794             sub _delete_fattribute_to_features {
3795 0     0   0 my $self = shift;
3796 0         0 my @feature_ids = @_;
3797 0         0 $self->throw('_delete_fattribute_to_features is not implemented in this adaptor');
3798             }
3799              
3800              
3801             sub unescape {
3802 184     184 0 184 my $v = shift;
3803 184         204 $v =~ tr/+/ /;
3804 184         220 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  0         0  
3805 184         280 return $v;
3806             }
3807              
3808             sub print_gff3_warning {
3809 2     2 0 6 my $self = shift;
3810 2         66 print STDERR <<END
3811              
3812             You are loading a Bio::DB::GFF database with GFF3 formatted data.
3813             While this will likely work fine, the Bio::DB::GFF schema does not
3814             always faithfully capture the complexity represented in GFF3 files.
3815             Unless you have a specific reason for using Bio::DB::GFF, we suggest
3816             that you use a Bio::DB::SeqFeature::Store database and its corresponding
3817             loader, bp_seqfeature_load.pl.
3818              
3819             END
3820             ;
3821              
3822 2         18 return;
3823             }
3824              
3825              
3826             package Bio::DB::GFF::ID_Iterator;
3827 3     3   18 use strict;
  3         3  
  3         57  
3828              
3829 3     3   15 use base qw(Bio::Root::Root);
  3         3  
  3         729  
3830              
3831             sub new {
3832 0     0     my $class = shift;
3833 0           my ($db,$ids,$type) = @_;
3834 0           return bless {ids=>$ids,db=>$db,type=>$type},$class;
3835             }
3836              
3837             sub next_seq {
3838 0     0     my $self = shift;
3839 0           my $next = shift @{$self->{ids}};
  0            
3840 0 0         return unless $next;
3841 0 0         my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next;
3842             my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name)
3843             : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name)
3844 0 0         : $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name)
    0          
    0          
3845             : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier");
3846 0 0         $self->throw("id does not exist") unless $segment;
3847 0           return $segment;
3848             }
3849              
3850             package Bio::DB::GFF::FeatureIterator;
3851              
3852             sub new {
3853 0     0     my $self = shift;
3854 0           my @features = @_;
3855 0   0       return bless \@features,ref $self || $self;
3856             }
3857             sub next_seq {
3858 0     0     my $self = shift;
3859 0 0         return unless @$self;
3860 0           return shift @$self;
3861             }
3862              
3863              
3864             1;
3865              
3866             __END__