File Coverage

Bio/DB/GFF.pm

Criterion	Covered	Total	%
statement	511	710	71.9
branch	198	362	54.7
condition	75	135	55.5
subroutine	73	119	61.3
pod	67	79	84.8
total	924	1405	65.7

line	stmt	bran	cond	sub	pod	time	code
1
2							=head1 NAME
3
4							Bio::DB::GFF -- Storage and retrieval of sequence annotation data
5
6							=head1 SYNOPSIS
7
8							use Bio::DB::GFF;
9
10							# Open the sequence database
11							my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt',
12							-dsn => 'dbi:mysql:elegans');
13
14							# fetch a 1 megabase segment of sequence starting at landmark "ZK909"
15							my $segment = $db->segment('ZK909', 1 => 1000000);
16
17							# pull out all transcript features
18							my @transcripts = $segment->features('transcript');
19
20							# for each transcript, total the length of the introns
21							my %totals;
22							for my $t (@transcripts) {
23							my @introns = $t->Intron;
24							$totals{$t->name} += $_->length foreach @introns;
25							}
26
27							# Sort the exons of the first transcript by position
28							my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon;
29
30							# Get a region 1000 bp upstream of first exon
31							my $upstream = $exons[0]->subseq(-1000,0);
32
33							# get its DNA
34							my $dna = $upstream->seq;
35
36							# and get all curated polymorphisms inside it
37							@polymorphisms = $upstream->contained_features('polymorphism:curated');
38
39							# get all feature types in the database
40							my @types = $db->types;
41
42							# count all feature types in the segment
43							my %type_counts = $segment->types(-enumerate=>1);
44
45							# get an iterator on all curated features of type 'exon' or 'intron'
46							my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']);
47
48							while (my $s = $iterator->next_seq) {
49							print $s,"\n";
50							}
51
52							# find all transcripts annotated as having function 'kinase'
53							my $iterator = $db->get_seq_stream(-type=>'transcript',
54							-attributes=>{Function=>'kinase'});
55							while (my $s = $iterator->next_seq) {
56							print $s,"\n";
57							}
58
59							=head1 DESCRIPTION
60
61							Bio::DB::GFF provides fast indexed access to a sequence annotation
62							database. It supports multiple database types (ACeDB, relational),
63							and multiple schemas through a system of adaptors and aggregators.
64
65							The following operations are supported by this module:
66
67							- retrieving a segment of sequence based on the ID of a landmark
68							- retrieving the DNA from that segment
69							- finding all annotations that overlap with the segment
70							- finding all annotations that are completely contained within the
71							segment
72							- retrieving all annotations of a particular type, either within a
73							segment, or globally
74							- conversion from absolute to relative coordinates and back again,
75							using any arbitrary landmark for the relative coordinates
76							- using a sequence segment to create new segments based on relative
77							offsets
78
79							The data model used by Bio::DB::GFF is compatible with the GFF flat
80							file format (L). The module
81							can load a set of GFF files into the database, and serves objects that
82							have methods corresponding to GFF fields.
83
84							The objects returned by Bio::DB::GFF are compatible with the
85							SeqFeatureI interface, allowing their use by the Bio::Graphics and
86							Bio::DAS modules.
87
88							=head2 Auxiliary Scripts
89
90							The bioperl distribution includes several scripts that make it easier
91							to work with Bio::DB::GFF databases. They are located in the scripts
92							directory under a subdirectory named Bio::DB::GFF:
93
94							=over 4
95
96							=item *
97
98							bp_load_gff.pl
99
100							This script will load a Bio::DB::GFF database from a flat GFF file of
101							sequence annotations. Only the relational database version of
102							Bio::DB::GFF is supported. It can be used to create the database from
103							scratch, as well as to incrementally load new data.
104
105							This script takes a --fasta argument to load raw DNA into the database
106							as well. However, GFF databases do not require access to the raw DNA
107							for most of their functionality.
108
109							load_gff.pl also has a --upgrade option, which will perform a
110							non-destructive upgrade of older schemas to newer ones.
111
112							=item *
113
114							bp_bulk_load_gff.pl
115
116							This script will populate a Bio::DB::GFF database from a flat GFF file
117							of sequence annotations. Only the MySQL database version of
118							Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in
119							order to accelerate loading considerably; however, it can only be used
120							for the initial load, and not for updates.
121
122							This script takes a --fasta argument to load raw DNA into the database
123							as well. However, GFF databases do not require access to the raw DNA
124							for most of their functionality.
125
126							=item *
127
128							bp_fast_load_gff.pl
129
130							This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe
131							tricks to allow for incremental updates. It only supports the MySQL
132							database version of Bio::DB::GFF and is guaranteed not to work on
133							non-Unix platforms.
134
135							Arguments are the same as bp_load_gff.pl
136
137							=item *
138
139							gadfly_to_gff.pl
140
141							This script will convert the GFF-like format used by the Berkeley
142							Drosophila Sequencing project into a format suitable for use with this
143							module.
144
145							=item *
146
147							sgd_to_gff.pl
148
149							This script will convert the tab-delimited feature files used by the
150							Saccharomyces Genome Database into a format suitable for use with this
151							module.
152
153							=back
154
155							=head2 GFF Fundamentals
156
157							The GFF format is a flat tab-delimited file, each line of which
158							corresponds to an annotation, or feature. Each line has nine columns
159							and looks like this:
160
161							Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7"
162
163							The 9 columns are as follows:
164
165							=over 4
166
167							=item 1.
168
169							reference sequence
170
171							This is the ID of the sequence that is used to establish the
172							coordinate system of the annotation. In the example above, the
173							reference sequence is "Chr1".
174
175							=item 2.
176
177							source
178
179							The source of the annotation. This field describes how the annotation
180							was derived. In the example above, the source is "curated" to
181							indicate that the feature is the result of human curation. The names
182							and versions of software programs are often used for the source field,
183							as in "tRNAScan-SE/1.2".
184
185							=item 3.
186
187							method
188
189							The annotation method. This field describes the type of the
190							annotation, such as "CDS". Together the method and source describe
191							the annotation type.
192
193							=item 4.
194
195							start position
196
197							The start of the annotation relative to the reference sequence.
198
199							=item 5.
200
201							stop position
202
203							The stop of the annotation relative to the reference sequence. Start
204							is always less than or equal to stop.
205
206							=item 6.
207
208							score
209
210							For annotations that are associated with a numeric score (for example,
211							a sequence similarity), this field describes the score. The score
212							units are completely unspecified, but for sequence similarities, it is
213							typically percent identity. Annotations that don't have a score can
214							use "."
215
216							=item 7.
217
218							strand
219
220							For those annotations which are strand-specific, this field is the
221							strand on which the annotation resides. It is "+" for the forward
222							strand, "-" for the reverse strand, or "." for annotations that are
223							not stranded.
224
225							=item 8.
226
227							phase
228
229							For annotations that are linked to proteins, this field describes the
230							phase of the annotation on the codons. It is a number from 0 to 2, or
231							"." for features that have no phase.
232
233							=item 9.
234
235							group
236
237							GFF provides a simple way of generating annotation hierarchies ("is
238							composed of" relationships) by providing a group field. The group
239							field contains the class and ID of an annotation which is the logical
240							parent of the current one. In the example given above, the group is
241							the Transcript named "R119.7".
242
243							The group field is also used to store information about the target of
244							sequence similarity hits, and miscellaneous notes. See the next
245							section for a description of how to describe similarity targets.
246
247							The format of the group fields is "Class ID" with a single space (not
248							a tab) separating the class from the ID. It is VERY IMPORTANT to
249							follow this format, or grouping will not work properly.
250
251							=back
252
253							The sequences used to establish the coordinate system for annotations
254							can correspond to sequenced clones, clone fragments, contigs or
255							super-contigs. Thus, this module can be used throughout the lifecycle
256							of a sequencing project.
257
258							In addition to a group ID, the GFF format allows annotations to have a
259							group class. For example, in the ACeDB representation, RNA
260							interference experiments have a class of "RNAi" and an ID that is
261							unique among the RNAi experiments. Since not all databases support
262							this notion, the class is optional in all calls to this module, and
263							defaults to "Sequence" when not provided.
264
265							Double-quotes are sometimes used in GFF files around components of the
266							group field. Strictly, this is only necessary if the group name or
267							class contains whitespace.
268
269							=head2 Making GFF files work with this module
270
271							Some annotations do not need to be individually named. For example,
272							it is probably not useful to assign a unique name to each ALU repeat
273							in a vertebrate genome. Others, such as predicted genes, correspond
274							to named biological objects; you probably want to be able to fetch the
275							positions of these objects by referring to them by name.
276
277							To accommodate named annotations, the GFF format places the object
278							class and name in the group field. The name identifies the object,
279							and the class prevents similarly-named objects, for example clones and
280							sequences, from collding.
281
282							A named object is shown in the following excerpt from a GFF file:
283
284							Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2
285
286							This object is a predicted transcript named Y95BA.2. In this case,
287							the group field is used to identify the class and name of the object,
288							even though no other annotation belongs to that group.
289
290							It now becomes possible to retrieve the region of the genome covered
291							by transcript Y95B8A.2 using the segment() method:
292
293							$segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2');
294
295							It is not necessary for the annotation's method to correspond to the
296							object class, although this is commonly the case.
297
298							As explained above, each annotation in a GFF file refers to a
299							reference sequence. It is important that each reference sequence also
300							be identified by a line in the GFF file. This allows the Bio::DB::GFF
301							module to determine the length and class of the reference sequence,
302							and makes it possible to do relative arithmetic.
303
304							For example, if "Chr1" is used as a reference sequence, then it should
305							have an entry in the GFF file similar to this one:
306
307							Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1
308
309							This indicates that the reference sequence named "Chr1" has length
310							14972282 bp, method "chromosome" and source "assembly". In addition,
311							as indicated by the group field, Chr1 has class "Sequence" and name
312							"Chr1".
313
314							The object class "Sequence" is used by default when the class is not
315							specified in the segment() call. This allows you to use a shortcut
316							form of the segment() method:
317
318							$segment = $db->segment('Chr1'); # whole chromosome
319							$segment = $db->segment('Chr1',1=>1000); # first 1000 bp
320
321							For your convenience, if, during loading a GFF file, Bio::DB::GFF
322							encounters a line like the following:
323
324							##sequence-region Chr1 1 14972282
325
326							It will automatically generate the following entry:
327
328							Chr1 reference Component 1 14972282 . + . Sequence Chr1
329
330							This is sufficient to use Chr1 as a reference point.
331							The ##sequence-region line is frequently found in the GFF files
332							distributed by annotation groups.
333
334							=head2 Specifying the group tag
335
336							A frequent problem with GFF files is the problem distinguishing
337							which of the several tag/value pairs in the 9th column is the grouping
338							pair. Ordinarily the first tag will be used for grouping, but some
339							GFF manipulating tools do not preserve the order of attributes. To
340							eliminate this ambiguity, this module provides two ways of explicitly
341							specifying which tag to group on:
342
343							=over 4
344
345							=item *
346
347							Using -preferred_groups
348
349							When you create a Bio::DB::GFF object, pass it a -preferred_groups=E
350							argument. This specifies a tag that will be used for grouping. You
351							can pass an array reference to specify a list of such tags.
352
353							=item *
354
355							In the GFF header
356
357							The GFF file itself can specify which tags are to be used for
358							grouping. Insert a comment like the following:
359
360							##group-tags Accession Locus
361
362							This says to use the Accession tag for grouping. If it is not
363							available, use the Locus tag. If neither tag is available, use the
364							first pair to appear.
365
366							=back
367
368							These options only apply when B a GFF file into the database,
369							and have no effect on existing databases.
370
371							The group-tags comment in the GFF file will override the preferred
372							groups set when you create the Bio::DB::GFF object.
373
374							For backward compatibility, the tags Sequence and Transcript are
375							always treated as grouping tags unless preferred_tags are specified.
376							The "Target" tag is always used for grouping regardless of the
377							preferred_groups() setting, and the tags "tstart", "tend" and "Note"
378							cannot be used for grouping. These are historical artefacts coming
379							from various interpretations of GFF2, and cannot be changed.
380
381							=head2 Sequence alignments
382
383							There are two cases in which an annotation indicates the relationship
384							between two sequences. The first case is a similarity hit, where the
385							annotation indicates an alignment. The second case is a map assembly,
386							in which the annotation indicates that a portion of a larger sequence
387							is built up from one or more smaller ones.
388
389							Both cases are indicated by using the B tag in the group
390							field. For example, a typical similarity hit will look like this:
391
392							Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544
393
394							The group field contains the Target tag, followed by an identifier for
395							the biological object referred to. The GFF format uses the notation
396							I:I for the biological object, and even though this is
397							stylistically inconsistent, that's the way it's done. The object
398							identifier is followed by two integers indicating the start and stop
399							of the alignment on the target sequence.
400
401							Unlike the main start and stop columns, it is possible for the target
402							start to be greater than the target end. The previous example
403							indicates that the the section of Chr1 from 76,953 to 77,108 aligns to
404							the protein SW:ABL_DROME starting at position 493 and extending to
405							position 544.
406
407							A similar notation is used for sequence assembly information as shown
408							in this example:
409
410							Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826
411							LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742
412
413							This indicates that the region between bases 10922906 and 11177731 of
414							Chr1 are composed of LINK_H06O01 from bp 1 to bp 254826. The region
415							of LINK_H0601 between 32386 and 64122 is, in turn, composed of the
416							bases 5 to 31742 of cosmid F49B2.
417
418							=head2 Attributes
419
420							While not intended to serve as a general-purpose sequence database
421							(see bioperl-db for that), GFF allows you to tag features with
422							arbitrary attributes. Attributes appear in the Group field following
423							the initial class/name pair. For example:
424
425							Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3
426
427							This line tags the feature named Transcript Y95B8A.2 as being "Gene"
428							named sma-3 and having the Alias "sma3". Features having these
429							attributes can be looked up using the fetch_feature_by_attribute() method.
430
431							Two attributes have special meaning: "Note" is for backward
432							compatibility and is used for unstructured text remarks. "Alias" is
433							considered as a synonym for the feature name and will be consulted
434							when looking up a feature by its name.
435
436							=head2 Adaptors and Aggregators
437
438							This module uses a system of adaptors and aggregators in order to make
439							it adaptable to use with a variety of databases.
440
441							=over 4
442
443							=item *
444
445							Adaptors
446
447							The core of the module handles the user API, annotation coordinate
448							arithmetic, and other common issues. The details of fetching
449							information from databases is handled by an adaptor, which is
450							specified during Bio::DB::GFF construction. The adaptor encapsulates
451							database-specific information such as the schema, user authentication
452							and access methods.
453
454							There are currently five adaptors recommended for general use:
455
456							Adaptor Name Description
457							------------ -----------
458
459							memory A simple in-memory database suitable for testing
460							and small data sets.
461
462							berkeleydb An indexed file database based on the DB_File module,
463							suitable for medium-sized read-only data sets.
464
465							dbi::mysql An interface to a schema implemented in the Mysql
466							relational database management system.
467
468							dbi::oracle An interface to a schema implemented in the Oracle
469							relational database management system.
470
471							dbi::pg An interface to a schema implemented in the PostgreSQL
472							relational database management system.
473
474							Check the Bio/DB/GFF/Adaptor directory and subdirectories for other,
475							more specialized adaptors, as well as experimental ones.
476
477							=item *
478
479							Aggregators
480
481							The GFF format uses a "group" field to indicate aggregation properties
482							of individual features. For example, a set of exons and introns may
483							share a common transcript group, and multiple transcripts may share
484							the same gene group.
485
486							Aggregators are small modules that use the group information to
487							rebuild the hierarchy. When a Bio::DB::GFF object is created, you
488							indicate that it use a set of one or more aggregators. Each
489							aggregator provides a new composite annotation type. Before the
490							database query is generated each aggregator is called to
491							"disaggregate" its annotation type into list of component types
492							contained in the database. After the query is generated, each
493							aggregator is called again in order to build composite annotations
494							from the returned components.
495
496							For example, during disaggregation, the standard
497							"processed_transcript" aggregator generates a list of component
498							feature types including "UTR", "CDS", and "polyA_site". Later, it
499							aggregates these features into a set of annotations of type
500							"processed_transcript".
501
502							During aggregation, the list of aggregators is called in reverse
503							order. This allows aggregators to collaborate to create multi-level
504							structures: the transcript aggregator assembles transcripts from
505							introns and exons; the gene aggregator then assembles genes from sets
506							of transcripts.
507
508							Three default aggregators are provided:
509
510							transcript assembles transcripts from features of type
511							exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA
512							clone assembles clones from Clone_left_end, Clone_right_end
513							and Sequence features.
514							alignment assembles gapped alignments from features of type
515							"similarity".
516
517							In addition, this module provides the optional "wormbase_gene"
518							aggregator, which accommodates the WormBase representation of genes.
519							This aggregator aggregates features of method "exon", "CDS", "5'UTR",
520							"3'UTR", "polyA" and "TSS" into a single object. It also expects to
521							find a single feature of type "Sequence" that spans the entire gene.
522
523							The existing aggregators are easily customized.
524
525							Note that aggregation will not occur unless you specifically request
526							the aggregation type. For example, this call:
527
528							@features = $segment->features('alignment');
529
530							will generate an array of aggregated alignment features. However,
531							this call:
532
533							@features = $segment->features();
534
535							will return a list of unaggregated similarity segments.
536
537							For more informnation, see the manual pages for
538							Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone,
539							etc.
540
541							=back
542
543							=head2 Loading GFF3 Files
544
545							This module will accept GFF3 files, as described at
546							http://song.sourceforge.net/gff3.shtml. However, the implementation
547							has some limitations.
548
549							=over 4
550
551							=item GFF version string is required
552
553							The GFF file B contain the version comment:
554
555							##gff-version 3
556
557							Unless this version string is present at the top of the GFF file, the
558							loader will attempt to parse the file in GFF2 format, with
559							less-than-desirable results.
560
561							=item Only one level of nesting allowed
562
563							A major restriction is that Bio::DB::GFF only allows one level of
564							nesting of features. For nesting, the Target tag will be used
565							preferentially followed by the ID tag, followed by the Parent tag.
566							This means that if genes are represented like this:
567
568							XXXX XXXX gene XXXX XXXX XXXX ID=myGene
569							XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene
570							XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript
571							XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript
572
573							Then there will be one group called myGene containing the "gene"
574							feature and one group called myTranscript containing the mRNA, and two
575							exons.
576
577							You can work around this restriction to some extent by using the Alias
578							attribute literally:
579
580							XXXX XXXX gene XXXX XXXX XXXX ID=myGene
581							XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene;Alias=myGene
582							XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene
583							XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene
584
585							This limitation will be corrected in the next version of Bio::DB::GFF.
586
587							=back
588
589							=head1 API
590
591							The following is the API for Bio::DB::GFF.
592
593							=cut
594
595							package Bio::DB::GFF;
596
597	3			3		2127	use strict;
	3					3
	3					69
598
599	3			3		663	use IO::File;
	3					2196
	3					267
600	3			3		18	use File::Glob ':glob';
	3					3
	3					456
601	3			3		702	use Bio::DB::GFF::Util::Rearrange;
	3					6
	3					132
602	3			3		882	use Bio::DB::GFF::RelSegment;
	3					6
	3					99
603	3			3		15	use Bio::DB::GFF::Feature;
	3					3
	3					48
604	3			3		834	use Bio::DB::GFF::Aggregator;
	3					6
	3					78
605
606	3			3		15	use base qw(Bio::Root::Root Bio::DasI);
	3					3
	3					882
607
608							my %valid_range_types = (overlaps => 1,
609							contains => 1,
610							contained_in => 1);
611
612							=head1 Querying GFF Databases
613
614							=head2 new
615
616							Title : new
617							Usage : my $db = Bio::DB::GFF->new(@args);
618							Function: create a new Bio::DB::GFF object
619							Returns : new Bio::DB::GFF object
620							Args : lists of adaptors and aggregators
621							Status : Public
622
623							These are the arguments:
624
625							-adaptor Name of the adaptor module to use. If none
626							provided, defaults to "dbi::mysqlopt".
627
628							-aggregator Array reference to a list of aggregators
629							to apply to the database. If none provided,
630							defaults to ['processed_transcript','alignment'].
631
632							-preferred_groups When interpreteting the 9th column of a GFF2 file,
633							the indicated group names will have preference over
634							other attributes, even if they do not come first in
635							the list of attributes. This can be a scalar value
636							or an array reference.
637
638							Any other named argument pairs are passed to
639							the adaptor for processing.
640
641							The adaptor argument must correspond to a module contained within the
642							Bio::DB::GFF::Adaptor namespace. For example, the
643							Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying
644							'dbi::mysql'. By Perl convention, the adaptors names are lower case
645							because they are loaded at run time.
646
647							The aggregator array may contain a list of aggregator names, a list of
648							initialized aggregator objects, or a string in the form
649							"aggregator_name{subpart1,subpart2,subpart3/main_method}" (the
650							"/main_method" part is optional, but if present a feature with the
651							main_method must be present in order for aggregation to occur). For
652							example, if you wish to change the components aggregated by the
653							transcript aggregator, you could pass it to the GFF constructor this
654							way:
655
656							my $transcript =
657							Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr
658							polyA spliced_leader)]);
659
660							my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment],
661							-adaptor => 'dbi::mysql',
662							-dsn => 'dbi:mysql:elegans42');
663
664							Alternatively, you could create an entirely new transcript aggregator
665							this way:
666
667							my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}';
668							my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment],
669							-adaptor => 'dbi::mysql',
670							-dsn => 'dbi:mysql:elegans42');
671
672							See L for more details.
673
674							The B<-preferred_groups> argument is used to change the default
675							processing of the 9th column of GFF version 2 files. By default, the
676							first tag/value pair is used to establish the group class and name.
677							If you pass -preferred_groups a scalar, the parser will look for a tag
678							of the indicated type and use it as the group even if it is not first
679							in the file. If you pass this argument a list of group classes as an
680							array ref, then the list will establish the precedence for searching.
681
682							The commonly used 'dbi::mysql' adaptor recognizes the following
683							adaptor-specific arguments:
684
685							Argument Description
686							-------- -----------
687
688							-dsn the DBI data source, e.g. 'dbi:mysql:ens0040'
689							If a partial name is given, such as "ens0040", the
690							"dbi:mysql:" prefix will be added automatically.
691
692							-user username for authentication
693
694							-pass the password for authentication
695
696							-refclass landmark Class; defaults to "Sequence"
697
698
699							The commonly used 'dbi::mysqlopt' adaptor also recognizes the following
700							arguments.
701
702							Argument Description
703							-------- -----------
704
705							-fasta path to a directory containing FASTA files for the DNA
706							contained in this database (e.g. "/usr/local/share/fasta")
707
708							-acedb an acedb URL to use when converting features into ACEDB
709							objects (e.g. sace://localhost:2005)
710
711							=cut
712
713							#'
714
715							sub new {
716	5			5	1	90122	my $package = shift;
717	5					18	my ($adaptor,$aggregators,$args,$refclass,$preferred_groups);
718
719	5	50				21	if (@_ == 1) { # special case, default to dbi::mysqlopt
720	0					0	$adaptor = 'dbi::mysqlopt';
721	0					0	$args = {DSN => shift};
722							} else {
723	5					50	($adaptor,$aggregators,$refclass,$preferred_groups,$args) = rearrange([
724							[qw(ADAPTOR FACTORY)],
725							[qw(AGGREGATOR AGGREGATORS)],
726							'REFCLASS',
727							'PREFERRED_GROUPS'
728							],@_);
729							}
730
731	5		50			22	$adaptor \|\|= 'dbi::mysqlopt';
732	5					18	my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E";
733	5	100				72	unless ($class->can('new')) {
734	3	50				192	eval "require $class;1;" or $package->throw("Unable to load $adaptor adaptor: $@");
735							}
736
737							# this hack saves the memory adaptor, which loads the GFF file in new()
738	5	50				25	$args->{PREFERRED_GROUPS} = $preferred_groups if defined $preferred_groups;
739
740	5					26	my $self = $class->new($args);
741
742							# handle preferred groups
743	5	50				13	$self->preferred_groups($preferred_groups) if defined $preferred_groups;
744	5		50			66	$self->default_class($refclass \|\| 'Sequence');
745
746							# handle the aggregators.
747							# aggregators are responsible for creating complex multi-part features
748							# from the GFF "group" field. If none are provided, then we provide a
749							# list of the two used in WormBase.
750							# Each aggregator can be a scalar or a ref. In the former case
751							# it is treated as a class name to call new() on. In the latter
752							# the aggreator is treated as a ready made object.
753	5	50				19	$aggregators = $self->default_aggregators unless defined $aggregators;
754	5	50				35	my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators;
755	5					18	for my $a (@a) {
756	10					44	$self->add_aggregator($a);
757							}
758
759							# default settings go here.....
760	5					40	$self->automerge(1); # set automerge to true
761
762	5					15	$self;
763							}
764
765
766							=head2 types
767
768							Title : types
769							Usage : $db->types(@args)
770							Function: return list of feature types in range or database
771							Returns : a list of Bio::DB::GFF::Typename objects
772							Args : see below
773							Status : public
774
775							This routine returns a list of feature types known to the database.
776							The list can be database-wide or restricted to a region. It is also
777							possible to find out how many times each feature occurs.
778
779							For range queries, it is usually more convenient to create a
780							Bio::DB::GFF::Segment object, and then invoke it's types() method.
781
782							Arguments are as follows:
783
784							-ref ID of reference sequence
785							-class class of reference sequence
786							-start start of segment
787							-stop stop of segment
788							-enumerate if true, count the features
789
790							The returned value will be a list of Bio::DB::GFF::Typename objects,
791							which if evaluated in a string context will return the feature type in
792							"method:source" format. This object class also has method() and
793							source() methods for retrieving the like-named fields.
794
795							If -enumerate is true, then the function returns a hash (not a hash
796							reference) in which the keys are type names in "method:source" format
797							and the values are the number of times each feature appears in the
798							database or segment.
799
800							The argument -end is a synonum for -stop, and -count is a synonym for
801							-enumerate.
802
803							=cut
804
805							sub types {
806	25			25	1	1283	my $self = shift;
807	25					146	my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([
808							[qw(REF REFSEQ)],
809							qw(START),
810							[qw(STOP END)],
811							[qw(ENUMERATE COUNT)],
812							[qw(CLASS SEQCLASS)],
813							[qw(TYPE TYPES)],
814							],@_);
815	25	50				91	$types = $self->parse_types($types) if defined $types;
816	25					78	$self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types);
817							}
818
819							=head2 classes
820
821							Title : classes
822							Usage : $db->classes
823							Function: return list of landmark classes in database
824							Returns : a list of classes
825							Args : none
826							Status : public
827
828							This routine returns the list of reference classes known to the
829							database, or empty if classes are not used by the database. Classes
830							are distinct from types, being essentially qualifiers on the reference
831							namespaces.
832
833							=cut
834
835							sub classes {
836	0			0	1	0	my $self = shift;
837	0					0	return ();
838							}
839
840							=head2 segment
841
842							Title : segment
843							Usage : $db->segment(@args);
844							Function: create a segment object
845							Returns : segment object(s)
846							Args : numerous, see below
847							Status : public
848
849							This method generates a segment object, which is a Perl object
850							subclassed from Bio::DB::GFF::Segment. The segment can be used to
851							find overlapping features and the raw DNA.
852
853							When making the segment() call, you specify the ID of a sequence
854							landmark (e.g. an accession number, a clone or contig), and a
855							positional range relative to the landmark. If no range is specified,
856							then the entire extent of the landmark is used to generate the
857							segment.
858
859							You may also provide the ID of a "reference" sequence, which will set
860							the coordinate system and orientation used for all features contained
861							within the segment. The reference sequence can be changed later. If
862							no reference sequence is provided, then the coordinate system is based
863							on the landmark.
864
865							Arguments:
866
867							-name ID of the landmark sequence.
868
869							-class Database object class for the landmark sequence.
870							"Sequence" assumed if not specified. This is
871							irrelevant for databases which do not recognize
872							object classes.
873
874							-start Start of the segment relative to landmark. Positions
875							follow standard 1-based sequence rules. If not specified,
876							defaults to the beginning of the landmark.
877
878							-end Stop of the segment relative to the landmark. If not specified,
879							defaults to the end of the landmark.
880
881							-stop Same as -end.
882
883							-offset For those who prefer 0-based indexing, the offset specifies the
884							position of the new segment relative to the start of the landmark.
885
886							-length For those who prefer 0-based indexing, the length specifies the
887							length of the new segment.
888
889							-refseq Specifies the ID of the reference landmark used to establish the
890							coordinate system for the newly-created segment.
891
892							-refclass Specifies the class of the reference landmark, for those databases
893							that distinguish different object classes. Defaults to "Sequence".
894
895							-absolute
896							Return features in absolute coordinates rather than relative to the
897							parent segment.
898
899							-nocheck Don't check the database for the coordinates and length of this
900							feature. Construct a segment using the indicated name as the
901							reference, a start coordinate of 1, an undefined end coordinate,
902							and a strand of +1.
903
904							-force Same as -nocheck.
905
906							-seq,-sequence,-sourceseq Aliases for -name.
907
908							-begin,-end Aliases for -start and -stop
909
910							-off,-len Aliases for -offset and -length
911
912							-seqclass Alias for -class
913
914							Here's an example to explain how this works:
915
916							my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql');
917
918							If successful, $db will now hold the database accessor object. We now
919							try to fetch the fragment of sequence whose ID is A0000182 and class
920							is "Accession."
921
922							my $segment = $db->segment(-name=>'A0000182',-class=>'Accession');
923
924							If successful, $segment now holds the entire segment corresponding to
925							this accession number. By default, the sequence is used as its own
926							reference sequence, so its first base will be 1 and its last base will
927							be the length of the accession.
928
929							Assuming that this sequence belongs to a longer stretch of DNA, say a
930							contig, we can fetch this information like so:
931
932							my $sourceseq = $segment->sourceseq;
933
934							and find the start and stop on the source like this:
935
936							my $start = $segment->abs_start;
937							my $stop = $segment->abs_stop;
938
939							If we had another segment, say $s2, which is on the same contiguous
940							piece of DNA, we can pass that to the refseq() method in order to
941							establish it as the coordinate reference point:
942
943							$segment->refseq($s2);
944
945							Now calling start() will return the start of the segment relative to
946							the beginning of $s2, accounting for differences in strandedness:
947
948							my $rel_start = $segment->start;
949
950							IMPORTANT NOTE: This method can be used to return the segment spanned
951							by an arbitrary named annotation. However, if the annotation appears
952							at multiple locations on the genome, for example an EST that maps to
953							multiple locations, then, provided that all locations reside on the
954							same physical segment, the method will return a segment that spans the
955							minimum and maximum positions. If the reference sequence occupies
956							ranges on different physical segments, then it returns them all in an
957							array context, and raises a "multiple segment exception" exception in
958							a scalar context.
959
960							=cut
961
962							#'
963
964							sub segment {
965	203			203	1	12952	my $self = shift;
966	203					407	my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self,
967							$self->setup_segment_args(@_));
968	203					392	foreach (@segments) {
969	195	50				331	$_->absolute(1) if $self->absolute;
970							}
971
972	203					314	$self->_multiple_return_args(@segments);
973							}
974
975							sub _multiple_return_args {
976	203			203		200	my $self = shift;
977	203					262	my @args = @_;
978	203	100				418	if (@args == 0) {
		50
		0
979	8					21	return;
980							} elsif (@args == 1) {
981	195					611	return $args[0];
982							} elsif (wantarray) { # more than one reference sequence
983	0					0	return @args;
984							} else {
985	0					0	$self->error($args[0]->name,
986							" has more than one reference sequence in database. Please call in a list context to retrieve them all.");
987	0					0	$self->throw('multiple segment exception');
988	0					0	return;
989							}
990
991							}
992
993							# backward compatibility -- don't use!
994							# (deliberately undocumented too)
995							sub abs_segment {
996	0			0	0	0	my $self = shift;
997	0					0	return $self->segment($self->setup_segment_args(@_),-absolute=>1);
998							}
999
1000							sub setup_segment_args {
1001	224			224	0	250	my $self = shift;
1002	224	100	100			1247	return @_ if defined $_[0] && $_[0] =~ /^-/;
1003	156	100				486	return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3;
1004	71	100				222	return (-class=>$_[0],-name=>$_[1]) if @_ == 2;
1005	36	100				256	return (-name=>$_[0]) if @_ == 1;
1006							}
1007
1008							=head2 features
1009
1010							Title : features
1011							Usage : $db->features(@args)
1012							Function: get all features, possibly filtered by type
1013							Returns : a list of Bio::DB::GFF::Feature objects
1014							Args : see below
1015							Status : public
1016
1017							This routine will retrieve features in the database regardless of
1018							position. It can be used to return all features, or a subset based on
1019							their method and source.
1020
1021							Arguments are as follows:
1022
1023							-types List of feature types to return. Argument is an array
1024							reference containing strings of the format "method:source"
1025
1026							-merge Whether to apply aggregators to the generated features.
1027
1028							-rare Turn on optimizations suitable for a relatively rare feature type,
1029							where it makes more sense to filter by feature type first,
1030							and then by position.
1031
1032							-attributes A hash reference containing attributes to match.
1033
1034							-iterator Whether to return an iterator across the features.
1035
1036							-binsize A true value will create a set of artificial features whose
1037							start and stop positions indicate bins of the given size, and
1038							whose scores are the number of features in the bin. The
1039							class and method of the feature will be set to "bin",
1040							its source to "method:source", and its group to "bin:method:source".
1041							This is a handy way of generating histograms of feature density.
1042
1043							If -iterator is true, then the method returns a single scalar value
1044							consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
1045							on this object to fetch each of the features in turn. If iterator is
1046							false or absent, then all the features are returned as a list.
1047
1048							Currently aggregation is disabled when iterating over a series of
1049							features.
1050
1051							Types are indicated using the nomenclature "method:source". Either of
1052							these fields can be omitted, in which case a wildcard is used for the
1053							missing field. Type names without the colon (e.g. "exon") are
1054							interpreted as the method name and a source wild card. Regular
1055							expressions are allowed in either field, as in: "similarity:BLAST.*".
1056
1057							The -attributes argument is a hashref containing one or more attributes
1058							to match against:
1059
1060							-attributes => { Gene => 'abc-1',
1061							Note => 'confirmed' }
1062
1063							Attribute matching is simple string matching, and multiple attributes
1064							are ANDed together.
1065
1066							=cut
1067
1068							sub features {
1069	15			15	1	2412	my $self = shift;
1070	15					34	my ($types,$automerge,$sparse,$iterator,$refseq,$start,$end,$other);
1071	15	100	66			107	if (defined $_[0] &&
1072							$_[0] =~ /^-/) {
1073	10					63	($types,$automerge,$sparse,$iterator,
1074							$refseq,$start,$end,
1075							$other) = rearrange([
1076							[qw(TYPE TYPES)],
1077							[qw(MERGE AUTOMERGE)],
1078							[qw(RARE SPARSE)],
1079							'ITERATOR',
1080							[qw(REFSEQ SEQ_ID)],
1081							'START',
1082							[qw(STOP END)],
1083							],@_);
1084							} else {
1085	5					40	$types = \@_;
1086							}
1087
1088							# for whole database retrievals, we probably don't want to automerge!
1089	15	100				117	$automerge = $self->automerge unless defined $automerge;
1090	15		50			136	$other \|\|= {};
1091	15	50				274	$self->_features({
1092							rangetype => $refseq ? 'overlaps' : 'contains',
1093							types => $types,
1094							refseq => $refseq,
1095							start => $start,
1096							stop => $end,
1097							},
1098							{ sparse => $sparse,
1099							automerge => $automerge,
1100							iterator =>$iterator,
1101							%$other,
1102							}
1103							);
1104							}
1105
1106							=head2 get_seq_stream
1107
1108							Title : get_seq_stream
1109							Usage : my $seqio = $self->get_seq_sream(@args)
1110							Function: Performs a query and returns an iterator over it
1111							Returns : a Bio::SeqIO stream capable of producing sequence
1112							Args : As in features()
1113							Status : public
1114
1115							This routine takes the same arguments as features(), but returns a
1116							Bio::SeqIO::Stream-compliant object. Use it like this:
1117
1118							$stream = $db->get_seq_stream('exon');
1119							while (my $exon = $stream->next_seq) {
1120							print $exon,"\n";
1121							}
1122
1123							NOTE: This is also called get_feature_stream(), since that's what it
1124							really does.
1125
1126							=cut
1127
1128							sub get_seq_stream {
1129	0			0	1	0	my $self = shift;
1130	0	0	0			0	my @args = !defined($_[0]) \|\| $_[0] =~ /^-/ ? (@_,-iterator=>1)
1131							: (-types=>\@_,-iterator=>1);
1132	0					0	$self->features(@args);
1133							}
1134
1135							*get_feature_stream = \&get_seq_stream;
1136
1137							=head2 get_feature_by_name
1138
1139							Title : get_feature_by_name
1140							Usage : $db->get_feature_by_name($class => $name)
1141							Function: fetch features by their name
1142							Returns : a list of Bio::DB::GFF::Feature objects
1143							Args : the class and name of the desired feature
1144							Status : public
1145
1146							This method can be used to fetch a named feature from the database.
1147							GFF annotations are named using the group class and name fields, so
1148							for features that belong to a group of size one, this method can be
1149							used to retrieve that group (and is equivalent to the segment()
1150							method). Any Alias attributes are also searched for matching names.
1151
1152							An alternative syntax allows you to search for features by name within
1153							a circumscribed region:
1154
1155							@f = $db->get_feature_by_name(-class => $class,-name=>$name,
1156							-ref => $sequence_name,
1157							-start => $start,
1158							-end => $end);
1159
1160							This method may return zero, one, or several Bio::DB::GFF::Feature
1161							objects.
1162
1163							Aggregation is performed on features as usual.
1164
1165							NOTE: At various times, this function was called fetch_group(),
1166							fetch_feature(), fetch_feature_by_name() and segments(). These names
1167							are preserved for backward compatibility.
1168
1169							=cut
1170
1171							sub get_feature_by_name {
1172	16			16	1	30	my $self = shift;
1173	16					31	my ($gclass,$gname,$automerge,$ref,$start,$end);
1174	16	50				40	if (@_ == 1) {
1175	0					0	$gclass = $self->default_class;
1176	0					0	$gname = shift;
1177							} else {
1178	16					108	($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE',
1179							['REF','REFSEQ'],
1180							'START',['STOP','END']
1181							],@_);
1182	16		33			48	$gclass \|\|= $self->default_class;
1183							}
1184	16	50				52	$automerge = $self->automerge unless defined $automerge;
1185
1186							# we need to refactor this... It's repeated code (see below)...
1187	16					19	my @aggregators;
1188	16	50				32	if ($automerge) {
1189	16					40	for my $a ($self->aggregators) {
1190	33	50				114	push @aggregators,$a if $a->disaggregate([],$self);
1191							}
1192							}
1193
1194	16					18	my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1195	16					21	my $features = [];
1196	16			74		70	my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
	74					162
1197	16	50				43	my $location = [$ref,$start,$end] if defined $ref;
1198	16					57	$self->_feature_by_name($gclass,$gname,$location,$callback);
1199
1200	16	50				39	warn "aggregating...\n" if $self->debug;
1201	16					31	foreach my $a (@aggregators) { # last aggregator gets first shot
1202	33	50				92	$a->aggregate($features,$self) or next;
1203							}
1204
1205	16					113	@$features;
1206							}
1207
1208							# horrible indecision regarding proper names!
1209							fetch_group = fetch_feature = *fetch_feature_by_name = \&get_feature_by_name;
1210							*segments = \&segment;
1211
1212							=head2 get_feature_by_target
1213
1214							Title : get_feature_by_target
1215							Usage : $db->get_feature_by_target($class => $name)
1216							Function: fetch features by their similarity target
1217							Returns : a list of Bio::DB::GFF::Feature objects
1218							Args : the class and name of the desired feature
1219							Status : public
1220
1221							This method can be used to fetch a named feature from the database
1222							based on its similarity hit.
1223
1224							=cut
1225
1226							sub get_feature_by_target {
1227	0			0	1	0	shift->get_feature_by_name(@_);
1228							}
1229
1230							=head2 get_feature_by_attribute
1231
1232							Title : get_feature_by_attribute
1233							Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2)
1234							Function: fetch segments by combinations of attribute values
1235							Returns : a list of Bio::DB::GFF::Feature objects
1236							Args : the class and name of the desired feature
1237							Status : public
1238
1239							This method can be used to fetch a set of features from the database.
1240							Attributes are a list of name=Evalue pairs. They will be logically
1241							ANDED together.
1242
1243							=cut
1244
1245							sub get_feature_by_attribute {
1246	5			5	1	2262	my $self = shift;
1247	5	50				23	my %attributes = ref($_[0]) ? %{$_[0]} : @_;
	0					0
1248
1249							# we need to refactor this... It's repeated code (see above)...
1250	5					8	my @aggregators;
1251	5	50				10	if ($self->automerge) {
1252	5					10	for my $a ($self->aggregators) {
1253	10	50				26	unshift @aggregators,$a if $a->disaggregate([],$self);
1254							}
1255							}
1256
1257	5					7	my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1258	5					8	my $features = [];
1259	5			10		23	my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
	10					20
1260	5					20	$self->_feature_by_attribute(\%attributes,$callback);
1261
1262	5	50				17	warn "aggregating...\n" if $self->debug;
1263	5					12	foreach my $a (@aggregators) { # last aggregator gets first shot
1264	10	50				29	$a->aggregate($features,$self) or next;
1265							}
1266
1267	5					84	@$features;
1268							}
1269
1270							# more indecision...
1271							*fetch_feature_by_attribute = \&get_feature_by_attribute;
1272
1273							=head2 get_feature_by_id
1274
1275							Title : get_feature_by_id
1276							Usage : $db->get_feature_by_id($id)
1277							Function: fetch segments by feature ID
1278							Returns : a Bio::DB::GFF::Feature object
1279							Args : the feature ID
1280							Status : public
1281
1282							This method can be used to fetch a feature from the database using its
1283							ID. Not all GFF databases support IDs, so be careful with this.
1284
1285							=cut
1286
1287							sub get_feature_by_id {
1288	5			5	1	10	my $self = shift;
1289	5	50				16	my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1290	5					7	my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1291	5					8	my $features = [];
1292	5			5		27	my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
	5					15
1293	5					28	$self->_feature_by_id($id,'feature',$callback);
1294	5	50				33	return wantarray ? @$features : $features->[0];
1295							}
1296							*fetch_feature_by_id = \&get_feature_by_id;
1297
1298							=head2 get_feature_by_gid
1299
1300							Title : get_feature_by_gid
1301							Usage : $db->get_feature_by_gid($id)
1302							Function: fetch segments by feature ID
1303							Returns : a Bio::DB::GFF::Feature object
1304							Args : the feature ID
1305							Status : public
1306
1307							This method can be used to fetch a feature from the database using its
1308							group ID. Not all GFF databases support IDs, so be careful with this.
1309
1310							The group ID is often more interesting than the feature ID, since
1311							groups can be complex objects containing subobjects.
1312
1313							=cut
1314
1315							sub get_feature_by_gid {
1316	0			0	1	0	my $self = shift;
1317	0	0				0	my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1318	0					0	my %groups; # cache the groups we create to avoid consuming too much unecessary memory
1319	0					0	my $features = [];
1320	0			0		0	my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
	0					0
1321	0					0	$self->_feature_by_id($id,'group',$callback);
1322	0	0				0	return wantarray ? @$features : $features->[0];
1323							}
1324							*fetch_feature_by_gid = \&get_feature_by_gid;
1325
1326							=head2 delete_fattribute_to_features
1327
1328							Title : delete_fattribute_to_features
1329							Usage : $db->delete_fattribute_to_features(@ids_or_features)
1330							Function: delete one or more fattribute_to_features
1331							Returns : count of fattribute_to_features deleted
1332							Args : list of features or feature ids
1333							Status : public
1334
1335							Pass this method a list of numeric feature ids or a set of features.
1336							It will attempt to remove the fattribute_to_features rows of those features
1337							from the database and return a count of the rows removed.
1338
1339							NOTE: This method is also called delete_fattribute_to_feature(). Also see
1340							delete_groups() and delete_features().
1341
1342							=cut
1343
1344							*delete_fattribute_to_feature = \&delete_fattribute_to_features;
1345
1346							sub delete_fattribute_to_features {
1347	0			0	1	0	my $self = shift;
1348	0					0	my @features_or_ids = @_;
1349	0	0				0	my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
	0					0
1350	0	0				0	return unless @ids;
1351	0					0	$self->_delete_fattribute_to_features(@ids);
1352							}
1353
1354							=head2 delete_features
1355
1356							Title : delete_features
1357							Usage : $db->delete_features(@ids_or_features)
1358							Function: delete one or more features
1359							Returns : count of features deleted
1360							Args : list of features or feature ids
1361							Status : public
1362
1363							Pass this method a list of numeric feature ids or a set of features.
1364							It will attempt to remove the features from the database and return a
1365							count of the features removed.
1366
1367							NOTE: This method is also called delete_feature(). Also see
1368							delete_groups().
1369
1370							=cut
1371
1372							*delete_feature = \&delete_features;
1373
1374							sub delete_features {
1375	20			20	1	30	my $self = shift;
1376	20					44	my @features_or_ids = @_;
1377	20	50				25	my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
	75					203
1378	20	50				38	return unless @ids;
1379	20					55	$self->_delete_features(@ids);
1380							}
1381
1382							=head2 delete_groups
1383
1384							Title : delete_groups
1385							Usage : $db->delete_groups(@ids_or_features)
1386							Function: delete one or more feature groups
1387							Returns : count of features deleted
1388							Args : list of features or feature group ids
1389							Status : public
1390
1391							Pass this method a list of numeric group ids or a set of features. It
1392							will attempt to recursively remove the features and ALL members of
1393							their group from the database. It returns a count of the number of
1394							features (not groups) returned.
1395
1396							NOTE: This method is also called delete_group(). Also see
1397							delete_features().
1398
1399							=cut
1400
1401							*delete_group = \&delete_groupss;
1402
1403							sub delete_groups {
1404	5			5	1	7	my $self = shift;
1405	5					13	my @features_or_ids = @_;
1406	5	50				13	my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids;
	25					46
1407	5	50				15	return unless @ids;
1408	5					21	$self->_delete_groups(@ids);
1409							}
1410
1411							=head2 delete
1412
1413							Title : delete
1414							Usage : $db->delete(@args)
1415							Function: delete features
1416							Returns : count of features deleted -- if available
1417							Args : numerous, see below
1418							Status : public
1419
1420							This method deletes all features that overlap the specified region or
1421							are of a particular type. If no arguments are provided and the -force
1422							argument is true, then deletes ALL features.
1423
1424							Arguments:
1425
1426							-name ID of the landmark sequence.
1427
1428							-ref ID of the landmark sequence (synonym for -name).
1429
1430							-class Database object class for the landmark sequence.
1431							"Sequence" assumed if not specified. This is
1432							irrelevant for databases which do not recognize
1433							object classes.
1434
1435							-start Start of the segment relative to landmark. Positions
1436							follow standard 1-based sequence rules. If not specified,
1437							defaults to the beginning of the landmark.
1438
1439							-end Stop of the segment relative to the landmark. If not specified,
1440							defaults to the end of the landmark.
1441
1442							-offset Zero-based addressing
1443
1444							-length Length of region
1445
1446							-type,-types Either a single scalar type to be deleted, or an
1447							reference to an array of types.
1448
1449							-force Force operation to be performed even if it would delete
1450							entire feature table.
1451
1452							-range_type Control the range type of the deletion. One of "overlaps" (default)
1453							"contains" or "contained_in"
1454
1455							Examples:
1456
1457							$db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats
1458							$db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000
1459							$db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3
1460
1461							The short form of this call, as described in segment() is also allowed:
1462
1463							$db->delete("chr3",1=>1000);
1464							$db->delete("chr3");
1465
1466							IMPORTANT NOTE: This method only deletes features. It does NOT
1467							delete the names of groups that contain the deleted features. Group
1468							IDs will be reused if you later load a feature with the same group
1469							name as one that was previously deleted.
1470
1471							NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the
1472							result code from the SQL DELETE operation. Some dbd drivers return the
1473							count of rows deleted, while others return 0E0. Caveat emptor.
1474
1475							=cut
1476
1477							sub delete {
1478	21			21	1	14055215	my $self = shift;
1479	21					60	my @args = $self->setup_segment_args(@_);
1480	21					149	my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) =
1481							rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET',
1482							'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args);
1483	21	50				75	$offset = 0 unless defined $offset;
1484	21	50				45	$start = $offset+1 unless defined $start;
1485	21	50	33			81	$end = $start+$length-1 if !defined $end and $length;
1486	21		66			80	$class \|\|= $self->default_class;
1487
1488	21					68	my $types = $self->parse_types($type); # parse out list of types
1489
1490	21		50			72	$range_type \|\|= 'overlaps';
1491							$self->throw("range type must be one of {".
1492							join(',',keys %valid_range_types).
1493							"}\n")
1494	21	50				66	unless $valid_range_types{lc $range_type};
1495
1496
1497	21					23	my @segments;
1498	21	100	100			86	if (defined $name && $name ne '') {
1499	10					20	my @args = (-name=>$name,-class=>$class);
1500	10	50				31	push @args,(-start=>$start) if defined $start;
1501	10	50				21	push @args,(-end =>$end) if defined $end;
1502	10					17	@segments = $self->segment(@args);
1503	10	100				40	return unless @segments;
1504							}
1505	16					114	$self->_delete({segments => \@segments,
1506							types => $types,
1507							range_type => $range_type,
1508							force => $force}
1509							);
1510							}
1511
1512							=head2 absolute
1513
1514							Title : absolute
1515							Usage : $abs = $db->absolute([$abs]);
1516							Function: gets/sets absolute mode
1517							Returns : current setting of absolute mode boolean
1518							Args : new setting for absolute mode boolean
1519							Status : public
1520
1521							$db-Eabsolute(1) will turn on absolute mode for the entire database.
1522							All segments retrieved will use absolute coordinates by default,
1523							rather than relative coordinates. You can still set them to use
1524							relative coordinates by calling $segment-Eabsolute(0).
1525
1526							Note that this is not the same as calling abs_segment(); it continues
1527							to allow you to look up groups that are not used directly as reference
1528							sequences.
1529
1530							=cut
1531
1532							sub absolute {
1533	195			195	1	196	my $self = shift;
1534	195					234	my $d = $self->{absolute};
1535	195	50				310	$self->{absolute} = shift if @_;
1536	195					357	$d;
1537							}
1538
1539							=head2 strict_bounds_checking
1540
1541							Title : strict_bounds_checking
1542							Usage : $flag = $db->strict_bounds_checking([$flag])
1543							Function: gets/sets strict bounds checking
1544							Returns : current setting of bounds checking flag
1545							Args : new setting for bounds checking flag
1546							Status : public
1547
1548							This flag enables extra checks for segment requests that go beyond the
1549							ends of their reference sequences. If bounds checking is enabled,
1550							then retrieved segments will be truncated to their physical length,
1551							and their truncated() methods will return true.
1552
1553							If the flag is off (the default), then the module will return segments
1554							that appear to extend beyond their physical boundaries. Requests for
1555							features beyond the end of the segment will, however, return empty.
1556
1557							=cut
1558
1559							sub strict_bounds_checking {
1560	170			170	1	217	my $self = shift;
1561	170					215	my $d = $self->{strict};
1562	170	100				253	$self->{strict} = shift if @_;
1563	170					402	$d;
1564							}
1565
1566							=head2 get_Seq_by_id
1567
1568							Title : get_Seq_by_id
1569							Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
1570							Function: Gets a Bio::Seq object by its name
1571							Returns : a Bio::Seq object
1572							Args : the id (as a string) of a sequence
1573							Throws : "id does not exist" exception
1574
1575							NOTE: Bio::DB::RandomAccessI compliant method
1576
1577							=cut
1578
1579							sub get_Seq_by_id {
1580	0			0	1	0	my $self = shift;
1581	0					0	$self->get_feature_by_name(@_);
1582							}
1583
1584
1585							=head2 get_Seq_by_accession
1586
1587							Title : get_Seq_by_accession
1588							Usage : $seq = $db->get_Seq_by_accession('AL12234')
1589							Function: Gets a Bio::Seq object by its accession
1590							Returns : a Bio::Seq object
1591							Args : the id (as a string) of a sequence
1592							Throws : "id does not exist" exception
1593
1594							NOTE: Bio::DB::RandomAccessI compliant method
1595
1596							=cut
1597
1598							sub get_Seq_by_accession {
1599	0			0	1	0	my $self = shift;
1600	0					0	$self->get_feature_by_name(@_);
1601							}
1602
1603							=head2 get_Seq_by_acc
1604
1605							Title : get_Seq_by_acc
1606							Usage : $seq = $db->get_Seq_by_acc('X77802');
1607							Function: Gets a Bio::Seq object by accession number
1608							Returns : A Bio::Seq object
1609							Args : accession number (as a string)
1610							Throws : "acc does not exist" exception
1611
1612							NOTE: Bio::DB::RandomAccessI compliant method
1613
1614							=cut
1615
1616							sub get_Seq_by_acc {
1617	0			0	1	0	my $self = shift;
1618	0					0	$self->get_feature_by_name(@_);
1619							}
1620
1621							=head2 get_Stream_by_name
1622
1623							Title : get_Stream_by_name
1624							Usage : $seq = $db->get_Stream_by_name(@ids);
1625							Function: Retrieves a stream of Seq objects given their names
1626							Returns : a Bio::SeqIO stream object
1627							Args : an array of unique ids/accession numbers, or
1628							an array reference
1629
1630							NOTE: This is also called get_Stream_by_batch()
1631
1632							=cut
1633
1634							sub get_Stream_by_name {
1635	0			0	1	0	my $self = shift;
1636	0					0	my @ids = @_;
1637	0	0				0	my $id = ref($ids[0]) ? $ids[0] : \@ids;
1638	0					0	Bio::DB::GFF::ID_Iterator->new($self,$id,'name');
1639							}
1640
1641							=head2 get_Stream_by_id
1642
1643							Title : get_Stream_by_id
1644							Usage : $seq = $db->get_Stream_by_id(@ids);
1645							Function: Retrieves a stream of Seq objects given their ids
1646							Returns : a Bio::SeqIO stream object
1647							Args : an array of unique ids/accession numbers, or
1648							an array reference
1649
1650							NOTE: This is also called get_Stream_by_batch()
1651
1652							=cut
1653
1654							sub get_Stream_by_id {
1655	0			0	1	0	my $self = shift;
1656	0					0	my @ids = @_;
1657	0	0				0	my $id = ref($ids[0]) ? $ids[0] : \@ids;
1658	0					0	Bio::DB::GFF::ID_Iterator->new($self,$id,'feature');
1659							}
1660
1661							=head2 get_Stream_by_batch ()
1662
1663							Title : get_Stream_by_batch
1664							Usage : $seq = $db->get_Stream_by_batch(@ids);
1665							Function: Retrieves a stream of Seq objects given their ids
1666							Returns : a Bio::SeqIO stream object
1667							Args : an array of unique ids/accession numbers, or
1668							an array reference
1669
1670							NOTE: This is the same as get_Stream_by_id().
1671
1672							=cut
1673
1674							*get_Stream_by_batch = \&get_Stream_by_id;
1675
1676
1677							=head2 get_Stream_by_group ()
1678
1679							Bioperl compatibility.
1680
1681							=cut
1682
1683							sub get_Stream_by_group {
1684	0			0	1	0	my $self = shift;
1685	0					0	my @ids = @_;
1686	0	0				0	my $id = ref($ids[0]) ? $ids[0] : \@ids;
1687	0					0	Bio::DB::GFF::ID_Iterator->new($self,$id,'group');
1688							}
1689
1690							=head2 all_seqfeatures
1691
1692							Title : all_seqfeatures
1693							Usage : @features = $db->all_seqfeatures(@args)
1694							Function: fetch all the features in the database
1695							Returns : an array of features, or an iterator
1696							Args : See below
1697							Status : public
1698
1699							This is equivalent to calling $db-Efeatures() without any types, and
1700							will return all the features in the database. The -merge and
1701							-iterator arguments are recognized, and behave the same as described
1702							for features().
1703
1704							=cut
1705
1706							sub all_seqfeatures {
1707	0			0	1	0	my $self = shift;
1708	0					0	my ($automerge,$iterator)= rearrange([
1709							[qw(MERGE AUTOMERGE)],
1710							'ITERATOR'
1711							],@_);
1712	0					0	my @args;
1713	0	0				0	push @args,(-merge=>$automerge) if defined $automerge;
1714	0	0				0	push @args,(-iterator=>$iterator) if defined $iterator;
1715	0					0	$self->features(@args);
1716							}
1717
1718							=head1 Creating and Loading GFF Databases
1719
1720							=head2 initialize
1721
1722							Title : initialize
1723							Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2);
1724							Function: initialize a GFF database
1725							Returns : true if initialization successful
1726							Args : a set of named parameters
1727							Status : Public
1728
1729							This method can be used to initialize an empty database. It takes the following
1730							named arguments:
1731
1732							-erase A boolean value. If true the database will be wiped clean if it
1733							already contains data.
1734
1735							Other named arguments may be recognized by subclasses. They become database
1736							meta values that control various settable options.
1737
1738							As a shortcut (and for backward compatibility) a single true argument
1739							is the same as initialize(-erase=E1).
1740
1741							=cut
1742
1743							sub initialize {
1744	5			5	1	5029	my $self = shift;
1745
1746	5					39	my ($erase,$meta) = rearrange(['ERASE'],@_);
1747	5		50			30	$meta \|\|= {};
1748
1749							# initialize (possibly erasing)
1750	5	50				28	return unless $self->do_initialize($erase);
1751	5					24	my @default = $self->default_meta_values;
1752
1753							# this is an awkward way of uppercasing the
1754							# even-numbered values (necessary for case-insensitive SQL databases)
1755	5					15	for (my $i=0; $i<@default; $i++) {
1756	0	0				0	$default[$i] = uc $default[$i] if !($i % 2);
1757							}
1758
1759	5					15	my %values = (@default,%$meta);
1760	5					13	foreach (keys %values) {
1761	0					0	$self->meta($_ => $values{$_});
1762							}
1763	5					20	1;
1764							}
1765
1766
1767							=head2 load_gff
1768
1769							Title : load_gff
1770							Usage : $db->load_gff($file\|$directory\|$filehandle [,$verbose]);
1771							Function: load GFF data into database
1772							Returns : count of records loaded
1773							Args : a directory, a file, a list of files,
1774							or a filehandle
1775							Status : Public
1776
1777							This method takes a single overloaded argument, which can be any of:
1778
1779							=over 4
1780
1781							=item *
1782
1783							a scalar corresponding to a GFF file on the system
1784
1785							A pathname to a local GFF file. Any files ending with the .gz, .Z, or
1786							.bz2 suffixes will be transparently decompressed with the appropriate
1787							command-line utility.
1788
1789							=item *
1790
1791							an array reference containing a list of GFF files on the system
1792
1793							For example ['/home/gff/gff1.gz','/home/gff/gff2.gz']
1794
1795							=item *
1796
1797							directory path
1798
1799							The indicated directory will be searched for all files ending in the
1800							suffixes .gff, .gff.gz, .gff.Z or .gff.bz2.
1801
1802							=item *
1803
1804							filehandle
1805
1806							An open filehandle from which to read the GFF data. Tied filehandles
1807							now work as well.
1808
1809							=item *
1810
1811							a pipe expression
1812
1813							A pipe expression will also work. For example, a GFF file on a remote
1814							web server can be loaded with an expression like this:
1815
1816							$db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test \|");
1817
1818							=back
1819
1820							The optional second argument, if true, will turn on verbose status
1821							reports that indicate the progress.
1822
1823							If successful, the method will return the number of GFF lines
1824							successfully loaded.
1825
1826							NOTE:this method used to be called load(), but has been changed. The
1827							old method name is also recognized.
1828
1829							=cut
1830
1831							sub load_gff {
1832	5			5	1	10	my $self = shift;
1833	5		50			28	my $file_or_directory = shift \|\| '.';
1834	5					5	my $verbose = shift;
1835
1836	5					19	local $self->{__verbose__} = $verbose;
1837	5	50	33			22	return $self->do_load_gff($file_or_directory) if ref($file_or_directory)
1838							&& tied *$file_or_directory;
1839
1840	5					12	my $tied_stdin = tied(*STDIN);
1841	5	50				167	open my $SAVEIN, "<&STDIN" unless $tied_stdin;
1842	5	50				33	local @ARGV = $self->setup_argv($file_or_directory,'gff','gff3') or return; # to play tricks with reader
1843	5					27	my $result = $self->do_load_gff('ARGV');
1844	5	50				121	open STDIN, '<', $SAVEIN unless $tied_stdin; # restore STDIN
1845	5					47	return $result;
1846							}
1847
1848							*load = \&load_gff;
1849
1850							=head2 load_gff_file
1851
1852							Title : load_gff_file
1853							Usage : $db->load_gff_file($file [,$verbose]);
1854							Function: load GFF data into database
1855							Returns : count of records loaded
1856							Args : a path to a file
1857							Status : Public
1858
1859							This is provided as an alternative to load_gff_file. It doesn't munge
1860							STDIN or play tricks with ARGV.
1861
1862							=cut
1863
1864							sub load_gff_file {
1865	0			0	1	0	my $self = shift;
1866	0					0	my $file = shift;
1867	0					0	my $verbose = shift;
1868	0	0				0	my $fh = IO::File->new($file) or return;
1869	0					0	return $self->do_load_gff($fh);
1870							}
1871
1872							=head2 load_fasta
1873
1874							Title : load_fasta
1875							Usage : $db->load_fasta($file\|$directory\|$filehandle);
1876							Function: load FASTA data into database
1877							Returns : count of records loaded
1878							Args : a directory, a file, a list of files,
1879							or a filehandle
1880							Status : Public
1881
1882							This method takes a single overloaded argument, which can be any of:
1883
1884							=over 4
1885
1886							=item *
1887
1888							scalar corresponding to a FASTA file on the system
1889
1890							A pathname to a local FASTA file. Any files ending with the .gz, .Z, or
1891							.bz2 suffixes will be transparently decompressed with the appropriate
1892							command-line utility.
1893
1894							=item *
1895
1896							array reference containing a list of FASTA files on the
1897							system
1898
1899							For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz']
1900
1901							=item *
1902
1903							path to a directory
1904
1905							The indicated directory will be searched for all files ending in the
1906							suffixes .fa, .fa.gz, .fa.Z or .fa.bz2.
1907
1908							=item *
1909
1910							filehandle
1911
1912							An open filehandle from which to read the FASTA data.
1913
1914							=item *
1915
1916							pipe expression
1917
1918							A pipe expression will also work. For example, a FASTA file on a remote
1919							web server can be loaded with an expression like this:
1920
1921							$db->load_gff("lynx -dump -source http://stein.cshl.org/fasta_test.fa \|");
1922
1923							=back
1924
1925							=cut
1926
1927							sub load_fasta {
1928	5			5	1	13	my $self = shift;
1929	5		50			15	my $file_or_directory = shift \|\| '.';
1930	5					8	my $verbose = shift;
1931
1932	5					10	local $self->{__verbose__} = $verbose;
1933	5	50	33			17	return $self->load_sequence($file_or_directory) if ref($file_or_directory)
1934							&& tied *$file_or_directory;
1935
1936	5					10	my $tied = tied(*STDIN);
1937	5	50				78	open my $SAVEIN, "<&STDIN" unless $tied;
1938	5	50				26	local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader
1939	5					35	my $result = $self->load_sequence('ARGV');
1940	5	50				156	open STDIN, '<', $SAVEIN unless $tied; # restore STDIN
1941	5					87	return $result;
1942							}
1943
1944
1945							=head2 load_fasta_file
1946
1947							Title : load_fasta_file
1948							Usage : $db->load_fasta_file($file [,$verbose]);
1949							Function: load FASTA data into database
1950							Returns : count of records loaded
1951							Args : a path to a file
1952							Status : Public
1953
1954							This is provided as an alternative to load_fasta. It doesn't munge
1955							STDIN or play tricks with ARGV.
1956
1957							=cut
1958
1959							sub load_fasta_file {
1960	0			0	1	0	my $self = shift;
1961	0					0	my $file = shift;
1962	0					0	my $verbose = shift;
1963	0	0				0	my $fh = IO::File->new($file) or return;
1964	0					0	return $self->do_load_fasta($fh);
1965							}
1966
1967
1968							=head2 load_sequence_string
1969
1970							Title : load_sequence_string
1971							Usage : $db->load_sequence_string($id,$dna)
1972							Function: load a single DNA entry
1973							Returns : true if successfully loaded
1974							Args : a raw sequence string (DNA, RNA, protein)
1975							Status : Public
1976
1977							=cut
1978
1979							sub load_sequence_string {
1980	0			0	1	0	my $self = shift;
1981	0					0	my ($acc,$seq) = @_;
1982	0					0	my $offset = 0;
1983	0	0				0	$self->insert_sequence_chunk($acc,\$offset,\$seq) or return;
1984	0	0				0	$self->insert_sequence($acc,$offset,$seq) or return;
1985	0					0	1;
1986							}
1987
1988							sub setup_argv {
1989	10			10	0	18	my $self = shift;
1990	10					15	my $file_or_directory = shift;
1991	10					43	my @suffixes = @_;
1992	3			3		21	no strict 'refs'; # so that we can call fileno() on the argument
	3					3
	3					12540
1993
1994	10					18	my @argv;
1995
1996	10	100				193	if (-d $file_or_directory) {
		50
		50
1997							# Because glob() is broken with long file names that contain spaces
1998	5	50	33			55	$file_or_directory = Win32::GetShortPathName($file_or_directory)
1999							if $^O =~ /^MSWin/i && eval 'use Win32; 1';
2000	5					16	@argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes;
	15					3097
2001							}elsif (my $fd = fileno($file_or_directory)) {
2002	0	0				0	open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN");
2003	0					0	@argv = '-';
2004							} elsif (ref $file_or_directory) {
2005	0					0	@argv = @$file_or_directory;
2006							} else {
2007	5					12	@argv = $file_or_directory;
2008							}
2009
2010	10					28	foreach (@argv) {
2011	45	50				374	if (/\.gz$/) {
		50
		50
2012	0					0	$_ = "gunzip -c $_ \|";
2013							} elsif (/\.Z$/) {
2014	0					0	$_ = "uncompress -c $_ \|";
2015							} elsif (/\.bz2$/) {
2016	0					0	$_ = "bunzip2 -c $_ \|";
2017							}
2018							}
2019	10					53	@argv;
2020							}
2021
2022							=head2 lock_on_load
2023
2024							Title : lock_on_load
2025							Usage : $lock = $db->lock_on_load([$lock])
2026							Function: set write locking during load
2027							Returns : current value of lock-on-load flag
2028							Args : new value of lock-on-load-flag
2029							Status : Public
2030
2031							This method is honored by some of the adaptors. If the value is true,
2032							the tables used by the GFF modules will be locked for writing during
2033							loads and inaccessible to other processes.
2034
2035							=cut
2036
2037							sub lock_on_load {
2038	0			0	1	0	my $self = shift;
2039	0					0	my $d = $self->{lock};
2040	0	0				0	$self->{lock} = shift if @_;
2041	0					0	$d;
2042							}
2043
2044							=head2 meta
2045
2046							Title : meta
2047							Usage : $value = $db->meta($name [,$newval])
2048							Function: get or set a meta variable
2049							Returns : a string
2050							Args : meta variable name and optionally value
2051							Status : abstract
2052
2053							Get or set a named metavalues for the database. Metavalues can be
2054							used for database-specific settings.
2055
2056							By default, this method does nothing!
2057
2058							=cut
2059
2060							sub meta {
2061	0			0	1	0	my $self = shift;
2062	0					0	my ($name,$value) = @_;
2063	0					0	return;
2064							}
2065
2066							=head2 default_meta_values
2067
2068							Title : default_meta_values
2069							Usage : %values = $db->default_meta_values
2070							Function: empty the database
2071							Returns : a list of tag=>value pairs
2072							Args : none
2073							Status : protected
2074
2075							This method returns a list of tag=Evalue pairs that contain default
2076							meta information about the database. It is invoked by initialize() to
2077							write out the default meta values. The base class version returns an
2078							empty list.
2079
2080							For things to work properly, meta value names must be UPPERCASE.
2081
2082							=cut
2083
2084							sub default_meta_values {
2085	5			5	1	10	my $self = shift;
2086	5					10	return ();
2087							}
2088
2089
2090							=head2 error
2091
2092							Title : error
2093							Usage : $db->error( [$new error] );
2094							Function: read or set error message
2095							Returns : error message
2096							Args : an optional argument to set the error message
2097							Status : Public
2098
2099							This method can be used to retrieve the last error message. Errors
2100							are not reset to empty by successful calls, so contents are only valid
2101							immediately after an error condition has been detected.
2102
2103							=cut
2104
2105							sub error {
2106	8			8	1	12	my $self = shift;
2107	8					18	my $g = $self->{error};
2108	8	50				41	$self->{error} = join '',@_ if @_;
2109	8					13	$g;
2110							}
2111
2112							=head2 debug
2113
2114							Title : debug
2115							Usage : $db->debug( [$flag] );
2116							Function: read or set debug flag
2117							Returns : current value of debug flag
2118							Args : new debug flag (optional)
2119							Status : Public
2120
2121							This method can be used to turn on debug messages. The exact nature
2122							of those messages depends on the adaptor in use.
2123
2124							=cut
2125
2126							sub debug {
2127	216			216	1	2596	my $self = shift;
2128	216					253	my $g = $self->{debug};
2129	216	100				919	$self->{debug} = shift if @_;
2130	216					398	$g;
2131							}
2132
2133
2134							=head2 automerge
2135
2136							Title : automerge
2137							Usage : $db->automerge( [$new automerge] );
2138							Function: get or set automerge value
2139							Returns : current value (boolean)
2140							Args : an optional argument to set the automerge value
2141							Status : Public
2142
2143							By default, this module will use the aggregators to merge groups into
2144							single composite objects. This default can be changed to false by
2145							calling automerge(0).
2146
2147							=cut
2148
2149							sub automerge {
2150	111			111	1	210	my $self = shift;
2151	111					163	my $g = $self->{automerge};
2152	111	100				229	$self->{automerge} = shift if @_;
2153	111					163	$g;
2154							}
2155
2156							=head2 attributes
2157
2158							Title : attributes
2159							Usage : @attributes = $db->attributes($id,$name)
2160							Function: get the "attributes" on a particular feature
2161							Returns : an array of string
2162							Args : feature ID
2163							Status : public
2164
2165							Some GFF version 2 files use the groups column to store a series of
2166							attribute/value pairs. In this interpretation of GFF, the first such
2167							pair is treated as the primary group for the feature; subsequent pairs
2168							are treated as attributes. Two attributes have special meaning:
2169							"Note" is for backward compatibility and is used for unstructured text
2170							remarks. "Alias" is considered as a synonym for the feature name.
2171
2172							If no name is provided, then attributes() returns a flattened hash, of
2173							attribute=Evalue pairs. This lets you do:
2174
2175							%attributes = $db->attributes($id);
2176
2177							If no arguments are provided, attributes() will return the list of
2178							all attribute names:
2179
2180							@attribute_names = $db->attributes();
2181
2182							Normally, however, attributes() will be called by the feature:
2183
2184							@notes = $feature->attributes('Note');
2185
2186							In a scalar context, attributes() returns the first value of the
2187							attribute if a tag is present, otherwise a hash reference in which the
2188							keys are attribute names and the values are anonymous arrays
2189							containing the values.
2190
2191							=cut
2192
2193							sub attributes {
2194	20			20	1	27	my $self = shift;
2195	20					63	my ($id,$tag) = @_;
2196	20	50				69	my @result = $self->do_attributes(@_) or return;
2197	20	100				80	return @result if wantarray;
2198
2199							# what to do in an array context
2200	10	100				30	return $result[0] if $tag;
2201	5					8	my %result;
2202	5					23	while (my($key,$value) = splice(@result,0,2)) {
2203	15					15	push @{$result{$key}},$value;
	15					57
2204							}
2205	5					25	return \%result;
2206							}
2207
2208							=head2 fast_queries
2209
2210							Title : fast_queries
2211							Usage : $flag = $db->fast_queries([$flag])
2212							Function: turn on and off the "fast queries" option
2213							Returns : a boolean
2214							Args : a boolean flag (optional)
2215							Status : public
2216
2217							The mysql database driver (and possibly others) support a "fast" query
2218							mode that caches results on the server side. This makes queries come
2219							back faster, particularly when creating iterators. The downside is
2220							that while iterating, new queries will die with a "command synch"
2221							error. This method turns the feature on and off.
2222
2223							For databases that do not support a fast query, this method has no
2224							effect.
2225
2226							=cut
2227
2228							# override this method in order to set the mysql_use_result attribute, which is an obscure
2229							# but extremely powerful optimization for both performance and memory.
2230							sub fast_queries {
2231	0			0	1	0	my $self = shift;
2232	0					0	my $d = $self->{fast_queries};
2233	0	0				0	$self->{fast_queries} = shift if @_;
2234	0					0	$d;
2235							}
2236
2237							=head2 add_aggregator
2238
2239							Title : add_aggregator
2240							Usage : $db->add_aggregator($aggregator)
2241							Function: add an aggregator to the list
2242							Returns : nothing
2243							Args : an aggregator
2244							Status : public
2245
2246							This method will append an aggregator to the end of the list of
2247							registered aggregators. Three different argument types are accepted:
2248
2249							1) a Bio::DB::GFF::Aggregator object -- will be added
2250							2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}"
2251							-- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method
2252							part is optional).
2253							3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator
2254							subclass, where the token corresponds to the subclass name.
2255
2256							=cut
2257
2258							sub add_aggregator {
2259	20			20	1	58	my $self = shift;
2260	20					25	my $aggregator = shift;
2261	20		100			79	my $list = $self->{aggregators} \|\|= [];
2262	20	100				83	if (ref $aggregator) { # an object
		50
2263	5					15	@$list = grep {$_->get_method ne $aggregator->get_method} @$list;
	10					28
2264	5					13	push @$list,$aggregator;
2265							}
2266
2267							elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) {
2268	0					0	my($agg_name,$subparts,$mainpart) = ($1,$2,$3);
2269	0					0	my @subparts = split /,\s*/,$subparts;
2270	0					0	my @args = (-method => $agg_name,
2271							-sub_parts => \@subparts);
2272	0	0				0	if ($mainpart) {
2273	0					0	push @args,(-main_method => $mainpart,
2274							-whole_object => 1);
2275							}
2276	0	0				0	warn "making an aggregator with (@args), subparts = @subparts" if $self->debug;
2277	0					0	push @$list,Bio::DB::GFF::Aggregator->new(@args);
2278							}
2279
2280							else {
2281	15					57	my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E";
2282	15	50				771	eval "require $class; 1" or $self->throw("Unable to load $aggregator aggregator: $@");
2283	15					120	push @$list,$class->new();
2284							}
2285							}
2286
2287							=head2 aggregators
2288
2289							Title : aggregators
2290							Usage : $db->aggregators([@new_aggregators]);
2291							Function: retrieve list of aggregators
2292							Returns : list of aggregators
2293							Args : a list of aggregators to set (optional)
2294							Status : public
2295
2296							This method will get or set the list of aggregators assigned to
2297							the database. If 1 or more arguments are passed, the existing
2298							set will be cleared.
2299
2300							=cut
2301
2302							sub aggregators {
2303	106			106	1	120	my $self = shift;
2304	106					113	my $d = $self->{aggregators};
2305	106	50				176	if (@_) {
2306	0					0	$self->clear_aggregators;
2307	0					0	$self->add_aggregator($_) foreach @_;
2308							}
2309	106	50				155	return unless $d;
2310	106					248	return @$d;
2311							}
2312
2313							=head2 clear_aggregators
2314
2315							Title : clear_aggregators
2316							Usage : $db->clear_aggregators
2317							Function: clears list of aggregators
2318							Returns : nothing
2319							Args : none
2320							Status : public
2321
2322							This method will clear the aggregators stored in the database object.
2323							Use aggregators() or add_aggregator() to add some back.
2324
2325							=cut
2326
2327	5			5	1	26	sub clear_aggregators { shift->{aggregators} = [] }
2328
2329							=head2 preferred_groups
2330
2331							Title : preferred_groups
2332							Usage : $db->preferred_groups([$group_name_or_arrayref])
2333							Function: get/set list of groups for altering GFF2 parsing
2334							Returns : a list of classes
2335							Args : new list (scalar or array ref)
2336							Status : public
2337
2338							=cut
2339
2340							sub preferred_groups {
2341	15			15	1	55	my $self = shift;
2342	15					23	my $d = $self->{preferred_groups};
2343	15	100				33	if (@_) {
2344	5	50				12	my @v = map {ref($_) eq 'ARRAY' ? @$_ : $_} @_;
	5					33
2345	5					15	$self->{preferred_groups} = \@v;
2346	5					21	delete $self->{preferred_groups_hash};
2347							}
2348	15	100				38	return unless $d;
2349	10					37	return @$d;
2350							}
2351
2352							sub _preferred_groups_hash {
2353	144			144		154	my $self = shift;
2354	144					131	my $gff3 = shift;
2355	144	100				296	return $self->{preferred_groups_hash} if exists $self->{preferred_groups_hash};
2356	5					7	my $count = 0;
2357
2358	5					13	my @preferred = $self->preferred_groups;
2359
2360							# defaults
2361	5	50				12	if (!@preferred) {
2362	0	0	0			0	@preferred = $gff3 \|\| $self->{load_data}{gff3_flag} ? qw(Target Parent ID) : qw(Target Sequence Transcript);
2363							}
2364
2365	5					20	my %preferred = map {lc($_) => @preferred-$count++} @preferred;
	15					62
2366	5					29	return $self->{preferred_groups_hash} = \%preferred;
2367							}
2368
2369							=head1 Methods for use by Subclasses
2370
2371							The following methods are chiefly of interest to subclasses and are
2372							not intended for use by end programmers.
2373
2374							=head2 abscoords
2375
2376							Title : abscoords
2377							Usage : $db->abscoords($name,$class,$refseq)
2378							Function: finds position of a landmark in reference coordinates
2379							Returns : ($ref,$class,$start,$stop,$strand)
2380							Args : name and class of landmark
2381							Status : public
2382
2383							This method is called by Bio::DB::GFF::RelSegment to obtain the
2384							absolute coordinates of a sequence landmark. The arguments are the
2385							name and class of the landmark. If successful, abscoords() returns
2386							the ID of the reference sequence, its class, its start and stop
2387							positions, and the orientation of the reference sequence's coordinate
2388							system ("+" for forward strand, "-" for reverse strand).
2389
2390							If $refseq is present in the argument list, it forces the query to
2391							search for the landmark in a particular reference sequence.
2392
2393							=cut
2394
2395							sub abscoords {
2396	193			193	1	212	my $self = shift;
2397	193					285	my ($name,$class,$refseq) = @_;
2398	193		33			264	$class \|\|= $self->{default_class};
2399	193					545	$self->get_abscoords($name,$class,$refseq);
2400							}
2401
2402							=head1 Protected API
2403
2404							The following methods are not intended for public consumption, but are
2405							intended to be overridden/implemented by adaptors.
2406
2407							=head2 default_aggregators
2408
2409							Title : default_aggregators
2410							Usage : $db->default_aggregators;
2411							Function: retrieve list of aggregators
2412							Returns : array reference containing list of aggregator names
2413							Args : none
2414							Status : protected
2415
2416							This method (which is intended to be overridden by adaptors) returns a
2417							list of standard aggregators to be applied when no aggregators are
2418							specified in the constructor.
2419
2420							=cut
2421
2422							sub default_aggregators {
2423	0			0	1	0	my $self = shift;
2424	0					0	return ['processed_transcript','alignment'];
2425							}
2426
2427							=head2 do_load_gff
2428
2429							Title : do_load_gff
2430							Usage : $db->do_load_gff($handle)
2431							Function: load a GFF input stream
2432							Returns : number of features loaded
2433							Args : A filehandle.
2434							Status : protected
2435
2436							This method is called to load a GFF data stream. The method will read
2437							GFF features from EE and load them into the database. On exit the
2438							method must return the number of features loaded.
2439
2440							Note that the method is responsible for parsing the GFF lines. This
2441							is to allow for differences in the interpretation of the "group"
2442							field, which are legion.
2443
2444							You probably want to use load_gff() instead. It is more flexible
2445							about the arguments it accepts.
2446
2447							=cut
2448
2449							sub do_load_gff {
2450	5			5	1	8	my $self = shift;
2451	5					143	my $io_handle = shift;
2452
2453							local $self->{load_data} = {
2454	5	50	33			85	lineend => (-t STDERR && !$ENV{EMACS} ? "\r" : "\n"),
2455							count => 0
2456							};
2457
2458	5					34	$self->setup_load();
2459	5					21	my $mode = 'gff';
2460
2461	5					201	while (<$io_handle>) {
2462	214					259	chomp;
2463	214	50				388	if ($mode eq 'gff') {
		0
2464	214	50				320	if (/^>/) { # Sequence coming
2465	0					0	$mode = 'fasta';
2466	0					0	$self->_load_sequence_start;
2467	0					0	$self->_load_sequence_line($_);
2468							} else {
2469	214					304	$self->_load_gff_line($_);
2470							}
2471							}
2472							elsif ($mode eq 'fasta') {
2473	0	0				0	if (/^##\|\t/) { # Back to GFF mode
2474	0					0	$self->_load_sequence_finish;
2475	0					0	$mode = 'gff';
2476	0					0	$self->_load_gff_line($_);
2477							} else {
2478	0					0	$self->_load_sequence_line($_);
2479							}
2480							}
2481							}
2482	5					20	$self->finish_load();
2483	5					24	$self->_load_sequence_finish;
2484
2485	5					15	return $self->{load_data}{count};
2486							}
2487
2488							sub _load_gff_line {
2489	214			214		277	my $self = shift;
2490	214					251	my $line = shift;
2491	214					259	my $lineend = $self->{load_data}{lineend};
2492
2493	214	100				366	$self->{load_data}{gff3_flag}++ if $line =~ /^\#\#\s*gff-version\s+3/;
2494
2495	214	100	100			462	if (defined $self->{load_data}{gff3_flag} and !defined $self->{load_data}{gff3_warning}) {
2496	2					32	$self->print_gff3_warning();
2497	2					10	$self->{load_data}{gff3_warning}=1;
2498							}
2499
2500	214	50				299	$self->preferred_groups(split(/\s+/,$1)) if $line =~ /^\#\#\s*group-tags?\s+(.+)/;
2501
2502	214	100				344	if ($line =~ /^\#\#\s*sequence-region\s+(\S+)\s+(-?\d+)\s+(-?\d+)/i) { # header line
2503	10					114	$self->load_gff_line(
2504							{
2505							ref => $1,
2506							class => 'Sequence',
2507							source => 'reference',
2508							method => 'Component',
2509							start => $2,
2510							stop => $3,
2511							score => undef,
2512							strand => undef,
2513							phase => undef,
2514							gclass => 'Sequence',
2515							gname => $1,
2516							tstart => undef,
2517							tstop => undef,
2518							attributes => [],
2519							}
2520							);
2521	10					51	return $self->{load_data}{count}++;
2522							}
2523
2524	204	100				312	return if /^#/;
2525
2526	190					740	my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t",$line;
2527	190	50	66			852	return unless defined($ref) && defined($method) && defined($start) && defined($stop);
			66
			33
2528	165					246	foreach (\$score,\$strand,\$phase) {
2529	495	100				709	undef $$_ if $$_ eq '.';
2530							}
2531
2532	165					397	my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{load_data}{gff3_flag});
2533
2534							# no standard way in the GFF file to denote the class of the reference sequence -- drat!
2535							# so we invoke the factory to do it
2536	165					349	my $class = $self->refclass($ref);
2537
2538							# call subclass to do the dirty work
2539	165	50				353	if ($start > $stop) {
2540	0					0	($start,$stop) = ($stop,$start);
2541	0	0				0	if ($strand eq '+') {
		0
2542	0					0	$strand = '-';
2543							} elsif ($strand eq '-') {
2544	0					0	$strand = '+';
2545							}
2546							}
2547							# GFF2/3 transition stuff
2548	165	50				329	$gclass = [$gclass] unless ref $gclass;
2549	165	50				299	$gname = [$gname] unless ref $gname;
2550	165					329	for (my $i=0; $i<@$gname;$i++) {
2551	165					1369	$self->load_gff_line({ref => $ref,
2552							class => $class,
2553							source => $source,
2554							method => $method,
2555							start => $start,
2556							stop => $stop,
2557							score => $score,
2558							strand => $strand,
2559							phase => $phase,
2560							gclass => $gclass->[$i],
2561							gname => $gname->[$i],
2562							tstart => $tstart,
2563							tstop => $tstop,
2564							attributes => $attributes}
2565							);
2566	165					862	$self->{load_data}{count}++;
2567							}
2568							}
2569
2570							sub _load_sequence_start {
2571	5			5		16	my $self = shift;
2572	5					12	my $ld = $self->{load_data};
2573	5					14	undef $ld->{id};
2574	5					15	$ld->{offset} = 0;
2575	5					12	$ld->{seq} = '';
2576							}
2577							sub _load_sequence_finish {
2578	10			10		22	my $self = shift;
2579	10					20	my $ld = $self->{load_data};
2580	10	100				38	$self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id};
2581							}
2582
2583							sub _load_sequence_line {
2584	32950			32950		27547	my $self = shift;
2585	32950					27776	my $line = shift;
2586	32950					27441	my $ld = $self->{load_data};
2587	32950					27646	my $lineend = $ld->{lineend};
2588
2589	32950	100				41398	if (/^>(\S+)/) {
2590	2620	100				6208	$self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id};
2591	2620					3811	$ld->{id} = $1;
2592	2620					2292	$ld->{offset} = 0;
2593	2620					2234	$ld->{seq} = '';
2594	2620					2308	$ld->{count}++;
2595	2620	50	33			8399	print STDERR $ld->{count}," sequences loaded$lineend" if $self->{__verbose__} && $ld->{count} % 1000 == 0;
2596							} else {
2597	30330					29425	$ld->{seq} .= $_;
2598	30330					37471	$self->insert_sequence_chunk($ld->{id},\$ld->{offset},\$ld->{seq});
2599							}
2600
2601							}
2602
2603							=head2 load_sequence
2604
2605							Title : load_sequence
2606							Usage : $db->load_sequence($handle)
2607							Function: load a FASTA data stream
2608							Returns : number of sequences
2609							Args : a filehandle to the FASTA file
2610							Status : protected
2611
2612							You probably want to use load_fasta() instead.
2613
2614							=cut
2615
2616							# note - there is some repeated code here
2617							sub load_sequence {
2618	5			5	1	13	my $self = shift;
2619	5					7	my $io_handle = shift;
2620
2621							local $self->{load_data} = {
2622	5	50	33			46	lineend => (-t STDERR && !$ENV{EMACS} ? "\r" : "\n"),
2623							count => 0
2624							};
2625
2626	5					23	$self->_load_sequence_start;
2627	5					152	while (<$io_handle>) {
2628	32950					30254	chomp;
2629	32950					32778	$self->_load_sequence_line($_);
2630							}
2631	5					26	$self->_load_sequence_finish;
2632	5					28	return $self->{load_data}{count};
2633							}
2634
2635							sub insert_sequence_chunk {
2636	30330			30330	0	24507	my $self = shift;
2637	30330					32438	my ($id,$offsetp,$seqp) = @_;
2638	30330	50				28906	if (my $cs = $self->dna_chunk_size) {
2639	0					0	while (length($$seqp) >= $cs) {
2640	0					0	my $chunk = substr($$seqp,0,$cs);
2641	0					0	$self->insert_sequence($id,$$offsetp,$chunk);
2642	0					0	$$offsetp += length($chunk);
2643	0					0	substr($$seqp,0,$cs) = '';
2644							}
2645							}
2646	30330					61558	return 1; # the calling routine may expect success or failure
2647							}
2648
2649							# used to store big pieces of DNA in itty bitty pieces
2650							sub dna_chunk_size {
2651	30330			30330	0	35971	return 0;
2652							}
2653
2654							sub insert_sequence {
2655	0			0	0	0	my $self = shift;
2656	0					0	my($id,$offset,$seq) = @_;
2657	0					0	$self->throw('insert_sequence(): must be defined in subclass');
2658							}
2659
2660							# This is the default class for reference points. Defaults to Sequence.
2661							sub default_class {
2662	335			335	0	351	my $self = shift;
2663	335	50				577	return 'Sequence' unless ref $self;
2664	335					503	my $d = $self->{default_class};
2665	335	100				503	$self->{default_class} = shift if @_;
2666	335					614	$d;
2667							}
2668
2669							# gets name of the reference sequence, and returns its class
2670							# currently just calls default_class
2671							sub refclass {
2672	165			165	1	171	my $self = shift;
2673	165					149	my $name = shift;
2674	165					214	return $self->default_class;
2675							}
2676
2677							=head2 setup_load
2678
2679							Title : setup_load
2680							Usage : $db->setup_load
2681							Function: called before load_gff_line()
2682							Returns : void
2683							Args : none
2684							Status : abstract
2685
2686							This abstract method gives subclasses a chance to do any
2687							schema-specific initialization prior to loading a set of GFF records.
2688							It must be implemented by a subclass.
2689
2690							=cut
2691
2692				0	1		sub setup_load {
2693							# default, do nothing
2694							}
2695
2696							=head2 finish_load
2697
2698							Title : finish_load
2699							Usage : $db->finish_load
2700							Function: called after load_gff_line()
2701							Returns : number of records loaded
2702							Args : none
2703							Status :abstract
2704
2705							This method gives subclasses a chance to do any schema-specific
2706							cleanup after loading a set of GFF records.
2707
2708							=cut
2709
2710				0	1		sub finish_load {
2711							# default, do nothing
2712							}
2713
2714							=head2 load_gff_line
2715
2716							Title : load_gff_line
2717							Usage : $db->load_gff_line(@args)
2718							Function: called to load one parsed line of GFF
2719							Returns : true if successfully inserted
2720							Args : see below
2721							Status : abstract
2722
2723							This abstract method is called once per line of the GFF and passed a
2724							hashref containing parsed GFF fields. The fields are:
2725
2726							{ref => $ref,
2727							class => $class,
2728							source => $source,
2729							method => $method,
2730							start => $start,
2731							stop => $stop,
2732							score => $score,
2733							strand => $strand,
2734							phase => $phase,
2735							gclass => $gclass,
2736							gname => $gname,
2737							tstart => $tstart,
2738							tstop => $tstop,
2739							attributes => $attributes}
2740
2741							=cut
2742
2743							sub load_gff_line {
2744	0			0	1	0	shift->throw("load_gff_line(): must be implemented by an adaptor");
2745							}
2746
2747
2748							=head2 do_initialize
2749
2750							Title : do_initialize
2751							Usage : $db->do_initialize([$erase])
2752							Function: initialize and possibly erase database
2753							Returns : true if successful
2754							Args : optional erase flag
2755							Status : protected
2756
2757							This method implements the initialize() method described above, and
2758							takes the same arguments.
2759
2760							=cut
2761
2762							sub do_initialize {
2763	0			0	1	0	shift->throw('do_initialize(): must be implemented by an adaptor');
2764							}
2765
2766							=head2 dna
2767
2768							Title : dna
2769							Usage : $db->dna($id,$start,$stop,$class)
2770							Function: return the raw DNA string for a segment
2771							Returns : a raw DNA string
2772							Args : id of the sequence, its class, start and stop positions
2773							Status : public
2774
2775							This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA
2776							sequence.
2777
2778							Arguments: -name sequence name
2779							-start start position
2780							-stop stop position
2781							-class sequence class
2782
2783							If start and stop are both undef, then the entire DNA is retrieved.
2784							So to fetch the whole dna, call like this:
2785
2786							$db->dna($name_of_sequence);
2787
2788							or like this:
2789
2790							$db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence);
2791
2792							NOTE: you will probably prefer to create a Segment and then invoke its
2793							dna() method.
2794
2795							=cut
2796
2797							# call to return the DNA string for the indicated region
2798							# real work is done by get_dna()
2799							sub dna {
2800	120			120	1	133	my $self = shift;
2801	120					410	my ($id,$start,$stop,$class) = rearrange([
2802							[qw(NAME ID REF REFSEQ)],
2803							qw(START),
2804							[qw(STOP END)],
2805							'CLASS',
2806							],@_);
2807							# return unless defined $start && defined $stop;
2808	120					415	$self->get_dna($id,$start,$stop,$class);
2809							}
2810
2811	0			0	0	0	sub fetch_sequence { shift->dna(@_) }
2812
2813							sub features_in_range {
2814	85			85	0	89	my $self = shift;
2815	85					430	my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) =
2816							rearrange([
2817							[qw(RANGE_TYPE)],
2818							[qw(REF REFSEQ)],
2819							qw(CLASS),
2820							qw(START),
2821							[qw(STOP END)],
2822							[qw(TYPE TYPES)],
2823							qw(PARENT),
2824							[qw(RARE SPARSE)],
2825							[qw(MERGE AUTOMERGE)],
2826							'ITERATOR'
2827							],@_);
2828	85		100			401	$other \|\|= {};
2829							# $automerge = $types && $self->automerge unless defined $automerge;
2830	85	100				218	$automerge = $self->automerge unless defined $automerge;
2831							$self->throw("range type must be one of {".
2832							join(',',keys %valid_range_types).
2833							"}\n")
2834	85	50				207	unless $valid_range_types{lc $range_type};
2835	85					543	$self->_features({
2836							rangetype => lc $range_type,
2837							refseq => $refseq,
2838							refclass => $class,
2839							start => $start,
2840							stop => $stop,
2841							types => $types },
2842							{
2843							sparse => $sparse,
2844							automerge => $automerge,
2845							iterator => $iterator,
2846							%$other,
2847							},
2848							$parent);
2849							}
2850
2851							=head2 get_dna
2852
2853							Title : get_dna
2854							Usage : $db->get_dna($id,$start,$stop,$class)
2855							Function: get DNA for indicated segment
2856							Returns : the dna string
2857							Args : sequence ID, start, stop and class
2858							Status : protected
2859
2860							If start E stop and the sequence is nucleotide, then this method
2861							should return the reverse complement. The sequence class may be
2862							ignored by those databases that do not recognize different object
2863							types.
2864
2865							=cut
2866
2867							sub get_dna {
2868	0			0	1	0	my $self = shift;
2869	0					0	my ($id,$start,$stop,$class,) = @_;
2870	0					0	$self->throw("get_dna() must be implemented by an adaptor");
2871							}
2872
2873							=head2 get_features
2874
2875							Title : get_features
2876							Usage : $db->get_features($search,$options,$callback)
2877							Function: get list of features for a region
2878							Returns : count of number of features retrieved
2879							Args : see below
2880							Status : protected
2881
2882							The first argument is a hash reference containing search criteria for
2883							retrieving features. It contains the following keys:
2884
2885							rangetype One of "overlaps", "contains" or "contained_in". Indicates
2886							the type of range query requested.
2887
2888							refseq ID of the landmark that establishes the absolute
2889							coordinate system.
2890
2891							refclass Class of this landmark. Can be ignored by implementations
2892							that don't recognize such distinctions.
2893
2894							start Start of the range, inclusive.
2895
2896							stop Stop of the range, inclusive.
2897
2898							types Array reference containing the list of annotation types
2899							to fetch from the database. Each annotation type is an
2900							array reference consisting of [source,method].
2901
2902							The second argument is a hash reference containing certain options
2903							that affect the way information is retrieved:
2904
2905							sort_by_group
2906							A flag. If true, means that the returned features should be
2907							sorted by the group that they're in.
2908
2909							sparse A flag. If true, means that the expected density of the
2910							features is such that it will be more efficient to search
2911							by type rather than by range. If it is taking a long
2912							time to fetch features, give this a try.
2913
2914							binsize A true value will create a set of artificial features whose
2915							start and stop positions indicate bins of the given size, and
2916							whose scores are the number of features in the bin. The
2917							class of the feature will be set to "bin", and its name to
2918							"method:source". This is a handy way of generating histograms
2919							of feature density.
2920
2921							The third argument, the $callback, is a code reference to which
2922							retrieved features are passed. It is described in more detail below.
2923
2924							This routine is responsible for getting arrays of GFF data out of the
2925							database and passing them to the callback subroutine. The callback
2926							does the work of constructing a Bio::DB::GFF::Feature object out of
2927							that data. The callback expects a list of 13 fields:
2928
2929							$refseq The reference sequence
2930							$start feature start
2931							$stop feature stop
2932							$source feature source
2933							$method feature method
2934							$score feature score
2935							$strand feature strand
2936							$phase feature phase
2937							$groupclass group class (may be undef)
2938							$groupname group ID (may be undef)
2939							$tstart target start for similarity hits (may be undef)
2940							$tstop target stop for similarity hits (may be undef)
2941							$feature_id A unique feature ID (may be undef)
2942
2943							These fields are in the same order as the raw GFF file, with the
2944							exception that the group column has been parsed into group class and
2945							group name fields.
2946
2947							The feature ID, if provided, is a unique identifier of the feature
2948							line. The module does not depend on this ID in any way, but it is
2949							available via Bio::DB::GFF-Eid() if wanted. In the dbi::mysql and
2950							dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb
2951							adaptor it is not used.
2952
2953							=cut
2954
2955							=head2 feature_summary(), coverage_array()
2956
2957							The DBI adaptors provide methods for rapidly fetching coverage
2958							statistics across a region of interest. Please see
2959							L for more information about these
2960							methods.
2961
2962							=cut
2963
2964							sub get_features{
2965	0			0	1	0	my $self = shift;
2966	0					0	my ($search,$options,$callback) = @_;
2967	0					0	$self->throw("get_features() must be implemented by an adaptor");
2968							}
2969
2970
2971							=head2 _feature_by_name
2972
2973							Title : _feature_by_name
2974							Usage : $db->_feature_by_name($class,$name,$location,$callback)
2975							Function: get a list of features by name and class
2976							Returns : count of number of features retrieved
2977							Args : name of feature, class of feature, and a callback
2978							Status : abstract
2979
2980							This method is used internally. The callback arguments are the same
2981							as those used by make_feature(). This method must be overridden by
2982							subclasses.
2983
2984							=cut
2985
2986							sub _feature_by_name {
2987	0			0		0	my $self = shift;
2988	0					0	my ($class,$name,$location,$callback) = @_;
2989	0					0	$self->throw("_feature_by_name() must be implemented by an adaptor");
2990							}
2991
2992							sub _feature_by_attribute {
2993	0			0		0	my $self = shift;
2994	0					0	my ($attributes,$callback) = @_;
2995	0					0	$self->throw("_feature_by_name() must be implemented by an adaptor");
2996							}
2997
2998							=head2 _feature_by_id
2999
3000							Title : _feature_by_id
3001							Usage : $db->_feature_by_id($ids,$type,$callback)
3002							Function: get a feature based
3003							Returns : count of number of features retrieved
3004							Args : arrayref to feature IDs to fetch
3005							Status : abstract
3006
3007							This method is used internally to fetch features either by their ID or
3008							their group ID. $ids is a arrayref containing a list of IDs, $type is
3009							one of "feature" or "group", and $callback is a callback. The
3010							callback arguments are the same as those used by make_feature(). This
3011							method must be overridden by subclasses.
3012
3013							=cut
3014
3015							sub _feature_by_id {
3016	0			0		0	my $self = shift;
3017	0					0	my ($ids,$type,$callback) = @_;
3018	0					0	$self->throw("_feature_by_id() must be implemented by an adaptor");
3019							}
3020
3021							=head2 overlapping_features
3022
3023							Title : overlapping_features
3024							Usage : $db->overlapping_features(@args)
3025							Function: get features that overlap the indicated range
3026							Returns : a list of Bio::DB::GFF::Feature objects
3027							Args : see below
3028							Status : public
3029
3030							This method is invoked by Bio::DB::GFF::Segment-Efeatures() to find
3031							the list of features that overlap a given range. It is generally
3032							preferable to create the Segment first, and then fetch the features.
3033
3034							This method takes set of named arguments:
3035
3036							-refseq ID of the reference sequence
3037							-class Class of the reference sequence
3038							-start Start of the desired range in refseq coordinates
3039							-stop Stop of the desired range in refseq coordinates
3040							-types List of feature types to return. Argument is an array
3041							reference containing strings of the format "method:source"
3042							-parent A parent Bio::DB::GFF::Segment object, used to create
3043							relative coordinates in the generated features.
3044							-rare Turn on an optimization suitable for a relatively rare feature type,
3045							where it will be faster to filter by feature type first
3046							and then by position, rather than vice versa.
3047							-merge Whether to apply aggregators to the generated features.
3048							-iterator Whether to return an iterator across the features.
3049
3050							If -iterator is true, then the method returns a single scalar value
3051							consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
3052							on this object to fetch each of the features in turn. If iterator is
3053							false or absent, then all the features are returned as a list.
3054
3055							Currently aggregation is disabled when iterating over a series of
3056							features.
3057
3058							Types are indicated using the nomenclature "method:source". Either of
3059							these fields can be omitted, in which case a wildcard is used for the
3060							missing field. Type names without the colon (e.g. "exon") are
3061							interpreted as the method name and a source wild card. Regular
3062							expressions are allowed in either field, as in: "similarity:BLAST.*".
3063
3064							=cut
3065
3066							# call to return the features that overlap the named region
3067							# real work is done by get_features
3068							sub overlapping_features {
3069	85			85	1	114	my $self = shift;
3070	85					219	$self->features_in_range(-range_type=>'overlaps',@_);
3071							}
3072
3073							=head2 contained_features
3074
3075							Title : contained_features
3076							Usage : $db->contained_features(@args)
3077							Function: get features that are contained within the indicated range
3078							Returns : a list of Bio::DB::GFF::Feature objects
3079							Args : see overlapping_features()
3080							Status : public
3081
3082							This call is similar to overlapping_features(), except that it only
3083							retrieves features whose end points are completely contained within
3084							the specified range.
3085
3086							Generally you will want to fetch a Bio::DB::GFF::Segment object and
3087							call its contained_features() method rather than call this directly.
3088
3089							=cut
3090
3091							# The same, except that it only returns features that are completely contained within the
3092							# range (much faster usually)
3093							sub contained_features {
3094	0			0	1	0	my $self = shift;
3095	0					0	$self->features_in_range(-range_type=>'contains',@_);
3096							}
3097
3098							=head2 contained_in
3099
3100							Title : contained_in
3101							Usage : @features = $s->contained_in(@args)
3102							Function: get features that contain this segment
3103							Returns : a list of Bio::DB::GFF::Feature objects
3104							Args : see features()
3105							Status : Public
3106
3107							This is identical in behavior to features() except that it returns
3108							only those features that completely contain the segment.
3109
3110							=cut
3111
3112							sub contained_in {
3113	0			0	1	0	my $self = shift;
3114	0					0	$self->features_in_range(-range_type=>'contained_in',@_);
3115							}
3116
3117							=head2 get_abscoords
3118
3119							Title : get_abscoords
3120							Usage : $db->get_abscoords($name,$class,$refseq)
3121							Function: get the absolute coordinates of sequence with name & class
3122							Returns : ($absref,$absstart,$absstop,$absstrand)
3123							Args : name and class of the landmark
3124							Status : protected
3125
3126							Given the name and class of a genomic landmark, this function returns
3127							a four-element array consisting of:
3128
3129							$absref the ID of the reference sequence that contains this landmark
3130							$absstart the position at which the landmark starts
3131							$absstop the position at which the landmark stops
3132							$absstrand the strand of the landmark, relative to the reference sequence
3133
3134							If $refseq is provided, the function searches only within the
3135							specified reference sequence.
3136
3137							=cut
3138
3139							sub get_abscoords {
3140	0			0	1	0	my $self = shift;
3141	0					0	my ($name,$class,$refseq) = @_;
3142	0					0	$self->throw("get_abscoords() must be implemented by an adaptor");
3143							}
3144
3145							=head2 get_types
3146
3147							Title : get_types
3148							Usage : $db->get_types($absref,$class,$start,$stop,$count)
3149							Function: get list of all feature types on the indicated segment
3150							Returns : list or hash of Bio::DB::GFF::Typename objects
3151							Args : see below
3152							Status : protected
3153
3154							Arguments are:
3155
3156							$absref the ID of the reference sequence
3157							$class the class of the reference sequence
3158							$start the position to start counting
3159							$stop the position to end counting
3160							$count a boolean indicating whether to count the number
3161							of occurrences of each feature type
3162
3163							If $count is true, then a hash is returned. The keys of the hash are
3164							feature type names in the format "method:source" and the values are
3165							the number of times a feature of this type overlaps the indicated
3166							segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename
3167							objects. If $start or $stop are undef, then all features on the
3168							indicated segment are enumerated. If $absref is undef, then the call
3169							returns all feature types in the database.
3170
3171							=cut
3172
3173							sub get_types {
3174	0			0	1	0	my $self = shift;
3175	0					0	my ($refseq,$class,$start,$stop,$count,$types) = @_;
3176	0					0	$self->throw("get_types() must be implemented by an adaptor");
3177							}
3178
3179							=head2 make_feature
3180
3181							Title : make_feature
3182							Usage : $db->make_feature(@args)
3183							Function: Create a Bio::DB::GFF::Feature object from string data
3184							Returns : a Bio::DB::GFF::Feature object
3185							Args : see below
3186							Status : internal
3187
3188							This takes 14 arguments (really!):
3189
3190							$parent A Bio::DB::GFF::RelSegment object
3191							$group_hash A hashref containing unique list of GFF groups
3192							$refname The name of the reference sequence for this feature
3193							$refclass The class of the reference sequence for this feature
3194							$start Start of feature
3195							$stop Stop of feature
3196							$source Feature source field
3197							$method Feature method field
3198							$score Feature score field
3199							$strand Feature strand
3200							$phase Feature phase
3201							$group_class Class of feature group
3202							$group_name Name of feature group
3203							$tstart For homologies, start of hit on target
3204							$tstop Stop of hit on target
3205
3206							The $parent argument, if present, is used to establish relative
3207							coordinates in the resulting Bio::DB::Feature object. This allows one
3208							feature to generate a list of other features that are relative to its
3209							coordinate system (for example, finding the coordinates of the second
3210							exon relative to the coordinates of the first).
3211
3212							The $group_hash allows the group_class/group_name strings to be turned
3213							into rich database objects via the make_obect() method (see above).
3214							Because these objects may be expensive to create, $group_hash is used
3215							to uniquefy them. The index of this hash is the composite key
3216							{$group_class,$group_name,$tstart,$tstop}. Values are whatever object
3217							is returned by the make_object() method.
3218
3219							The remainder of the fields are taken from the GFF line, with the
3220							exception that "Target" features, which contain information about the
3221							target of a homology search, are parsed into their components.
3222
3223							=cut
3224
3225							# This call is responsible for turning a line of GFF into a
3226							# feature object.
3227							# The $parent argument is a Bio::DB::GFF::Segment object and is used
3228							# to establish the coordinate system for the new feature.
3229							# The $group_hash argument is an hash ref that holds previously-
3230							# generated group objects.
3231							# Other arguments are taken right out of the GFF table.
3232							sub make_feature {
3233	844			844	1	896	my $self = shift;
3234	844					1734	my ($parent,$group_hash, # these arguments provided by generic mechanisms
3235							$srcseq, # the rest is provided by adaptor
3236							$start,$stop,
3237							$source,$method,
3238							$score,$strand,$phase,
3239							$group_class,$group_name,
3240							$tstart,$tstop,
3241							$db_id,$group_id) = @_;
3242
3243	844	100				1123	return unless $srcseq; # return undef if called with no arguments. This behavior is used for
3244							# on-the-fly aggregation.
3245
3246	829					666	my $group; # undefined
3247	829	50	33			2042	if (defined $group_class && defined $group_name) {
3248	829		100			1699	$tstart \|\|= '';
3249	829		100			1580	$tstop \|\|= '';
3250	829	100				984	if ($group_hash) {
3251	559		66			1755	$group = $group_hash->{$group_class,$group_name,$tstart,$tstop}
3252							\|\|= $self->make_object($group_class,$group_name,$tstart,$tstop);
3253							} else {
3254	270					355	$group = $self->make_object($group_class,$group_name,$tstart,$tstop);
3255							}
3256							}
3257
3258							# fix for some broken GFF files
3259							# unfortunately - has undesired side effects
3260							# if (defined $tstart && defined $tstop && !defined $strand) {
3261							# $strand = $tstart <= $tstop ? '+' : '-';
3262							# }
3263
3264	829	100				1261	if (ref $parent) { # note that the src sequence is ignored
3265	505					1071	return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop,
3266							$method,$source,
3267							$score,$strand,$phase,
3268							$group,$db_id,$group_id,
3269							$tstart,$tstop);
3270							} else {
3271	324					682	return Bio::DB::GFF::Feature->new($self,$srcseq,
3272							$start,$stop,
3273							$method,$source,
3274							$score,$strand,$phase,
3275							$group,$db_id,$group_id,
3276							$tstart,$tstop);
3277							}
3278							}
3279
3280							sub make_aggregated_feature {
3281	15			15	0	23	my $self = shift;
3282	15					30	my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3);
3283	15					33	my $feature = $self->make_feature($parent,undef,@_);
3284	15	50	66			29	return [$feature] if $feature && !$feature->group;
3285
3286							# if we have accumulated features and either:
3287							# (1) make_feature() returned undef, indicated very end or
3288							# (2) the current group is different from the previous one
3289
3290	15					48	local $^W = 0; # irritating uninitialized value warning in next statement
3291	15	50	66			64	if (@$accumulated_features &&
			100
3292							(!defined($feature) \|\| ($accumulated_features->[-1]->group ne $feature->group))) {
3293	10					20	foreach my $a (@$aggregators) { # last aggregator gets first shot
3294	30	50				63	$a->aggregate($accumulated_features,$self) or next;
3295							}
3296	10					20	my @result = @$accumulated_features;
3297	10	100				18	@$accumulated_features = $feature ? ($feature) : ();
3298	10	50				32	return unless @result;
3299	10					41	return \@result ;
3300							}
3301	5					10	push @$accumulated_features,$feature;
3302	5					13	return;
3303							}
3304
3305							=head2 make_match_sub
3306
3307							Title : make_match_sub
3308							Usage : $db->make_match_sub($types)
3309							Function: creates a subroutine used for filtering features
3310							Returns : a code reference
3311							Args : a list of parsed type names
3312							Status : protected
3313
3314							This method is used internally to generate a code subroutine that will
3315							accept or reject a feature based on its method and source. It takes
3316							an array of parsed type names in the format returned by parse_types(),
3317							and generates an anonymous subroutine. The subroutine takes a single
3318							Bio::DB::GFF::Feature object and returns true if the feature matches
3319							one of the desired feature types, and false otherwise.
3320
3321							=cut
3322
3323							# a subroutine that matches features indicated by list of types
3324							sub make_match_sub {
3325	168			168	1	171	my $self = shift;
3326	168					186	my $types = shift;
3327
3328	168	50	33	0		490	return sub { 1 } unless ref $types && @$types;
	0					0
3329
3330	168					153	my @expr;
3331	168					211	for my $type (@$types) {
3332	1432					1657	my ($method,$source) = @$type;
3333	1432	50				1770	$method = $method ? "\\Q$method\\E" : ".*";
3334	1432	50				1469	$source = $source ? ":\\Q$source\\E" : "(?::.+)?";
3335	1432					2150	push @expr,"${method}${source}";
3336							}
3337	168					367	my $expr = join '\|',@expr;
3338	168	100				703	return $self->{match_subs}{$expr} if $self->{match_subs}{$expr};
3339
3340	15					42	my $sub =<
3341							sub {
3342							my \$feature = shift or return;
3343							return \$feature->type =~ /^($expr)\$/i;
3344							}
3345							END
3346	15	50				28	warn "match sub: $sub\n" if $self->debug;
3347	15					25	undef $@;
3348	15					2799	my $compiled_sub = eval $sub;
3349	15	50				42	$self->throw($@) if $@;
3350	15					81	return $self->{match_subs}{$expr} = $compiled_sub;
3351							}
3352
3353							=head2 make_object
3354
3355							Title : make_object
3356							Usage : $db->make_object($class,$name,$start,$stop)
3357							Function: creates a feature object
3358							Returns : a feature object
3359							Args : see below
3360							Status : protected
3361
3362							This method is called to make an object from the GFF "group" field.
3363							By default, all Target groups are turned into Bio::DB::GFF::Homol
3364							objects, and everything else becomes a Bio::DB::GFF::Featname.
3365							However, adaptors are free to override this method to generate more
3366							interesting objects, such as true BioPerl objects, or Acedb objects.
3367
3368							Arguments are:
3369
3370							$name database ID for object
3371							$class class of object
3372							$start for similarities, start of match inside object
3373							$stop for similarities, stop of match inside object
3374
3375							=cut
3376
3377							# abstract call to turn a feature into an object, given its class and name
3378							sub make_object {
3379	594			594	1	582	my $self = shift;
3380	594					837	my ($class,$name,$start,$stop) = @_;
3381	594	100	66			1614	return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop)
3382							if defined $start and length $start;
3383	434					1031	return Bio::DB::GFF::Featname->new($class,$name);
3384							}
3385
3386
3387							=head2 do_attributes
3388
3389							Title : do_attributes
3390							Usage : $db->do_attributes($id [,$tag]);
3391							Function: internal method to retrieve attributes given an id and tag
3392							Returns : a list of Bio::DB::GFF::Feature objects
3393							Args : a feature id and a attribute tag (optional)
3394							Status : protected
3395
3396							This method is overridden by subclasses in order to return a list of
3397							attributes. If called with a tag, returns the value of attributes of
3398							that tag type. If called without a tag, returns a flattened array of
3399							(tag=Evalue) pairs. A particular tag can be present multiple times.
3400
3401							=cut
3402
3403							sub do_attributes {
3404	0			0	1	0	my $self = shift;
3405	0					0	my ($id,$tag) = @_;
3406	0					0	return ();
3407							}
3408
3409							=head2 clone
3410
3411							The clone() method should be used when you want to pass the
3412							Bio::DB::GFF object to a child process across a fork(). The child must
3413							call clone() before making any queries.
3414
3415							The default behavior is to do nothing, but adaptors that use the DBI
3416							interface may need to implement this in order to avoid database handle
3417							errors. See the dbi adaptor for an example.
3418
3419							=cut
3420
3421				2	1		sub clone { }
3422
3423
3424							=head1 Internal Methods
3425
3426							The following methods are internal to Bio::DB::GFF and are not
3427							guaranteed to remain the same.
3428
3429							=head2 _features
3430
3431							Title : _features
3432							Usage : $db->_features($search,$options,$parent)
3433							Function: internal method
3434							Returns : a list of Bio::DB::GFF::Feature objects
3435							Args : see below
3436							Status : internal
3437
3438							This is an internal method that is called by overlapping_features(),
3439							contained_features() and features() to create features based on a
3440							parent segment's coordinate system. It takes three arguments, a
3441							search options hashref, an options hashref, and a parent segment.
3442
3443							The search hashref contains the following keys:
3444
3445							rangetype One of "overlaps", "contains" or "contained_in". Indicates
3446							the type of range query requested.
3447							refseq reference sequence ID
3448							refclass reference sequence class
3449							start start of range
3450							stop stop of range
3451							types arrayref containing list of types in "method:source" form
3452
3453							The options hashref contains zero or more of the following keys:
3454
3455							sparse turn on optimizations for a rare feature
3456							automerge if true, invoke aggregators to merge features
3457							iterator if true, return an iterator
3458
3459							The $parent argument is a scalar object containing a
3460							Bio::DB::GFF::RelSegment object or descendent.
3461
3462							=cut
3463
3464							#'
3465
3466							sub _features {
3467	100			100		129	my $self = shift;
3468	100					148	my ($search,$options,$parent) = @_;
3469	0					0	(@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)})
	0					0
3470	100	50	66			285	if defined($search->{start}) && $search->{start} > $search->{stop};
3471	100	50				183	$search->{refseq} = $search->{seq_id} if exists $search->{seq_id};
3472
3473	100					290	my $types = $self->parse_types($search->{types}); # parse out list of types
3474	100					167	my @aggregated_types = @$types; # keep a copy
3475
3476							# allow the aggregators to operate on the original
3477	100					82	my @aggregators;
3478	100	100				186	if ($options->{automerge}) {
3479	85					199	for my $a ($self->aggregators) {
3480	175	100				299	$a = $a->clone if $options->{iterator};
3481	175	100				424	unshift @aggregators,$a
3482							if $a->disaggregate(\@aggregated_types,$self);
3483							}
3484							}
3485
3486	100	100				204	if ($options->{iterator}) {
3487	15					18	my @accumulated_features;
3488	15			15		48	my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) }
3489	15	100		270		75	: sub { [$self->make_feature($parent,undef,@_)] };
	270					497
3490							return $self->get_features_iterator({ %$search,
3491							types => \@aggregated_types },
3492							{ %$options,
3493							sort_by_group => $options->{automerge} },
3494	15					118	$callback
3495							);
3496							}
3497
3498	85					106	my %groups; # cache the groups we create to avoid consuming too much unecessary memory
3499	85					98	my $features = [];
3500
3501	85			470		383	my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) };
	470					1023
3502	85					587	$self->get_features({ %$search,
3503							types => \@aggregated_types },
3504							$options,
3505							$callback);
3506
3507	85	100				274	if ($options->{automerge}) {
3508	80	50				167	warn "aggregating...\n" if $self->debug;
3509	80					135	foreach my $a (@aggregators) { # last aggregator gets first shot
3510	95	50				146	warn "Aggregator $a:\n" if $self->debug;
3511	95					256	$a->aggregate($features,$self);
3512							}
3513							}
3514
3515	85					979	@$features;
3516							}
3517
3518							=head2 get_features_iterator
3519
3520							Title : get_features_iterator
3521							Usage : $db->get_features_iterator($search,$options,$callback)
3522							Function: get an iterator on a features query
3523							Returns : a Bio::SeqIO object
3524							Args : as per get_features()
3525							Status : Public
3526
3527							This method takes the same arguments as get_features(), but returns an
3528							iterator that can be used to fetch features sequentially, as per
3529							Bio::SeqIO.
3530
3531							Internally, this method is simply a front end to range_query().
3532							The latter method constructs and executes the query, returning a
3533							statement handle. This routine passes the statement handle to the
3534							constructor for the iterator, along with the callback.
3535
3536							=cut
3537
3538							sub get_features_iterator {
3539	0			0	1	0	my $self = shift;
3540	0					0	my ($search,$options,$callback) = @_;
3541	0					0	$self->throw('feature iteration is not implemented in this adaptor');
3542							}
3543
3544							=head2 split_group
3545
3546							Title : split_group
3547							Usage : $db->split_group($group_field,$gff3_flag)
3548							Function: parse GFF group field
3549							Returns : ($gclass,$gname,$tstart,$tstop,$attributes)
3550							Args : the gff group column and a flag indicating gff3 compatibility
3551							Status : internal
3552
3553							This is a method that is called by load_gff_line to parse out the
3554							contents of one or more group fields. It returns the class of the
3555							group, its name, the start and stop of the target, if any, and an
3556							array reference containing any attributes that were stuck into the
3557							group field, in [attribute_name,attribute_value] format.
3558
3559							=cut
3560
3561							sub split_group {
3562	165			165	1	207	my $self = shift;
3563	165					296	my ($group,$gff3) = @_;
3564	165	100				231	if ($gff3) {
3565	66					170	my @groups = split /[;&]/,$group; # so easy!
3566	66					118	return $self->_split_gff3_group(@groups);
3567							} else {
3568							# handle group parsing
3569							# protect embedded semicolons in the group; there must be faster/more elegant way
3570							# to do this.
3571	99					150	$group =~ s/\\;/$;/g;
3572	99					201	while ($group =~ s/( \"[^\"]);([^\"]\")/$1$;$2/) { 1 }
	0					0
3573	99					261	my @groups = split(/\s;\s/,$group);
3574	99					141	foreach (@groups) { s/$;/;/g }
	138					288
3575	99					183	return $self->_split_gff2_group(@groups);
3576							}
3577							}
3578
3579							=head2 _split_gff2_group
3580
3581							This is an internal method called by split_group().
3582
3583							=cut
3584
3585							# this has gotten quite nasty due to transition from GFF2 to GFF2.5
3586							# (artemis) to GFF3.
3587
3588							sub _split_gff2_group {
3589	99			99		111	my $self = shift;
3590	99					159	my @groups = @_;
3591	99					87	my $target_found;
3592
3593	99					102	my ($gclass,$gname,$tstart,$tstop,@attributes,@notes);
3594
3595	99					108	for (@groups) {
3596
3597	138					510	my ($tag,$value) = /^(\S+)(?:\s+(.+))?/;
3598	138	50				243	$value = '' unless defined $value;
3599	138	100				249	if ($value =~ /^\"(.+)\"$/) { #remove quotes
3600	24					54	$value = $1;
3601							}
3602	138					177	$value =~ s/\\t/\t/g;
3603	138					135	$value =~ s/\\r/\r/g;
3604	138					192	$value =~ s/\s+$//;
3605
3606							# Any additional groups become part of the attributes hash
3607							# For historical reasons, the tag "Note" is treated as an
3608							# attribute, even if it is the only group.
3609	138		50			192	$tag \|\|= '';
3610	138	50	33			702	if ($tag eq 'tstart' && $target_found) {
		50	33
		100	66
		100
		50
3611	0					0	$tstart = $value;
3612							}
3613
3614							elsif ($tag eq 'tend' && $target_found) {
3615	0					0	$tstop = $value;
3616							}
3617
3618							elsif (ucfirst $tag eq 'Note') {
3619	12					33	push @notes, [$tag => $value];
3620							}
3621
3622							elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) { # major disagreement in implementors of GFF2 here
3623	21					39	$target_found++;
3624	21					57	($gclass,$gname) = ($1,$2);
3625	21					90	($tstart,$tstop) = / (\d+) (\d+)/;
3626							}
3627
3628							elsif (!defined($value)) {
3629	0					0	push @notes, [Note => $tag]; # e.g. "Confirmed_by_EST"
3630							}
3631
3632							else {
3633	105					312	push @attributes, [$tag => $value];
3634							}
3635							}
3636
3637							# group assignment
3638	99	100	33			288	if (@attributes && !($gclass && $gname) ) {
			66
3639
3640	78	50				195	my $preferred = ref($self) ? $self->_preferred_groups_hash : {};
3641
3642	78					195	for my $pair (@attributes) {
3643	105					159	my ($c,$n) = @$pair;
3644							($gclass,$gname) = ($c,$n)
3645							if !$gclass # pick up first one
3646							\|\|
3647	105	100	100			357	($preferred->{lc $gclass}\|\|0) < ($preferred->{lc $c}\|\|0); # pick up higher priority one
			100
			100
3648							}
3649
3650	78					111	@attributes = grep {$gclass ne $_->[0]} @attributes;
	105					252
3651							}
3652
3653	99					114	push @attributes, @notes;
3654
3655	99					291	return ($gclass,$gname,$tstart,$tstop,\@attributes);
3656							}
3657
3658
3659							=head2 gff3_name_munging
3660
3661							Title : gff3_name_munging
3662							Usage : $db->gff3_name_munging($boolean)
3663							Function: get/set gff3_name_munging flag
3664							Returns : $current value of flag
3665							Args : new value of flag (optional)
3666							Status : utility
3667
3668							If this is set to true (default false), then features identified in
3669							gff3 files with an ID in the format foo:bar will be parsed so that
3670							"foo" is the class and "bar" is the name. This is mostly for backward
3671							compatibility with GFF2.
3672
3673							=cut
3674
3675							sub gff3_name_munging {
3676	71			71	1	101	my $self = shift;
3677	71					77	my $d = $self->{gff3_name_munging};
3678	71	100				122	$self->{gff3_name_munging} = shift if @_;
3679	71					96	$d;
3680							}
3681
3682							=head2 _split_gff3_group
3683
3684							This is called internally from split_group().
3685
3686							=cut
3687
3688							sub _split_gff3_group {
3689	66			66		72	my $self = shift;
3690	66					130	my @groups = @_;
3691	66					100	my $dc = $self->default_class;
3692	66					78	my (%id,@attributes);
3693
3694	66					86	for my $group (@groups) {
3695	92					194	my ($tag,$value) = split /=/,$group;
3696	92					160	$tag = unescape($tag);
3697	92					154	my @values = map {unescape($_)} split /,/,$value;
	92					120
3698
3699							# GFF2 traditionally did not distinguish between a feature's name
3700							# and the group it belonged to. This code is a transition between
3701							# gff2 and the new parent/ID dichotomy in gff3.
3702	92	50	66			362	if ($tag eq 'Parent') {
		100
		100
		50
3703	0					0	my (@names,@classes);
3704	0					0	for (@values) {
3705	0					0	my ($name,$class) = $self->_gff3_name_munging($_,$dc);
3706	0					0	push @names,$name;
3707	0					0	push @classes,$class;
3708							}
3709	0	0				0	$id{$tag} = @names > 1 ? [\@names,\@classes] : [$names[0],$classes[0]];
3710							}
3711							elsif ($tag eq 'ID' \|\| $tag eq 'Name') {
3712	52					88	$id{$tag} = [$self->_gff3_name_munging(shift(@values),$dc)];
3713							}
3714							elsif ($tag eq 'Target') {
3715	14					68	my ($gname,$tstart,$tstop) = split /\s+/,shift @values;
3716	14					34	$id{$tag} = [$self->_gff3_name_munging($gname,$dc),$tstart,$tstop];
3717							}
3718							elsif ($tag =~ /synonym/i) {
3719	0					0	$tag = 'Alias';
3720							}
3721	92					202	push @attributes,[$tag=>$_] foreach @values;
3722							}
3723
3724	66					138	my $priorities = $self->_preferred_groups_hash(1);
3725	66					78	my ($gclass,$gname,$tstart,$tstop);
3726	66					140	for my $preferred (sort {$priorities->{lc $b}<=>$priorities->{lc $a}}
	0					0
3727							keys %id) {
3728	66	50				106	unless (defined $gname) {
3729	66					64	($gname,$gclass,$tstart,$tstop) = @{$id{$preferred}};
	66					140
3730							}
3731							}
3732
3733							# set null gclass to empty string to preserve compatibility with
3734							# programs that expect a defined gclass if no gname
3735	66	50	50			142	$gclass \|\|= '' if defined $gname;
3736
3737	66					250	return ($gclass,$gname,$tstart,$tstop,\@attributes);
3738							}
3739
3740							# accomodation for wormbase style of class:name naming
3741							sub _gff3_name_munging {
3742	66			66		70	my $self = shift;
3743	66					76	my ($name,$default_class) = @_;
3744	66	50				88	return ($name,$default_class) unless $self->gff3_name_munging;
3745
3746	66	50				224	if ($name =~ /^(\w+):(.+)/) {
3747	66					266	return ($2,$1);
3748							} else {
3749	0					0	return ($name,$default_class);
3750							}
3751							}
3752
3753							=head2 _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features()
3754
3755							Title : _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features()
3756							Usage : $count = $db->_delete_features(@feature_ids)
3757							$count = $db->_delete_groups(@group_ids)
3758							$count = $db->_delete(\%delete_spec)
3759							$count = $db->_delete_fattribute_to_features(@feature_ids)
3760							Function: low-level feature/group deleter
3761							Returns : count of groups removed
3762							Args : list of feature or group ids removed
3763							Status : for implementation by subclasses
3764
3765							These methods need to be implemented in adaptors. For _delete_features,
3766							_delete_groups and _delete_fattribute_to_features, the arguments are a list of
3767							feature or group IDs to remove. For _delete(), the argument is a hashref with
3768							the three keys 'segments', 'types' and 'force'. The first contains an arrayref
3769							of Bio::DB::GFF::RelSegment objects to delete (all FEATURES within the segment
3770							are deleted). The second contains an arrayref of [method,source] feature types
3771							to delete. The two are ANDed together. If 'force' has a true value, this
3772							forces the operation to continue even if it would delete all features.
3773
3774							=cut
3775
3776							sub _delete_features {
3777	0			0		0	my $self = shift;
3778	0					0	my @feature_ids = @_;
3779	0					0	$self->throw('_delete_features is not implemented in this adaptor');
3780							}
3781
3782							sub _delete_groups {
3783	5			5		10	my $self = shift;
3784	5					8	my @group_ids = @_;
3785	5					24	$self->throw('_delete_groups is not implemented in this adaptor');
3786							}
3787
3788							sub _delete {
3789	0			0		0	my $self = shift;
3790	0					0	my $delete_options = shift;
3791	0					0	$self->throw('_delete is not implemented in this adaptor');
3792							}
3793
3794							sub _delete_fattribute_to_features {
3795	0			0		0	my $self = shift;
3796	0					0	my @feature_ids = @_;
3797	0					0	$self->throw('_delete_fattribute_to_features is not implemented in this adaptor');
3798							}
3799
3800
3801							sub unescape {
3802	184			184	0	184	my $v = shift;
3803	184					204	$v =~ tr/+/ /;
3804	184					220	$v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
	0					0
3805	184					280	return $v;
3806							}
3807
3808							sub print_gff3_warning {
3809	2			2	0	6	my $self = shift;
3810	2					66	print STDERR <
3811
3812							You are loading a Bio::DB::GFF database with GFF3 formatted data.
3813							While this will likely work fine, the Bio::DB::GFF schema does not
3814							always faithfully capture the complexity represented in GFF3 files.
3815							Unless you have a specific reason for using Bio::DB::GFF, we suggest
3816							that you use a Bio::DB::SeqFeature::Store database and its corresponding
3817							loader, bp_seqfeature_load.pl.
3818
3819							END
3820							;
3821
3822	2					18	return;
3823							}
3824
3825
3826							package Bio::DB::GFF::ID_Iterator;
3827	3			3		18	use strict;
	3					3
	3					57
3828
3829	3			3		15	use base qw(Bio::Root::Root);
	3					3
	3					729
3830
3831							sub new {
3832	0			0			my $class = shift;
3833	0						my ($db,$ids,$type) = @_;
3834	0						return bless {ids=>$ids,db=>$db,type=>$type},$class;
3835							}
3836
3837							sub next_seq {
3838	0			0			my $self = shift;
3839	0						my $next = shift @{$self->{ids}};
	0
3840	0	0					return unless $next;
3841	0	0					my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next;
3842							my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name)
3843							: $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name)
3844	0	0					: $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name)
		0
		0
3845							: $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier");
3846	0	0					$self->throw("id does not exist") unless $segment;
3847	0						return $segment;
3848							}
3849
3850							package Bio::DB::GFF::FeatureIterator;
3851
3852							sub new {
3853	0			0			my $self = shift;
3854	0						my @features = @_;
3855	0		0				return bless \@features,ref $self \|\| $self;
3856							}
3857							sub next_seq {
3858	0			0			my $self = shift;
3859	0	0					return unless @$self;
3860	0						return shift @$self;
3861							}
3862
3863
3864							1;
3865
3866							__END__