File Coverage

blib/lib/Bio/ToolBox/Data/Feature.pm

Criterion	Covered	Total	%
statement	228	696	32.7
branch	101	432	23.3
condition	48	319	15.0
subroutine	26	48	54.1
pod	32	34	94.1
total	435	1529	28.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::Data::Feature;
2							our $VERSION = '1.68';
3
4							=head1 NAME
5
6							Bio::ToolBox::Data::Feature - Objects representing rows in a data table
7
8							=head1 DESCRIPTION
9
10							A Bio::ToolBox::Data::Feature is an object representing a row in the
11							data table. Usually, this in turn represents an annotated feature or
12							segment in the genome. As such, this object provides convenient
13							methods for accessing and manipulating the values in a row, as well as
14							methods for working with the represented genomic feature.
15
16							This class should NOT be used directly by the user. Rather, Feature
17							objects are generated from a Bio::ToolBox::Data::Iterator object
18							(generated itself from the L
19							function in Bio::ToolBox::Data), or the L
20							function in Bio::ToolBox::Data. Please see the respective documentation
21							for more information.
22
23							Example of working with a stream object.
24
25							my $Data = Bio::ToolBox::Data->new(file => $file);
26
27							# stream method
28							my $stream = $Data->row_stream;
29							while (my $row = $stream->next_row) {
30							# each $row is a Bio::ToolBox::Data::Feature object
31							# representing the row in the data table
32							my $value = $row->value($index);
33							# do something with $value
34							}
35
36							# iterate method
37							$Data->iterate( sub {
38							my $row = shift;
39							my $number = $row->value($index);
40							my $log_number = log($number);
41							$row->value($index, $log_number);
42							} );
43
44
45							=head1 METHODS
46
47							=head2 General information methods
48
49							=over 4
50
51							=item row_index
52
53							Returns the index position of the current data row within the
54							data table. Useful for knowing where you are at within the data
55							table.
56
57							=item feature_type
58
59							Returns one of three specific values describing the contents
60							of the data table inferred by the presence of specific column
61							names. This provides a clue as to whether the table features
62							represent genomic regions (defined by coordinate positions) or
63							named database features. The return values include:
64
65							=over 4
66
67							=item * coordinate: Table includes at least chromosome and start
68
69							=item * named: Table includes name, type, and/or Primary_ID
70
71							=item * unknown: unrecognized
72
73							=back
74
75							=item column_name
76
77							Returns the column name for the given index.
78
79							item data
80
81							Returns the parent L object, in case you may
82							have lost it by going out of scope.
83
84							=back
85
86							=head2 Methods to access row feature attributes
87
88							These methods return the corresponding value, if present in the
89							data table, based on the column header name. If the row represents
90							a named database object, try calling the L method first.
91							This will retrieve the database SeqFeature object, and the attributes
92							can then be retrieved using the methods below or on the actual
93							database SeqFeature object.
94
95							In cases where there is a table column and a corresponding SeqFeature
96							object, for example a start column and a parsed SeqFeature object, the
97							table value takes precedence and is returned. You can always obtain the
98							SeqFeature's value separately and directly.
99
100							These methods do not set attribute values. If you need to change the
101							values in a table, use the L method below.
102
103							=over 4
104
105							=item seq_id
106
107							The name of the chromosome the feature is on.
108
109							=item start
110
111							=item end
112
113							=item stop
114
115							The coordinates of the feature or segment. Coordinates from known
116							0-based file formats, e.g. BED, are returned as 1-based. Coordinates
117							must be integers to be returned. Zero or negative start coordinates
118							are assumed to be accidents or poor programming and transformed to 1.
119							Use the L method if you don't want this to happen.
120
121							=item strand
122
123							The strand of the feature or segment. Returns -1, 0, or 1. Default is 0,
124							or unstranded.
125
126							=item name
127
128							=item display_name
129
130							The name of the feature.
131
132							=item coordinate
133
134							Returns a coordinate string formatted as C.
135
136							=item type
137
138							The type of feature. Typically either C or C.
139							In a GFF3 file, this represents columns 3 and 2, respectively. In annotation
140							databases such as L, the type is used to restrict
141							to one of many different types of features, e.g. gene, mRNA, or exon.
142
143							=item id
144
145							=item primary_id
146
147							Here, this represents the C in the database. Note that this number
148							is generally unique to a specific database, and not portable between databases.
149
150							=item length
151
152							The length of the feature or segment.
153
154							=item score
155
156							Returns the value of the Score column, if one is available. Typically
157							associated with defined file formats, such as GFF files (6th column),
158							BED and related Peak files (5th column), and bedGraph (4th column).
159
160							=back
161
162							=head2 Accessing and setting values in the row.
163
164							=over 4
165
166							=item value
167
168							# retrieve a value
169							my $v = $row->value($index);
170							# set a value
171							$row->value($index, $v + 1);
172
173							Returns or sets the value at a specific column index in the
174							current data row. Null values return a '.', symbolizing an
175							internal null value.
176
177							=item row_values
178
179							Returns an array or array reference representing all the values
180							in the current data row.
181
182							=back
183
184							=head2 Special feature attributes
185
186							GFF and VCF files have special attributes in the form of key =E value pairs.
187							These are stored as specially formatted, character-delimited lists in
188							certain columns. These methods will parse this information and return as
189							a convenient hash reference. The keys and values of this hash may be
190							changed, deleted, or added to as desired. To write the changes back to
191							the file, use the L to properly write the attributes
192							back to the file with the proper formatting.
193
194							=over 4
195
196							=item attributes
197
198							Generic method that calls either L or L
199							depending on the data table format.
200
201							=item gff_attributes
202
203							Parses the 9th column of GFF files. URL-escaped characters are converted
204							back to text. Returns a hash reference of key =E value pairs.
205
206							=item vcf_attributes
207
208							Parses the C (8th column) and all sample columns (10th and higher
209							columns) in a version 4 VCF file. The Sample columns use the C
210							column (9th column) as keys. The returned hash reference has two levels:
211							The first level keys are both the column names and index (0-based). The
212							second level keys are the individual attribute keys to each value.
213							For example:
214
215							my $attr = $row->vcf_attributes;
216
217							# access by column name
218							my $genotype = $attr->{sample1}{GT};
219							my $depth = $attr->{INFO}{ADP};
220
221							# access by 0-based column index
222							my $genotype = $attr->{9}{GT};
223							my $depth = $attr->{7}{ADP}
224
225							=item rewrite_attributes
226
227							Generic method that either calls L or
228							L depending on the data table format.
229
230							=item rewrite_gff_attributes
231
232							Rewrites the GFF attributes column (the 9th column) based on the
233							contents of the attributes hash that was previously generated with
234							the L method. Useful when you have modified the
235							contents of the attributes hash.
236
237							=item rewrite_vcf_attributes
238
239							Rewrite the VCF attributes for the C (8th column), C (9th
240							column), and sample columns (10th and higher columns) based on the
241							contents of the attributes hash that was previously generated with
242							the L method. Useful when you have modified the
243							contents of the attributes hash.
244
245							=back
246
247							=head2 Convenience Methods to database functions
248
249							The next three functions are convenience methods for using the
250							attributes in the current data row to interact with databases.
251							They are wrappers to methods in the L
252							module.
253
254							=over 4
255
256							=item seqfeature
257
258							=item feature
259
260							Returns a SeqFeature object representing the feature or item in
261							the current row. If the SeqFeature object is stored in the parent
262							C<$Data> object (usually from parsing an annotation file), it is
263							immediately returned. Otherwise, the SeqFeature
264							object is retrieved from the database using the name and
265							type values in the current Data table row. The SeqFeature object
266							is requested from the database named in the general metadata. If
267							an alternate database is desired, you should change it first using
268							the C<$Data>-Edatabase() method. If the feature name or type is not
269							present in the table, then nothing is returned.
270
271							See L and L for more
272							information about working with these objects. See L
273							about working with database features.
274
275							This method normally only works with "named" feature types in a
276							L Data table. If your Data table has coordinate
277							information, i.e. chromosome, start, and stop columns, then it will
278							likely be recognized as a "coordinate" feature_type and not work.
279
280							Pass a true value to this method to force the seqfeature lookup. This
281							will still require the presence of Name, ID, and/or Type columns to
282							perform the database lookup. The L method feature()
283							is used to determine the type if a Type column is not present.
284
285							=item segment
286
287							Returns a database Segment object corresponding to the coordinates
288							defined in the Data table row. If a named feature and type are
289							present instead of coordinates, then the feature is first automatically
290							retrieved and a Segment returned based on its coordinates. The
291							database named in the general metadata is used to establish the
292							Segment object. If a different database is desired, it should be
293							changed first using the general L method.
294
295							See L and L for more information
296							about working with Segment objects.
297
298							=item get_features
299
300							my @overlap_features = $row->get_features(type => $type);
301
302							Returns seqfeature objects from a database that overlap the Feature
303							or interval in the current Data table row. This is essentially a
304							convenience wrapper for a Bio::DB style I method using the
305							coordinates of the Feature. Optionally pass an array of key value pairs
306							to specify alternate coordinates if so desired. Potential keys
307							include
308
309							=over 4
310
311							=item seq_id
312
313							=item start
314
315							=item end
316
317							=item type
318
319							The type of database features to retrieve.
320
321							=item db
322
323							An alternate database object to collect from.
324
325							=back
326
327							=item get_sequence
328
329							Fetches genomic sequence based on the coordinates of the current seqfeature
330							or interval in the current Feature. This requires a database that
331							contains the genomic sequence, either the database specified in the
332							Data table metadata or an external indexed genomic fasta file.
333
334							If the Feature represents a transcript or gene, then a concatenated
335							sequence of the selected subfeatures may be generated and returned. B
336							that redundant or overlapping subfeatures are B merged, and
337							unexpected results may be obtained.
338
339							The sequence is returned as simple string. If the feature is on the reverse
340							strand, then the reverse complement sequence is automatically returned.
341
342							Pass an array of key value pairs to specify alternate coordinates if so
343							desired. Potential keys include
344
345							=over 4
346
347							=item subfeature
348
349							Pass a text string representing the type of subfeature from which to collect
350							the sequence. Acceptable values include
351
352							=over 4
353
354							=item * exon
355
356							=item * cds
357
358							=item * 5p_utr
359
360							=item * 3p_utr
361
362							=item * intron
363
364							=back
365
366							=item seq_id
367
368							=item start
369
370							=item end
371
372							=item strand
373
374							=item extend
375
376							Indicate additional basepairs of sequence added to both sides
377
378							=item db
379
380							The fasta file or database from which to fetch the sequence
381
382							=back
383
384							=back
385
386							=head2 Data collection
387
388							The following methods allow for data collection from various
389							sources, including bam, bigwig, bigbed, useq, Bio::DB databases, etc.
390
391							=over 4
392
393							=item get_score
394
395							my $score = $row->get_score(
396							dataset => 'scores.bw',
397							method => 'max',
398							);
399
400							This method collects a single score over the feature or interval.
401							Usually a mathematical or statistical value is employed to derive the
402							single score. Pass an array of key value pairs to control data collection.
403							Keys include the following:
404
405							=over 4
406
407							=item db
408
409							=item ddb
410
411							Specify a Bio::DB database from which to collect the data. The default
412							value is the database specified in the Data table metadata, if present.
413							Examples include a L or L
414							database.
415
416							=item dataset
417
418							Specify the name of the dataset. If a database was specified, then this
419							value would be the C or C feature found in the
420							database. Otherwise, the name of a data file, such as a bam, bigWig,
421							bigBed, or USeq file, would be provided here. This options is required!
422
423							=item method
424
425							Specify the mathematical or statistical method combining multiple scores
426							over the interval into one value. Options include the following:
427
428							=over 4
429
430							=item * mean
431
432							=item * sum
433
434							=item * min
435
436							=item * max
437
438							=item * median
439
440							=item * count
441
442							Count all overlapping items.
443
444							=item * pcount
445
446							Precisely count only containing (not overlapping) items.
447
448							=item * ncount
449
450							Count overlapping unique names only.
451
452							=item * range
453
454							The difference between minimum and maximum values.
455
456							=item * stddev
457
458							Standard deviation.
459
460							=back
461
462							=item strandedness
463
464							Specify what strand from which the data should be taken, with respect
465							to the Feature strand. Three options are available. Only really relevant
466							for data sources that support strand.
467
468							=over 4
469
470							=item * sense
471
472							The same strand as the Feature.
473
474							=item * antisense
475
476							The opposite strand as the Feature.
477
478							=item * all
479
480							Strand is ignored, all is taken (default).
481
482							=back
483
484							=item subfeature
485
486							Specify the subfeature type from which to collect the scores. Typically
487							a SeqFeature object representing a transcript is provided, and the
488							indicated subfeatures are collected from object. Pass the name of the
489							subfeature to use. Accepted values include the following.
490
491							=over 4
492
493							=item * exon
494
495							=item * cds
496
497							=item * 5p_utr
498
499							=item * 3p_utr
500
501							=item * intron
502
503							=back
504
505							=item extend
506
507							Specify the number of basepairs that the Data table Feature's
508							coordinates should be extended in both directions. Ignored
509							when used with the subfeature option.
510
511							=item seq_id
512
513							=item chromo
514
515							=item start
516
517							=item end
518
519							=item stop
520
521							=item strand
522
523							Optionally specify zero or more alternate coordinates to use.
524							By default, these are obtained from the Data table Feature.
525
526							=back
527
528							=item get_relative_point_position_scores
529
530							while (my $row = $stream->next_row) {
531							my $pos2score = $row->get_relative_point_position_scores(
532							'ddb' => '/path/to/BigWigSet/',
533							'dataset' => 'MyData',
534							'position' => 5,
535							'extend' => 1000,
536							);
537							}
538
539							This method collects indexed position scores centered around a
540							specific reference point. The returned data is a hash of
541							relative positions (example -20, -10, 1, 10, 20) and their score
542							values. Pass an array of key value pairs to control data collection.
543							Keys include the following:
544
545							=over 4
546
547							=item db
548
549							=item ddb
550
551							Specify a Bio::DB database from which to collect the data. The default
552							value is the database specified in the Data table metadata, if present.
553							Examples include a L or L
554							database.
555
556							=item dataset
557
558							Specify the name of the dataset. If a database was specified, then this
559							value would be the C or C feature found in the
560							database. Otherwise, the name of a data file, such as a bam, bigWig,
561							bigBed, or USeq file, would be provided here. This options is required!
562
563							=item position
564
565							Indicate the position of the reference point relative to the Data table
566							Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
567							midpoint (get it? it's between 5 and 3). Default is 5.
568
569							=item extend
570
571							Indicate the number of base pairs to extend from the reference coordinate.
572							This option is required!
573
574							=item coordinate
575
576							Optionally provide the real chromosomal coordinate as the reference point.
577
578							=item absolute
579
580							Boolean option to indicate that the returned hash of positions and scores
581							should not be transformed into relative positions but kept as absolute
582							chromosomal coordinates.
583
584							=item avoid
585
586							Provide a C or C database feature type to avoid overlapping
587							scores. Each found score is checked for overlapping features and is
588							discarded if found to do so. The database should be set to use this.
589
590							=item strandedness
591
592							Specify what strand from which the data should be taken, with respect
593							to the Feature strand. Three options are available. Only really relevant
594							for data sources that support strand.
595
596							=over 4
597
598							=item * sense
599
600							The same strand as the Feature.
601
602							=item * antisense
603
604							The opposite strand as the Feature.
605
606							=item * all
607
608							Strand is ignored, all is taken (default).
609
610							=back
611
612							=item method
613
614							Only required when counting objects.
615
616							=over 4
617
618							=item * count
619
620							Count all overlapping items.
621
622							=item * pcount
623
624							Precisely count only containing (not overlapping) items.
625
626							=item * ncount
627
628							Count overlapping unique names only.
629
630							=back
631
632							=back
633
634							=item get_region_position_scores
635
636							while (my $row = $stream->next_row) {
637							my $pos2score = $row->get_relative_point_position_scores(
638							'ddb' => '/path/to/BigWigSet/',
639							'dataset' => 'MyData',
640							'position' => 5,
641							'extend' => 1000,
642							);
643							}
644
645							This method collects indexed position scores across a defined
646							region or interval. The returned data is a hash of positions and
647							their score values. The positions are by default relative to a
648							region coordinate, usually to the 5' end. Pass an array of key value
649							pairs to control data collection. Keys include the following:
650
651							=over 4
652
653							=item db
654
655							=item ddb
656
657							Specify a Bio::DB database from which to collect the data. The default
658							value is the database specified in the Data table metadata, if present.
659							Examples include a L or L
660							database.
661
662							=item dataset
663
664							Specify the name of the dataset. If a database was specified, then this
665							value would be the C or C feature found in the
666							database. Otherwise, the name of a data file, such as a bam, bigWig,
667							bigBed, or USeq file, would be provided here. This options is required!
668
669							=item subfeature
670
671							Specify the subfeature type from which to collect the scores. Typically
672							a SeqFeature object representing a transcript is provided, and the
673							indicated subfeatures are collected from object. When converting to
674							relative coordinates, the coordinates will be relative to the length of
675							the sum of the subfeatures, i.e. the length of the introns will be ignored.
676
677							Pass the name of the subfeature to use. Accepted values include the following.
678
679							=over 4
680
681							=item * exon
682
683							=item * cds
684
685							=item * 5p_utr
686
687							=item * 3p_utr
688
689							=item * intron
690
691							=back
692
693							=item extend
694
695							Specify the number of basepairs that the Data table Feature's
696							coordinates should be extended in both directions.
697
698							=item seq_id
699
700							=item chromo
701
702							=item start
703
704							=item end
705
706							=item stop
707
708							=item strand
709
710							Optionally specify zero or more alternate coordinates to use.
711							By default, these are obtained from the Data table Feature.
712
713							=item position
714
715							Indicate the position of the reference point relative to the Data table
716							Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
717							midpoint (get it? it's between 5 and 3). Default is 5.
718
719							=item coordinate
720
721							Optionally provide the real chromosomal coordinate as the reference point.
722
723							=item absolute
724
725							Boolean option to indicate that the returned hash of positions and scores
726							should not be transformed into relative positions but kept as absolute
727							chromosomal coordinates.
728
729							=item avoid
730
731							Provide a C or C database feature type to avoid overlapping
732							scores. Each found score is checked for overlapping features and is
733							discarded if found to do so. The database should be set to use this.
734
735							=item strandedness
736
737							Specify what strand from which the data should be taken, with respect
738							to the Feature strand. Three options are available. Only really relevant
739							for data sources that support strand.
740
741							=over 4
742
743							=item * sense
744
745							The same strand as the Feature.
746
747							=item * antisense
748
749							The opposite strand as the Feature.
750
751							=item * all
752
753							Strand is ignored, all is taken (default).
754
755							=back
756
757							=item method
758
759							Only required when counting objects.
760
761							=over 4
762
763							=item * count
764
765							Count all overlapping items.
766
767							=item * pcount
768
769							Precisely count only containing (not overlapping) items.
770
771							=item * ncount
772
773							Count overlapping unique names only.
774
775							=back
776
777							=back
778
779							=item fetch_alignments
780
781							my $sam = $Data->open_database('/path/to/file.bam');
782							my $alignment_data = { mapq => [] };
783							my $callback = sub {
784							my ($a, $data) = @_;
785							push @{ $data->{mapq} }, $a->qual;
786							};
787							while (my $row = $stream->next_row) {
788							$row->fetch_alignments(
789							'db' => $sam,
790							'data' => $alignment_data,
791							'callback' => $callback,
792							);
793							}
794
795							This function allows you to iterate over alignments in a Bam file,
796							allowing custom information to be collected based on a callback
797							code reference that is provided.
798
799							Three parameters are required: C, C, and C. A
800							true value (1) is returned upon success.
801
802							=over 4
803
804							=item db
805
806							Provide an opened, high-level, Bam database object.
807
808							=item callback
809
810							Provide a code callback reference to use when iterating over the
811							alignments. Two objects are passed to this code function: the
812							alignment object and the data structure that is provided. See the
813							Bam adapter documentation for details on low-level C through
814							the Bam index object for details.
815
816							=item data
817
818							This is a reference to a C data object for storing information.
819							It is passed to the callback function along with the alignment. Three
820							new key =E value pairs are automatically added: C, C,
821							and C. These correspond to the values for the current queried
822							interval. Coordinates are automatically transformed to 0-base coordinate
823							system to match low level alignment objects.
824
825							=item subfeature
826
827							If the feature has subfeatures, such as exons, introns, etc., pass
828							the name of the subfeature to restrict iteration only over the
829							indicated subfeatures. The C object will inherit the coordinates
830							for each subfeatures. Allowed subfeatures include the following:
831
832							=over 4
833
834							=item * exon
835
836							=item * cds
837
838							=item * 5p_utr
839
840							=item * 3p_utr
841
842							=item * intron
843
844							=back
845
846							=item start
847
848							=item stop
849
850							=item end
851
852							Provide alternate, custom start and stop coordinates for the row
853							feature. Ignored with subfeatures.
854
855							=back
856
857							=back
858
859							=head2 Feature Export
860
861							These methods allow the feature to be exported in industry standard
862							formats, including the BED format and the GFF format. Both methods
863							return a formatted tab-delimited text string suitable for printing to
864							file. The string does not include a line ending character.
865
866							These methods rely on coordinates being present in the source table.
867							If the row feature represents a database item, the L method
868							should be called prior to these methods, allowing the feature to be
869							retrieved from the database and coordinates obtained.
870
871							=over 4
872
873							=item bed_string
874
875							Returns a BED formatted string. By default, a 6-element string is
876							generated, unless otherwise specified. Pass an array of key values
877							to control how the string is generated. The following arguments
878							are supported.
879
880							=over 4
881
882							=item bed
883
884							Specify the number of BED elements to include. The number of elements
885							correspond to the number of columns in the BED file specification. A
886							minimum of 3 (chromosome, start, stop) is required, and maximum of 6
887							is allowed (chromosome, start, stop, name, score, strand).
888
889							=item chromo
890
891							=item seq_id
892
893							Provide a text string of an alternative chromosome or sequence name.
894
895							=item start
896
897							=item stop
898
899							=item end
900
901							Provide alternative integers for the start and stop coordinates.
902							Note that start values are automatically converted to 0-base
903							by subtracting 1.
904
905							=item strand
906
907							Provide alternate an alternative strand value.
908
909							=item name
910
911							Provide an alternate or missing name value to be used as text in the 4th
912							column. If no name is provided or available, a default name is generated.
913
914							=item score
915
916							Provide a numerical value to be included as the score. BED files typically
917							use integer values ranging from 1..1000.
918
919							=back
920
921							=item gff_string
922
923							Returns a GFF3 formatted string. Pass an array of key values
924							to control how the string is generated. The following arguments
925							are supported.
926
927							=over 4
928
929							=item chromo
930
931							=item seq_id
932
933							=item start
934
935							=item stop
936
937							=item end
938
939							=item strand
940
941							Provide alternate values from those defined or missing in the current
942							row Feature.
943
944							=item source
945
946							Provide a text string to be used as the source_tag value in the 2nd
947							column. The default value is null ".".
948
949							=item primary_tag
950
951							Provide a text string to be used as the primary_tag value in the 3rd
952							column. The default value is null ".".
953
954							=item type
955
956							Provide a text string. This can be either a "primary_tag:source_tag" value
957							as used by GFF based BioPerl databases, or "primary_tag" alone.
958
959							=item score
960
961							Provide a numerical value to be included as the score. The default
962							value is null ".".
963
964							=item name
965
966							Provide alternate or missing name value to be used as the display_name.
967							If no name is provided or available, a default name is generated.
968
969							=item attributes
970
971							Provide an anonymous array reference of one or more row Feature indices
972							to be used as GFF attributes. The name of the column is used as the GFF
973							attribute key.
974
975							=back
976
977							=back
978
979							=cut
980
981	3			3		26	use strict;
	3					8
	3					128
982	3			3		17	use Carp qw(carp cluck croak confess);
	3					8
	3					229
983	3			3		21	use Module::Load;
	3					7
	3					24
984	3					199	use Bio::ToolBox::db_helper qw(
985							get_db_feature
986							get_segment_score
987							calculate_score
988							get_genomic_sequence
989							low_level_bam_fetch
990	3			3		184	);
	3					6
991	3			3		22	use Bio::ToolBox::db_helper::constants;
	3					7
	3					27041
992
993							my $GENETOOL_LOADED = 0;
994							1;
995
996
997
998							### Initialization
999
1000							sub new {
1001							# this should ONLY be called from Bio::ToolBox::Data* iterators
1002	349			349	0	594	my $class = shift;
1003	349					780	my %self = @_;
1004							# we trust that new is called properly with data and index values
1005	349					1128	return bless \%self, $class;
1006							}
1007
1008
1009							### Set and retrieve values
1010
1011							sub data {
1012	0			0	1	0	return shift->{data};
1013							}
1014
1015							sub column_name {
1016	0			0	1	0	my ($self, $column) = @_;
1017	0	0				0	return unless defined $column;
1018	0					0	return $self->{data}->name($column);
1019							}
1020
1021							sub feature_type {
1022	28			28	1	37	my $self = shift;
1023	28	50				56	carp "feature_type is a read only method" if @_;
1024	28					75	return $self->{data}->feature_type;
1025							}
1026
1027							sub row_index {
1028	65			65	1	92	my $self = shift;
1029	65	50				117	carp "row_index is a read only method" if @_;
1030	65					201	return $self->{'index'};
1031							}
1032
1033							sub line_number {
1034	0			0	0	0	my $self = shift;
1035	0	0				0	carp "line_number is a read only method" if @_;
1036	0	0				0	if (exists $self->{data}->{line_count}) {
		0
1037	0					0	return $self->{data}->{line_count};
1038							}
1039							elsif (exists $self->{data}->{header_line_count}) {
1040	0					0	return $self->{data}->{header_line_count} + $self->row_index;
1041							}
1042							else {
1043	0					0	return;
1044							}
1045							}
1046
1047							sub row_values {
1048	196			196	1	345	my $self = shift;
1049	196	50				386	carp "row_values is a read only method" if @_;
1050	196					279	my $row = $self->{'index'};
1051	196					322	my @data = @{ $self->{data}->{data_table}->[$row] };
	196					535
1052	196	100				670	return wantarray ? @data : \@data;
1053							}
1054
1055							sub value {
1056	390			390	1	1977	my ($self, $column, $value) = @_;
1057	390	50				686	return unless defined $column;
1058	390					569	my $row = $self->{'index'};
1059
1060	390	100				681	if (defined $value) {
1061							# set a value
1062	83					153	$self->{data}->{data_table}->[$row][$column] = $value;
1063							}
1064	390					679	my $v = $self->{data}->{data_table}->[$row][$column];
1065	390	100				1057	return length($v) ? $v : '.'; # internal null value, inherited from GFF definition
1066							}
1067
1068							sub seq_id {
1069	21			21	1	477	my $self = shift;
1070	21	100				51	if ($_[0]) {
1071							# update only if we have an actual column
1072	1					4	my $i = $self->{data}->chromo_column;
1073	1	50				5	if (defined $i) {
		0
1074	1					4	my $c = $self->value($i, $_[0]);
1075	1					6	return $c;
1076							}
1077							elsif (exists $self->{feature}) {
1078	0					0	carp "Unable to update seq_id for parsed SeqFeature objects";
1079							}
1080							else {
1081	0					0	carp "No Chromosome column to update!";
1082							}
1083							}
1084							# seqfeature
1085	20	50				45	if (exists $self->{feature}) {
1086	0					0	my $c = $self->{feature}->seq_id;
1087	0					0	return $c;
1088							}
1089							# collect from table
1090	20					63	my $i = $self->{data}->chromo_column;
1091	20	50				62	my $c = defined $i ? $self->value($i) : undef;
1092	20					77	return $c;
1093							}
1094
1095							sub start {
1096	109			109	1	611	my $self = shift;
1097	109	100				406	if ($_[0]) {
1098							# update only if we have an actual column
1099	5					48	my $i = $self->{data}->start_column;
1100	5	50				64	my $d = $_[0] =~ /^\d+$/ ? 1 : 0;
1101	5	100	66			31	if (defined $i and $d) {
		50
		50
1102	1	50				4	if (substr($self->{data}->name($i), -1) eq '0') {
1103							# compensate for 0-based, assuming we're always working with 1-based
1104	1					3	my $n = $_[0] - 1;
1105	1					1061	return $self->value($i, $n);
1106							}
1107							else {
1108	0					0	return $self->value($i, $_[0]);
1109							}
1110							}
1111							elsif (not $d) {
1112	0					0	carp "Start coordinate value is not an integer";
1113							}
1114							elsif (exists $self->{feature}) {
1115	4					587	carp "Unable to update Start coordinate for parsed SeqFeature objects";
1116							}
1117							else {
1118	0					0	carp "No Start coordinate column to update!";
1119							}
1120							}
1121							# seqfeature
1122	108	100				542	if (exists $self->{feature}) {
1123	12					47	return $self->{feature}->start;
1124							}
1125							# collect from table
1126	96					205	my $i = $self->{data}->start_column;
1127	96	50				212	if (defined $i) {
1128	96					169	my $s = $self->value($i);
1129	96	100				232	if (substr($self->{data}->name($i), -1) eq '0') {
1130							# compensate for 0-based index
1131	17					79	return $s + 1;
1132							}
1133							else {
1134	79					227	return $s;
1135							}
1136							}
1137	0					0	return;
1138							}
1139
1140							*stop = \&end;
1141							sub end {
1142	28			28	1	56	my $self = shift;
1143	28	100				62	if ($_[0]) {
1144							# update only if we have an actual column
1145	1					7	my $i = $self->{data}->stop_column;
1146	1	50				11	my $d = $_[0] =~ /^\d+$/ ? 1 : 0;
1147	1	50	33			8	if (defined $i and $d) {
		0
		0
1148	1					4	return $self->value($i, $_[0]);
1149							}
1150							elsif (not $d) {
1151	0					0	carp "End coordinate value is not an integer";
1152							}
1153							elsif (exists $self->{feature}) {
1154	0					0	carp "Unable to update End coordinate for parsed SeqFeature objects";
1155							}
1156							else {
1157	0					0	carp "No End coordinate column to update!";
1158							}
1159							}
1160							# seqfeature
1161	27	50				62	if (exists $self->{feature}) {
1162	0					0	my $e = $self->{feature}->end;
1163	0					0	$self->{end} = $e;
1164	0					0	return $e;
1165							}
1166							# collect from table
1167	27					85	my $i = $self->{data}->stop_column;
1168	27	50				73	my $e = defined $i ? $self->value($i) : undef;
1169	27					54	$self->{end} = $e;
1170	27					92	return $e;
1171							}
1172
1173							sub strand {
1174	25			25	1	50	my $self = shift;
1175	25	100				78	if ($_[0]) {
1176							# update only if we have an actual column
1177	1					7	my $i = $self->{data}->strand_column;
1178	1	50				5	if (defined $i) {
		50
1179	0					0	$self->value($i, $_[0]);
1180	0					0	return $self->_strand($_[0]);
1181							}
1182							elsif (exists $self->{feature}) {
1183	0					0	carp "Unable to update Strand for parsed SeqFeature objects";
1184							}
1185							else {
1186	1					225	carp "No Strand column to update!";
1187							}
1188							}
1189							# seqfeature
1190	25	100				165	if (exists $self->{feature}) {
1191	4					16	my $s = $self->{feature}->strand;
1192	4					28	return $s;
1193							}
1194							# collect from table
1195	21					64	my $i = $self->{data}->strand_column;
1196	21	100				72	return defined $i ? $self->_strand( $self->value($i) ) : 0;
1197							}
1198
1199							sub _strand {
1200	18			18		26	my $self = shift;
1201	18					38	my $str = shift;
1202	18	50	33			100	if ($str eq '1' or $str eq '-1' or $str eq '0') {
		50	33
		50
		0
		0
		0
1203	0					0	return $str;
1204							}
1205							elsif ($str eq '+') {
1206	0					0	return 1;
1207							}
1208							elsif ($str eq '-') {
1209	18					40	return -1;
1210							}
1211							elsif ($str eq '.') {
1212	0					0	return 0;
1213							}
1214							elsif ($str eq '2') {
1215							# R packages use 1 for + and 2 for -
1216	0					0	return -1;
1217							}
1218							elsif ($str eq '*') {
1219							# R packages use * for .
1220	0					0	return 0;
1221							}
1222							else {
1223	0					0	return 0;
1224							}
1225							}
1226
1227							*name = \&display_name;
1228							sub display_name {
1229	75			75	1	1663	my $self = shift;
1230	75	100				154	if ($_[0]) {
1231							# update only if we have an actual column
1232	1					6	my $i = $self->{data}->name_column;
1233	1	50				5	if (defined $i) {
		0
1234	1					4	return $self->value($i, $_[0]);
1235							}
1236							elsif (exists $self->{feature}) {
1237	0					0	carp "Unable to update display_name for parsed SeqFeature objects";
1238							}
1239							else {
1240	0					0	carp "No Name column to update!";
1241							}
1242							}
1243							# seqfeature
1244	74	100				155	if (exists $self->{feature}) {
1245	4					19	return $self->{feature}->display_name;
1246							}
1247							# collect from table
1248	70					176	my $i = $self->{data}->name_column;
1249	70	50				144	if (defined $i) {
		0
1250	70					163	return $self->value($i);
1251							}
1252							elsif (my $att = $self->gff_attributes) {
1253							return $att->{Name} \|\| $att->{ID} \|\| $att->{transcript_name} \|\|
1254	0		0			0	$att->{gene_name} \|\| undef;
1255							}
1256							}
1257
1258							sub coordinate {
1259	3			3	1	10	my $self = shift;
1260	3	50				14	carp "name is a read only method" if @_;
1261							# to avoid auto-converting start0 coordinates, which might confuse people or programs,
1262							# we will take the start value as is when it's available, otherwise calculate start
1263	3					16	my $start_i = $self->{data}->start_column;
1264							my $coord = sprintf("%s:%d", $self->seq_id,
1265							defined $start_i ? $self->value($start_i) :
1266	3	0				14	exists $self->{feature} ? $self->{feature}->start : 0);
		50
1267	3					12	my $end = $self->end;
1268	3	50				22	$coord .= "-$end" if $end;
1269	3	50				20	return CORE::length($coord) > 2 ? $coord : undef;
1270							}
1271
1272							sub type {
1273	9			9	1	24	my $self = shift;
1274	9	100				30	if ($_[0]) {
1275							# update only if we have an actual column
1276	1					5	my $i = $self->{data}->type_column;
1277	1	50				11	if (defined $i) {
		50
1278	0					0	return $self->value($i, $_[0]);
1279							}
1280							elsif (exists $self->{feature}) {
1281	0					0	carp "Unable to update primary_tag for parsed SeqFeature objects";
1282							}
1283							else {
1284	1					78	carp "No Type column to update!";
1285							}
1286							}
1287							# collect from table
1288	9					126	my $i = $self->{data}->type_column;
1289	9	50				29	if (defined $i) {
1290	0					0	return $self->value($i);
1291							}
1292							# seqfeature
1293	9	100				30	if (exists $self->{feature}) {
1294	4					19	return $self->{feature}->primary_tag;
1295							}
1296							# general metadata
1297	5	50				14	if ($self->{data}->feature) {
1298	5					13	return $self->{data}->feature;
1299							}
1300	0					0	return undef;
1301							}
1302
1303							*id = \&primary_id;
1304							sub primary_id {
1305	65			65	1	97	my $self = shift;
1306	65	50				127	carp "id is a read only method" if @_;
1307	65					138	my $i = $self->{data}->id_column;
1308	65	50				172	my $v = $self->value($i) if defined $i;
1309	65	50	33			221	if (defined $v and $v ne '.') {
1310	65					202	return $v;
1311							}
1312	0	0				0	return $self->{feature}->primary_id if exists $self->{feature};
1313	0	0				0	if (my $att = $self->gff_attributes) {
1314	0		0			0	return $att->{ID} \|\| $att->{Name} \|\| $att->{transcript_id};
1315							}
1316	0					0	return undef;
1317							}
1318
1319							sub length {
1320	0			0	1	0	my $self = shift;
1321	0	0				0	carp "length is a read only method" if @_;
1322	0	0				0	if ($self->{data}->vcf) {
1323							# special case for vcf files, measure the length of the ALT allele
1324	0					0	return CORE::length($self->value(4));
1325							}
1326	0					0	my $s = $self->start;
1327	0					0	my $e = $self->end;
1328	0	0	0			0	if (defined $s and defined $e) {
		0
1329	0					0	return $e - $s + 1;
1330							}
1331							elsif (defined $s) {
1332	0					0	return 1;
1333							}
1334							else {
1335	0					0	return undef;
1336							}
1337							}
1338
1339							sub score {
1340	0			0	1	0	my $self = shift;
1341	0					0	my $c = $self->{data}->score_column;
1342	0	0				0	return defined $c ? $self->value($c) : undef;
1343							}
1344
1345							sub attributes {
1346	0			0	1	0	my $self = shift;
1347	0	0				0	return $self->gff_attributes if ($self->{data}->gff);
1348	0	0				0	return $self->vcf_attributes if ($self->{data}->vcf);
1349	0					0	return;
1350							}
1351
1352							sub gff_attributes {
1353	0			0	1	0	my $self = shift;
1354	0	0				0	return unless ($self->{data}->gff);
1355	0	0				0	return $self->{attributes} if (exists $self->{attributes});
1356	0					0	$self->{attributes} = {};
1357	0					0	foreach my $g (split(/\s;\s/, $self->value(8))) {
1358	0					0	my ($tag, $value) = split /\s+/, $g;
1359	0	0	0			0	next unless ($tag and $value);
1360							# unescape URL encoded values, borrowed from Bio::DB::GFF
1361	0					0	$value =~ tr/+/ /;
1362	0					0	$value =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
	0					0
1363	0					0	$self->{attributes}->{$tag} = $value;
1364							}
1365	0					0	return $self->{attributes};
1366							}
1367
1368							sub vcf_attributes {
1369	0			0	1	0	my $self = shift;
1370	0	0				0	return unless ($self->{data}->vcf);
1371	0	0				0	return $self->{attributes} if (exists $self->{attributes});
1372	0					0	$self->{attributes} = {};
1373
1374							# INFO attributes
1375	0					0	my %info;
1376	0	0				0	if ($self->{data}->name(7) eq 'INFO') {
1377	0	0				0	%info = map {$_->[0] => defined $_->[1] ? $_->[1] : undef}
1378							# some tags are simple and have no value, eg SOMATIC
1379	0					0	map { [split(/=/, $_)] }
	0					0
1380							split(/;/, $self->value(7));
1381							}
1382	0					0	$self->{attributes}->{INFO} = \%info;
1383	0					0	$self->{attributes}->{7} = \%info;
1384
1385							# Sample attributes
1386	0	0				0	if ($self->{data}->number_columns > 8) {
1387	0					0	my @formatKeys = split /:/, $self->value(8);
1388	0					0	foreach my $i (9 .. $self->{data}->last_column) {
1389	0					0	my $name = $self->{data}->name($i);
1390	0					0	my @sampleVals = split /:/, $self->value($i);
1391							my %sample = map {
1392	0	0				0	$formatKeys[$_] => defined $sampleVals[$_] ? $sampleVals[$_] : undef }
	0					0
1393							(0 .. $#formatKeys);
1394	0					0	$self->{attributes}->{$name} = \%sample;
1395	0					0	$self->{attributes}->{$i} = \%sample;
1396							}
1397							}
1398	0					0	return $self->{attributes};
1399							}
1400
1401							sub rewrite_attributes {
1402	0			0	1	0	my $self = shift;
1403	0	0				0	return $self->rewrite_gff_attributes if ($self->{data}->gff);
1404	0	0				0	return $self->rewrite_vcf_attributes if ($self->{data}->vcf);
1405	0					0	return;
1406							}
1407
1408							sub rewrite_gff_attributes {
1409	0			0	1	0	my $self = shift;
1410	0	0				0	return unless ($self->{data}->gff);
1411	0	0				0	return unless exists $self->{attributes};
1412	0					0	my @pairs; # of key=value items
1413	0	0				0	if (exists $self->{attributes}{ID}) {
1414							# I assume this does not need to be escaped!
1415	0					0	push @pairs, 'ID=' . $self->{attributes}{ID};
1416							}
1417	0	0				0	if (exists $self->{attributes}{Name}) {
1418	0					0	my $name = $self->{attributes}{Name};
1419	0					0	$name =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0					0
1420	0					0	push @pairs, "Name=$name";
1421							}
1422	0					0	foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes} }) {
	0					0
	0					0
1423	0	0				0	next if $key eq 'ID';
1424	0	0				0	next if $key eq 'Name';
1425	0					0	my $value = $self->{attributes}{$key};
1426	0					0	$key =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0					0
1427	0					0	$value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0					0
1428	0					0	push @pairs, "$key=$value";
1429							}
1430	0					0	$self->value(8, join("; ", @pairs));
1431	0					0	return 1;
1432							}
1433
1434							sub rewrite_vcf_attributes {
1435	0			0	1	0	my $self = shift;
1436	0	0				0	return unless ($self->{data}->vcf);
1437	0	0				0	return unless exists $self->{attributes};
1438
1439							# INFO
1440							my $info = join(';',
1441							map {
1442							defined $self->{attributes}->{INFO}{$_} ?
1443	0	0				0	join('=', $_, $self->{attributes}->{INFO}{$_}) : $_
1444							}
1445	0					0	sort {$a cmp $b}
1446	0					0	keys %{$self->{attributes}->{INFO}}
	0					0
1447							);
1448	0		0			0	$info \|\|= '.'; # sometimes we have nothing left
1449	0					0	$self->value(7, $info);
1450
1451							# FORMAT
1452	0					0	my @order;
1453	0	0				0	push @order, 'GT' if exists $self->{attributes}{9}{GT};
1454	0					0	foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes}{9} } ) {
	0					0
	0					0
1455	0	0				0	next if $key eq 'GT';
1456	0					0	push @order, $key;
1457							}
1458	0	0				0	if (@order) {
1459	0					0	$self->value(8, join(':', @order));
1460							}
1461							else {
1462	0					0	$self->value(8, '.');
1463							}
1464
1465							# SAMPLES
1466	0					0	foreach my $i (9 .. $self->{data}->last_column) {
1467	0	0				0	if (@order) {
1468							$self->value($i, join(":",
1469	0					0	map { $self->{attributes}{$i}{$_} } @order
	0					0
1470							) );
1471							}
1472							else {
1473	0					0	$self->value($i, '.');
1474							}
1475							}
1476	0					0	return 1;
1477							}
1478
1479							### Data collection convenience methods
1480
1481							*feature = \&seqfeature;
1482							sub seqfeature {
1483	9			9	1	15	my $self = shift;
1484	9		50			27	my $force = shift \|\| 0;
1485	9	50				20	carp "feature is a read only method" if @_;
1486	9	50				21	return $self->{feature} if exists $self->{feature};
1487							# normally this is only for named features in a data table
1488							# skip this for coordinate features like bed files
1489	9	50	33			20	return unless $self->feature_type eq 'named' or $force;
1490
1491							# retrieve from main Data store
1492	0					0	my $f = $self->{data}->get_seqfeature( $self->{'index'} );
1493	0	0				0	if ($f) {
1494	0					0	$self->{feature} = $f;
1495	0					0	return $f;
1496							}
1497
1498							# retrieve the feature from the database
1499	0	0				0	return unless $self->{data}->database;
1500							$f = get_db_feature(
1501							'db' => $self->{data}->open_meta_database,
1502							'id' => $self->id \|\| undef,
1503							'name' => $self->name \|\| undef,
1504							'type' => $self->type \|\| $self->{data}->feature,
1505	0		0			0	);
			0
			0
1506	0	0				0	return unless $f;
1507	0					0	$self->{feature} = $f;
1508	0					0	return $f;
1509							}
1510
1511							sub segment {
1512	1			1	1	3	my $self = shift;
1513	1	50				4	carp "segment is a read only method" if @_;
1514	1	50				4	return unless $self->{data}->database;
1515	1	50				5	if ($self->feature_type eq 'coordinate') {
		0
1516	1					3	my $chromo = $self->seq_id;
1517	1					2	my $start = $self->start;
1518	1		33			3	my $stop = $self->end \|\| $start;
1519	1					4	my $db = $self->{data}->open_meta_database;
1520	1	50				7	return $db ? $db->segment($chromo, $start, $stop) : undef;
1521							}
1522							elsif ($self->feature_type eq 'named') {
1523	0					0	my $f = $self->feature;
1524	0	0				0	return $f ? $f->segment : undef;
1525							}
1526							else {
1527	0					0	return undef;
1528							}
1529							}
1530
1531							sub get_features {
1532	0			0	1	0	my $self = shift;
1533	0					0	my %args = @_;
1534	0		0			0	my $db = $args{db} \|\| $self->{data}->open_meta_database \|\| undef;
1535	0	0				0	carp "no database defined to get features!" unless defined $db;
1536	0	0				0	return unless $db->can('features');
1537
1538							# convert the argument style for most bioperl db APIs
1539	0					0	my %opts;
1540	0		0			0	$opts{-seq_id} = $args{chromo} \|\| $self->seq_id;
1541	0		0			0	$opts{-start} = $args{start} \|\| $self->start;
1542	0		0			0	$opts{-end} = $args{end} \|\| $self->end;
1543	0		0			0	$opts{-type} = $args{type} \|\| $self->type;
1544
1545	0					0	return $db->features(%opts);
1546							}
1547
1548							sub get_sequence {
1549	0			0	1	0	my $self = shift;
1550	0					0	my %args = @_;
1551	0		0			0	my $db = $args{db} \|\| $args{database} \|\| $self->{data}->open_meta_database \|\| undef;
1552							# this will fail immediately if user doesn't provide valid database
1553
1554							# get sequence over subfeatures
1555	0		0			0	$args{subfeature} \|\|= undef;
1556	0	0	0			0	if ($self->feature_type eq 'named' and $args{subfeature}) {
1557							# this is more complicated so we have a dedicated method
1558	0					0	return $self->_get_subfeature_sequence($db, \%args);
1559							}
1560
1561							# get coordinates
1562	0		0			0	my $seqid = $args{seq_id} \|\| $args{chromo} \|\| $self->seq_id;
1563	0		0			0	my $start = $args{start} \|\| $self->start;
1564	0		0			0	my $stop = $args{stop} \|\| $args{end} \|\| $self->end;
1565	0					0	my $strand = $self->strand;
1566	0	0				0	if (exists $args{strand}) {
1567							# user supplied strand, gotta check it
1568	0	0				0	$strand = $args{strand} =~ /\-\|r/i ? -1 : 1;
1569							}
1570	0	0	0			0	if (exists $args{extend} and $args{extend}) {
1571	0					0	$start -= $args{extend};
1572	0	0				0	$start = 1 if $start <= 0;
1573	0					0	$stop += $args{extend};
1574							}
1575	0	0	0			0	return unless (defined $seqid and defined $start and defined $stop);
			0
1576
1577							# retrieve and return sequence
1578	0					0	my $seq = get_genomic_sequence($db, $seqid, $start, $stop);
1579	0	0				0	if ($strand == -1) {
1580	0					0	$seq =~ tr/gatcGATC/ctagCTAG/;
1581	0					0	$seq = reverse $seq;
1582							}
1583	0					0	return $seq;
1584							}
1585
1586							sub _get_subfeature_sequence {
1587	0			0		0	my ($self, $db, $args) = @_;
1588
1589							# get the subfeatures
1590	0					0	my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1591	0	0				0	unless (@$subfeatures) {
1592	0					0	carp "no subfeatures available! Returning parent sequence!";
1593							# just return the parent
1594	0					0	undef $args->{subfeature};
1595	0					0	return $self->get_sequence(@$args);
1596							}
1597
1598							# sort subfeatures
1599							# this should be done by GeneTools in most cases but just to be sure
1600							# note that this does NOT merge redundant or overlapping exons!!!!
1601	0					0	my @sorted = map { $_->[0] }
1602	0	0				0	sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
1603	0					0	map { [$_, $_->start, $_->end] }
	0					0
1604							@$subfeatures;
1605
1606							# collect sequence
1607	0					0	my $sequence;
1608	0					0	foreach my $subf (@sorted) {
1609	0					0	my $seq = get_genomic_sequence($db, $subf->seq_id, $subf->start, $subf->stop);
1610	0					0	$sequence .= $seq;
1611							}
1612
1613							# flip the sequence
1614	0	0				0	if ($self->strand == -1) {
1615	0					0	$sequence =~ tr/gatcGATC/ctagCTAG/;
1616	0					0	$sequence = reverse $sequence;
1617							}
1618	0					0	return $sequence;
1619							}
1620
1621							sub _get_subfeatures {
1622	0			0		0	my $self = shift;
1623	0					0	my $subf = lc shift;
1624
1625							# load GeneTools
1626	0	0				0	unless ($GENETOOL_LOADED) {
1627	0					0	load('Bio::ToolBox::GeneTools', qw(get_exons get_cds get_5p_utrs get_3p_utrs
1628							get_introns));
1629	0	0				0	if ($@) {
1630	0					0	croak "missing required modules! $@";
1631							}
1632							else {
1633	0					0	$GENETOOL_LOADED = 1;
1634							}
1635							}
1636
1637							# feature
1638	0					0	my $feature = $self->seqfeature;
1639	0	0				0	return unless ($feature);
1640
1641							# get the subfeatures
1642	0					0	my @subfeatures;
1643	0	0				0	if ($subf eq 'exon') {
		0
		0
		0
		0
1644	0					0	@subfeatures = get_exons($feature);
1645							}
1646							elsif ($subf eq 'cds') {
1647	0					0	@subfeatures = get_cds($feature);
1648							}
1649							elsif ($subf eq '5p_utr') {
1650	0					0	@subfeatures = get_5p_utrs($feature);
1651							}
1652							elsif ($subf eq '3p_utr') {
1653	0					0	@subfeatures = get_3p_utrs($feature);
1654							}
1655							elsif ($subf eq 'intron') {
1656	0					0	@subfeatures = get_introns($feature);
1657							}
1658							else {
1659	0					0	croak "unrecognized subfeature parameter '$subf'!";
1660							}
1661
1662	0					0	return \@subfeatures;
1663							}
1664
1665							sub get_score {
1666	7			7	1	7295	my $self = shift;
1667	7					38	my %args = @_; # passed arguments to this method
1668
1669							# verify the dataset for the user, cannot trust whether it has been done or not
1670	7		50			57	my $db = $args{ddb} \|\| $args{db} \|\| $self->{data}->open_meta_database \|\| undef;
1671	7					34	$args{dataset} = $self->{data}->verify_dataset($args{dataset}, $db);
1672	7	50				20	unless ($args{dataset}) {
1673	0					0	croak "provided dataset was unrecognized format or otherwise could not be verified!";
1674							}
1675
1676							# get positioned scores over subfeatures only
1677	7		50			41	$args{subfeature} \|\|= q();
1678	7	0	33			20	if ($self->feature_type eq 'named' and $args{subfeature}) {
1679							# this is more complicated so we have a dedicated method
1680	0					0	return $self->_get_subfeature_scores($db, \%args);
1681							}
1682
1683							# build parameter array to pass on to the adapter
1684	7					11	my @params;
1685
1686							# verify coordinates based on type of feature
1687	7	50				25	if ($self->feature_type eq 'coordinate') {
		0
1688							# coordinates are already in the table, use those
1689	7		33			31	$params[CHR] = $args{seq_id} \|\| $self->seq_id;
1690	7		33			27	$params[STRT] = $args{start} \|\| $self->start;
1691	7		33			43	$params[STOP] = $args{stop} \|\| $args{end} \|\| $self->end;
1692							$params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1693	7	50	33			25	$self->strand;
1694							}
1695							elsif ($self->feature_type eq 'named') {
1696							# must retrieve feature from the database first
1697	0					0	my $f = $self->seqfeature;
1698	0	0				0	return unless $f;
1699	0		0			0	$params[CHR] = $args{seq_id} \|\| $f->seq_id;
1700	0		0			0	$params[STRT] = $args{start} \|\| $f->start;
1701	0		0			0	$params[STOP] = $args{stop} \|\| $args{end} \|\| $f->end;
1702							$params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1703	0	0	0			0	$f->strand;
1704							}
1705							else {
1706	0					0	croak "data table does not have identifiable coordinate or feature identification columns for score collection";
1707							}
1708
1709							# adjust coordinates as necessary
1710	7	0	33			20	if (exists $args{extend} and $args{extend}) {
1711	0					0	$params[STRT] -= $args{extend};
1712	0					0	$params[STOP] += $args{extend};
1713							}
1714
1715							# check coordinates
1716	7	50				22	$params[STRT] = 1 if $params[STRT] <= 0;
1717	7	50				22	if ($params[STOP] < $params[STRT]) {
1718							# coordinates are flipped, reverse strand
1719	0	0				0	return if ($params[STOP] <= 0);
1720	0					0	my $stop = $params[STRT];
1721	0					0	$params[STRT] = $params[STOP];
1722	0					0	$params[STOP] = $stop;
1723	0					0	$params[STR] = -1;
1724							}
1725	7	50	33			28	return unless ($params[CHR] and defined $params[STRT]);
1726
1727							# score attributes
1728	7		50			18	$params[METH] = $args{'method'} \|\| 'mean';
1729	7		100			38	$params[STND] = $args{strandedness} \|\| $args{stranded} \|\| 'all';
1730
1731							# other parameters
1732	7					11	$params[DB] = $db;
1733	7					11	$params[RETT] = 0; # return type should be a calculated value
1734	7					10	$params[DATA] = $args{dataset};
1735
1736							# get the score
1737	7					25	return get_segment_score(@params);
1738							}
1739
1740							sub _get_subfeature_scores {
1741	0			0		0	my ($self, $db, $args) = @_;
1742
1743							# get the subfeatures
1744	0					0	my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1745	0	0				0	unless (@$subfeatures) {
1746	0					0	carp "no subfeatures available! Returning parent score data!";
1747							# just return the parent
1748	0					0	undef $args->{subfeature};
1749	0	0				0	delete $args->{exon} if exists $args->{exon};
1750	0					0	return $self->get_score(@$args);
1751							}
1752
1753							# collect over each subfeature
1754	0					0	my @scores;
1755	0					0	foreach my $exon (@$subfeatures) {
1756	0					0	my @params; # parameters to pass on to adapter
1757	0					0	$params[CHR] = $exon->seq_id;
1758	0					0	$params[STRT] = $exon->start;
1759	0					0	$params[STOP] = $exon->end;
1760	0	0				0	$params[STR] = defined $args->{strand} ? $args->{strand} : $exon->strand;
1761	0		0			0	$params[STND] = $args->{strandedness} \|\| $args->{stranded} \|\| 'all';
1762	0		0			0	$params[METH] = $args->{method} \|\| 'mean';
1763	0					0	$params[RETT] = 1; # return type should be an array reference of scores
1764	0					0	$params[DB] = $db;
1765	0					0	$params[DATA] = $args->{dataset};
1766
1767	0					0	my $exon_scores = get_segment_score(@params);
1768	0	0				0	push @scores, @$exon_scores if defined $exon_scores;
1769							}
1770
1771							# combine all the scores based on the requested method
1772	0					0	return calculate_score($args->{method}, \@scores);
1773							}
1774
1775							sub get_relative_point_position_scores {
1776	1			1	1	2393	my $self = shift;
1777	1					7	my %args = @_;
1778
1779							# get the database and verify the dataset
1780	1		33			15	my $ddb = $args{ddb} \|\| $args{db} \|\| $self->{data}->open_meta_database;
1781	1					14	$args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1782	1	50				7	unless ($args{dataset}) {
1783	0					0	croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1784							}
1785
1786							# assign some defaults
1787	1		50			9	$args{strandedness} \|\|= $args{stranded} \|\| 'all';
			33
1788	1		50			4	$args{position} \|\|= 5;
1789	1		50			8	$args{coordinate} \|\|= undef;
1790	1		50			7	$args{avoid} \|\|= undef;
1791	1		50			7	$args{'method'} \|\|= 'mean'; # in most cases this doesn't do anything
1792	1	50				6	unless ($args{extend}) {
1793	0					0	croak "must provide an extend value!";
1794							}
1795	1	50	33			6	$args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1796
1797							# determine reference coordinate
1798	1	50				14	$self->_calculate_reference(\%args) unless defined $args{coordinate};
1799
1800							# build parameter array to pass on to the adapter
1801	1					5	my @params;
1802	1					6	$params[CHR] = $self->seq_id;
1803	1					5	$params[STRT] = $args{coordinate} - $args{extend};
1804	1	50				4	$params[STRT] = 1 if $params[STRT] < 1; # sanity check
1805	1					3	$params[STOP] = $args{coordinate} + $args{extend};
1806	1	50				6	$params[STR] = defined $args{strand} ? $args{strand} : $self->strand;
1807	1					4	$params[STND] = $args{strandedness};
1808	1					2	$params[METH] = $args{'method'};
1809	1					3	$params[RETT] = 2; # return type should be a hash reference of positioned scores
1810	1					3	$params[DB] = $ddb;
1811	1					2	$params[DATA] = $args{dataset};
1812
1813							# Data collection
1814	1					5	my $pos2data = get_segment_score(@params);
1815
1816							# Avoid positions
1817	1	50				6	if ($args{avoid}) {
1818	0					0	$self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
1819							}
1820
1821							# covert to relative positions
1822	1	50				5	if ($args{absolute}) {
1823							# do not convert to relative positions
1824	0	0				0	return wantarray ? %$pos2data : $pos2data;
1825							}
1826							else {
1827							# return the collected dataset hash
1828							return $self->_convert_to_relative_positions($pos2data,
1829	1					5	$args{coordinate}, $params[STR]);
1830							}
1831							}
1832
1833							sub get_region_position_scores {
1834	4			4	1	8311	my $self = shift;
1835	4					21	my %args = @_;
1836
1837							# get the database and verify the dataset
1838	4		33			38	my $ddb = $args{ddb} \|\| $args{db} \|\| $self->{data}->open_meta_database;
1839	4					14	$args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1840	4	50				13	unless ($args{dataset}) {
1841	0					0	croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1842							}
1843
1844							# assign some defaults here, in case we get passed on to subfeature method
1845	4		50			23	$args{strandedness} \|\|= $args{stranded} \|\| 'all';
			33
1846	4		50			18	$args{extend} \|\|= 0;
1847	4		50			16	$args{position} \|\|= 5;
1848	4		100			12	$args{'method'} \|\|= 'mean'; # in most cases this doesn't do anything
1849	4	50	33			24	$args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1850
1851							# get positioned scores over subfeatures only
1852	4		50			19	$args{subfeature} \|\|= q();
1853	4	0	33			13	if ($self->feature_type eq 'named' and $args{subfeature}) {
1854							# this is more complicated so we have a dedicated method
1855	0					0	return $self->_get_subfeature_position_scores(\%args, $ddb);
1856							}
1857
1858							# Assign coordinates
1859							# build parameter array to pass on to the adapter
1860	4					9	my @params;
1861	4		33			12	my $feature = $self->seqfeature \|\| $self;
1862	4		33			22	$params[CHR] = $args{chromo} \|\| $args{seq_id} \|\| $feature->seq_id;
1863	4		33			15	$params[STRT] = $args{start} \|\| $feature->start;
1864	4		33			24	$params[STOP] = $args{stop} \|\| $args{end} \|\| $feature->end;
1865	4	50				14	$params[STR] = defined $args{strand} ? $args{strand} : $feature->strand;
1866	4	50				11	if ($args{extend}) {
1867	0					0	$params[STRT] -= $args{extend};
1868	0					0	$params[STOP] += $args{extend};
1869	0	0				0	$params[STRT] = 1 if $params[STRT] < 1; # sanity check
1870							}
1871	4					9	$params[STND] = $args{strandedness};
1872	4					6	$params[METH] = $args{method};
1873	4					10	$params[RETT] = 2; # return type should be a hash reference of positioned scores
1874	4					7	$params[DB] = $ddb;
1875	4					7	$params[DATA] = $args{dataset};
1876
1877							# Data collection
1878	4					13	my $pos2data = get_segment_score(@params);
1879
1880							# Avoid positions
1881	4	50				12	if ($args{avoid}) {
1882	0					0	$self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
1883							}
1884
1885							# covert to relative positions
1886	4	50				11	if ($args{absolute}) {
1887							# do not convert to relative positions
1888	0	0				0	return wantarray ? %$pos2data : $pos2data;
1889							}
1890							else {
1891							# return data converted to relative positions
1892	4	50				20	$self->_calculate_reference(\%args) unless defined $args{coordinate};
1893							return $self->_convert_to_relative_positions($pos2data,
1894	4					17	$args{coordinate}, $params[STR]);
1895							}
1896							}
1897
1898							sub _get_subfeature_position_scores {
1899	0			0		0	my ($self, $args, $ddb) = @_;
1900
1901							# get the subfeatures
1902	0					0	my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1903	0	0				0	unless (@$subfeatures) {
1904	0					0	carp "no subfeatures available! Returning parent score data!";
1905							# just return the parent
1906	0					0	undef $args->{subfeature};
1907	0	0				0	delete $args->{exon} if exists $args->{exon};
1908	0					0	return $self->get_sequence(@$args);
1909							}
1910
1911							# reset the practical start and stop to the actual subfeatures' final start and stop
1912							# we can no longer rely on the feature start and stop, consider CDS
1913							# these subfeatures should already be genomic sorted by GeneTools
1914	0					0	my $practical_start = $subfeatures->[0]->start;
1915	0					0	my $practical_stop = $subfeatures->[-1]->end;
1916
1917							# collect over each exon
1918							# we will adjust the positions of each reported score so that
1919							# it will appear as if all the exons are adjacent to each other
1920							# and no introns exist
1921	0					0	my $pos2data = {};
1922	0					0	my $namecheck = {}; # to check unique names when using ncount method....
1923	0					0	my $current_end = $practical_start;
1924	0					0	my $adjustment = 0;
1925	0	0				0	my $fstrand = defined $args->{strand} ? $args->{strand} : $self->strand;
1926	0					0	foreach my $exon (@$subfeatures) {
1927
1928	0					0	my @params; # parameters to pass on to adapter
1929	0					0	$params[CHR] = $exon->seq_id;
1930	0					0	$params[STRT] = $exon->start;
1931	0					0	$params[STOP] = $exon->end;
1932	0					0	$params[STR] = $fstrand;
1933	0					0	$params[STND] = $args->{strandedness};
1934	0					0	$params[METH] = $args->{method};
1935	0					0	$params[RETT] = 2; # return type should be a hash reference of positioned scores
1936	0					0	$params[DB] = $ddb;
1937	0					0	$params[DATA] = $args->{dataset};
1938
1939							# collect scores
1940	0					0	my $exon_scores = get_segment_score(@params);
1941
1942							# adjust the scores
1943	0					0	$adjustment = $params[STRT] - $current_end;
1944							$self->_process_exon_scores($exon_scores, $pos2data, $adjustment, $params[STRT],
1945	0					0	$params[STOP], $namecheck, $args->{method});
1946
1947							# reset
1948	0					0	$current_end += $exon->length;
1949							}
1950
1951							# collect extensions if requested
1952	0	0				0	if ($args->{extend}) {
1953							# left side
1954	0					0	my @params; # parameters to pass on to adapter
1955	0					0	$params[CHR] = $self->seq_id;
1956	0					0	$params[STRT] = $practical_start - $args->{extend};
1957	0					0	$params[STOP] = $practical_start - 1;
1958	0					0	$params[STR] = $fstrand;
1959	0					0	$params[STND] = $args->{strandedness};
1960	0					0	$params[METH] = $args->{method};
1961	0					0	$params[RETT] = 2; # return type should be a hash reference of positioned scores
1962	0					0	$params[DB] = $ddb;
1963	0					0	$params[DATA] = $args->{dataset};
1964
1965	0					0	my $ext_scores = get_segment_score(@params);
1966
1967							# no adjustment should be needed
1968							$self->_process_exon_scores($ext_scores, $pos2data, 0, $params[STRT],
1969	0					0	$params[STOP], $namecheck, $args->{method});
1970
1971
1972							# right side
1973							# we can reuse our parameter array
1974	0					0	$params[STRT] = $practical_stop + 1;
1975	0					0	$params[STOP] = $practical_stop + $args->{extend};
1976	0					0	$ext_scores = get_segment_score(@params);
1977
1978							# the adjustment should be the same as the last exon
1979							$self->_process_exon_scores($ext_scores, $pos2data, $adjustment, $params[STRT],
1980	0					0	$params[STOP], $namecheck, $args->{method});
1981							}
1982
1983							# covert to relative positions
1984	0	0				0	if ($args->{absolute}) {
1985							# do not convert to relative positions
1986	0	0				0	return wantarray ? %$pos2data : $pos2data;
1987							}
1988							else {
1989							# return data converted to relative positions
1990							# can no longer use original coordinates, but instead the new shifted coordinates
1991	0					0	$args->{practical_start} = $practical_start;
1992	0					0	$args->{practical_stop} = $current_end;
1993	0					0	$self->_calculate_reference($args);
1994							return $self->_convert_to_relative_positions($pos2data,
1995	0					0	$args->{coordinate}, $fstrand);
1996							}
1997							}
1998
1999							sub _calculate_reference {
2000	5			5		12	my ($self, $args) = @_;
2001	5		33			14	my $feature = $self->seqfeature \|\| $self;
2002	5	50				30	my $strand = defined $args->{strand} ? $args->{strand} : $feature->strand;
2003	5	50	33			40	if ($args->{position} == 5 and $strand >= 0) {
		50	33
		50	33
		0	0
		0
2004	0		0			0	$args->{coordinate} = $args->{practical_start} \|\| $feature->start;
2005							}
2006							elsif ($args->{position} == 3 and $strand >= 0) {
2007	0		0			0	$args->{coordinate} = $args->{practical_stop} \|\| $feature->end;
2008							}
2009							elsif ($args->{position} == 5 and $strand < 0) {
2010	5		33			16	$args->{coordinate} = $args->{practical_stop} \|\| $feature->end;
2011							}
2012							elsif ($args->{position} == 3 and $strand < 0) {
2013	0		0			0	$args->{coordinate} = $args->{practical_start} \|\| $feature->start;
2014							}
2015							elsif ($args->{position} == 4) {
2016							# strand doesn't matter here
2017	0		0			0	my $s = $args->{practical_start} \|\| $feature->start;
2018	0					0	$args->{coordinate} = $s + int(($feature->length / 2) + 0.5);
2019							}
2020							else {
2021	0					0	croak "position must be one of 5, 3, or 4";
2022							}
2023							}
2024
2025							sub _avoid_positions {
2026	0			0		0	my ($self, $pos2data, $args, $seqid, $start, $stop) = @_;
2027
2028							# first check the list of avoid types
2029	0	0				0	if (ref $args->{avoid} eq 'ARRAY') {
		0
		0
2030							# we have types, presume they're ok
2031							}
2032							elsif ($args->{avoid} eq '1') {
2033							# old style boolean value
2034	0	0				0	if (defined $args->{type}) {
2035	0					0	$args->{avoid} = [ $args->{type} ];
2036							}
2037							else {
2038							# no type provided, we can't avoid that which is not defined!
2039							# this is an error, but won't complain as we never did before
2040	0					0	$args->{avoid} = $self->type;
2041							}
2042							}
2043							elsif ($args->{avoid} =~ /w+/i) {
2044							# someone passed a string, a feature type perhaps?
2045	0					0	$args->{avoid} = [ $args->{avoid} ];
2046							}
2047
2048							### Check for conflicting features
2049	0		0			0	my $db = $args->{db} \|\| $self->{data}->open_meta_database;
2050							my @overlap_features = $self->get_features(
2051							seq_id => $seqid,
2052							start => $start,
2053							end => $stop,
2054							type => $args->{avoid},
2055	0					0	);
2056
2057							# get the overlapping features of the same type
2058	0	0				0	if (@overlap_features) {
2059	0					0	my $primary = $self->primary_id;
2060							# there are one or more feature of the type in this region
2061							# one of them is likely the one we're working with
2062							# but not necessarily - user may be looking outside original feature
2063							# the others are not what we want and therefore need to be
2064							# avoided
2065	0					0	foreach my $feat (@overlap_features) {
2066							# skip the one we want
2067	0	0				0	next if ($feat->primary_id eq $primary);
2068							# now eliminate those scores which overlap this feature
2069	0					0	my $start = $feat->start;
2070	0					0	my $stop = $feat->end;
2071	0					0	foreach my $position (keys %$pos2data) {
2072							# delete the scored position if it overlaps with
2073							# the offending feature
2074	0	0	0			0	if (
2075							$position >= $start and
2076							$position <= $stop
2077							) {
2078	0					0	delete $pos2data->{$position};
2079							}
2080							}
2081							}
2082							}
2083							}
2084
2085							sub _convert_to_relative_positions {
2086	5			5		23	my ($self, $pos2data, $position, $strand) = @_;
2087
2088	5					7	my %relative_pos2data;
2089	5	50				17	if ($strand >= 0) {
		50
2090	0					0	foreach my $p (keys %$pos2data) {
2091	0					0	$relative_pos2data{ $p - $position } = $pos2data->{$p};
2092							}
2093							}
2094							elsif ($strand < 0) {
2095	5					32	foreach my $p (keys %$pos2data) {
2096	223					451	$relative_pos2data{ $position - $p } = $pos2data->{$p};
2097							}
2098							}
2099	5	50				145	return wantarray ? %relative_pos2data : \%relative_pos2data;
2100							}
2101
2102							sub _process_exon_scores {
2103	0			0			my ($self, $exon_scores, $pos2data, $adjustment, $start, $end,
2104							$namecheck, $method) = @_;
2105
2106							# ncount method
2107	0	0					if ($method eq 'ncount') {
2108							# we need to check both names and adjust position
2109	0						foreach my $p (keys %$exon_scores) {
2110	0	0	0				next unless ($p >= $start and $p <= $end);
2111	0						foreach my $n (@{ $exon_scores->{$p} }) {
	0
2112	0	0					if (exists $namecheck->{$n} ) {
2113	0						$namecheck->{$n}++;
2114	0						next;
2115							}
2116							else {
2117	0						$namecheck->{$n} = 1;
2118	0						my $a = $p - $adjustment;
2119	0		0				$pos2data->{$a} \|\|= [];
2120	0						push @{ $pos2data->{$a} }, $n;
	0
2121							}
2122							}
2123							}
2124							}
2125							else {
2126							# just adjust scores
2127	0						foreach my $p (keys %$exon_scores) {
2128	0	0	0				next unless ($p >= $start and $p <= $end);
2129	0						$pos2data->{ $p - $adjustment } = $exon_scores->{$p};
2130							}
2131							}
2132							}
2133
2134							sub fetch_alignments {
2135	0			0	1		my $self = shift;
2136	0						my %args = @_;
2137	0		0				$args{db} \|\|= $args{dataset} \|\| undef;
			0
2138	0		0				$args{data} \|\|= undef;
2139	0		0				$args{callback} \|\|= undef;
2140	0		0				$args{subfeature} \|\|= q();
2141
2142							# verify - trusting that these are valid, else they will fail lower down in the code
2143	0	0					unless ($args{db}) {
2144	0						croak "must provide a Bam object database to fetch alignments!\n";
2145							}
2146	0	0	0				unless ($args{data} and ref($args{data}) eq 'HASH') {
2147	0						croak "must provide a data HASH for the fetch callback!\n";
2148							}
2149	0	0					unless ($args{callback}) {
2150	0						croak "must provide a callback code reference!\n";
2151							}
2152
2153							# array of features to iterate, probably just one or subfeatures
2154	0						my @intervals;
2155	0	0	0				if ($self->feature_type eq 'named' and $args{subfeature}) {
2156							# we have subfeatures to iterate over
2157
2158							# get the subfeatures
2159	0						my $subfeatures = $self->_get_subfeatures($args{subfeature});
2160	0	0					if (@$subfeatures) {
2161	0						foreach my $sf (@$subfeatures) {
2162	0						push @intervals, [
2163							$sf->start - 1,
2164							$sf->end
2165							];
2166							}
2167							}
2168							else {
2169							# zere subfeatures? just take the parent then
2170							push @intervals, [
2171							($args{start} \|\| $self->start) - 1,
2172	0		0				$args{stop} \|\| $args{end} \|\| $self->end
			0
2173							];
2174							}
2175							}
2176							else {
2177							# take feature as is
2178							push @intervals, [
2179							($args{start} \|\| $self->start) - 1,
2180	0		0				$args{stop} \|\| $args{end} \|\| $self->end
			0
2181							];
2182							}
2183
2184							# get the target id for the chromosome
2185							# this will fail if the user didn't provide a real bam object!!!
2186	0						my ($tid, undef, undef) = $args{db}->header->parse_region($self->seq_id);
2187	0	0					return unless defined $tid;
2188
2189							# now iterate over the intervals
2190	0						foreach my $i (@intervals) {
2191	0						$args{data}->{start} = $i->[0];
2192	0						$args{data}->{end} = $i->[1];
2193	0						$args{data}->{strand} = $self->strand;
2194							low_level_bam_fetch(
2195							$args{db},
2196							$tid,
2197							$i->[0],
2198							$i->[1],
2199							$args{callback},
2200							$args{data}
2201	0						);
2202							}
2203
2204							# nothing to return since we're using a data reference
2205	0						return 1;
2206							}
2207
2208
2209
2210							### String export
2211
2212							sub bed_string {
2213	0			0	1		my $self = shift;
2214	0						my %args = @_;
2215	0		0				$args{bed} \|\|= 6; # number of bed columns
2216	0	0					croak "bed count must be an integer!" unless $args{bed} =~ /^\d+$/;
2217	0	0					croak "bed count must be at least 3!" unless $args{bed} >= 3;
2218
2219							# coordinate information
2220	0						$self->seqfeature; # retrieve the seqfeature object first
2221	0		0				my $chr = $args{chromo} \|\| $args{seq_id} \|\| $self->seq_id;
2222	0		0				my $start = $args{start} \|\| $self->start;
2223	0		0				my $stop = $args{stop} \|\| $args{end} \|\| $self->stop \|\|
2224							$start + $self->length - 1 \|\| $start;
2225	0	0	0				if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
			0
			0
2226	0						carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2227	0						return;
2228							}
2229	0						$start -= 1; # 0-based coordinates
2230	0						my $string = "$chr\t$start\t$stop";
2231
2232							# additional information
2233	0	0					if ($args{bed} >= 4) {
2234	0		0				my $name = $args{name} \|\| $self->name \|\| 'Feature_' . $self->line_number;
2235	0						$string .= "\t$name";
2236							}
2237	0	0					if ($args{bed} >= 5) {
2238	0	0					my $score = exists $args{score} ? $args{score} : $self->score;
2239	0	0					$score = 1 unless defined $score;
2240	0						$string .= "\t$score";
2241							}
2242	0	0					if ($args{bed} >= 6) {
2243	0						my $s;
2244	0	0	0				if (exists $args{strand} and defined $args{strand}) {
2245	0						$s = $self->_strand($args{strand});
2246							}
2247							else {
2248	0						$s = $self->strand;
2249							}
2250	0	0					$string .= sprintf("\t%s", $s == -1 ? '-' : '+')
2251							}
2252							# we could go on with other columns, but there's no guarantee that additional
2253							# information is available, and we would have to implement user provided data
2254
2255							# done
2256	0						return $string;
2257							}
2258
2259							sub gff_string {
2260	0			0	1		my $self = shift;
2261	0						my %args = @_;
2262
2263							# coordinate information
2264	0						$self->seqfeature; # retrieve the seqfeature object first
2265	0		0				my $chr = $args{chromo} \|\| $args{seq_id} \|\| $self->seq_id;
2266	0		0				my $start = $args{start} \|\| $self->start;
2267	0		0				my $stop = $args{stop} \|\| $args{end} \|\| $self->stop \|\|
2268							$start + $self->length - 1 \|\| $start;
2269	0	0	0				if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
			0
			0
2270	0						carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2271	0						return;
2272							}
2273	0						my $strand;
2274	0	0	0				if (exists $args{strand} and defined $args{strand}) {
2275	0						$strand = $self->_strand($args{strand});
2276							}
2277							else {
2278	0						$strand = $self->strand;
2279							}
2280	0	0					$strand = $strand == -1 ? '-' : $strand == 1 ? '+' : '.';
		0
2281
2282							# type information
2283	0		0				my $type = $args{type} \|\| $self->type \|\| undef;
2284	0						my ($source, $primary_tag);
2285	0	0	0				if (defined $type and $type =~ /:/) {
2286	0						($primary_tag, $source) = split /:/, $type;
2287							}
2288	0	0					unless ($source) {
2289	0		0				$source = $args{source} \|\| '.';
2290							}
2291	0	0					unless ($primary_tag) {
2292	0	0	0				$primary_tag = $args{primary_tag} \|\| defined $type ? $type : '.';
2293							}
2294
2295							# score
2296	0	0					my $score = exists $args{score} ? $args{score} : $self->score;
2297	0	0					$score = '.' unless defined $score;
2298	0						my $phase = '.'; # do not even bother!!!!
2299
2300							# attributes
2301	0		0				my $name = $args{name} \|\| $self->name \|\| 'Feature_' . $self->line_number;
2302	0						my $attributes = "Name=$name";
2303	0		0				my $id = $args{id} \|\| sprintf("%08d", $self->line_number);
2304	0						$attributes .= ";ID=$id";
2305	0	0	0				if (exists $args{attributes} and ref($args{attributes}) eq 'ARRAY') {
2306	0						foreach my $i (@{$args{attributes}}) {
	0
2307	0						my $k = $self->{data}->name($i);
2308	0						$k =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0
2309	0						my $v = $self->value($i);
2310	0						$v =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0
2311	0						$attributes .= ";$k=$v";
2312							}
2313							}
2314
2315							# done
2316	0						my $string = join("\t", $chr, $source, $primary_tag, $start, $stop, $score,
2317							$strand, $phase, $attributes);
2318	0						return $string;
2319							}
2320
2321
2322							__END__