File Coverage

blib/lib/Bio/ToolBox/Data/Feature.pm
Criterion Covered Total %
statement 228 696 32.7
branch 101 432 23.3
condition 48 319 15.0
subroutine 26 48 54.1
pod 32 34 94.1
total 435 1529 28.4


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::Feature;
2             our $VERSION = '1.68';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::Feature - Objects representing rows in a data table
7              
8             =head1 DESCRIPTION
9              
10             A Bio::ToolBox::Data::Feature is an object representing a row in the
11             data table. Usually, this in turn represents an annotated feature or
12             segment in the genome. As such, this object provides convenient
13             methods for accessing and manipulating the values in a row, as well as
14             methods for working with the represented genomic feature.
15              
16             This class should NOT be used directly by the user. Rather, Feature
17             objects are generated from a Bio::ToolBox::Data::Iterator object
18             (generated itself from the L
19             function in Bio::ToolBox::Data), or the L
20             function in Bio::ToolBox::Data. Please see the respective documentation
21             for more information.
22              
23             Example of working with a stream object.
24              
25             my $Data = Bio::ToolBox::Data->new(file => $file);
26            
27             # stream method
28             my $stream = $Data->row_stream;
29             while (my $row = $stream->next_row) {
30             # each $row is a Bio::ToolBox::Data::Feature object
31             # representing the row in the data table
32             my $value = $row->value($index);
33             # do something with $value
34             }
35            
36             # iterate method
37             $Data->iterate( sub {
38             my $row = shift;
39             my $number = $row->value($index);
40             my $log_number = log($number);
41             $row->value($index, $log_number);
42             } );
43              
44              
45             =head1 METHODS
46              
47             =head2 General information methods
48              
49             =over 4
50              
51             =item row_index
52              
53             Returns the index position of the current data row within the
54             data table. Useful for knowing where you are at within the data
55             table.
56              
57             =item feature_type
58              
59             Returns one of three specific values describing the contents
60             of the data table inferred by the presence of specific column
61             names. This provides a clue as to whether the table features
62             represent genomic regions (defined by coordinate positions) or
63             named database features. The return values include:
64              
65             =over 4
66              
67             =item * coordinate: Table includes at least chromosome and start
68              
69             =item * named: Table includes name, type, and/or Primary_ID
70              
71             =item * unknown: unrecognized
72              
73             =back
74              
75             =item column_name
76              
77             Returns the column name for the given index.
78              
79             item data
80              
81             Returns the parent L object, in case you may
82             have lost it by going out of scope.
83              
84             =back
85              
86             =head2 Methods to access row feature attributes
87              
88             These methods return the corresponding value, if present in the
89             data table, based on the column header name. If the row represents
90             a named database object, try calling the L method first.
91             This will retrieve the database SeqFeature object, and the attributes
92             can then be retrieved using the methods below or on the actual
93             database SeqFeature object.
94              
95             In cases where there is a table column and a corresponding SeqFeature
96             object, for example a start column and a parsed SeqFeature object, the
97             table value takes precedence and is returned. You can always obtain the
98             SeqFeature's value separately and directly.
99              
100             These methods do not set attribute values. If you need to change the
101             values in a table, use the L method below.
102              
103             =over 4
104              
105             =item seq_id
106              
107             The name of the chromosome the feature is on.
108              
109             =item start
110              
111             =item end
112              
113             =item stop
114              
115             The coordinates of the feature or segment. Coordinates from known
116             0-based file formats, e.g. BED, are returned as 1-based. Coordinates
117             must be integers to be returned. Zero or negative start coordinates
118             are assumed to be accidents or poor programming and transformed to 1.
119             Use the L method if you don't want this to happen.
120              
121             =item strand
122              
123             The strand of the feature or segment. Returns -1, 0, or 1. Default is 0,
124             or unstranded.
125              
126             =item name
127              
128             =item display_name
129              
130             The name of the feature.
131              
132             =item coordinate
133              
134             Returns a coordinate string formatted as C.
135              
136             =item type
137              
138             The type of feature. Typically either C or C.
139             In a GFF3 file, this represents columns 3 and 2, respectively. In annotation
140             databases such as L, the type is used to restrict
141             to one of many different types of features, e.g. gene, mRNA, or exon.
142              
143             =item id
144              
145             =item primary_id
146              
147             Here, this represents the C in the database. Note that this number
148             is generally unique to a specific database, and not portable between databases.
149              
150             =item length
151              
152             The length of the feature or segment.
153              
154             =item score
155              
156             Returns the value of the Score column, if one is available. Typically
157             associated with defined file formats, such as GFF files (6th column),
158             BED and related Peak files (5th column), and bedGraph (4th column).
159              
160             =back
161              
162             =head2 Accessing and setting values in the row.
163              
164             =over 4
165              
166             =item value
167              
168             # retrieve a value
169             my $v = $row->value($index);
170             # set a value
171             $row->value($index, $v + 1);
172              
173             Returns or sets the value at a specific column index in the
174             current data row. Null values return a '.', symbolizing an
175             internal null value.
176              
177             =item row_values
178              
179             Returns an array or array reference representing all the values
180             in the current data row.
181              
182             =back
183              
184             =head2 Special feature attributes
185              
186             GFF and VCF files have special attributes in the form of key =E value pairs.
187             These are stored as specially formatted, character-delimited lists in
188             certain columns. These methods will parse this information and return as
189             a convenient hash reference. The keys and values of this hash may be
190             changed, deleted, or added to as desired. To write the changes back to
191             the file, use the L to properly write the attributes
192             back to the file with the proper formatting.
193              
194             =over 4
195              
196             =item attributes
197              
198             Generic method that calls either L or L
199             depending on the data table format.
200              
201             =item gff_attributes
202              
203             Parses the 9th column of GFF files. URL-escaped characters are converted
204             back to text. Returns a hash reference of key =E value pairs.
205              
206             =item vcf_attributes
207              
208             Parses the C (8th column) and all sample columns (10th and higher
209             columns) in a version 4 VCF file. The Sample columns use the C
210             column (9th column) as keys. The returned hash reference has two levels:
211             The first level keys are both the column names and index (0-based). The
212             second level keys are the individual attribute keys to each value.
213             For example:
214              
215             my $attr = $row->vcf_attributes;
216            
217             # access by column name
218             my $genotype = $attr->{sample1}{GT};
219             my $depth = $attr->{INFO}{ADP};
220            
221             # access by 0-based column index
222             my $genotype = $attr->{9}{GT};
223             my $depth = $attr->{7}{ADP}
224              
225             =item rewrite_attributes
226              
227             Generic method that either calls L or
228             L depending on the data table format.
229              
230             =item rewrite_gff_attributes
231              
232             Rewrites the GFF attributes column (the 9th column) based on the
233             contents of the attributes hash that was previously generated with
234             the L method. Useful when you have modified the
235             contents of the attributes hash.
236              
237             =item rewrite_vcf_attributes
238              
239             Rewrite the VCF attributes for the C (8th column), C (9th
240             column), and sample columns (10th and higher columns) based on the
241             contents of the attributes hash that was previously generated with
242             the L method. Useful when you have modified the
243             contents of the attributes hash.
244              
245             =back
246              
247             =head2 Convenience Methods to database functions
248              
249             The next three functions are convenience methods for using the
250             attributes in the current data row to interact with databases.
251             They are wrappers to methods in the L
252             module.
253              
254             =over 4
255              
256             =item seqfeature
257              
258             =item feature
259              
260             Returns a SeqFeature object representing the feature or item in
261             the current row. If the SeqFeature object is stored in the parent
262             C<$Data> object (usually from parsing an annotation file), it is
263             immediately returned. Otherwise, the SeqFeature
264             object is retrieved from the database using the name and
265             type values in the current Data table row. The SeqFeature object
266             is requested from the database named in the general metadata. If
267             an alternate database is desired, you should change it first using
268             the C<$Data>-Edatabase() method. If the feature name or type is not
269             present in the table, then nothing is returned.
270              
271             See L and L for more
272             information about working with these objects. See L
273             about working with database features.
274              
275             This method normally only works with "named" feature types in a
276             L Data table. If your Data table has coordinate
277             information, i.e. chromosome, start, and stop columns, then it will
278             likely be recognized as a "coordinate" feature_type and not work.
279              
280             Pass a true value to this method to force the seqfeature lookup. This
281             will still require the presence of Name, ID, and/or Type columns to
282             perform the database lookup. The L method feature()
283             is used to determine the type if a Type column is not present.
284              
285             =item segment
286              
287             Returns a database Segment object corresponding to the coordinates
288             defined in the Data table row. If a named feature and type are
289             present instead of coordinates, then the feature is first automatically
290             retrieved and a Segment returned based on its coordinates. The
291             database named in the general metadata is used to establish the
292             Segment object. If a different database is desired, it should be
293             changed first using the general L method.
294              
295             See L and L for more information
296             about working with Segment objects.
297              
298             =item get_features
299              
300             my @overlap_features = $row->get_features(type => $type);
301              
302             Returns seqfeature objects from a database that overlap the Feature
303             or interval in the current Data table row. This is essentially a
304             convenience wrapper for a Bio::DB style I method using the
305             coordinates of the Feature. Optionally pass an array of key value pairs
306             to specify alternate coordinates if so desired. Potential keys
307             include
308              
309             =over 4
310              
311             =item seq_id
312              
313             =item start
314              
315             =item end
316              
317             =item type
318              
319             The type of database features to retrieve.
320              
321             =item db
322              
323             An alternate database object to collect from.
324              
325             =back
326              
327             =item get_sequence
328              
329             Fetches genomic sequence based on the coordinates of the current seqfeature
330             or interval in the current Feature. This requires a database that
331             contains the genomic sequence, either the database specified in the
332             Data table metadata or an external indexed genomic fasta file.
333              
334             If the Feature represents a transcript or gene, then a concatenated
335             sequence of the selected subfeatures may be generated and returned. B
336             that redundant or overlapping subfeatures are B merged, and
337             unexpected results may be obtained.
338              
339             The sequence is returned as simple string. If the feature is on the reverse
340             strand, then the reverse complement sequence is automatically returned.
341              
342             Pass an array of key value pairs to specify alternate coordinates if so
343             desired. Potential keys include
344              
345             =over 4
346              
347             =item subfeature
348              
349             Pass a text string representing the type of subfeature from which to collect
350             the sequence. Acceptable values include
351              
352             =over 4
353              
354             =item * exon
355              
356             =item * cds
357              
358             =item * 5p_utr
359              
360             =item * 3p_utr
361              
362             =item * intron
363              
364             =back
365              
366             =item seq_id
367              
368             =item start
369              
370             =item end
371              
372             =item strand
373              
374             =item extend
375              
376             Indicate additional basepairs of sequence added to both sides
377              
378             =item db
379              
380             The fasta file or database from which to fetch the sequence
381              
382             =back
383              
384             =back
385              
386             =head2 Data collection
387              
388             The following methods allow for data collection from various
389             sources, including bam, bigwig, bigbed, useq, Bio::DB databases, etc.
390              
391             =over 4
392              
393             =item get_score
394              
395             my $score = $row->get_score(
396             dataset => 'scores.bw',
397             method => 'max',
398             );
399              
400             This method collects a single score over the feature or interval.
401             Usually a mathematical or statistical value is employed to derive the
402             single score. Pass an array of key value pairs to control data collection.
403             Keys include the following:
404              
405             =over 4
406              
407             =item db
408              
409             =item ddb
410              
411             Specify a Bio::DB database from which to collect the data. The default
412             value is the database specified in the Data table metadata, if present.
413             Examples include a L or L
414             database.
415              
416             =item dataset
417              
418             Specify the name of the dataset. If a database was specified, then this
419             value would be the C or C feature found in the
420             database. Otherwise, the name of a data file, such as a bam, bigWig,
421             bigBed, or USeq file, would be provided here. This options is required!
422              
423             =item method
424              
425             Specify the mathematical or statistical method combining multiple scores
426             over the interval into one value. Options include the following:
427              
428             =over 4
429              
430             =item * mean
431              
432             =item * sum
433              
434             =item * min
435              
436             =item * max
437              
438             =item * median
439              
440             =item * count
441              
442             Count all overlapping items.
443              
444             =item * pcount
445              
446             Precisely count only containing (not overlapping) items.
447              
448             =item * ncount
449              
450             Count overlapping unique names only.
451              
452             =item * range
453              
454             The difference between minimum and maximum values.
455              
456             =item * stddev
457              
458             Standard deviation.
459              
460             =back
461              
462             =item strandedness
463              
464             Specify what strand from which the data should be taken, with respect
465             to the Feature strand. Three options are available. Only really relevant
466             for data sources that support strand.
467              
468             =over 4
469              
470             =item * sense
471              
472             The same strand as the Feature.
473              
474             =item * antisense
475              
476             The opposite strand as the Feature.
477              
478             =item * all
479              
480             Strand is ignored, all is taken (default).
481              
482             =back
483              
484             =item subfeature
485              
486             Specify the subfeature type from which to collect the scores. Typically
487             a SeqFeature object representing a transcript is provided, and the
488             indicated subfeatures are collected from object. Pass the name of the
489             subfeature to use. Accepted values include the following.
490              
491             =over 4
492              
493             =item * exon
494              
495             =item * cds
496              
497             =item * 5p_utr
498              
499             =item * 3p_utr
500              
501             =item * intron
502              
503             =back
504              
505             =item extend
506              
507             Specify the number of basepairs that the Data table Feature's
508             coordinates should be extended in both directions. Ignored
509             when used with the subfeature option.
510              
511             =item seq_id
512              
513             =item chromo
514              
515             =item start
516              
517             =item end
518              
519             =item stop
520              
521             =item strand
522              
523             Optionally specify zero or more alternate coordinates to use.
524             By default, these are obtained from the Data table Feature.
525              
526             =back
527            
528             =item get_relative_point_position_scores
529              
530             while (my $row = $stream->next_row) {
531             my $pos2score = $row->get_relative_point_position_scores(
532             'ddb' => '/path/to/BigWigSet/',
533             'dataset' => 'MyData',
534             'position' => 5,
535             'extend' => 1000,
536             );
537             }
538              
539             This method collects indexed position scores centered around a
540             specific reference point. The returned data is a hash of
541             relative positions (example -20, -10, 1, 10, 20) and their score
542             values. Pass an array of key value pairs to control data collection.
543             Keys include the following:
544              
545             =over 4
546              
547             =item db
548              
549             =item ddb
550              
551             Specify a Bio::DB database from which to collect the data. The default
552             value is the database specified in the Data table metadata, if present.
553             Examples include a L or L
554             database.
555              
556             =item dataset
557              
558             Specify the name of the dataset. If a database was specified, then this
559             value would be the C or C feature found in the
560             database. Otherwise, the name of a data file, such as a bam, bigWig,
561             bigBed, or USeq file, would be provided here. This options is required!
562              
563             =item position
564              
565             Indicate the position of the reference point relative to the Data table
566             Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
567             midpoint (get it? it's between 5 and 3). Default is 5.
568              
569             =item extend
570              
571             Indicate the number of base pairs to extend from the reference coordinate.
572             This option is required!
573              
574             =item coordinate
575              
576             Optionally provide the real chromosomal coordinate as the reference point.
577              
578             =item absolute
579              
580             Boolean option to indicate that the returned hash of positions and scores
581             should not be transformed into relative positions but kept as absolute
582             chromosomal coordinates.
583              
584             =item avoid
585              
586             Provide a C or C database feature type to avoid overlapping
587             scores. Each found score is checked for overlapping features and is
588             discarded if found to do so. The database should be set to use this.
589              
590             =item strandedness
591              
592             Specify what strand from which the data should be taken, with respect
593             to the Feature strand. Three options are available. Only really relevant
594             for data sources that support strand.
595              
596             =over 4
597              
598             =item * sense
599              
600             The same strand as the Feature.
601              
602             =item * antisense
603              
604             The opposite strand as the Feature.
605              
606             =item * all
607              
608             Strand is ignored, all is taken (default).
609              
610             =back
611              
612             =item method
613              
614             Only required when counting objects.
615              
616             =over 4
617              
618             =item * count
619              
620             Count all overlapping items.
621              
622             =item * pcount
623              
624             Precisely count only containing (not overlapping) items.
625              
626             =item * ncount
627              
628             Count overlapping unique names only.
629              
630             =back
631              
632             =back
633              
634             =item get_region_position_scores
635              
636             while (my $row = $stream->next_row) {
637             my $pos2score = $row->get_relative_point_position_scores(
638             'ddb' => '/path/to/BigWigSet/',
639             'dataset' => 'MyData',
640             'position' => 5,
641             'extend' => 1000,
642             );
643             }
644              
645             This method collects indexed position scores across a defined
646             region or interval. The returned data is a hash of positions and
647             their score values. The positions are by default relative to a
648             region coordinate, usually to the 5' end. Pass an array of key value
649             pairs to control data collection. Keys include the following:
650              
651             =over 4
652              
653             =item db
654              
655             =item ddb
656              
657             Specify a Bio::DB database from which to collect the data. The default
658             value is the database specified in the Data table metadata, if present.
659             Examples include a L or L
660             database.
661              
662             =item dataset
663              
664             Specify the name of the dataset. If a database was specified, then this
665             value would be the C or C feature found in the
666             database. Otherwise, the name of a data file, such as a bam, bigWig,
667             bigBed, or USeq file, would be provided here. This options is required!
668              
669             =item subfeature
670              
671             Specify the subfeature type from which to collect the scores. Typically
672             a SeqFeature object representing a transcript is provided, and the
673             indicated subfeatures are collected from object. When converting to
674             relative coordinates, the coordinates will be relative to the length of
675             the sum of the subfeatures, i.e. the length of the introns will be ignored.
676              
677             Pass the name of the subfeature to use. Accepted values include the following.
678              
679             =over 4
680              
681             =item * exon
682              
683             =item * cds
684              
685             =item * 5p_utr
686              
687             =item * 3p_utr
688              
689             =item * intron
690              
691             =back
692              
693             =item extend
694              
695             Specify the number of basepairs that the Data table Feature's
696             coordinates should be extended in both directions.
697              
698             =item seq_id
699              
700             =item chromo
701              
702             =item start
703              
704             =item end
705              
706             =item stop
707              
708             =item strand
709              
710             Optionally specify zero or more alternate coordinates to use.
711             By default, these are obtained from the Data table Feature.
712              
713             =item position
714              
715             Indicate the position of the reference point relative to the Data table
716             Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
717             midpoint (get it? it's between 5 and 3). Default is 5.
718              
719             =item coordinate
720              
721             Optionally provide the real chromosomal coordinate as the reference point.
722              
723             =item absolute
724              
725             Boolean option to indicate that the returned hash of positions and scores
726             should not be transformed into relative positions but kept as absolute
727             chromosomal coordinates.
728              
729             =item avoid
730              
731             Provide a C or C database feature type to avoid overlapping
732             scores. Each found score is checked for overlapping features and is
733             discarded if found to do so. The database should be set to use this.
734              
735             =item strandedness
736              
737             Specify what strand from which the data should be taken, with respect
738             to the Feature strand. Three options are available. Only really relevant
739             for data sources that support strand.
740              
741             =over 4
742              
743             =item * sense
744              
745             The same strand as the Feature.
746              
747             =item * antisense
748              
749             The opposite strand as the Feature.
750              
751             =item * all
752              
753             Strand is ignored, all is taken (default).
754              
755             =back
756              
757             =item method
758              
759             Only required when counting objects.
760              
761             =over 4
762              
763             =item * count
764              
765             Count all overlapping items.
766              
767             =item * pcount
768              
769             Precisely count only containing (not overlapping) items.
770              
771             =item * ncount
772              
773             Count overlapping unique names only.
774              
775             =back
776              
777             =back
778              
779             =item fetch_alignments
780              
781             my $sam = $Data->open_database('/path/to/file.bam');
782             my $alignment_data = { mapq => [] };
783             my $callback = sub {
784             my ($a, $data) = @_;
785             push @{ $data->{mapq} }, $a->qual;
786             };
787             while (my $row = $stream->next_row) {
788             $row->fetch_alignments(
789             'db' => $sam,
790             'data' => $alignment_data,
791             'callback' => $callback,
792             );
793             }
794              
795             This function allows you to iterate over alignments in a Bam file,
796             allowing custom information to be collected based on a callback
797             code reference that is provided.
798              
799             Three parameters are required: C, C, and C. A
800             true value (1) is returned upon success.
801              
802             =over 4
803              
804             =item db
805              
806             Provide an opened, high-level, Bam database object.
807              
808             =item callback
809              
810             Provide a code callback reference to use when iterating over the
811             alignments. Two objects are passed to this code function: the
812             alignment object and the data structure that is provided. See the
813             Bam adapter documentation for details on low-level C through
814             the Bam index object for details.
815              
816             =item data
817              
818             This is a reference to a C data object for storing information.
819             It is passed to the callback function along with the alignment. Three
820             new key =E value pairs are automatically added: C, C,
821             and C. These correspond to the values for the current queried
822             interval. Coordinates are automatically transformed to 0-base coordinate
823             system to match low level alignment objects.
824              
825             =item subfeature
826              
827             If the feature has subfeatures, such as exons, introns, etc., pass
828             the name of the subfeature to restrict iteration only over the
829             indicated subfeatures. The C object will inherit the coordinates
830             for each subfeatures. Allowed subfeatures include the following:
831              
832             =over 4
833              
834             =item * exon
835              
836             =item * cds
837              
838             =item * 5p_utr
839              
840             =item * 3p_utr
841              
842             =item * intron
843              
844             =back
845              
846             =item start
847              
848             =item stop
849              
850             =item end
851              
852             Provide alternate, custom start and stop coordinates for the row
853             feature. Ignored with subfeatures.
854              
855             =back
856              
857             =back
858              
859             =head2 Feature Export
860              
861             These methods allow the feature to be exported in industry standard
862             formats, including the BED format and the GFF format. Both methods
863             return a formatted tab-delimited text string suitable for printing to
864             file. The string does not include a line ending character.
865              
866             These methods rely on coordinates being present in the source table.
867             If the row feature represents a database item, the L method
868             should be called prior to these methods, allowing the feature to be
869             retrieved from the database and coordinates obtained.
870              
871             =over 4
872              
873             =item bed_string
874              
875             Returns a BED formatted string. By default, a 6-element string is
876             generated, unless otherwise specified. Pass an array of key values
877             to control how the string is generated. The following arguments
878             are supported.
879              
880             =over 4
881              
882             =item bed
883              
884             Specify the number of BED elements to include. The number of elements
885             correspond to the number of columns in the BED file specification. A
886             minimum of 3 (chromosome, start, stop) is required, and maximum of 6
887             is allowed (chromosome, start, stop, name, score, strand).
888              
889             =item chromo
890              
891             =item seq_id
892              
893             Provide a text string of an alternative chromosome or sequence name.
894              
895             =item start
896              
897             =item stop
898              
899             =item end
900              
901             Provide alternative integers for the start and stop coordinates.
902             Note that start values are automatically converted to 0-base
903             by subtracting 1.
904              
905             =item strand
906              
907             Provide alternate an alternative strand value.
908              
909             =item name
910              
911             Provide an alternate or missing name value to be used as text in the 4th
912             column. If no name is provided or available, a default name is generated.
913              
914             =item score
915              
916             Provide a numerical value to be included as the score. BED files typically
917             use integer values ranging from 1..1000.
918              
919             =back
920              
921             =item gff_string
922              
923             Returns a GFF3 formatted string. Pass an array of key values
924             to control how the string is generated. The following arguments
925             are supported.
926              
927             =over 4
928              
929             =item chromo
930              
931             =item seq_id
932              
933             =item start
934              
935             =item stop
936              
937             =item end
938              
939             =item strand
940              
941             Provide alternate values from those defined or missing in the current
942             row Feature.
943              
944             =item source
945              
946             Provide a text string to be used as the source_tag value in the 2nd
947             column. The default value is null ".".
948              
949             =item primary_tag
950              
951             Provide a text string to be used as the primary_tag value in the 3rd
952             column. The default value is null ".".
953              
954             =item type
955              
956             Provide a text string. This can be either a "primary_tag:source_tag" value
957             as used by GFF based BioPerl databases, or "primary_tag" alone.
958              
959             =item score
960              
961             Provide a numerical value to be included as the score. The default
962             value is null ".".
963              
964             =item name
965              
966             Provide alternate or missing name value to be used as the display_name.
967             If no name is provided or available, a default name is generated.
968              
969             =item attributes
970              
971             Provide an anonymous array reference of one or more row Feature indices
972             to be used as GFF attributes. The name of the column is used as the GFF
973             attribute key.
974              
975             =back
976              
977             =back
978              
979             =cut
980              
981 3     3   26 use strict;
  3         8  
  3         128  
982 3     3   17 use Carp qw(carp cluck croak confess);
  3         8  
  3         229  
983 3     3   21 use Module::Load;
  3         7  
  3         24  
984 3         199 use Bio::ToolBox::db_helper qw(
985             get_db_feature
986             get_segment_score
987             calculate_score
988             get_genomic_sequence
989             low_level_bam_fetch
990 3     3   184 );
  3         6  
991 3     3   22 use Bio::ToolBox::db_helper::constants;
  3         7  
  3         27041  
992              
993             my $GENETOOL_LOADED = 0;
994             1;
995              
996              
997              
998             ### Initialization
999              
1000             sub new {
1001             # this should ONLY be called from Bio::ToolBox::Data* iterators
1002 349     349 0 594 my $class = shift;
1003 349         780 my %self = @_;
1004             # we trust that new is called properly with data and index values
1005 349         1128 return bless \%self, $class;
1006             }
1007              
1008              
1009             ### Set and retrieve values
1010              
1011             sub data {
1012 0     0 1 0 return shift->{data};
1013             }
1014              
1015             sub column_name {
1016 0     0 1 0 my ($self, $column) = @_;
1017 0 0       0 return unless defined $column;
1018 0         0 return $self->{data}->name($column);
1019             }
1020              
1021             sub feature_type {
1022 28     28 1 37 my $self = shift;
1023 28 50       56 carp "feature_type is a read only method" if @_;
1024 28         75 return $self->{data}->feature_type;
1025             }
1026              
1027             sub row_index {
1028 65     65 1 92 my $self = shift;
1029 65 50       117 carp "row_index is a read only method" if @_;
1030 65         201 return $self->{'index'};
1031             }
1032              
1033             sub line_number {
1034 0     0 0 0 my $self = shift;
1035 0 0       0 carp "line_number is a read only method" if @_;
1036 0 0       0 if (exists $self->{data}->{line_count}) {
    0          
1037 0         0 return $self->{data}->{line_count};
1038             }
1039             elsif (exists $self->{data}->{header_line_count}) {
1040 0         0 return $self->{data}->{header_line_count} + $self->row_index;
1041             }
1042             else {
1043 0         0 return;
1044             }
1045             }
1046              
1047             sub row_values {
1048 196     196 1 345 my $self = shift;
1049 196 50       386 carp "row_values is a read only method" if @_;
1050 196         279 my $row = $self->{'index'};
1051 196         322 my @data = @{ $self->{data}->{data_table}->[$row] };
  196         535  
1052 196 100       670 return wantarray ? @data : \@data;
1053             }
1054              
1055             sub value {
1056 390     390 1 1977 my ($self, $column, $value) = @_;
1057 390 50       686 return unless defined $column;
1058 390         569 my $row = $self->{'index'};
1059            
1060 390 100       681 if (defined $value) {
1061             # set a value
1062 83         153 $self->{data}->{data_table}->[$row][$column] = $value;
1063             }
1064 390         679 my $v = $self->{data}->{data_table}->[$row][$column];
1065 390 100       1057 return length($v) ? $v : '.'; # internal null value, inherited from GFF definition
1066             }
1067              
1068             sub seq_id {
1069 21     21 1 477 my $self = shift;
1070 21 100       51 if ($_[0]) {
1071             # update only if we have an actual column
1072 1         4 my $i = $self->{data}->chromo_column;
1073 1 50       5 if (defined $i) {
    0          
1074 1         4 my $c = $self->value($i, $_[0]);
1075 1         6 return $c;
1076             }
1077             elsif (exists $self->{feature}) {
1078 0         0 carp "Unable to update seq_id for parsed SeqFeature objects";
1079             }
1080             else {
1081 0         0 carp "No Chromosome column to update!";
1082             }
1083             }
1084             # seqfeature
1085 20 50       45 if (exists $self->{feature}) {
1086 0         0 my $c = $self->{feature}->seq_id;
1087 0         0 return $c;
1088             }
1089             # collect from table
1090 20         63 my $i = $self->{data}->chromo_column;
1091 20 50       62 my $c = defined $i ? $self->value($i) : undef;
1092 20         77 return $c;
1093             }
1094              
1095             sub start {
1096 109     109 1 611 my $self = shift;
1097 109 100       406 if ($_[0]) {
1098             # update only if we have an actual column
1099 5         48 my $i = $self->{data}->start_column;
1100 5 50       64 my $d = $_[0] =~ /^\d+$/ ? 1 : 0;
1101 5 100 66     31 if (defined $i and $d) {
    50          
    50          
1102 1 50       4 if (substr($self->{data}->name($i), -1) eq '0') {
1103             # compensate for 0-based, assuming we're always working with 1-based
1104 1         3 my $n = $_[0] - 1;
1105 1         1061 return $self->value($i, $n);
1106             }
1107             else {
1108 0         0 return $self->value($i, $_[0]);
1109             }
1110             }
1111             elsif (not $d) {
1112 0         0 carp "Start coordinate value is not an integer";
1113             }
1114             elsif (exists $self->{feature}) {
1115 4         587 carp "Unable to update Start coordinate for parsed SeqFeature objects";
1116             }
1117             else {
1118 0         0 carp "No Start coordinate column to update!";
1119             }
1120             }
1121             # seqfeature
1122 108 100       542 if (exists $self->{feature}) {
1123 12         47 return $self->{feature}->start;
1124             }
1125             # collect from table
1126 96         205 my $i = $self->{data}->start_column;
1127 96 50       212 if (defined $i) {
1128 96         169 my $s = $self->value($i);
1129 96 100       232 if (substr($self->{data}->name($i), -1) eq '0') {
1130             # compensate for 0-based index
1131 17         79 return $s + 1;
1132             }
1133             else {
1134 79         227 return $s;
1135             }
1136             }
1137 0         0 return;
1138             }
1139              
1140             *stop = \&end;
1141             sub end {
1142 28     28 1 56 my $self = shift;
1143 28 100       62 if ($_[0]) {
1144             # update only if we have an actual column
1145 1         7 my $i = $self->{data}->stop_column;
1146 1 50       11 my $d = $_[0] =~ /^\d+$/ ? 1 : 0;
1147 1 50 33     8 if (defined $i and $d) {
    0          
    0          
1148 1         4 return $self->value($i, $_[0]);
1149             }
1150             elsif (not $d) {
1151 0         0 carp "End coordinate value is not an integer";
1152             }
1153             elsif (exists $self->{feature}) {
1154 0         0 carp "Unable to update End coordinate for parsed SeqFeature objects";
1155             }
1156             else {
1157 0         0 carp "No End coordinate column to update!";
1158             }
1159             }
1160             # seqfeature
1161 27 50       62 if (exists $self->{feature}) {
1162 0         0 my $e = $self->{feature}->end;
1163 0         0 $self->{end} = $e;
1164 0         0 return $e;
1165             }
1166             # collect from table
1167 27         85 my $i = $self->{data}->stop_column;
1168 27 50       73 my $e = defined $i ? $self->value($i) : undef;
1169 27         54 $self->{end} = $e;
1170 27         92 return $e;
1171             }
1172              
1173             sub strand {
1174 25     25 1 50 my $self = shift;
1175 25 100       78 if ($_[0]) {
1176             # update only if we have an actual column
1177 1         7 my $i = $self->{data}->strand_column;
1178 1 50       5 if (defined $i) {
    50          
1179 0         0 $self->value($i, $_[0]);
1180 0         0 return $self->_strand($_[0]);
1181             }
1182             elsif (exists $self->{feature}) {
1183 0         0 carp "Unable to update Strand for parsed SeqFeature objects";
1184             }
1185             else {
1186 1         225 carp "No Strand column to update!";
1187             }
1188             }
1189             # seqfeature
1190 25 100       165 if (exists $self->{feature}) {
1191 4         16 my $s = $self->{feature}->strand;
1192 4         28 return $s;
1193             }
1194             # collect from table
1195 21         64 my $i = $self->{data}->strand_column;
1196 21 100       72 return defined $i ? $self->_strand( $self->value($i) ) : 0;
1197             }
1198              
1199             sub _strand {
1200 18     18   26 my $self = shift;
1201 18         38 my $str = shift;
1202 18 50 33     100 if ($str eq '1' or $str eq '-1' or $str eq '0') {
    50 33        
    50          
    0          
    0          
    0          
1203 0         0 return $str;
1204             }
1205             elsif ($str eq '+') {
1206 0         0 return 1;
1207             }
1208             elsif ($str eq '-') {
1209 18         40 return -1;
1210             }
1211             elsif ($str eq '.') {
1212 0         0 return 0;
1213             }
1214             elsif ($str eq '2') {
1215             # R packages use 1 for + and 2 for -
1216 0         0 return -1;
1217             }
1218             elsif ($str eq '*') {
1219             # R packages use * for .
1220 0         0 return 0;
1221             }
1222             else {
1223 0         0 return 0;
1224             }
1225             }
1226              
1227             *name = \&display_name;
1228             sub display_name {
1229 75     75 1 1663 my $self = shift;
1230 75 100       154 if ($_[0]) {
1231             # update only if we have an actual column
1232 1         6 my $i = $self->{data}->name_column;
1233 1 50       5 if (defined $i) {
    0          
1234 1         4 return $self->value($i, $_[0]);
1235             }
1236             elsif (exists $self->{feature}) {
1237 0         0 carp "Unable to update display_name for parsed SeqFeature objects";
1238             }
1239             else {
1240 0         0 carp "No Name column to update!";
1241             }
1242             }
1243             # seqfeature
1244 74 100       155 if (exists $self->{feature}) {
1245 4         19 return $self->{feature}->display_name;
1246             }
1247             # collect from table
1248 70         176 my $i = $self->{data}->name_column;
1249 70 50       144 if (defined $i) {
    0          
1250 70         163 return $self->value($i);
1251             }
1252             elsif (my $att = $self->gff_attributes) {
1253             return $att->{Name} || $att->{ID} || $att->{transcript_name} ||
1254 0   0     0 $att->{gene_name} || undef;
1255             }
1256             }
1257              
1258             sub coordinate {
1259 3     3 1 10 my $self = shift;
1260 3 50       14 carp "name is a read only method" if @_;
1261             # to avoid auto-converting start0 coordinates, which might confuse people or programs,
1262             # we will take the start value as is when it's available, otherwise calculate start
1263 3         16 my $start_i = $self->{data}->start_column;
1264             my $coord = sprintf("%s:%d", $self->seq_id,
1265             defined $start_i ? $self->value($start_i) :
1266 3 0       14 exists $self->{feature} ? $self->{feature}->start : 0);
    50          
1267 3         12 my $end = $self->end;
1268 3 50       22 $coord .= "-$end" if $end;
1269 3 50       20 return CORE::length($coord) > 2 ? $coord : undef;
1270             }
1271              
1272             sub type {
1273 9     9 1 24 my $self = shift;
1274 9 100       30 if ($_[0]) {
1275             # update only if we have an actual column
1276 1         5 my $i = $self->{data}->type_column;
1277 1 50       11 if (defined $i) {
    50          
1278 0         0 return $self->value($i, $_[0]);
1279             }
1280             elsif (exists $self->{feature}) {
1281 0         0 carp "Unable to update primary_tag for parsed SeqFeature objects";
1282             }
1283             else {
1284 1         78 carp "No Type column to update!";
1285             }
1286             }
1287             # collect from table
1288 9         126 my $i = $self->{data}->type_column;
1289 9 50       29 if (defined $i) {
1290 0         0 return $self->value($i);
1291             }
1292             # seqfeature
1293 9 100       30 if (exists $self->{feature}) {
1294 4         19 return $self->{feature}->primary_tag;
1295             }
1296             # general metadata
1297 5 50       14 if ($self->{data}->feature) {
1298 5         13 return $self->{data}->feature;
1299             }
1300 0         0 return undef;
1301             }
1302              
1303             *id = \&primary_id;
1304             sub primary_id {
1305 65     65 1 97 my $self = shift;
1306 65 50       127 carp "id is a read only method" if @_;
1307 65         138 my $i = $self->{data}->id_column;
1308 65 50       172 my $v = $self->value($i) if defined $i;
1309 65 50 33     221 if (defined $v and $v ne '.') {
1310 65         202 return $v;
1311             }
1312 0 0       0 return $self->{feature}->primary_id if exists $self->{feature};
1313 0 0       0 if (my $att = $self->gff_attributes) {
1314 0   0     0 return $att->{ID} || $att->{Name} || $att->{transcript_id};
1315             }
1316 0         0 return undef;
1317             }
1318              
1319             sub length {
1320 0     0 1 0 my $self = shift;
1321 0 0       0 carp "length is a read only method" if @_;
1322 0 0       0 if ($self->{data}->vcf) {
1323             # special case for vcf files, measure the length of the ALT allele
1324 0         0 return CORE::length($self->value(4));
1325             }
1326 0         0 my $s = $self->start;
1327 0         0 my $e = $self->end;
1328 0 0 0     0 if (defined $s and defined $e) {
    0          
1329 0         0 return $e - $s + 1;
1330             }
1331             elsif (defined $s) {
1332 0         0 return 1;
1333             }
1334             else {
1335 0         0 return undef;
1336             }
1337             }
1338              
1339             sub score {
1340 0     0 1 0 my $self = shift;
1341 0         0 my $c = $self->{data}->score_column;
1342 0 0       0 return defined $c ? $self->value($c) : undef;
1343             }
1344              
1345             sub attributes {
1346 0     0 1 0 my $self = shift;
1347 0 0       0 return $self->gff_attributes if ($self->{data}->gff);
1348 0 0       0 return $self->vcf_attributes if ($self->{data}->vcf);
1349 0         0 return;
1350             }
1351              
1352             sub gff_attributes {
1353 0     0 1 0 my $self = shift;
1354 0 0       0 return unless ($self->{data}->gff);
1355 0 0       0 return $self->{attributes} if (exists $self->{attributes});
1356 0         0 $self->{attributes} = {};
1357 0         0 foreach my $g (split(/\s*;\s*/, $self->value(8))) {
1358 0         0 my ($tag, $value) = split /\s+/, $g;
1359 0 0 0     0 next unless ($tag and $value);
1360             # unescape URL encoded values, borrowed from Bio::DB::GFF
1361 0         0 $value =~ tr/+/ /;
1362 0         0 $value =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  0         0  
1363 0         0 $self->{attributes}->{$tag} = $value;
1364             }
1365 0         0 return $self->{attributes};
1366             }
1367              
1368             sub vcf_attributes {
1369 0     0 1 0 my $self = shift;
1370 0 0       0 return unless ($self->{data}->vcf);
1371 0 0       0 return $self->{attributes} if (exists $self->{attributes});
1372 0         0 $self->{attributes} = {};
1373            
1374             # INFO attributes
1375 0         0 my %info;
1376 0 0       0 if ($self->{data}->name(7) eq 'INFO') {
1377 0 0       0 %info = map {$_->[0] => defined $_->[1] ? $_->[1] : undef}
1378             # some tags are simple and have no value, eg SOMATIC
1379 0         0 map { [split(/=/, $_)] }
  0         0  
1380             split(/;/, $self->value(7));
1381             }
1382 0         0 $self->{attributes}->{INFO} = \%info;
1383 0         0 $self->{attributes}->{7} = \%info;
1384            
1385             # Sample attributes
1386 0 0       0 if ($self->{data}->number_columns > 8) {
1387 0         0 my @formatKeys = split /:/, $self->value(8);
1388 0         0 foreach my $i (9 .. $self->{data}->last_column) {
1389 0         0 my $name = $self->{data}->name($i);
1390 0         0 my @sampleVals = split /:/, $self->value($i);
1391             my %sample = map {
1392 0 0       0 $formatKeys[$_] => defined $sampleVals[$_] ? $sampleVals[$_] : undef }
  0         0  
1393             (0 .. $#formatKeys);
1394 0         0 $self->{attributes}->{$name} = \%sample;
1395 0         0 $self->{attributes}->{$i} = \%sample;
1396             }
1397             }
1398 0         0 return $self->{attributes};
1399             }
1400              
1401             sub rewrite_attributes {
1402 0     0 1 0 my $self = shift;
1403 0 0       0 return $self->rewrite_gff_attributes if ($self->{data}->gff);
1404 0 0       0 return $self->rewrite_vcf_attributes if ($self->{data}->vcf);
1405 0         0 return;
1406             }
1407              
1408             sub rewrite_gff_attributes {
1409 0     0 1 0 my $self = shift;
1410 0 0       0 return unless ($self->{data}->gff);
1411 0 0       0 return unless exists $self->{attributes};
1412 0         0 my @pairs; # of key=value items
1413 0 0       0 if (exists $self->{attributes}{ID}) {
1414             # I assume this does not need to be escaped!
1415 0         0 push @pairs, 'ID=' . $self->{attributes}{ID};
1416             }
1417 0 0       0 if (exists $self->{attributes}{Name}) {
1418 0         0 my $name = $self->{attributes}{Name};
1419 0         0 $name =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
1420 0         0 push @pairs, "Name=$name";
1421             }
1422 0         0 foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes} }) {
  0         0  
  0         0  
1423 0 0       0 next if $key eq 'ID';
1424 0 0       0 next if $key eq 'Name';
1425 0         0 my $value = $self->{attributes}{$key};
1426 0         0 $key =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
1427 0         0 $value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
1428 0         0 push @pairs, "$key=$value";
1429             }
1430 0         0 $self->value(8, join("; ", @pairs));
1431 0         0 return 1;
1432             }
1433              
1434             sub rewrite_vcf_attributes {
1435 0     0 1 0 my $self = shift;
1436 0 0       0 return unless ($self->{data}->vcf);
1437 0 0       0 return unless exists $self->{attributes};
1438            
1439             # INFO
1440             my $info = join(';',
1441             map {
1442             defined $self->{attributes}->{INFO}{$_} ?
1443 0 0       0 join('=', $_, $self->{attributes}->{INFO}{$_}) : $_
1444             }
1445 0         0 sort {$a cmp $b}
1446 0         0 keys %{$self->{attributes}->{INFO}}
  0         0  
1447             );
1448 0   0     0 $info ||= '.'; # sometimes we have nothing left
1449 0         0 $self->value(7, $info);
1450            
1451             # FORMAT
1452 0         0 my @order;
1453 0 0       0 push @order, 'GT' if exists $self->{attributes}{9}{GT};
1454 0         0 foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes}{9} } ) {
  0         0  
  0         0  
1455 0 0       0 next if $key eq 'GT';
1456 0         0 push @order, $key;
1457             }
1458 0 0       0 if (@order) {
1459 0         0 $self->value(8, join(':', @order));
1460             }
1461             else {
1462 0         0 $self->value(8, '.');
1463             }
1464            
1465             # SAMPLES
1466 0         0 foreach my $i (9 .. $self->{data}->last_column) {
1467 0 0       0 if (@order) {
1468             $self->value($i, join(":",
1469 0         0 map { $self->{attributes}{$i}{$_} } @order
  0         0  
1470             ) );
1471             }
1472             else {
1473 0         0 $self->value($i, '.');
1474             }
1475             }
1476 0         0 return 1;
1477             }
1478              
1479             ### Data collection convenience methods
1480              
1481             *feature = \&seqfeature;
1482             sub seqfeature {
1483 9     9 1 15 my $self = shift;
1484 9   50     27 my $force = shift || 0;
1485 9 50       20 carp "feature is a read only method" if @_;
1486 9 50       21 return $self->{feature} if exists $self->{feature};
1487             # normally this is only for named features in a data table
1488             # skip this for coordinate features like bed files
1489 9 50 33     20 return unless $self->feature_type eq 'named' or $force;
1490            
1491             # retrieve from main Data store
1492 0         0 my $f = $self->{data}->get_seqfeature( $self->{'index'} );
1493 0 0       0 if ($f) {
1494 0         0 $self->{feature} = $f;
1495 0         0 return $f;
1496             }
1497            
1498             # retrieve the feature from the database
1499 0 0       0 return unless $self->{data}->database;
1500             $f = get_db_feature(
1501             'db' => $self->{data}->open_meta_database,
1502             'id' => $self->id || undef,
1503             'name' => $self->name || undef,
1504             'type' => $self->type || $self->{data}->feature,
1505 0   0     0 );
      0        
      0        
1506 0 0       0 return unless $f;
1507 0         0 $self->{feature} = $f;
1508 0         0 return $f;
1509             }
1510              
1511             sub segment {
1512 1     1 1 3 my $self = shift;
1513 1 50       4 carp "segment is a read only method" if @_;
1514 1 50       4 return unless $self->{data}->database;
1515 1 50       5 if ($self->feature_type eq 'coordinate') {
    0          
1516 1         3 my $chromo = $self->seq_id;
1517 1         2 my $start = $self->start;
1518 1   33     3 my $stop = $self->end || $start;
1519 1         4 my $db = $self->{data}->open_meta_database;
1520 1 50       7 return $db ? $db->segment($chromo, $start, $stop) : undef;
1521             }
1522             elsif ($self->feature_type eq 'named') {
1523 0         0 my $f = $self->feature;
1524 0 0       0 return $f ? $f->segment : undef;
1525             }
1526             else {
1527 0         0 return undef;
1528             }
1529             }
1530              
1531             sub get_features {
1532 0     0 1 0 my $self = shift;
1533 0         0 my %args = @_;
1534 0   0     0 my $db = $args{db} || $self->{data}->open_meta_database || undef;
1535 0 0       0 carp "no database defined to get features!" unless defined $db;
1536 0 0       0 return unless $db->can('features');
1537            
1538             # convert the argument style for most bioperl db APIs
1539 0         0 my %opts;
1540 0   0     0 $opts{-seq_id} = $args{chromo} || $self->seq_id;
1541 0   0     0 $opts{-start} = $args{start} || $self->start;
1542 0   0     0 $opts{-end} = $args{end} || $self->end;
1543 0   0     0 $opts{-type} = $args{type} || $self->type;
1544            
1545 0         0 return $db->features(%opts);
1546             }
1547              
1548             sub get_sequence {
1549 0     0 1 0 my $self = shift;
1550 0         0 my %args = @_;
1551 0   0     0 my $db = $args{db} || $args{database} || $self->{data}->open_meta_database || undef;
1552             # this will fail immediately if user doesn't provide valid database
1553            
1554             # get sequence over subfeatures
1555 0   0     0 $args{subfeature} ||= undef;
1556 0 0 0     0 if ($self->feature_type eq 'named' and $args{subfeature}) {
1557             # this is more complicated so we have a dedicated method
1558 0         0 return $self->_get_subfeature_sequence($db, \%args);
1559             }
1560            
1561             # get coordinates
1562 0   0     0 my $seqid = $args{seq_id} || $args{chromo} || $self->seq_id;
1563 0   0     0 my $start = $args{start} || $self->start;
1564 0   0     0 my $stop = $args{stop} || $args{end} || $self->end;
1565 0         0 my $strand = $self->strand;
1566 0 0       0 if (exists $args{strand}) {
1567             # user supplied strand, gotta check it
1568 0 0       0 $strand = $args{strand} =~ /\-|r/i ? -1 : 1;
1569             }
1570 0 0 0     0 if (exists $args{extend} and $args{extend}) {
1571 0         0 $start -= $args{extend};
1572 0 0       0 $start = 1 if $start <= 0;
1573 0         0 $stop += $args{extend};
1574             }
1575 0 0 0     0 return unless (defined $seqid and defined $start and defined $stop);
      0        
1576            
1577             # retrieve and return sequence
1578 0         0 my $seq = get_genomic_sequence($db, $seqid, $start, $stop);
1579 0 0       0 if ($strand == -1) {
1580 0         0 $seq =~ tr/gatcGATC/ctagCTAG/;
1581 0         0 $seq = reverse $seq;
1582             }
1583 0         0 return $seq;
1584             }
1585              
1586             sub _get_subfeature_sequence {
1587 0     0   0 my ($self, $db, $args) = @_;
1588            
1589             # get the subfeatures
1590 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1591 0 0       0 unless (@$subfeatures) {
1592 0         0 carp "no subfeatures available! Returning parent sequence!";
1593             # just return the parent
1594 0         0 undef $args->{subfeature};
1595 0         0 return $self->get_sequence(@$args);
1596             }
1597            
1598             # sort subfeatures
1599             # this should be done by GeneTools in most cases but just to be sure
1600             # note that this does NOT merge redundant or overlapping exons!!!!
1601 0         0 my @sorted = map { $_->[0] }
1602 0 0       0 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
1603 0         0 map { [$_, $_->start, $_->end] }
  0         0  
1604             @$subfeatures;
1605            
1606             # collect sequence
1607 0         0 my $sequence;
1608 0         0 foreach my $subf (@sorted) {
1609 0         0 my $seq = get_genomic_sequence($db, $subf->seq_id, $subf->start, $subf->stop);
1610 0         0 $sequence .= $seq;
1611             }
1612            
1613             # flip the sequence
1614 0 0       0 if ($self->strand == -1) {
1615 0         0 $sequence =~ tr/gatcGATC/ctagCTAG/;
1616 0         0 $sequence = reverse $sequence;
1617             }
1618 0         0 return $sequence;
1619             }
1620              
1621             sub _get_subfeatures {
1622 0     0   0 my $self = shift;
1623 0         0 my $subf = lc shift;
1624            
1625             # load GeneTools
1626 0 0       0 unless ($GENETOOL_LOADED) {
1627 0         0 load('Bio::ToolBox::GeneTools', qw(get_exons get_cds get_5p_utrs get_3p_utrs
1628             get_introns));
1629 0 0       0 if ($@) {
1630 0         0 croak "missing required modules! $@";
1631             }
1632             else {
1633 0         0 $GENETOOL_LOADED = 1;
1634             }
1635             }
1636            
1637             # feature
1638 0         0 my $feature = $self->seqfeature;
1639 0 0       0 return unless ($feature);
1640            
1641             # get the subfeatures
1642 0         0 my @subfeatures;
1643 0 0       0 if ($subf eq 'exon') {
    0          
    0          
    0          
    0          
1644 0         0 @subfeatures = get_exons($feature);
1645             }
1646             elsif ($subf eq 'cds') {
1647 0         0 @subfeatures = get_cds($feature);
1648             }
1649             elsif ($subf eq '5p_utr') {
1650 0         0 @subfeatures = get_5p_utrs($feature);
1651             }
1652             elsif ($subf eq '3p_utr') {
1653 0         0 @subfeatures = get_3p_utrs($feature);
1654             }
1655             elsif ($subf eq 'intron') {
1656 0         0 @subfeatures = get_introns($feature);
1657             }
1658             else {
1659 0         0 croak "unrecognized subfeature parameter '$subf'!";
1660             }
1661            
1662 0         0 return \@subfeatures;
1663             }
1664              
1665             sub get_score {
1666 7     7 1 7295 my $self = shift;
1667 7         38 my %args = @_; # passed arguments to this method
1668            
1669             # verify the dataset for the user, cannot trust whether it has been done or not
1670 7   50     57 my $db = $args{ddb} || $args{db} || $self->{data}->open_meta_database || undef;
1671 7         34 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $db);
1672 7 50       20 unless ($args{dataset}) {
1673 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!";
1674             }
1675            
1676             # get positioned scores over subfeatures only
1677 7   50     41 $args{subfeature} ||= q();
1678 7 0 33     20 if ($self->feature_type eq 'named' and $args{subfeature}) {
1679             # this is more complicated so we have a dedicated method
1680 0         0 return $self->_get_subfeature_scores($db, \%args);
1681             }
1682            
1683             # build parameter array to pass on to the adapter
1684 7         11 my @params;
1685            
1686             # verify coordinates based on type of feature
1687 7 50       25 if ($self->feature_type eq 'coordinate') {
    0          
1688             # coordinates are already in the table, use those
1689 7   33     31 $params[CHR] = $args{seq_id} || $self->seq_id;
1690 7   33     27 $params[STRT] = $args{start} || $self->start;
1691 7   33     43 $params[STOP] = $args{stop} || $args{end} || $self->end;
1692             $params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1693 7 50 33     25 $self->strand;
1694             }
1695             elsif ($self->feature_type eq 'named') {
1696             # must retrieve feature from the database first
1697 0         0 my $f = $self->seqfeature;
1698 0 0       0 return unless $f;
1699 0   0     0 $params[CHR] = $args{seq_id} || $f->seq_id;
1700 0   0     0 $params[STRT] = $args{start} || $f->start;
1701 0   0     0 $params[STOP] = $args{stop} || $args{end} || $f->end;
1702             $params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1703 0 0 0     0 $f->strand;
1704             }
1705             else {
1706 0         0 croak "data table does not have identifiable coordinate or feature identification columns for score collection";
1707             }
1708            
1709             # adjust coordinates as necessary
1710 7 0 33     20 if (exists $args{extend} and $args{extend}) {
1711 0         0 $params[STRT] -= $args{extend};
1712 0         0 $params[STOP] += $args{extend};
1713             }
1714            
1715             # check coordinates
1716 7 50       22 $params[STRT] = 1 if $params[STRT] <= 0;
1717 7 50       22 if ($params[STOP] < $params[STRT]) {
1718             # coordinates are flipped, reverse strand
1719 0 0       0 return if ($params[STOP] <= 0);
1720 0         0 my $stop = $params[STRT];
1721 0         0 $params[STRT] = $params[STOP];
1722 0         0 $params[STOP] = $stop;
1723 0         0 $params[STR] = -1;
1724             }
1725 7 50 33     28 return unless ($params[CHR] and defined $params[STRT]);
1726            
1727             # score attributes
1728 7   50     18 $params[METH] = $args{'method'} || 'mean';
1729 7   100     38 $params[STND] = $args{strandedness} || $args{stranded} || 'all';
1730            
1731             # other parameters
1732 7         11 $params[DB] = $db;
1733 7         11 $params[RETT] = 0; # return type should be a calculated value
1734 7         10 $params[DATA] = $args{dataset};
1735            
1736             # get the score
1737 7         25 return get_segment_score(@params);
1738             }
1739              
1740             sub _get_subfeature_scores {
1741 0     0   0 my ($self, $db, $args) = @_;
1742            
1743             # get the subfeatures
1744 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1745 0 0       0 unless (@$subfeatures) {
1746 0         0 carp "no subfeatures available! Returning parent score data!";
1747             # just return the parent
1748 0         0 undef $args->{subfeature};
1749 0 0       0 delete $args->{exon} if exists $args->{exon};
1750 0         0 return $self->get_score(@$args);
1751             }
1752            
1753             # collect over each subfeature
1754 0         0 my @scores;
1755 0         0 foreach my $exon (@$subfeatures) {
1756 0         0 my @params; # parameters to pass on to adapter
1757 0         0 $params[CHR] = $exon->seq_id;
1758 0         0 $params[STRT] = $exon->start;
1759 0         0 $params[STOP] = $exon->end;
1760 0 0       0 $params[STR] = defined $args->{strand} ? $args->{strand} : $exon->strand;
1761 0   0     0 $params[STND] = $args->{strandedness} || $args->{stranded} || 'all';
1762 0   0     0 $params[METH] = $args->{method} || 'mean';
1763 0         0 $params[RETT] = 1; # return type should be an array reference of scores
1764 0         0 $params[DB] = $db;
1765 0         0 $params[DATA] = $args->{dataset};
1766            
1767 0         0 my $exon_scores = get_segment_score(@params);
1768 0 0       0 push @scores, @$exon_scores if defined $exon_scores;
1769             }
1770            
1771             # combine all the scores based on the requested method
1772 0         0 return calculate_score($args->{method}, \@scores);
1773             }
1774              
1775             sub get_relative_point_position_scores {
1776 1     1 1 2393 my $self = shift;
1777 1         7 my %args = @_;
1778            
1779             # get the database and verify the dataset
1780 1   33     15 my $ddb = $args{ddb} || $args{db} || $self->{data}->open_meta_database;
1781 1         14 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1782 1 50       7 unless ($args{dataset}) {
1783 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1784             }
1785            
1786             # assign some defaults
1787 1   50     9 $args{strandedness} ||= $args{stranded} || 'all';
      33        
1788 1   50     4 $args{position} ||= 5;
1789 1   50     8 $args{coordinate} ||= undef;
1790 1   50     7 $args{avoid} ||= undef;
1791 1   50     7 $args{'method'} ||= 'mean'; # in most cases this doesn't do anything
1792 1 50       6 unless ($args{extend}) {
1793 0         0 croak "must provide an extend value!";
1794             }
1795 1 50 33     6 $args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1796            
1797             # determine reference coordinate
1798 1 50       14 $self->_calculate_reference(\%args) unless defined $args{coordinate};
1799            
1800             # build parameter array to pass on to the adapter
1801 1         5 my @params;
1802 1         6 $params[CHR] = $self->seq_id;
1803 1         5 $params[STRT] = $args{coordinate} - $args{extend};
1804 1 50       4 $params[STRT] = 1 if $params[STRT] < 1; # sanity check
1805 1         3 $params[STOP] = $args{coordinate} + $args{extend};
1806 1 50       6 $params[STR] = defined $args{strand} ? $args{strand} : $self->strand;
1807 1         4 $params[STND] = $args{strandedness};
1808 1         2 $params[METH] = $args{'method'};
1809 1         3 $params[RETT] = 2; # return type should be a hash reference of positioned scores
1810 1         3 $params[DB] = $ddb;
1811 1         2 $params[DATA] = $args{dataset};
1812            
1813             # Data collection
1814 1         5 my $pos2data = get_segment_score(@params);
1815            
1816             # Avoid positions
1817 1 50       6 if ($args{avoid}) {
1818 0         0 $self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
1819             }
1820            
1821             # covert to relative positions
1822 1 50       5 if ($args{absolute}) {
1823             # do not convert to relative positions
1824 0 0       0 return wantarray ? %$pos2data : $pos2data;
1825             }
1826             else {
1827             # return the collected dataset hash
1828             return $self->_convert_to_relative_positions($pos2data,
1829 1         5 $args{coordinate}, $params[STR]);
1830             }
1831             }
1832              
1833             sub get_region_position_scores {
1834 4     4 1 8311 my $self = shift;
1835 4         21 my %args = @_;
1836            
1837             # get the database and verify the dataset
1838 4   33     38 my $ddb = $args{ddb} || $args{db} || $self->{data}->open_meta_database;
1839 4         14 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1840 4 50       13 unless ($args{dataset}) {
1841 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1842             }
1843            
1844             # assign some defaults here, in case we get passed on to subfeature method
1845 4   50     23 $args{strandedness} ||= $args{stranded} || 'all';
      33        
1846 4   50     18 $args{extend} ||= 0;
1847 4   50     16 $args{position} ||= 5;
1848 4   100     12 $args{'method'} ||= 'mean'; # in most cases this doesn't do anything
1849 4 50 33     24 $args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1850            
1851             # get positioned scores over subfeatures only
1852 4   50     19 $args{subfeature} ||= q();
1853 4 0 33     13 if ($self->feature_type eq 'named' and $args{subfeature}) {
1854             # this is more complicated so we have a dedicated method
1855 0         0 return $self->_get_subfeature_position_scores(\%args, $ddb);
1856             }
1857            
1858             # Assign coordinates
1859             # build parameter array to pass on to the adapter
1860 4         9 my @params;
1861 4   33     12 my $feature = $self->seqfeature || $self;
1862 4   33     22 $params[CHR] = $args{chromo} || $args{seq_id} || $feature->seq_id;
1863 4   33     15 $params[STRT] = $args{start} || $feature->start;
1864 4   33     24 $params[STOP] = $args{stop} || $args{end} || $feature->end;
1865 4 50       14 $params[STR] = defined $args{strand} ? $args{strand} : $feature->strand;
1866 4 50       11 if ($args{extend}) {
1867 0         0 $params[STRT] -= $args{extend};
1868 0         0 $params[STOP] += $args{extend};
1869 0 0       0 $params[STRT] = 1 if $params[STRT] < 1; # sanity check
1870             }
1871 4         9 $params[STND] = $args{strandedness};
1872 4         6 $params[METH] = $args{method};
1873 4         10 $params[RETT] = 2; # return type should be a hash reference of positioned scores
1874 4         7 $params[DB] = $ddb;
1875 4         7 $params[DATA] = $args{dataset};
1876            
1877             # Data collection
1878 4         13 my $pos2data = get_segment_score(@params);
1879            
1880             # Avoid positions
1881 4 50       12 if ($args{avoid}) {
1882 0         0 $self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
1883             }
1884            
1885             # covert to relative positions
1886 4 50       11 if ($args{absolute}) {
1887             # do not convert to relative positions
1888 0 0       0 return wantarray ? %$pos2data : $pos2data;
1889             }
1890             else {
1891             # return data converted to relative positions
1892 4 50       20 $self->_calculate_reference(\%args) unless defined $args{coordinate};
1893             return $self->_convert_to_relative_positions($pos2data,
1894 4         17 $args{coordinate}, $params[STR]);
1895             }
1896             }
1897              
1898             sub _get_subfeature_position_scores {
1899 0     0   0 my ($self, $args, $ddb) = @_;
1900            
1901             # get the subfeatures
1902 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1903 0 0       0 unless (@$subfeatures) {
1904 0         0 carp "no subfeatures available! Returning parent score data!";
1905             # just return the parent
1906 0         0 undef $args->{subfeature};
1907 0 0       0 delete $args->{exon} if exists $args->{exon};
1908 0         0 return $self->get_sequence(@$args);
1909             }
1910            
1911             # reset the practical start and stop to the actual subfeatures' final start and stop
1912             # we can no longer rely on the feature start and stop, consider CDS
1913             # these subfeatures should already be genomic sorted by GeneTools
1914 0         0 my $practical_start = $subfeatures->[0]->start;
1915 0         0 my $practical_stop = $subfeatures->[-1]->end;
1916            
1917             # collect over each exon
1918             # we will adjust the positions of each reported score so that
1919             # it will appear as if all the exons are adjacent to each other
1920             # and no introns exist
1921 0         0 my $pos2data = {};
1922 0         0 my $namecheck = {}; # to check unique names when using ncount method....
1923 0         0 my $current_end = $practical_start;
1924 0         0 my $adjustment = 0;
1925 0 0       0 my $fstrand = defined $args->{strand} ? $args->{strand} : $self->strand;
1926 0         0 foreach my $exon (@$subfeatures) {
1927            
1928 0         0 my @params; # parameters to pass on to adapter
1929 0         0 $params[CHR] = $exon->seq_id;
1930 0         0 $params[STRT] = $exon->start;
1931 0         0 $params[STOP] = $exon->end;
1932 0         0 $params[STR] = $fstrand;
1933 0         0 $params[STND] = $args->{strandedness};
1934 0         0 $params[METH] = $args->{method};
1935 0         0 $params[RETT] = 2; # return type should be a hash reference of positioned scores
1936 0         0 $params[DB] = $ddb;
1937 0         0 $params[DATA] = $args->{dataset};
1938            
1939             # collect scores
1940 0         0 my $exon_scores = get_segment_score(@params);
1941            
1942             # adjust the scores
1943 0         0 $adjustment = $params[STRT] - $current_end;
1944             $self->_process_exon_scores($exon_scores, $pos2data, $adjustment, $params[STRT],
1945 0         0 $params[STOP], $namecheck, $args->{method});
1946            
1947             # reset
1948 0         0 $current_end += $exon->length;
1949             }
1950            
1951             # collect extensions if requested
1952 0 0       0 if ($args->{extend}) {
1953             # left side
1954 0         0 my @params; # parameters to pass on to adapter
1955 0         0 $params[CHR] = $self->seq_id;
1956 0         0 $params[STRT] = $practical_start - $args->{extend};
1957 0         0 $params[STOP] = $practical_start - 1;
1958 0         0 $params[STR] = $fstrand;
1959 0         0 $params[STND] = $args->{strandedness};
1960 0         0 $params[METH] = $args->{method};
1961 0         0 $params[RETT] = 2; # return type should be a hash reference of positioned scores
1962 0         0 $params[DB] = $ddb;
1963 0         0 $params[DATA] = $args->{dataset};
1964            
1965 0         0 my $ext_scores = get_segment_score(@params);
1966              
1967             # no adjustment should be needed
1968             $self->_process_exon_scores($ext_scores, $pos2data, 0, $params[STRT],
1969 0         0 $params[STOP], $namecheck, $args->{method});
1970            
1971            
1972             # right side
1973             # we can reuse our parameter array
1974 0         0 $params[STRT] = $practical_stop + 1;
1975 0         0 $params[STOP] = $practical_stop + $args->{extend};
1976 0         0 $ext_scores = get_segment_score(@params);
1977              
1978             # the adjustment should be the same as the last exon
1979             $self->_process_exon_scores($ext_scores, $pos2data, $adjustment, $params[STRT],
1980 0         0 $params[STOP], $namecheck, $args->{method});
1981             }
1982            
1983             # covert to relative positions
1984 0 0       0 if ($args->{absolute}) {
1985             # do not convert to relative positions
1986 0 0       0 return wantarray ? %$pos2data : $pos2data;
1987             }
1988             else {
1989             # return data converted to relative positions
1990             # can no longer use original coordinates, but instead the new shifted coordinates
1991 0         0 $args->{practical_start} = $practical_start;
1992 0         0 $args->{practical_stop} = $current_end;
1993 0         0 $self->_calculate_reference($args);
1994             return $self->_convert_to_relative_positions($pos2data,
1995 0         0 $args->{coordinate}, $fstrand);
1996             }
1997             }
1998              
1999             sub _calculate_reference {
2000 5     5   12 my ($self, $args) = @_;
2001 5   33     14 my $feature = $self->seqfeature || $self;
2002 5 50       30 my $strand = defined $args->{strand} ? $args->{strand} : $feature->strand;
2003 5 50 33     40 if ($args->{position} == 5 and $strand >= 0) {
    50 33        
    50 33        
    0 0        
    0          
2004 0   0     0 $args->{coordinate} = $args->{practical_start} || $feature->start;
2005             }
2006             elsif ($args->{position} == 3 and $strand >= 0) {
2007 0   0     0 $args->{coordinate} = $args->{practical_stop} || $feature->end;
2008             }
2009             elsif ($args->{position} == 5 and $strand < 0) {
2010 5   33     16 $args->{coordinate} = $args->{practical_stop} || $feature->end;
2011             }
2012             elsif ($args->{position} == 3 and $strand < 0) {
2013 0   0     0 $args->{coordinate} = $args->{practical_start} || $feature->start;
2014             }
2015             elsif ($args->{position} == 4) {
2016             # strand doesn't matter here
2017 0   0     0 my $s = $args->{practical_start} || $feature->start;
2018 0         0 $args->{coordinate} = $s + int(($feature->length / 2) + 0.5);
2019             }
2020             else {
2021 0         0 croak "position must be one of 5, 3, or 4";
2022             }
2023             }
2024              
2025             sub _avoid_positions {
2026 0     0   0 my ($self, $pos2data, $args, $seqid, $start, $stop) = @_;
2027            
2028             # first check the list of avoid types
2029 0 0       0 if (ref $args->{avoid} eq 'ARRAY') {
    0          
    0          
2030             # we have types, presume they're ok
2031             }
2032             elsif ($args->{avoid} eq '1') {
2033             # old style boolean value
2034 0 0       0 if (defined $args->{type}) {
2035 0         0 $args->{avoid} = [ $args->{type} ];
2036             }
2037             else {
2038             # no type provided, we can't avoid that which is not defined!
2039             # this is an error, but won't complain as we never did before
2040 0         0 $args->{avoid} = $self->type;
2041             }
2042             }
2043             elsif ($args->{avoid} =~ /w+/i) {
2044             # someone passed a string, a feature type perhaps?
2045 0         0 $args->{avoid} = [ $args->{avoid} ];
2046             }
2047            
2048             ### Check for conflicting features
2049 0   0     0 my $db = $args->{db} || $self->{data}->open_meta_database;
2050             my @overlap_features = $self->get_features(
2051             seq_id => $seqid,
2052             start => $start,
2053             end => $stop,
2054             type => $args->{avoid},
2055 0         0 );
2056            
2057             # get the overlapping features of the same type
2058 0 0       0 if (@overlap_features) {
2059 0         0 my $primary = $self->primary_id;
2060             # there are one or more feature of the type in this region
2061             # one of them is likely the one we're working with
2062             # but not necessarily - user may be looking outside original feature
2063             # the others are not what we want and therefore need to be
2064             # avoided
2065 0         0 foreach my $feat (@overlap_features) {
2066             # skip the one we want
2067 0 0       0 next if ($feat->primary_id eq $primary);
2068             # now eliminate those scores which overlap this feature
2069 0         0 my $start = $feat->start;
2070 0         0 my $stop = $feat->end;
2071 0         0 foreach my $position (keys %$pos2data) {
2072             # delete the scored position if it overlaps with
2073             # the offending feature
2074 0 0 0     0 if (
2075             $position >= $start and
2076             $position <= $stop
2077             ) {
2078 0         0 delete $pos2data->{$position};
2079             }
2080             }
2081             }
2082             }
2083             }
2084              
2085             sub _convert_to_relative_positions {
2086 5     5   23 my ($self, $pos2data, $position, $strand) = @_;
2087            
2088 5         7 my %relative_pos2data;
2089 5 50       17 if ($strand >= 0) {
    50          
2090 0         0 foreach my $p (keys %$pos2data) {
2091 0         0 $relative_pos2data{ $p - $position } = $pos2data->{$p};
2092             }
2093             }
2094             elsif ($strand < 0) {
2095 5         32 foreach my $p (keys %$pos2data) {
2096 223         451 $relative_pos2data{ $position - $p } = $pos2data->{$p};
2097             }
2098             }
2099 5 50       145 return wantarray ? %relative_pos2data : \%relative_pos2data;
2100             }
2101              
2102             sub _process_exon_scores {
2103 0     0     my ($self, $exon_scores, $pos2data, $adjustment, $start, $end,
2104             $namecheck, $method) = @_;
2105            
2106             # ncount method
2107 0 0         if ($method eq 'ncount') {
2108             # we need to check both names and adjust position
2109 0           foreach my $p (keys %$exon_scores) {
2110 0 0 0       next unless ($p >= $start and $p <= $end);
2111 0           foreach my $n (@{ $exon_scores->{$p} }) {
  0            
2112 0 0         if (exists $namecheck->{$n} ) {
2113 0           $namecheck->{$n}++;
2114 0           next;
2115             }
2116             else {
2117 0           $namecheck->{$n} = 1;
2118 0           my $a = $p - $adjustment;
2119 0   0       $pos2data->{$a} ||= [];
2120 0           push @{ $pos2data->{$a} }, $n;
  0            
2121             }
2122             }
2123             }
2124             }
2125             else {
2126             # just adjust scores
2127 0           foreach my $p (keys %$exon_scores) {
2128 0 0 0       next unless ($p >= $start and $p <= $end);
2129 0           $pos2data->{ $p - $adjustment } = $exon_scores->{$p};
2130             }
2131             }
2132             }
2133              
2134             sub fetch_alignments {
2135 0     0 1   my $self = shift;
2136 0           my %args = @_;
2137 0   0       $args{db} ||= $args{dataset} || undef;
      0        
2138 0   0       $args{data} ||= undef;
2139 0   0       $args{callback} ||= undef;
2140 0   0       $args{subfeature} ||= q();
2141            
2142             # verify - trusting that these are valid, else they will fail lower down in the code
2143 0 0         unless ($args{db}) {
2144 0           croak "must provide a Bam object database to fetch alignments!\n";
2145             }
2146 0 0 0       unless ($args{data} and ref($args{data}) eq 'HASH') {
2147 0           croak "must provide a data HASH for the fetch callback!\n";
2148             }
2149 0 0         unless ($args{callback}) {
2150 0           croak "must provide a callback code reference!\n";
2151             }
2152            
2153             # array of features to iterate, probably just one or subfeatures
2154 0           my @intervals;
2155 0 0 0       if ($self->feature_type eq 'named' and $args{subfeature}) {
2156             # we have subfeatures to iterate over
2157              
2158             # get the subfeatures
2159 0           my $subfeatures = $self->_get_subfeatures($args{subfeature});
2160 0 0         if (@$subfeatures) {
2161 0           foreach my $sf (@$subfeatures) {
2162 0           push @intervals, [
2163             $sf->start - 1,
2164             $sf->end
2165             ];
2166             }
2167             }
2168             else {
2169             # zere subfeatures? just take the parent then
2170             push @intervals, [
2171             ($args{start} || $self->start) - 1,
2172 0   0       $args{stop} || $args{end} || $self->end
      0        
2173             ];
2174             }
2175             }
2176             else {
2177             # take feature as is
2178             push @intervals, [
2179             ($args{start} || $self->start) - 1,
2180 0   0       $args{stop} || $args{end} || $self->end
      0        
2181             ];
2182             }
2183              
2184             # get the target id for the chromosome
2185             # this will fail if the user didn't provide a real bam object!!!
2186 0           my ($tid, undef, undef) = $args{db}->header->parse_region($self->seq_id);
2187 0 0         return unless defined $tid;
2188            
2189             # now iterate over the intervals
2190 0           foreach my $i (@intervals) {
2191 0           $args{data}->{start} = $i->[0];
2192 0           $args{data}->{end} = $i->[1];
2193 0           $args{data}->{strand} = $self->strand;
2194             low_level_bam_fetch(
2195             $args{db},
2196             $tid,
2197             $i->[0],
2198             $i->[1],
2199             $args{callback},
2200             $args{data}
2201 0           );
2202             }
2203            
2204             # nothing to return since we're using a data reference
2205 0           return 1;
2206             }
2207              
2208              
2209              
2210             ### String export
2211              
2212             sub bed_string {
2213 0     0 1   my $self = shift;
2214 0           my %args = @_;
2215 0   0       $args{bed} ||= 6; # number of bed columns
2216 0 0         croak "bed count must be an integer!" unless $args{bed} =~ /^\d+$/;
2217 0 0         croak "bed count must be at least 3!" unless $args{bed} >= 3;
2218            
2219             # coordinate information
2220 0           $self->seqfeature; # retrieve the seqfeature object first
2221 0   0       my $chr = $args{chromo} || $args{seq_id} || $self->seq_id;
2222 0   0       my $start = $args{start} || $self->start;
2223 0   0       my $stop = $args{stop} || $args{end} || $self->stop ||
2224             $start + $self->length - 1 || $start;
2225 0 0 0       if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
      0        
      0        
2226 0           carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2227 0           return;
2228             }
2229 0           $start -= 1; # 0-based coordinates
2230 0           my $string = "$chr\t$start\t$stop";
2231            
2232             # additional information
2233 0 0         if ($args{bed} >= 4) {
2234 0   0       my $name = $args{name} || $self->name || 'Feature_' . $self->line_number;
2235 0           $string .= "\t$name";
2236             }
2237 0 0         if ($args{bed} >= 5) {
2238 0 0         my $score = exists $args{score} ? $args{score} : $self->score;
2239 0 0         $score = 1 unless defined $score;
2240 0           $string .= "\t$score";
2241             }
2242 0 0         if ($args{bed} >= 6) {
2243 0           my $s;
2244 0 0 0       if (exists $args{strand} and defined $args{strand}) {
2245 0           $s = $self->_strand($args{strand});
2246             }
2247             else {
2248 0           $s = $self->strand;
2249             }
2250 0 0         $string .= sprintf("\t%s", $s == -1 ? '-' : '+')
2251             }
2252             # we could go on with other columns, but there's no guarantee that additional
2253             # information is available, and we would have to implement user provided data
2254            
2255             # done
2256 0           return $string;
2257             }
2258              
2259             sub gff_string {
2260 0     0 1   my $self = shift;
2261 0           my %args = @_;
2262            
2263             # coordinate information
2264 0           $self->seqfeature; # retrieve the seqfeature object first
2265 0   0       my $chr = $args{chromo} || $args{seq_id} || $self->seq_id;
2266 0   0       my $start = $args{start} || $self->start;
2267 0   0       my $stop = $args{stop} || $args{end} || $self->stop ||
2268             $start + $self->length - 1 || $start;
2269 0 0 0       if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
      0        
      0        
2270 0           carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2271 0           return;
2272             }
2273 0           my $strand;
2274 0 0 0       if (exists $args{strand} and defined $args{strand}) {
2275 0           $strand = $self->_strand($args{strand});
2276             }
2277             else {
2278 0           $strand = $self->strand;
2279             }
2280 0 0         $strand = $strand == -1 ? '-' : $strand == 1 ? '+' : '.';
    0          
2281            
2282             # type information
2283 0   0       my $type = $args{type} || $self->type || undef;
2284 0           my ($source, $primary_tag);
2285 0 0 0       if (defined $type and $type =~ /:/) {
2286 0           ($primary_tag, $source) = split /:/, $type;
2287             }
2288 0 0         unless ($source) {
2289 0   0       $source = $args{source} || '.';
2290             }
2291 0 0         unless ($primary_tag) {
2292 0 0 0       $primary_tag = $args{primary_tag} || defined $type ? $type : '.';
2293             }
2294            
2295             # score
2296 0 0         my $score = exists $args{score} ? $args{score} : $self->score;
2297 0 0         $score = '.' unless defined $score;
2298 0           my $phase = '.'; # do not even bother!!!!
2299            
2300             # attributes
2301 0   0       my $name = $args{name} || $self->name || 'Feature_' . $self->line_number;
2302 0           my $attributes = "Name=$name";
2303 0   0       my $id = $args{id} || sprintf("%08d", $self->line_number);
2304 0           $attributes .= ";ID=$id";
2305 0 0 0       if (exists $args{attributes} and ref($args{attributes}) eq 'ARRAY') {
2306 0           foreach my $i (@{$args{attributes}}) {
  0            
2307 0           my $k = $self->{data}->name($i);
2308 0           $k =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0            
2309 0           my $v = $self->value($i);
2310 0           $v =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0            
2311 0           $attributes .= ";$k=$v";
2312             }
2313             }
2314            
2315             # done
2316 0           my $string = join("\t", $chr, $source, $primary_tag, $start, $stop, $score,
2317             $strand, $phase, $attributes);
2318 0           return $string;
2319             }
2320              
2321              
2322             __END__