File Coverage

blib/lib/Bio/ToolBox/Data/Feature.pm
Criterion Covered Total %
statement 310 766 40.4
branch 144 494 29.1
condition 51 310 16.4
subroutine 31 51 60.7
pod 35 37 94.5
total 571 1658 34.4


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::Feature;
2             our $VERSION = '1.691';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::Feature - Objects representing rows in a data table
7              
8             =head1 DESCRIPTION
9              
10             A Bio::ToolBox::Data::Feature is an object representing a row in the
11             data table. Usually, this in turn represents an annotated feature or
12             segment in the genome. As such, this object provides convenient
13             methods for accessing and manipulating the values in a row, as well as
14             methods for working with the represented genomic feature.
15              
16             This class should NOT be used directly by the user. Rather, Feature
17             objects are generated from a Bio::ToolBox::Data::Iterator object
18             (generated itself from the L
19             function in Bio::ToolBox::Data), or the L
20             function in Bio::ToolBox::Data. Please see the respective documentation
21             for more information.
22              
23             Example of working with a stream object.
24              
25             my $Data = Bio::ToolBox::Data->new(file => $file);
26            
27             # stream method
28             my $stream = $Data->row_stream;
29             while (my $row = $stream->next_row) {
30             # each $row is a Bio::ToolBox::Data::Feature object
31             # representing the row in the data table
32             my $value = $row->value($index);
33             # do something with $value
34             }
35            
36             # iterate method
37             $Data->iterate( sub {
38             my $row = shift;
39             my $number = $row->value($index);
40             my $log_number = log($number);
41             $row->value($index, $log_number);
42             } );
43              
44              
45             =head1 METHODS
46              
47             =head2 General information methods
48              
49             =over 4
50              
51             =item row_index
52              
53             Returns the index position of the current data row within the
54             data table. Useful for knowing where you are at within the data
55             table.
56              
57             =item feature_type
58              
59             Returns one of three specific values describing the contents
60             of the data table inferred by the presence of specific column
61             names. This provides a clue as to whether the table features
62             represent genomic regions (defined by coordinate positions) or
63             named database features. The return values include:
64              
65             =over 4
66              
67             =item * coordinate: Table includes at least chromosome and start
68              
69             =item * named: Table includes name, type, and/or Primary_ID
70              
71             =item * unknown: unrecognized
72              
73             =back
74              
75             =item column_name
76              
77             Returns the column name for the given index.
78              
79             item data
80              
81             Returns the parent L object, in case you may
82             have lost it by going out of scope.
83              
84             =back
85              
86             =head2 Methods to access row feature attributes
87              
88             These methods return the corresponding value, if present in the
89             data table, based on the column header name. If the row represents
90             a named database object, try calling the L method first.
91             This will retrieve the database SeqFeature object, and the attributes
92             can then be retrieved using the methods below or on the actual
93             database SeqFeature object.
94              
95             In cases where there is a table column and a corresponding SeqFeature
96             object, for example a start column and a parsed SeqFeature object, the
97             table value takes precedence and is returned. You can always obtain the
98             SeqFeature's value separately and directly.
99              
100             These methods do not set attribute values. If you need to change the
101             values in a table, use the L method below.
102              
103             =over 4
104              
105             =item seq_id
106              
107             The name of the chromosome the feature is on.
108              
109             =item start
110              
111             =item end
112              
113             =item stop
114              
115             The coordinates of the feature or segment. Coordinates from known
116             0-based file formats, e.g. BED, are returned as 1-based. Coordinates
117             must be integers to be returned. Zero or negative start coordinates
118             are assumed to be accidents or poor programming and transformed to 1.
119             Use the L method if you don't want this to happen.
120              
121             =item strand
122              
123             The strand of the feature or segment. Returns -1, 0, or 1. Default is 0,
124             or unstranded.
125              
126             =item midpoint
127              
128             The calculated midpoint position of the feature.
129              
130             =item peak
131              
132             For features in a C file, this will report the peak coordinate,
133             transformed into a genomic coordinate.
134              
135             =item name
136              
137             =item display_name
138              
139             The name of the feature.
140              
141             =item coordinate
142              
143             Returns a coordinate string formatted as C. This uses the
144             start coordinate as listed in the source file and does not convert 0-based
145             start values to 1-based values. This may confound downstream applications.
146              
147             =item type
148              
149             The type of feature. Typically either C or C.
150             In a GFF3 file, this represents columns 3 and 2, respectively. In annotation
151             databases such as L, the type is used to restrict
152             to one of many different types of features, e.g. gene, mRNA, or exon.
153              
154             =item id
155              
156             =item primary_id
157              
158             Here, this represents the C in the database. Note that this number
159             is generally unique to a specific database, and not portable between databases.
160              
161             =item length
162              
163             The length of the feature or segment.
164              
165             =item score
166              
167             Returns the value of the Score column, if one is available. Typically
168             associated with defined file formats, such as GFF files (6th column),
169             BED and related Peak files (5th column), and bedGraph (4th column).
170              
171             =back
172              
173             =head2 Accessing and setting values in the row.
174              
175             =over 4
176              
177             =item value
178              
179             # retrieve a value
180             my $v = $row->value($index);
181             # set a value
182             $row->value($index, $v + 1);
183              
184             Returns or sets the value at a specific column index in the
185             current data row. Null values return a '.', symbolizing an
186             internal null value.
187              
188             =item row_values
189              
190             Returns an array or array reference representing all the values
191             in the current data row.
192              
193             =back
194              
195             =head2 Special feature attributes
196              
197             GFF and VCF files have special attributes in the form of key =E value pairs.
198             These are stored as specially formatted, character-delimited lists in
199             certain columns. These methods will parse this information and return as
200             a convenient hash reference. The keys and values of this hash may be
201             changed, deleted, or added to as desired. To write the changes back to
202             the file, use the L to properly write the attributes
203             back to the file with the proper formatting.
204              
205             =over 4
206              
207             =item attributes
208              
209             Generic method that calls either L or L
210             depending on the data table format.
211              
212             =item gff_attributes
213              
214             Parses the 9th column of GFF files. URL-escaped characters are converted
215             back to text. Returns a hash reference of key =E value pairs.
216              
217             =item vcf_attributes
218              
219             Parses the C (8th column) and all sample columns (10th and higher
220             columns) in a version 4 VCF file. The Sample columns use the C
221             column (9th column) as keys. The returned hash reference has two levels:
222             The first level keys are both the column names and index (0-based). The
223             second level keys are the individual attribute keys to each value.
224             For example:
225              
226             my $attr = $row->vcf_attributes;
227            
228             # access by column name
229             my $genotype = $attr->{sample1}{GT};
230             my $depth = $attr->{INFO}{ADP};
231            
232             # access by 0-based column index
233             my $genotype = $attr->{9}{GT};
234             my $depth = $attr->{7}{ADP}
235              
236             =item rewrite_attributes
237              
238             Generic method that either calls L or
239             L depending on the data table format.
240              
241             =item rewrite_gff_attributes
242              
243             Rewrites the GFF attributes column (the 9th column) based on the
244             contents of the attributes hash that was previously generated with
245             the L method. Useful when you have modified the
246             contents of the attributes hash.
247              
248             =item rewrite_vcf_attributes
249              
250             Rewrite the VCF attributes for the C (8th column), C (9th
251             column), and sample columns (10th and higher columns) based on the
252             contents of the attributes hash that was previously generated with
253             the L method. Useful when you have modified the
254             contents of the attributes hash.
255              
256             =back
257              
258             =head2 Convenience Methods to database functions
259              
260             The next three functions are convenience methods for using the
261             attributes in the current data row to interact with databases.
262             They are wrappers to methods in the L
263             module.
264              
265             =over 4
266              
267             =item seqfeature
268              
269             =item feature
270              
271             Returns a SeqFeature object representing the feature or item in
272             the current row. If the SeqFeature object is stored in the parent
273             C<$Data> object (usually from parsing an annotation file), it is
274             immediately returned. Otherwise, the SeqFeature
275             object is retrieved from the database using the name and
276             type values in the current Data table row. The SeqFeature object
277             is requested from the database named in the general metadata. If
278             an alternate database is desired, you should change it first using
279             the C<$Data>-Edatabase() method. If the feature name or type is not
280             present in the table, then nothing is returned.
281              
282             See L and L for more
283             information about working with these objects. See L
284             about working with database features.
285              
286             This method normally only works with "named" feature types in a
287             L Data table. If your Data table has coordinate
288             information, i.e. chromosome, start, and stop columns, then it will
289             likely be recognized as a "coordinate" feature_type and not work.
290              
291             Pass a true value to this method to force the seqfeature lookup. This
292             will still require the presence of Name, ID, and/or Type columns to
293             perform the database lookup. The L method feature()
294             is used to determine the type if a Type column is not present.
295              
296             =item segment
297              
298             Returns a database Segment object corresponding to the coordinates
299             defined in the Data table row. If a named feature and type are
300             present instead of coordinates, then the feature is first automatically
301             retrieved and a Segment returned based on its coordinates. The
302             database named in the general metadata is used to establish the
303             Segment object. If a different database is desired, it should be
304             changed first using the general L method.
305              
306             See L and L for more information
307             about working with Segment objects.
308              
309             =item get_features
310              
311             my @overlap_features = $row->get_features(type => $type);
312              
313             Returns seqfeature objects from a database that overlap the Feature
314             or interval in the current Data table row. This is essentially a
315             convenience wrapper for a Bio::DB style I method using the
316             coordinates of the Feature. Optionally pass an array of key value pairs
317             to specify alternate coordinates if so desired. Potential keys
318             include
319              
320             =over 4
321              
322             =item seq_id
323              
324             =item start
325              
326             =item end
327              
328             =item type
329              
330             The type of database features to retrieve.
331              
332             =item db
333              
334             An alternate database object to collect from.
335              
336             =back
337              
338             =item get_sequence
339              
340             Fetches genomic sequence based on the coordinates of the current seqfeature
341             or interval in the current Feature. This requires a database that
342             contains the genomic sequence, either the database specified in the
343             Data table metadata or an external indexed genomic fasta file.
344              
345             If the Feature represents a transcript or gene, then a concatenated
346             sequence of the selected subfeatures may be generated and returned. B
347             that redundant or overlapping subfeatures are B merged, and
348             unexpected results may be obtained.
349              
350             The sequence is returned as simple string. If the feature is on the reverse
351             strand, then the reverse complement sequence is automatically returned.
352              
353             Pass an array of key value pairs to specify alternate coordinates if so
354             desired. Potential keys include
355              
356             =over 4
357              
358             =item subfeature
359              
360             Pass a text string representing the type of subfeature from which to collect
361             the sequence. Acceptable values include
362              
363             =over 4
364              
365             =item * exon
366              
367             =item * cds
368              
369             =item * 5p_utr
370              
371             =item * 3p_utr
372              
373             =item * intron
374              
375             =back
376              
377             =item seq_id
378              
379             =item start
380              
381             =item end
382              
383             =item strand
384              
385             =item extend
386              
387             Indicate additional basepairs of sequence added to both sides
388              
389             =item db
390              
391             The fasta file or database from which to fetch the sequence
392              
393             =back
394              
395             =back
396              
397             =head2 Data collection
398              
399             The following methods allow for data collection from various
400             sources, including bam, bigwig, bigbed, useq, Bio::DB databases, etc.
401              
402             =over 4
403              
404             =item calculate_reference($position)
405              
406             Calculates and returns the absolute genomic coordinate for a relative
407             reference position taking into account feature orientation (strand).
408             This is not explicitly data collection, but often used in conjunction
409             with such. Provide an integer representing the relative position point:
410              
411             =over 4
412              
413             =item * C<3> representing C<3'> end coordinate
414              
415             =item * C<4> representing mid point coordinate
416              
417             =item * C<5> representing C<5'> end coordinate
418              
419             =item * C<9> representing peak summit in F formatted files
420              
421             =back
422              
423             If necessary, an array or array reference may be provided as an
424             alternative parameter with keys including C, C,
425             C, and C if alternate or adjusted
426             coordinates should be used instead of the given row feature coordinates.
427              
428             =item get_score
429              
430             my $score = $row->get_score(
431             dataset => 'scores.bw',
432             method => 'max',
433             );
434              
435             This method collects a single score over the feature or interval.
436             Usually a mathematical or statistical value is employed to derive the
437             single score. Pass an array of key value pairs to control data collection.
438             Keys include the following:
439              
440             =over 4
441              
442             =item db
443              
444             =item ddb
445              
446             Specify a Bio::DB database from which to collect the data. The default
447             value is the database specified in the Data table metadata, if present.
448             Examples include a L or L
449             database.
450              
451             =item dataset
452              
453             Specify the name of the dataset. If a database was specified, then this
454             value would be the C or C feature found in the
455             database. Otherwise, the name of a data file, such as a bam, bigWig,
456             bigBed, or USeq file, would be provided here. This options is required!
457              
458             =item method
459              
460             Specify the mathematical or statistical method combining multiple scores
461             over the interval into one value. Options include the following:
462              
463             =over 4
464              
465             =item * mean
466              
467             =item * sum
468              
469             =item * min
470              
471             =item * max
472              
473             =item * median
474              
475             =item * count
476              
477             Count all overlapping items.
478              
479             =item * pcount
480              
481             Precisely count only containing (not overlapping) items.
482              
483             =item * ncount
484              
485             Count overlapping unique names only.
486              
487             =item * range
488              
489             The difference between minimum and maximum values.
490              
491             =item * stddev
492              
493             Standard deviation.
494              
495             =back
496              
497             =item strandedness
498              
499             Specify what strand from which the data should be taken, with respect
500             to the Feature strand. Three options are available. Only really relevant
501             for data sources that support strand.
502              
503             =over 4
504              
505             =item * sense
506              
507             The same strand as the Feature.
508              
509             =item * antisense
510              
511             The opposite strand as the Feature.
512              
513             =item * all
514              
515             Strand is ignored, all is taken (default).
516              
517             =back
518              
519             =item subfeature
520              
521             Specify the subfeature type from which to collect the scores. Typically
522             a SeqFeature object representing a transcript is provided, and the
523             indicated subfeatures are collected from object. Pass the name of the
524             subfeature to use. Accepted values include the following.
525              
526             =over 4
527              
528             =item * exon
529              
530             =item * cds
531              
532             =item * 5p_utr
533              
534             =item * 3p_utr
535              
536             =item * intron
537              
538             =back
539              
540             =item extend
541              
542             Specify the number of basepairs that the Data table Feature's
543             coordinates should be extended in both directions. Ignored
544             when used with the subfeature option.
545              
546             =item seq_id
547              
548             =item chromo
549              
550             =item start
551              
552             =item end
553              
554             =item stop
555              
556             =item strand
557              
558             Optionally specify zero or more alternate coordinates to use.
559             By default, these are obtained from the Data table Feature.
560              
561             =back
562            
563             =item get_relative_point_position_scores
564              
565             while (my $row = $stream->next_row) {
566             my $pos2score = $row->get_relative_point_position_scores(
567             'ddb' => '/path/to/BigWigSet/',
568             'dataset' => 'MyData',
569             'position' => 5,
570             'extend' => 1000,
571             );
572             }
573              
574             This method collects indexed position scores centered around a
575             specific reference point. The returned data is a hash of
576             relative positions (example -20, -10, 1, 10, 20) and their score
577             values. Pass an array of key value pairs to control data collection.
578             Keys include the following:
579              
580             =over 4
581              
582             =item db
583              
584             =item ddb
585              
586             Specify a Bio::DB database from which to collect the data. The default
587             value is the database specified in the Data table metadata, if present.
588             Examples include a L or L
589             database.
590              
591             =item dataset
592              
593             Specify the name of the dataset. If a database was specified, then this
594             value would be the C or C feature found in the
595             database. Otherwise, the name of a data file, such as a bam, bigWig,
596             bigBed, or USeq file, would be provided here. This options is required!
597              
598             =item position
599              
600             Indicate the position of the reference point relative to the Data table
601             Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
602             midpoint (get it? it's between 5 and 3). Default is 5.
603              
604             =item extend
605              
606             Indicate the number of base pairs to extend from the reference coordinate.
607             This option is required!
608              
609             =item coordinate
610              
611             Optionally provide the real chromosomal coordinate as the reference point.
612              
613             =item absolute
614              
615             Boolean option to indicate that the returned hash of positions and scores
616             should not be transformed into relative positions but kept as absolute
617             chromosomal coordinates.
618              
619             =item avoid
620              
621             Provide a C or C database feature type to avoid overlapping
622             scores. Each found score is checked for overlapping features and is
623             discarded if found to do so. The database should be set to use this.
624              
625             =item strandedness
626              
627             Specify what strand from which the data should be taken, with respect
628             to the Feature strand. Three options are available. Only really relevant
629             for data sources that support strand.
630              
631             =over 4
632              
633             =item * sense
634              
635             The same strand as the Feature.
636              
637             =item * antisense
638              
639             The opposite strand as the Feature.
640              
641             =item * all
642              
643             Strand is ignored, all is taken (default).
644              
645             =back
646              
647             =item method
648              
649             Only required when counting objects.
650              
651             =over 4
652              
653             =item * count
654              
655             Count all overlapping items.
656              
657             =item * pcount
658              
659             Precisely count only containing (not overlapping) items.
660              
661             =item * ncount
662              
663             Count overlapping unique names only.
664              
665             =back
666              
667             =back
668              
669             =item get_region_position_scores
670              
671             while (my $row = $stream->next_row) {
672             my $pos2score = $row->get_relative_point_position_scores(
673             'ddb' => '/path/to/BigWigSet/',
674             'dataset' => 'MyData',
675             'position' => 5,
676             'extend' => 1000,
677             );
678             }
679              
680             This method collects indexed position scores across a defined
681             region or interval. The returned data is a hash of positions and
682             their score values. The positions are by default relative to a
683             region coordinate, usually to the 5' end. Pass an array of key value
684             pairs to control data collection. Keys include the following:
685              
686             =over 4
687              
688             =item db
689              
690             =item ddb
691              
692             Specify a Bio::DB database from which to collect the data. The default
693             value is the database specified in the Data table metadata, if present.
694             Examples include a L or L
695             database.
696              
697             =item dataset
698              
699             Specify the name of the dataset. If a database was specified, then this
700             value would be the C or C feature found in the
701             database. Otherwise, the name of a data file, such as a bam, bigWig,
702             bigBed, or USeq file, would be provided here. This options is required!
703              
704             =item subfeature
705              
706             Specify the subfeature type from which to collect the scores. Typically
707             a SeqFeature object representing a transcript is provided, and the
708             indicated subfeatures are collected from object. When converting to
709             relative coordinates, the coordinates will be relative to the length of
710             the sum of the subfeatures, i.e. the length of the introns will be ignored.
711              
712             Pass the name of the subfeature to use. Accepted values include the following.
713              
714             =over 4
715              
716             =item * exon
717              
718             =item * cds
719              
720             =item * 5p_utr
721              
722             =item * 3p_utr
723              
724             =item * intron
725              
726             =back
727              
728             =item extend
729              
730             Specify the number of basepairs that the Data table Feature's
731             coordinates should be extended in both directions.
732              
733             =item seq_id
734              
735             =item chromo
736              
737             =item start
738              
739             =item end
740              
741             =item stop
742              
743             =item strand
744              
745             Optionally specify zero or more alternate coordinates to use.
746             By default, these are obtained from the Data table Feature.
747              
748             =item position
749              
750             Indicate the position of the reference point relative to the Data table
751             Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
752             midpoint (get it? it's between 5 and 3). Default is 5.
753              
754             =item coordinate
755              
756             Optionally provide the real chromosomal coordinate as the reference point.
757              
758             =item absolute
759              
760             Boolean option to indicate that the returned hash of positions and scores
761             should not be transformed into relative positions but kept as absolute
762             chromosomal coordinates.
763              
764             =item avoid
765              
766             Provide a C or C database feature type to avoid overlapping
767             scores. Each found score is checked for overlapping features and is
768             discarded if found to do so. The database should be set to use this.
769              
770             =item strandedness
771              
772             Specify what strand from which the data should be taken, with respect
773             to the Feature strand. Three options are available. Only really relevant
774             for data sources that support strand.
775              
776             =over 4
777              
778             =item * sense
779              
780             The same strand as the Feature.
781              
782             =item * antisense
783              
784             The opposite strand as the Feature.
785              
786             =item * all
787              
788             Strand is ignored, all is taken (default).
789              
790             =back
791              
792             =item method
793              
794             Only required when counting objects.
795              
796             =over 4
797              
798             =item * count
799              
800             Count all overlapping items.
801              
802             =item * pcount
803              
804             Precisely count only containing (not overlapping) items.
805              
806             =item * ncount
807              
808             Count overlapping unique names only.
809              
810             =back
811              
812             =back
813              
814             =item fetch_alignments
815              
816             my $sam = $Data->open_database('/path/to/file.bam');
817             my $alignment_data = { mapq => [] };
818             my $callback = sub {
819             my ($a, $data) = @_;
820             push @{ $data->{mapq} }, $a->qual;
821             };
822             while (my $row = $stream->next_row) {
823             $row->fetch_alignments(
824             'db' => $sam,
825             'data' => $alignment_data,
826             'callback' => $callback,
827             );
828             }
829              
830             This function allows you to iterate over alignments in a Bam file,
831             allowing custom information to be collected based on a callback
832             code reference that is provided.
833              
834             Three parameters are required: C, C, and C. A
835             true value (1) is returned upon success.
836              
837             =over 4
838              
839             =item db
840              
841             Provide an opened, high-level, Bam database object.
842              
843             =item callback
844              
845             Provide a code callback reference to use when iterating over the
846             alignments. Two objects are passed to this code function: the
847             alignment object and the data structure that is provided. See the
848             Bam adapter documentation for details on low-level C through
849             the Bam index object for details.
850              
851             =item data
852              
853             This is a reference to a C data object for storing information.
854             It is passed to the callback function along with the alignment. Three
855             new key =E value pairs are automatically added: C, C,
856             and C. These correspond to the values for the current queried
857             interval. Coordinates are automatically transformed to 0-base coordinate
858             system to match low level alignment objects.
859              
860             =item subfeature
861              
862             If the feature has subfeatures, such as exons, introns, etc., pass
863             the name of the subfeature to restrict iteration only over the
864             indicated subfeatures. The C object will inherit the coordinates
865             for each subfeatures. Allowed subfeatures include the following:
866              
867             =over 4
868              
869             =item * exon
870              
871             =item * cds
872              
873             =item * 5p_utr
874              
875             =item * 3p_utr
876              
877             =item * intron
878              
879             =back
880              
881             =item start
882              
883             =item stop
884              
885             =item end
886              
887             Provide alternate, custom start and stop coordinates for the row
888             feature. Ignored with subfeatures.
889              
890             =back
891              
892             =back
893              
894             =head2 Feature Export
895              
896             These methods allow the feature to be exported in industry standard
897             formats, including the BED format and the GFF format. Both methods
898             return a formatted tab-delimited text string suitable for printing to
899             file. The string does not include a line ending character.
900              
901             These methods rely on coordinates being present in the source table.
902             If the row feature represents a database item, the L method
903             should be called prior to these methods, allowing the feature to be
904             retrieved from the database and coordinates obtained.
905              
906             =over 4
907              
908             =item bed_string
909              
910             Returns a BED formatted string. By default, a 6-element string is
911             generated, unless otherwise specified. Pass an array of key values
912             to control how the string is generated. The following arguments
913             are supported.
914              
915             =over 4
916              
917             =item bed
918              
919             Specify the number of BED elements to include. The number of elements
920             correspond to the number of columns in the BED file specification. A
921             minimum of 3 (chromosome, start, stop) is required, and maximum of 6
922             is allowed (chromosome, start, stop, name, score, strand).
923              
924             =item chromo
925              
926             =item seq_id
927              
928             Provide a text string of an alternative chromosome or sequence name.
929              
930             =item start
931              
932             =item stop
933              
934             =item end
935              
936             Provide alternative integers for the start and stop coordinates.
937             Note that start values are automatically converted to 0-base
938             by subtracting 1.
939              
940             =item strand
941              
942             Provide alternate an alternative strand value.
943              
944             =item name
945              
946             Provide an alternate or missing name value to be used as text in the 4th
947             column. If no name is provided or available, a default name is generated.
948              
949             =item score
950              
951             Provide a numerical value to be included as the score. BED files typically
952             use integer values ranging from 1..1000.
953              
954             =back
955              
956             =item gff_string
957              
958             Returns a GFF3 formatted string. Pass an array of key values
959             to control how the string is generated. The following arguments
960             are supported.
961              
962             =over 4
963              
964             =item chromo
965              
966             =item seq_id
967              
968             =item start
969              
970             =item stop
971              
972             =item end
973              
974             =item strand
975              
976             Provide alternate values from those defined or missing in the current
977             row Feature.
978              
979             =item source
980              
981             Provide a text string to be used as the source_tag value in the 2nd
982             column. The default value is null ".".
983              
984             =item primary_tag
985              
986             Provide a text string to be used as the primary_tag value in the 3rd
987             column. The default value is null ".".
988              
989             =item type
990              
991             Provide a text string. This can be either a "primary_tag:source_tag" value
992             as used by GFF based BioPerl databases, or "primary_tag" alone.
993              
994             =item score
995              
996             Provide a numerical value to be included as the score. The default
997             value is null ".".
998              
999             =item name
1000              
1001             Provide alternate or missing name value to be used as the display_name.
1002             If no name is provided or available, a default name is generated.
1003              
1004             =item attributes
1005              
1006             Provide an anonymous array reference of one or more row Feature indices
1007             to be used as GFF attributes. The name of the column is used as the GFF
1008             attribute key.
1009              
1010             =back
1011              
1012             =back
1013              
1014             =cut
1015              
1016 3     3   16 use strict;
  3         4  
  3         87  
1017 3     3   12 use Carp qw(carp cluck croak confess);
  3         5  
  3         180  
1018 3     3   13 use Module::Load;
  3         17  
  3         22  
1019 3         145 use Bio::ToolBox::db_helper qw(
1020             get_db_feature
1021             get_segment_score
1022             calculate_score
1023             get_genomic_sequence
1024             low_level_bam_fetch
1025 3     3   134 );
  3         6  
1026 3     3   13 use Bio::ToolBox::db_helper::constants;
  3         6  
  3         19848  
1027              
1028             my $GENETOOL_LOADED = 0;
1029             1;
1030              
1031              
1032              
1033             ### Initialization
1034              
1035             sub new {
1036             # this should ONLY be called from Bio::ToolBox::Data* iterators
1037 353     353 0 374 my $class = shift;
1038 353         581 my %self = @_;
1039             # we trust that new is called properly with data and index values
1040 353         776 return bless \%self, $class;
1041             }
1042              
1043              
1044             ### Set and retrieve values
1045              
1046             sub data {
1047 0     0 1 0 return shift->{data};
1048             }
1049              
1050             sub column_name {
1051 0     0 1 0 my ($self, $column) = @_;
1052 0 0       0 return unless defined $column;
1053 0         0 return $self->{data}->name($column);
1054             }
1055              
1056             sub feature_type {
1057 24     24 1 30 my $self = shift;
1058 24 50       38 carp "feature_type is a read only method" if @_;
1059 24         60 return $self->{data}->feature_type;
1060             }
1061              
1062             sub row_index {
1063 65     65 1 63 my $self = shift;
1064 65 50       100 carp "row_index is a read only method" if @_;
1065 65         137 return $self->{'index'};
1066             }
1067              
1068             sub line_number {
1069 0     0 0 0 my $self = shift;
1070 0 0       0 carp "line_number is a read only method" if @_;
1071 0 0       0 if (exists $self->{data}->{line_count}) {
    0          
1072 0         0 return $self->{data}->{line_count};
1073             }
1074             elsif (exists $self->{data}->{header_line_count}) {
1075 0         0 return $self->{data}->{header_line_count} + $self->row_index;
1076             }
1077             else {
1078 0         0 return;
1079             }
1080             }
1081              
1082             sub row_values {
1083 196     196 1 252 my $self = shift;
1084 196 50       242 carp "row_values is a read only method" if @_;
1085 196         197 my $row = $self->{'index'};
1086 196         174 my @data = @{ $self->{data}->{data_table}->[$row] };
  196         373  
1087 196 100       486 return wantarray ? @data : \@data;
1088             }
1089              
1090             sub value {
1091 341     341 1 1378 my ($self, $column, $value) = @_;
1092 341 50       455 return unless defined $column;
1093 341         382 my $row = $self->{'index'};
1094            
1095 341 100       406 if (defined $value) {
1096             # set a value
1097 85         104 $self->{data}->{data_table}->[$row][$column] = $value;
1098             }
1099 341         473 my $v = $self->{data}->{data_table}->[$row][$column];
1100 341 100       721 return length($v) ? $v : '.'; # internal null value, inherited from GFF definition
1101             }
1102              
1103             sub seq_id {
1104 21     21 1 300 my $self = shift;
1105 21 100       44 if ($_[0]) {
1106 1         4 my $i = $self->{data}->chromo_column;
1107             # update only if we have an actual column
1108 1 50       3 if (defined $i) {
    0          
1109 1         4 $self->value($i, $_[0]);
1110 1         2 $self->{seqid} = $_[0];
1111             }
1112             elsif (exists $self->{feature}) {
1113 0         0 carp "Unable to update seq_id for parsed SeqFeature objects";
1114             }
1115             else {
1116 0         0 carp "No Chromosome column to update!";
1117             }
1118             }
1119            
1120             # return cached value
1121 21 100       71 return $self->{seqid} if exists $self->{seqid};
1122            
1123             # get seq_id value
1124 6         10 my $c;
1125 6         19 my $i = $self->{data}->chromo_column;
1126 6         30 my $j = $self->{data}->coord_column;
1127 6 50       23 if (defined $i) {
    0          
    0          
    0          
1128             # collect from table
1129 6         15 $c = $self->value($i);
1130             }
1131             elsif (exists $self->{feature}) {
1132             # seqfeature
1133 0         0 $c = $self->{feature}->seq_id;
1134             }
1135             elsif (defined $j) {
1136 0         0 $self->_from_coordinate_string($j);
1137 0         0 return $self->{seqid};
1138             }
1139             elsif ($self->feature_type eq 'named') {
1140 0         0 my $f = $self->seqfeature;
1141 0 0       0 if ($f) {
1142 0         0 $c = $f->seq_id;
1143             }
1144             }
1145 6 50       22 $self->{seqid} = $c if defined $c;
1146 6         18 return $c;
1147             }
1148              
1149             sub start {
1150 119     119 1 1038 my $self = shift;
1151 119 100       179 if ($_[0]) {
1152 5         16 my $i = $self->{data}->start_column;
1153             # update only if we have an actual column
1154 5 50       34 my $d = $_[0] =~ /^\d+$/ ? 1 : 0; # looks like an integer
1155 5 100 66     25 if (defined $i and $d) {
    50          
    50          
1156 1 50       4 if ($self->{data}->interbase) {
1157             # compensate for 0-based, assuming we're always working with 1-based
1158 1         4 $self->value($i, $_[0] - 1);
1159 1         2 $self->{start} = $_[0];
1160             }
1161             else {
1162 0         0 $self->value($i, $_[0]);
1163 0         0 $self->{start} = $_[0];
1164             }
1165             }
1166             elsif (not $d) {
1167 0         0 carp "Start coordinate value is not an integer";
1168             }
1169             elsif (exists $self->{feature}) {
1170 4         480 carp "Unable to update Start coordinate for parsed SeqFeature objects";
1171             }
1172             else {
1173 0         0 carp "No Start coordinate column to update!";
1174             }
1175             }
1176            
1177             # return cached value
1178 119 100       496 return $self->{start} if exists $self->{start};
1179            
1180             # get start value
1181 92         85 my $s;
1182 92         157 my $i = $self->{data}->start_column;
1183 92         168 my $j = $self->{data}->coord_column;
1184 92 100       130 if (defined $i) {
    50          
    0          
    0          
1185             # collect from table
1186 87         107 $s = $self->value($i);
1187 87 100       150 if ($self->{data}->interbase) {
1188             # compensate for 0-based index
1189 6         18 $s += 1;
1190             }
1191             }
1192             elsif (exists $self->{feature}) {
1193             # seqfeature
1194 5         17 $s = $self->{feature}->start;
1195             }
1196             elsif (defined $j) {
1197 0         0 $self->_from_coordinate_string($j);
1198 0         0 return $self->{start};
1199             }
1200             elsif ($self->feature_type eq 'named') {
1201 0         0 my $f = $self->seqfeature;
1202 0 0       0 if ($f) {
1203 0         0 $s = $f->start;
1204             }
1205             }
1206 92 50       149 $self->{start} = $s if defined $s;
1207 92         194 return $s;
1208             }
1209              
1210             *stop = \&end;
1211             sub end {
1212 38     38 1 66 my $self = shift;
1213 38 100       73 if ($_[0]) {
1214 2         8 my $i = $self->{data}->stop_column;
1215             # update only if we have an actual column
1216 2 50       14 my $d = $_[0] =~ /^\d+$/ ? 1 : 0;
1217 2 50 33     10 if (defined $i and $d) {
    0          
    0          
1218 2         7 $self->value($i, $_[0]);
1219 2         3 $self->{end} = $_[0];
1220             }
1221             elsif (not $d) {
1222 0         0 carp "End coordinate value is not an integer";
1223             }
1224             elsif (exists $self->{feature}) {
1225 0         0 carp "Unable to update End coordinate for parsed SeqFeature objects";
1226             }
1227             else {
1228 0         0 carp "No End coordinate column to update!";
1229             }
1230             }
1231            
1232             # return cached value
1233 38 100       113 return $self->{end} if exists $self->{end};
1234            
1235             # get end value
1236 11         13 my $e;
1237 11         43 my $i = $self->{data}->stop_column;
1238 11         27 my $j = $self->{data}->coord_column;
1239 11 100       29 if (defined $i) {
    50          
    0          
    0          
1240             # collect from table
1241 10         23 $e = $self->value($i);
1242             }
1243             elsif (exists $self->{feature}) {
1244             # seqfeature
1245 1         4 $e = $self->{feature}->end;
1246             }
1247             elsif (defined $j) {
1248 0         0 $self->_from_coordinate_string($j);
1249 0         0 return $self->{end};
1250             }
1251             elsif ($self->feature_type eq 'named') {
1252 0         0 my $f = $self->seqfeature;
1253 0 0       0 if ($f) {
1254 0         0 $e = $f->end;
1255             }
1256             }
1257 11 50       35 $self->{end} = $e if defined $e;
1258 11         32 return $e;
1259             }
1260              
1261             sub strand {
1262 30     30 1 49 my $self = shift;
1263 30 100       55 if ($_[0]) {
1264             # update only if we have an actual column
1265 1         5 my $i = $self->{data}->strand_column;
1266 1 50       4 if (defined $i) {
    50          
1267 0         0 $self->value($i, $_[0]);
1268 0         0 $self->{strand} = $self->_strand($_[0]);
1269             }
1270             elsif (exists $self->{feature}) {
1271 0         0 carp "Unable to update Strand for parsed SeqFeature objects";
1272             }
1273             else {
1274 1         123 carp "No Strand column to update!";
1275             }
1276             }
1277            
1278             # return cached value
1279 30 100       161 return $self->{strand} if exists $self->{strand};
1280            
1281             # get strand value
1282 8         10 my $s = 0;
1283 8 100       36 if (my $i = $self->{data}->strand_column) {
    100          
    50          
1284             # collect from table
1285 2         6 $s = $self->_strand( $self->value($i) );
1286             }
1287             elsif (exists $self->{feature}) {
1288             # seqfeature
1289 5         15 $s = $self->{feature}->strand;
1290             }
1291             elsif ($self->feature_type eq 'named') {
1292 0         0 my $f = $self->seqfeature;
1293 0 0       0 if ($f) {
1294 0         0 $s = $f->strand;
1295             }
1296             }
1297 8         15 $self->{strand} = $s;
1298 8         23 return $s;
1299             }
1300              
1301             sub _strand {
1302 2     2   4 my $self = shift;
1303 2         3 my $str = shift;
1304 2 50 33     31 if ($str eq '1' or $str eq '-1' or $str eq '0') {
    50 33        
    50          
    0          
    0          
    0          
1305 0         0 return $str;
1306             }
1307             elsif ($str eq '+') {
1308 0         0 return 1;
1309             }
1310             elsif ($str eq '-') {
1311 2         6 return -1;
1312             }
1313             elsif ($str eq '.') {
1314 0         0 return 0;
1315             }
1316             elsif ($str eq '2') {
1317             # R packages use 1 for + and 2 for -
1318 0         0 return -1;
1319             }
1320             elsif ($str eq '*') {
1321             # R packages use * for .
1322 0         0 return 0;
1323             }
1324             else {
1325 0         0 return 0;
1326             }
1327             }
1328              
1329             sub _from_coordinate_string {
1330 0     0   0 my ($self, $i) = shift;
1331 0         0 my ($chr, $start, $end, $str) = split /(?:\-|\.\.|\s)/, $self->value($i);
1332 0 0       0 $self->{seqid} = $chr unless exists $self->{seqid};
1333 0 0       0 $self->{start} = $start unless exists $self->{start};
1334             # we assume this is a 1-based coordinate
1335 0 0       0 $self->{end} = $end unless exists $self->{end};
1336 0 0 0     0 if (defined $str and not exists $self->{strand}) {
1337             # you never know, the strand may be added to the string
1338 0         0 $self->{strand} = $self->_strand($str);
1339             }
1340             }
1341              
1342             sub peak {
1343 4     4 1 305 my $self = shift;
1344 4 100       15 if ($self->{data}->format eq 'narrowPeak') {
1345 3 100 66     15 if (exists $self->{feature} and $self->{feature}->has_tag('peak')) {
1346 2         6 return $self->{feature}->get_tag_values('peak') + $self->{feature}->start;
1347             }
1348             else {
1349 1         4 return $self->value(1) + $self->value(9) + 1;
1350             }
1351             }
1352             else {
1353 1         3 return $self->midpoint;
1354             }
1355             }
1356              
1357             sub midpoint {
1358 5     5 1 10 my $self = shift;
1359 5         14 my $s = $self->start;
1360 5         15 my $e = $self->end;
1361 5 50 33     19 if ($s and $e) {
1362 5         28 return int( ($s + $e) / 2 );
1363             }
1364             else {
1365 0         0 return undef;
1366             }
1367             }
1368              
1369             *name = \&display_name;
1370             sub display_name {
1371 76     76 1 1246 my $self = shift;
1372 76 100       113 if ($_[0]) {
1373             # update only if we have an actual column
1374 1         4 my $i = $self->{data}->name_column;
1375 1 50       4 if (defined $i) {
    0          
1376 1         4 return $self->value($i, $_[0]);
1377             }
1378             elsif (exists $self->{feature}) {
1379 0         0 carp "Unable to update display_name for parsed SeqFeature objects";
1380             }
1381             else {
1382 0         0 carp "No Name column to update!";
1383             }
1384             }
1385             # seqfeature
1386 75 100       140 if (exists $self->{feature}) {
1387 5         17 return $self->{feature}->display_name;
1388             }
1389             # collect from table
1390 70         146 my $i = $self->{data}->name_column;
1391 70 50       99 if (defined $i) {
    0          
1392 70         107 return $self->value($i);
1393             }
1394             elsif (my $att = $self->gff_attributes) {
1395             return $att->{Name} || $att->{ID} || $att->{transcript_name} ||
1396 0   0     0 $att->{gene_name} || undef;
1397             }
1398             }
1399              
1400             sub coordinate {
1401 3     3 1 7 my $self = shift;
1402 3 50       9 carp "name is a read only method" if @_;
1403             # to avoid auto-converting start0 coordinates, which might confuse people or programs,
1404             # we will take the start value as is when it's available, otherwise calculate start
1405 3         12 my $start_i = $self->{data}->start_column;
1406             my $coord = sprintf("%s:%d", $self->seq_id,
1407             defined $start_i ? $self->value($start_i) :
1408 3 0       10 exists $self->{feature} ? $self->{feature}->start : 0);
    50          
1409 3         8 my $end = $self->end;
1410 3 50       10 $coord .= "-$end" if $end;
1411 3 50       16 return CORE::length($coord) > 2 ? $coord : undef;
1412             }
1413              
1414             sub type {
1415 9     9 1 16 my $self = shift;
1416 9 100       24 if ($_[0]) {
1417             # update only if we have an actual column
1418 1         4 my $i = $self->{data}->type_column;
1419 1 50       6 if (defined $i) {
    50          
1420 0         0 return $self->value($i, $_[0]);
1421             }
1422             elsif (exists $self->{feature}) {
1423 0         0 carp "Unable to update primary_tag for parsed SeqFeature objects";
1424             }
1425             else {
1426 1         52 carp "No Type column to update!";
1427             }
1428             }
1429             # collect from table
1430 9         85 my $i = $self->{data}->type_column;
1431 9 50       18 if (defined $i) {
1432 0         0 return $self->value($i);
1433             }
1434             # seqfeature
1435 9 100       20 if (exists $self->{feature}) {
1436 4         15 return $self->{feature}->primary_tag;
1437             }
1438             # general metadata
1439 5 50       11 if ($self->{data}->feature) {
1440 5         10 return $self->{data}->feature;
1441             }
1442 0         0 return undef;
1443             }
1444              
1445             *id = \&primary_id;
1446             sub primary_id {
1447 66     66 1 71 my $self = shift;
1448 66 50       88 carp "id is a read only method" if @_;
1449 66         130 my $i = $self->{data}->id_column;
1450 66 50       118 my $v = $self->value($i) if defined $i;
1451 66 50 33     156 if (defined $v and $v ne '.') {
1452 66         135 return $v;
1453             }
1454 0 0       0 return $self->{feature}->primary_id if exists $self->{feature};
1455 0 0       0 if (my $att = $self->gff_attributes) {
1456 0   0     0 return $att->{ID} || $att->{Name} || $att->{transcript_id};
1457             }
1458 0         0 return undef;
1459             }
1460              
1461             sub length {
1462 2     2 1 4 my $self = shift;
1463 2 50       5 carp "length is a read only method" if @_;
1464 2 50       7 if ($self->{data}->vcf) {
1465             # special case for vcf files, measure the length of the ALT allele
1466 0         0 return CORE::length($self->value(4));
1467             }
1468 2         5 my $s = $self->start;
1469 2         4 my $e = $self->end;
1470 2 50 33     9 if (defined $s and defined $e) {
    0          
1471 2         7 return $e - $s + 1;
1472             }
1473             elsif (defined $s) {
1474 0         0 return 1;
1475             }
1476             else {
1477 0         0 return undef;
1478             }
1479             }
1480              
1481             sub score {
1482 0     0 1 0 my $self = shift;
1483 0         0 my $c = $self->{data}->score_column;
1484 0 0       0 return defined $c ? $self->value($c) : undef;
1485             }
1486              
1487             sub attributes {
1488 0     0 1 0 my $self = shift;
1489 0 0       0 return $self->gff_attributes if ($self->{data}->gff);
1490 0 0       0 return $self->vcf_attributes if ($self->{data}->vcf);
1491 0         0 return;
1492             }
1493              
1494             sub gff_attributes {
1495 1     1 1 2 my $self = shift;
1496 1 50       5 return unless ($self->{data}->gff);
1497 1 50       4 return $self->{attributes} if (exists $self->{attributes});
1498 1         2 $self->{attributes} = {};
1499 1         3 foreach my $g (split(/\s*;\s*/, $self->value(8))) {
1500 3         19 my ($tag, $value) = split /\s+|=/, $g;
1501 3 50 33     9 next unless ($tag and $value);
1502             # unescape URL encoded values, borrowed from Bio::DB::GFF
1503 3         5 $value =~ tr/+/ /;
1504 3         7 $value =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  11         25  
1505 3         9 $self->{attributes}->{$tag} = $value;
1506             }
1507 1         3 return $self->{attributes};
1508             }
1509              
1510             sub vcf_attributes {
1511 0     0 1 0 my $self = shift;
1512 0 0       0 return unless ($self->{data}->vcf);
1513 0 0       0 return $self->{attributes} if (exists $self->{attributes});
1514 0         0 $self->{attributes} = {};
1515            
1516             # INFO attributes
1517 0         0 my %info;
1518 0 0       0 if ($self->{data}->name(7) eq 'INFO') {
1519 0 0       0 %info = map {$_->[0] => defined $_->[1] ? $_->[1] : undef}
1520             # some tags are simple and have no value, eg SOMATIC
1521 0         0 map { [split(/=/, $_)] }
  0         0  
1522             split(/;/, $self->value(7));
1523             }
1524 0         0 $self->{attributes}->{INFO} = \%info;
1525 0         0 $self->{attributes}->{7} = \%info;
1526            
1527             # Sample attributes
1528 0 0       0 if ($self->{data}->number_columns > 8) {
1529 0         0 my @formatKeys = split /:/, $self->value(8);
1530 0         0 foreach my $i (9 .. $self->{data}->last_column) {
1531 0         0 my $name = $self->{data}->name($i);
1532 0         0 my @sampleVals = split /:/, $self->value($i);
1533             my %sample = map {
1534 0 0       0 $formatKeys[$_] => defined $sampleVals[$_] ? $sampleVals[$_] : undef }
  0         0  
1535             (0 .. $#formatKeys);
1536 0         0 $self->{attributes}->{$name} = \%sample;
1537 0         0 $self->{attributes}->{$i} = \%sample;
1538             }
1539             }
1540 0         0 return $self->{attributes};
1541             }
1542              
1543             sub rewrite_attributes {
1544 0     0 1 0 my $self = shift;
1545 0 0       0 return $self->rewrite_gff_attributes if ($self->{data}->gff);
1546 0 0       0 return $self->rewrite_vcf_attributes if ($self->{data}->vcf);
1547 0         0 return;
1548             }
1549              
1550             sub rewrite_gff_attributes {
1551 1     1 1 773 my $self = shift;
1552 1 50       3 return unless ($self->{data}->gff);
1553 1 50       4 return unless exists $self->{attributes};
1554 1         3 my @pairs; # of key=value items
1555 1 50       3 if (exists $self->{attributes}{ID}) {
1556             # I assume this does not need to be escaped!
1557 1         3 push @pairs, 'ID=' . $self->{attributes}{ID};
1558             }
1559 1 50       3 if (exists $self->{attributes}{Name}) {
1560 1         3 my $name = $self->{attributes}{Name};
1561 1         3 $name =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
1562 1         2 push @pairs, "Name=$name";
1563             }
1564 1         1 foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes} }) {
  3         8  
  1         7  
1565 3 100       6 next if $key eq 'ID';
1566 2 100       4 next if $key eq 'Name';
1567 1         2 my $value = $self->{attributes}{$key};
1568 1         3 $key =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
1569 1         3 $value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  8         21  
1570 1         4 push @pairs, "$key=$value";
1571             }
1572 1         5 $self->value(8, join("; ", @pairs));
1573 1         4 return 1;
1574             }
1575              
1576             sub rewrite_vcf_attributes {
1577 0     0 1 0 my $self = shift;
1578 0 0       0 return unless ($self->{data}->vcf);
1579 0 0       0 return unless exists $self->{attributes};
1580            
1581             # INFO
1582             my $info = join(';',
1583             map {
1584             defined $self->{attributes}->{INFO}{$_} ?
1585 0 0       0 join('=', $_, $self->{attributes}->{INFO}{$_}) : $_
1586             }
1587 0         0 sort {$a cmp $b}
1588 0         0 keys %{$self->{attributes}->{INFO}}
  0         0  
1589             );
1590 0   0     0 $info ||= '.'; # sometimes we have nothing left
1591 0         0 $self->value(7, $info);
1592            
1593             # FORMAT
1594 0         0 my @order;
1595 0 0       0 push @order, 'GT' if exists $self->{attributes}{9}{GT};
1596 0         0 foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes}{9} } ) {
  0         0  
  0         0  
1597 0 0       0 next if $key eq 'GT';
1598 0         0 push @order, $key;
1599             }
1600 0 0       0 if (@order) {
1601 0         0 $self->value(8, join(':', @order));
1602             }
1603             else {
1604 0         0 $self->value(8, '.');
1605             }
1606            
1607             # SAMPLES
1608 0         0 foreach my $i (9 .. $self->{data}->last_column) {
1609 0 0       0 if (@order) {
1610             $self->value($i, join(":",
1611 0         0 map { $self->{attributes}{$i}{$_} } @order
  0         0  
1612             ) );
1613             }
1614             else {
1615 0         0 $self->value($i, '.');
1616             }
1617             }
1618 0         0 return 1;
1619             }
1620              
1621             ### Data collection convenience methods
1622              
1623             *feature = \&seqfeature;
1624             sub seqfeature {
1625 4     4 1 4 my $self = shift;
1626 4   50     11 my $force = shift || 0;
1627 4 50       10 carp "feature is a read only method" if @_;
1628 4 50       10 return $self->{feature} if exists $self->{feature};
1629             # normally this is only for named features in a data table
1630             # skip this for coordinate features like bed files
1631 4 50 33     8 return unless $self->feature_type eq 'named' or $force;
1632            
1633             # retrieve from main Data store
1634 0         0 my $f = $self->{data}->get_seqfeature( $self->{'index'} );
1635 0 0       0 if ($f) {
1636 0         0 $self->{feature} = $f;
1637 0         0 return $f;
1638             }
1639            
1640             # retrieve the feature from the database
1641 0 0       0 return unless $self->{data}->database;
1642             $f = get_db_feature(
1643             'db' => $self->{data}->open_meta_database,
1644             'id' => $self->id || undef,
1645             'name' => $self->name || undef,
1646             'type' => $self->type || $self->{data}->feature,
1647 0   0     0 );
      0        
      0        
1648 0 0       0 return unless $f;
1649 0         0 $self->{feature} = $f;
1650 0         0 return $f;
1651             }
1652              
1653             sub segment {
1654 1     1 1 2 my $self = shift;
1655 1 50       4 carp "segment is a read only method" if @_;
1656 1 50       5 return unless $self->{data}->database;
1657 1 50       4 if ($self->feature_type eq 'coordinate') {
    0          
1658 1         3 my $chromo = $self->seq_id;
1659 1         4 my $start = $self->start;
1660 1   33     5 my $stop = $self->end || $start;
1661 1         4 my $db = $self->{data}->open_meta_database;
1662 1 50       8 return $db ? $db->segment($chromo, $start, $stop) : undef;
1663             }
1664             elsif ($self->feature_type eq 'named') {
1665 0         0 my $f = $self->feature;
1666 0 0       0 return $f ? $f->segment : undef;
1667             }
1668             else {
1669 0         0 return undef;
1670             }
1671             }
1672              
1673             sub get_features {
1674 0     0 1 0 my $self = shift;
1675 0         0 my %args = @_;
1676 0   0     0 my $db = $args{db} || $self->{data}->open_meta_database || undef;
1677 0 0       0 carp "no database defined to get features!" unless defined $db;
1678 0 0       0 return unless $db->can('features');
1679            
1680             # convert the argument style for most bioperl db APIs
1681 0         0 my %opts;
1682 0   0     0 $opts{-seq_id} = $args{chromo} || $self->seq_id;
1683 0   0     0 $opts{-start} = $args{start} || $self->start;
1684 0   0     0 $opts{-end} = $args{end} || $self->end;
1685 0   0     0 $opts{-type} = $args{type} || $self->type;
1686            
1687 0         0 return $db->features(%opts);
1688             }
1689              
1690             sub get_sequence {
1691 0     0 1 0 my $self = shift;
1692 0         0 my %args = @_;
1693 0   0     0 my $db = $args{db} || $args{database} || $self->{data}->open_meta_database || undef;
1694             # this will fail immediately if user doesn't provide valid database
1695            
1696             # get sequence over subfeatures
1697 0   0     0 $args{subfeature} ||= undef;
1698 0 0 0     0 if ($self->feature_type eq 'named' and $args{subfeature}) {
1699             # this is more complicated so we have a dedicated method
1700 0         0 return $self->_get_subfeature_sequence($db, \%args);
1701             }
1702            
1703             # get coordinates
1704 0   0     0 my $seqid = $args{seq_id} || $args{chromo} || $self->seq_id;
1705 0   0     0 my $start = $args{start} || $self->start;
1706 0   0     0 my $stop = $args{stop} || $args{end} || $self->end;
1707 0         0 my $strand = $self->strand;
1708 0 0       0 if (exists $args{strand}) {
1709             # user supplied strand, gotta check it
1710 0 0       0 $strand = $args{strand} =~ /\-|r/i ? -1 : 1;
1711             }
1712 0 0 0     0 if (exists $args{extend} and $args{extend}) {
1713 0         0 $start -= $args{extend};
1714 0 0       0 $start = 1 if $start <= 0;
1715 0         0 $stop += $args{extend};
1716             }
1717 0 0 0     0 return unless (defined $seqid and defined $start and defined $stop);
      0        
1718            
1719             # retrieve and return sequence
1720 0         0 my $seq = get_genomic_sequence($db, $seqid, $start, $stop);
1721 0 0       0 if ($strand == -1) {
1722 0         0 $seq =~ tr/gatcGATC/ctagCTAG/;
1723 0         0 $seq = reverse $seq;
1724             }
1725 0         0 return $seq;
1726             }
1727              
1728             sub _get_subfeature_sequence {
1729 0     0   0 my ($self, $db, $args) = @_;
1730            
1731             # get the subfeatures
1732 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1733 0 0       0 unless (@$subfeatures) {
1734 0         0 carp "no subfeatures available! Returning parent sequence!";
1735             # just return the parent
1736 0         0 undef $args->{subfeature};
1737 0         0 return $self->get_sequence(@$args);
1738             }
1739            
1740             # sort subfeatures
1741             # this should be done by GeneTools in most cases but just to be sure
1742             # note that this does NOT merge redundant or overlapping exons!!!!
1743 0         0 my @sorted = map { $_->[0] }
1744 0 0       0 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
1745 0         0 map { [$_, $_->start, $_->end] }
  0         0  
1746             @$subfeatures;
1747            
1748             # collect sequence
1749 0         0 my $sequence;
1750 0         0 foreach my $subf (@sorted) {
1751 0         0 my $seq = get_genomic_sequence($db, $subf->seq_id, $subf->start, $subf->stop);
1752 0         0 $sequence .= $seq;
1753             }
1754            
1755             # flip the sequence
1756 0 0       0 if ($self->strand == -1) {
1757 0         0 $sequence =~ tr/gatcGATC/ctagCTAG/;
1758 0         0 $sequence = reverse $sequence;
1759             }
1760 0         0 return $sequence;
1761             }
1762              
1763             sub _get_subfeatures {
1764 0     0   0 my $self = shift;
1765 0         0 my $subf = lc shift;
1766            
1767             # load GeneTools
1768 0 0       0 unless ($GENETOOL_LOADED) {
1769 0         0 load('Bio::ToolBox::GeneTools', qw(get_exons get_cds get_5p_utrs get_3p_utrs
1770             get_introns));
1771 0 0       0 if ($@) {
1772 0         0 croak "missing required modules! $@";
1773             }
1774             else {
1775 0         0 $GENETOOL_LOADED = 1;
1776             }
1777             }
1778            
1779             # feature
1780 0         0 my $feature = $self->seqfeature;
1781 0 0       0 return unless ($feature);
1782            
1783             # get the subfeatures
1784 0         0 my @subfeatures;
1785 0 0       0 if ($subf eq 'exon') {
    0          
    0          
    0          
    0          
1786 0         0 @subfeatures = get_exons($feature);
1787             }
1788             elsif ($subf eq 'cds') {
1789 0         0 @subfeatures = get_cds($feature);
1790             }
1791             elsif ($subf eq '5p_utr') {
1792 0         0 @subfeatures = get_5p_utrs($feature);
1793             }
1794             elsif ($subf eq '3p_utr') {
1795 0         0 @subfeatures = get_3p_utrs($feature);
1796             }
1797             elsif ($subf eq 'intron') {
1798 0         0 @subfeatures = get_introns($feature);
1799             }
1800             else {
1801 0         0 croak "unrecognized subfeature parameter '$subf'!";
1802             }
1803            
1804 0         0 return \@subfeatures;
1805             }
1806              
1807             sub get_score {
1808 7     7 1 5766 my $self = shift;
1809 7         27 my %args = @_; # passed arguments to this method
1810            
1811             # verify the dataset for the user, cannot trust whether it has been done or not
1812 7   50     42 my $db = $args{ddb} || $args{db} || $self->{data}->open_meta_database || undef;
1813 7         35 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $db);
1814 7 50       17 unless ($args{dataset}) {
1815 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!";
1816             }
1817            
1818             # get positioned scores over subfeatures only
1819 7   50     33 $args{subfeature} ||= q();
1820 7 0 33     17 if ($self->feature_type eq 'named' and $args{subfeature}) {
1821             # this is more complicated so we have a dedicated method
1822 0         0 return $self->_get_subfeature_scores($db, \%args);
1823             }
1824            
1825             # build parameter array to pass on to the adapter
1826 7         10 my @params;
1827            
1828             # verify coordinates based on type of feature
1829 7 50       10 if ($self->feature_type eq 'coordinate') {
    0          
1830             # coordinates are already in the table, use those
1831 7   33     23 $params[CHR] = $args{seq_id} || $self->seq_id;
1832 7   33     38 $params[STRT] = $args{start} || $self->start;
1833 7   33     34 $params[STOP] = $args{stop} || $args{end} || $self->end;
1834             $params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1835 7 50 33     28 $self->strand;
1836             }
1837             elsif ($self->feature_type eq 'named') {
1838             # must retrieve feature from the database first
1839 0         0 my $f = $self->seqfeature;
1840 0 0       0 return unless $f;
1841 0   0     0 $params[CHR] = $args{seq_id} || $f->seq_id;
1842 0   0     0 $params[STRT] = $args{start} || $f->start;
1843 0   0     0 $params[STOP] = $args{stop} || $args{end} || $f->end;
1844             $params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1845 0 0 0     0 $f->strand;
1846             }
1847             else {
1848 0         0 croak "data table does not have identifiable coordinate or feature identification columns for score collection";
1849             }
1850            
1851             # adjust coordinates as necessary
1852 7 0 33     17 if (exists $args{extend} and $args{extend}) {
1853 0         0 $params[STRT] -= $args{extend};
1854 0         0 $params[STOP] += $args{extend};
1855             }
1856            
1857             # check coordinates
1858 7 50       13 $params[STRT] = 1 if $params[STRT] <= 0;
1859 7 50       20 if ($params[STOP] < $params[STRT]) {
1860             # coordinates are flipped, reverse strand
1861 0 0       0 return if ($params[STOP] <= 0);
1862 0         0 my $stop = $params[STRT];
1863 0         0 $params[STRT] = $params[STOP];
1864 0         0 $params[STOP] = $stop;
1865 0         0 $params[STR] = -1;
1866             }
1867 7 50 33     25 return unless ($params[CHR] and defined $params[STRT]);
1868            
1869             # score attributes
1870 7   50     16 $params[METH] = $args{'method'} || 'mean';
1871 7   100     32 $params[STND] = $args{strandedness} || $args{stranded} || 'all';
1872            
1873             # other parameters
1874 7         8 $params[DB] = $db;
1875 7         8 $params[RETT] = 0; # return type should be a calculated value
1876 7         8 $params[DATA] = $args{dataset};
1877            
1878             # get the score
1879 7         20 return get_segment_score(@params);
1880             }
1881              
1882             sub _get_subfeature_scores {
1883 0     0   0 my ($self, $db, $args) = @_;
1884            
1885             # get the subfeatures
1886 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1887 0 0       0 unless (@$subfeatures) {
1888 0         0 carp "no subfeatures available! Returning parent score data!";
1889             # just return the parent
1890 0         0 undef $args->{subfeature};
1891 0 0       0 delete $args->{exon} if exists $args->{exon};
1892 0         0 return $self->get_score(@$args);
1893             }
1894            
1895             # collect over each subfeature
1896 0         0 my @scores;
1897 0         0 foreach my $exon (@$subfeatures) {
1898 0         0 my @params; # parameters to pass on to adapter
1899 0         0 $params[CHR] = $exon->seq_id;
1900 0         0 $params[STRT] = $exon->start;
1901 0         0 $params[STOP] = $exon->end;
1902 0 0       0 $params[STR] = defined $args->{strand} ? $args->{strand} : $exon->strand;
1903 0   0     0 $params[STND] = $args->{strandedness} || $args->{stranded} || 'all';
1904 0   0     0 $params[METH] = $args->{method} || 'mean';
1905 0         0 $params[RETT] = 1; # return type should be an array reference of scores
1906 0         0 $params[DB] = $db;
1907 0         0 $params[DATA] = $args->{dataset};
1908            
1909 0         0 my $exon_scores = get_segment_score(@params);
1910 0 0       0 push @scores, @$exon_scores if defined $exon_scores;
1911             }
1912            
1913             # combine all the scores based on the requested method
1914 0         0 return calculate_score($args->{method}, \@scores);
1915             }
1916              
1917             sub get_relative_point_position_scores {
1918 1     1 1 1832 my $self = shift;
1919 1         5 my %args = @_;
1920            
1921             # get the database and verify the dataset
1922 1   33     12 my $ddb = $args{ddb} || $args{db} || $self->{data}->open_meta_database;
1923 1         5 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1924 1 50       3 unless ($args{dataset}) {
1925 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1926             }
1927            
1928             # assign some defaults
1929 1   50     6 $args{strandedness} ||= $args{stranded} || 'all';
      33        
1930 1   50     4 $args{position} ||= 5;
1931 1   50     5 $args{coordinate} ||= undef;
1932 1   50     4 $args{avoid} ||= undef;
1933 1   50     5 $args{'method'} ||= 'mean'; # in most cases this doesn't do anything
1934 1 50       3 unless ($args{extend}) {
1935 0         0 croak "must provide an extend value!";
1936             }
1937 1 50 33     4 $args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1938            
1939             # determine reference coordinate
1940 1 50       3 unless (defined $args{coordinate}) {
1941 1         4 $args{coordinate} = $self->calculate_reference(\%args);
1942             }
1943            
1944             # build parameter array to pass on to the adapter
1945 1         1 my @params;
1946 1         4 $params[CHR] = $self->seq_id;
1947 1         4 $params[STRT] = $args{coordinate} - $args{extend};
1948 1 50       4 $params[STRT] = 1 if $params[STRT] < 1; # sanity check
1949 1         14 $params[STOP] = $args{coordinate} + $args{extend};
1950 1 50       6 $params[STR] = defined $args{strand} ? $args{strand} : $self->strand;
1951 1         2 $params[STND] = $args{strandedness};
1952 1         2 $params[METH] = $args{'method'};
1953 1         2 $params[RETT] = 2; # return type should be a hash reference of positioned scores
1954 1         2 $params[DB] = $ddb;
1955 1         2 $params[DATA] = $args{dataset};
1956            
1957             # Data collection
1958 1         5 my $pos2data = get_segment_score(@params);
1959            
1960             # Avoid positions
1961 1 50       4 if ($args{avoid}) {
1962 0         0 $self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
1963             }
1964            
1965             # covert to relative positions
1966 1 50       3 if ($args{absolute}) {
1967             # do not convert to relative positions
1968 0 0       0 return wantarray ? %$pos2data : $pos2data;
1969             }
1970             else {
1971             # return the collected dataset hash
1972             return $self->_convert_to_relative_positions($pos2data,
1973 1         4 $args{coordinate}, $params[STR]);
1974             }
1975             }
1976              
1977             sub get_region_position_scores {
1978 4     4 1 7665 my $self = shift;
1979 4         12 my %args = @_;
1980            
1981             # get the database and verify the dataset
1982 4   33     45 my $ddb = $args{ddb} || $args{db} || $self->{data}->open_meta_database;
1983 4         16 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1984 4 50       13 unless ($args{dataset}) {
1985 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1986             }
1987            
1988             # assign some defaults here, in case we get passed on to subfeature method
1989 4   50     25 $args{strandedness} ||= $args{stranded} || 'all';
      33        
1990 4   50     16 $args{extend} ||= 0;
1991 4   50     15 $args{position} ||= 5;
1992 4   100     10 $args{'method'} ||= 'mean'; # in most cases this doesn't do anything
1993 4 50 33     20 $args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1994            
1995             # get positioned scores over subfeatures only
1996 4   50     16 $args{subfeature} ||= q();
1997 4 0 33     12 if ($self->feature_type eq 'named' and $args{subfeature}) {
1998             # this is more complicated so we have a dedicated method
1999 0         0 return $self->_get_subfeature_position_scores(\%args, $ddb);
2000             }
2001            
2002             # Assign coordinates
2003             # build parameter array to pass on to the adapter
2004 4         4 my @params;
2005 4   33     14 my $feature = $self->seqfeature || $self;
2006 4   33     30 $params[CHR] = $args{chromo} || $args{seq_id} || $feature->seq_id;
2007 4   33     15 $params[STRT] = $args{start} || $feature->start;
2008 4   33     20 $params[STOP] = $args{stop} || $args{end} || $feature->end;
2009 4 50       12 $params[STR] = defined $args{strand} ? $args{strand} : $feature->strand;
2010 4 50       12 if ($args{extend}) {
2011 0         0 $params[STRT] -= $args{extend};
2012 0         0 $params[STOP] += $args{extend};
2013 0 0       0 $params[STRT] = 1 if $params[STRT] < 1; # sanity check
2014             }
2015 4         7 $params[STND] = $args{strandedness};
2016 4         8 $params[METH] = $args{method};
2017 4         5 $params[RETT] = 2; # return type should be a hash reference of positioned scores
2018 4         8 $params[DB] = $ddb;
2019 4         4 $params[DATA] = $args{dataset};
2020            
2021             # Data collection
2022 4         14 my $pos2data = get_segment_score(@params);
2023            
2024             # Avoid positions
2025 4 50       23 if ($args{avoid}) {
2026 0         0 $self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
2027             }
2028            
2029             # covert to relative positions
2030 4 50       10 if ($args{absolute}) {
2031             # do not convert to relative positions
2032 0 0       0 return wantarray ? %$pos2data : $pos2data;
2033             }
2034             else {
2035             # return data converted to relative positions
2036 4 50       10 unless (defined $args{coordinate}) {
2037 4         18 $args{coordinate} = $self->calculate_reference(\%args);
2038             }
2039             return $self->_convert_to_relative_positions($pos2data,
2040 4         14 $args{coordinate}, $params[STR]);
2041             }
2042             }
2043              
2044             sub _get_subfeature_position_scores {
2045 0     0   0 my ($self, $args, $ddb) = @_;
2046            
2047             # get the subfeatures
2048 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
2049 0 0       0 unless (@$subfeatures) {
2050 0         0 carp "no subfeatures available! Returning parent score data!";
2051             # just return the parent
2052 0         0 undef $args->{subfeature};
2053 0 0       0 delete $args->{exon} if exists $args->{exon};
2054 0         0 return $self->get_sequence(@$args);
2055             }
2056            
2057             # reset the practical start and stop to the actual subfeatures' final start and stop
2058             # we can no longer rely on the feature start and stop, consider CDS
2059             # these subfeatures should already be genomic sorted by GeneTools
2060 0         0 my $practical_start = $subfeatures->[0]->start;
2061 0         0 my $practical_stop = $subfeatures->[-1]->end;
2062            
2063             # collect over each exon
2064             # we will adjust the positions of each reported score so that
2065             # it will appear as if all the exons are adjacent to each other
2066             # and no introns exist
2067 0         0 my $pos2data = {};
2068 0         0 my $namecheck = {}; # to check unique names when using ncount method....
2069 0         0 my $current_end = $practical_start;
2070 0         0 my $adjustment = 0;
2071 0 0       0 my $fstrand = defined $args->{strand} ? $args->{strand} : $self->strand;
2072 0         0 foreach my $exon (@$subfeatures) {
2073            
2074 0         0 my @params; # parameters to pass on to adapter
2075 0         0 $params[CHR] = $exon->seq_id;
2076 0         0 $params[STRT] = $exon->start;
2077 0         0 $params[STOP] = $exon->end;
2078 0         0 $params[STR] = $fstrand;
2079 0         0 $params[STND] = $args->{strandedness};
2080 0         0 $params[METH] = $args->{method};
2081 0         0 $params[RETT] = 2; # return type should be a hash reference of positioned scores
2082 0         0 $params[DB] = $ddb;
2083 0         0 $params[DATA] = $args->{dataset};
2084            
2085             # collect scores
2086 0         0 my $exon_scores = get_segment_score(@params);
2087            
2088             # adjust the scores
2089 0         0 $adjustment = $params[STRT] - $current_end;
2090             $self->_process_exon_scores($exon_scores, $pos2data, $adjustment, $params[STRT],
2091 0         0 $params[STOP], $namecheck, $args->{method});
2092            
2093             # reset
2094 0         0 $current_end += $exon->length;
2095             }
2096            
2097             # collect extensions if requested
2098 0 0       0 if ($args->{extend}) {
2099             # left side
2100 0         0 my @params; # parameters to pass on to adapter
2101 0         0 $params[CHR] = $self->seq_id;
2102 0         0 $params[STRT] = $practical_start - $args->{extend};
2103 0         0 $params[STOP] = $practical_start - 1;
2104 0         0 $params[STR] = $fstrand;
2105 0         0 $params[STND] = $args->{strandedness};
2106 0         0 $params[METH] = $args->{method};
2107 0         0 $params[RETT] = 2; # return type should be a hash reference of positioned scores
2108 0         0 $params[DB] = $ddb;
2109 0         0 $params[DATA] = $args->{dataset};
2110            
2111 0         0 my $ext_scores = get_segment_score(@params);
2112              
2113             # no adjustment should be needed
2114             $self->_process_exon_scores($ext_scores, $pos2data, 0, $params[STRT],
2115 0         0 $params[STOP], $namecheck, $args->{method});
2116            
2117            
2118             # right side
2119             # we can reuse our parameter array
2120 0         0 $params[STRT] = $practical_stop + 1;
2121 0         0 $params[STOP] = $practical_stop + $args->{extend};
2122 0         0 $ext_scores = get_segment_score(@params);
2123              
2124             # the adjustment should be the same as the last exon
2125             $self->_process_exon_scores($ext_scores, $pos2data, $adjustment, $params[STRT],
2126 0         0 $params[STOP], $namecheck, $args->{method});
2127             }
2128            
2129             # covert to relative positions
2130 0 0       0 if ($args->{absolute}) {
2131             # do not convert to relative positions
2132 0 0       0 return wantarray ? %$pos2data : $pos2data;
2133             }
2134             else {
2135             # return data converted to relative positions
2136             # can no longer use original coordinates, but instead the new shifted coordinates
2137 0         0 $args->{practical_start} = $practical_start;
2138 0         0 $args->{practical_stop} = $current_end;
2139 0         0 my $reference = $self->calculate_reference($args);
2140 0         0 return $self->_convert_to_relative_positions($pos2data,
2141             $reference, $fstrand);
2142             }
2143             }
2144              
2145             sub calculate_reference {
2146             # my ($self, $args) = @_;
2147             # I need position
2148             # optionally alternate start, stop, and/or strand
2149 10     10 1 20 my $self = shift;
2150 10         15 my $args;
2151 10 50       21 if (scalar @_ == 1) {
2152             # single variable passed
2153 10 100       26 if (ref($_[0]) eq 'HASH') {
2154 6         11 $args = shift @_;
2155             }
2156             else {
2157             # assumed to be the position
2158 4         9 $args = {
2159             position => $_[0]
2160             };
2161             }
2162             }
2163             else {
2164             # multiple variables
2165 0         0 $args = { @_ };
2166             }
2167            
2168             # calculate reference coordinate
2169 10         13 my $coordinate;
2170 10 50       34 my $strand = defined $args->{strand} ? $args->{strand} : $self->strand;
2171 10 100       35 if ($args->{position} == 5) {
    100          
    100          
    50          
2172 6 100       16 if ($strand >= 0) {
2173 1   33     6 $coordinate = $args->{practical_start} || $self->start;
2174             }
2175             else {
2176 5   33     28 $coordinate = $args->{practical_stop} || $self->end;
2177             }
2178             }
2179             elsif ($args->{position} == 3) {
2180 1 50       4 if ($strand >= 0) {
2181 1   33     5 $coordinate = $args->{practical_stop} || $self->end;
2182             }
2183             else {
2184 0   0     0 $coordinate = $args->{practical_start} || $self->start;
2185             }
2186             }
2187             elsif ($args->{position} == 4) {
2188             # strand doesn't matter here
2189             # but practical coordinates do
2190 2 100       5 if (exists $args->{practical_start}) {
2191             $coordinate = int(
2192 1         3 ( ($args->{practical_start} + $args->{practical_stop}) / 2 ) + 0.5
2193             );
2194             }
2195             else {
2196 1         53 $coordinate = $self->midpoint;
2197             }
2198             }
2199             elsif ($args->{position} == 9) {
2200             # narrowPeak coordinate
2201 1         3 $coordinate = $self->peak;
2202             }
2203             else {
2204 0         0 confess "position must be one of 5, 3, 4, or 9";
2205             }
2206 10         30 return $coordinate;
2207             }
2208              
2209             sub _avoid_positions {
2210 0     0   0 my ($self, $pos2data, $args, $seqid, $start, $stop) = @_;
2211            
2212             # first check the list of avoid types
2213 0 0       0 if (ref $args->{avoid} eq 'ARRAY') {
    0          
    0          
2214             # we have types, presume they're ok
2215             }
2216             elsif ($args->{avoid} eq '1') {
2217             # old style boolean value
2218 0 0       0 if (defined $args->{type}) {
2219 0         0 $args->{avoid} = [ $args->{type} ];
2220             }
2221             else {
2222             # no type provided, we can't avoid that which is not defined!
2223             # this is an error, but won't complain as we never did before
2224 0         0 $args->{avoid} = $self->type;
2225             }
2226             }
2227             elsif ($args->{avoid} =~ /w+/i) {
2228             # someone passed a string, a feature type perhaps?
2229 0         0 $args->{avoid} = [ $args->{avoid} ];
2230             }
2231            
2232             ### Check for conflicting features
2233 0   0     0 my $db = $args->{db} || $self->{data}->open_meta_database;
2234             my @overlap_features = $self->get_features(
2235             seq_id => $seqid,
2236             start => $start,
2237             end => $stop,
2238             type => $args->{avoid},
2239 0         0 );
2240            
2241             # get the overlapping features of the same type
2242 0 0       0 if (@overlap_features) {
2243 0         0 my $primary = $self->primary_id;
2244             # there are one or more feature of the type in this region
2245             # one of them is likely the one we're working with
2246             # but not necessarily - user may be looking outside original feature
2247             # the others are not what we want and therefore need to be
2248             # avoided
2249 0         0 foreach my $feat (@overlap_features) {
2250             # skip the one we want
2251 0 0       0 next if ($feat->primary_id eq $primary);
2252             # now eliminate those scores which overlap this feature
2253 0         0 my $start = $feat->start;
2254 0         0 my $stop = $feat->end;
2255 0         0 foreach my $position (keys %$pos2data) {
2256             # delete the scored position if it overlaps with
2257             # the offending feature
2258 0 0 0     0 if (
2259             $position >= $start and
2260             $position <= $stop
2261             ) {
2262 0         0 delete $pos2data->{$position};
2263             }
2264             }
2265             }
2266             }
2267             }
2268              
2269             sub _convert_to_relative_positions {
2270 5     5   13 my ($self, $pos2data, $position, $strand) = @_;
2271            
2272 5         7 my %relative_pos2data;
2273 5 50       16 if ($strand >= 0) {
    50          
2274 0         0 foreach my $p (keys %$pos2data) {
2275 0         0 $relative_pos2data{ $p - $position } = $pos2data->{$p};
2276             }
2277             }
2278             elsif ($strand < 0) {
2279 5         26 foreach my $p (keys %$pos2data) {
2280 223         329 $relative_pos2data{ $position - $p } = $pos2data->{$p};
2281             }
2282             }
2283 5 50       131 return wantarray ? %relative_pos2data : \%relative_pos2data;
2284             }
2285              
2286             sub _process_exon_scores {
2287 0     0     my ($self, $exon_scores, $pos2data, $adjustment, $start, $end,
2288             $namecheck, $method) = @_;
2289            
2290             # ncount method
2291 0 0         if ($method eq 'ncount') {
2292             # we need to check both names and adjust position
2293 0           foreach my $p (keys %$exon_scores) {
2294 0 0 0       next unless ($p >= $start and $p <= $end);
2295 0           foreach my $n (@{ $exon_scores->{$p} }) {
  0            
2296 0 0         if (exists $namecheck->{$n} ) {
2297 0           $namecheck->{$n}++;
2298 0           next;
2299             }
2300             else {
2301 0           $namecheck->{$n} = 1;
2302 0           my $a = $p - $adjustment;
2303 0   0       $pos2data->{$a} ||= [];
2304 0           push @{ $pos2data->{$a} }, $n;
  0            
2305             }
2306             }
2307             }
2308             }
2309             else {
2310             # just adjust scores
2311 0           foreach my $p (keys %$exon_scores) {
2312 0 0 0       next unless ($p >= $start and $p <= $end);
2313 0           $pos2data->{ $p - $adjustment } = $exon_scores->{$p};
2314             }
2315             }
2316             }
2317              
2318             sub fetch_alignments {
2319 0     0 1   my $self = shift;
2320 0           my %args = @_;
2321 0   0       $args{db} ||= $args{dataset} || undef;
      0        
2322 0   0       $args{data} ||= undef;
2323 0   0       $args{callback} ||= undef;
2324 0   0       $args{subfeature} ||= q();
2325            
2326             # verify - trusting that these are valid, else they will fail lower down in the code
2327 0 0         unless ($args{db}) {
2328 0           croak "must provide a Bam object database to fetch alignments!\n";
2329             }
2330 0 0 0       unless ($args{data} and ref($args{data}) eq 'HASH') {
2331 0           croak "must provide a data HASH for the fetch callback!\n";
2332             }
2333 0 0         unless ($args{callback}) {
2334 0           croak "must provide a callback code reference!\n";
2335             }
2336            
2337             # array of features to iterate, probably just one or subfeatures
2338 0           my @intervals;
2339 0 0 0       if ($self->feature_type eq 'named' and $args{subfeature}) {
2340             # we have subfeatures to iterate over
2341              
2342             # get the subfeatures
2343 0           my $subfeatures = $self->_get_subfeatures($args{subfeature});
2344 0 0         if (@$subfeatures) {
2345 0           foreach my $sf (@$subfeatures) {
2346 0           push @intervals, [
2347             $sf->start - 1,
2348             $sf->end
2349             ];
2350             }
2351             }
2352             else {
2353             # zere subfeatures? just take the parent then
2354             push @intervals, [
2355             ($args{start} || $self->start) - 1,
2356 0   0       $args{stop} || $args{end} || $self->end
      0        
2357             ];
2358             }
2359             }
2360             else {
2361             # take feature as is
2362             push @intervals, [
2363             ($args{start} || $self->start) - 1,
2364 0   0       $args{stop} || $args{end} || $self->end
      0        
2365             ];
2366             }
2367              
2368             # get the target id for the chromosome
2369             # this will fail if the user didn't provide a real bam object!!!
2370 0           my ($tid, undef, undef) = $args{db}->header->parse_region($self->seq_id);
2371 0 0         return unless defined $tid;
2372            
2373             # now iterate over the intervals
2374 0           foreach my $i (@intervals) {
2375 0           $args{data}->{start} = $i->[0];
2376 0           $args{data}->{end} = $i->[1];
2377 0           $args{data}->{strand} = $self->strand;
2378             low_level_bam_fetch(
2379             $args{db},
2380             $tid,
2381             $i->[0],
2382             $i->[1],
2383             $args{callback},
2384             $args{data}
2385 0           );
2386             }
2387            
2388             # nothing to return since we're using a data reference
2389 0           return 1;
2390             }
2391              
2392              
2393              
2394             ### String export
2395              
2396             sub bed_string {
2397 0     0 1   my $self = shift;
2398 0           my %args = @_;
2399 0   0       $args{bed} ||= 6; # number of bed columns
2400 0 0         croak "bed count must be an integer!" unless $args{bed} =~ /^\d+$/;
2401 0 0         croak "bed count must be at least 3!" unless $args{bed} >= 3;
2402            
2403             # coordinate information
2404 0           $self->seqfeature; # retrieve the seqfeature object first
2405 0   0       my $chr = $args{chromo} || $args{seq_id} || $self->seq_id;
2406 0   0       my $start = $args{start} || $self->start;
2407 0   0       my $stop = $args{stop} || $args{end} || $self->stop ||
2408             $start + $self->length - 1 || $start;
2409 0 0 0       if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
      0        
      0        
2410 0           carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2411 0           return;
2412             }
2413 0 0         if ($start > $stop) {
2414             # reversed coordinates? old school way of setting reverse strand
2415 0           my $s = $start;
2416 0           $start = $stop;
2417 0           $stop = $s;
2418 0           $args{strand} = -1; # this will override any user provided data?
2419             }
2420 0           $start -= 1; # 0-based coordinates
2421 0           my $string = "$chr\t$start\t$stop";
2422            
2423             # additional information
2424 0 0         if ($args{bed} >= 4) {
2425 0   0       my $name = $args{name} || $self->name || 'Feature_' . $self->line_number;
2426 0           $string .= "\t$name";
2427             }
2428 0 0         if ($args{bed} >= 5) {
2429 0 0         my $score = exists $args{score} ? $args{score} : $self->score;
2430 0 0         $score = 1 unless defined $score;
2431 0           $string .= "\t$score";
2432             }
2433 0 0         if ($args{bed} >= 6) {
2434 0           my $s;
2435 0 0 0       if (exists $args{strand} and defined $args{strand}) {
2436 0           $s = $self->_strand($args{strand});
2437             }
2438             else {
2439 0           $s = $self->strand;
2440             }
2441 0 0         $string .= sprintf("\t%s", $s == -1 ? '-' : '+')
2442             }
2443             # we could go on with other columns, but there's no guarantee that additional
2444             # information is available, and we would have to implement user provided data
2445            
2446             # done
2447 0           return $string;
2448             }
2449              
2450             sub gff_string {
2451 0     0 1   my $self = shift;
2452 0           my %args = @_;
2453            
2454             # coordinate information
2455 0           $self->seqfeature; # retrieve the seqfeature object first
2456 0   0       my $chr = $args{chromo} || $args{seq_id} || $self->seq_id;
2457 0   0       my $start = $args{start} || $self->start;
2458 0   0       my $stop = $args{stop} || $args{end} || $self->stop ||
2459             $start + $self->length - 1 || $start;
2460 0 0 0       if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
      0        
      0        
2461 0           carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2462 0           return;
2463             }
2464 0 0         if ($start > $stop) {
2465             # reversed coordinates? old school way of setting reverse strand
2466 0           my $s = $start;
2467 0           $start = $stop;
2468 0           $stop = $s;
2469 0           $args{strand} = -1; # this will override any user provided data?
2470             }
2471 0           my $strand;
2472 0 0 0       if (exists $args{strand} and defined $args{strand}) {
2473 0           $strand = $self->_strand($args{strand});
2474             }
2475             else {
2476 0           $strand = $self->strand;
2477             }
2478 0 0         $strand = $strand == -1 ? '-' : $strand == 1 ? '+' : '.';
    0          
2479            
2480             # type information
2481 0   0       my $type = $args{type} || $self->type || undef;
2482 0           my ($source, $primary_tag);
2483 0 0 0       if (defined $type and $type =~ /:/) {
2484 0           ($primary_tag, $source) = split /:/, $type;
2485             }
2486 0 0         unless ($source) {
2487 0   0       $source = $args{source} || '.';
2488             }
2489 0 0         unless ($primary_tag) {
2490 0 0 0       $primary_tag = $args{primary_tag} || defined $type ? $type : '.';
2491             }
2492            
2493             # score
2494 0 0         my $score = exists $args{score} ? $args{score} : $self->score;
2495 0 0         $score = '.' unless defined $score;
2496 0           my $phase = '.'; # do not even bother!!!!
2497            
2498             # attributes
2499 0   0       my $name = $args{name} || $self->name || 'Feature_' . $self->line_number;
2500 0           my $attributes = "Name=$name";
2501 0   0       my $id = $args{id} || sprintf("%08d", $self->line_number);
2502 0           $attributes .= ";ID=$id";
2503 0 0 0       if (exists $args{attributes} and ref($args{attributes}) eq 'ARRAY') {
2504 0           foreach my $i (@{$args{attributes}}) {
  0            
2505 0           my $k = $self->{data}->name($i);
2506 0           $k =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0            
2507 0           my $v = $self->value($i);
2508 0           $v =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0            
2509 0           $attributes .= ";$k=$v";
2510             }
2511             }
2512            
2513             # done
2514 0           my $string = join("\t", $chr, $source, $primary_tag, $start, $stop, $score,
2515             $strand, $phase, $attributes);
2516 0           return $string;
2517             }
2518              
2519              
2520             __END__