File Coverage

blib/lib/Bio/ToolBox/Data/Feature.pm
Criterion Covered Total %
statement 300 756 39.6
branch 139 490 28.3
condition 51 310 16.4
subroutine 31 51 60.7
pod 34 36 94.4
total 555 1643 33.7


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::Feature;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::Feature - Objects representing rows in a data table
7              
8             =head1 DESCRIPTION
9              
10             A Bio::ToolBox::Data::Feature is an object representing a row in the
11             data table. Usually, this in turn represents an annotated feature or
12             segment in the genome. As such, this object provides convenient
13             methods for accessing and manipulating the values in a row, as well as
14             methods for working with the represented genomic feature.
15              
16             This class should NOT be used directly by the user. Rather, Feature
17             objects are generated from a Bio::ToolBox::Data::Iterator object
18             (generated itself from the L
19             function in Bio::ToolBox::Data), or the L
20             function in Bio::ToolBox::Data. Please see the respective documentation
21             for more information.
22              
23             Example of working with a stream object.
24              
25             my $Data = Bio::ToolBox::Data->new(file => $file);
26            
27             # stream method
28             my $stream = $Data->row_stream;
29             while (my $row = $stream->next_row) {
30             # each $row is a Bio::ToolBox::Data::Feature object
31             # representing the row in the data table
32             my $value = $row->value($index);
33             # do something with $value
34             }
35            
36             # iterate method
37             $Data->iterate( sub {
38             my $row = shift;
39             my $number = $row->value($index);
40             my $log_number = log($number);
41             $row->value($index, $log_number);
42             } );
43              
44              
45             =head1 METHODS
46              
47             =head2 General information methods
48              
49             =over 4
50              
51             =item row_index
52              
53             Returns the index position of the current data row within the
54             data table. Useful for knowing where you are at within the data
55             table.
56              
57             =item feature_type
58              
59             Returns one of three specific values describing the contents
60             of the data table inferred by the presence of specific column
61             names. This provides a clue as to whether the table features
62             represent genomic regions (defined by coordinate positions) or
63             named database features. The return values include:
64              
65             =over 4
66              
67             =item * coordinate: Table includes at least chromosome and start
68              
69             =item * named: Table includes name, type, and/or Primary_ID
70              
71             =item * unknown: unrecognized
72              
73             =back
74              
75             =item column_name
76              
77             Returns the column name for the given index.
78              
79             item data
80              
81             Returns the parent L object, in case you may
82             have lost it by going out of scope.
83              
84             =back
85              
86             =head2 Methods to access row feature attributes
87              
88             These methods return the corresponding value, if present in the
89             data table, based on the column header name. If the row represents
90             a named database object, try calling the L method first.
91             This will retrieve the database SeqFeature object, and the attributes
92             can then be retrieved using the methods below or on the actual
93             database SeqFeature object.
94              
95             In cases where there is a table column and a corresponding SeqFeature
96             object, for example a start column and a parsed SeqFeature object, the
97             table value takes precedence and is returned. You can always obtain the
98             SeqFeature's value separately and directly.
99              
100             These methods do not set attribute values. If you need to change the
101             values in a table, use the L method below.
102              
103             =over 4
104              
105             =item seq_id
106              
107             The name of the chromosome the feature is on.
108              
109             =item start
110              
111             =item end
112              
113             =item stop
114              
115             The coordinates of the feature or segment. Coordinates from known
116             0-based file formats, e.g. BED, are returned as 1-based. Coordinates
117             must be integers to be returned. Zero or negative start coordinates
118             are assumed to be accidents or poor programming and transformed to 1.
119             Use the L method if you don't want this to happen.
120              
121             =item strand
122              
123             The strand of the feature or segment. Returns -1, 0, or 1. Default is 0,
124             or unstranded.
125              
126             =item midpoint
127              
128             The calculated midpoint position of the feature.
129              
130             =item peak
131              
132             For features in a C file, this will report the peak coordinate,
133             transformed into a genomic coordinate.
134              
135             =item name
136              
137             =item display_name
138              
139             The name of the feature.
140              
141             =item coordinate
142              
143             Returns a coordinate string formatted as C. This uses the
144             start coordinate as listed in the source file and does not convert 0-based
145             start values to 1-based values. This may confound downstream applications.
146              
147             =item type
148              
149             The type of feature. Typically either C or C.
150             In a GFF3 file, this represents columns 3 and 2, respectively. In annotation
151             databases such as L, the type is used to restrict
152             to one of many different types of features, e.g. gene, mRNA, or exon.
153              
154             =item id
155              
156             =item primary_id
157              
158             Here, this represents the C in the database. Note that this number
159             is generally unique to a specific database, and not portable between databases.
160              
161             =item length
162              
163             The length of the feature or segment.
164              
165             =item score
166              
167             Returns the value of the Score column, if one is available. Typically
168             associated with defined file formats, such as GFF files (6th column),
169             BED and related Peak files (5th column), and bedGraph (4th column).
170              
171             =back
172              
173             =head2 Accessing and setting values in the row.
174              
175             =over 4
176              
177             =item value
178              
179             # retrieve a value
180             my $v = $row->value($index);
181             # set a value
182             $row->value($index, $v + 1);
183              
184             Returns or sets the value at a specific column index in the
185             current data row. Null values return a '.', symbolizing an
186             internal null value.
187              
188             =item row_values
189              
190             Returns an array or array reference representing all the values
191             in the current data row.
192              
193             =back
194              
195             =head2 Special feature attributes
196              
197             GFF and VCF files have special attributes in the form of key =E value pairs.
198             These are stored as specially formatted, character-delimited lists in
199             certain columns. These methods will parse this information and return as
200             a convenient hash reference. The keys and values of this hash may be
201             changed, deleted, or added to as desired. To write the changes back to
202             the file, use the L to properly write the attributes
203             back to the file with the proper formatting.
204              
205             =over 4
206              
207             =item attributes
208              
209             Generic method that calls either L or L
210             depending on the data table format.
211              
212             =item gff_attributes
213              
214             Parses the 9th column of GFF files. URL-escaped characters are converted
215             back to text. Returns a hash reference of key =E value pairs.
216              
217             =item vcf_attributes
218              
219             Parses the C (8th column) and all sample columns (10th and higher
220             columns) in a version 4 VCF file. The Sample columns use the C
221             column (9th column) as keys. The returned hash reference has two levels:
222             The first level keys are both the column names and index (0-based). The
223             second level keys are the individual attribute keys to each value.
224             For example:
225              
226             my $attr = $row->vcf_attributes;
227            
228             # access by column name
229             my $genotype = $attr->{sample1}{GT};
230             my $depth = $attr->{INFO}{ADP};
231            
232             # access by 0-based column index
233             my $genotype = $attr->{9}{GT};
234             my $depth = $attr->{7}{ADP}
235              
236             =item rewrite_attributes
237              
238             Generic method that either calls L or
239             L depending on the data table format.
240              
241             =item rewrite_gff_attributes
242              
243             Rewrites the GFF attributes column (the 9th column) based on the
244             contents of the attributes hash that was previously generated with
245             the L method. Useful when you have modified the
246             contents of the attributes hash.
247              
248             =item rewrite_vcf_attributes
249              
250             Rewrite the VCF attributes for the C (8th column), C (9th
251             column), and sample columns (10th and higher columns) based on the
252             contents of the attributes hash that was previously generated with
253             the L method. Useful when you have modified the
254             contents of the attributes hash.
255              
256             =back
257              
258             =head2 Convenience Methods to database functions
259              
260             The next three functions are convenience methods for using the
261             attributes in the current data row to interact with databases.
262             They are wrappers to methods in the L
263             module.
264              
265             =over 4
266              
267             =item seqfeature
268              
269             =item feature
270              
271             Returns a SeqFeature object representing the feature or item in
272             the current row. If the SeqFeature object is stored in the parent
273             C<$Data> object (usually from parsing an annotation file), it is
274             immediately returned. Otherwise, the SeqFeature
275             object is retrieved from the database using the name and
276             type values in the current Data table row. The SeqFeature object
277             is requested from the database named in the general metadata. If
278             an alternate database is desired, you should change it first using
279             the C<$Data>-Edatabase() method. If the feature name or type is not
280             present in the table, then nothing is returned.
281              
282             See L and L for more
283             information about working with these objects. See L
284             about working with database features.
285              
286             This method normally only works with "named" feature types in a
287             L Data table. If your Data table has coordinate
288             information, i.e. chromosome, start, and stop columns, then it will
289             likely be recognized as a "coordinate" feature_type and not work.
290              
291             Pass a true value to this method to force the seqfeature lookup. This
292             will still require the presence of Name, ID, and/or Type columns to
293             perform the database lookup. The L method feature()
294             is used to determine the type if a Type column is not present.
295              
296             =item segment
297              
298             Returns a database Segment object corresponding to the coordinates
299             defined in the Data table row. If a named feature and type are
300             present instead of coordinates, then the feature is first automatically
301             retrieved and a Segment returned based on its coordinates. The
302             database named in the general metadata is used to establish the
303             Segment object. If a different database is desired, it should be
304             changed first using the general L method.
305              
306             See L and L for more information
307             about working with Segment objects.
308              
309             =item get_features
310              
311             my @overlap_features = $row->get_features(type => $type);
312              
313             Returns seqfeature objects from a database that overlap the Feature
314             or interval in the current Data table row. This is essentially a
315             convenience wrapper for a Bio::DB style I method using the
316             coordinates of the Feature. Optionally pass an array of key value pairs
317             to specify alternate coordinates if so desired. Potential keys
318             include
319              
320             =over 4
321              
322             =item seq_id
323              
324             =item start
325              
326             =item end
327              
328             =item type
329              
330             The type of database features to retrieve.
331              
332             =item db
333              
334             An alternate database object to collect from.
335              
336             =back
337              
338             =item get_sequence
339              
340             Fetches genomic sequence based on the coordinates of the current seqfeature
341             or interval in the current Feature. This requires a database that
342             contains the genomic sequence, either the database specified in the
343             Data table metadata or an external indexed genomic fasta file.
344              
345             If the Feature represents a transcript or gene, then a concatenated
346             sequence of the selected subfeatures may be generated and returned. B
347             that redundant or overlapping subfeatures are B merged, and
348             unexpected results may be obtained.
349              
350             The sequence is returned as simple string. If the feature is on the reverse
351             strand, then the reverse complement sequence is automatically returned.
352              
353             Pass an array of key value pairs to specify alternate coordinates if so
354             desired. Potential keys include
355              
356             =over 4
357              
358             =item subfeature
359              
360             Pass a text string representing the type of subfeature from which to collect
361             the sequence. Acceptable values include
362              
363             =over 4
364              
365             =item * exon
366              
367             =item * cds
368              
369             =item * 5p_utr
370              
371             =item * 3p_utr
372              
373             =item * intron
374              
375             =back
376              
377             =item seq_id
378              
379             =item start
380              
381             =item end
382              
383             =item strand
384              
385             =item extend
386              
387             Indicate additional basepairs of sequence added to both sides
388              
389             =item db
390              
391             The fasta file or database from which to fetch the sequence
392              
393             =back
394              
395             =back
396              
397             =head2 Data collection
398              
399             The following methods allow for data collection from various
400             sources, including bam, bigwig, bigbed, useq, Bio::DB databases, etc.
401              
402             =over 4
403              
404             =item get_score
405              
406             my $score = $row->get_score(
407             dataset => 'scores.bw',
408             method => 'max',
409             );
410              
411             This method collects a single score over the feature or interval.
412             Usually a mathematical or statistical value is employed to derive the
413             single score. Pass an array of key value pairs to control data collection.
414             Keys include the following:
415              
416             =over 4
417              
418             =item db
419              
420             =item ddb
421              
422             Specify a Bio::DB database from which to collect the data. The default
423             value is the database specified in the Data table metadata, if present.
424             Examples include a L or L
425             database.
426              
427             =item dataset
428              
429             Specify the name of the dataset. If a database was specified, then this
430             value would be the C or C feature found in the
431             database. Otherwise, the name of a data file, such as a bam, bigWig,
432             bigBed, or USeq file, would be provided here. This options is required!
433              
434             =item method
435              
436             Specify the mathematical or statistical method combining multiple scores
437             over the interval into one value. Options include the following:
438              
439             =over 4
440              
441             =item * mean
442              
443             =item * sum
444              
445             =item * min
446              
447             =item * max
448              
449             =item * median
450              
451             =item * count
452              
453             Count all overlapping items.
454              
455             =item * pcount
456              
457             Precisely count only containing (not overlapping) items.
458              
459             =item * ncount
460              
461             Count overlapping unique names only.
462              
463             =item * range
464              
465             The difference between minimum and maximum values.
466              
467             =item * stddev
468              
469             Standard deviation.
470              
471             =back
472              
473             =item strandedness
474              
475             Specify what strand from which the data should be taken, with respect
476             to the Feature strand. Three options are available. Only really relevant
477             for data sources that support strand.
478              
479             =over 4
480              
481             =item * sense
482              
483             The same strand as the Feature.
484              
485             =item * antisense
486              
487             The opposite strand as the Feature.
488              
489             =item * all
490              
491             Strand is ignored, all is taken (default).
492              
493             =back
494              
495             =item subfeature
496              
497             Specify the subfeature type from which to collect the scores. Typically
498             a SeqFeature object representing a transcript is provided, and the
499             indicated subfeatures are collected from object. Pass the name of the
500             subfeature to use. Accepted values include the following.
501              
502             =over 4
503              
504             =item * exon
505              
506             =item * cds
507              
508             =item * 5p_utr
509              
510             =item * 3p_utr
511              
512             =item * intron
513              
514             =back
515              
516             =item extend
517              
518             Specify the number of basepairs that the Data table Feature's
519             coordinates should be extended in both directions. Ignored
520             when used with the subfeature option.
521              
522             =item seq_id
523              
524             =item chromo
525              
526             =item start
527              
528             =item end
529              
530             =item stop
531              
532             =item strand
533              
534             Optionally specify zero or more alternate coordinates to use.
535             By default, these are obtained from the Data table Feature.
536              
537             =back
538            
539             =item get_relative_point_position_scores
540              
541             while (my $row = $stream->next_row) {
542             my $pos2score = $row->get_relative_point_position_scores(
543             'ddb' => '/path/to/BigWigSet/',
544             'dataset' => 'MyData',
545             'position' => 5,
546             'extend' => 1000,
547             );
548             }
549              
550             This method collects indexed position scores centered around a
551             specific reference point. The returned data is a hash of
552             relative positions (example -20, -10, 1, 10, 20) and their score
553             values. Pass an array of key value pairs to control data collection.
554             Keys include the following:
555              
556             =over 4
557              
558             =item db
559              
560             =item ddb
561              
562             Specify a Bio::DB database from which to collect the data. The default
563             value is the database specified in the Data table metadata, if present.
564             Examples include a L or L
565             database.
566              
567             =item dataset
568              
569             Specify the name of the dataset. If a database was specified, then this
570             value would be the C or C feature found in the
571             database. Otherwise, the name of a data file, such as a bam, bigWig,
572             bigBed, or USeq file, would be provided here. This options is required!
573              
574             =item position
575              
576             Indicate the position of the reference point relative to the Data table
577             Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
578             midpoint (get it? it's between 5 and 3). Default is 5.
579              
580             =item extend
581              
582             Indicate the number of base pairs to extend from the reference coordinate.
583             This option is required!
584              
585             =item coordinate
586              
587             Optionally provide the real chromosomal coordinate as the reference point.
588              
589             =item absolute
590              
591             Boolean option to indicate that the returned hash of positions and scores
592             should not be transformed into relative positions but kept as absolute
593             chromosomal coordinates.
594              
595             =item avoid
596              
597             Provide a C or C database feature type to avoid overlapping
598             scores. Each found score is checked for overlapping features and is
599             discarded if found to do so. The database should be set to use this.
600              
601             =item strandedness
602              
603             Specify what strand from which the data should be taken, with respect
604             to the Feature strand. Three options are available. Only really relevant
605             for data sources that support strand.
606              
607             =over 4
608              
609             =item * sense
610              
611             The same strand as the Feature.
612              
613             =item * antisense
614              
615             The opposite strand as the Feature.
616              
617             =item * all
618              
619             Strand is ignored, all is taken (default).
620              
621             =back
622              
623             =item method
624              
625             Only required when counting objects.
626              
627             =over 4
628              
629             =item * count
630              
631             Count all overlapping items.
632              
633             =item * pcount
634              
635             Precisely count only containing (not overlapping) items.
636              
637             =item * ncount
638              
639             Count overlapping unique names only.
640              
641             =back
642              
643             =back
644              
645             =item get_region_position_scores
646              
647             while (my $row = $stream->next_row) {
648             my $pos2score = $row->get_relative_point_position_scores(
649             'ddb' => '/path/to/BigWigSet/',
650             'dataset' => 'MyData',
651             'position' => 5,
652             'extend' => 1000,
653             );
654             }
655              
656             This method collects indexed position scores across a defined
657             region or interval. The returned data is a hash of positions and
658             their score values. The positions are by default relative to a
659             region coordinate, usually to the 5' end. Pass an array of key value
660             pairs to control data collection. Keys include the following:
661              
662             =over 4
663              
664             =item db
665              
666             =item ddb
667              
668             Specify a Bio::DB database from which to collect the data. The default
669             value is the database specified in the Data table metadata, if present.
670             Examples include a L or L
671             database.
672              
673             =item dataset
674              
675             Specify the name of the dataset. If a database was specified, then this
676             value would be the C or C feature found in the
677             database. Otherwise, the name of a data file, such as a bam, bigWig,
678             bigBed, or USeq file, would be provided here. This options is required!
679              
680             =item subfeature
681              
682             Specify the subfeature type from which to collect the scores. Typically
683             a SeqFeature object representing a transcript is provided, and the
684             indicated subfeatures are collected from object. When converting to
685             relative coordinates, the coordinates will be relative to the length of
686             the sum of the subfeatures, i.e. the length of the introns will be ignored.
687              
688             Pass the name of the subfeature to use. Accepted values include the following.
689              
690             =over 4
691              
692             =item * exon
693              
694             =item * cds
695              
696             =item * 5p_utr
697              
698             =item * 3p_utr
699              
700             =item * intron
701              
702             =back
703              
704             =item extend
705              
706             Specify the number of basepairs that the Data table Feature's
707             coordinates should be extended in both directions.
708              
709             =item seq_id
710              
711             =item chromo
712              
713             =item start
714              
715             =item end
716              
717             =item stop
718              
719             =item strand
720              
721             Optionally specify zero or more alternate coordinates to use.
722             By default, these are obtained from the Data table Feature.
723              
724             =item position
725              
726             Indicate the position of the reference point relative to the Data table
727             Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
728             midpoint (get it? it's between 5 and 3). Default is 5.
729              
730             =item coordinate
731              
732             Optionally provide the real chromosomal coordinate as the reference point.
733              
734             =item absolute
735              
736             Boolean option to indicate that the returned hash of positions and scores
737             should not be transformed into relative positions but kept as absolute
738             chromosomal coordinates.
739              
740             =item avoid
741              
742             Provide a C or C database feature type to avoid overlapping
743             scores. Each found score is checked for overlapping features and is
744             discarded if found to do so. The database should be set to use this.
745              
746             =item strandedness
747              
748             Specify what strand from which the data should be taken, with respect
749             to the Feature strand. Three options are available. Only really relevant
750             for data sources that support strand.
751              
752             =over 4
753              
754             =item * sense
755              
756             The same strand as the Feature.
757              
758             =item * antisense
759              
760             The opposite strand as the Feature.
761              
762             =item * all
763              
764             Strand is ignored, all is taken (default).
765              
766             =back
767              
768             =item method
769              
770             Only required when counting objects.
771              
772             =over 4
773              
774             =item * count
775              
776             Count all overlapping items.
777              
778             =item * pcount
779              
780             Precisely count only containing (not overlapping) items.
781              
782             =item * ncount
783              
784             Count overlapping unique names only.
785              
786             =back
787              
788             =back
789              
790             =item fetch_alignments
791              
792             my $sam = $Data->open_database('/path/to/file.bam');
793             my $alignment_data = { mapq => [] };
794             my $callback = sub {
795             my ($a, $data) = @_;
796             push @{ $data->{mapq} }, $a->qual;
797             };
798             while (my $row = $stream->next_row) {
799             $row->fetch_alignments(
800             'db' => $sam,
801             'data' => $alignment_data,
802             'callback' => $callback,
803             );
804             }
805              
806             This function allows you to iterate over alignments in a Bam file,
807             allowing custom information to be collected based on a callback
808             code reference that is provided.
809              
810             Three parameters are required: C, C, and C. A
811             true value (1) is returned upon success.
812              
813             =over 4
814              
815             =item db
816              
817             Provide an opened, high-level, Bam database object.
818              
819             =item callback
820              
821             Provide a code callback reference to use when iterating over the
822             alignments. Two objects are passed to this code function: the
823             alignment object and the data structure that is provided. See the
824             Bam adapter documentation for details on low-level C through
825             the Bam index object for details.
826              
827             =item data
828              
829             This is a reference to a C data object for storing information.
830             It is passed to the callback function along with the alignment. Three
831             new key =E value pairs are automatically added: C, C,
832             and C. These correspond to the values for the current queried
833             interval. Coordinates are automatically transformed to 0-base coordinate
834             system to match low level alignment objects.
835              
836             =item subfeature
837              
838             If the feature has subfeatures, such as exons, introns, etc., pass
839             the name of the subfeature to restrict iteration only over the
840             indicated subfeatures. The C object will inherit the coordinates
841             for each subfeatures. Allowed subfeatures include the following:
842              
843             =over 4
844              
845             =item * exon
846              
847             =item * cds
848              
849             =item * 5p_utr
850              
851             =item * 3p_utr
852              
853             =item * intron
854              
855             =back
856              
857             =item start
858              
859             =item stop
860              
861             =item end
862              
863             Provide alternate, custom start and stop coordinates for the row
864             feature. Ignored with subfeatures.
865              
866             =back
867              
868             =back
869              
870             =head2 Feature Export
871              
872             These methods allow the feature to be exported in industry standard
873             formats, including the BED format and the GFF format. Both methods
874             return a formatted tab-delimited text string suitable for printing to
875             file. The string does not include a line ending character.
876              
877             These methods rely on coordinates being present in the source table.
878             If the row feature represents a database item, the L method
879             should be called prior to these methods, allowing the feature to be
880             retrieved from the database and coordinates obtained.
881              
882             =over 4
883              
884             =item bed_string
885              
886             Returns a BED formatted string. By default, a 6-element string is
887             generated, unless otherwise specified. Pass an array of key values
888             to control how the string is generated. The following arguments
889             are supported.
890              
891             =over 4
892              
893             =item bed
894              
895             Specify the number of BED elements to include. The number of elements
896             correspond to the number of columns in the BED file specification. A
897             minimum of 3 (chromosome, start, stop) is required, and maximum of 6
898             is allowed (chromosome, start, stop, name, score, strand).
899              
900             =item chromo
901              
902             =item seq_id
903              
904             Provide a text string of an alternative chromosome or sequence name.
905              
906             =item start
907              
908             =item stop
909              
910             =item end
911              
912             Provide alternative integers for the start and stop coordinates.
913             Note that start values are automatically converted to 0-base
914             by subtracting 1.
915              
916             =item strand
917              
918             Provide alternate an alternative strand value.
919              
920             =item name
921              
922             Provide an alternate or missing name value to be used as text in the 4th
923             column. If no name is provided or available, a default name is generated.
924              
925             =item score
926              
927             Provide a numerical value to be included as the score. BED files typically
928             use integer values ranging from 1..1000.
929              
930             =back
931              
932             =item gff_string
933              
934             Returns a GFF3 formatted string. Pass an array of key values
935             to control how the string is generated. The following arguments
936             are supported.
937              
938             =over 4
939              
940             =item chromo
941              
942             =item seq_id
943              
944             =item start
945              
946             =item stop
947              
948             =item end
949              
950             =item strand
951              
952             Provide alternate values from those defined or missing in the current
953             row Feature.
954              
955             =item source
956              
957             Provide a text string to be used as the source_tag value in the 2nd
958             column. The default value is null ".".
959              
960             =item primary_tag
961              
962             Provide a text string to be used as the primary_tag value in the 3rd
963             column. The default value is null ".".
964              
965             =item type
966              
967             Provide a text string. This can be either a "primary_tag:source_tag" value
968             as used by GFF based BioPerl databases, or "primary_tag" alone.
969              
970             =item score
971              
972             Provide a numerical value to be included as the score. The default
973             value is null ".".
974              
975             =item name
976              
977             Provide alternate or missing name value to be used as the display_name.
978             If no name is provided or available, a default name is generated.
979              
980             =item attributes
981              
982             Provide an anonymous array reference of one or more row Feature indices
983             to be used as GFF attributes. The name of the column is used as the GFF
984             attribute key.
985              
986             =back
987              
988             =back
989              
990             =cut
991              
992 3     3   17 use strict;
  3         4  
  3         82  
993 3     3   11 use Carp qw(carp cluck croak confess);
  3         5  
  3         138  
994 3     3   13 use Module::Load;
  3         13  
  3         15  
995 3         126 use Bio::ToolBox::db_helper qw(
996             get_db_feature
997             get_segment_score
998             calculate_score
999             get_genomic_sequence
1000             low_level_bam_fetch
1001 3     3   127 );
  3         6  
1002 3     3   13 use Bio::ToolBox::db_helper::constants;
  3         5  
  3         20259  
1003              
1004             my $GENETOOL_LOADED = 0;
1005             1;
1006              
1007              
1008              
1009             ### Initialization
1010              
1011             sub new {
1012             # this should ONLY be called from Bio::ToolBox::Data* iterators
1013 353     353 0 392 my $class = shift;
1014 353         610 my %self = @_;
1015             # we trust that new is called properly with data and index values
1016 353         818 return bless \%self, $class;
1017             }
1018              
1019              
1020             ### Set and retrieve values
1021              
1022             sub data {
1023 0     0 1 0 return shift->{data};
1024             }
1025              
1026             sub column_name {
1027 0     0 1 0 my ($self, $column) = @_;
1028 0 0       0 return unless defined $column;
1029 0         0 return $self->{data}->name($column);
1030             }
1031              
1032             sub feature_type {
1033 24     24 1 26 my $self = shift;
1034 24 50       36 carp "feature_type is a read only method" if @_;
1035 24         51 return $self->{data}->feature_type;
1036             }
1037              
1038             sub row_index {
1039 65     65 1 69 my $self = shift;
1040 65 50       102 carp "row_index is a read only method" if @_;
1041 65         141 return $self->{'index'};
1042             }
1043              
1044             sub line_number {
1045 0     0 0 0 my $self = shift;
1046 0 0       0 carp "line_number is a read only method" if @_;
1047 0 0       0 if (exists $self->{data}->{line_count}) {
    0          
1048 0         0 return $self->{data}->{line_count};
1049             }
1050             elsif (exists $self->{data}->{header_line_count}) {
1051 0         0 return $self->{data}->{header_line_count} + $self->row_index;
1052             }
1053             else {
1054 0         0 return;
1055             }
1056             }
1057              
1058             sub row_values {
1059 196     196 1 243 my $self = shift;
1060 196 50       262 carp "row_values is a read only method" if @_;
1061 196         200 my $row = $self->{'index'};
1062 196         191 my @data = @{ $self->{data}->{data_table}->[$row] };
  196         408  
1063 196 100       498 return wantarray ? @data : \@data;
1064             }
1065              
1066             sub value {
1067 341     341 1 1349 my ($self, $column, $value) = @_;
1068 341 50       453 return unless defined $column;
1069 341         372 my $row = $self->{'index'};
1070            
1071 341 100       416 if (defined $value) {
1072             # set a value
1073 85         102 $self->{data}->{data_table}->[$row][$column] = $value;
1074             }
1075 341         454 my $v = $self->{data}->{data_table}->[$row][$column];
1076 341 100       741 return length($v) ? $v : '.'; # internal null value, inherited from GFF definition
1077             }
1078              
1079             sub seq_id {
1080 21     21 1 329 my $self = shift;
1081 21 100       38 if ($_[0]) {
1082 1         3 my $i = $self->{data}->chromo_column;
1083             # update only if we have an actual column
1084 1 50       2 if (defined $i) {
    0          
1085 1         3 $self->value($i, $_[0]);
1086 1         2 $self->{seqid} = $_[0];
1087             }
1088             elsif (exists $self->{feature}) {
1089 0         0 carp "Unable to update seq_id for parsed SeqFeature objects";
1090             }
1091             else {
1092 0         0 carp "No Chromosome column to update!";
1093             }
1094             }
1095            
1096             # return cached value
1097 21 100       59 return $self->{seqid} if exists $self->{seqid};
1098            
1099             # get seq_id value
1100 6         8 my $c;
1101 6         17 my $i = $self->{data}->chromo_column;
1102 6         22 my $j = $self->{data}->coord_column;
1103 6 50       13 if (defined $i) {
    0          
    0          
    0          
1104             # collect from table
1105 6         10 $c = $self->value($i);
1106             }
1107             elsif (exists $self->{feature}) {
1108             # seqfeature
1109 0         0 $c = $self->{feature}->seq_id;
1110             }
1111             elsif (defined $j) {
1112 0         0 $self->_from_coordinate_string($j);
1113 0         0 return $self->{seqid};
1114             }
1115             elsif ($self->feature_type eq 'named') {
1116 0         0 my $f = $self->seqfeature;
1117 0 0       0 if ($f) {
1118 0         0 $c = $f->seq_id;
1119             }
1120             }
1121 6 50       25 $self->{seqid} = $c if defined $c;
1122 6         17 return $c;
1123             }
1124              
1125             sub start {
1126 119     119 1 786 my $self = shift;
1127 119 100       169 if ($_[0]) {
1128 5         17 my $i = $self->{data}->start_column;
1129             # update only if we have an actual column
1130 5 50       36 my $d = $_[0] =~ /^\d+$/ ? 1 : 0; # looks like an integer
1131 5 100 66     29 if (defined $i and $d) {
    50          
    50          
1132 1 50       3 if ($self->{data}->interbase) {
1133             # compensate for 0-based, assuming we're always working with 1-based
1134 1         3 $self->value($i, $_[0] - 1);
1135 1         3 $self->{start} = $_[0];
1136             }
1137             else {
1138 0         0 $self->value($i, $_[0]);
1139 0         0 $self->{start} = $_[0];
1140             }
1141             }
1142             elsif (not $d) {
1143 0         0 carp "Start coordinate value is not an integer";
1144             }
1145             elsif (exists $self->{feature}) {
1146 4         465 carp "Unable to update Start coordinate for parsed SeqFeature objects";
1147             }
1148             else {
1149 0         0 carp "No Start coordinate column to update!";
1150             }
1151             }
1152            
1153             # return cached value
1154 119 100       459 return $self->{start} if exists $self->{start};
1155            
1156             # get start value
1157 92         86 my $s;
1158 92         157 my $i = $self->{data}->start_column;
1159 92         156 my $j = $self->{data}->coord_column;
1160 92 100       139 if (defined $i) {
    50          
    0          
    0          
1161             # collect from table
1162 87         105 $s = $self->value($i);
1163 87 100       149 if ($self->{data}->interbase) {
1164             # compensate for 0-based index
1165 6         13 $s += 1;
1166             }
1167             }
1168             elsif (exists $self->{feature}) {
1169             # seqfeature
1170 5         19 $s = $self->{feature}->start;
1171             }
1172             elsif (defined $j) {
1173 0         0 $self->_from_coordinate_string($j);
1174 0         0 return $self->{start};
1175             }
1176             elsif ($self->feature_type eq 'named') {
1177 0         0 my $f = $self->seqfeature;
1178 0 0       0 if ($f) {
1179 0         0 $s = $f->start;
1180             }
1181             }
1182 92 50       152 $self->{start} = $s if defined $s;
1183 92         175 return $s;
1184             }
1185              
1186             *stop = \&end;
1187             sub end {
1188 38     38 1 49 my $self = shift;
1189 38 100       73 if ($_[0]) {
1190 2         7 my $i = $self->{data}->stop_column;
1191             # update only if we have an actual column
1192 2 50       12 my $d = $_[0] =~ /^\d+$/ ? 1 : 0;
1193 2 50 33     8 if (defined $i and $d) {
    0          
    0          
1194 2         6 $self->value($i, $_[0]);
1195 2         3 $self->{end} = $_[0];
1196             }
1197             elsif (not $d) {
1198 0         0 carp "End coordinate value is not an integer";
1199             }
1200             elsif (exists $self->{feature}) {
1201 0         0 carp "Unable to update End coordinate for parsed SeqFeature objects";
1202             }
1203             else {
1204 0         0 carp "No End coordinate column to update!";
1205             }
1206             }
1207            
1208             # return cached value
1209 38 100       99 return $self->{end} if exists $self->{end};
1210            
1211             # get end value
1212 11         15 my $e;
1213 11         38 my $i = $self->{data}->stop_column;
1214 11         23 my $j = $self->{data}->coord_column;
1215 11 100       24 if (defined $i) {
    50          
    0          
    0          
1216             # collect from table
1217 10         19 $e = $self->value($i);
1218             }
1219             elsif (exists $self->{feature}) {
1220             # seqfeature
1221 1         4 $e = $self->{feature}->end;
1222             }
1223             elsif (defined $j) {
1224 0         0 $self->_from_coordinate_string($j);
1225 0         0 return $self->{end};
1226             }
1227             elsif ($self->feature_type eq 'named') {
1228 0         0 my $f = $self->seqfeature;
1229 0 0       0 if ($f) {
1230 0         0 $e = $f->end;
1231             }
1232             }
1233 11 50       28 $self->{end} = $e if defined $e;
1234 11         27 return $e;
1235             }
1236              
1237             sub strand {
1238 29     29 1 41 my $self = shift;
1239 29 100       58 if ($_[0]) {
1240             # update only if we have an actual column
1241 1         4 my $i = $self->{data}->strand_column;
1242 1 50       4 if (defined $i) {
    50          
1243 0         0 $self->value($i, $_[0]);
1244 0         0 $self->{strand} = $self->_strand($_[0]);
1245             }
1246             elsif (exists $self->{feature}) {
1247 0         0 carp "Unable to update Strand for parsed SeqFeature objects";
1248             }
1249             else {
1250 1         118 carp "No Strand column to update!";
1251             }
1252             }
1253            
1254             # return cached value
1255 29 100       140 return $self->{strand} if exists $self->{strand};
1256            
1257             # get strand value
1258 7         12 my $s = 0;
1259 7 100       32 if (my $i = $self->{data}->strand_column) {
    100          
    50          
1260             # collect from table
1261 2         6 $s = $self->_strand( $self->value($i) );
1262             }
1263             elsif (exists $self->{feature}) {
1264             # seqfeature
1265 4         14 $s = $self->{feature}->strand;
1266             }
1267             elsif ($self->feature_type eq 'named') {
1268 0         0 my $f = $self->seqfeature;
1269 0 0       0 if ($f) {
1270 0         0 $s = $f->strand;
1271             }
1272             }
1273 7         17 $self->{strand} = $s;
1274 7         22 return $s;
1275             }
1276              
1277             sub _strand {
1278 2     2   4 my $self = shift;
1279 2         8 my $str = shift;
1280 2 50 33     22 if ($str eq '1' or $str eq '-1' or $str eq '0') {
    50 33        
    50          
    0          
    0          
    0          
1281 0         0 return $str;
1282             }
1283             elsif ($str eq '+') {
1284 0         0 return 1;
1285             }
1286             elsif ($str eq '-') {
1287 2         5 return -1;
1288             }
1289             elsif ($str eq '.') {
1290 0         0 return 0;
1291             }
1292             elsif ($str eq '2') {
1293             # R packages use 1 for + and 2 for -
1294 0         0 return -1;
1295             }
1296             elsif ($str eq '*') {
1297             # R packages use * for .
1298 0         0 return 0;
1299             }
1300             else {
1301 0         0 return 0;
1302             }
1303             }
1304              
1305             sub _from_coordinate_string {
1306 0     0   0 my ($self, $i) = shift;
1307 0         0 my ($chr, $start, $end, $str) = split /(?:\-|\.\.|\s)/, $self->value($i);
1308 0 0       0 $self->{seqid} = $chr unless exists $self->{seqid};
1309 0 0       0 $self->{start} = $start unless exists $self->{start};
1310             # we assume this is a 1-based coordinate
1311 0 0       0 $self->{end} = $end unless exists $self->{end};
1312 0 0 0     0 if (defined $str and not exists $self->{strand}) {
1313             # you never know, the strand may be added to the string
1314 0         0 $self->{strand} = $self->_strand($str);
1315             }
1316             }
1317              
1318             sub peak {
1319 3     3 1 276 my $self = shift;
1320 3 100       12 if ($self->{data}->format eq 'narrowPeak') {
1321 2 100 66     12 if (exists $self->{feature} and $self->{feature}->has_tag('peak')) {
1322 1         4 return $self->{feature}->get_tag_values('peak') + $self->{feature}->start;
1323             }
1324             else {
1325 1         4 return $self->value(1) + $self->value(9) + 1;
1326             }
1327             }
1328             else {
1329 1         2 return $self->midpoint;
1330             }
1331             }
1332              
1333             sub midpoint {
1334 5     5 1 10 my $self = shift;
1335 5         14 my $s = $self->start;
1336 5         10 my $e = $self->end;
1337 5 50 33     18 if ($s and $e) {
1338 5         26 return int( ($s + $e) / 2 );
1339             }
1340             else {
1341 0         0 return undef;
1342             }
1343             }
1344              
1345             *name = \&display_name;
1346             sub display_name {
1347 76     76 1 1292 my $self = shift;
1348 76 100       128 if ($_[0]) {
1349             # update only if we have an actual column
1350 1         3 my $i = $self->{data}->name_column;
1351 1 50       2 if (defined $i) {
    0          
1352 1         3 return $self->value($i, $_[0]);
1353             }
1354             elsif (exists $self->{feature}) {
1355 0         0 carp "Unable to update display_name for parsed SeqFeature objects";
1356             }
1357             else {
1358 0         0 carp "No Name column to update!";
1359             }
1360             }
1361             # seqfeature
1362 75 100       124 if (exists $self->{feature}) {
1363 5         21 return $self->{feature}->display_name;
1364             }
1365             # collect from table
1366 70         136 my $i = $self->{data}->name_column;
1367 70 50       102 if (defined $i) {
    0          
1368 70         122 return $self->value($i);
1369             }
1370             elsif (my $att = $self->gff_attributes) {
1371             return $att->{Name} || $att->{ID} || $att->{transcript_name} ||
1372 0   0     0 $att->{gene_name} || undef;
1373             }
1374             }
1375              
1376             sub coordinate {
1377 3     3 1 4 my $self = shift;
1378 3 50       8 carp "name is a read only method" if @_;
1379             # to avoid auto-converting start0 coordinates, which might confuse people or programs,
1380             # we will take the start value as is when it's available, otherwise calculate start
1381 3         8 my $start_i = $self->{data}->start_column;
1382             my $coord = sprintf("%s:%d", $self->seq_id,
1383             defined $start_i ? $self->value($start_i) :
1384 3 0       7 exists $self->{feature} ? $self->{feature}->start : 0);
    50          
1385 3         7 my $end = $self->end;
1386 3 50       9 $coord .= "-$end" if $end;
1387 3 50       12 return CORE::length($coord) > 2 ? $coord : undef;
1388             }
1389              
1390             sub type {
1391 9     9 1 20 my $self = shift;
1392 9 100       23 if ($_[0]) {
1393             # update only if we have an actual column
1394 1         4 my $i = $self->{data}->type_column;
1395 1 50       4 if (defined $i) {
    50          
1396 0         0 return $self->value($i, $_[0]);
1397             }
1398             elsif (exists $self->{feature}) {
1399 0         0 carp "Unable to update primary_tag for parsed SeqFeature objects";
1400             }
1401             else {
1402 1         56 carp "No Type column to update!";
1403             }
1404             }
1405             # collect from table
1406 9         90 my $i = $self->{data}->type_column;
1407 9 50       18 if (defined $i) {
1408 0         0 return $self->value($i);
1409             }
1410             # seqfeature
1411 9 100       18 if (exists $self->{feature}) {
1412 4         19 return $self->{feature}->primary_tag;
1413             }
1414             # general metadata
1415 5 50       11 if ($self->{data}->feature) {
1416 5         9 return $self->{data}->feature;
1417             }
1418 0         0 return undef;
1419             }
1420              
1421             *id = \&primary_id;
1422             sub primary_id {
1423 66     66 1 85 my $self = shift;
1424 66 50       103 carp "id is a read only method" if @_;
1425 66         113 my $i = $self->{data}->id_column;
1426 66 50       132 my $v = $self->value($i) if defined $i;
1427 66 50 33     172 if (defined $v and $v ne '.') {
1428 66         153 return $v;
1429             }
1430 0 0       0 return $self->{feature}->primary_id if exists $self->{feature};
1431 0 0       0 if (my $att = $self->gff_attributes) {
1432 0   0     0 return $att->{ID} || $att->{Name} || $att->{transcript_id};
1433             }
1434 0         0 return undef;
1435             }
1436              
1437             sub length {
1438 2     2 1 4 my $self = shift;
1439 2 50       5 carp "length is a read only method" if @_;
1440 2 50       7 if ($self->{data}->vcf) {
1441             # special case for vcf files, measure the length of the ALT allele
1442 0         0 return CORE::length($self->value(4));
1443             }
1444 2         4 my $s = $self->start;
1445 2         5 my $e = $self->end;
1446 2 50 33     9 if (defined $s and defined $e) {
    0          
1447 2         7 return $e - $s + 1;
1448             }
1449             elsif (defined $s) {
1450 0         0 return 1;
1451             }
1452             else {
1453 0         0 return undef;
1454             }
1455             }
1456              
1457             sub score {
1458 0     0 1 0 my $self = shift;
1459 0         0 my $c = $self->{data}->score_column;
1460 0 0       0 return defined $c ? $self->value($c) : undef;
1461             }
1462              
1463             sub attributes {
1464 0     0 1 0 my $self = shift;
1465 0 0       0 return $self->gff_attributes if ($self->{data}->gff);
1466 0 0       0 return $self->vcf_attributes if ($self->{data}->vcf);
1467 0         0 return;
1468             }
1469              
1470             sub gff_attributes {
1471 1     1 1 2 my $self = shift;
1472 1 50       4 return unless ($self->{data}->gff);
1473 1 50       3 return $self->{attributes} if (exists $self->{attributes});
1474 1         2 $self->{attributes} = {};
1475 1         3 foreach my $g (split(/\s*;\s*/, $self->value(8))) {
1476 3         18 my ($tag, $value) = split /\s+|=/, $g;
1477 3 50 33     10 next unless ($tag and $value);
1478             # unescape URL encoded values, borrowed from Bio::DB::GFF
1479 3         4 $value =~ tr/+/ /;
1480 3         7 $value =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  11         24  
1481 3         6 $self->{attributes}->{$tag} = $value;
1482             }
1483 1         3 return $self->{attributes};
1484             }
1485              
1486             sub vcf_attributes {
1487 0     0 1 0 my $self = shift;
1488 0 0       0 return unless ($self->{data}->vcf);
1489 0 0       0 return $self->{attributes} if (exists $self->{attributes});
1490 0         0 $self->{attributes} = {};
1491            
1492             # INFO attributes
1493 0         0 my %info;
1494 0 0       0 if ($self->{data}->name(7) eq 'INFO') {
1495 0 0       0 %info = map {$_->[0] => defined $_->[1] ? $_->[1] : undef}
1496             # some tags are simple and have no value, eg SOMATIC
1497 0         0 map { [split(/=/, $_)] }
  0         0  
1498             split(/;/, $self->value(7));
1499             }
1500 0         0 $self->{attributes}->{INFO} = \%info;
1501 0         0 $self->{attributes}->{7} = \%info;
1502            
1503             # Sample attributes
1504 0 0       0 if ($self->{data}->number_columns > 8) {
1505 0         0 my @formatKeys = split /:/, $self->value(8);
1506 0         0 foreach my $i (9 .. $self->{data}->last_column) {
1507 0         0 my $name = $self->{data}->name($i);
1508 0         0 my @sampleVals = split /:/, $self->value($i);
1509             my %sample = map {
1510 0 0       0 $formatKeys[$_] => defined $sampleVals[$_] ? $sampleVals[$_] : undef }
  0         0  
1511             (0 .. $#formatKeys);
1512 0         0 $self->{attributes}->{$name} = \%sample;
1513 0         0 $self->{attributes}->{$i} = \%sample;
1514             }
1515             }
1516 0         0 return $self->{attributes};
1517             }
1518              
1519             sub rewrite_attributes {
1520 0     0 1 0 my $self = shift;
1521 0 0       0 return $self->rewrite_gff_attributes if ($self->{data}->gff);
1522 0 0       0 return $self->rewrite_vcf_attributes if ($self->{data}->vcf);
1523 0         0 return;
1524             }
1525              
1526             sub rewrite_gff_attributes {
1527 1     1 1 747 my $self = shift;
1528 1 50       3 return unless ($self->{data}->gff);
1529 1 50       3 return unless exists $self->{attributes};
1530 1         2 my @pairs; # of key=value items
1531 1 50       2 if (exists $self->{attributes}{ID}) {
1532             # I assume this does not need to be escaped!
1533 1         3 push @pairs, 'ID=' . $self->{attributes}{ID};
1534             }
1535 1 50       3 if (exists $self->{attributes}{Name}) {
1536 1         1 my $name = $self->{attributes}{Name};
1537 1         3 $name =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
1538 1         3 push @pairs, "Name=$name";
1539             }
1540 1         2 foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes} }) {
  2         5  
  1         5  
1541 3 100       7 next if $key eq 'ID';
1542 2 100       3 next if $key eq 'Name';
1543 1         2 my $value = $self->{attributes}{$key};
1544 1         2 $key =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
1545 1         3 $value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  8         20  
1546 1         4 push @pairs, "$key=$value";
1547             }
1548 1         5 $self->value(8, join("; ", @pairs));
1549 1         4 return 1;
1550             }
1551              
1552             sub rewrite_vcf_attributes {
1553 0     0 1 0 my $self = shift;
1554 0 0       0 return unless ($self->{data}->vcf);
1555 0 0       0 return unless exists $self->{attributes};
1556            
1557             # INFO
1558             my $info = join(';',
1559             map {
1560             defined $self->{attributes}->{INFO}{$_} ?
1561 0 0       0 join('=', $_, $self->{attributes}->{INFO}{$_}) : $_
1562             }
1563 0         0 sort {$a cmp $b}
1564 0         0 keys %{$self->{attributes}->{INFO}}
  0         0  
1565             );
1566 0   0     0 $info ||= '.'; # sometimes we have nothing left
1567 0         0 $self->value(7, $info);
1568            
1569             # FORMAT
1570 0         0 my @order;
1571 0 0       0 push @order, 'GT' if exists $self->{attributes}{9}{GT};
1572 0         0 foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes}{9} } ) {
  0         0  
  0         0  
1573 0 0       0 next if $key eq 'GT';
1574 0         0 push @order, $key;
1575             }
1576 0 0       0 if (@order) {
1577 0         0 $self->value(8, join(':', @order));
1578             }
1579             else {
1580 0         0 $self->value(8, '.');
1581             }
1582            
1583             # SAMPLES
1584 0         0 foreach my $i (9 .. $self->{data}->last_column) {
1585 0 0       0 if (@order) {
1586             $self->value($i, join(":",
1587 0         0 map { $self->{attributes}{$i}{$_} } @order
  0         0  
1588             ) );
1589             }
1590             else {
1591 0         0 $self->value($i, '.');
1592             }
1593             }
1594 0         0 return 1;
1595             }
1596              
1597             ### Data collection convenience methods
1598              
1599             *feature = \&seqfeature;
1600             sub seqfeature {
1601 4     4 1 6 my $self = shift;
1602 4   50     8 my $force = shift || 0;
1603 4 50       8 carp "feature is a read only method" if @_;
1604 4 50       8 return $self->{feature} if exists $self->{feature};
1605             # normally this is only for named features in a data table
1606             # skip this for coordinate features like bed files
1607 4 50 33     7 return unless $self->feature_type eq 'named' or $force;
1608            
1609             # retrieve from main Data store
1610 0         0 my $f = $self->{data}->get_seqfeature( $self->{'index'} );
1611 0 0       0 if ($f) {
1612 0         0 $self->{feature} = $f;
1613 0         0 return $f;
1614             }
1615            
1616             # retrieve the feature from the database
1617 0 0       0 return unless $self->{data}->database;
1618             $f = get_db_feature(
1619             'db' => $self->{data}->open_meta_database,
1620             'id' => $self->id || undef,
1621             'name' => $self->name || undef,
1622             'type' => $self->type || $self->{data}->feature,
1623 0   0     0 );
      0        
      0        
1624 0 0       0 return unless $f;
1625 0         0 $self->{feature} = $f;
1626 0         0 return $f;
1627             }
1628              
1629             sub segment {
1630 1     1 1 2 my $self = shift;
1631 1 50       3 carp "segment is a read only method" if @_;
1632 1 50       4 return unless $self->{data}->database;
1633 1 50       3 if ($self->feature_type eq 'coordinate') {
    0          
1634 1         2 my $chromo = $self->seq_id;
1635 1         3 my $start = $self->start;
1636 1   33     2 my $stop = $self->end || $start;
1637 1         3 my $db = $self->{data}->open_meta_database;
1638 1 50       6 return $db ? $db->segment($chromo, $start, $stop) : undef;
1639             }
1640             elsif ($self->feature_type eq 'named') {
1641 0         0 my $f = $self->feature;
1642 0 0       0 return $f ? $f->segment : undef;
1643             }
1644             else {
1645 0         0 return undef;
1646             }
1647             }
1648              
1649             sub get_features {
1650 0     0 1 0 my $self = shift;
1651 0         0 my %args = @_;
1652 0   0     0 my $db = $args{db} || $self->{data}->open_meta_database || undef;
1653 0 0       0 carp "no database defined to get features!" unless defined $db;
1654 0 0       0 return unless $db->can('features');
1655            
1656             # convert the argument style for most bioperl db APIs
1657 0         0 my %opts;
1658 0   0     0 $opts{-seq_id} = $args{chromo} || $self->seq_id;
1659 0   0     0 $opts{-start} = $args{start} || $self->start;
1660 0   0     0 $opts{-end} = $args{end} || $self->end;
1661 0   0     0 $opts{-type} = $args{type} || $self->type;
1662            
1663 0         0 return $db->features(%opts);
1664             }
1665              
1666             sub get_sequence {
1667 0     0 1 0 my $self = shift;
1668 0         0 my %args = @_;
1669 0   0     0 my $db = $args{db} || $args{database} || $self->{data}->open_meta_database || undef;
1670             # this will fail immediately if user doesn't provide valid database
1671            
1672             # get sequence over subfeatures
1673 0   0     0 $args{subfeature} ||= undef;
1674 0 0 0     0 if ($self->feature_type eq 'named' and $args{subfeature}) {
1675             # this is more complicated so we have a dedicated method
1676 0         0 return $self->_get_subfeature_sequence($db, \%args);
1677             }
1678            
1679             # get coordinates
1680 0   0     0 my $seqid = $args{seq_id} || $args{chromo} || $self->seq_id;
1681 0   0     0 my $start = $args{start} || $self->start;
1682 0   0     0 my $stop = $args{stop} || $args{end} || $self->end;
1683 0         0 my $strand = $self->strand;
1684 0 0       0 if (exists $args{strand}) {
1685             # user supplied strand, gotta check it
1686 0 0       0 $strand = $args{strand} =~ /\-|r/i ? -1 : 1;
1687             }
1688 0 0 0     0 if (exists $args{extend} and $args{extend}) {
1689 0         0 $start -= $args{extend};
1690 0 0       0 $start = 1 if $start <= 0;
1691 0         0 $stop += $args{extend};
1692             }
1693 0 0 0     0 return unless (defined $seqid and defined $start and defined $stop);
      0        
1694            
1695             # retrieve and return sequence
1696 0         0 my $seq = get_genomic_sequence($db, $seqid, $start, $stop);
1697 0 0       0 if ($strand == -1) {
1698 0         0 $seq =~ tr/gatcGATC/ctagCTAG/;
1699 0         0 $seq = reverse $seq;
1700             }
1701 0         0 return $seq;
1702             }
1703              
1704             sub _get_subfeature_sequence {
1705 0     0   0 my ($self, $db, $args) = @_;
1706            
1707             # get the subfeatures
1708 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1709 0 0       0 unless (@$subfeatures) {
1710 0         0 carp "no subfeatures available! Returning parent sequence!";
1711             # just return the parent
1712 0         0 undef $args->{subfeature};
1713 0         0 return $self->get_sequence(@$args);
1714             }
1715            
1716             # sort subfeatures
1717             # this should be done by GeneTools in most cases but just to be sure
1718             # note that this does NOT merge redundant or overlapping exons!!!!
1719 0         0 my @sorted = map { $_->[0] }
1720 0 0       0 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
1721 0         0 map { [$_, $_->start, $_->end] }
  0         0  
1722             @$subfeatures;
1723            
1724             # collect sequence
1725 0         0 my $sequence;
1726 0         0 foreach my $subf (@sorted) {
1727 0         0 my $seq = get_genomic_sequence($db, $subf->seq_id, $subf->start, $subf->stop);
1728 0         0 $sequence .= $seq;
1729             }
1730            
1731             # flip the sequence
1732 0 0       0 if ($self->strand == -1) {
1733 0         0 $sequence =~ tr/gatcGATC/ctagCTAG/;
1734 0         0 $sequence = reverse $sequence;
1735             }
1736 0         0 return $sequence;
1737             }
1738              
1739             sub _get_subfeatures {
1740 0     0   0 my $self = shift;
1741 0         0 my $subf = lc shift;
1742            
1743             # load GeneTools
1744 0 0       0 unless ($GENETOOL_LOADED) {
1745 0         0 load('Bio::ToolBox::GeneTools', qw(get_exons get_cds get_5p_utrs get_3p_utrs
1746             get_introns));
1747 0 0       0 if ($@) {
1748 0         0 croak "missing required modules! $@";
1749             }
1750             else {
1751 0         0 $GENETOOL_LOADED = 1;
1752             }
1753             }
1754            
1755             # feature
1756 0         0 my $feature = $self->seqfeature;
1757 0 0       0 return unless ($feature);
1758            
1759             # get the subfeatures
1760 0         0 my @subfeatures;
1761 0 0       0 if ($subf eq 'exon') {
    0          
    0          
    0          
    0          
1762 0         0 @subfeatures = get_exons($feature);
1763             }
1764             elsif ($subf eq 'cds') {
1765 0         0 @subfeatures = get_cds($feature);
1766             }
1767             elsif ($subf eq '5p_utr') {
1768 0         0 @subfeatures = get_5p_utrs($feature);
1769             }
1770             elsif ($subf eq '3p_utr') {
1771 0         0 @subfeatures = get_3p_utrs($feature);
1772             }
1773             elsif ($subf eq 'intron') {
1774 0         0 @subfeatures = get_introns($feature);
1775             }
1776             else {
1777 0         0 croak "unrecognized subfeature parameter '$subf'!";
1778             }
1779            
1780 0         0 return \@subfeatures;
1781             }
1782              
1783             sub get_score {
1784 7     7 1 4473 my $self = shift;
1785 7         24 my %args = @_; # passed arguments to this method
1786            
1787             # verify the dataset for the user, cannot trust whether it has been done or not
1788 7   50     46 my $db = $args{ddb} || $args{db} || $self->{data}->open_meta_database || undef;
1789 7         30 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $db);
1790 7 50       14 unless ($args{dataset}) {
1791 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!";
1792             }
1793            
1794             # get positioned scores over subfeatures only
1795 7   50     32 $args{subfeature} ||= q();
1796 7 0 33     20 if ($self->feature_type eq 'named' and $args{subfeature}) {
1797             # this is more complicated so we have a dedicated method
1798 0         0 return $self->_get_subfeature_scores($db, \%args);
1799             }
1800            
1801             # build parameter array to pass on to the adapter
1802 7         10 my @params;
1803            
1804             # verify coordinates based on type of feature
1805 7 50       12 if ($self->feature_type eq 'coordinate') {
    0          
1806             # coordinates are already in the table, use those
1807 7   33     25 $params[CHR] = $args{seq_id} || $self->seq_id;
1808 7   33     24 $params[STRT] = $args{start} || $self->start;
1809 7   33     34 $params[STOP] = $args{stop} || $args{end} || $self->end;
1810             $params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1811 7 50 33     25 $self->strand;
1812             }
1813             elsif ($self->feature_type eq 'named') {
1814             # must retrieve feature from the database first
1815 0         0 my $f = $self->seqfeature;
1816 0 0       0 return unless $f;
1817 0   0     0 $params[CHR] = $args{seq_id} || $f->seq_id;
1818 0   0     0 $params[STRT] = $args{start} || $f->start;
1819 0   0     0 $params[STOP] = $args{stop} || $args{end} || $f->end;
1820             $params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1821 0 0 0     0 $f->strand;
1822             }
1823             else {
1824 0         0 croak "data table does not have identifiable coordinate or feature identification columns for score collection";
1825             }
1826            
1827             # adjust coordinates as necessary
1828 7 0 33     15 if (exists $args{extend} and $args{extend}) {
1829 0         0 $params[STRT] -= $args{extend};
1830 0         0 $params[STOP] += $args{extend};
1831             }
1832            
1833             # check coordinates
1834 7 50       11 $params[STRT] = 1 if $params[STRT] <= 0;
1835 7 50       18 if ($params[STOP] < $params[STRT]) {
1836             # coordinates are flipped, reverse strand
1837 0 0       0 return if ($params[STOP] <= 0);
1838 0         0 my $stop = $params[STRT];
1839 0         0 $params[STRT] = $params[STOP];
1840 0         0 $params[STOP] = $stop;
1841 0         0 $params[STR] = -1;
1842             }
1843 7 50 33     21 return unless ($params[CHR] and defined $params[STRT]);
1844            
1845             # score attributes
1846 7   50     14 $params[METH] = $args{'method'} || 'mean';
1847 7   100     26 $params[STND] = $args{strandedness} || $args{stranded} || 'all';
1848            
1849             # other parameters
1850 7         9 $params[DB] = $db;
1851 7         8 $params[RETT] = 0; # return type should be a calculated value
1852 7         14 $params[DATA] = $args{dataset};
1853            
1854             # get the score
1855 7         19 return get_segment_score(@params);
1856             }
1857              
1858             sub _get_subfeature_scores {
1859 0     0   0 my ($self, $db, $args) = @_;
1860            
1861             # get the subfeatures
1862 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1863 0 0       0 unless (@$subfeatures) {
1864 0         0 carp "no subfeatures available! Returning parent score data!";
1865             # just return the parent
1866 0         0 undef $args->{subfeature};
1867 0 0       0 delete $args->{exon} if exists $args->{exon};
1868 0         0 return $self->get_score(@$args);
1869             }
1870            
1871             # collect over each subfeature
1872 0         0 my @scores;
1873 0         0 foreach my $exon (@$subfeatures) {
1874 0         0 my @params; # parameters to pass on to adapter
1875 0         0 $params[CHR] = $exon->seq_id;
1876 0         0 $params[STRT] = $exon->start;
1877 0         0 $params[STOP] = $exon->end;
1878 0 0       0 $params[STR] = defined $args->{strand} ? $args->{strand} : $exon->strand;
1879 0   0     0 $params[STND] = $args->{strandedness} || $args->{stranded} || 'all';
1880 0   0     0 $params[METH] = $args->{method} || 'mean';
1881 0         0 $params[RETT] = 1; # return type should be an array reference of scores
1882 0         0 $params[DB] = $db;
1883 0         0 $params[DATA] = $args->{dataset};
1884            
1885 0         0 my $exon_scores = get_segment_score(@params);
1886 0 0       0 push @scores, @$exon_scores if defined $exon_scores;
1887             }
1888            
1889             # combine all the scores based on the requested method
1890 0         0 return calculate_score($args->{method}, \@scores);
1891             }
1892              
1893             sub get_relative_point_position_scores {
1894 1     1 1 1398 my $self = shift;
1895 1         5 my %args = @_;
1896            
1897             # get the database and verify the dataset
1898 1   33     9 my $ddb = $args{ddb} || $args{db} || $self->{data}->open_meta_database;
1899 1         5 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1900 1 50       3 unless ($args{dataset}) {
1901 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1902             }
1903            
1904             # assign some defaults
1905 1   50     6 $args{strandedness} ||= $args{stranded} || 'all';
      33        
1906 1   50     3 $args{position} ||= 5;
1907 1   50     4 $args{coordinate} ||= undef;
1908 1   50     5 $args{avoid} ||= undef;
1909 1   50     6 $args{'method'} ||= 'mean'; # in most cases this doesn't do anything
1910 1 50       2 unless ($args{extend}) {
1911 0         0 croak "must provide an extend value!";
1912             }
1913 1 50 33     5 $args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1914            
1915             # determine reference coordinate
1916 1 50       5 $self->_calculate_reference(\%args) unless defined $args{coordinate};
1917            
1918             # build parameter array to pass on to the adapter
1919 1         2 my @params;
1920 1         3 $params[CHR] = $self->seq_id;
1921 1         4 $params[STRT] = $args{coordinate} - $args{extend};
1922 1 50       3 $params[STRT] = 1 if $params[STRT] < 1; # sanity check
1923 1         2 $params[STOP] = $args{coordinate} + $args{extend};
1924 1 50       4 $params[STR] = defined $args{strand} ? $args{strand} : $self->strand;
1925 1         2 $params[STND] = $args{strandedness};
1926 1         2 $params[METH] = $args{'method'};
1927 1         3 $params[RETT] = 2; # return type should be a hash reference of positioned scores
1928 1         3 $params[DB] = $ddb;
1929 1         2 $params[DATA] = $args{dataset};
1930            
1931             # Data collection
1932 1         4 my $pos2data = get_segment_score(@params);
1933            
1934             # Avoid positions
1935 1 50       4 if ($args{avoid}) {
1936 0         0 $self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
1937             }
1938            
1939             # covert to relative positions
1940 1 50       4 if ($args{absolute}) {
1941             # do not convert to relative positions
1942 0 0       0 return wantarray ? %$pos2data : $pos2data;
1943             }
1944             else {
1945             # return the collected dataset hash
1946             return $self->_convert_to_relative_positions($pos2data,
1947 1         3 $args{coordinate}, $params[STR]);
1948             }
1949             }
1950              
1951             sub get_region_position_scores {
1952 4     4 1 4885 my $self = shift;
1953 4         13 my %args = @_;
1954            
1955             # get the database and verify the dataset
1956 4   33     26 my $ddb = $args{ddb} || $args{db} || $self->{data}->open_meta_database;
1957 4         10 $args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1958 4 50       9 unless ($args{dataset}) {
1959 0         0 croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1960             }
1961            
1962             # assign some defaults here, in case we get passed on to subfeature method
1963 4   50     16 $args{strandedness} ||= $args{stranded} || 'all';
      33        
1964 4   50     12 $args{extend} ||= 0;
1965 4   50     12 $args{position} ||= 5;
1966 4   100     10 $args{'method'} ||= 'mean'; # in most cases this doesn't do anything
1967 4 50 33     15 $args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1968            
1969             # get positioned scores over subfeatures only
1970 4   50     14 $args{subfeature} ||= q();
1971 4 0 33     7 if ($self->feature_type eq 'named' and $args{subfeature}) {
1972             # this is more complicated so we have a dedicated method
1973 0         0 return $self->_get_subfeature_position_scores(\%args, $ddb);
1974             }
1975            
1976             # Assign coordinates
1977             # build parameter array to pass on to the adapter
1978 4         7 my @params;
1979 4   33     9 my $feature = $self->seqfeature || $self;
1980 4   33     14 $params[CHR] = $args{chromo} || $args{seq_id} || $feature->seq_id;
1981 4   33     12 $params[STRT] = $args{start} || $feature->start;
1982 4   33     15 $params[STOP] = $args{stop} || $args{end} || $feature->end;
1983 4 50       10 $params[STR] = defined $args{strand} ? $args{strand} : $feature->strand;
1984 4 50       7 if ($args{extend}) {
1985 0         0 $params[STRT] -= $args{extend};
1986 0         0 $params[STOP] += $args{extend};
1987 0 0       0 $params[STRT] = 1 if $params[STRT] < 1; # sanity check
1988             }
1989 4         7 $params[STND] = $args{strandedness};
1990 4         5 $params[METH] = $args{method};
1991 4         5 $params[RETT] = 2; # return type should be a hash reference of positioned scores
1992 4         5 $params[DB] = $ddb;
1993 4         5 $params[DATA] = $args{dataset};
1994            
1995             # Data collection
1996 4         10 my $pos2data = get_segment_score(@params);
1997            
1998             # Avoid positions
1999 4 50       8 if ($args{avoid}) {
2000 0         0 $self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
2001             }
2002            
2003             # covert to relative positions
2004 4 50       7 if ($args{absolute}) {
2005             # do not convert to relative positions
2006 0 0       0 return wantarray ? %$pos2data : $pos2data;
2007             }
2008             else {
2009             # return data converted to relative positions
2010 4 50       18 $self->_calculate_reference(\%args) unless defined $args{coordinate};
2011             return $self->_convert_to_relative_positions($pos2data,
2012 4         11 $args{coordinate}, $params[STR]);
2013             }
2014             }
2015              
2016             sub _get_subfeature_position_scores {
2017 0     0   0 my ($self, $args, $ddb) = @_;
2018            
2019             # get the subfeatures
2020 0         0 my $subfeatures = $self->_get_subfeatures($args->{subfeature});
2021 0 0       0 unless (@$subfeatures) {
2022 0         0 carp "no subfeatures available! Returning parent score data!";
2023             # just return the parent
2024 0         0 undef $args->{subfeature};
2025 0 0       0 delete $args->{exon} if exists $args->{exon};
2026 0         0 return $self->get_sequence(@$args);
2027             }
2028            
2029             # reset the practical start and stop to the actual subfeatures' final start and stop
2030             # we can no longer rely on the feature start and stop, consider CDS
2031             # these subfeatures should already be genomic sorted by GeneTools
2032 0         0 my $practical_start = $subfeatures->[0]->start;
2033 0         0 my $practical_stop = $subfeatures->[-1]->end;
2034            
2035             # collect over each exon
2036             # we will adjust the positions of each reported score so that
2037             # it will appear as if all the exons are adjacent to each other
2038             # and no introns exist
2039 0         0 my $pos2data = {};
2040 0         0 my $namecheck = {}; # to check unique names when using ncount method....
2041 0         0 my $current_end = $practical_start;
2042 0         0 my $adjustment = 0;
2043 0 0       0 my $fstrand = defined $args->{strand} ? $args->{strand} : $self->strand;
2044 0         0 foreach my $exon (@$subfeatures) {
2045            
2046 0         0 my @params; # parameters to pass on to adapter
2047 0         0 $params[CHR] = $exon->seq_id;
2048 0         0 $params[STRT] = $exon->start;
2049 0         0 $params[STOP] = $exon->end;
2050 0         0 $params[STR] = $fstrand;
2051 0         0 $params[STND] = $args->{strandedness};
2052 0         0 $params[METH] = $args->{method};
2053 0         0 $params[RETT] = 2; # return type should be a hash reference of positioned scores
2054 0         0 $params[DB] = $ddb;
2055 0         0 $params[DATA] = $args->{dataset};
2056            
2057             # collect scores
2058 0         0 my $exon_scores = get_segment_score(@params);
2059            
2060             # adjust the scores
2061 0         0 $adjustment = $params[STRT] - $current_end;
2062             $self->_process_exon_scores($exon_scores, $pos2data, $adjustment, $params[STRT],
2063 0         0 $params[STOP], $namecheck, $args->{method});
2064            
2065             # reset
2066 0         0 $current_end += $exon->length;
2067             }
2068            
2069             # collect extensions if requested
2070 0 0       0 if ($args->{extend}) {
2071             # left side
2072 0         0 my @params; # parameters to pass on to adapter
2073 0         0 $params[CHR] = $self->seq_id;
2074 0         0 $params[STRT] = $practical_start - $args->{extend};
2075 0         0 $params[STOP] = $practical_start - 1;
2076 0         0 $params[STR] = $fstrand;
2077 0         0 $params[STND] = $args->{strandedness};
2078 0         0 $params[METH] = $args->{method};
2079 0         0 $params[RETT] = 2; # return type should be a hash reference of positioned scores
2080 0         0 $params[DB] = $ddb;
2081 0         0 $params[DATA] = $args->{dataset};
2082            
2083 0         0 my $ext_scores = get_segment_score(@params);
2084              
2085             # no adjustment should be needed
2086             $self->_process_exon_scores($ext_scores, $pos2data, 0, $params[STRT],
2087 0         0 $params[STOP], $namecheck, $args->{method});
2088            
2089            
2090             # right side
2091             # we can reuse our parameter array
2092 0         0 $params[STRT] = $practical_stop + 1;
2093 0         0 $params[STOP] = $practical_stop + $args->{extend};
2094 0         0 $ext_scores = get_segment_score(@params);
2095              
2096             # the adjustment should be the same as the last exon
2097             $self->_process_exon_scores($ext_scores, $pos2data, $adjustment, $params[STRT],
2098 0         0 $params[STOP], $namecheck, $args->{method});
2099             }
2100            
2101             # covert to relative positions
2102 0 0       0 if ($args->{absolute}) {
2103             # do not convert to relative positions
2104 0 0       0 return wantarray ? %$pos2data : $pos2data;
2105             }
2106             else {
2107             # return data converted to relative positions
2108             # can no longer use original coordinates, but instead the new shifted coordinates
2109 0         0 $args->{practical_start} = $practical_start;
2110 0         0 $args->{practical_stop} = $current_end;
2111 0         0 $self->_calculate_reference($args);
2112             return $self->_convert_to_relative_positions($pos2data,
2113 0         0 $args->{coordinate}, $fstrand);
2114             }
2115             }
2116              
2117             sub _calculate_reference {
2118 9     9   17 my ($self, $args) = @_;
2119 9 50       24 my $strand = defined $args->{strand} ? $args->{strand} : $self->strand;
2120 9 100       22 if ($args->{position} == 5) {
    100          
    50          
    0          
2121 6 100       13 if ($strand >= 0) {
2122 1   33     4 $args->{coordinate} = $args->{practical_start} || $self->start;
2123             }
2124             else {
2125 5   33     12 $args->{coordinate} = $args->{practical_stop} || $self->end;
2126             }
2127             }
2128             elsif ($args->{position} == 3) {
2129 1 50       2 if ($strand >= 0) {
2130 1   33     4 $args->{coordinate} = $args->{practical_stop} || $self->end;
2131             }
2132             else {
2133 0   0     0 $args->{coordinate} = $args->{practical_start} || $self->start;
2134             }
2135             }
2136             elsif ($args->{position} == 4) {
2137             # strand doesn't matter here
2138             # but practical coordinates do
2139 2 100       4 if (exists $args->{practical_start}) {
2140             $args->{coordinate} = int(
2141 1         4 ( ($args->{practical_start} + $args->{practical_stop}) / 2 ) + 0.5
2142             );
2143             }
2144             else {
2145 1         35 $args->{coordinate} = $self->midpoint;
2146             }
2147             }
2148             elsif ($args->{position} == 9) {
2149             # narrowPeak coordinate
2150 0         0 $args->{coordinate} = $self->peak;
2151             }
2152             else {
2153 0         0 confess "position must be one of 5, 3, 4, or 9";
2154             }
2155             }
2156              
2157             sub _avoid_positions {
2158 0     0   0 my ($self, $pos2data, $args, $seqid, $start, $stop) = @_;
2159            
2160             # first check the list of avoid types
2161 0 0       0 if (ref $args->{avoid} eq 'ARRAY') {
    0          
    0          
2162             # we have types, presume they're ok
2163             }
2164             elsif ($args->{avoid} eq '1') {
2165             # old style boolean value
2166 0 0       0 if (defined $args->{type}) {
2167 0         0 $args->{avoid} = [ $args->{type} ];
2168             }
2169             else {
2170             # no type provided, we can't avoid that which is not defined!
2171             # this is an error, but won't complain as we never did before
2172 0         0 $args->{avoid} = $self->type;
2173             }
2174             }
2175             elsif ($args->{avoid} =~ /w+/i) {
2176             # someone passed a string, a feature type perhaps?
2177 0         0 $args->{avoid} = [ $args->{avoid} ];
2178             }
2179            
2180             ### Check for conflicting features
2181 0   0     0 my $db = $args->{db} || $self->{data}->open_meta_database;
2182             my @overlap_features = $self->get_features(
2183             seq_id => $seqid,
2184             start => $start,
2185             end => $stop,
2186             type => $args->{avoid},
2187 0         0 );
2188            
2189             # get the overlapping features of the same type
2190 0 0       0 if (@overlap_features) {
2191 0         0 my $primary = $self->primary_id;
2192             # there are one or more feature of the type in this region
2193             # one of them is likely the one we're working with
2194             # but not necessarily - user may be looking outside original feature
2195             # the others are not what we want and therefore need to be
2196             # avoided
2197 0         0 foreach my $feat (@overlap_features) {
2198             # skip the one we want
2199 0 0       0 next if ($feat->primary_id eq $primary);
2200             # now eliminate those scores which overlap this feature
2201 0         0 my $start = $feat->start;
2202 0         0 my $stop = $feat->end;
2203 0         0 foreach my $position (keys %$pos2data) {
2204             # delete the scored position if it overlaps with
2205             # the offending feature
2206 0 0 0     0 if (
2207             $position >= $start and
2208             $position <= $stop
2209             ) {
2210 0         0 delete $pos2data->{$position};
2211             }
2212             }
2213             }
2214             }
2215             }
2216              
2217             sub _convert_to_relative_positions {
2218 5     5   10 my ($self, $pos2data, $position, $strand) = @_;
2219            
2220 5         7 my %relative_pos2data;
2221 5 50       12 if ($strand >= 0) {
    50          
2222 0         0 foreach my $p (keys %$pos2data) {
2223 0         0 $relative_pos2data{ $p - $position } = $pos2data->{$p};
2224             }
2225             }
2226             elsif ($strand < 0) {
2227 5         21 foreach my $p (keys %$pos2data) {
2228 223         298 $relative_pos2data{ $position - $p } = $pos2data->{$p};
2229             }
2230             }
2231 5 50       100 return wantarray ? %relative_pos2data : \%relative_pos2data;
2232             }
2233              
2234             sub _process_exon_scores {
2235 0     0     my ($self, $exon_scores, $pos2data, $adjustment, $start, $end,
2236             $namecheck, $method) = @_;
2237            
2238             # ncount method
2239 0 0         if ($method eq 'ncount') {
2240             # we need to check both names and adjust position
2241 0           foreach my $p (keys %$exon_scores) {
2242 0 0 0       next unless ($p >= $start and $p <= $end);
2243 0           foreach my $n (@{ $exon_scores->{$p} }) {
  0            
2244 0 0         if (exists $namecheck->{$n} ) {
2245 0           $namecheck->{$n}++;
2246 0           next;
2247             }
2248             else {
2249 0           $namecheck->{$n} = 1;
2250 0           my $a = $p - $adjustment;
2251 0   0       $pos2data->{$a} ||= [];
2252 0           push @{ $pos2data->{$a} }, $n;
  0            
2253             }
2254             }
2255             }
2256             }
2257             else {
2258             # just adjust scores
2259 0           foreach my $p (keys %$exon_scores) {
2260 0 0 0       next unless ($p >= $start and $p <= $end);
2261 0           $pos2data->{ $p - $adjustment } = $exon_scores->{$p};
2262             }
2263             }
2264             }
2265              
2266             sub fetch_alignments {
2267 0     0 1   my $self = shift;
2268 0           my %args = @_;
2269 0   0       $args{db} ||= $args{dataset} || undef;
      0        
2270 0   0       $args{data} ||= undef;
2271 0   0       $args{callback} ||= undef;
2272 0   0       $args{subfeature} ||= q();
2273            
2274             # verify - trusting that these are valid, else they will fail lower down in the code
2275 0 0         unless ($args{db}) {
2276 0           croak "must provide a Bam object database to fetch alignments!\n";
2277             }
2278 0 0 0       unless ($args{data} and ref($args{data}) eq 'HASH') {
2279 0           croak "must provide a data HASH for the fetch callback!\n";
2280             }
2281 0 0         unless ($args{callback}) {
2282 0           croak "must provide a callback code reference!\n";
2283             }
2284            
2285             # array of features to iterate, probably just one or subfeatures
2286 0           my @intervals;
2287 0 0 0       if ($self->feature_type eq 'named' and $args{subfeature}) {
2288             # we have subfeatures to iterate over
2289              
2290             # get the subfeatures
2291 0           my $subfeatures = $self->_get_subfeatures($args{subfeature});
2292 0 0         if (@$subfeatures) {
2293 0           foreach my $sf (@$subfeatures) {
2294 0           push @intervals, [
2295             $sf->start - 1,
2296             $sf->end
2297             ];
2298             }
2299             }
2300             else {
2301             # zere subfeatures? just take the parent then
2302             push @intervals, [
2303             ($args{start} || $self->start) - 1,
2304 0   0       $args{stop} || $args{end} || $self->end
      0        
2305             ];
2306             }
2307             }
2308             else {
2309             # take feature as is
2310             push @intervals, [
2311             ($args{start} || $self->start) - 1,
2312 0   0       $args{stop} || $args{end} || $self->end
      0        
2313             ];
2314             }
2315              
2316             # get the target id for the chromosome
2317             # this will fail if the user didn't provide a real bam object!!!
2318 0           my ($tid, undef, undef) = $args{db}->header->parse_region($self->seq_id);
2319 0 0         return unless defined $tid;
2320            
2321             # now iterate over the intervals
2322 0           foreach my $i (@intervals) {
2323 0           $args{data}->{start} = $i->[0];
2324 0           $args{data}->{end} = $i->[1];
2325 0           $args{data}->{strand} = $self->strand;
2326             low_level_bam_fetch(
2327             $args{db},
2328             $tid,
2329             $i->[0],
2330             $i->[1],
2331             $args{callback},
2332             $args{data}
2333 0           );
2334             }
2335            
2336             # nothing to return since we're using a data reference
2337 0           return 1;
2338             }
2339              
2340              
2341              
2342             ### String export
2343              
2344             sub bed_string {
2345 0     0 1   my $self = shift;
2346 0           my %args = @_;
2347 0   0       $args{bed} ||= 6; # number of bed columns
2348 0 0         croak "bed count must be an integer!" unless $args{bed} =~ /^\d+$/;
2349 0 0         croak "bed count must be at least 3!" unless $args{bed} >= 3;
2350            
2351             # coordinate information
2352 0           $self->seqfeature; # retrieve the seqfeature object first
2353 0   0       my $chr = $args{chromo} || $args{seq_id} || $self->seq_id;
2354 0   0       my $start = $args{start} || $self->start;
2355 0   0       my $stop = $args{stop} || $args{end} || $self->stop ||
2356             $start + $self->length - 1 || $start;
2357 0 0 0       if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
      0        
      0        
2358 0           carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2359 0           return;
2360             }
2361 0 0         if ($start > $stop) {
2362             # reversed coordinates? old school way of setting reverse strand
2363 0           my $s = $start;
2364 0           $start = $stop;
2365 0           $stop = $s;
2366 0           $args{strand} = -1; # this will override any user provided data?
2367             }
2368 0           $start -= 1; # 0-based coordinates
2369 0           my $string = "$chr\t$start\t$stop";
2370            
2371             # additional information
2372 0 0         if ($args{bed} >= 4) {
2373 0   0       my $name = $args{name} || $self->name || 'Feature_' . $self->line_number;
2374 0           $string .= "\t$name";
2375             }
2376 0 0         if ($args{bed} >= 5) {
2377 0 0         my $score = exists $args{score} ? $args{score} : $self->score;
2378 0 0         $score = 1 unless defined $score;
2379 0           $string .= "\t$score";
2380             }
2381 0 0         if ($args{bed} >= 6) {
2382 0           my $s;
2383 0 0 0       if (exists $args{strand} and defined $args{strand}) {
2384 0           $s = $self->_strand($args{strand});
2385             }
2386             else {
2387 0           $s = $self->strand;
2388             }
2389 0 0         $string .= sprintf("\t%s", $s == -1 ? '-' : '+')
2390             }
2391             # we could go on with other columns, but there's no guarantee that additional
2392             # information is available, and we would have to implement user provided data
2393            
2394             # done
2395 0           return $string;
2396             }
2397              
2398             sub gff_string {
2399 0     0 1   my $self = shift;
2400 0           my %args = @_;
2401            
2402             # coordinate information
2403 0           $self->seqfeature; # retrieve the seqfeature object first
2404 0   0       my $chr = $args{chromo} || $args{seq_id} || $self->seq_id;
2405 0   0       my $start = $args{start} || $self->start;
2406 0   0       my $stop = $args{stop} || $args{end} || $self->stop ||
2407             $start + $self->length - 1 || $start;
2408 0 0 0       if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
      0        
      0        
2409 0           carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2410 0           return;
2411             }
2412 0 0         if ($start > $stop) {
2413             # reversed coordinates? old school way of setting reverse strand
2414 0           my $s = $start;
2415 0           $start = $stop;
2416 0           $stop = $s;
2417 0           $args{strand} = -1; # this will override any user provided data?
2418             }
2419 0           my $strand;
2420 0 0 0       if (exists $args{strand} and defined $args{strand}) {
2421 0           $strand = $self->_strand($args{strand});
2422             }
2423             else {
2424 0           $strand = $self->strand;
2425             }
2426 0 0         $strand = $strand == -1 ? '-' : $strand == 1 ? '+' : '.';
    0          
2427            
2428             # type information
2429 0   0       my $type = $args{type} || $self->type || undef;
2430 0           my ($source, $primary_tag);
2431 0 0 0       if (defined $type and $type =~ /:/) {
2432 0           ($primary_tag, $source) = split /:/, $type;
2433             }
2434 0 0         unless ($source) {
2435 0   0       $source = $args{source} || '.';
2436             }
2437 0 0         unless ($primary_tag) {
2438 0 0 0       $primary_tag = $args{primary_tag} || defined $type ? $type : '.';
2439             }
2440            
2441             # score
2442 0 0         my $score = exists $args{score} ? $args{score} : $self->score;
2443 0 0         $score = '.' unless defined $score;
2444 0           my $phase = '.'; # do not even bother!!!!
2445            
2446             # attributes
2447 0   0       my $name = $args{name} || $self->name || 'Feature_' . $self->line_number;
2448 0           my $attributes = "Name=$name";
2449 0   0       my $id = $args{id} || sprintf("%08d", $self->line_number);
2450 0           $attributes .= ";ID=$id";
2451 0 0 0       if (exists $args{attributes} and ref($args{attributes}) eq 'ARRAY') {
2452 0           foreach my $i (@{$args{attributes}}) {
  0            
2453 0           my $k = $self->{data}->name($i);
2454 0           $k =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0            
2455 0           my $v = $self->value($i);
2456 0           $v =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0            
2457 0           $attributes .= ";$k=$v";
2458             }
2459             }
2460            
2461             # done
2462 0           my $string = join("\t", $chr, $source, $primary_tag, $start, $stop, $score,
2463             $strand, $phase, $attributes);
2464 0           return $string;
2465             }
2466              
2467              
2468             __END__