File Coverage

blib/lib/Bio/DB/USeq.pm
Criterion Covered Total %
statement 479 949 50.4
branch 158 338 46.7
condition 49 156 31.4
subroutine 62 111 55.8
pod 37 39 94.8
total 785 1593 49.2


line stmt bran cond sub pod time code
1             package Bio::DB::USeq;
2              
3             our $VERSION = '0.26';
4              
5             =head1 NAME
6              
7             Bio::DB::USeq - Read USeq archive database files
8              
9             =head1 SYNOPSIS
10            
11             use Bio::DB::USeq;
12             my $useq = Bio::DB::USeq->new('file.useq') or
13             die "unable to open file.useq!\n";
14            
15             # sequence IDs
16             my @seq_ids = $useq->seq_ids;
17             my $length = $useq->length($chr); # approximate, not exact
18            
19             ### Retrieving features
20             # all features or observations across chromosome I
21             my @features = $useq->features(
22             -seq_id => 'chrI',
23             -type => 'region',
24             );
25            
26             # same thing using a simple form
27             # use an array of (chromosome, start, stop, strand)
28             my @features = $useq->features('chrI');
29            
30             # same thing with a memory efficient iterator
31             my $iterator = $useq->get_seq_stream(
32             -seq_id => 'chrI',
33             );
34             while (my $f = $iterator->next_seq) {
35             # each feature $f supports most SeqFeatureI methods
36             }
37            
38            
39             ### Retrieving simple scores
40             my @scores = $useq->scores(
41             -seq_id => 'chrI',
42             -start => 1000,
43             -end => 2000
44             );
45            
46            
47             ### Same methods as above after defining an interval first
48             my $segment = $useq->segment(
49             -seq_id => 'chrI',
50             -start => 1000,
51             -end => 2000,
52             );
53             my @scores = $segment->scores;
54             my @features = $segment->features;
55             my $iterator = $segment->get_seq_stream;
56            
57            
58             ### Efficient retrieval of positioned scores in 100 bins
59             # compatible with Bio::Graphics
60             my ($wig) = $useq->features(
61             # assuming unstranded data here, otherwise two wiggle objects
62             # would be returned, one for each strand
63             -seq_id => 'chrI',
64             -start => 1000,
65             -end => 2000,
66             -type => 'wiggle:100',
67             );
68             my @bins = $wig->wiggle;
69             my @bins = $wig->coverage; # same thing
70             my ($bins) = $wig->get_tag_values('coverage'); # same thing
71            
72            
73             ### Statistical summaries of intervals
74             # compatible with Bio::Graphics
75             my ($summary) = $useq->features(
76             # assuming unstranded data here, otherwise two summaries
77             # would be returned, one for each strand
78             -seq_id => 'chrI',
79             -start => 1000,
80             -end => 2000,
81             -type => 'summary',
82             );
83             my $stats = $summary->statistical_summary(10);
84             foreach (@$stats) {
85             my $max = $_->{maxVal};
86             my $mean = Bio::DB::USeq::binMean($_);
87             }
88            
89            
90             ### Stranded data using an iterator
91             # could be used with either wiggle or summary features
92             my $stream = $useq->get_seq_stream(
93             -seq_id => 'chrI',
94             -start => 1000,
95             -end => 2000,
96             -type => 'wiggle:100',
97             );
98             my ($forward, $reverse);
99             if ($useq->stranded) {
100             $forward = $stream->next_seq;
101             $reverse = $stream->next_seq;
102             }
103             else {
104             $forward = $stream->next_seq;
105             }
106            
107              
108             =head1 DESCRIPTION
109              
110             Bio::DB::USeq is a B style adaptor for reading USeq files. USeq files
111             are compressed, indexed data files supporting modern bioinformatic datasets,
112             including genomic points, scores, and intervals. More information about the
113             USeq software package can be found at L,
114             including a description of the USeq data
115             L.
116              
117             USeq files are typically half the size of corresponding bigBed and bigWig files,
118             due to a compact internal format and lack of internal zoom data. This adaptor,
119             however, can still return statistics across different zoom levels in the same
120             manner as big files, albeit at a cost of calculating these in realtime.
121              
122             =head2 Generating useq files
123              
124             USeq files may be generated using tools from the USeq package, available at
125             L. They may be generated from
126             native Bar files, text Wig files, text Bed files, and UCSC bigWig and bigBed file
127             formats.
128              
129             =head2 Compatibility
130              
131             The adaptor follows most conventions of other B-style Bio::DB
132             adaptors. Observations or features in the useq file archive are
133             returned as SeqFeatureI compatible objects.
134              
135             Coordinates consumed and returned by the adaptor are 1-based, consistent
136             with B convention. This is not true of the useq file itself, which
137             uses the interbase coordinate system.
138              
139             Unlike wig and bigWig files, useq file archives support stranded data,
140             which can make data collection much simpler for complex experiments.
141              
142             See below for GBrowse compatibility.
143              
144             =head2 Limitations
145              
146             This adaptor is read only. USeq files, in general, are not modified
147             or written with data. The exceptions are chromosome or global statistics
148             are written to the F file inside the zip archive to
149             cache for future queries.
150              
151             No support for genomic sequence is included. Users who need access to
152             genomic sequence should seek an alternative B adaptor, such as
153             L.
154              
155             Useq files do not have a native concept of type, primary_tag, or source
156             attributes, as expected with GFF-based database adaptors. The features
157             method does support special types (see below).
158              
159             =head2 Requirements
160              
161             The adaptor is a Perl-only implementation. It only requires the
162             L module for opening and reading the useq file archive.
163             B is required for working with SeqFeatures objects generated
164             from useq file observations.
165              
166             =head1 METHODS
167              
168             =head2 Initializing the Bio::DB::USeq object
169              
170             These are class methods for creating and working with the Bio::DB::USeq
171             object itself.
172              
173             =over 4
174              
175             =item new
176              
177             =item new($file)
178              
179             =item new(-useq => $file)
180              
181             This will create a new Bio::DB::USeq object. Optionally pass the path
182             to a useq file archive to open immediately. There is not much to do
183             unless you open a file.
184              
185             Named arguments may be used to specify the file. Either -useq or -file
186             may be used.
187              
188             =item open($file)
189              
190             Open a useq file archive. Useq files typically use the F<.useq> file
191             extension. Returns true if successful.
192              
193             DO NOT open a subsequent useq file archive when one has already been
194             opened. Please create a new object to open a new file.
195              
196             =item clone
197              
198             Force the object to re-open the useq archive file under a new file
199             handle to make it clone safe. Do this in the child process before
200             collecting data.
201              
202             =item zip
203              
204             Return the L object representing the useq file archive.
205             Generally not recommended unless you know what you are doing.
206              
207             =back
208              
209             =head2 General information about the useq file
210              
211             These are class methods for obtaining general information or
212             metadata attributes regarding the contents of the file.
213              
214             =over 4
215              
216             =item stranded
217              
218             Return true or false (1 or 0) indicating whether the contents of the
219             useq file archive are recorded in a stranded fashion.
220              
221             =item attributes
222              
223             Return an array of available attribute keys that were recorded in the
224             useq file F member. These are key = value pairs
225             representing metadata for the useq file.
226              
227             =item attribute($key)
228              
229             Return the metadata attribute value for the specified key. These
230             are recorded in the useq file F member. Returns
231             undefined if the key does not exist.
232              
233             =item type
234              
235             Return the useq file metadata C value.
236              
237             =item genome
238              
239             Return the useq file metadata C value.
240              
241             =item version
242              
243             Return the useq file metadata C value.
244              
245             =back
246              
247             =head2 Chromosome Information
248              
249             These are methods to obtain information about the chromosomes or reference
250             sequences represented in the file archive.
251              
252             B that generating score statistics across one or more chromosomes may
253             be computationally expensive. Therefore, chromosome statistics, once
254             calculated, are cached in the useq file metadata for future reference.
255             This necessitates writing to the useq zip archive. This is currently the
256             only supported method for modifying the useq zip archive.
257              
258             =over 4
259              
260             =item seq_ids
261              
262             Return an array of the chromosome or sequence identifiers represented
263             in the useq file archive. The names are sorted ASCIIbetically before
264             they are returned.
265              
266             =item length($seq_id)
267              
268             Return the length of the provided chromosome or sequence identifier.
269             Note that this may not be the actual length in the reference genome, but
270             rather the last recorded position of an observation for that chromosome.
271             Hence, it should be used only as an approximation.
272              
273             =item chr_mean($seq_id)
274              
275             Return the mean score value across the entire chromosome.
276              
277             =item chr_stdev($seq_id)
278              
279             Return the score standard deviation across the entire chromosome.
280              
281             =item chr_stats($seq_id)
282              
283             Return a statistical summary across the entire chromosome. This
284             is an anonymous hash with five keys: validCount, sumData,
285             sumSquares, minVal, and maxVal. These are equivalent to the
286             statistical summaries generated by the Bio::DB::BigWig adaptor.
287              
288             =item global_mean
289              
290             Return the mean score value across all chromosomes.
291              
292             =item global_stdev
293              
294             Return the mean score value across all chromosomes.
295              
296             =item global_stats
297              
298             Return a statistical summary across all chromosomes.
299              
300             =back
301              
302             =head2 Data accession
303              
304             These are the primary methods for working with data contained in
305             useq file archive. These should be familiar to most B users.
306              
307             =over 4
308              
309             =item features
310              
311             Returns an array or array reference of SeqFeatureI compatible
312             objects overlapping a given genomic interval.
313              
314             Coordinates of the interrogated regions must be supplied. At a
315             minimum, the seq_id must be supplied. A start of 1 and an end
316             corresponding to the length of the seq_id is used if not directly
317             provided. Coordinates may be specified in two different manners,
318             either as a list of (seq_id, start, end, strand) or as one or
319             more keyed values.
320            
321             my @features = $useq->features($seq_id, $start, $end);
322            
323             @features = $useq->features(
324             -seq_id => $seq_id,
325             -start => $start,
326             -end => $end,
327             );
328              
329             If the -iterator argument is supplied with a true value, then an
330             iterator object is returned. See get_seq_stream() for details.
331              
332             Bio::DB::USeq supports four different feature types. Feature
333             types may be specified using the -type argument.
334              
335             =over 4
336              
337             =item region
338              
339             =item interval
340              
341             =item observation
342              
343             The default feature type if the type argument is not specified.
344             These are SeqFeatureI compatible objects representing observations
345             in the useq file archive. These are compatible with the iterator.
346             SeqFeature observations are returned in genomic order.
347              
348             =item chromosome
349              
350             Returns SeqFeatureI compatible objects representing the
351             reference sequences (chromosomes) listed in the useq file
352             archive. These are not compatibile with the iterator.
353              
354             =item wiggle
355              
356             =item wiggle:$bins
357              
358             =item coverage
359              
360             =item coverage:$bins
361              
362             Returns an array of SeqFeatureI compatible objects for each
363             strand requested. If the useq file contains stranded data,
364             and no strand is requested, then two objects will be
365             returned representing each strand.
366              
367             Each object contains an array representing scores across
368             the requested coordinates. This object is designed to be
369             backwards compatible with coverage features from the
370             L adaptor for use with L and GBrowse.
371             Note that only scores are returned, not a true depth coverage
372             in the sense of the L coverage types.
373              
374             By default, the wiggle or coverage array is provided at 1
375             bp resolution. To improve efficiency with large regions,
376             the wiggle array may be limited by using a bin option,
377             where the interrogated interval is divided into the number
378             of bins requested.
379              
380             To retrieve the scores, call the wiggle() or coverage() method.
381              
382             For example, to request wiggle scores in 100 equal bins across
383             the interval, see the following example. The wiggle and
384             coverage types are synonymous.
385            
386             my ($wiggle) = $useq->features(
387             -seq_id => $chromosome,
388             -start => $start,
389             -end => $end,
390             -type => 'wiggle:100',
391             );
392             my @scores = $wiggle->wiggle;
393             @scores = $wiggle->coverage;
394              
395             Wiggle objects may also be obtained with a get_seq_stream()
396             iterator objects.
397              
398             =item summary
399              
400             =item summary:$bins
401              
402             Returns an array of SeqFeatureI compatibile Summary objects
403             for each strand requested. If the useq file contains stranded
404             data, and no strand is requested, then two objects will be
405             returned representing each strand.
406              
407             Each Summary object can then be used to call statistical
408             summaries for one or more bins across the interval. Each
409             statistical summary is an anonymous hash with five keys:
410             validCount, sumData, sumSquares, minVal, and maxVal. From
411             these values, a mean and standard deviation may also be
412             calculated.
413            
414             my ($summary) = $useq->features(
415             -seq_id => $chromosome,
416             -start => $start,
417             -end => $end,
418             -type => 'summary',
419             );
420             my @stats = $summary->statistical_summary(100);
421             foreach my $stat (@stats) {
422             my $count = $stat->{validCount};
423             my $sum = $stat->{sumData};
424             my $mean = $sum / $count;
425             }
426              
427             Statistical summaries are equivalent to those generated by the
428             L adaptor and may be used interchangeably. They
429             are compatible with the L modules.
430              
431             Summary objects may also be obtained with a get_seq_stream()
432             iterator object.
433              
434             =back
435              
436             =item get_seq_stream
437              
438             This is a memory efficient data accessor. An iterator object is
439             returned for an interval specified using coordinate values in the
440             same manner as features(). Call the method next_seq() on the
441             iterator object to retrieve the observation SeqFeature objects
442             one at a time. The iterator is compatible with region, wiggle,
443             and summary feature types.
444            
445             # establish the iterator object
446             my $iterator = $useq->get_seq_stream(
447             -seq_id => $seq_id,
448             -start => $start,
449             -end => $end,
450             -type => 'region',
451             );
452            
453             # retrieve the features one at a time
454             while (my $f = $iterator->next_seq) {
455             # each feature $f is either a
456             # Bio::DB::USeq::Feature,
457             # a Bio::DB::USeq::Wiggle, or
458             # a Bio::DB::USeq::Summary object
459             }
460              
461             =item scores
462              
463             This is a simplified data accessor that only returns the score
464             values overlapping an interrogated interval, rather than
465             assembling each observation into a SeqFeature object. The scores
466             are not associated with genomic coordinates, and are not guaranteed
467             to be returned in original genomic order.
468              
469             Provide the interval coordinates in the same manner as the
470             features() method. Stranded data collection is supported.
471            
472             my @scores = $useq->scores(
473             -seq_id => $seq_id,
474             -start => $start,
475             -end => $end,
476             );
477              
478             =item observations
479              
480             This is a low-level data accessor, similar to features() but returning
481             an array of references to the original USeq observations. Each USeq
482             observation is an anonymous array reference of 2-4 elements: start, stop,
483             score, and text, depending on the stored data type. All coordinates are in
484             0-based coordinates, unlike high level accessors. Note that while strand is
485             supported, it is not reported for each observation. If observations exist on
486             both strands, then each strand should be searched separately. Observations
487             are not guaranteed to be returned in genomic order.
488            
489             my @observations = $useq->observations(
490             -seq_id => $seq_id,
491             -start => $start,
492             -end => $end,
493             );
494             foreach my $obs (@observations) {
495             my $start = $obs->[0] + 1; # convert to 1-based coordinate
496             my $stop = $obs->[1];
497             my $score = $obs->[2]; # may not be present
498             my $text = $obs->[3]; # may not be present
499             }
500              
501             =item segment
502              
503             This returns a L object, which is a SeqFeatureI
504             compatible segment object corresponding to the specified coordinates.
505             From this object, one can call the features(), scores(), or
506             get_seq_stream() methods directly. Keyed options or location
507             information need not be provided. Stranded segments are
508             supported.
509            
510             my $segment = $useq->segment(
511             -seq_id => $seq_id,
512             -start => $start,
513             -end => $end,
514             );
515             my @scores = $segment->scores;
516             my @features = $segment->features('wiggle:100');
517             my $iterator = $segment->get_seq_stream('region');
518              
519             =item get_features_by_location
520              
521             Convenience method for returning features restricted by location.
522              
523             =item get_feature_by_id
524              
525             =item get_feature_by_name
526              
527             Compatibility methods for returning a specific feature or
528             observation in the USeq file. Text fields, if present, are not
529             indexed in the USeq file, preventing efficient searching of names.
530             As a workaround, an ID or name comprised of "$seq_id:$start:$end"
531             may be used, although a direct search of coordinates would be
532             more efficient.
533            
534             my $feature = $useq->get_feature_by_id("$seq_id:$start:$end");
535            
536              
537             =back
538              
539             =head1 ADDITIONAL CLASSES
540              
541             These are additional class object that may be returned by various
542             methods above.
543              
544             =head2 Bio::DB::USeq::Feature objects
545              
546             These are SeqFeatureI compliant objects returned by the features()
547             or next_seq() methods. They support the following methods.
548            
549             seq_id
550             start
551             end
552             strand
553             score
554             type
555             source (returns the useq archive filename)
556             name (chromosome:start..stop)
557             Bio::RangeI methods
558              
559             Additionally, chromosome and global statistics are also available
560             from any feature, as well as from Segment, Wiggle, Iterator, and
561             Summary objects. See the corresponding USeq methods for details.
562            
563             =head2 Bio::DB::USeq::Segment objects
564              
565             This is a SeqFeatureI compliant object representing a genomic segment
566             or interval. These support the following methods.
567              
568             =over 4
569              
570             =item features
571              
572             =item features($type)
573              
574             =item get_seq_stream
575              
576             =item get_seq_stream($type)
577              
578             =item scores
579              
580             =item observations
581              
582             Direct methods for returning features or scores. Coordinate information
583             need not be provided. See the corresponding Bio::DB::USeq methods for
584             more information.
585              
586             =item wiggle
587              
588             =item wiggle($bins)
589              
590             =item coverage
591              
592             =item coverage($bins)
593              
594             =item statistical_summary
595              
596             =item statistical_summary($bins)
597              
598             Convenience methods for returning wiggle (coverage) or summary features
599             over the segment. If desired, the number of bins may be specified. See
600             the features() method for more information.
601              
602             =item slices
603              
604             Returns an array of splice member names that overlap this segment.
605             See L for more information.
606              
607             =back
608              
609             =head2 Bio::DB::USeq::Iterator objects
610              
611             This is an iterator object for retrieving useq observation SeqFeatureI
612             objects in a memory efficient manner by returning one feature at a time.
613              
614             =over 4
615              
616             =item next_seq
617              
618             =item next_feature
619              
620             Returns the next feature present in the interrogated interval. Features
621             are generally returned in ascending coordinate order. Returns undefined
622             when no features are remaining in the interval. Features may include
623             either region or wiggle types, depending on how the iterator object was
624             established. See features() and get_seq_stream() methods for more
625             information.
626              
627             =back
628              
629             =head2 Bio::DB::USeq::Wiggle objects
630              
631             These are SeqFeatureI compliant objects for backwards compatibility with
632             L and GBrowse. They support the wiggle() and coverage()
633             methods, which returns an array of scores over the interrogated region. By
634             default, the array is equal to the length of the region (1 bp resolution),
635             or may be limited to a specified number of bins for efficiency. See the
636             features() method for more information.
637              
638             =over 4
639              
640             =item wiggle
641              
642             =item coverage
643              
644             The scores are stored as an array in the coverage attribute. For
645             convenience, the wiggle() and coverage() methods may be used to retrieve
646             the array or array reference of scores.
647              
648             =item statistical_summary
649              
650             Generate a statistical summary hash for the collected wiggle scores
651             (not the original data). This method is not entirely that useful; best
652             to use the summary feature type in the first place.
653              
654             =item chromosome and global statistics
655              
656             Chromosome and global statistics, including mean and standard deviation,
657             are available from wiggle objects. See the corresponding USeq methods
658             for details.
659              
660             =back
661              
662             =head2 Bio::DB::USeq::Summary objects
663              
664             These are SeqFeatureI compliant Summary objects, similar to those
665             generated by the L database adaptor. As such, they are
666             compatible with L and GBrowse.
667              
668             Summary objects can generate statistical summaries over a specified
669             number of bins (default is 1 bin, or the entire requested region).
670             Each statistical summary is an anonymous hash consisting of five
671             keys: validCount, sumData, sumSquares, minVal, and maxVal. From
672             these values, a mean and standard deviation may be calculated.
673              
674             For convenience, three exported functions are available for calculating
675             the mean and standard deviation from a statistical summary hash.
676             See L for more information.
677              
678             Use statistical summaries in the following manner.
679            
680             my $stats = $summary->statistical_summary(10);
681             my $stat = shift @$stats;
682             my $min = $stat->{minVal};
683             my $max = $stat->{maxVal};
684             my $mean = $stat->{sumData} / $stat->{validCount};
685              
686             =over 4
687              
688             =item statistical_summary
689              
690             =item statistical_summary($bins)
691              
692             Generate a statistical summary hash for one or more bins across the
693             interrogated region. Provide the number of bins desired. If a feature
694             type of "summary:$bins" is requested through the features() or
695             get_seq_stream() method, then $bins number of bins will be used.
696             The default number of bins is 1.
697              
698             =item score
699              
700             Generate a single statistical summary over the entire region.
701              
702             =item chromosome and global statistics
703              
704             Chromosome and global statistics, including mean and standard deviation,
705             are available from summary objects. See the corresponding USeq methods
706             for details.
707              
708             =back
709              
710             =head1 EXPORTED FUNCTIONS
711              
712             Three subroutine functions are available for export to assist
713             in calculating the mean, variance, and standard deviation from
714             statistical summaries. These functions are identical to those
715             from the L adaptor and may be used interchangeably.
716              
717             They are not exported by default; they must explicitly listed.
718            
719             use Bio::DB::USeq qw(binMean binStdev);
720             my $stats = $summary->statistical_summary(10);
721             my $stat = shift @$stats;
722             my $mean = binMean($stat);
723             my $stdev = binStdev($stat);
724              
725             =over 4
726              
727             =item binMean( $stat )
728              
729             Calculate the mean from a statistical summary anonymous hash.
730              
731             =item binVariance( $stat )
732              
733             Calculate the variance from a statistical summary anonymous hash.
734              
735             =item binStdev( $stat )
736              
737             Calculate the standard deviation from a statistical summary anonymous hash.
738              
739             =back
740              
741             =head1 USEQ SLICES
742              
743             Genomic observations are recorded in groups, called slices, of
744             usually 10000 observations at a time. Each slice is a separate
745             zip file member in the useq file archive. These methods are for
746             accessing information about each slice. In general, accessing
747             data through slices is a lower level operation. Users should
748             preferentially use the main data accessors.
749              
750             The following are Bio::DB::USeq methods available for working
751             with slices.
752              
753             =over 4
754              
755             =item slices
756              
757             Returns an array of all the slice member names present in the
758             useq archive file.
759              
760             =item slice_feature($slice)
761              
762             Return a L object representing the slice interval.
763             The features(), get_seq_stream(), and scores() methods are supported.
764              
765             =item slice_seq_id($slice)
766              
767             Return the chromosome or sequence identifier associated with a slice.
768              
769             =item slice_start($slice)
770              
771             Return the start position of the slice interval.
772              
773             =item slice_end($slice)
774              
775             Return the end position of the slice interval.
776              
777             =item slice_strand($slice)
778              
779             Return the strand of the slice interval.
780              
781             =item slice_type($slice)
782              
783             Return the file type of the slice member. This corresponds to the
784             file extension of the slice zip member and indicates how to parse
785             the binary member. Each letter in the type corresponds to a data
786             type, be it integer, short, floating-point, or text. See the
787             useq file documentation for more details.
788              
789             =item slice_obs_number($slice)
790              
791             Return the number of observations recorded in the slice interval.
792              
793             =back
794              
795             =head1 GBROWSE COMPATIBILITY
796              
797             The USeq adaptor has support for L and GBrowse.
798             It will work with the segments glyph for intervals,
799             the wiggle_xyplot glyph for displaying mean scores, and the
800             wiggle_whiskers glyph for displaying detailed statistics.
801              
802             Initialize the USeq database adaptor.
803            
804             [data1:database]
805             db_adaptor = Bio::DB::USeq
806             db_args = -file /path/to/data1.useq
807              
808             Displaying simple intervals with the segments glyph.
809            
810             [data1_segments]
811             database = data1
812             feature = region
813             glyph = segments
814             stranded = 1
815              
816             Displaying scores using the wiggle_xyplot glyph.
817             You may set the bins to whatever number is appropriate (in
818             this example, 1000), or leave blank (not recommended,
819             defaults to 1 bp resolution).
820            
821             [data1_xyplot]
822             database = data1
823             feature = wiggle:1000
824             glyph = wiggle_xyplot
825             graph_type = histogram
826             autoscale = chromosome
827              
828             Displaying scores using the wiggle_whiskers glyph. Note that
829             generating statistical summaries are computationally more expensive
830             than simple bins of mean values as with the wiggle feature type.
831            
832             [data1_whiskers]
833             database = data1
834             feature = summary
835             glyph = wiggle_whiskers
836             graph_type = histogram
837             autoscale = chromosome
838              
839             =head1 PERFORMANCE
840              
841             Because the Bio::DB::USeq is implemented as a Perl-only module,
842             performance is subject to the limitations of Perl execution itself and
843             the size of the data that needs to be parsed. In general when collecting
844             score data, requesting scores is the fastest mode of operation, followed
845             by wiggle feature types, and finally summary feature types.
846              
847             In comparison to UCSC bigWig files, the USeq format is typically much
848             faster when viewing intervals where the entire interval is represented by
849             one or a few internal slices. This is especially true for repeated queries
850             over the same or neighboring intervals, as the slice contents are retained
851             in memory. As the number of internal slices that must be loaded into memory
852             increases, for example querying intervals of hundreds of kilobases in size,
853             performance will begin to lag as each internal slice must be parsed into
854             memory. This is where the UCSC bigWig file format with internal zoom levels
855             of summary statistics can outperform, at the cost of file complexity and size.
856              
857             =cut
858              
859              
860             require 5.010000;
861 1     1   7188 use strict;
  1         2  
  1         26  
862 1     1   3 use Carp qw(carp cluck croak confess);
  1         2  
  1         63  
863 1     1   517 use Archive::Zip qw( :ERROR_CODES );
  1         70010  
  1         80  
864 1     1   351 use Set::IntervalTree;
  1         4977  
  1         50  
865 1     1   6 use File::Spec;
  1         2  
  1         20  
866              
867 1     1   4 use base 'Exporter';
  1         1  
  1         6439  
868             our @EXPORT_OK = qw(binMean binVariance binStdev);
869              
870             1;
871              
872             #### USeq initialization ####
873              
874             sub new {
875 2     2 1 181 my $class = shift;
876 2         21 my $self = {
877             'name' => undef,
878             'zip' => undef,
879             'stranded' => undef,
880             'seq_ids' => {},
881             'metadata' => {},
882             'coord2file' => {},
883             'file2attribute' => {},
884             'buffer' => {},
885             };
886 2         6 bless $self, $class;
887            
888             # check for arguments
889 2         3 my %args;
890 2 50       8 if (@_) {
891 2 100       11 if ($_[0] =~ /^-/) {
892 1         5 %args = @_;
893             }
894             else {
895 1         11 $args{-file} = shift;
896             }
897             }
898            
899             # open file
900 2   0     7 my $file = $args{-file} || $args{-useq} || undef;
901 2 50       5 if ($file) {
902 2 50       9 return unless ($self->open($file)); # open must return true
903             }
904            
905             # done
906 2         6 return $self;
907             }
908              
909             sub open {
910 2     2 1 4 my $self = shift;
911            
912             # check file
913 2         3 my $filename = shift;
914 2 50       5 unless ($filename) {
915 0         0 cluck("no file name passed!\n");
916 0         0 return;
917             }
918 2 50       12 unless ($filename =~ /\.useq$/i) {
919 0         0 carp "'$filename' is not a .useq archive file!\n";
920 0         0 return;
921             }
922 2 50       6 if ($self->slices) {
923 0         0 cluck "Only load one useq file per object!\n";
924 0         0 return;
925             }
926            
927             # open the archive
928 2         14 my $zip = Archive::Zip->new();
929 2         74 my $error = $zip->read($filename);
930 2 50       2682 unless ($error == AZ_OK) {
931 0         0 carp " unable to read USeq archive '$filename'! Error $error\n";
932 0         0 return;
933             }
934 2         5 $self->{'zip'} = $zip;
935 2         44 (undef, undef, $self->{'name'}) = File::Spec->splitpath($filename);
936            
937             # parse the contents
938 2 50       8 return unless ($self->_parse_members);
939             # we delay parsing metadata unless it is requested
940            
941             # success
942 2         4 return 1;
943             }
944              
945             sub clone {
946 1     1 1 3 my $self = shift;
947 1 50       4 return unless $self->{'zip'};
948 1         3 my $file = $self->zip->fileName;
949 1         13 my $zip = Archive::Zip->new();
950 1         35 my $error = $zip->read($file);
951 1 50       1273 unless ($error == AZ_OK) {
952 0         0 carp " unable to read USeq archive '$file'! Error $error\n";
953 0         0 return;
954             }
955 1         22 $self->{'zip'} = $zip;
956 1         4 return 1;
957             }
958              
959             sub zip {
960 9     9 1 52 return shift->{'zip'};
961             }
962              
963             sub name {
964 4     4 0 36 return shift->{'name'};
965             }
966              
967              
968              
969             #### General USeq information ####
970              
971             sub seq_ids {
972 1     1 1 2 my $self = shift;
973 1         1 return sort {$a cmp $b} keys %{ $self->{'seq_ids'} };
  1         5  
  1         6  
974             }
975              
976             sub length {
977 7     7 1 52 my $self = shift;
978 7 50       17 my $seq_id = shift or return;
979 7 50       23 if (exists $self->{'seq_ids'}{$seq_id}) {
980 7         22 return $self->{'seq_ids'}{$seq_id};
981             }
982             }
983              
984             sub stranded {
985 8     8 1 42 return shift->{'stranded'};
986             }
987              
988             sub attributes {
989 0     0 1 0 my $self = shift;
990 0 0       0 $self->_parse_metadata unless %{ $self->{'metadata'} };
  0         0  
991 0         0 return (sort {$a cmp $b} keys %{ $self->{'metadata'} });
  0         0  
  0         0  
992             }
993              
994             sub attribute {
995 1     1 1 2 my $self = shift;
996 1         2 my $key = shift;
997 1 50       3 return $self->attributes unless $key;
998 1 50       1 $self->_parse_metadata unless %{ $self->{'metadata'} };
  1         5  
999 1 50       3 if (exists $self->{'metadata'}{$key}) {
1000 1         4 return $self->{'metadata'}{$key};
1001             }
1002 0         0 return;
1003             }
1004              
1005             sub type {
1006 1     1 1 85 return shift->attribute('dataType');
1007             }
1008              
1009             sub version {
1010 0     0 1 0 return shift->attribute('useqArchiveVersion');
1011             }
1012              
1013             sub genome {
1014 0     0 1 0 return shift->attribute('versionedGenome');
1015             }
1016              
1017             sub chr_stats {
1018 1     1 1 1 my $self = shift;
1019 1 50       3 my $seq_id = shift or return;
1020 1         2 my $delay_write = shift; # option to delay rewriting the metadata
1021 1 50       2 $self->_parse_metadata unless %{ $self->{'metadata'} };
  1         14  
1022            
1023             # return the chromosome stats
1024 1 50       6 if (exists $self->{metadata}{"chromStats_$seq_id"}) {
1025 0         0 my @data = split(',', $self->{metadata}{"chromStats_$seq_id"});
1026 0         0 my %stat = (
1027             'validCount' => $data[0],
1028             'sumData' => $data[1],
1029             'sumSquares' => $data[2],
1030             'minVal' => $data[3],
1031             'maxVal' => $data[4],
1032             );
1033 0         0 return \%stat;
1034             }
1035            
1036             # chromosome stats must be generated
1037 1         6 my @slices = $self->_translate_coordinates_to_slices(
1038             $seq_id, 1, $self->length($seq_id), 0);
1039            
1040             # the normal _stat_summary() function requires loading entire chromosome worth of
1041             # slices into memory and can become a huge memory hog, so we'll go one at time
1042             # instead in a slightly more efficient manner
1043 1         3 my $count = 0;
1044 1         4 my $sum;
1045             my $sum_squares;
1046 1         0 my $min;
1047 1         0 my $max;
1048 1         3 foreach my $slice (@slices) {
1049 1 50       3 next unless $self->slice_type($slice) =~ /f/;
1050             # no sense going through if no score present
1051            
1052             # load and unpack the data
1053 1         4 $self->_clear_buffer([$slice]);
1054 1         7 $self->_load_slice($slice);
1055            
1056             # find the overlapping observations
1057 1         5 my $results = $self->{'buffer'}{$slice}->fetch(
1058             $self->slice_start($slice) - 1, $self->slice_end($slice));
1059 1         9 foreach my $r (@$results) {
1060             # each observation is [start, stop, score]
1061 9151 50       10648 if (defined $r->[2]) {
1062 9151         7624 $count++;
1063 9151         7944 $sum += $r->[2];
1064 9151         7972 $sum_squares += ($r->[2] * $r->[2]);
1065 9151 100 100     16267 $min = $r->[2] if (!defined $min or $r->[2] < $min);
1066 9151 100 100     16583 $max = $r->[2] if (!defined $max or $r->[2] > $max);
1067             }
1068             }
1069             }
1070            
1071             # assemble stats
1072 1   50     22 my %stat = (
      50        
      50        
      50        
1073             'validCount' => $count,
1074             'sumData' => $sum || 0,
1075             'sumSquares' => $sum_squares || 0,
1076             'minVal' => $min || 0,
1077             'maxVal' => $max || 0,
1078             );
1079            
1080             # then associate with the metadata
1081 1         5 $self->{'metadata'}{"chromStats_$seq_id"} = join(',', map { $stat{$_} }
  5         45  
1082             qw(validCount sumData sumSquares minVal maxVal) );
1083 1 50       11 $self->_rewrite_metadata unless $delay_write;
1084            
1085 1         4 return \%stat;
1086             }
1087              
1088             sub chr_mean {
1089 1     1 1 127 my $self = shift;
1090 1 50       5 my $seq_id = shift or return;
1091 1         5 return Bio::DB::USeq::binMean( $self->chr_stats($seq_id) );
1092             }
1093              
1094             sub chr_stdev {
1095 0     0 1 0 my $self = shift;
1096 0 0       0 my $seq_id = shift or return;
1097 0         0 return Bio::DB::USeq::binStdev( $self->chr_stats($seq_id) );
1098             }
1099              
1100             sub global_stats {
1101             # this is an expensive proposition, because it must parse through every
1102             # slice in the archive
1103 0     0 1 0 my $self = shift;
1104 0 0       0 $self->_parse_metadata unless %{ $self->{'metadata'} };
  0         0  
1105            
1106             # return the chromosome stats
1107 0 0       0 if (exists $self->{metadata}{globalStats}) {
1108 0         0 my @data = split(',', $self->{metadata}{globalStats});
1109 0         0 my %stat = (
1110             'validCount' => $data[0],
1111             'sumData' => $data[1],
1112             'sumSquares' => $data[2],
1113             'minVal' => $data[3],
1114             'maxVal' => $data[4],
1115             );
1116 0         0 return \%stat;
1117             }
1118            
1119             # calculate new genome-wide statistics from individual chromosome stats
1120 0         0 my $count = 0;
1121 0         0 my $sum;
1122             my $sum_squares;
1123 0         0 my $min;
1124 0         0 my $max;
1125 0         0 foreach my $seq_id ($self->seq_ids) {
1126 0         0 my $stats = $self->chr_stats($seq_id, 1); # delay writing metadata
1127 0         0 $count += $stats->{validCount};
1128 0         0 $sum += $stats->{sumData};
1129 0         0 $sum_squares += $stats->{sumSquares};
1130 0 0 0     0 $min = $stats->{minVal} if (!defined $min or $stats->{minVal} < $min);
1131 0 0 0     0 $max = $stats->{maxVal} if (!defined $max or $stats->{maxVal} > $max);
1132             }
1133            
1134             # assemble the statistical summary hash
1135 0   0     0 my %stat = (
      0        
      0        
      0        
1136             'validCount' => $count,
1137             'sumData' => $sum || 0,
1138             'sumSquares' => $sum_squares || 0,
1139             'minVal' => $min || 0,
1140             'maxVal' => $max || 0,
1141             );
1142            
1143             # update metadata
1144 0         0 $self->{'metadata'}{'globalStats'} = join(',', map { $stat{$_} }
  0         0  
1145             qw(validCount sumData sumSquares minVal maxVal) );
1146 0         0 $self->_rewrite_metadata;
1147            
1148 0         0 return \%stat;
1149             }
1150              
1151             sub global_mean {
1152 0     0 1 0 my $self = shift;
1153 0         0 return Bio::DB::USeq::binMean( $self->global_stats );
1154             }
1155              
1156             sub global_stdev {
1157 0     0 1 0 my $self = shift;
1158 0         0 return Bio::DB::USeq::binStdev( $self->global_stats );
1159             }
1160              
1161             #### slice information ####
1162              
1163             sub slices {
1164 2     2 1 4 my $self = shift;
1165 2         3 return sort {$a cmp $b} keys %{ $self->{'file2attribute'} };
  0         0  
  2         13  
1166             }
1167              
1168             sub slice_seq_id {
1169 0     0 1 0 my $self = shift;
1170 0 0       0 my $slice = shift or return;
1171 0         0 return $self->{'file2attribute'}{$slice}->[0];
1172             }
1173              
1174             sub slice_start {
1175 1088     1088 1 1079 my $self = shift;
1176 1088 50       1429 my $slice = shift or return;
1177 1088         1752 return $self->{'file2attribute'}{$slice}->[1];
1178             }
1179              
1180             sub slice_end {
1181 2021     2021 1 2116 my $self = shift;
1182 2021 50       2490 my $slice = shift or return;
1183 2021         4608 return $self->{'file2attribute'}{$slice}->[2];
1184             }
1185              
1186             sub slice_strand {
1187 4     4 1 11 my $self = shift;
1188 4 50       12 my $slice = shift or return;
1189 4         20 return $self->{'file2attribute'}{$slice}->[3];
1190             }
1191              
1192             sub slice_type {
1193 1020     1020 1 1071 my $self = shift;
1194 1020 50       1238 my $slice = shift or return;
1195 1020         2731 return $self->{'file2attribute'}{$slice}->[4];
1196             }
1197              
1198             sub slice_obs_number {
1199 6     6 1 12 my $self = shift;
1200 6 50       12 my $slice = shift or return;
1201 6         13 return $self->{'file2attribute'}{$slice}->[5];
1202             }
1203              
1204             sub slice_feature {
1205 0     0 1 0 my $self = shift;
1206 0 0       0 my $slice = shift or return;
1207             return Bio::DB::USeq::Segment->new(
1208             -seq_id => $self->{'file2attribute'}{$slice}->[0],
1209             -start => $self->{'file2attribute'}{$slice}->[1],
1210             -stop => $self->{'file2attribute'}{$slice}->[2],
1211             -strand => $self->{'file2attribute'}{$slice}->[3],
1212 0         0 -type => $self->{'file2attribute'}{$slice}->[4],
1213             -source => $self->name,
1214             -name => $slice,
1215             );
1216             }
1217              
1218              
1219              
1220              
1221              
1222             #### Feature and data access ####
1223              
1224             sub segment {
1225 1     1 1 2 my $self = shift;
1226            
1227             # arguments can be chromo, start, stop, strand
1228 1         3 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1229 1 50       3 return unless $self->length($seq_id); # make sure chromosome is represented
1230            
1231 1         4 return Bio::DB::USeq::Segment->new(
1232             -seq_id => $seq_id,
1233             -start => $start,
1234             -end => $stop,
1235             -strand => $strand,
1236             -type => 'segment',
1237             -source => $self->name,
1238             -useq => $self,
1239             );
1240             }
1241              
1242             sub features {
1243 2     2 1 268 my $self = shift;
1244 2         9 my %args = @_;
1245            
1246             # check for type
1247 2         3 my $type;
1248 2   0     6 $args{-type} ||= $args{-types} || $args{-primary_tag} || 'region';
      33        
1249 2 50 33     15 if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1250 0   0     0 $type = $args{-type}->[0] || 'region';
1251 0         0 $args{-type} = $type;
1252             }
1253             else {
1254 2         4 $type = $args{-type};
1255             }
1256            
1257             # return an appropriate feature
1258 2 50       20 if ($type =~ /chromosome/) {
    50          
1259             # compatibility to return chromosome features
1260             my @chromos = map {
1261 0         0 Bio::DB::USeq::Feature->new(
  0         0  
1262             -seq_id => $_,
1263             -start => 1,
1264             -end => $self->length($_),
1265             -type => $type,
1266             -source => $self->name,
1267             -name => $_,
1268             )
1269             } $self->seq_ids;
1270 0 0       0 return wantarray ? @chromos : \@chromos;
1271             }
1272             elsif ($type =~ /region|interval|observation|coverage|wiggle|summary/) {
1273             # region or segment are individual observation features
1274             # coverage or wiggle are for efficient score retrieval with
1275             # backwards compatibility with Bio::Graphics
1276             # summary are statistical summaries akin to Bio::DB::BigFile
1277            
1278             # set up an iterator
1279 2         12 my $iterator = $self->get_seq_stream(%args);
1280 2 50       8 return unless $iterator;
1281            
1282             # if user requested an iterator
1283 2 0 33     5 if (exists $args{-iterator} and $args{-iterator}) {
1284 0         0 return $iterator;
1285             }
1286            
1287             # collect the features
1288 2         2 my @features;
1289 2         6 while (my $f = $iterator->next_seq) {
1290 2         9 push @features, $f;
1291             }
1292 2 50       19 return wantarray ? @features : \@features;
1293             }
1294             else {
1295 0         0 confess "unknown type request '$type'!\n";
1296             }
1297             }
1298              
1299              
1300             sub get_features_by_type {
1301             # does not work without location information
1302 0     0 0 0 cluck "please use features() method instead";
1303 0         0 return;
1304             }
1305              
1306              
1307             sub get_features_by_location {
1308 0     0 1 0 my $self = shift;
1309            
1310             # arguments can be chromo, start, stop, strand
1311 0         0 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1312            
1313 0         0 return $self->features($seq_id, $start, $stop, $strand);
1314             }
1315              
1316              
1317             sub get_feature_by_id {
1318             # much as Bio::DB::BigWig fakes the id, so we will too here
1319             # I don't know how necessary this really is
1320 0     0 1 0 my $self = shift;
1321            
1322             # id will be encoded as chr:start:end ?
1323 0         0 my $id = shift;
1324 0         0 my ($seq_id, $start, $end, $type) = split /:/, $id;
1325            
1326 0   0     0 my @list = $self->features(
1327             -seq_id => $seq_id,
1328             -start => $start,
1329             -end => $end,
1330             -type => $type || undef,
1331             );
1332 0 0       0 return unless @list;
1333 0 0       0 return $list[0] if scalar @list == 1;
1334 0         0 foreach my $f (@list) {
1335 0 0 0     0 return $f if ($f->start == $start and $f->end == $end);
1336             }
1337 0         0 return;
1338             }
1339              
1340              
1341             sub get_feature_by_name {
1342 0     0 1 0 return shift->get_feature_by_id(@_);
1343             }
1344              
1345              
1346             sub get_seq_stream {
1347 3     3 1 70 my $self = shift;
1348            
1349             # arguments can be chromo, start, stop, strand
1350 3         11 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1351 3 50       11 return unless $self->length($seq_id); # make sure chromosome is represented
1352            
1353             # check for type
1354 3         9 my %args = @_;
1355 3         3 my $type;
1356 3   50     22 $args{-type} ||= $args{-types} || $args{-primary_tag} || 'region';
      66        
1357 3 50 33     13 if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1358 0   0     0 $type = $args{-type}->[0] || 'region';
1359             }
1360             else {
1361 3         4 $type = $args{-type};
1362             }
1363            
1364 3         12 return Bio::DB::USeq::Iterator->new(
1365             -seq_id => $seq_id,
1366             -start => $start,
1367             -end => $stop,
1368             -strand => $strand,
1369             -type => $type,
1370             -source => $self->name,
1371             -useq => $self,
1372             );
1373             }
1374              
1375              
1376             sub scores {
1377 1     1 1 37 my $self = shift;
1378            
1379             # arguments can be chromo, start, stop, strand
1380 1         2 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1381 1 50       3 return unless $self->length($seq_id); # make sure chromosome is represented
1382            
1383             # determine which slices to retrieve
1384 1         4 my @slices = $self->_translate_coordinates_to_slices(
1385             $seq_id, $start, $stop, $strand);
1386 1 50       3 return unless @slices;
1387 1         3 $self->_clear_buffer(\@slices);
1388            
1389             # collect the scores from each of the requested slices
1390 1         2 my @scores;
1391 1         2 foreach my $slice (@slices) {
1392            
1393             # load and unpack the data
1394 1         3 $self->_load_slice($slice);
1395            
1396             # find the overlapping observations
1397 1         33 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1398            
1399             # record the scores
1400 1         5 foreach my $r (@$results) {
1401 31 50       44 push @scores, $r->[2] if defined $r->[2];
1402             }
1403             }
1404            
1405 1 50       18 return wantarray ? @scores : \@scores;
1406             }
1407              
1408             sub observations {
1409 0     0 1 0 my $self = shift;
1410            
1411             # arguments can be chromo, start, stop, strand
1412 0         0 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1413 0 0       0 return unless $self->length($seq_id); # make sure chromosome is represented
1414            
1415             # determine which slices to retrieve
1416 0         0 my @slices = $self->_translate_coordinates_to_slices(
1417             $seq_id, $start, $stop, $strand);
1418 0 0       0 return unless @slices;
1419 0         0 $self->_clear_buffer(\@slices);
1420            
1421             # collect the scores from each of the requested slices
1422 0         0 my @observations;
1423 0         0 foreach my $slice (@slices) {
1424             # load and unpack the data
1425 0         0 $self->_load_slice($slice);
1426            
1427             # find the overlapping observations
1428 0         0 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1429 0         0 push @observations, @$results;
1430             }
1431              
1432 0 0       0 return wantarray ? @observations : \@observations;
1433             }
1434              
1435              
1436              
1437              
1438             #### Private methods ####
1439              
1440             sub _parse_metadata {
1441 1     1   2 my $self = shift;
1442            
1443             # the metadata file should always be present in a USeq file
1444 1         4 my $readMe = $self->{'zip'}->contents('archiveReadMe.txt');
1445 1 50       1119 unless ($readMe) {
1446 0         0 carp " USeq file is malformed! It does not contain an archiveReadMe.txt file\n";
1447 0         0 return;
1448             }
1449            
1450             # parse the archive file
1451             # this is a simple text file, each line is key = value
1452 1         3 $self->{'metadata'}{'comments'} = [];
1453 1         5 foreach (split /\n/, $readMe) {
1454 5 50       10 if (/^#/) {
1455 0 0       0 push @{ $self->{'metadata'}{'comments'} }, $_ unless $_ =~ /validCount/;
  0         0  
1456 0         0 next;
1457             }
1458 5 50       14 if (/^\s*([^=\s]+)\s*=\s*(.+)\s*$/) {
1459             # separate key = value pairs tolerating whitespace
1460             # key may contain anything excluding = and whitespace
1461 5         12 $self->{'metadata'}{$1} = $2;
1462             }
1463             }
1464 1         2 return 1;
1465             }
1466              
1467              
1468             sub _parse_members {
1469 2     2   4 my $self = shift;
1470            
1471             # there is a lot of information encoded in each zip member file name
1472             # the chromosome, strand, coordinates of represented interval, and file type
1473             # this parses and indexes this metadata into a usable format
1474            
1475 2         5 my @errors;
1476 2         8 foreach my $member ($self->{'zip'}->memberNames) {
1477            
1478             # archive readMe
1479 8 100       71 next if ($member eq 'archiveReadMe.txt');
1480            
1481             # data file
1482 6         39 my ($chromo, $strand, $start, $stop, $number, $extension) =
1483             ($member =~ /^([\w\-\.]+)([\+\-\.])(\d+)\-(\d+)\-(\d+)\.(\w+)$/i);
1484            
1485             # check extracted metadata
1486 6 50 33     45 unless ($chromo and $strand and defined $start and defined $stop and
      33        
      33        
      33        
      33        
1487             $number and $extension) {
1488 0         0 push @errors, " data slice $member";
1489 0         0 next;
1490             }
1491            
1492             # check stranded data
1493 6 100       11 unless (defined $self->{'stranded'}) {
1494 2 100       6 if ($strand eq '.') {
1495 1         2 $self->{'stranded'} = 0;
1496             }
1497             else {
1498 1         3 $self->{'stranded'} = 1;
1499             }
1500             }
1501            
1502             # check seq_ids
1503             # record the length for each seq_id, which may or may not be entirely
1504             # accurate, since it is just the last interval position
1505 6 100       11 if (exists $self->{'seq_ids'}{$chromo} ) {
1506 3 50       9 if ($stop > $self->{'seq_ids'}{$chromo}) {
1507 3         5 $self->{'seq_ids'}{$chromo} = $stop;
1508             }
1509             }
1510             else {
1511 3         7 $self->{'seq_ids'}{$chromo} = $stop;
1512             }
1513            
1514             # convert to BioPerl convention
1515 6 50       16 $strand = $strand eq '+' ? 1 : $strand eq '.' ? 0 : $strand eq '-' ? -1 : 0;
    100          
    100          
1516 6         8 $start += 1;
1517            
1518             # check coordinates
1519             # hack to cover a bug? Can't have zero or negative coordinates
1520 6 50       12 if ($start >= $stop) {
1521 0         0 $stop += 1;
1522             }
1523            
1524             # store the member details for each member slice
1525 6         19 $self->{'file2attribute'}{$member} =
1526             [$chromo, $start, $stop, $strand, $extension, $number];
1527            
1528             # store the member into a chromosome-strand-specific interval tree
1529             # interval trees are stored as 0-based, half-open intervals
1530 6   66     39 $self->{'coord2file'}{$chromo}{$strand} ||= Set::IntervalTree->new();
1531 6         19 $self->{'coord2file'}{$chromo}{$strand}->insert($member, $start - 1, $stop);
1532             }
1533            
1534             # check parsing
1535 2 50       5 if (@errors) {
1536 0         0 carp "Errors parsing data slice filenames:\n" . join("\n", @errors) . "\n";
1537             }
1538 2 50       4 unless (%{ $self->{'coord2file'} }) {
  2         4  
1539 0         0 carp "no data slices present in USeq archive!\n";
1540 0         0 return;
1541             }
1542            
1543 2         6 return 1;
1544             }
1545              
1546              
1547             sub _get_coordinates {
1548 5     5   8 my $self = shift;
1549            
1550 5         11 my ($seq_id, $start, $stop, $strand);
1551 5 50       19 if ($_[0] =~ /^\-/) {
1552 5         14 my %args = @_;
1553 5   0     16 $seq_id = $args{'-seq_id'} || $args{'-chromo'} || undef;
1554 5   0     9 $start = $args{'-start'} || $args{'-pos'} || undef;
1555 5   0     13 $stop = $args{'-end'} || $args{'-stop'} || undef;
1556 5   50     21 $strand = $args{'-strand'} || '0'; # unstranded
1557             }
1558             else {
1559 0         0 ($seq_id, $start, $stop, $strand) = @_;
1560             }
1561 5 50       11 unless ($seq_id) {
1562 0         0 cluck "no sequence ID provided!";
1563 0         0 return;
1564             }
1565 5   50     11 $start ||= 1;
1566 5   33     8 $stop ||= $self->length($seq_id);
1567 5   50     15 $strand ||= 0;
1568 5         14 return ($seq_id, $start, $stop, $strand);
1569             }
1570              
1571              
1572             sub _translate_coordinates_to_slices {
1573 6     6   17 my $self = shift;
1574 6         15 my ($seq_id, $start, $stop, $strand) = @_;
1575 6 50       17 return unless (exists $self->{'coord2file'}{$seq_id});
1576            
1577             # check strand request
1578 6         9 my $both = 0;
1579 6 50       17 if ($strand == 0) {
1580             # strand was not specified,
1581             # but collect from both strands if we have stranded data
1582 6 100       14 $both = 1 if $self->stranded;
1583             }
1584             else {
1585             # strand was specified,
1586             # but convert it to unstranded if the data is not stranded
1587 0 0       0 $strand = 0 unless $self->stranded;
1588             }
1589            
1590             # look for the overlapping slices
1591 6         8 my @slices;
1592 6 100       11 if ($both) {
1593             # need to collect from both strands
1594             # plus strand first, then minus strand
1595 1         7 my $results = $self->{'coord2file'}{$seq_id}{1}->fetch($start - 1, $stop);
1596 1         4 my $results2 = $self->{'coord2file'}{$seq_id}{-1}->fetch($start - 1, $stop);
1597 1         12 push @slices, @$results, @$results2;
1598             }
1599            
1600             # specific strand
1601             else {
1602 5         42 my $results = $self->{'coord2file'}{$seq_id}{$strand}->fetch($start - 1, $stop);
1603 5         12 push @slices, @$results;
1604             }
1605            
1606 6         17 return @slices;
1607             }
1608              
1609              
1610             sub _clear_buffer {
1611 6     6   9 my $self = shift;
1612            
1613             # make a quick hash of wanted slices
1614 6         8 my %wanted = map {$_ => 1} @{$_[0]};
  9         25  
  6         20  
1615            
1616             # delete the existing buffers of slices we do not want
1617 6         10 foreach (keys %{ $self->{buffer} }) {
  6         18  
1618 5 100       6653 delete $self->{buffer}{$_} unless exists $wanted{$_};
1619             }
1620             }
1621              
1622             sub _load_slice {
1623 1016     1016   980 my $self = shift;
1624 1016         977 my $slice = shift;
1625 1016 100       1663 return if (exists $self->{'buffer'}{$slice});
1626            
1627             # each slice is parsed into observations consisting of start, stop, and
1628             # optionally score and text depending on type
1629             # these are stored as anonymous arrays [start, stop, score, text]
1630             # for quick retrieval each observation interval is stored in a Set::IntervalTree
1631             # these operations maintain the original 0-based coordinate system
1632            
1633 7         56 my $type = $self->slice_type($slice);
1634 7 50       58 if ($type eq 's') { $self->_load_s_slice($slice) }
  0 50       0  
    50          
    50          
    100          
    50          
    50          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
1635 0         0 elsif ($type eq 'sf') { $self->_load_sf_slice($slice) }
1636 0         0 elsif ($type eq 'st') { $self->_load_st_slice($slice) }
1637 0         0 elsif ($type eq 'ss') { $self->_load_ss_slice($slice) }
1638 6         20 elsif ($type eq 'ssf') { $self->_load_ssf_slice($slice) }
1639 0         0 elsif ($type eq 'sst') { $self->_load_sst_slice($slice) }
1640 1         5 elsif ($type eq 'ssft') { $self->_load_ssft_slice($slice) }
1641 0         0 elsif ($type eq 'i') { $self->_load_i_slice($slice) }
1642 0         0 elsif ($type eq 'if') { $self->_load_if_slice($slice) }
1643 0         0 elsif ($type eq 'it') { $self->_load_it_slice($slice) }
1644 0         0 elsif ($type eq 'ii') { $self->_load_ii_slice($slice) }
1645 0         0 elsif ($type eq 'iif') { $self->_load_iif_slice($slice) }
1646 0         0 elsif ($type eq 'iit') { $self->_load_iit_slice($slice) }
1647 0         0 elsif ($type eq 'iift') { $self->_load_iift_slice($slice) }
1648 0         0 elsif ($type eq 'is') { $self->_load_is_slice($slice) }
1649 0         0 elsif ($type eq 'isf') { $self->_load_isf_slice($slice) }
1650 0         0 elsif ($type eq 'ist') { $self->_load_ist_slice($slice) }
1651 0         0 elsif ($type eq 'isft') { $self->_load_isft_slice($slice) }
1652             else {
1653 0         0 croak "unable to load slice '$slice'! Unsupported slice type $type\n";
1654             }
1655             }
1656              
1657              
1658             sub _load_s_slice {
1659 0     0   0 my $self = shift;
1660 0         0 my $slice = shift;
1661 0         0 my $tree = Set::IntervalTree->new();
1662            
1663             # unpack the data slice zip member
1664 0         0 my $number = $self->slice_obs_number($slice);
1665 0         0 my ($null, @data) = unpack("si>(s>)$number", $self->zip->contents($slice) );
1666            
1667             # convert the unpacked data into start, stop, score
1668             # first observation
1669 0         0 my $position = shift @data;
1670 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1671            
1672             # remaining observations
1673 0         0 while (@data) {
1674 0         0 $position += (shift @data) + 32768;
1675 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1676             }
1677            
1678             # store Interval Tree buffer
1679 0         0 $self->{buffer}{$slice} = $tree;
1680             }
1681              
1682             sub _load_sf_slice {
1683 0     0   0 my $self = shift;
1684 0         0 my $slice = shift;
1685 0         0 my $tree = Set::IntervalTree->new();
1686            
1687             # unpack the data slice zip member
1688 0         0 my $number = $self->slice_obs_number($slice);
1689 0         0 my ($null, @data) = unpack("si>f>(s>f>)$number", $self->zip->contents($slice) );
1690            
1691             # convert the unpacked data into start, stop, score
1692             # first observation
1693 0         0 my $position = shift @data;
1694 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1695            
1696             # remaining observations
1697 0         0 while (@data) {
1698 0         0 $position += (shift @data) + 32768;
1699 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1700             }
1701            
1702             # store Interval Tree buffer
1703 0         0 $self->{buffer}{$slice} = $tree;
1704             }
1705              
1706             sub _load_st_slice {
1707 0     0   0 my $self = shift;
1708 0         0 my $slice = shift;
1709 0         0 my $tree = Set::IntervalTree->new();
1710            
1711             # convert the unpacked data into start, stop, (score), text
1712            
1713             # load the slice contents
1714 0         0 my $contents = $self->zip->contents($slice);
1715            
1716             # first observation
1717 0         0 my $i = 8; # go ahead and set index up to first observation text
1718 0         0 my (undef, $position, $t) = unpack('si>s>', substr($contents, 0, $i));
1719             # initial null, position, text_length
1720 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1721 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1722 0         0 $i += $t;
1723            
1724             # remaining observations
1725 0         0 my $p; # position offset
1726 0         0 my $len = CORE::length($contents);
1727 0         0 while ($i < $len) {
1728 0         0 my ($p, $t) = unpack('s>s>', substr($contents, $i, 4));
1729 0         0 $i += 4;
1730 0         0 $position += $p + 32768;
1731 0         0 $text = unpack("A$t", substr($contents, $i, $t));
1732 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1733 0         0 $i += $t;
1734             }
1735            
1736             # store Interval Tree buffer
1737 0         0 $self->{buffer}{$slice} = $tree;
1738             }
1739              
1740             sub _load_ss_slice {
1741 0     0   0 my $self = shift;
1742 0         0 my $slice = shift;
1743 0         0 my $tree = Set::IntervalTree->new();
1744            
1745             # unpack the data slice zip member
1746 0         0 my $number = $self->slice_obs_number($slice);
1747 0         0 my ($null, @data) = unpack("si>s>(s>s>)$number", $self->zip->contents($slice) );
1748            
1749             # convert the unpacked data into start, stop, no score
1750             # first observation
1751 0         0 my $position = @data;
1752 0         0 my $position2 = $position + (shift @data) + 32768;
1753 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1754            
1755             # remaining observations
1756 0         0 while (@data) {
1757 0         0 $position += (shift @data) + 32768;
1758 0         0 $position2 = $position + (shift @data) + 32768;
1759 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1760             }
1761            
1762             # store Interval Tree buffer
1763 0         0 $self->{buffer}{$slice} = $tree;
1764             }
1765              
1766             sub _load_ssf_slice {
1767 6     6   8 my $self = shift;
1768 6         9 my $slice = shift;
1769 6         46 my $tree = Set::IntervalTree->new();
1770            
1771             # unpack the data slice zip member
1772 6         14 my $number = $self->slice_obs_number($slice);
1773 6         26 my ($null, @data) = unpack("si>s>f>(s>s>f>)$number", $self->zip->contents($slice) );
1774            
1775             # convert the unpacked data into start, stop, score
1776             # first observation
1777 6         25966 my $position = (shift @data);
1778 6         26 my $position2 = $position + (shift @data) + 32768;
1779 6         41 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1780            
1781             # remaining observations
1782 6         18 while (@data) {
1783 58298         54937 $position += (shift @data) + 32768;
1784 58298         51623 $position2 = $position + (shift @data) + 32768;
1785 58298         120028 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1786             }
1787            
1788             # store Interval Tree buffer
1789 6         41 $self->{buffer}{$slice} = $tree;
1790             }
1791              
1792             sub _load_sst_slice {
1793 0     0   0 my $self = shift;
1794 0         0 my $slice = shift;
1795 0         0 my $tree = Set::IntervalTree->new();
1796 0         0 my $contents = $self->zip->contents($slice);
1797            
1798             # first observation
1799 0         0 my $i = 10; # go ahead and set index up to first observation text
1800 0         0 my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
1801             # initial null, position, length, text_length
1802 0         0 my $position2 = $position + $l + 32768;
1803 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1804 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
1805 0         0 $i += $t;
1806            
1807             # remaining observations
1808 0         0 my $p; # position offset
1809 0         0 my $len = CORE::length($contents);
1810 0         0 while ($i < $len) {
1811 0         0 ($p, $l, $t) = unpack('s>s>s>', substr($contents, $i, 6));
1812 0         0 $i += 6;
1813 0         0 $position += $p + 32768;
1814 0         0 $position2 = $position + $l + 32768;
1815 0         0 $text = unpack("A$t", substr($contents, $i, $t));
1816 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
1817 0         0 $i += $t;
1818             }
1819            
1820             # store Interval Tree buffer
1821 0         0 $self->{buffer}{$slice} = $tree;
1822             }
1823              
1824             sub _load_ssft_slice {
1825 1     1   1 my $self = shift;
1826 1         2 my $slice = shift;
1827 1         6 my $tree = Set::IntervalTree->new();
1828 1         13 my $contents = $self->zip->contents($slice);
1829            
1830             # first observation
1831 1         3388 my $i = 14; # go ahead and set index up to first observation text
1832 1         6 my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
1833             # initial null, position, length, score, text_length
1834 1         4 my $position2 = $position + $l + 32768;
1835 1         5 my $text = unpack("A$t", substr($contents, $i, $t));
1836 1         5 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
1837 1         2 $i += $t;
1838            
1839             # remaining observations
1840 1         1 my $p; # position offset
1841 1         2 my $len = CORE::length($contents);
1842 1         4 while ($i < $len) {
1843 10000         17382 ($p, $l, $s, $t) = unpack('s>s>f>s>', substr($contents, $i, 10));
1844 10000         10362 $i += 10;
1845 10000         8556 $position += $p + 32768;
1846 10000         8743 $position2 = $position + $l + 32768;
1847 10000         16254 $text = unpack("A$t", substr($contents, $i, $t));
1848 10000         27084 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
1849 10000         12977 $i += $t;
1850             }
1851            
1852             # store Interval Tree buffer
1853 1         14 $self->{buffer}{$slice} = $tree;
1854             }
1855              
1856             sub _load_i_slice {
1857 0     0   0 my $self = shift;
1858 0         0 my $slice = shift;
1859 0         0 my $tree = Set::IntervalTree->new();
1860            
1861             # unpack the data slice zip member
1862 0         0 my $number = $self->slice_obs_number($slice);
1863 0         0 my ($null, @data) = unpack("si>(i>)$number", $self->zip->contents($slice) );
1864            
1865             # convert the unpacked data into start, stop, score
1866             # first observation
1867 0         0 my $position = @data;
1868 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1869            
1870             # remaining observations
1871 0         0 while (@data) {
1872 0         0 $position += (shift @data);
1873 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1874             }
1875            
1876             # store Interval Tree buffer
1877 0         0 $self->{buffer}{$slice} = $tree;
1878             }
1879              
1880             sub _load_if_slice {
1881 0     0   0 my $self = shift;
1882 0         0 my $slice = shift;
1883 0         0 my $tree = Set::IntervalTree->new();
1884            
1885             # unpack the data slice zip member
1886 0         0 my $number = $self->slice_obs_number($slice);
1887 0         0 my ($null, @data) = unpack("si>f>(i>f>)$number", $self->zip->contents($slice) );
1888            
1889             # convert the unpacked data into start, stop, score
1890             # first observation
1891 0         0 my $position = @data;
1892 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1893            
1894             # remaining observations
1895 0         0 while (@data) {
1896 0         0 $position += (shift @data);
1897 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1898             }
1899            
1900             # store Interval Tree buffer
1901 0         0 $self->{buffer}{$slice} = $tree;
1902             }
1903              
1904             sub _load_it_slice {
1905 0     0   0 my $self = shift;
1906 0         0 my $slice = shift;
1907 0         0 my $tree = Set::IntervalTree->new();
1908            
1909             # convert the unpacked data into start, stop, (score), text
1910            
1911             # load the slice contents
1912 0         0 my $contents = $self->zip->contents($slice);
1913            
1914             # first observation
1915 0         0 my $i = 10; # go ahead and set index up to first observation text
1916 0         0 my (undef, $position, $t) = unpack('si>i>', substr($contents, 0, $i));
1917             # initial null, position, text_length
1918 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1919 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1920 0         0 $i += $t;
1921            
1922             # remaining observations
1923 0         0 my $p; # position offset
1924 0         0 my $len = CORE::length($contents);
1925 0         0 while ($i < $len) {
1926 0         0 ($p, $t) = unpack('i>s>', substr($contents, $i, 6));
1927 0         0 $i += 6;
1928 0         0 $position += $p + 32768;
1929 0         0 $text = unpack("A$t", substr($contents, $i, $t));
1930 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1931 0         0 $i += $t;
1932             }
1933            
1934             # store Interval Tree buffer
1935 0         0 $self->{buffer}{$slice} = $tree;
1936             }
1937              
1938             sub _load_ii_slice {
1939 0     0   0 my $self = shift;
1940 0         0 my $slice = shift;
1941 0         0 my $tree = Set::IntervalTree->new();
1942            
1943             # unpack the data slice zip member
1944 0         0 my $number = $self->slice_obs_number($slice);
1945 0         0 my ($null, @data) = unpack("si>i>(i>i>)$number", $self->zip->contents($slice) );
1946            
1947             # convert the unpacked data into start, stop, score
1948             # first observation
1949 0         0 my $position = @data;
1950 0         0 my $position2 = $position + (shift @data);
1951 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1952            
1953             # remaining observations
1954 0         0 while (@data) {
1955 0         0 $position += (shift @data);
1956 0         0 $position2 = $position + (shift @data);
1957 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1958             }
1959            
1960             # store Interval Tree buffer
1961 0         0 $self->{buffer}{$slice} = $tree;
1962             }
1963              
1964             sub _load_iif_slice {
1965 0     0   0 my $self = shift;
1966 0         0 my $slice = shift;
1967 0         0 my $tree = Set::IntervalTree->new();
1968            
1969             # unpack the data slice zip member
1970 0         0 my $number = $self->slice_obs_number($slice);
1971 0         0 my ($null, @data) = unpack("si>i>f>(i>i>f>)$number", $self->zip->contents($slice) );
1972            
1973             # convert the unpacked data into start, stop, score
1974             # first observation
1975 0         0 my $position = @data;
1976 0         0 my $position2 = $position + (shift @data);
1977 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1978            
1979             # remaining observations
1980 0         0 while (@data) {
1981 0         0 $position += (shift @data);
1982 0         0 $position2 = $position + (shift @data);
1983 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1984             }
1985            
1986             # store Interval Tree buffer
1987 0         0 $self->{buffer}{$slice} = $tree;
1988             }
1989              
1990             sub _load_iit_slice {
1991 0     0   0 my $self = shift;
1992 0         0 my $slice = shift;
1993 0         0 my $tree = Set::IntervalTree->new();
1994 0         0 my $contents = $self->zip->contents($slice);
1995            
1996             # first observation
1997 0         0 my $i = 12; # go ahead and set index up to first observation text
1998 0         0 my (undef, $position, $l, $t) = unpack('si>i>s>', substr($contents, 0, $i));
1999             # initial null, position, length, text_length
2000 0         0 my $position2 = $position + $l;
2001 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
2002 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
2003 0         0 $i += $t;
2004            
2005             # remaining observations
2006 0         0 my $p; # position offset
2007 0         0 my $len = CORE::length($contents);
2008 0         0 while ($i < $len) {
2009 0         0 ($p, $l, $t) = unpack('i>i>s>', substr($contents, $i, 10));
2010 0         0 $i += 10;
2011 0         0 $position += $p;
2012 0         0 $position2 = $position + $l;
2013 0         0 $text = unpack("A$t", substr($contents, $i, $t));
2014 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
2015 0         0 $i += $t;
2016             }
2017            
2018             # store Interval Tree buffer
2019 0         0 $self->{buffer}{$slice} = $tree;
2020             }
2021              
2022             sub _load_iift_slice {
2023 0     0   0 my $self = shift;
2024 0         0 my $slice = shift;
2025 0         0 my $tree = Set::IntervalTree->new();
2026 0         0 my $contents = $self->zip->contents($slice);
2027            
2028             # first observation
2029 0         0 my $i = 16; # go ahead and set index up to first observation text
2030 0         0 my (undef, $position, $l, $s, $t) = unpack('si>i>f>s>', substr($contents, 0, $i));
2031             # initial null, position, length, score, text_length
2032 0         0 my $position2 = $position + $l;
2033 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
2034 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
2035 0         0 $i += $t;
2036            
2037             # remaining observations
2038 0         0 my $p; # position offset
2039 0         0 my $len = CORE::length($contents);
2040 0         0 while ($i < $len) {
2041 0         0 ($p, $l, $s, $t) = unpack('i>i>f>s>', substr($contents, $i, 14));
2042 0         0 $i += 14;
2043 0         0 $position += $p;
2044 0         0 $position2 = $position + $l;
2045 0         0 $text = unpack("A$t", substr($contents, $i, $t));
2046 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
2047 0         0 $i += $t;
2048             }
2049            
2050             # store Interval Tree buffer
2051 0         0 $self->{buffer}{$slice} = $tree;
2052             }
2053              
2054             sub _load_is_slice {
2055 0     0   0 my $self = shift;
2056 0         0 my $tree = Set::IntervalTree->new();
2057 0         0 my $slice = shift;
2058            
2059             # unpack the data slice zip member
2060 0         0 my $number = $self->slice_obs_number($slice);
2061 0         0 my ($null, @data) = unpack("si>s>(i>s>)$number", $self->zip->contents($slice) );
2062            
2063             # convert the unpacked data into start, stop, score
2064             # first observation
2065 0         0 my $position = @data;
2066 0         0 my $position2 = $position + (shift @data) + 32768;
2067 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
2068            
2069             # remaining observations
2070 0         0 while (@data) {
2071 0         0 $position += (shift @data);
2072 0         0 $position2 = $position + (shift @data) + 32768;
2073 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
2074             }
2075            
2076             # store Interval Tree buffer
2077 0         0 $self->{buffer}{$slice} = $tree;
2078             }
2079              
2080             sub _load_isf_slice {
2081 0     0   0 my $self = shift;
2082 0         0 my $slice = shift;
2083 0         0 my $tree = Set::IntervalTree->new();
2084            
2085             # unpack the data slice zip member
2086 0         0 my $number = $self->slice_obs_number($slice);
2087 0         0 my ($null, @data) = unpack("si>s>f>(i>s>f>)$number", $self->zip->contents($slice) );
2088            
2089             # convert the unpacked data into start, stop, score
2090             # first observation
2091 0         0 my $position = @data;
2092 0         0 my $position2 = $position + (shift @data) + 32768;
2093 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2094            
2095             # remaining observations
2096 0         0 while (@data) {
2097 0         0 $position += (shift @data);
2098 0         0 $position2 = $position + (shift @data) + 32768;
2099 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2100             }
2101            
2102             # store Interval Tree buffer
2103 0         0 $self->{buffer}{$slice} = $tree;
2104             }
2105              
2106             sub _load_ist_slice {
2107 0     0   0 my $self = shift;
2108 0         0 my $slice = shift;
2109 0         0 my $tree = Set::IntervalTree->new();
2110 0         0 my $contents = $self->zip->contents($slice);
2111            
2112             # first observation
2113 0         0 my $i = 10; # go ahead and set index up to first observation text
2114 0         0 my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
2115             # initial null, position, length, text_length
2116 0         0 my $position2 = $position + $l + 32768;
2117 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
2118 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
2119 0         0 $i += $t;
2120            
2121             # remaining observations
2122 0         0 my $p; # position offset
2123 0         0 my $len = CORE::length($contents);
2124 0         0 while ($i < $len) {
2125 0         0 ($p, $l, $t) = unpack('i>s>s>', substr($contents, $i, 8));
2126 0         0 $i += 8;
2127 0         0 $position += $p;
2128 0         0 $position2 = $position + $l + 32768;
2129 0         0 $text = unpack("A$t", substr($contents, $i, $t));
2130 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
2131 0         0 $i += $t;
2132             }
2133            
2134             # store Interval Tree buffer
2135 0         0 $self->{buffer}{$slice} = $tree;
2136             }
2137              
2138             sub _load_isft_slice {
2139 0     0   0 my $self = shift;
2140 0         0 my $slice = shift;
2141 0         0 my $tree = Set::IntervalTree->new();
2142 0         0 my $contents = $self->zip->contents($slice);
2143            
2144             # first observation
2145 0         0 my $i = 14; # go ahead and set index up to first observation text
2146 0         0 my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
2147             # initial null, position, length, score, text_length
2148 0         0 my $position2 = $position + $l + 32768;
2149 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
2150 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
2151 0         0 $i += $t;
2152            
2153             # remaining observations
2154 0         0 my $p; # position offset
2155 0         0 while ($i < CORE::length($contents)) {
2156 0         0 ($p, $l, $s, $t) = unpack('i>s>f>s>', substr($contents, $i, 12));
2157 0         0 $i += 12;
2158 0         0 $position += $p;
2159 0         0 $position2 = $position + $l + 32768;
2160 0         0 $text = unpack("A$t", substr($contents, $i, $t));
2161 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
2162 0         0 $i += $t;
2163             }
2164            
2165             # store Interval Tree buffer
2166 0         0 $self->{buffer}{$slice} = $tree;
2167             }
2168              
2169             sub _mean_score {
2170 1000     1000   899 my $self = shift;
2171 1000         1231 my ($start, $stop, $slices) = @_;
2172 1000 50       1333 return unless @$slices;
2173            
2174             # collect the scores from each of the requested slices
2175 1000         876 my $sum;
2176 1000         897 my $count = 0;
2177 1000         1050 foreach my $slice (@$slices) {
2178 1001 50       1296 next unless $self->slice_type($slice) =~ /f/;
2179             # no sense going through if no score present
2180            
2181             # load and unpack the data
2182 1001         2009 $self->_load_slice($slice);
2183            
2184             # find the overlapping observations and record values
2185 1001         87617 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2186 1001         2002 foreach my $r (@$results) {
2187 2276   100     3729 $sum += $r->[2] || 0;
2188 2276         2763 $count++;
2189             }
2190             }
2191 1000 50       3614 return $count ? $sum / $count : undef;
2192             }
2193              
2194             sub _stat_summary {
2195 10     10   10 my $self = shift;
2196 10         13 my ($start, $stop, $slices) = @_;
2197 10 50       18 return unless @$slices;
2198            
2199             # initialize the statistical scores
2200 10         24 my $count = 0;
2201 10         19 my $sum;
2202             my $sum_squares;
2203 10         0 my $min;
2204 10         0 my $max;
2205            
2206             # collect the scores from each of the requested slices
2207 10         13 foreach my $slice (@$slices) {
2208 11 50       19 next unless $self->slice_type($slice) =~ /f/;
2209             # no sense going through if no score present
2210            
2211             # load and unpack the data
2212 11         26 $self->_load_slice($slice);
2213            
2214             # find the overlapping observations
2215 11         1558 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2216 11         32 foreach my $r (@$results) {
2217             # each observation is [start, stop, score]
2218 11994 50       14234 if (defined $r->[2]) {
2219 11994         10021 $count++;
2220 11994         10760 $sum += $r->[2];
2221 11994         10801 $sum_squares += ($r->[2] * $r->[2]);
2222 11994 100 100     21679 $min = $r->[2] if (!defined $min or $r->[2] < $min);
2223 11994 100 100     21912 $max = $r->[2] if (!defined $max or $r->[2] > $max);
2224             }
2225             }
2226             }
2227            
2228             # assemble the statistical summary hash
2229 10   50     79 my %summary = (
      50        
      50        
      50        
2230             'validCount' => $count,
2231             'sumData' => $sum || 0,
2232             'sumSquares' => $sum_squares || 0,
2233             'minVal' => $min || 0,
2234             'maxVal' => $max || 0,
2235             );
2236 10         74 return \%summary;
2237             }
2238              
2239             sub _rewrite_metadata {
2240 1     1   4 my $self = shift;
2241 1 50       6 return unless (-w $self->zip->fileName);
2242 1         54 my $md = $self->{'metadata'};
2243            
2244             # generate new metadata as an array
2245 1         2 my @new_md;
2246 1         5 push @new_md, "useqArchiveVersion = " . $md->{useqArchiveVersion};
2247 1 50       4 push @new_md, @{ $md->{comments} } if exists $md->{comments};
  1         3  
2248 1         3 push @new_md, "dataType = " . $md->{dataType};
2249 1         3 push @new_md, "versionedGenome = " . $md->{versionedGenome};
2250            
2251             # additional keys that may be present
2252 1         7 foreach (keys %$md) {
2253 7 100       35 next if /useqArchiveVersion|dataType|versionedGenome|comments|chromStats|globalStats/;
2254 2         6 push @new_md, "$_ = $md->{$_}";
2255             }
2256            
2257             # global and chromosome stats
2258 1         2 push @new_md,
2259             "# Bio::DB::USeq statistics validCount,sumData,sumSquares,minVal,maxVal";
2260 1 50       3 push @new_md, "globalStats = $md->{globalStats}" if exists $md->{globalStats};
2261 1         10 foreach (sort {$a cmp $b} keys %$md) {
  12         15  
2262 7 100       19 push @new_md, "$_ = $md->{$_}" if /chromStats/;
2263             }
2264            
2265             # write the new metadata to the zip Archive
2266 1         10 $self->{'zip'}->contents('archiveReadMe.txt', join("\n", @new_md));
2267 1         668 $self->{'zip'}->overwrite;
2268 1         6661 $self->clone; # not sure if this is necessary, but just in case we reopen the zip
2269             }
2270              
2271              
2272             ######## Exported Functions ########
2273             # these are borrowed from Bio::DB::BigWig from Lincoln Stein
2274              
2275             sub binMean {
2276 2     2 1 130 my $score = shift;
2277 2 50       9 return 0 unless $score->{validCount};
2278 2         9 $score->{sumData}/$score->{validCount};
2279             }
2280              
2281             sub binVariance {
2282 1     1 1 2 my $score = shift;
2283 1 50       4 return 0 unless $score->{validCount};
2284 1         7 my $var = $score->{sumSquares} - $score->{sumData}**2/$score->{validCount};
2285 1 50       4 if ($score->{validCount} > 1) {
2286 1         3 $var /= $score->{validCount}-1;
2287             }
2288 1 50       4 return 0 if $var < 0;
2289 1         3 return $var;
2290             }
2291              
2292             sub binStdev {
2293 1     1 1 7 my $score = shift;
2294 1         4 return sqrt(binVariance($score));
2295             }
2296              
2297              
2298              
2299              
2300             ######## Other Classes #############
2301              
2302             package Bio::DB::USeq::Feature;
2303 1     1   7 use base 'Bio::SeqFeature::Lite';
  1         2  
  1         481  
2304              
2305             sub new {
2306 13     13   24 my $class = shift;
2307 13         56 return $class->SUPER::new(@_);
2308             }
2309              
2310             sub type {
2311             # Bio::SeqFeature::Lite mangles the type and returns
2312             # primary_tag:source if both are set
2313             # this may wreck havoc with parsers when the type already has a :
2314             # as in wiggle:1000
2315 16     16   130 my $self = shift;
2316 16         94 return $self->{type};
2317             }
2318              
2319             sub chr_stats {
2320 0     0   0 my $self = shift;
2321 0 0       0 return unless exists $self->{useq};
2322 0         0 return $self->{useq}->chr_stats( $self->seq_id );
2323             }
2324              
2325             sub chr_mean {
2326 0     0   0 my $self = shift;
2327 0 0       0 return unless exists $self->{useq};
2328 0         0 return $self->{useq}->chr_mean( $self->seq_id );
2329             }
2330              
2331             sub chr_stdev {
2332 0     0   0 my $self = shift;
2333 0 0       0 return unless exists $self->{useq};
2334 0         0 return $self->{useq}->chr_stdev( $self->seq_id );
2335             }
2336              
2337             sub global_stats {
2338 0     0   0 my $self = shift;
2339 0 0       0 return unless exists $self->{useq};
2340 0         0 return $self->{useq}->global_stats( $self->seq_id );
2341             }
2342              
2343             sub global_mean {
2344 0     0   0 my $self = shift;
2345 0 0       0 return unless exists $self->{useq};
2346 0         0 return $self->{useq}->global_mean( $self->seq_id );
2347             }
2348              
2349             sub global_stdev {
2350 0     0   0 my $self = shift;
2351 0 0       0 return unless exists $self->{useq};
2352 0         0 return $self->{useq}->global_stdev( $self->seq_id );
2353             }
2354              
2355              
2356              
2357              
2358             package Bio::DB::USeq::Segment;
2359 1     1   50483 use base 'Bio::DB::USeq::Feature';
  1         3  
  1         670  
2360              
2361             sub new {
2362 1     1   2 my $class = shift;
2363 1         4 my %args = @_;
2364 1         7 my $segment = $class->SUPER::new(@_);
2365 1 50       74 $segment->{'useq'} = $args{'-useq'} if exists $args{'-useq'};
2366 1         2 return $segment;
2367             }
2368              
2369             sub scores {
2370 1     1   108 my $self = shift;
2371 1         6 return $self->{'useq'}->scores(
2372             -seq_id => $self->seq_id,
2373             -start => $self->start,
2374             -end => $self->end,
2375             -strand => $self->strand,
2376             );
2377             }
2378              
2379             sub features {
2380 0     0   0 my $self = shift;
2381 0         0 my $type = shift;
2382 0   0     0 $type ||= 'region';
2383 0         0 return $self->{'useq'}->features(
2384             -seq_id => $self->seq_id,
2385             -start => $self->start,
2386             -end => $self->end,
2387             -strand => $self->strand,
2388             -type => $type,
2389             );
2390             }
2391              
2392             sub get_seq_stream {
2393 1     1   258 my $self = shift;
2394 1         2 my $type = shift;
2395 1   50     16 $type ||= 'region';
2396             return Bio::DB::USeq::Iterator->new(
2397             -seq_id => $self->seq_id,
2398             -start => $self->start,
2399             -end => $self->end,
2400             -strand => $self->strand,
2401             -source => $self->source,
2402             -type => $type,
2403             -useq => $self->{useq},
2404 1         5 );
2405             }
2406              
2407             sub slices {
2408 0     0   0 my $self = shift;
2409 0         0 my @slices = $self->{'useq'}->_translate_coordinates_to_slices(
2410             $self->seq_id, $self->start, $self->end, $self->strand
2411             );
2412 0 0       0 return wantarray ? @slices : \@slices;
2413             }
2414              
2415             sub coverage {
2416 0     0   0 my $self = shift;
2417 0         0 my $bins = shift;
2418 0   0     0 $bins ||= $self->length;
2419 0         0 return $self->features("coverage:$bins");
2420             }
2421              
2422             sub wiggle {
2423 0     0   0 my $self = shift;
2424 0         0 my $bins = shift;
2425 0   0     0 $bins ||= $self->length;
2426 0         0 return $self->features("wiggle:$bins");
2427             }
2428              
2429             sub statistical_summary {
2430 0     0   0 my $self = shift;
2431 0         0 my $bins = shift;
2432 0   0     0 $bins ||= 1;
2433 0         0 return $self->features("summary:$bins");
2434             }
2435              
2436             sub observations {
2437 0     0   0 my $self = shift;
2438 0         0 return $self->{'useq'}->observations(
2439             -seq_id => $self->seq_id,
2440             -start => $self->start,
2441             -end => $self->end,
2442             -strand => $self->strand,
2443             );
2444             }
2445              
2446              
2447             package Bio::DB::USeq::Iterator;
2448 1     1   18 use base 'Bio::DB::USeq::Feature';
  1         2  
  1         1252  
2449              
2450             sub new {
2451 4     4   57 my $class = shift;
2452            
2453             # create object
2454 4         18 my %args = @_;
2455 4 50       11 my $useq = $args{'-useq'} or
2456             die "Bio::DB::USeq::Iterator cannot be created without a -useq argument";
2457 4         18 my $iterator = $class->SUPER::new(@_);
2458 4         218 $iterator->{'useq'} = $useq;
2459            
2460             # determine which members to retrieve
2461             my @slices = $useq->_translate_coordinates_to_slices(
2462             $args{-seq_id},
2463             $args{-start},
2464             $args{-end},
2465             $args{-strand},
2466 4         15 );
2467 4         12 $useq->_clear_buffer(\@slices);
2468            
2469             # sort the slices as necessary
2470 4 100       16 if (scalar(@slices) > 1) {
2471 6         12 @slices = map {$_->[1]}
2472 3         11 sort {$a->[0] <=> $b->[0]}
2473 3         17 map { [$useq->slice_start($_), $_] } @slices;
  6         15  
2474             }
2475            
2476             # how we set up the iterator depends on the feature type requested
2477             # we need to add specific information to iterator object
2478            
2479 4 100       15 if ($iterator->type =~ /region|interval|observation/) {
2480             # useq observation features are simple
2481 2         5 $iterator->{'wiggle'} = 0;
2482            
2483             # prepare specific iterator information
2484 2         5 $iterator->{'slices'} = \@slices;
2485 2         11 $iterator->{'current_results'} = undef;
2486 2         6 $iterator->{'current_strand'} = undef;
2487            
2488 2         9 return $iterator;
2489             }
2490            
2491             # otherwise we work with more complex wiggle or summary features
2492             # set the type of wiggle feature: 1 = wiggle, 2 = summary
2493 2 100       6 $iterator->{'wiggle'} = $iterator->type =~ /summary/ ? 2 : 1;
2494            
2495             # normally the iterator will only return one wigggle|summary feature, but
2496             # will return two if stranded data, per behavior for Bio::Grapics...
2497 2 50 33     11 if ($iterator->strand == 0 and $useq->stranded) {
2498             # we have stranded data, so need to return two features
2499             # separate the slices into each respective strands for each feature
2500 0         0 my (@f, @r);
2501 0         0 foreach (@slices) {
2502 0 0       0 if ($useq->slice_strand($_) == 1) {
2503 0         0 push @f, $_;
2504             }
2505             else {
2506 0         0 push @r, $_;
2507             }
2508             }
2509 0         0 $iterator->{slices} = [\@f, \@r];
2510             }
2511             else {
2512             # only need to return one feature
2513 2         6 $iterator->{slices} = [ \@slices ];
2514             }
2515            
2516             # check for type and bins
2517 2         4 my ($bin, $step);
2518 2 100       6 if ($iterator->type =~ /:(\d+)$/i) {
2519 1         3 $bin = $1;
2520 1         6 my $length = $iterator->length;
2521 1 50       18 $bin = $length if $bin > $length;
2522 1         3 $step = $length/$bin;
2523             }
2524 2         5 $iterator->{bin} = $bin;
2525 2         4 $iterator->{step} = $step;
2526            
2527 2         8 return $iterator;
2528             }
2529              
2530             sub next_seq {
2531 10     10   324 my $self = shift;
2532            
2533 10 100       32 if ($self->{'wiggle'} == 1) {
    100          
2534 2         6 return $self->_next_wiggle;
2535             }
2536             elsif ($self->{'wiggle'} == 2) {
2537 2         6 return $self->_next_summary;
2538             }
2539             else {
2540 6         14 return $self->_next_region;
2541             }
2542             }
2543              
2544 0     0   0 sub next_feature {return shift->next_seq}
2545              
2546             sub _next_region {
2547             # this will keep returning observations as SeqFeature objects until we run out
2548             # of observations in the requested interval
2549 6     6   8 my $self = shift;
2550 6         9 my $useq = $self->{'useq'};
2551            
2552             # get current results
2553 6 100       11 unless ($self->{'current_results'}) {
2554 2   50     3 my $slice = shift @{ $self->{'slices'} } || undef;
2555 2 50       10 return unless $slice; # no more slices, we're done
2556            
2557             # collect the observations
2558 2         7 $useq->_load_slice($slice);
2559 2         20 my $result = $useq->{buffer}{$slice}->fetch($self->start - 1, $self->end);
2560            
2561             # sort the observations and store the results
2562 2 50       367 my @sorted = sort {$a->[0] <=> $b->[0] or $a->[1] <=> $b->[1]} @$result;
  303         372  
2563 2         8 $self->{'current_results'} = \@sorted;
2564 2         19 $self->{'current_strand'} = $useq->slice_strand($slice);
2565             }
2566            
2567             # return next observation as a Feature object
2568 6         8 my $obs = shift @{ $self->{'current_results'} };
  6         11  
2569 6 50       12 unless (defined $obs) {
2570             # no more observations for current slice, try to move to next slice
2571 0         0 undef $self->{'current_results'};
2572 0         0 return $self->_next_region;
2573             }
2574             return Bio::DB::USeq::Feature->new(
2575             -seq_id => $self->seq_id,
2576             -start => $obs->[0] + 1,
2577             -end => $obs->[1],
2578 6 100       17 -strand => $self->{'current_strand'},
2579             -score => $obs->[2],
2580             -source => $self->source,
2581             -type => $self->type,
2582             -name => defined $obs->[3] ? $obs->[3] :
2583             join(':', $self->seq_id, $obs->[0], $obs->[1]),
2584             )
2585             }
2586              
2587             sub _next_wiggle {
2588             # this will return one wiggle Feature, two if separate strands are requested
2589            
2590 2     2   3 my $self = shift;
2591 2         3 my $useq = $self->{'useq'};
2592            
2593             # determine which slices to retrieve
2594 2         4 my $slices = shift @{ $self->{slices} };
  2         5  
2595 2 100       7 return unless $slices;
2596            
2597             # more information
2598 1         2 my $start = $self->start;
2599 1         7 my $stop = $self->end;
2600 1         7 my $step = $self->{step};
2601            
2602             # check whether we are working with binned wiggle or not
2603 1         1 my @scores;
2604 1 50 33     5 if ($self->{bin} and $step > 1) {
2605             # we will be collecting the mean score value in bins
2606            
2607             # collect the scores for each bin
2608 1         3 for (my $begin = $start; $begin < $stop; $begin += $step) {
2609            
2610             # round off coordinates to integers
2611             # beginning point and step may not be integers
2612 1000         1358 my $s = int($begin + 0.5);
2613 1000         1136 my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2614            
2615             # collect the scores from each of the requested slices
2616 1000 50       1180 if (scalar @$slices > 1) {
2617             # more than one slice, identify which subset of slices to collect from
2618             # may or may not be all of the current slices
2619 1000         930 my @sub_slices;
2620 1000         1099 foreach my $slice (@$slices) {
2621 2000 100       2956 next if $s > $useq->slice_end($slice);
2622 1061 100       1355 next if $e < $useq->slice_start($slice);
2623 1001         1445 push @sub_slices, $slice;
2624             }
2625 1000         1435 push @scores, $useq->_mean_score($s, $e, \@sub_slices);
2626             }
2627             else {
2628 0         0 push @scores, $useq->_mean_score($s, $e, $slices);
2629             }
2630             }
2631             }
2632             else {
2633             # otherwise we collect in one step and associate scores at bp resolution
2634             # collect the scores from each of the requested slices and
2635             # assemble them into a hash of values
2636              
2637             # correlate scores with position
2638 0         0 my %pos2score;
2639 0         0 foreach my $slice (@$slices) {
2640              
2641             # load and unpack the data
2642 0         0 $useq->_load_slice($slice);
2643              
2644             # fetch the overlapping observations
2645             # each observation is [start, stop, score]
2646 0         0 my $result = $useq->{'buffer'}{$slice}->fetch($start - 1, $stop);
2647 0         0 foreach my $r (@$result) {
2648 0         0 foreach my $p ( $r->[0] + 1 .. $r->[1] ) {
2649 0         0 $pos2score{$p} = $r->[2];
2650             }
2651             }
2652             }
2653              
2654             # convert positioned scores into an array
2655 0         0 foreach (my $s = $start; $s <= $stop; $s++) {
2656 0 0       0 push @scores, exists $pos2score{$s} ? $pos2score{$s} : undef;
2657             # for Bio::Graphics it is better to store undef than 0
2658             # which can do wonky things with graphs
2659             }
2660             }
2661              
2662             # generate the wiggle object
2663 1   50     6 my $strand = $useq->slice_strand( $slices->[0] ) || 0;
2664 1 50       11 return Bio::DB::USeq::Wiggle->new(
    50          
2665             -seq_id => $self->seq_id,
2666             -start => $start,
2667             -end => $stop,
2668             -strand => $strand,
2669             -type => $self->type,
2670             -source => $self->source,
2671             -attributes => { 'coverage' => [ \@scores ] },
2672             -name => $strand == 1 ? 'Forward' :
2673             $strand == -1 ? 'Reverse' : q(),
2674             -useq => $useq,
2675             );
2676             }
2677              
2678             sub _next_summary {
2679             # this will return one summary Feature, two if separate strands are requested
2680 2     2   4 my $self = shift;
2681            
2682             # determine which slices to retrieve
2683 2         2 my $slices = shift @{ $self->{slices} };
  2         4  
2684 2 100       6 return unless $slices;
2685            
2686             # all of the real statistical work is done elsewhere
2687             # just return the summary object
2688 1   50     5 my $strand = $self->{useq}->slice_strand( $slices->[0] ) || 0;
2689             return Bio::DB::USeq::Summary->new(
2690             -seq_id => $self->seq_id,
2691             -start => $self->start,
2692             -end => $self->end,
2693             -strand => $strand,
2694             -type => $self->type,
2695             -source => $self->source,
2696             -name => $strand == 1 ? 'Forward' :
2697             $strand == -1 ? 'Reverse' : q(),
2698             -useq => $self->{'useq'},
2699             -slices => $slices,
2700             -bin => $self->{bin},
2701             -step => $self->{step},
2702 1 50       6 );
    50          
2703             }
2704              
2705              
2706              
2707             package Bio::DB::USeq::Wiggle;
2708 1     1   9 use base 'Bio::DB::USeq::Feature';
  1         2  
  1         491  
2709             # Wiggle scores are stored in the coverage attribute for backwards
2710             # compatibility with Bio::Graphics.
2711              
2712             sub new {
2713 1     1   30 my $class = shift;
2714 1         18 my $wig = $class->SUPER::new(@_);
2715 1         126 my %args = @_;
2716 1 50       5 my $useq = $args{'-useq'} or
2717             die "Bio::DB::USeq::Wiggle cannot be created without a -useq argument";
2718 1         3 $wig->{useq} = $useq;
2719 1         18 return $wig;
2720             }
2721              
2722             sub coverage {
2723 1     1   2 my $self = shift;
2724 1         10 my ($coverage) = $self->get_tag_values('coverage');
2725 1 50       85 return wantarray ? @$coverage : $coverage;
2726             }
2727              
2728             sub wiggle {
2729 1     1   116 return shift->coverage;
2730             }
2731              
2732             # Borrowed from Bio::SeqFeature::Coverage from Bio::DB::Sam
2733             sub gff3_string {
2734 0     0   0 my $self = shift;
2735 0         0 my $gff3 = $self->SUPER::gff3_string(@_);
2736 0         0 my $coverage = $self->escape(join(',',$self->coverage));
2737 0         0 $gff3 =~ s/coverage=[^;]+/coverage=$coverage/g;
2738 0         0 return $gff3;
2739             }
2740              
2741             sub statistical_summary {
2742             # this is just for the wiggle scores, not original data
2743             # This is a fake statistical_summary to fool Bio::Graphics into
2744             # calculating chromosome or global statistics like a BigWig adaptor.
2745             # A real statistical_summary call would provide the number of bins,
2746             # so return null if bins is present.
2747             # We could calculate a real statistical summary, but that would be an
2748             # expensive circuitous route after we just collected wiggle scores.
2749             # Better to request the correct feature type in the first place.
2750 0     0   0 my $self = shift;
2751 0         0 my $bins = shift;
2752 0 0 0     0 return if $bins && $bins > 1;
2753            
2754             # initialize the statistical scores
2755 0         0 my $count = 0;
2756 0         0 my $sum;
2757             my $sum_squares;
2758 0         0 my $min;
2759 0         0 my $max;
2760            
2761             # generate a statistical summary of just the wiggle scores
2762 0         0 foreach my $s ($self->coverage) {
2763 0         0 $count++;
2764 0 0       0 next unless defined $s;
2765 0         0 $sum += $s;
2766 0         0 $sum_squares += $s * $s;
2767 0 0 0     0 $min = $s if (!defined $min or $s < $min);
2768 0 0 0     0 $max = $s if (!defined $max or $s > $max);
2769             }
2770            
2771             # return the statistical hash
2772 0   0     0 my %stat = (
      0        
      0        
      0        
2773             'validCount' => $count,
2774             'sumData' => $sum || 0,
2775             'sumSquares' => $sum_squares || 0,
2776             'minVal' => $min || 0,
2777             'maxVal' => $max || 0,
2778             );
2779 0         0 return \%stat;
2780             }
2781              
2782             sub get_seq_stream {
2783 0     0   0 my $self = shift;
2784             return Bio::DB::USeq::Iterator->new(
2785             -seq_id => $self->seq_id,
2786             -start => $self->start,
2787             -end => $self->end,
2788             -strand => $self->strand,
2789             -type => 'region',
2790             -source => $self->source,
2791             -useq => $self->{useq},
2792 0         0 );
2793             }
2794              
2795              
2796              
2797              
2798             package Bio::DB::USeq::Summary;
2799 1     1   6 use base 'Bio::DB::USeq::Feature';
  1         1  
  1         618  
2800              
2801             sub new {
2802 1     1   22 my $class = shift;
2803 1         7 my $summary = $class->SUPER::new(@_);
2804 1         71 my %args = @_;
2805 1 50       4 my $useq = $args{'-useq'} or
2806             die "Bio::DB::USeq::Summary cannot be created without a -useq argument";
2807 1         2 $summary->{useq} = $useq;
2808 1 50       4 $summary->{slices} = $args{-slices} if exists $args{-slices};
2809 1 50       4 $summary->{bin} = $args{-bin} if exists $args{-bin};
2810 1 50       2 $summary->{step} = $args{-step} if exists $args{-step};
2811 1         6 return $summary;
2812             }
2813              
2814             sub statistical_summary {
2815 1     1   107 my $self = shift;
2816 1         2 my $useq = $self->{useq};
2817            
2818             # get the number of bins to calculate the statistical summaries
2819 1         2 my $bin = shift;
2820 1 50 33     5 $bin ||= $self->{bin} if exists $self->{bin};
2821 1   50     3 $bin ||= 1;
2822 1 50       4 my $step = $self->{step} if exists $self->{step};
2823 1   33     11 $step ||= $self->length / $bin;
2824            
2825             # get the slices
2826 1 50       24 my $slices = $self->{slices} if exists $self->{slices};
2827 1 50       3 unless ($slices) {
2828             # this should already be established, but just in case
2829 0         0 my @a = $useq->_translate_coordinates_to_slices($self->seq_id,
2830             $self->start, $self->end, $self->strand);
2831 0         0 $useq->_clear_buffer(\@a);
2832 0         0 $slices = \@a;
2833             }
2834            
2835             # collect the statistical summaries for each bin
2836 1         2 my @summaries;
2837 1         2 for (my $begin = $self->start; $begin < $self->end; $begin += $step) {
2838            
2839             # round off coordinates to integers
2840             # beginning point and step may not be integers
2841 10         93 my $s = int($begin + 0.5);
2842 10         309 my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2843            
2844             # collect the scores from each of the requested slices
2845 10 50       20 if (scalar @$slices > 1) {
2846             # more than one slice, identify which subset of slices to collect from
2847             # may or may not be all of the current slices
2848 10         11 my @sub_slices;
2849 10         17 foreach my $slice (@$slices) {
2850 20 50       29 next if $s > $useq->slice_end($slice);
2851 20 100       30 next if $e < $useq->slice_start($slice);
2852 11         19 push @sub_slices, $slice;
2853             }
2854 10         19 push @summaries, $useq->_stat_summary($s, $e, \@sub_slices);
2855             }
2856             else {
2857 0         0 push @summaries, $useq->_stat_summary($s, $e, $slices);
2858             }
2859             }
2860            
2861             # return the reference to the statistical summaries
2862 1         19 return \@summaries;
2863             }
2864              
2865             sub score {
2866 0     0     my $self = shift;
2867 0           my $a = $self->statistical_summary(1);
2868 0           return $a->[0];
2869             }
2870              
2871             sub gff3_string {
2872             # this is going to be a little convoluted, since what we
2873             # really want here is coverage, which is easier to calculate with means,
2874             # rather than doing statistical summaries and calculate means from those
2875 0     0     my $self = shift;
2876             my ($wig) = $self->{useq}->features(
2877             # this should only return one feature because we have specific strand
2878 0           -seq_id => $self->seq_id,
2879             -start => $self->start,
2880             -end => $self->end,
2881             -strand => $self->strand,
2882             -type => 'coverage:1000',
2883             );
2884 0           return $wig->gff3_string;
2885             }
2886              
2887             sub get_seq_stream {
2888 0     0     my $self = shift;
2889             return Bio::DB::USeq::Iterator->new(
2890             -seq_id => $self->seq_id,
2891             -start => $self->start,
2892             -end => $self->end,
2893             -strand => $self->strand,
2894             -type => 'region',
2895             -source => $self->source,
2896             -useq => $self->{useq},
2897 0           );
2898             }
2899              
2900             =head1 AUTHOR
2901              
2902             Timothy J. Parnell, PhD
2903             Dept of Oncological Sciences
2904             Huntsman Cancer Institute
2905             University of Utah
2906             Salt Lake City, UT, 84112
2907              
2908             This package is free software; you can redistribute it and/or modify
2909             it under the terms of the Artistic License 2.0.