File Coverage

blib/lib/Bio/DB/USeq.pm
Criterion Covered Total %
statement 463 932 49.6
branch 151 328 46.0
condition 39 142 27.4
subroutine 62 111 55.8
pod 37 39 94.8
total 752 1552 48.4


line stmt bran cond sub pod time code
1             package Bio::DB::USeq;
2              
3             our $VERSION = '0.25';
4              
5             =head1 NAME
6              
7             Bio::DB::USeq - Read USeq archive database files
8              
9             =head1 SYNOPSIS
10            
11             use Bio::DB::USeq;
12             my $useq = Bio::DB::USeq->new('file.useq') or
13             die "unable to open file.useq!\n";
14            
15             # sequence IDs
16             my @seq_ids = $useq->seq_ids;
17             my $length = $useq->length($chr); # approximate, not exact
18            
19             ### Retrieving features
20             # all features or observations across chromosome I
21             my @features = $useq->features(
22             -seq_id => 'chrI',
23             -type => 'region',
24             );
25            
26             # same thing using a simple form
27             # use an array of (chromosome, start, stop, strand)
28             my @features = $useq->features('chrI');
29            
30             # same thing with a memory efficient iterator
31             my $iterator = $useq->get_seq_stream(
32             -seq_id => 'chrI',
33             );
34             while (my $f = $iterator->next_seq) {
35             # each feature $f supports most SeqFeatureI methods
36             }
37            
38            
39             ### Retrieving simple scores
40             my @scores = $useq->scores(
41             -seq_id => 'chrI',
42             -start => 1000,
43             -end => 2000
44             );
45            
46            
47             ### Same methods as above after defining an interval first
48             my $segment = $useq->segment(
49             -seq_id => 'chrI',
50             -start => 1000,
51             -end => 2000,
52             );
53             my @scores = $segment->scores;
54             my @features = $segment->features;
55             my $iterator = $segment->get_seq_stream;
56            
57            
58             ### Efficient retrieval of positioned scores in 100 bins
59             # compatible with Bio::Graphics
60             my ($wig) = $useq->features(
61             # assuming unstranded data here, otherwise two wiggle objects
62             # would be returned, one for each strand
63             -seq_id => 'chrI',
64             -start => 1000,
65             -end => 2000,
66             -type => 'wiggle:100',
67             );
68             my @bins = $wig->wiggle;
69             my @bins = $wig->coverage; # same thing
70             my ($bins) = $wig->get_tag_values('coverage'); # same thing
71            
72            
73             ### Statistical summaries of intervals
74             # compatible with Bio::Graphics
75             my ($summary) = $useq->features(
76             # assuming unstranded data here, otherwise two summaries
77             # would be returned, one for each strand
78             -seq_id => 'chrI',
79             -start => 1000,
80             -end => 2000,
81             -type => 'summary',
82             );
83             my $stats = $summary->statistical_summary(10);
84             foreach (@$stats) {
85             my $max = $_->{maxVal};
86             my $mean = Bio::DB::USeq::binMean($_);
87             }
88            
89            
90             ### Stranded data using an iterator
91             # could be used with either wiggle or summary features
92             my $stream = $useq->get_seq_stream(
93             -seq_id => 'chrI',
94             -start => 1000,
95             -end => 2000,
96             -type => 'wiggle:100',
97             );
98             my ($forward, $reverse);
99             if ($useq->stranded) {
100             $forward = $stream->next_seq;
101             $reverse = $stream->next_seq;
102             }
103             else {
104             $forward = $stream->next_seq;
105             }
106            
107              
108             =head1 DESCRIPTION
109              
110             Bio::DB::USeq is a B<BioPerl> style adaptor for reading USeq files. USeq files
111             are compressed, indexed data files supporting modern bioinformatic datasets,
112             including genomic points, scores, and intervals. More information about the
113             USeq software package can be found at L<https://github.com/HuntsmanCancerInstitute/USeq>,
114             including a description of the USeq data
115             L<archive format|https://github.com/HuntsmanCancerInstitute/USeq/blob/master/Documentation/USeqDocumentation/useqArchiveFormat.html>.
116              
117             USeq files are typically half the size of corresponding bigBed and bigWig files,
118             due to a compact internal format and lack of internal zoom data. This adaptor,
119             however, can still return statistics across different zoom levels in the same
120             manner as big files, albeit at a cost of calculating these in realtime.
121              
122             =head2 Generating useq files
123              
124             USeq files may be generated using tools from the USeq package, available at
125             L<https://github.com/HuntsmanCancerInstitute/USeq>. They may be generated from
126             native Bar files, text Wig files, text Bed files, and UCSC bigWig and bigBed file
127             formats.
128              
129             =head2 Compatibility
130              
131             The adaptor follows most conventions of other B<BioPerl>-style Bio::DB
132             adaptors. Observations or features in the useq file archive are
133             returned as SeqFeatureI compatible objects.
134              
135             Coordinates consumed and returned by the adaptor are 1-based, consistent
136             with B<BioPerl> convention. This is not true of the useq file itself, which
137             uses the interbase coordinate system.
138              
139             Unlike wig and bigWig files, useq file archives support stranded data,
140             which can make data collection much simpler for complex experiments.
141              
142             See below for GBrowse compatibility.
143              
144             =head2 Limitations
145              
146             This adaptor is read only. USeq files, in general, are not modified
147             or written with data. The exceptions are chromosome or global statistics
148             are written to the F<archiveReadMe.txt> file inside the zip archive to
149             cache for future queries.
150              
151             No support for genomic sequence is included. Users who need access to
152             genomic sequence should seek an alternative B<BioPerl> adaptor, such as
153             L<Bio::DB::Fasta>.
154              
155             Useq files do not have a native concept of type, primary_tag, or source
156             attributes, as expected with GFF-based database adaptors. The features
157             method does support special types (see below).
158              
159             =head2 Requirements
160              
161             The adaptor is a Perl-only implementation. It only requires the
162             L<Archive::Zip> module for opening and reading the useq file archive.
163             B<BioPerl> is required for working with SeqFeatures objects generated
164             from useq file observations.
165              
166             =head1 METHODS
167              
168             =head2 Initializing the Bio::DB::USeq object
169              
170             These are class methods for creating and working with the Bio::DB::USeq
171             object itself.
172              
173             =over 4
174              
175             =item new
176              
177             =item new($file)
178              
179             =item new(-useq => $file)
180              
181             This will create a new Bio::DB::USeq object. Optionally pass the path
182             to a useq file archive to open immediately. There is not much to do
183             unless you open a file.
184              
185             Named arguments may be used to specify the file. Either -useq or -file
186             may be used.
187              
188             =item open($file)
189              
190             Open a useq file archive. Useq files typically use the F<.useq> file
191             extension. Returns true if successful.
192              
193             DO NOT open a subsequent useq file archive when one has already been
194             opened. Please create a new object to open a new file.
195              
196             =item clone
197              
198             Force the object to re-open the useq archive file under a new file
199             handle to make it clone safe. Do this in the child process before
200             collecting data.
201              
202             =item zip
203              
204             Return the L<Archive::Zip> object representing the useq file archive.
205             Generally not recommended unless you know what you are doing.
206              
207             =back
208              
209             =head2 General information about the useq file
210              
211             These are class methods for obtaining general information or
212             metadata attributes regarding the contents of the file.
213              
214             =over 4
215              
216             =item stranded
217              
218             Return true or false (1 or 0) indicating whether the contents of the
219             useq file archive are recorded in a stranded fashion.
220              
221             =item attributes
222              
223             Return an array of available attribute keys that were recorded in the
224             useq file F<archiveReadMe.txt> member. These are key = value pairs
225             representing metadata for the useq file.
226              
227             =item attribute($key)
228              
229             Return the metadata attribute value for the specified key. These
230             are recorded in the useq file F<archiveReadMe.txt> member. Returns
231             undefined if the key does not exist.
232              
233             =item type
234              
235             Return the useq file metadata C<dataType> value.
236              
237             =item genome
238              
239             Return the useq file metadata C<versionedGenome> value.
240              
241             =item version
242              
243             Return the useq file metadata C<useqArchiveVersion> value.
244              
245             =back
246              
247             =head2 Chromosome Information
248              
249             These are methods to obtain information about the chromosomes or reference
250             sequences represented in the file archive.
251              
252             B<Note> that generating score statistics across one or more chromosomes may
253             be computationally expensive. Therefore, chromosome statistics, once
254             calculated, are cached in the useq file metadata for future reference.
255             This necessitates writing to the useq zip archive. This is currently the
256             only supported method for modifying the useq zip archive.
257              
258             =over 4
259              
260             =item seq_ids
261              
262             Return an array of the chromosome or sequence identifiers represented
263             in the useq file archive. The names are sorted ASCIIbetically before
264             they are returned.
265              
266             =item length($seq_id)
267              
268             Return the length of the provided chromosome or sequence identifier.
269             Note that this may not be the actual length in the reference genome, but
270             rather the last recorded position of an observation for that chromosome.
271             Hence, it should be used only as an approximation.
272              
273             =item chr_mean($seq_id)
274              
275             Return the mean score value across the entire chromosome.
276              
277             =item chr_stdev($seq_id)
278              
279             Return the score standard deviation across the entire chromosome.
280              
281             =item chr_stats($seq_id)
282              
283             Return a statistical summary across the entire chromosome. This
284             is an anonymous hash with five keys: validCount, sumData,
285             sumSquares, minVal, and maxVal. These are equivalent to the
286             statistical summaries generated by the Bio::DB::BigWig adaptor.
287              
288             =item global_mean
289              
290             Return the mean score value across all chromosomes.
291              
292             =item global_stdev
293              
294             Return the mean score value across all chromosomes.
295              
296             =item global_stats
297              
298             Return a statistical summary across all chromosomes.
299              
300             =back
301              
302             =head2 Data accession
303              
304             These are the primary methods for working with data contained in
305             useq file archive. These should be familiar to most B<BioPerl> users.
306              
307             =over 4
308              
309             =item features
310              
311             Returns an array or array reference of SeqFeatureI compatible
312             objects overlapping a given genomic interval.
313              
314             Coordinates of the interrogated regions must be supplied. At a
315             minimum, the seq_id must be supplied. A start of 1 and an end
316             corresponding to the length of the seq_id is used if not directly
317             provided. Coordinates may be specified in two different manners,
318             either as a list of (seq_id, start, end, strand) or as one or
319             more keyed values.
320            
321             my @features = $useq->features($seq_id, $start, $end);
322            
323             @features = $useq->features(
324             -seq_id => $seq_id,
325             -start => $start,
326             -end => $end,
327             );
328              
329             If the -iterator argument is supplied with a true value, then an
330             iterator object is returned. See get_seq_stream() for details.
331              
332             Bio::DB::USeq supports four different feature types. Feature
333             types may be specified using the -type argument.
334              
335             =over 4
336              
337             =item region
338              
339             =item interval
340              
341             =item observation
342              
343             The default feature type if the type argument is not specified.
344             These are SeqFeatureI compatible objects representing observations
345             in the useq file archive. These are compatible with the iterator.
346             SeqFeature observations are returned in genomic order.
347              
348             =item chromosome
349              
350             Returns SeqFeatureI compatible objects representing the
351             reference sequences (chromosomes) listed in the useq file
352             archive. These are not compatibile with the iterator.
353              
354             =item wiggle
355              
356             =item wiggle:$bins
357              
358             =item coverage
359              
360             =item coverage:$bins
361              
362             Returns an array of SeqFeatureI compatible objects for each
363             strand requested. If the useq file contains stranded data,
364             and no strand is requested, then two objects will be
365             returned representing each strand.
366              
367             Each object contains an array representing scores across
368             the requested coordinates. This object is designed to be
369             backwards compatible with coverage features from the
370             L<Bio::DB::Sam> adaptor for use with L<Bio::Graphics> and GBrowse.
371             Note that only scores are returned, not a true depth coverage
372             in the sense of the L<Bio::DB::Sam> coverage types.
373              
374             By default, the wiggle or coverage array is provided at 1
375             bp resolution. To improve efficiency with large regions,
376             the wiggle array may be limited by using a bin option,
377             where the interrogated interval is divided into the number
378             of bins requested.
379              
380             To retrieve the scores, call the wiggle() or coverage() method.
381              
382             For example, to request wiggle scores in 100 equal bins across
383             the interval, see the following example. The wiggle and
384             coverage types are synonymous.
385            
386             my ($wiggle) = $useq->features(
387             -seq_id => $chromosome,
388             -start => $start,
389             -end => $end,
390             -type => 'wiggle:100',
391             );
392             my @scores = $wiggle->wiggle;
393             @scores = $wiggle->coverage;
394              
395             Wiggle objects may also be obtained with a get_seq_stream()
396             iterator objects.
397              
398             =item summary
399              
400             =item summary:$bins
401              
402             Returns an array of SeqFeatureI compatibile Summary objects
403             for each strand requested. If the useq file contains stranded
404             data, and no strand is requested, then two objects will be
405             returned representing each strand.
406              
407             Each Summary object can then be used to call statistical
408             summaries for one or more bins across the interval. Each
409             statistical summary is an anonymous hash with five keys:
410             validCount, sumData, sumSquares, minVal, and maxVal. From
411             these values, a mean and standard deviation may also be
412             calculated.
413            
414             my ($summary) = $useq->features(
415             -seq_id => $chromosome,
416             -start => $start,
417             -end => $end,
418             -type => 'summary',
419             );
420             my @stats = $summary->statistical_summary(100);
421             foreach my $stat (@stats) {
422             my $count = $stat->{validCount};
423             my $sum = $stat->{sumData};
424             my $mean = $sum / $count;
425             }
426              
427             Statistical summaries are equivalent to those generated by the
428             L<Bio::DB::BigWig> adaptor and may be used interchangeably. They
429             are compatible with the L<Bio::Graphics> modules.
430              
431             Summary objects may also be obtained with a get_seq_stream()
432             iterator object.
433              
434             =back
435              
436             =item get_seq_stream
437              
438             This is a memory efficient data accessor. An iterator object is
439             returned for an interval specified using coordinate values in the
440             same manner as features(). Call the method next_seq() on the
441             iterator object to retrieve the observation SeqFeature objects
442             one at a time. The iterator is compatible with region, wiggle,
443             and summary feature types.
444            
445             # establish the iterator object
446             my $iterator = $useq->get_seq_stream(
447             -seq_id => $seq_id,
448             -start => $start,
449             -end => $end,
450             -type => 'region',
451             );
452            
453             # retrieve the features one at a time
454             while (my $f = $iterator->next_seq) {
455             # each feature $f is either a
456             # Bio::DB::USeq::Feature,
457             # a Bio::DB::USeq::Wiggle, or
458             # a Bio::DB::USeq::Summary object
459             }
460              
461             =item scores
462              
463             This is a simplified data accessor that only returns the score
464             values overlapping an interrogated interval, rather than
465             assembling each observation into a SeqFeature object. The scores
466             are not associated with genomic coordinates, and are not guaranteed
467             to be returned in original genomic order.
468              
469             Provide the interval coordinates in the same manner as the
470             features() method. Stranded data collection is supported.
471            
472             my @scores = $useq->scores(
473             -seq_id => $seq_id,
474             -start => $start,
475             -end => $end,
476             );
477              
478             =item observations
479              
480             This is a low-level data accessor, similar to features() but returning
481             an array of references to the original USeq observations. Each USeq
482             observation is an anonymous array reference of 2-4 elements: start, stop,
483             score, and text, depending on the stored data type. All coordinates are in
484             0-based coordinates, unlike high level accessors. Note that while strand is
485             supported, it is not reported for each observation. If observations exist on
486             both strands, then each strand should be searched separately. Observations
487             are not guaranteed to be returned in genomic order.
488            
489             my @observations = $useq->observations(
490             -seq_id => $seq_id,
491             -start => $start,
492             -end => $end,
493             );
494             foreach my $obs (@observations) {
495             my $start = $obs->[0] + 1; # convert to 1-based coordinate
496             my $stop = $obs->[1];
497             my $score = $obs->[2]; # may not be present
498             my $text = $obs->[3]; # may not be present
499             }
500              
501             =item segment
502              
503             This returns a L<Bio::DB::USeq::Segment> object, which is a SeqFeatureI
504             compatible segment object corresponding to the specified coordinates.
505             From this object, one can call the features(), scores(), or
506             get_seq_stream() methods directly. Keyed options or location
507             information need not be provided. Stranded segments are
508             supported.
509            
510             my $segment = $useq->segment(
511             -seq_id => $seq_id,
512             -start => $start,
513             -end => $end,
514             );
515             my @scores = $segment->scores;
516             my @features = $segment->features('wiggle:100');
517             my $iterator = $segment->get_seq_stream('region');
518              
519             =item get_features_by_location
520              
521             Convenience method for returning features restricted by location.
522              
523             =item get_feature_by_id
524              
525             =item get_feature_by_name
526              
527             Compatibility methods for returning a specific feature or
528             observation in the USeq file. Text fields, if present, are not
529             indexed in the USeq file, preventing efficient searching of names.
530             As a workaround, an ID or name comprised of "$seq_id:$start:$end"
531             may be used, although a direct search of coordinates would be
532             more efficient.
533            
534             my $feature = $useq->get_feature_by_id("$seq_id:$start:$end");
535            
536              
537             =back
538              
539             =head1 ADDITIONAL CLASSES
540              
541             These are additional class object that may be returned by various
542             methods above.
543              
544             =head2 Bio::DB::USeq::Feature objects
545              
546             These are SeqFeatureI compliant objects returned by the features()
547             or next_seq() methods. They support the following methods.
548            
549             seq_id
550             start
551             end
552             strand
553             score
554             type
555             source (returns the useq archive filename)
556             name (chromosome:start..stop)
557             Bio::RangeI methods
558              
559             Additionally, chromosome and global statistics are also available
560             from any feature, as well as from Segment, Wiggle, Iterator, and
561             Summary objects. See the corresponding USeq methods for details.
562            
563             =head2 Bio::DB::USeq::Segment objects
564              
565             This is a SeqFeatureI compliant object representing a genomic segment
566             or interval. These support the following methods.
567              
568             =over 4
569              
570             =item features
571              
572             =item features($type)
573              
574             =item get_seq_stream
575              
576             =item get_seq_stream($type)
577              
578             =item scores
579              
580             =item observations
581              
582             Direct methods for returning features or scores. Coordinate information
583             need not be provided. See the corresponding Bio::DB::USeq methods for
584             more information.
585              
586             =item wiggle
587              
588             =item wiggle($bins)
589              
590             =item coverage
591              
592             =item coverage($bins)
593              
594             =item statistical_summary
595              
596             =item statistical_summary($bins)
597              
598             Convenience methods for returning wiggle (coverage) or summary features
599             over the segment. If desired, the number of bins may be specified. See
600             the features() method for more information.
601              
602             =item slices
603              
604             Returns an array of splice member names that overlap this segment.
605             See L<USEQ SLICES> for more information.
606              
607             =back
608              
609             =head2 Bio::DB::USeq::Iterator objects
610              
611             This is an iterator object for retrieving useq observation SeqFeatureI
612             objects in a memory efficient manner by returning one feature at a time.
613              
614             =over 4
615              
616             =item next_seq
617              
618             =item next_feature
619              
620             Returns the next feature present in the interrogated interval. Features
621             are generally returned in ascending coordinate order. Returns undefined
622             when no features are remaining in the interval. Features may include
623             either region or wiggle types, depending on how the iterator object was
624             established. See features() and get_seq_stream() methods for more
625             information.
626              
627             =back
628              
629             =head2 Bio::DB::USeq::Wiggle objects
630              
631             These are SeqFeatureI compliant objects for backwards compatibility with
632             L<Bio::Graphics> and GBrowse. They support the wiggle() and coverage()
633             methods, which returns an array of scores over the interrogated region. By
634             default, the array is equal to the length of the region (1 bp resolution),
635             or may be limited to a specified number of bins for efficiency. See the
636             features() method for more information.
637              
638             =over 4
639              
640             =item wiggle
641              
642             =item coverage
643              
644             The scores are stored as an array in the coverage attribute. For
645             convenience, the wiggle() and coverage() methods may be used to retrieve
646             the array or array reference of scores.
647              
648             =item statistical_summary
649              
650             Generate a statistical summary hash for the collected wiggle scores
651             (not the original data). This method is not entirely that useful; best
652             to use the summary feature type in the first place.
653              
654             =item chromosome and global statistics
655              
656             Chromosome and global statistics, including mean and standard deviation,
657             are available from wiggle objects. See the corresponding USeq methods
658             for details.
659              
660             =back
661              
662             =head2 Bio::DB::USeq::Summary objects
663              
664             These are SeqFeatureI compliant Summary objects, similar to those
665             generated by the L<Bio::DB::BigWig> database adaptor. As such, they are
666             compatible with L<Bio::Graphics> and GBrowse.
667              
668             Summary objects can generate statistical summaries over a specified
669             number of bins (default is 1 bin, or the entire requested region).
670             Each statistical summary is an anonymous hash consisting of five
671             keys: validCount, sumData, sumSquares, minVal, and maxVal. From
672             these values, a mean and standard deviation may be calculated.
673              
674             For convenience, three exported functions are available for calculating
675             the mean and standard deviation from a statistical summary hash.
676             See L<EXPORTED FUNCTIONS> for more information.
677              
678             Use statistical summaries in the following manner.
679            
680             my $stats = $summary->statistical_summary(10);
681             my $stat = shift @$stats;
682             my $min = $stat->{minVal};
683             my $max = $stat->{maxVal};
684             my $mean = $stat->{sumData} / $stat->{validCount};
685              
686             =over 4
687              
688             =item statistical_summary
689              
690             =item statistical_summary($bins)
691              
692             Generate a statistical summary hash for one or more bins across the
693             interrogated region. Provide the number of bins desired. If a feature
694             type of "summary:$bins" is requested through the features() or
695             get_seq_stream() method, then $bins number of bins will be used.
696             The default number of bins is 1.
697              
698             =item score
699              
700             Generate a single statistical summary over the entire region.
701              
702             =item chromosome and global statistics
703              
704             Chromosome and global statistics, including mean and standard deviation,
705             are available from summary objects. See the corresponding USeq methods
706             for details.
707              
708             =back
709              
710             =head1 EXPORTED FUNCTIONS
711              
712             Three subroutine functions are available for export to assist
713             in calculating the mean, variance, and standard deviation from
714             statistical summaries. These functions are identical to those
715             from the L<Bio::DB::BigWig> adaptor and may be used interchangeably.
716              
717             They are not exported by default; they must explicitly listed.
718            
719             use Bio::DB::USeq qw(binMean binStdev);
720             my $stats = $summary->statistical_summary(10);
721             my $stat = shift @$stats;
722             my $mean = binMean($stat);
723             my $stdev = binStdev($stat);
724              
725             =over 4
726              
727             =item binMean( $stat )
728              
729             Calculate the mean from a statistical summary anonymous hash.
730              
731             =item binVariance( $stat )
732              
733             Calculate the variance from a statistical summary anonymous hash.
734              
735             =item binStdev( $stat )
736              
737             Calculate the standard deviation from a statistical summary anonymous hash.
738              
739             =back
740              
741             =head1 USEQ SLICES
742              
743             Genomic observations are recorded in groups, called slices, of
744             usually 10000 observations at a time. Each slice is a separate
745             zip file member in the useq file archive. These methods are for
746             accessing information about each slice. In general, accessing
747             data through slices is a lower level operation. Users should
748             preferentially use the main data accessors.
749              
750             The following are Bio::DB::USeq methods available for working
751             with slices.
752              
753             =over 4
754              
755             =item slices
756              
757             Returns an array of all the slice member names present in the
758             useq archive file.
759              
760             =item slice_feature($slice)
761              
762             Return a L<Bio::DB::USeq::Segment> object representing the slice interval.
763             The features(), get_seq_stream(), and scores() methods are supported.
764              
765             =item slice_seq_id($slice)
766              
767             Return the chromosome or sequence identifier associated with a slice.
768              
769             =item slice_start($slice)
770              
771             Return the start position of the slice interval.
772              
773             =item slice_end($slice)
774              
775             Return the end position of the slice interval.
776              
777             =item slice_strand($slice)
778              
779             Return the strand of the slice interval.
780              
781             =item slice_type($slice)
782              
783             Return the file type of the slice member. This corresponds to the
784             file extension of the slice zip member and indicates how to parse
785             the binary member. Each letter in the type corresponds to a data
786             type, be it integer, short, floating-point, or text. See the
787             useq file documentation for more details.
788              
789             =item slice_obs_number($slice)
790              
791             Return the number of observations recorded in the slice interval.
792              
793             =back
794              
795             =head1 GBROWSE COMPATIBILITY
796              
797             The USeq adaptor has support for L<Bio::Graphics> and GBrowse.
798             It will work with the segments glyph for intervals,
799             the wiggle_xyplot glyph for displaying mean scores, and the
800             wiggle_whiskers glyph for displaying detailed statistics.
801              
802             Initialize the USeq database adaptor.
803            
804             [data1:database]
805             db_adaptor = Bio::DB::USeq
806             db_args = -file /path/to/data1.useq
807              
808             Displaying simple intervals with the segments glyph.
809            
810             [data1_segments]
811             database = data1
812             feature = region
813             glyph = segments
814             stranded = 1
815              
816             Displaying scores using the wiggle_xyplot glyph.
817             You may set the bins to whatever number is appropriate (in
818             this example, 1000), or leave blank (not recommended,
819             defaults to 1 bp resolution).
820            
821             [data1_xyplot]
822             database = data1
823             feature = wiggle:1000
824             glyph = wiggle_xyplot
825             graph_type = histogram
826             autoscale = chromosome
827              
828             Displaying scores using the wiggle_whiskers glyph. Note that
829             generating statistical summaries are computationally more expensive
830             than simple bins of mean values as with the wiggle feature type.
831            
832             [data1_whiskers]
833             database = data1
834             feature = summary
835             glyph = wiggle_whiskers
836             graph_type = histogram
837             autoscale = chromosome
838              
839             =head1 PERFORMANCE
840              
841             Because the Bio::DB::USeq is implemented as a Perl-only module,
842             performance is subject to the limitations of Perl execution itself and
843             the size of the data that needs to be parsed. In general when collecting
844             score data, requesting scores is the fastest mode of operation, followed
845             by wiggle feature types, and finally summary feature types.
846              
847             In comparison to UCSC bigWig files, the USeq format is typically much
848             faster when viewing intervals where the entire interval is represented by
849             one or a few internal slices. This is especially true for repeated queries
850             over the same or neighboring intervals, as the slice contents are retained
851             in memory. As the number of internal slices that must be loaded into memory
852             increases, for example querying intervals of hundreds of kilobases in size,
853             performance will begin to lag as each internal slice must be parsed into
854             memory. This is where the UCSC bigWig file format with internal zoom levels
855             of summary statistics can outperform, at the cost of file complexity and size.
856              
857             =cut
858              
859              
860             require 5.010000;
861 1     1   8795 use strict;
  1         3  
  1         33  
862 1     1   6 use Carp qw(carp cluck croak confess);
  1         2  
  1         74  
863 1     1   672 use Archive::Zip qw( :ERROR_CODES );
  1         95370  
  1         106  
864 1     1   448 use Set::IntervalTree;
  1         8057  
  1         74  
865 1     1   9 use File::Spec;
  1         2  
  1         29  
866              
867 1     1   5 use base 'Exporter';
  1         2  
  1         9010  
868             our @EXPORT_OK = qw(binMean binVariance binStdev);
869              
870             1;
871              
872             #### USeq initialization ####
873              
874             sub new {
875 2     2 1 221 my $class = shift;
876 2         22 my $self = {
877             'name' => undef,
878             'zip' => undef,
879             'stranded' => undef,
880             'seq_ids' => {},
881             'metadata' => {},
882             'coord2file' => {},
883             'file2attribute' => {},
884             'buffer' => {},
885             };
886 2         6 bless $self, $class;
887            
888             # check for arguments
889 2         4 my %args;
890 2 50       8 if (@_) {
891 2 100       13 if ($_[0] =~ /^-/) {
892 1         5 %args = @_;
893             }
894             else {
895 1         15 $args{-file} = shift;
896             }
897             }
898            
899             # open file
900 2   0     7 my $file = $args{-file} || $args{-useq} || undef;
901 2 50       8 if ($file) {
902 2 50       11 return unless ($self->open($file)); # open must return true
903             }
904            
905             # done
906 2         8 return $self;
907             }
908              
909             sub open {
910 2     2 1 4 my $self = shift;
911            
912             # check file
913 2         5 my $filename = shift;
914 2 50       6 unless ($filename) {
915 0         0 cluck("no file name passed!\n");
916 0         0 return;
917             }
918 2 50       11 unless ($filename =~ /\.useq$/i) {
919 0         0 carp "'$filename' is not a .useq archive file!\n";
920 0         0 return;
921             }
922 2 50       9 if ($self->slices) {
923 0         0 cluck "Only load one useq file per object!\n";
924 0         0 return;
925             }
926            
927             # open the archive
928 2         17 my $zip = Archive::Zip->new();
929 2         74 my $error = $zip->read($filename);
930 2 50       3114 unless ($error == AZ_OK) {
931 0         0 carp " unable to read USeq archive '$filename'! Error $error\n";
932 0         0 return;
933             }
934 2         6 $self->{'zip'} = $zip;
935 2         50 (undef, undef, $self->{'name'}) = File::Spec->splitpath($filename);
936            
937             # parse the contents
938 2 50       10 return unless ($self->_parse_members);
939             # we delay parsing metadata unless it is requested
940            
941             # success
942 2         6 return 1;
943             }
944              
945             sub clone {
946 1     1 1 3 my $self = shift;
947 1 50       6 return unless $self->{'zip'};
948 1         4 my $file = $self->zip->fileName;
949 1         17 my $zip = Archive::Zip->new();
950 1         32 my $error = $zip->read($file);
951 1 50       1560 unless ($error == AZ_OK) {
952 0         0 carp " unable to read USeq archive '$file'! Error $error\n";
953 0         0 return;
954             }
955 1         29 $self->{'zip'} = $zip;
956 1         5 return 1;
957             }
958              
959             sub zip {
960 9     9 1 57 return shift->{'zip'};
961             }
962              
963             sub name {
964 4     4 0 40 return shift->{'name'};
965             }
966              
967              
968              
969             #### General USeq information ####
970              
971             sub seq_ids {
972 1     1 1 3 my $self = shift;
973 1         2 return sort {$a cmp $b} keys %{ $self->{'seq_ids'} };
  1         7  
  1         7  
974             }
975              
976             sub length {
977 8     8 1 72 my $self = shift;
978 8 50       25 my $seq_id = shift or return;
979 8 50       34 if (exists $self->{'seq_ids'}{$seq_id}) {
980 8         38 return $self->{'seq_ids'}{$seq_id};
981             }
982             }
983              
984             sub stranded {
985 8     8 1 53 return shift->{'stranded'};
986             }
987              
988             sub attributes {
989 0     0 1 0 my $self = shift;
990 0 0       0 $self->_parse_metadata unless %{ $self->{'metadata'} };
  0         0  
991 0         0 return (sort {$a cmp $b} keys %{ $self->{'metadata'} });
  0         0  
  0         0  
992             }
993              
994             sub attribute {
995 1     1 1 2 my $self = shift;
996 1         3 my $key = shift;
997 1 50       3 return $self->attributes unless $key;
998 1 50       2 $self->_parse_metadata unless %{ $self->{'metadata'} };
  1         7  
999 1 50       4 if (exists $self->{'metadata'}{$key}) {
1000 1         7 return $self->{'metadata'}{$key};
1001             }
1002 0         0 return;
1003             }
1004              
1005             sub type {
1006 1     1 1 107 return shift->attribute('dataType');
1007             }
1008              
1009             sub version {
1010 0     0 1 0 return shift->attribute('useqArchiveVersion');
1011             }
1012              
1013             sub genome {
1014 0     0 1 0 return shift->attribute('versionedGenome');
1015             }
1016              
1017             sub chr_stats {
1018 1     1 1 3 my $self = shift;
1019 1 50       4 my $seq_id = shift or return;
1020 1         3 my $delay_write = shift; # option to delay rewriting the metadata
1021 1 50       1 $self->_parse_metadata unless %{ $self->{'metadata'} };
  1         6  
1022            
1023             # return the chromosome stats
1024 1 50       12 if (exists $self->{metadata}{"chromStats_$seq_id"}) {
1025 0         0 my @data = split(',', $self->{metadata}{"chromStats_$seq_id"});
1026 0         0 my %stat = (
1027             'validCount' => $data[0],
1028             'sumData' => $data[1],
1029             'sumSquares' => $data[2],
1030             'minVal' => $data[3],
1031             'maxVal' => $data[4],
1032             );
1033 0         0 return \%stat;
1034             }
1035            
1036             # chromosome stats must be generated
1037 1         14 my @slices = $self->_translate_coordinates_to_slices(
1038             $seq_id, 1, $self->length($seq_id), 0);
1039 1         5 $self->_clear_buffer(\@slices);
1040 1         9 my $stat = $self->_stat_summary(1, $self->length($seq_id), \@slices);
1041            
1042             # then associate with the metadata
1043 1         7 $self->{'metadata'}{"chromStats_$seq_id"} = join(',', map { $stat->{$_} }
  5         44  
1044             qw(validCount sumData sumSquares minVal maxVal) );
1045 1 50       9 $self->_rewrite_metadata unless $delay_write;
1046            
1047 1         6 return $stat;
1048             }
1049              
1050             sub chr_mean {
1051 1     1 1 142 my $self = shift;
1052 1 50       5 my $seq_id = shift or return;
1053 1         4 return Bio::DB::USeq::binMean( $self->chr_stats($seq_id) );
1054             }
1055              
1056             sub chr_stdev {
1057 0     0 1 0 my $self = shift;
1058 0 0       0 my $seq_id = shift or return;
1059 0         0 return Bio::DB::USeq::binStdev( $self->chr_stats($seq_id) );
1060             }
1061              
1062             sub global_stats {
1063             # this is an expensive proposition, because it must parse through every
1064             # slice in the archive
1065 0     0 1 0 my $self = shift;
1066 0 0       0 $self->_parse_metadata unless %{ $self->{'metadata'} };
  0         0  
1067            
1068             # return the chromosome stats
1069 0 0       0 if (exists $self->{metadata}{globalStats}) {
1070 0         0 my @data = split(',', $self->{metadata}{globalStats});
1071 0         0 my %stat = (
1072             'validCount' => $data[0],
1073             'sumData' => $data[1],
1074             'sumSquares' => $data[2],
1075             'minVal' => $data[3],
1076             'maxVal' => $data[4],
1077             );
1078 0         0 return \%stat;
1079             }
1080            
1081             # calculate new genome-wide statistics from individual chromosome stats
1082 0         0 my $count = 0;
1083 0         0 my $sum;
1084             my $sum_squares;
1085 0         0 my $min;
1086 0         0 my $max;
1087 0         0 foreach my $seq_id ($self->seq_ids) {
1088 0         0 my $stats = $self->chr_stats($seq_id, 1); # delay writing metadata
1089 0         0 $count += $stats->{validCount};
1090 0         0 $sum += $stats->{sumData};
1091 0         0 $sum_squares += $stats->{sumSquares};
1092 0 0 0     0 $min = $stats->{minVal} if (!defined $min or $stats->{minVal} < $min);
1093 0 0 0     0 $max = $stats->{maxVal} if (!defined $max or $stats->{maxVal} < $max);
1094             }
1095            
1096             # assemble the statistical summary hash
1097 0   0     0 my %stat = (
      0        
      0        
      0        
1098             'validCount' => $count,
1099             'sumData' => $sum || 0,
1100             'sumSquares' => $sum_squares || 0,
1101             'minVal' => $min || 0,
1102             'maxVal' => $max || 0,
1103             );
1104            
1105             # update metadata
1106 0         0 $self->{'metadata'}{'globalStats'} = join(',', map { $stat{$_} }
  0         0  
1107             qw(validCount sumData sumSquares minVal maxVal) );
1108 0         0 $self->_rewrite_metadata;
1109            
1110 0         0 return \%stat;
1111             }
1112              
1113             sub global_mean {
1114 0     0 1 0 my $self = shift;
1115 0         0 return Bio::DB::USeq::binMean( $self->global_stats );
1116             }
1117              
1118             sub global_stdev {
1119 0     0 1 0 my $self = shift;
1120 0         0 return Bio::DB::USeq::binStdev( $self->global_stats );
1121             }
1122              
1123             #### slice information ####
1124              
1125             sub slices {
1126 2     2 1 4 my $self = shift;
1127 2         3 return sort {$a cmp $b} keys %{ $self->{'file2attribute'} };
  0         0  
  2         13  
1128             }
1129              
1130             sub slice_seq_id {
1131 0     0 1 0 my $self = shift;
1132 0 0       0 my $slice = shift or return;
1133 0         0 return $self->{'file2attribute'}{$slice}->[0];
1134             }
1135              
1136             sub slice_start {
1137 1087     1087 1 1711 my $self = shift;
1138 1087 50       2189 my $slice = shift or return;
1139 1087         2424 return $self->{'file2attribute'}{$slice}->[1];
1140             }
1141              
1142             sub slice_end {
1143 2020     2020 1 2655 my $self = shift;
1144 2020 50       3759 my $slice = shift or return;
1145 2020         5941 return $self->{'file2attribute'}{$slice}->[2];
1146             }
1147              
1148             sub slice_strand {
1149 4     4 1 9 my $self = shift;
1150 4 50       16 my $slice = shift or return;
1151 4         23 return $self->{'file2attribute'}{$slice}->[3];
1152             }
1153              
1154             sub slice_type {
1155 1020     1020 1 1425 my $self = shift;
1156 1020 50       2040 my $slice = shift or return;
1157 1020         4047 return $self->{'file2attribute'}{$slice}->[4];
1158             }
1159              
1160             sub slice_obs_number {
1161 6     6 1 12 my $self = shift;
1162 6 50       15 my $slice = shift or return;
1163 6         19 return $self->{'file2attribute'}{$slice}->[5];
1164             }
1165              
1166             sub slice_feature {
1167 0     0 1 0 my $self = shift;
1168 0 0       0 my $slice = shift or return;
1169             return Bio::DB::USeq::Segment->new(
1170             -seq_id => $self->{'file2attribute'}{$slice}->[0],
1171             -start => $self->{'file2attribute'}{$slice}->[1],
1172             -stop => $self->{'file2attribute'}{$slice}->[2],
1173             -strand => $self->{'file2attribute'}{$slice}->[3],
1174 0         0 -type => $self->{'file2attribute'}{$slice}->[4],
1175             -source => $self->name,
1176             -name => $slice,
1177             );
1178             }
1179              
1180              
1181              
1182              
1183              
1184             #### Feature and data access ####
1185              
1186             sub segment {
1187 1     1 1 3 my $self = shift;
1188            
1189             # arguments can be chromo, start, stop, strand
1190 1         4 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1191 1 50       4 return unless $self->length($seq_id); # make sure chromosome is represented
1192            
1193 1         12 return Bio::DB::USeq::Segment->new(
1194             -seq_id => $seq_id,
1195             -start => $start,
1196             -end => $stop,
1197             -strand => $strand,
1198             -type => 'segment',
1199             -source => $self->name,
1200             -useq => $self,
1201             );
1202             }
1203              
1204             sub features {
1205 2     2 1 344 my $self = shift;
1206 2         10 my %args = @_;
1207            
1208             # check for type
1209 2         5 my $type;
1210 2   0     7 $args{-type} ||= $args{-types} || $args{-primary_tag} || 'region';
      33        
1211 2 50 33     11 if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1212 0   0     0 $type = $args{-type}->[0] || 'region';
1213 0         0 $args{-type} = $type;
1214             }
1215             else {
1216 2         6 $type = $args{-type};
1217             }
1218            
1219             # return an appropriate feature
1220 2 50       20 if ($type =~ /chromosome/) {
    50          
1221             # compatibility to return chromosome features
1222             my @chromos = map {
1223 0         0 Bio::DB::USeq::Feature->new(
  0         0  
1224             -seq_id => $_,
1225             -start => 1,
1226             -end => $self->length($_),
1227             -type => $type,
1228             -source => $self->name,
1229             -name => $_,
1230             )
1231             } $self->seq_ids;
1232 0 0       0 return wantarray ? @chromos : \@chromos;
1233             }
1234             elsif ($type =~ /region|interval|observation|coverage|wiggle|summary/) {
1235             # region or segment are individual observation features
1236             # coverage or wiggle are for efficient score retrieval with
1237             # backwards compatibility with Bio::Graphics
1238             # summary are statistical summaries akin to Bio::DB::BigFile
1239            
1240             # set up an iterator
1241 2         13 my $iterator = $self->get_seq_stream(%args);
1242 2 50       9 return unless $iterator;
1243            
1244             # if user requested an iterator
1245 2 0 33     6 if (exists $args{-iterator} and $args{-iterator}) {
1246 0         0 return $iterator;
1247             }
1248            
1249             # collect the features
1250 2         4 my @features;
1251 2         7 while (my $f = $iterator->next_seq) {
1252 2         10 push @features, $f;
1253             }
1254 2 50       22 return wantarray ? @features : \@features;
1255             }
1256             else {
1257 0         0 confess "unknown type request '$type'!\n";
1258             }
1259             }
1260              
1261              
1262             sub get_features_by_type {
1263             # does not work without location information
1264 0     0 0 0 cluck "please use features() method instead";
1265 0         0 return;
1266             }
1267              
1268              
1269             sub get_features_by_location {
1270 0     0 1 0 my $self = shift;
1271            
1272             # arguments can be chromo, start, stop, strand
1273 0         0 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1274            
1275 0         0 return $self->features($seq_id, $start, $stop, $strand);
1276             }
1277              
1278              
1279             sub get_feature_by_id {
1280             # much as Bio::DB::BigWig fakes the id, so we will too here
1281             # I don't know how necessary this really is
1282 0     0 1 0 my $self = shift;
1283            
1284             # id will be encoded as chr:start:end ?
1285 0         0 my $id = shift;
1286 0         0 my ($seq_id, $start, $end, $type) = split /:/, $id;
1287            
1288 0   0     0 my @list = $self->features(
1289             -seq_id => $seq_id,
1290             -start => $start,
1291             -end => $end,
1292             -type => $type || undef,
1293             );
1294 0 0       0 return unless @list;
1295 0 0       0 return $list[0] if scalar @list == 1;
1296 0         0 foreach my $f (@list) {
1297 0 0 0     0 return $f if ($f->start == $start and $f->end == $end);
1298             }
1299 0         0 return;
1300             }
1301              
1302              
1303             sub get_feature_by_name {
1304 0     0 1 0 return shift->get_feature_by_id(@_);
1305             }
1306              
1307              
1308             sub get_seq_stream {
1309 3     3 1 95 my $self = shift;
1310            
1311             # arguments can be chromo, start, stop, strand
1312 3         12 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1313 3 50       12 return unless $self->length($seq_id); # make sure chromosome is represented
1314            
1315             # check for type
1316 3         10 my %args = @_;
1317 3         6 my $type;
1318 3   50     18 $args{-type} ||= $args{-types} || $args{-primary_tag} || 'region';
      66        
1319 3 50 33     29 if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1320 0   0     0 $type = $args{-type}->[0] || 'region';
1321             }
1322             else {
1323 3         10 $type = $args{-type};
1324             }
1325            
1326 3         14 return Bio::DB::USeq::Iterator->new(
1327             -seq_id => $seq_id,
1328             -start => $start,
1329             -end => $stop,
1330             -strand => $strand,
1331             -type => $type,
1332             -source => $self->name,
1333             -useq => $self,
1334             );
1335             }
1336              
1337              
1338             sub scores {
1339 1     1 1 59 my $self = shift;
1340            
1341             # arguments can be chromo, start, stop, strand
1342 1         4 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1343 1 50       4 return unless $self->length($seq_id); # make sure chromosome is represented
1344            
1345             # determine which slices to retrieve
1346 1         4 my @slices = $self->_translate_coordinates_to_slices(
1347             $seq_id, $start, $stop, $strand);
1348 1 50       3 return unless @slices;
1349 1         5 $self->_clear_buffer(\@slices);
1350            
1351             # collect the scores from each of the requested slices
1352 1         2 my @scores;
1353 1         3 foreach my $slice (@slices) {
1354            
1355             # load and unpack the data
1356 1         4 $self->_load_slice($slice);
1357            
1358             # find the overlapping observations
1359 1         66 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1360            
1361             # record the scores
1362 1         7 foreach my $r (@$results) {
1363 31 50       65 push @scores, $r->[2] if defined $r->[2];
1364             }
1365             }
1366            
1367 1 50       25 return wantarray ? @scores : \@scores;
1368             }
1369              
1370             sub observations {
1371 0     0 1 0 my $self = shift;
1372            
1373             # arguments can be chromo, start, stop, strand
1374 0         0 my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1375 0 0       0 return unless $self->length($seq_id); # make sure chromosome is represented
1376            
1377             # determine which slices to retrieve
1378 0         0 my @slices = $self->_translate_coordinates_to_slices(
1379             $seq_id, $start, $stop, $strand);
1380 0 0       0 return unless @slices;
1381 0         0 $self->_clear_buffer(\@slices);
1382            
1383             # collect the scores from each of the requested slices
1384 0         0 my @observations;
1385 0         0 foreach my $slice (@slices) {
1386             # load and unpack the data
1387 0         0 $self->_load_slice($slice);
1388            
1389             # find the overlapping observations
1390 0         0 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1391 0         0 push @observations, @$results;
1392             }
1393              
1394 0 0       0 return wantarray ? @observations : \@observations;
1395             }
1396              
1397              
1398              
1399              
1400             #### Private methods ####
1401              
1402             sub _parse_metadata {
1403 1     1   2 my $self = shift;
1404            
1405             # the metadata file should always be present in a USeq file
1406 1         5 my $readMe = $self->{'zip'}->contents('archiveReadMe.txt');
1407 1 50       1366 unless ($readMe) {
1408 0         0 carp " USeq file is malformed! It does not contain an archiveReadMe.txt file\n";
1409 0         0 return;
1410             }
1411            
1412             # parse the archive file
1413             # this is a simple text file, each line is key = value
1414 1         4 $self->{'metadata'}{'comments'} = [];
1415 1         6 foreach (split /\n/, $readMe) {
1416 5 50       15 if (/^#/) {
1417 0 0       0 push @{ $self->{'metadata'}{'comments'} }, $_ unless $_ =~ /validCount/;
  0         0  
1418 0         0 next;
1419             }
1420 5 50       21 if (/^\s*([^=\s]+)\s*=\s*(.+)\s*$/) {
1421             # separate key = value pairs tolerating whitespace
1422             # key may contain anything excluding = and whitespace
1423 5         27 $self->{'metadata'}{$1} = $2;
1424             }
1425             }
1426 1         4 return 1;
1427             }
1428              
1429              
1430             sub _parse_members {
1431 2     2   5 my $self = shift;
1432            
1433             # there is a lot of information encoded in each zip member file name
1434             # the chromosome, strand, coordinates of represented interval, and file type
1435             # this parses and indexes this metadata into a usable format
1436            
1437 2         4 my @errors;
1438 2         8 foreach my $member ($self->{'zip'}->memberNames) {
1439            
1440             # archive readMe
1441 8 100       95 next if ($member eq 'archiveReadMe.txt');
1442            
1443             # data file
1444 6         52 my ($chromo, $strand, $start, $stop, $number, $extension) =
1445             ($member =~ /^([\w\-\.]+)([\+\-\.])(\d+)\-(\d+)\-(\d+)\.(\w+)$/i);
1446            
1447             # check extracted metadata
1448 6 50 33     64 unless ($chromo and $strand and defined $start and defined $stop and
      33        
      33        
      33        
      33        
1449             $number and $extension) {
1450 0         0 push @errors, " data slice $member";
1451 0         0 next;
1452             }
1453            
1454             # check stranded data
1455 6 100       15 unless (defined $self->{'stranded'}) {
1456 2 100       7 if ($strand eq '.') {
1457 1         3 $self->{'stranded'} = 0;
1458             }
1459             else {
1460 1         4 $self->{'stranded'} = 1;
1461             }
1462             }
1463            
1464             # check seq_ids
1465             # record the length for each seq_id, which may or may not be entirely
1466             # accurate, since it is just the last interval position
1467 6 100       14 if (exists $self->{'seq_ids'}{$chromo} ) {
1468 3 50       10 if ($stop > $self->{'seq_ids'}{$chromo}) {
1469 3         7 $self->{'seq_ids'}{$chromo} = $stop;
1470             }
1471             }
1472             else {
1473 3         9 $self->{'seq_ids'}{$chromo} = $stop;
1474             }
1475            
1476             # convert to BioPerl convention
1477 6 50       29 $strand = $strand eq '+' ? 1 : $strand eq '.' ? 0 : $strand eq '-' ? -1 : 0;
    100          
    100          
1478 6         13 $start += 1;
1479            
1480             # store the member details for each member slice
1481 6         24 $self->{'file2attribute'}{$member} =
1482             [$chromo, $start, $stop, $strand, $extension, $number];
1483            
1484             # store the member into a chromosome-strand-specific interval tree
1485 6   66     49 $self->{'coord2file'}{$chromo}{$strand} ||= Set::IntervalTree->new();
1486 6         27 $self->{'coord2file'}{$chromo}{$strand}->insert($member, $start, $stop);
1487             }
1488            
1489             # check parsing
1490 2 50       16 if (@errors) {
1491 0         0 carp "Errors parsing data slice filenames:\n" . join("\n", @errors) . "\n";
1492             }
1493 2 50       5 unless (%{ $self->{'coord2file'} }) {
  2         9  
1494 0         0 carp "no data slices present in USeq archive!\n";
1495 0         0 return;
1496             }
1497            
1498 2         7 return 1;
1499             }
1500              
1501              
1502             sub _get_coordinates {
1503 5     5   9 my $self = shift;
1504            
1505 5         10 my ($seq_id, $start, $stop, $strand);
1506 5 50       26 if ($_[0] =~ /^\-/) {
1507 5         23 my %args = @_;
1508 5   0     19 $seq_id = $args{'-seq_id'} || $args{'-chromo'} || undef;
1509 5   0     16 $start = $args{'-start'} || $args{'-pos'} || undef;
1510 5   0     13 $stop = $args{'-end'} || $args{'-stop'} || undef;
1511 5   50     29 $strand = $args{'-strand'} || '0'; # unstranded
1512             }
1513             else {
1514 0         0 ($seq_id, $start, $stop, $strand) = @_;
1515             }
1516 5 50       11 unless ($seq_id) {
1517 0         0 cluck "no sequence ID provided!";
1518 0         0 return;
1519             }
1520 5   50     14 $start ||= 1;
1521 5   33     27 $stop ||= $self->length($seq_id);
1522 5   50     20 $strand ||= 0;
1523 5         19 return ($seq_id, $start, $stop, $strand);
1524             }
1525              
1526              
1527             sub _translate_coordinates_to_slices {
1528 6     6   13 my $self = shift;
1529 6         17 my ($seq_id, $start, $stop, $strand) = @_;
1530 6 50       29 return unless (exists $self->{'coord2file'}{$seq_id});
1531            
1532             # check strand request
1533 6         11 my $both = 0;
1534 6 50       19 if ($strand == 0) {
1535             # strand was not specified,
1536             # but collect from both strands if we have stranded data
1537 6 100       15 $both = 1 if $self->stranded;
1538             }
1539             else {
1540             # strand was specified,
1541             # but convert it to unstranded if the data is not stranded
1542 0 0       0 $strand = 0 unless $self->stranded;
1543             }
1544            
1545             # look for the overlapping slices
1546 6         13 my @slices;
1547 6 100       17 if ($both) {
1548             # need to collect from both strands
1549             # plus strand first, then minus strand
1550 1         9 my $results = $self->{'coord2file'}{$seq_id}{1}->fetch($start, $stop);
1551 1         5 my $results2 = $self->{'coord2file'}{$seq_id}{-1}->fetch($start, $stop);
1552 1         5 push @slices, @$results, @$results2;
1553             }
1554            
1555             # specific strand
1556             else {
1557 5         56 my $results = $self->{'coord2file'}{$seq_id}{$strand}->fetch($start, $stop);
1558 5         16 push @slices, @$results;
1559             }
1560            
1561 6         22 return @slices;
1562             }
1563              
1564              
1565             sub _clear_buffer {
1566 6     6   11 my $self = shift;
1567            
1568             # make a quick hash of wanted slices
1569 6         12 my %wanted = map {$_ => 1} @{$_[0]};
  9         31  
  6         25  
1570            
1571             # delete the existing buffers of slices we do not want
1572 6         11 foreach (keys %{ $self->{buffer} }) {
  6         28  
1573 5 100       8151 delete $self->{buffer}{$_} unless exists $wanted{$_};
1574             }
1575             }
1576              
1577             sub _load_slice {
1578 1016     1016   1396 my $self = shift;
1579 1016         1507 my $slice = shift;
1580 1016 100       2379 return if (exists $self->{'buffer'}{$slice});
1581            
1582             # each slice is parsed into observations consisting of start, stop, and
1583             # optionally score and text depending on type
1584             # these are stored as anonymous arrays [start, stop, score, text]
1585             # for quick retrieval each observation interval is stored in a Set::IntervalTree
1586             # these operations maintain the original 0-based coordinate system
1587            
1588 7         22 my $type = $self->slice_type($slice);
1589 7 50       59 if ($type eq 's') { $self->_load_s_slice($slice) }
  0 50       0  
    50          
    50          
    100          
    50          
    50          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
1590 0         0 elsif ($type eq 'sf') { $self->_load_sf_slice($slice) }
1591 0         0 elsif ($type eq 'st') { $self->_load_st_slice($slice) }
1592 0         0 elsif ($type eq 'ss') { $self->_load_ss_slice($slice) }
1593 6         39 elsif ($type eq 'ssf') { $self->_load_ssf_slice($slice) }
1594 0         0 elsif ($type eq 'sst') { $self->_load_sst_slice($slice) }
1595 1         5 elsif ($type eq 'ssft') { $self->_load_ssft_slice($slice) }
1596 0         0 elsif ($type eq 'i') { $self->_load_i_slice($slice) }
1597 0         0 elsif ($type eq 'if') { $self->_load_if_slice($slice) }
1598 0         0 elsif ($type eq 'it') { $self->_load_it_slice($slice) }
1599 0         0 elsif ($type eq 'ii') { $self->_load_ii_slice($slice) }
1600 0         0 elsif ($type eq 'iif') { $self->_load_iif_slice($slice) }
1601 0         0 elsif ($type eq 'iit') { $self->_load_iit_slice($slice) }
1602 0         0 elsif ($type eq 'iift') { $self->_load_iift_slice($slice) }
1603 0         0 elsif ($type eq 'is') { $self->_load_is_slice($slice) }
1604 0         0 elsif ($type eq 'isf') { $self->_load_isf_slice($slice) }
1605 0         0 elsif ($type eq 'ist') { $self->_load_ist_slice($slice) }
1606 0         0 elsif ($type eq 'isft') { $self->_load_isft_slice($slice) }
1607             else {
1608 0         0 croak "unable to load slice '$slice'! Unsupported slice type $type\n";
1609             }
1610             }
1611              
1612              
1613             sub _load_s_slice {
1614 0     0   0 my $self = shift;
1615 0         0 my $slice = shift;
1616 0         0 my $tree = Set::IntervalTree->new();
1617            
1618             # unpack the data slice zip member
1619 0         0 my $number = $self->slice_obs_number($slice);
1620 0         0 my ($null, @data) = unpack("si>(s>)$number", $self->zip->contents($slice) );
1621            
1622             # convert the unpacked data into start, stop, score
1623             # first observation
1624 0         0 my $position = shift @data;
1625 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1626            
1627             # remaining observations
1628 0         0 while (@data) {
1629 0         0 $position += (shift @data) + 32768;
1630 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1631             }
1632            
1633             # store Interval Tree buffer
1634 0         0 $self->{buffer}{$slice} = $tree;
1635             }
1636              
1637             sub _load_sf_slice {
1638 0     0   0 my $self = shift;
1639 0         0 my $slice = shift;
1640 0         0 my $tree = Set::IntervalTree->new();
1641            
1642             # unpack the data slice zip member
1643 0         0 my $number = $self->slice_obs_number($slice);
1644 0         0 my ($null, @data) = unpack("si>f>(s>f>)$number", $self->zip->contents($slice) );
1645            
1646             # convert the unpacked data into start, stop, score
1647             # first observation
1648 0         0 my $position = shift @data;
1649 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1650            
1651             # remaining observations
1652 0         0 while (@data) {
1653 0         0 $position += (shift @data) + 32768;
1654 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1655             }
1656            
1657             # store Interval Tree buffer
1658 0         0 $self->{buffer}{$slice} = $tree;
1659             }
1660              
1661             sub _load_st_slice {
1662 0     0   0 my $self = shift;
1663 0         0 my $slice = shift;
1664 0         0 my $tree = Set::IntervalTree->new();
1665            
1666             # convert the unpacked data into start, stop, (score), text
1667            
1668             # load the slice contents
1669 0         0 my $contents = $self->zip->contents($slice);
1670            
1671             # first observation
1672 0         0 my $i = 8; # go ahead and set index up to first observation text
1673 0         0 my (undef, $position, $t) = unpack('si>s>', substr($contents, 0, $i));
1674             # initial null, position, text_length
1675 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1676 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1677 0         0 $i += $t;
1678            
1679             # remaining observations
1680 0         0 my $p; # position offset
1681 0         0 my $len = CORE::length($contents);
1682 0         0 while ($i < $len) {
1683 0         0 my ($p, $t) = unpack('s>s>', substr($contents, $i, 4));
1684 0         0 $i += 4;
1685 0         0 $position += $p + 32768;
1686 0         0 $text = unpack("A$t", substr($contents, $i, $t));
1687 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1688 0         0 $i += $t;
1689             }
1690            
1691             # store Interval Tree buffer
1692 0         0 $self->{buffer}{$slice} = $tree;
1693             }
1694              
1695             sub _load_ss_slice {
1696 0     0   0 my $self = shift;
1697 0         0 my $slice = shift;
1698 0         0 my $tree = Set::IntervalTree->new();
1699            
1700             # unpack the data slice zip member
1701 0         0 my $number = $self->slice_obs_number($slice);
1702 0         0 my ($null, @data) = unpack("si>s>(s>s>)$number", $self->zip->contents($slice) );
1703            
1704             # convert the unpacked data into start, stop, no score
1705             # first observation
1706 0         0 my $position = @data;
1707 0         0 my $position2 = $position + (shift @data) + 32768;
1708 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1709            
1710             # remaining observations
1711 0         0 while (@data) {
1712 0         0 $position += (shift @data) + 32768;
1713 0         0 $position2 = $position + (shift @data) + 32768;
1714 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1715             }
1716            
1717             # store Interval Tree buffer
1718 0         0 $self->{buffer}{$slice} = $tree;
1719             }
1720              
1721             sub _load_ssf_slice {
1722 6     6   15 my $self = shift;
1723 6         11 my $slice = shift;
1724 6         57 my $tree = Set::IntervalTree->new();
1725            
1726             # unpack the data slice zip member
1727 6         21 my $number = $self->slice_obs_number($slice);
1728 6         31 my ($null, @data) = unpack("si>s>f>(s>s>f>)$number", $self->zip->contents($slice) );
1729            
1730             # convert the unpacked data into start, stop, score
1731             # first observation
1732 6         35114 my $position = (shift @data);
1733 6         25 my $position2 = $position + (shift @data) + 32768;
1734 6         58 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1735            
1736             # remaining observations
1737 6         28 while (@data) {
1738 58298         83341 $position += (shift @data) + 32768;
1739 58298         77348 $position2 = $position + (shift @data) + 32768;
1740 58298         175094 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1741             }
1742            
1743             # store Interval Tree buffer
1744 6         51 $self->{buffer}{$slice} = $tree;
1745             }
1746              
1747             sub _load_sst_slice {
1748 0     0   0 my $self = shift;
1749 0         0 my $slice = shift;
1750 0         0 my $tree = Set::IntervalTree->new();
1751 0         0 my $contents = $self->zip->contents($slice);
1752            
1753             # first observation
1754 0         0 my $i = 10; # go ahead and set index up to first observation text
1755 0         0 my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
1756             # initial null, position, length, text_length
1757 0         0 my $position2 = $position + $l + 32768;
1758 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1759 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
1760 0         0 $i += $t;
1761            
1762             # remaining observations
1763 0         0 my $p; # position offset
1764 0         0 my $len = CORE::length($contents);
1765 0         0 while ($i < $len) {
1766 0         0 ($p, $l, $t) = unpack('s>s>s>', substr($contents, $i, 6));
1767 0         0 $i += 6;
1768 0         0 $position += $p + 32768;
1769 0         0 $position2 = $position + $l + 32768;
1770 0         0 $text = unpack("A$t", substr($contents, $i, $t));
1771 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
1772 0         0 $i += $t;
1773             }
1774            
1775             # store Interval Tree buffer
1776 0         0 $self->{buffer}{$slice} = $tree;
1777             }
1778              
1779             sub _load_ssft_slice {
1780 1     1   2 my $self = shift;
1781 1         2 my $slice = shift;
1782 1         5 my $tree = Set::IntervalTree->new();
1783 1         4 my $contents = $self->zip->contents($slice);
1784            
1785             # first observation
1786 1         4361 my $i = 14; # go ahead and set index up to first observation text
1787 1         7 my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
1788             # initial null, position, length, score, text_length
1789 1         4 my $position2 = $position + $l + 32768;
1790 1         7 my $text = unpack("A$t", substr($contents, $i, $t));
1791 1         7 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
1792 1         1 $i += $t;
1793            
1794             # remaining observations
1795 1         2 my $p; # position offset
1796 1         3 my $len = CORE::length($contents);
1797 1         4 while ($i < $len) {
1798 10000         24967 ($p, $l, $s, $t) = unpack('s>s>f>s>', substr($contents, $i, 10));
1799 10000         15473 $i += 10;
1800 10000         12970 $position += $p + 32768;
1801 10000         12770 $position2 = $position + $l + 32768;
1802 10000         22424 $text = unpack("A$t", substr($contents, $i, $t));
1803 10000         36496 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
1804 10000         18421 $i += $t;
1805             }
1806            
1807             # store Interval Tree buffer
1808 1         10 $self->{buffer}{$slice} = $tree;
1809             }
1810              
1811             sub _load_i_slice {
1812 0     0   0 my $self = shift;
1813 0         0 my $slice = shift;
1814 0         0 my $tree = Set::IntervalTree->new();
1815            
1816             # unpack the data slice zip member
1817 0         0 my $number = $self->slice_obs_number($slice);
1818 0         0 my ($null, @data) = unpack("si>(i>)$number", $self->zip->contents($slice) );
1819            
1820             # convert the unpacked data into start, stop, score
1821             # first observation
1822 0         0 my $position = @data;
1823 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1824            
1825             # remaining observations
1826 0         0 while (@data) {
1827 0         0 $position += (shift @data);
1828 0         0 $tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1829             }
1830            
1831             # store Interval Tree buffer
1832 0         0 $self->{buffer}{$slice} = $tree;
1833             }
1834              
1835             sub _load_if_slice {
1836 0     0   0 my $self = shift;
1837 0         0 my $slice = shift;
1838 0         0 my $tree = Set::IntervalTree->new();
1839            
1840             # unpack the data slice zip member
1841 0         0 my $number = $self->slice_obs_number($slice);
1842 0         0 my ($null, @data) = unpack("si>f>(i>f>)$number", $self->zip->contents($slice) );
1843            
1844             # convert the unpacked data into start, stop, score
1845             # first observation
1846 0         0 my $position = @data;
1847 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1848            
1849             # remaining observations
1850 0         0 while (@data) {
1851 0         0 $position += (shift @data);
1852 0         0 $tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1853             }
1854            
1855             # store Interval Tree buffer
1856 0         0 $self->{buffer}{$slice} = $tree;
1857             }
1858              
1859             sub _load_it_slice {
1860 0     0   0 my $self = shift;
1861 0         0 my $slice = shift;
1862 0         0 my $tree = Set::IntervalTree->new();
1863            
1864             # convert the unpacked data into start, stop, (score), text
1865            
1866             # load the slice contents
1867 0         0 my $contents = $self->zip->contents($slice);
1868            
1869             # first observation
1870 0         0 my $i = 10; # go ahead and set index up to first observation text
1871 0         0 my (undef, $position, $t) = unpack('si>i>', substr($contents, 0, $i));
1872             # initial null, position, text_length
1873 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1874 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1875 0         0 $i += $t;
1876            
1877             # remaining observations
1878 0         0 my $p; # position offset
1879 0         0 my $len = CORE::length($contents);
1880 0         0 while ($i < $len) {
1881 0         0 ($p, $t) = unpack('i>s>', substr($contents, $i, 6));
1882 0         0 $i += 6;
1883 0         0 $position += $p + 32768;
1884 0         0 $text = unpack("A$t", substr($contents, $i, $t));
1885 0         0 $tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1886 0         0 $i += $t;
1887             }
1888            
1889             # store Interval Tree buffer
1890 0         0 $self->{buffer}{$slice} = $tree;
1891             }
1892              
1893             sub _load_ii_slice {
1894 0     0   0 my $self = shift;
1895 0         0 my $slice = shift;
1896 0         0 my $tree = Set::IntervalTree->new();
1897            
1898             # unpack the data slice zip member
1899 0         0 my $number = $self->slice_obs_number($slice);
1900 0         0 my ($null, @data) = unpack("si>i>(i>i>)$number", $self->zip->contents($slice) );
1901            
1902             # convert the unpacked data into start, stop, score
1903             # first observation
1904 0         0 my $position = @data;
1905 0         0 my $position2 = $position + (shift @data);
1906 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1907            
1908             # remaining observations
1909 0         0 while (@data) {
1910 0         0 $position += (shift @data);
1911 0         0 $position2 = $position + (shift @data);
1912 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
1913             }
1914            
1915             # store Interval Tree buffer
1916 0         0 $self->{buffer}{$slice} = $tree;
1917             }
1918              
1919             sub _load_iif_slice {
1920 0     0   0 my $self = shift;
1921 0         0 my $slice = shift;
1922 0         0 my $tree = Set::IntervalTree->new();
1923            
1924             # unpack the data slice zip member
1925 0         0 my $number = $self->slice_obs_number($slice);
1926 0         0 my ($null, @data) = unpack("si>i>f>(i>i>f>)$number", $self->zip->contents($slice) );
1927            
1928             # convert the unpacked data into start, stop, score
1929             # first observation
1930 0         0 my $position = @data;
1931 0         0 my $position2 = $position + (shift @data);
1932 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1933            
1934             # remaining observations
1935 0         0 while (@data) {
1936 0         0 $position += (shift @data);
1937 0         0 $position2 = $position + (shift @data);
1938 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1939             }
1940            
1941             # store Interval Tree buffer
1942 0         0 $self->{buffer}{$slice} = $tree;
1943             }
1944              
1945             sub _load_iit_slice {
1946 0     0   0 my $self = shift;
1947 0         0 my $slice = shift;
1948 0         0 my $tree = Set::IntervalTree->new();
1949 0         0 my $contents = $self->zip->contents($slice);
1950            
1951             # first observation
1952 0         0 my $i = 12; # go ahead and set index up to first observation text
1953 0         0 my (undef, $position, $l, $t) = unpack('si>i>s>', substr($contents, 0, $i));
1954             # initial null, position, length, text_length
1955 0         0 my $position2 = $position + $l;
1956 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1957 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
1958 0         0 $i += $t;
1959            
1960             # remaining observations
1961 0         0 my $p; # position offset
1962 0         0 my $len = CORE::length($contents);
1963 0         0 while ($i < $len) {
1964 0         0 ($p, $l, $t) = unpack('i>i>s>', substr($contents, $i, 10));
1965 0         0 $i += 10;
1966 0         0 $position += $p;
1967 0         0 $position2 = $position + $l;
1968 0         0 $text = unpack("A$t", substr($contents, $i, $t));
1969 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
1970 0         0 $i += $t;
1971             }
1972            
1973             # store Interval Tree buffer
1974 0         0 $self->{buffer}{$slice} = $tree;
1975             }
1976              
1977             sub _load_iift_slice {
1978 0     0   0 my $self = shift;
1979 0         0 my $slice = shift;
1980 0         0 my $tree = Set::IntervalTree->new();
1981 0         0 my $contents = $self->zip->contents($slice);
1982            
1983             # first observation
1984 0         0 my $i = 16; # go ahead and set index up to first observation text
1985 0         0 my (undef, $position, $l, $s, $t) = unpack('si>i>f>s>', substr($contents, 0, $i));
1986             # initial null, position, length, score, text_length
1987 0         0 my $position2 = $position + $l;
1988 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
1989 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
1990 0         0 $i += $t;
1991            
1992             # remaining observations
1993 0         0 my $p; # position offset
1994 0         0 my $len = CORE::length($contents);
1995 0         0 while ($i < $len) {
1996 0         0 ($p, $l, $s, $t) = unpack('i>i>f>s>', substr($contents, $i, 14));
1997 0         0 $i += 14;
1998 0         0 $position += $p;
1999 0         0 $position2 = $position + $l;
2000 0         0 $text = unpack("A$t", substr($contents, $i, $t));
2001 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
2002 0         0 $i += $t;
2003             }
2004            
2005             # store Interval Tree buffer
2006 0         0 $self->{buffer}{$slice} = $tree;
2007             }
2008              
2009             sub _load_is_slice {
2010 0     0   0 my $self = shift;
2011 0         0 my $tree = Set::IntervalTree->new();
2012 0         0 my $slice = shift;
2013            
2014             # unpack the data slice zip member
2015 0         0 my $number = $self->slice_obs_number($slice);
2016 0         0 my ($null, @data) = unpack("si>s>(i>s>)$number", $self->zip->contents($slice) );
2017            
2018             # convert the unpacked data into start, stop, score
2019             # first observation
2020 0         0 my $position = @data;
2021 0         0 my $position2 = $position + (shift @data) + 32768;
2022 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
2023            
2024             # remaining observations
2025 0         0 while (@data) {
2026 0         0 $position += (shift @data);
2027 0         0 $position2 = $position + (shift @data) + 32768;
2028 0         0 $tree->insert( [$position, $position2, undef], $position, $position2);
2029             }
2030            
2031             # store Interval Tree buffer
2032 0         0 $self->{buffer}{$slice} = $tree;
2033             }
2034              
2035             sub _load_isf_slice {
2036 0     0   0 my $self = shift;
2037 0         0 my $slice = shift;
2038 0         0 my $tree = Set::IntervalTree->new();
2039            
2040             # unpack the data slice zip member
2041 0         0 my $number = $self->slice_obs_number($slice);
2042 0         0 my ($null, @data) = unpack("si>s>f>(i>s>f>)$number", $self->zip->contents($slice) );
2043            
2044             # convert the unpacked data into start, stop, score
2045             # first observation
2046 0         0 my $position = @data;
2047 0         0 my $position2 = $position + (shift @data) + 32768;
2048 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2049            
2050             # remaining observations
2051 0         0 while (@data) {
2052 0         0 $position += (shift @data);
2053 0         0 $position2 = $position + (shift @data) + 32768;
2054 0         0 $tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2055             }
2056            
2057             # store Interval Tree buffer
2058 0         0 $self->{buffer}{$slice} = $tree;
2059             }
2060              
2061             sub _load_ist_slice {
2062 0     0   0 my $self = shift;
2063 0         0 my $slice = shift;
2064 0         0 my $tree = Set::IntervalTree->new();
2065 0         0 my $contents = $self->zip->contents($slice);
2066            
2067             # first observation
2068 0         0 my $i = 10; # go ahead and set index up to first observation text
2069 0         0 my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
2070             # initial null, position, length, text_length
2071 0         0 my $position2 = $position + $l + 32768;
2072 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
2073 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
2074 0         0 $i += $t;
2075            
2076             # remaining observations
2077 0         0 my $p; # position offset
2078 0         0 my $len = CORE::length($contents);
2079 0         0 while ($i < $len) {
2080 0         0 ($p, $l, $t) = unpack('i>s>s>', substr($contents, $i, 8));
2081 0         0 $i += 8;
2082 0         0 $position += $p;
2083 0         0 $position2 = $position + $l + 32768;
2084 0         0 $text = unpack("A$t", substr($contents, $i, $t));
2085 0         0 $tree->insert( [$position, $position2, undef, $text], $position, $position2);
2086 0         0 $i += $t;
2087             }
2088            
2089             # store Interval Tree buffer
2090 0         0 $self->{buffer}{$slice} = $tree;
2091             }
2092              
2093             sub _load_isft_slice {
2094 0     0   0 my $self = shift;
2095 0         0 my $slice = shift;
2096 0         0 my $tree = Set::IntervalTree->new();
2097 0         0 my $contents = $self->zip->contents($slice);
2098            
2099             # first observation
2100 0         0 my $i = 14; # go ahead and set index up to first observation text
2101 0         0 my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
2102             # initial null, position, length, score, text_length
2103 0         0 my $position2 = $position + $l + 32768;
2104 0         0 my $text = unpack("A$t", substr($contents, $i, $t));
2105 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
2106 0         0 $i += $t;
2107            
2108             # remaining observations
2109 0         0 my $p; # position offset
2110 0         0 while ($i < CORE::length($contents)) {
2111 0         0 ($p, $l, $s, $t) = unpack('i>s>f>s>', substr($contents, $i, 12));
2112 0         0 $i += 12;
2113 0         0 $position += $p;
2114 0         0 $position2 = $position + $l + 32768;
2115 0         0 $text = unpack("A$t", substr($contents, $i, $t));
2116 0         0 $tree->insert( [$position, $position2, $s, $text], $position, $position2);
2117 0         0 $i += $t;
2118             }
2119            
2120             # store Interval Tree buffer
2121 0         0 $self->{buffer}{$slice} = $tree;
2122             }
2123              
2124             sub _mean_score {
2125 1000     1000   1383 my $self = shift;
2126 1000         1864 my ($start, $stop, $slices) = @_;
2127 1000 50       1771 return unless @$slices;
2128            
2129             # collect the scores from each of the requested slices
2130 1000         1259 my $sum;
2131 1000         1481 my $count = 0;
2132 1000         1482 foreach my $slice (@$slices) {
2133 1001 50       1773 next unless $self->slice_type($slice) =~ /f/;
2134             # no sense going through if no score present
2135            
2136             # load and unpack the data
2137 1001         2672 $self->_load_slice($slice);
2138            
2139             # find the overlapping observations and record values
2140 1001         111068 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2141 1001         2748 foreach my $r (@$results) {
2142 2276   100     5843 $sum += $r->[2] || 0;
2143 2276         4221 $count++;
2144             }
2145             }
2146 1000 50       5129 return $count ? $sum / $count : undef;
2147             }
2148              
2149             sub _stat_summary {
2150 11     11   20 my $self = shift;
2151 11         30 my ($start, $stop, $slices) = @_;
2152 11 50       27 return unless @$slices;
2153            
2154             # initialize the statistical scores
2155 11         19 my $count = 0;
2156 11         30 my $sum;
2157             my $sum_squares;
2158 11         0 my $min;
2159 11         0 my $max;
2160            
2161             # collect the scores from each of the requested slices
2162 11         20 foreach my $slice (@$slices) {
2163 12 50       45 next unless $self->slice_type($slice) =~ /f/;
2164             # no sense going through if no score present
2165            
2166             # load and unpack the data
2167 12         44 $self->_load_slice($slice);
2168            
2169             # find the overlapping observations
2170 12         3675 my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2171 12         51 foreach my $r (@$results) {
2172             # each observation is [start, stop, score]
2173 21145 50       36885 if (defined $r->[2]) {
2174 21145         25897 $count++;
2175 21145         26920 $sum += $r->[2];
2176 21145         27491 $sum_squares += ($r->[2] * $r->[2]);
2177 21145 100 100     56032 $min = $r->[2] if (!defined $min or $r->[2] < $min);
2178 21145 100 100     57879 $max = $r->[2] if (!defined $max or $r->[2] > $max);
2179             }
2180             }
2181             }
2182            
2183             # assemble the statistical summary hash
2184 11   50     159 my %summary = (
      50        
      50        
      50        
2185             'validCount' => $count,
2186             'sumData' => $sum || 0,
2187             'sumSquares' => $sum_squares || 0,
2188             'minVal' => $min || 0,
2189             'maxVal' => $max || 0,
2190             );
2191 11         110 return \%summary;
2192             }
2193              
2194             sub _rewrite_metadata {
2195 1     1   3 my $self = shift;
2196 1 50       4 return unless (-w $self->zip->fileName);
2197 1         58 my $md = $self->{'metadata'};
2198            
2199             # generate new metadata as an array
2200 1         3 my @new_md;
2201 1         7 push @new_md, "useqArchiveVersion = " . $md->{useqArchiveVersion};
2202 1 50       4 push @new_md, @{ $md->{comments} } if exists $md->{comments};
  1         6  
2203 1         4 push @new_md, "dataType = " . $md->{dataType};
2204 1         3 push @new_md, "versionedGenome = " . $md->{versionedGenome};
2205            
2206             # additional keys that may be present
2207 1         8 foreach (keys %$md) {
2208 7 100       40 next if /useqArchiveVersion|dataType|versionedGenome|comments|chromStats|globalStats/;
2209 2         8 push @new_md, "$_ = $md->{$_}";
2210             }
2211            
2212             # global and chromosome stats
2213 1         3 push @new_md,
2214             "# Bio::DB::USeq statistics validCount,sumData,sumSquares,minVal,maxVal";
2215 1 50       5 push @new_md, "globalStats = $md->{globalStats}" if exists $md->{globalStats};
2216 1         8 foreach (sort {$a cmp $b} keys %$md) {
  13         22  
2217 7 100       22 push @new_md, "$_ = $md->{$_}" if /chromStats/;
2218             }
2219            
2220             # write the new metadata to the zip Archive
2221 1         11 $self->{'zip'}->contents('archiveReadMe.txt', join("\n", @new_md));
2222 1         713 $self->{'zip'}->overwrite;
2223 1         8302 $self->clone; # not sure if this is necessary, but just in case we reopen the zip
2224             }
2225              
2226              
2227             ######## Exported Functions ########
2228             # these are borrowed from Bio::DB::BigWig from Lincoln Stein
2229              
2230             sub binMean {
2231 2     2 1 126 my $score = shift;
2232 2 50       9 return 0 unless $score->{validCount};
2233 2         11 $score->{sumData}/$score->{validCount};
2234             }
2235              
2236             sub binVariance {
2237 1     1 1 2 my $score = shift;
2238 1 50       5 return 0 unless $score->{validCount};
2239 1         8 my $var = $score->{sumSquares} - $score->{sumData}**2/$score->{validCount};
2240 1 50       3 if ($score->{validCount} > 1) {
2241 1         5 $var /= $score->{validCount}-1;
2242             }
2243 1 50       4 return 0 if $var < 0;
2244 1         4 return $var;
2245             }
2246              
2247             sub binStdev {
2248 1     1 1 5 my $score = shift;
2249 1         5 return sqrt(binVariance($score));
2250             }
2251              
2252              
2253              
2254              
2255             ######## Other Classes #############
2256              
2257             package Bio::DB::USeq::Feature;
2258 1     1   14 use base 'Bio::SeqFeature::Lite';
  1         3  
  1         702  
2259              
2260             sub new {
2261 13     13   42 my $class = shift;
2262 13         60 return $class->SUPER::new(@_);
2263             }
2264              
2265             sub type {
2266             # Bio::SeqFeature::Lite mangles the type and returns
2267             # primary_tag:source if both are set
2268             # this may wreck havoc with parsers when the type already has a :
2269             # as in wiggle:1000
2270 16     16   161 my $self = shift;
2271 16         105 return $self->{type};
2272             }
2273              
2274             sub chr_stats {
2275 0     0   0 my $self = shift;
2276 0 0       0 return unless exists $self->{useq};
2277 0         0 return $self->{useq}->chr_stats( $self->seq_id );
2278             }
2279              
2280             sub chr_mean {
2281 0     0   0 my $self = shift;
2282 0 0       0 return unless exists $self->{useq};
2283 0         0 return $self->{useq}->chr_mean( $self->seq_id );
2284             }
2285              
2286             sub chr_stdev {
2287 0     0   0 my $self = shift;
2288 0 0       0 return unless exists $self->{useq};
2289 0         0 return $self->{useq}->chr_stdev( $self->seq_id );
2290             }
2291              
2292             sub global_stats {
2293 0     0   0 my $self = shift;
2294 0 0       0 return unless exists $self->{useq};
2295 0         0 return $self->{useq}->global_stats( $self->seq_id );
2296             }
2297              
2298             sub global_mean {
2299 0     0   0 my $self = shift;
2300 0 0       0 return unless exists $self->{useq};
2301 0         0 return $self->{useq}->global_mean( $self->seq_id );
2302             }
2303              
2304             sub global_stdev {
2305 0     0   0 my $self = shift;
2306 0 0       0 return unless exists $self->{useq};
2307 0         0 return $self->{useq}->global_stdev( $self->seq_id );
2308             }
2309              
2310              
2311              
2312              
2313             package Bio::DB::USeq::Segment;
2314 1     1   74120 use base 'Bio::DB::USeq::Feature';
  1         2  
  1         907  
2315              
2316             sub new {
2317 1     1   2 my $class = shift;
2318 1         6 my %args = @_;
2319 1         10 my $segment = $class->SUPER::new(@_);
2320 1 50       90 $segment->{'useq'} = $args{'-useq'} if exists $args{'-useq'};
2321 1         5 return $segment;
2322             }
2323              
2324             sub scores {
2325 1     1   141 my $self = shift;
2326 1         7 return $self->{'useq'}->scores(
2327             -seq_id => $self->seq_id,
2328             -start => $self->start,
2329             -end => $self->end,
2330             -strand => $self->strand,
2331             );
2332             }
2333              
2334             sub features {
2335 0     0   0 my $self = shift;
2336 0         0 my $type = shift;
2337 0   0     0 $type ||= 'region';
2338 0         0 return $self->{'useq'}->features(
2339             -seq_id => $self->seq_id,
2340             -start => $self->start,
2341             -end => $self->end,
2342             -strand => $self->strand,
2343             -type => $type,
2344             );
2345             }
2346              
2347             sub get_seq_stream {
2348 1     1   354 my $self = shift;
2349 1         2 my $type = shift;
2350 1   50     9 $type ||= 'region';
2351             return Bio::DB::USeq::Iterator->new(
2352             -seq_id => $self->seq_id,
2353             -start => $self->start,
2354             -end => $self->end,
2355             -strand => $self->strand,
2356             -source => $self->source,
2357             -type => $type,
2358             -useq => $self->{useq},
2359 1         8 );
2360             }
2361              
2362             sub slices {
2363 0     0   0 my $self = shift;
2364 0         0 my @slices = $self->{'useq'}->_translate_coordinates_to_slices(
2365             $self->seq_id, $self->start, $self->end, $self->strand
2366             );
2367 0 0       0 return wantarray ? @slices : \@slices;
2368             }
2369              
2370             sub coverage {
2371 0     0   0 my $self = shift;
2372 0         0 my $bins = shift;
2373 0   0     0 $bins ||= $self->length;
2374 0         0 return $self->features("coverage:$bins");
2375             }
2376              
2377             sub wiggle {
2378 0     0   0 my $self = shift;
2379 0         0 my $bins = shift;
2380 0   0     0 $bins ||= $self->length;
2381 0         0 return $self->features("wiggle:$bins");
2382             }
2383              
2384             sub statistical_summary {
2385 0     0   0 my $self = shift;
2386 0         0 my $bins = shift;
2387 0   0     0 $bins ||= 1;
2388 0         0 return $self->features("summary:$bins");
2389             }
2390              
2391             sub observations {
2392 0     0   0 my $self = shift;
2393 0         0 return $self->{'useq'}->observations(
2394             -seq_id => $self->seq_id,
2395             -start => $self->start,
2396             -end => $self->end,
2397             -strand => $self->strand,
2398             );
2399             }
2400              
2401              
2402             package Bio::DB::USeq::Iterator;
2403 1     1   15 use base 'Bio::DB::USeq::Feature';
  1         3  
  1         1590  
2404              
2405             sub new {
2406 4     4   74 my $class = shift;
2407            
2408             # create object
2409 4         22 my %args = @_;
2410 4 50       15 my $useq = $args{'-useq'} or
2411             die "Bio::DB::USeq::Iterator cannot be created without a -useq argument";
2412 4         19 my $iterator = $class->SUPER::new(@_);
2413 4         289 $iterator->{'useq'} = $useq;
2414            
2415             # determine which members to retrieve
2416             my @slices = $useq->_translate_coordinates_to_slices(
2417             $args{-seq_id},
2418             $args{-start},
2419             $args{-end},
2420             $args{-strand},
2421 4         20 );
2422 4         18 $useq->_clear_buffer(\@slices);
2423            
2424             # sort the slices as necessary
2425 4 100       20 if (scalar(@slices) > 1) {
2426 6         17 @slices = map {$_->[1]}
2427 3         11 sort {$a->[0] <=> $b->[0]}
2428 3         21 map { [$useq->slice_start($_), $_] } @slices;
  6         21  
2429             }
2430            
2431             # how we set up the iterator depends on the feature type requested
2432             # we need to add specific information to iterator object
2433            
2434 4 100       18 if ($iterator->type =~ /region|interval|observation/) {
2435             # useq observation features are simple
2436 2         8 $iterator->{'wiggle'} = 0;
2437            
2438             # prepare specific iterator information
2439 2         6 $iterator->{'slices'} = \@slices;
2440 2         4 $iterator->{'current_results'} = undef;
2441 2         6 $iterator->{'current_strand'} = undef;
2442            
2443 2         10 return $iterator;
2444             }
2445            
2446             # otherwise we work with more complex wiggle or summary features
2447             # set the type of wiggle feature: 1 = wiggle, 2 = summary
2448 2 100       7 $iterator->{'wiggle'} = $iterator->type =~ /summary/ ? 2 : 1;
2449            
2450             # normally the iterator will only return one wigggle|summary feature, but
2451             # will return two if stranded data, per behavior for Bio::Grapics...
2452 2 50 33     15 if ($iterator->strand == 0 and $useq->stranded) {
2453             # we have stranded data, so need to return two features
2454             # separate the slices into each respective strands for each feature
2455 0         0 my (@f, @r);
2456 0         0 foreach (@slices) {
2457 0 0       0 if ($useq->slice_strand($_) == 1) {
2458 0         0 push @f, $_;
2459             }
2460             else {
2461 0         0 push @r, $_;
2462             }
2463             }
2464 0         0 $iterator->{slices} = [\@f, \@r];
2465             }
2466             else {
2467             # only need to return one feature
2468 2         8 $iterator->{slices} = [ \@slices ];
2469             }
2470            
2471             # check for type and bins
2472 2         5 my ($bin, $step);
2473 2 100       6 if ($iterator->type =~ /:(\d+)$/i) {
2474 1         4 $bin = $1;
2475 1         7 my $length = $iterator->length;
2476 1 50       24 $bin = $length if $bin > $length;
2477 1         4 $step = $length/$bin;
2478             }
2479 2         7 $iterator->{bin} = $bin;
2480 2         5 $iterator->{step} = $step;
2481            
2482 2         10 return $iterator;
2483             }
2484              
2485             sub next_seq {
2486 10     10   409 my $self = shift;
2487            
2488 10 100       40 if ($self->{'wiggle'} == 1) {
    100          
2489 2         8 return $self->_next_wiggle;
2490             }
2491             elsif ($self->{'wiggle'} == 2) {
2492 2         7 return $self->_next_summary;
2493             }
2494             else {
2495 6         14 return $self->_next_region;
2496             }
2497             }
2498              
2499 0     0   0 sub next_feature {return shift->next_seq}
2500              
2501             sub _next_region {
2502             # this will keep returning observations as SeqFeature objects until we run out
2503             # of observations in the requested interval
2504 6     6   11 my $self = shift;
2505 6         11 my $useq = $self->{'useq'};
2506            
2507             # get current results
2508 6 100       16 unless ($self->{'current_results'}) {
2509 2   50     5 my $slice = shift @{ $self->{'slices'} } || undef;
2510 2 50       8 return unless $slice; # no more slices, we're done
2511            
2512             # collect the observations
2513 2         15 $useq->_load_slice($slice);
2514 2         19 my $result = $useq->{buffer}{$slice}->fetch($self->start - 1, $self->end);
2515            
2516             # sort the observations and store the results
2517 2 50       275 my @sorted = sort {$a->[0] <=> $b->[0] or $a->[1] <=> $b->[1]} @$result;
  303         522  
2518 2         13 $self->{'current_results'} = \@sorted;
2519 2         11 $self->{'current_strand'} = $useq->slice_strand($slice);
2520             }
2521            
2522             # return next observation as a Feature object
2523 6         10 my $obs = shift @{ $self->{'current_results'} };
  6         10  
2524 6 50       17 unless (defined $obs) {
2525             # no more observations for current slice, try to move to next slice
2526 0         0 undef $self->{'current_results'};
2527 0         0 return $self->_next_region;
2528             }
2529             return Bio::DB::USeq::Feature->new(
2530             -seq_id => $self->seq_id,
2531             -start => $obs->[0] + 1,
2532             -end => $obs->[1],
2533 6 100       24 -strand => $self->{'current_strand'},
2534             -score => $obs->[2],
2535             -source => $self->source,
2536             -type => $self->type,
2537             -name => defined $obs->[3] ? $obs->[3] :
2538             join(':', $self->seq_id, $obs->[0], $obs->[1]),
2539             )
2540             }
2541              
2542             sub _next_wiggle {
2543             # this will return one wiggle Feature, two if separate strands are requested
2544            
2545 2     2   4 my $self = shift;
2546 2         6 my $useq = $self->{'useq'};
2547            
2548             # determine which slices to retrieve
2549 2         3 my $slices = shift @{ $self->{slices} };
  2         6  
2550 2 100       8 return unless $slices;
2551            
2552             # more information
2553 1         3 my $start = $self->start;
2554 1         9 my $stop = $self->end;
2555 1         9 my $step = $self->{step};
2556            
2557             # check whether we are working with binned wiggle or not
2558 1         2 my @scores;
2559 1 50 33     6 if ($self->{bin} and $step > 1) {
2560             # we will be collecting the mean score value in bins
2561            
2562             # collect the scores for each bin
2563 1         4 for (my $begin = $start; $begin < $stop; $begin += $step) {
2564            
2565             # round off coordinates to integers
2566             # beginning point and step may not be integers
2567 1000         2189 my $s = int($begin + 0.5);
2568 1000         1642 my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2569            
2570             # collect the scores from each of the requested slices
2571 1000 50       1915 if (scalar @$slices > 1) {
2572             # more than one slice, identify which subset of slices to collect from
2573             # may or may not be all of the current slices
2574 1000         1412 my @sub_slices;
2575 1000         1583 foreach my $slice (@$slices) {
2576 2000 100       4135 next if $s > $useq->slice_end($slice);
2577 1061 100       1965 next if $e < $useq->slice_start($slice);
2578 1001         2254 push @sub_slices, $slice;
2579             }
2580 1000         2192 push @scores, $useq->_mean_score($s, $e, \@sub_slices);
2581             }
2582             else {
2583 0         0 push @scores, $useq->_mean_score($s, $e, $slices);
2584             }
2585             }
2586             }
2587             else {
2588             # otherwise we collect in one step and associate scores at bp resolution
2589             # collect the scores from each of the requested slices and
2590             # assemble them into a hash of values
2591              
2592             # correlate scores with position
2593 0         0 my %pos2score;
2594 0         0 foreach my $slice (@$slices) {
2595              
2596             # load and unpack the data
2597 0         0 $useq->_load_slice($slice);
2598              
2599             # fetch the overlapping observations
2600             # each observation is [start, stop, score]
2601 0         0 my $result = $useq->{'buffer'}{$slice}->fetch($start - 1, $stop);
2602 0         0 foreach my $r (@$result) {
2603 0         0 foreach my $p ( $r->[0] + 1 .. $r->[1] ) {
2604 0         0 $pos2score{$p} = $r->[2];
2605             }
2606             }
2607             }
2608              
2609             # convert positioned scores into an array
2610 0         0 foreach (my $s = $start; $s <= $stop; $s++) {
2611 0 0       0 push @scores, exists $pos2score{$s} ? $pos2score{$s} : undef;
2612             # for Bio::Graphics it is better to store undef than 0
2613             # which can do wonky things with graphs
2614             }
2615             }
2616              
2617             # generate the wiggle object
2618 1   50     8 my $strand = $useq->slice_strand( $slices->[0] ) || 0;
2619 1 50       11 return Bio::DB::USeq::Wiggle->new(
    50          
2620             -seq_id => $self->seq_id,
2621             -start => $start,
2622             -end => $stop,
2623             -strand => $strand,
2624             -type => $self->type,
2625             -source => $self->source,
2626             -attributes => { 'coverage' => [ \@scores ] },
2627             -name => $strand == 1 ? 'Forward' :
2628             $strand == -1 ? 'Reverse' : q(),
2629             -useq => $useq,
2630             );
2631             }
2632              
2633             sub _next_summary {
2634             # this will return one summary Feature, two if separate strands are requested
2635 2     2   4 my $self = shift;
2636            
2637             # determine which slices to retrieve
2638 2         3 my $slices = shift @{ $self->{slices} };
  2         6  
2639 2 100       7 return unless $slices;
2640            
2641             # all of the real statistical work is done elsewhere
2642             # just return the summary object
2643 1   50     6 my $strand = $self->{useq}->slice_strand( $slices->[0] ) || 0;
2644             return Bio::DB::USeq::Summary->new(
2645             -seq_id => $self->seq_id,
2646             -start => $self->start,
2647             -end => $self->end,
2648             -strand => $strand,
2649             -type => $self->type,
2650             -source => $self->source,
2651             -name => $strand == 1 ? 'Forward' :
2652             $strand == -1 ? 'Reverse' : q(),
2653             -useq => $self->{'useq'},
2654             -slices => $slices,
2655             -bin => $self->{bin},
2656             -step => $self->{step},
2657 1 50       5 );
    50          
2658             }
2659              
2660              
2661              
2662             package Bio::DB::USeq::Wiggle;
2663 1     1   12 use base 'Bio::DB::USeq::Feature';
  1         3  
  1         643  
2664             # Wiggle scores are stored in the coverage attribute for backwards
2665             # compatibility with Bio::Graphics.
2666              
2667             sub new {
2668 1     1   32 my $class = shift;
2669 1         15 my $wig = $class->SUPER::new(@_);
2670 1         130 my %args = @_;
2671 1 50       5 my $useq = $args{'-useq'} or
2672             die "Bio::DB::USeq::Wiggle cannot be created without a -useq argument";
2673 1         4 $wig->{useq} = $useq;
2674 1         9 return $wig;
2675             }
2676              
2677             sub coverage {
2678 1     1   2 my $self = shift;
2679 1         10 my ($coverage) = $self->get_tag_values('coverage');
2680 1 50       120 return wantarray ? @$coverage : $coverage;
2681             }
2682              
2683             sub wiggle {
2684 1     1   131 return shift->coverage;
2685             }
2686              
2687             # Borrowed from Bio::SeqFeature::Coverage from Bio::DB::Sam
2688             sub gff3_string {
2689 0     0   0 my $self = shift;
2690 0         0 my $gff3 = $self->SUPER::gff3_string(@_);
2691 0         0 my $coverage = $self->escape(join(',',$self->coverage));
2692 0         0 $gff3 =~ s/coverage=[^;]+/coverage=$coverage/g;
2693 0         0 return $gff3;
2694             }
2695              
2696             sub statistical_summary {
2697             # this is just for the wiggle scores, not original data
2698             # This is a fake statistical_summary to fool Bio::Graphics into
2699             # calculating chromosome or global statistics like a BigWig adaptor.
2700             # A real statistical_summary call would provide the number of bins,
2701             # so return null if bins is present.
2702             # We could calculate a real statistical summary, but that would be an
2703             # expensive circuitous route after we just collected wiggle scores.
2704             # Better to request the correct feature type in the first place.
2705 0     0   0 my $self = shift;
2706 0         0 my $bins = shift;
2707 0 0 0     0 return if $bins && $bins > 1;
2708            
2709             # initialize the statistical scores
2710 0         0 my $count = 0;
2711 0         0 my $sum;
2712             my $sum_squares;
2713 0         0 my $min;
2714 0         0 my $max;
2715            
2716             # generate a statistical summary of just the wiggle scores
2717 0         0 foreach my $s ($self->coverage) {
2718 0         0 $count++;
2719 0 0       0 next unless defined $s;
2720 0         0 $sum += $s;
2721 0         0 $sum_squares += $s * $s;
2722 0 0 0     0 $min = $s if (!defined $min or $s < $min);
2723 0 0 0     0 $max = $s if (!defined $max or $s > $max);
2724             }
2725            
2726             # return the statistical hash
2727 0   0     0 my %stat = (
      0        
      0        
      0        
2728             'validCount' => $count,
2729             'sumData' => $sum || 0,
2730             'sumSquares' => $sum_squares || 0,
2731             'minVal' => $min || 0,
2732             'maxVal' => $max || 0,
2733             );
2734 0         0 return \%stat;
2735             }
2736              
2737             sub get_seq_stream {
2738 0     0   0 my $self = shift;
2739             return Bio::DB::USeq::Iterator->new(
2740             -seq_id => $self->seq_id,
2741             -start => $self->start,
2742             -end => $self->end,
2743             -strand => $self->strand,
2744             -type => 'region',
2745             -source => $self->source,
2746             -useq => $self->{useq},
2747 0         0 );
2748             }
2749              
2750              
2751              
2752              
2753             package Bio::DB::USeq::Summary;
2754 1     1   8 use base 'Bio::DB::USeq::Feature';
  1         2  
  1         786  
2755              
2756             sub new {
2757 1     1   27 my $class = shift;
2758 1         11 my $summary = $class->SUPER::new(@_);
2759 1         87 my %args = @_;
2760 1 50       5 my $useq = $args{'-useq'} or
2761             die "Bio::DB::USeq::Summary cannot be created without a -useq argument";
2762 1         3 $summary->{useq} = $useq;
2763 1 50       4 $summary->{slices} = $args{-slices} if exists $args{-slices};
2764 1 50       5 $summary->{bin} = $args{-bin} if exists $args{-bin};
2765 1 50       4 $summary->{step} = $args{-step} if exists $args{-step};
2766 1         7 return $summary;
2767             }
2768              
2769             sub statistical_summary {
2770 1     1   171 my $self = shift;
2771 1         3 my $useq = $self->{useq};
2772            
2773             # get the number of bins to calculate the statistical summaries
2774 1         3 my $bin = shift;
2775 1 50 33     5 $bin ||= $self->{bin} if exists $self->{bin};
2776 1   50     4 $bin ||= 1;
2777 1 50       4 my $step = $self->{step} if exists $self->{step};
2778 1   33     13 $step ||= $self->length / $bin;
2779            
2780             # get the slices
2781 1 50       30 my $slices = $self->{slices} if exists $self->{slices};
2782 1 50       4 unless ($slices) {
2783             # this should already be established, but just in case
2784 0         0 my @a = $useq->_translate_coordinates_to_slices($self->seq_id,
2785             $self->start, $self->end, $self->strand);
2786 0         0 $useq->_clear_buffer(\@a);
2787 0         0 $slices = \@a;
2788             }
2789            
2790             # collect the statistical summaries for each bin
2791 1         378 my @summaries;
2792 1         6 for (my $begin = $self->start; $begin < $self->end; $begin += $step) {
2793            
2794             # round off coordinates to integers
2795             # beginning point and step may not be integers
2796 10         131 my $s = int($begin + 0.5);
2797 10         25 my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2798            
2799             # collect the scores from each of the requested slices
2800 10 50       31 if (scalar @$slices > 1) {
2801             # more than one slice, identify which subset of slices to collect from
2802             # may or may not be all of the current slices
2803 10         17 my @sub_slices;
2804 10         22 foreach my $slice (@$slices) {
2805 20 50       47 next if $s > $useq->slice_end($slice);
2806 20 100       42 next if $e < $useq->slice_start($slice);
2807 11         27 push @sub_slices, $slice;
2808             }
2809 10         50 push @summaries, $useq->_stat_summary($s, $e, \@sub_slices);
2810             }
2811             else {
2812 0         0 push @summaries, $useq->_stat_summary($s, $e, $slices);
2813             }
2814             }
2815            
2816             # return the reference to the statistical summaries
2817 1         19 return \@summaries;
2818             }
2819              
2820             sub score {
2821 0     0     my $self = shift;
2822 0           my $a = $self->statistical_summary(1);
2823 0           return $a->[0];
2824             }
2825              
2826             sub gff3_string {
2827             # this is going to be a little convoluted, since what we
2828             # really want here is coverage, which is easier to calculate with means,
2829             # rather than doing statistical summaries and calculate means from those
2830 0     0     my $self = shift;
2831             my ($wig) = $self->{useq}->features(
2832             # this should only return one feature because we have specific strand
2833 0           -seq_id => $self->seq_id,
2834             -start => $self->start,
2835             -end => $self->end,
2836             -strand => $self->strand,
2837             -type => 'coverage:1000',
2838             );
2839 0           return $wig->gff3_string;
2840             }
2841              
2842             sub get_seq_stream {
2843 0     0     my $self = shift;
2844             return Bio::DB::USeq::Iterator->new(
2845             -seq_id => $self->seq_id,
2846             -start => $self->start,
2847             -end => $self->end,
2848             -strand => $self->strand,
2849             -type => 'region',
2850             -source => $self->source,
2851             -useq => $self->{useq},
2852 0           );
2853             }
2854              
2855             =head1 AUTHOR
2856              
2857             Timothy J. Parnell, PhD
2858             Dept of Oncological Sciences
2859             Huntsman Cancer Institute
2860             University of Utah
2861             Salt Lake City, UT, 84112
2862              
2863             This package is free software; you can redistribute it and/or modify
2864             it under the terms of the Artistic License 2.0.