File Coverage

blib/lib/Bio/DB/USeq.pm

Criterion	Covered	Total	%
statement	463	932	49.6
branch	151	328	46.0
condition	39	142	27.4
subroutine	62	111	55.8
pod	37	39	94.8
total	752	1552	48.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::DB::USeq;
2
3							our $VERSION = '0.25';
4
5							=head1 NAME
6
7							Bio::DB::USeq - Read USeq archive database files
8
9							=head1 SYNOPSIS
10
11							use Bio::DB::USeq;
12							my $useq = Bio::DB::USeq->new('file.useq') or
13							die "unable to open file.useq!\n";
14
15							# sequence IDs
16							my @seq_ids = $useq->seq_ids;
17							my $length = $useq->length($chr); # approximate, not exact
18
19							### Retrieving features
20							# all features or observations across chromosome I
21							my @features = $useq->features(
22							-seq_id => 'chrI',
23							-type => 'region',
24							);
25
26							# same thing using a simple form
27							# use an array of (chromosome, start, stop, strand)
28							my @features = $useq->features('chrI');
29
30							# same thing with a memory efficient iterator
31							my $iterator = $useq->get_seq_stream(
32							-seq_id => 'chrI',
33							);
34							while (my $f = $iterator->next_seq) {
35							# each feature $f supports most SeqFeatureI methods
36							}
37
38
39							### Retrieving simple scores
40							my @scores = $useq->scores(
41							-seq_id => 'chrI',
42							-start => 1000,
43							-end => 2000
44							);
45
46
47							### Same methods as above after defining an interval first
48							my $segment = $useq->segment(
49							-seq_id => 'chrI',
50							-start => 1000,
51							-end => 2000,
52							);
53							my @scores = $segment->scores;
54							my @features = $segment->features;
55							my $iterator = $segment->get_seq_stream;
56
57
58							### Efficient retrieval of positioned scores in 100 bins
59							# compatible with Bio::Graphics
60							my ($wig) = $useq->features(
61							# assuming unstranded data here, otherwise two wiggle objects
62							# would be returned, one for each strand
63							-seq_id => 'chrI',
64							-start => 1000,
65							-end => 2000,
66							-type => 'wiggle:100',
67							);
68							my @bins = $wig->wiggle;
69							my @bins = $wig->coverage; # same thing
70							my ($bins) = $wig->get_tag_values('coverage'); # same thing
71
72
73							### Statistical summaries of intervals
74							# compatible with Bio::Graphics
75							my ($summary) = $useq->features(
76							# assuming unstranded data here, otherwise two summaries
77							# would be returned, one for each strand
78							-seq_id => 'chrI',
79							-start => 1000,
80							-end => 2000,
81							-type => 'summary',
82							);
83							my $stats = $summary->statistical_summary(10);
84							foreach (@$stats) {
85							my $max = $_->{maxVal};
86							my $mean = Bio::DB::USeq::binMean($_);
87							}
88
89
90							### Stranded data using an iterator
91							# could be used with either wiggle or summary features
92							my $stream = $useq->get_seq_stream(
93							-seq_id => 'chrI',
94							-start => 1000,
95							-end => 2000,
96							-type => 'wiggle:100',
97							);
98							my ($forward, $reverse);
99							if ($useq->stranded) {
100							$forward = $stream->next_seq;
101							$reverse = $stream->next_seq;
102							}
103							else {
104							$forward = $stream->next_seq;
105							}
106
107
108							=head1 DESCRIPTION
109
110							Bio::DB::USeq is a B<BioPerl> style adaptor for reading USeq files. USeq files
111							are compressed, indexed data files supporting modern bioinformatic datasets,
112							including genomic points, scores, and intervals. More information about the
113							USeq software package can be found at L<https://github.com/HuntsmanCancerInstitute/USeq>,
114							including a description of the USeq data
115							L<archive format\|https://github.com/HuntsmanCancerInstitute/USeq/blob/master/Documentation/USeqDocumentation/useqArchiveFormat.html>.
116
117							USeq files are typically half the size of corresponding bigBed and bigWig files,
118							due to a compact internal format and lack of internal zoom data. This adaptor,
119							however, can still return statistics across different zoom levels in the same
120							manner as big files, albeit at a cost of calculating these in realtime.
121
122							=head2 Generating useq files
123
124							USeq files may be generated using tools from the USeq package, available at
125							L<https://github.com/HuntsmanCancerInstitute/USeq>. They may be generated from
126							native Bar files, text Wig files, text Bed files, and UCSC bigWig and bigBed file
127							formats.
128
129							=head2 Compatibility
130
131							The adaptor follows most conventions of other B<BioPerl>-style Bio::DB
132							adaptors. Observations or features in the useq file archive are
133							returned as SeqFeatureI compatible objects.
134
135							Coordinates consumed and returned by the adaptor are 1-based, consistent
136							with B<BioPerl> convention. This is not true of the useq file itself, which
137							uses the interbase coordinate system.
138
139							Unlike wig and bigWig files, useq file archives support stranded data,
140							which can make data collection much simpler for complex experiments.
141
142							See below for GBrowse compatibility.
143
144							=head2 Limitations
145
146							This adaptor is read only. USeq files, in general, are not modified
147							or written with data. The exceptions are chromosome or global statistics
148							are written to the F<archiveReadMe.txt> file inside the zip archive to
149							cache for future queries.
150
151							No support for genomic sequence is included. Users who need access to
152							genomic sequence should seek an alternative B<BioPerl> adaptor, such as
153							L<Bio::DB::Fasta>.
154
155							Useq files do not have a native concept of type, primary_tag, or source
156							attributes, as expected with GFF-based database adaptors. The features
157							method does support special types (see below).
158
159							=head2 Requirements
160
161							The adaptor is a Perl-only implementation. It only requires the
162							L<Archive::Zip> module for opening and reading the useq file archive.
163							B<BioPerl> is required for working with SeqFeatures objects generated
164							from useq file observations.
165
166							=head1 METHODS
167
168							=head2 Initializing the Bio::DB::USeq object
169
170							These are class methods for creating and working with the Bio::DB::USeq
171							object itself.
172
173							=over 4
174
175							=item new
176
177							=item new($file)
178
179							=item new(-useq => $file)
180
181							This will create a new Bio::DB::USeq object. Optionally pass the path
182							to a useq file archive to open immediately. There is not much to do
183							unless you open a file.
184
185							Named arguments may be used to specify the file. Either -useq or -file
186							may be used.
187
188							=item open($file)
189
190							Open a useq file archive. Useq files typically use the F<.useq> file
191							extension. Returns true if successful.
192
193							DO NOT open a subsequent useq file archive when one has already been
194							opened. Please create a new object to open a new file.
195
196							=item clone
197
198							Force the object to re-open the useq archive file under a new file
199							handle to make it clone safe. Do this in the child process before
200							collecting data.
201
202							=item zip
203
204							Return the L<Archive::Zip> object representing the useq file archive.
205							Generally not recommended unless you know what you are doing.
206
207							=back
208
209							=head2 General information about the useq file
210
211							These are class methods for obtaining general information or
212							metadata attributes regarding the contents of the file.
213
214							=over 4
215
216							=item stranded
217
218							Return true or false (1 or 0) indicating whether the contents of the
219							useq file archive are recorded in a stranded fashion.
220
221							=item attributes
222
223							Return an array of available attribute keys that were recorded in the
224							useq file F<archiveReadMe.txt> member. These are key = value pairs
225							representing metadata for the useq file.
226
227							=item attribute($key)
228
229							Return the metadata attribute value for the specified key. These
230							are recorded in the useq file F<archiveReadMe.txt> member. Returns
231							undefined if the key does not exist.
232
233							=item type
234
235							Return the useq file metadata C<dataType> value.
236
237							=item genome
238
239							Return the useq file metadata C<versionedGenome> value.
240
241							=item version
242
243							Return the useq file metadata C<useqArchiveVersion> value.
244
245							=back
246
247							=head2 Chromosome Information
248
249							These are methods to obtain information about the chromosomes or reference
250							sequences represented in the file archive.
251
252							B<Note> that generating score statistics across one or more chromosomes may
253							be computationally expensive. Therefore, chromosome statistics, once
254							calculated, are cached in the useq file metadata for future reference.
255							This necessitates writing to the useq zip archive. This is currently the
256							only supported method for modifying the useq zip archive.
257
258							=over 4
259
260							=item seq_ids
261
262							Return an array of the chromosome or sequence identifiers represented
263							in the useq file archive. The names are sorted ASCIIbetically before
264							they are returned.
265
266							=item length($seq_id)
267
268							Return the length of the provided chromosome or sequence identifier.
269							Note that this may not be the actual length in the reference genome, but
270							rather the last recorded position of an observation for that chromosome.
271							Hence, it should be used only as an approximation.
272
273							=item chr_mean($seq_id)
274
275							Return the mean score value across the entire chromosome.
276
277							=item chr_stdev($seq_id)
278
279							Return the score standard deviation across the entire chromosome.
280
281							=item chr_stats($seq_id)
282
283							Return a statistical summary across the entire chromosome. This
284							is an anonymous hash with five keys: validCount, sumData,
285							sumSquares, minVal, and maxVal. These are equivalent to the
286							statistical summaries generated by the Bio::DB::BigWig adaptor.
287
288							=item global_mean
289
290							Return the mean score value across all chromosomes.
291
292							=item global_stdev
293
294							Return the mean score value across all chromosomes.
295
296							=item global_stats
297
298							Return a statistical summary across all chromosomes.
299
300							=back
301
302							=head2 Data accession
303
304							These are the primary methods for working with data contained in
305							useq file archive. These should be familiar to most B<BioPerl> users.
306
307							=over 4
308
309							=item features
310
311							Returns an array or array reference of SeqFeatureI compatible
312							objects overlapping a given genomic interval.
313
314							Coordinates of the interrogated regions must be supplied. At a
315							minimum, the seq_id must be supplied. A start of 1 and an end
316							corresponding to the length of the seq_id is used if not directly
317							provided. Coordinates may be specified in two different manners,
318							either as a list of (seq_id, start, end, strand) or as one or
319							more keyed values.
320
321							my @features = $useq->features($seq_id, $start, $end);
322
323							@features = $useq->features(
324							-seq_id => $seq_id,
325							-start => $start,
326							-end => $end,
327							);
328
329							If the -iterator argument is supplied with a true value, then an
330							iterator object is returned. See get_seq_stream() for details.
331
332							Bio::DB::USeq supports four different feature types. Feature
333							types may be specified using the -type argument.
334
335							=over 4
336
337							=item region
338
339							=item interval
340
341							=item observation
342
343							The default feature type if the type argument is not specified.
344							These are SeqFeatureI compatible objects representing observations
345							in the useq file archive. These are compatible with the iterator.
346							SeqFeature observations are returned in genomic order.
347
348							=item chromosome
349
350							Returns SeqFeatureI compatible objects representing the
351							reference sequences (chromosomes) listed in the useq file
352							archive. These are not compatibile with the iterator.
353
354							=item wiggle
355
356							=item wiggle:$bins
357
358							=item coverage
359
360							=item coverage:$bins
361
362							Returns an array of SeqFeatureI compatible objects for each
363							strand requested. If the useq file contains stranded data,
364							and no strand is requested, then two objects will be
365							returned representing each strand.
366
367							Each object contains an array representing scores across
368							the requested coordinates. This object is designed to be
369							backwards compatible with coverage features from the
370							L<Bio::DB::Sam> adaptor for use with L<Bio::Graphics> and GBrowse.
371							Note that only scores are returned, not a true depth coverage
372							in the sense of the L<Bio::DB::Sam> coverage types.
373
374							By default, the wiggle or coverage array is provided at 1
375							bp resolution. To improve efficiency with large regions,
376							the wiggle array may be limited by using a bin option,
377							where the interrogated interval is divided into the number
378							of bins requested.
379
380							To retrieve the scores, call the wiggle() or coverage() method.
381
382							For example, to request wiggle scores in 100 equal bins across
383							the interval, see the following example. The wiggle and
384							coverage types are synonymous.
385
386							my ($wiggle) = $useq->features(
387							-seq_id => $chromosome,
388							-start => $start,
389							-end => $end,
390							-type => 'wiggle:100',
391							);
392							my @scores = $wiggle->wiggle;
393							@scores = $wiggle->coverage;
394
395							Wiggle objects may also be obtained with a get_seq_stream()
396							iterator objects.
397
398							=item summary
399
400							=item summary:$bins
401
402							Returns an array of SeqFeatureI compatibile Summary objects
403							for each strand requested. If the useq file contains stranded
404							data, and no strand is requested, then two objects will be
405							returned representing each strand.
406
407							Each Summary object can then be used to call statistical
408							summaries for one or more bins across the interval. Each
409							statistical summary is an anonymous hash with five keys:
410							validCount, sumData, sumSquares, minVal, and maxVal. From
411							these values, a mean and standard deviation may also be
412							calculated.
413
414							my ($summary) = $useq->features(
415							-seq_id => $chromosome,
416							-start => $start,
417							-end => $end,
418							-type => 'summary',
419							);
420							my @stats = $summary->statistical_summary(100);
421							foreach my $stat (@stats) {
422							my $count = $stat->{validCount};
423							my $sum = $stat->{sumData};
424							my $mean = $sum / $count;
425							}
426
427							Statistical summaries are equivalent to those generated by the
428							L<Bio::DB::BigWig> adaptor and may be used interchangeably. They
429							are compatible with the L<Bio::Graphics> modules.
430
431							Summary objects may also be obtained with a get_seq_stream()
432							iterator object.
433
434							=back
435
436							=item get_seq_stream
437
438							This is a memory efficient data accessor. An iterator object is
439							returned for an interval specified using coordinate values in the
440							same manner as features(). Call the method next_seq() on the
441							iterator object to retrieve the observation SeqFeature objects
442							one at a time. The iterator is compatible with region, wiggle,
443							and summary feature types.
444
445							# establish the iterator object
446							my $iterator = $useq->get_seq_stream(
447							-seq_id => $seq_id,
448							-start => $start,
449							-end => $end,
450							-type => 'region',
451							);
452
453							# retrieve the features one at a time
454							while (my $f = $iterator->next_seq) {
455							# each feature $f is either a
456							# Bio::DB::USeq::Feature,
457							# a Bio::DB::USeq::Wiggle, or
458							# a Bio::DB::USeq::Summary object
459							}
460
461							=item scores
462
463							This is a simplified data accessor that only returns the score
464							values overlapping an interrogated interval, rather than
465							assembling each observation into a SeqFeature object. The scores
466							are not associated with genomic coordinates, and are not guaranteed
467							to be returned in original genomic order.
468
469							Provide the interval coordinates in the same manner as the
470							features() method. Stranded data collection is supported.
471
472							my @scores = $useq->scores(
473							-seq_id => $seq_id,
474							-start => $start,
475							-end => $end,
476							);
477
478							=item observations
479
480							This is a low-level data accessor, similar to features() but returning
481							an array of references to the original USeq observations. Each USeq
482							observation is an anonymous array reference of 2-4 elements: start, stop,
483							score, and text, depending on the stored data type. All coordinates are in
484							0-based coordinates, unlike high level accessors. Note that while strand is
485							supported, it is not reported for each observation. If observations exist on
486							both strands, then each strand should be searched separately. Observations
487							are not guaranteed to be returned in genomic order.
488
489							my @observations = $useq->observations(
490							-seq_id => $seq_id,
491							-start => $start,
492							-end => $end,
493							);
494							foreach my $obs (@observations) {
495							my $start = $obs->[0] + 1; # convert to 1-based coordinate
496							my $stop = $obs->[1];
497							my $score = $obs->[2]; # may not be present
498							my $text = $obs->[3]; # may not be present
499							}
500
501							=item segment
502
503							This returns a L<Bio::DB::USeq::Segment> object, which is a SeqFeatureI
504							compatible segment object corresponding to the specified coordinates.
505							From this object, one can call the features(), scores(), or
506							get_seq_stream() methods directly. Keyed options or location
507							information need not be provided. Stranded segments are
508							supported.
509
510							my $segment = $useq->segment(
511							-seq_id => $seq_id,
512							-start => $start,
513							-end => $end,
514							);
515							my @scores = $segment->scores;
516							my @features = $segment->features('wiggle:100');
517							my $iterator = $segment->get_seq_stream('region');
518
519							=item get_features_by_location
520
521							Convenience method for returning features restricted by location.
522
523							=item get_feature_by_id
524
525							=item get_feature_by_name
526
527							Compatibility methods for returning a specific feature or
528							observation in the USeq file. Text fields, if present, are not
529							indexed in the USeq file, preventing efficient searching of names.
530							As a workaround, an ID or name comprised of "$seq_id:$start:$end"
531							may be used, although a direct search of coordinates would be
532							more efficient.
533
534							my $feature = $useq->get_feature_by_id("$seq_id:$start:$end");
535
536
537							=back
538
539							=head1 ADDITIONAL CLASSES
540
541							These are additional class object that may be returned by various
542							methods above.
543
544							=head2 Bio::DB::USeq::Feature objects
545
546							These are SeqFeatureI compliant objects returned by the features()
547							or next_seq() methods. They support the following methods.
548
549							seq_id
550							start
551							end
552							strand
553							score
554							type
555							source (returns the useq archive filename)
556							name (chromosome:start..stop)
557							Bio::RangeI methods
558
559							Additionally, chromosome and global statistics are also available
560							from any feature, as well as from Segment, Wiggle, Iterator, and
561							Summary objects. See the corresponding USeq methods for details.
562
563							=head2 Bio::DB::USeq::Segment objects
564
565							This is a SeqFeatureI compliant object representing a genomic segment
566							or interval. These support the following methods.
567
568							=over 4
569
570							=item features
571
572							=item features($type)
573
574							=item get_seq_stream
575
576							=item get_seq_stream($type)
577
578							=item scores
579
580							=item observations
581
582							Direct methods for returning features or scores. Coordinate information
583							need not be provided. See the corresponding Bio::DB::USeq methods for
584							more information.
585
586							=item wiggle
587
588							=item wiggle($bins)
589
590							=item coverage
591
592							=item coverage($bins)
593
594							=item statistical_summary
595
596							=item statistical_summary($bins)
597
598							Convenience methods for returning wiggle (coverage) or summary features
599							over the segment. If desired, the number of bins may be specified. See
600							the features() method for more information.
601
602							=item slices
603
604							Returns an array of splice member names that overlap this segment.
605							See L<USEQ SLICES> for more information.
606
607							=back
608
609							=head2 Bio::DB::USeq::Iterator objects
610
611							This is an iterator object for retrieving useq observation SeqFeatureI
612							objects in a memory efficient manner by returning one feature at a time.
613
614							=over 4
615
616							=item next_seq
617
618							=item next_feature
619
620							Returns the next feature present in the interrogated interval. Features
621							are generally returned in ascending coordinate order. Returns undefined
622							when no features are remaining in the interval. Features may include
623							either region or wiggle types, depending on how the iterator object was
624							established. See features() and get_seq_stream() methods for more
625							information.
626
627							=back
628
629							=head2 Bio::DB::USeq::Wiggle objects
630
631							These are SeqFeatureI compliant objects for backwards compatibility with
632							L<Bio::Graphics> and GBrowse. They support the wiggle() and coverage()
633							methods, which returns an array of scores over the interrogated region. By
634							default, the array is equal to the length of the region (1 bp resolution),
635							or may be limited to a specified number of bins for efficiency. See the
636							features() method for more information.
637
638							=over 4
639
640							=item wiggle
641
642							=item coverage
643
644							The scores are stored as an array in the coverage attribute. For
645							convenience, the wiggle() and coverage() methods may be used to retrieve
646							the array or array reference of scores.
647
648							=item statistical_summary
649
650							Generate a statistical summary hash for the collected wiggle scores
651							(not the original data). This method is not entirely that useful; best
652							to use the summary feature type in the first place.
653
654							=item chromosome and global statistics
655
656							Chromosome and global statistics, including mean and standard deviation,
657							are available from wiggle objects. See the corresponding USeq methods
658							for details.
659
660							=back
661
662							=head2 Bio::DB::USeq::Summary objects
663
664							These are SeqFeatureI compliant Summary objects, similar to those
665							generated by the L<Bio::DB::BigWig> database adaptor. As such, they are
666							compatible with L<Bio::Graphics> and GBrowse.
667
668							Summary objects can generate statistical summaries over a specified
669							number of bins (default is 1 bin, or the entire requested region).
670							Each statistical summary is an anonymous hash consisting of five
671							keys: validCount, sumData, sumSquares, minVal, and maxVal. From
672							these values, a mean and standard deviation may be calculated.
673
674							For convenience, three exported functions are available for calculating
675							the mean and standard deviation from a statistical summary hash.
676							See L<EXPORTED FUNCTIONS> for more information.
677
678							Use statistical summaries in the following manner.
679
680							my $stats = $summary->statistical_summary(10);
681							my $stat = shift @$stats;
682							my $min = $stat->{minVal};
683							my $max = $stat->{maxVal};
684							my $mean = $stat->{sumData} / $stat->{validCount};
685
686							=over 4
687
688							=item statistical_summary
689
690							=item statistical_summary($bins)
691
692							Generate a statistical summary hash for one or more bins across the
693							interrogated region. Provide the number of bins desired. If a feature
694							type of "summary:$bins" is requested through the features() or
695							get_seq_stream() method, then $bins number of bins will be used.
696							The default number of bins is 1.
697
698							=item score
699
700							Generate a single statistical summary over the entire region.
701
702							=item chromosome and global statistics
703
704							Chromosome and global statistics, including mean and standard deviation,
705							are available from summary objects. See the corresponding USeq methods
706							for details.
707
708							=back
709
710							=head1 EXPORTED FUNCTIONS
711
712							Three subroutine functions are available for export to assist
713							in calculating the mean, variance, and standard deviation from
714							statistical summaries. These functions are identical to those
715							from the L<Bio::DB::BigWig> adaptor and may be used interchangeably.
716
717							They are not exported by default; they must explicitly listed.
718
719							use Bio::DB::USeq qw(binMean binStdev);
720							my $stats = $summary->statistical_summary(10);
721							my $stat = shift @$stats;
722							my $mean = binMean($stat);
723							my $stdev = binStdev($stat);
724
725							=over 4
726
727							=item binMean( $stat )
728
729							Calculate the mean from a statistical summary anonymous hash.
730
731							=item binVariance( $stat )
732
733							Calculate the variance from a statistical summary anonymous hash.
734
735							=item binStdev( $stat )
736
737							Calculate the standard deviation from a statistical summary anonymous hash.
738
739							=back
740
741							=head1 USEQ SLICES
742
743							Genomic observations are recorded in groups, called slices, of
744							usually 10000 observations at a time. Each slice is a separate
745							zip file member in the useq file archive. These methods are for
746							accessing information about each slice. In general, accessing
747							data through slices is a lower level operation. Users should
748							preferentially use the main data accessors.
749
750							The following are Bio::DB::USeq methods available for working
751							with slices.
752
753							=over 4
754
755							=item slices
756
757							Returns an array of all the slice member names present in the
758							useq archive file.
759
760							=item slice_feature($slice)
761
762							Return a L<Bio::DB::USeq::Segment> object representing the slice interval.
763							The features(), get_seq_stream(), and scores() methods are supported.
764
765							=item slice_seq_id($slice)
766
767							Return the chromosome or sequence identifier associated with a slice.
768
769							=item slice_start($slice)
770
771							Return the start position of the slice interval.
772
773							=item slice_end($slice)
774
775							Return the end position of the slice interval.
776
777							=item slice_strand($slice)
778
779							Return the strand of the slice interval.
780
781							=item slice_type($slice)
782
783							Return the file type of the slice member. This corresponds to the
784							file extension of the slice zip member and indicates how to parse
785							the binary member. Each letter in the type corresponds to a data
786							type, be it integer, short, floating-point, or text. See the
787							useq file documentation for more details.
788
789							=item slice_obs_number($slice)
790
791							Return the number of observations recorded in the slice interval.
792
793							=back
794
795							=head1 GBROWSE COMPATIBILITY
796
797							The USeq adaptor has support for L<Bio::Graphics> and GBrowse.
798							It will work with the segments glyph for intervals,
799							the wiggle_xyplot glyph for displaying mean scores, and the
800							wiggle_whiskers glyph for displaying detailed statistics.
801
802							Initialize the USeq database adaptor.
803
804							[data1:database]
805							db_adaptor = Bio::DB::USeq
806							db_args = -file /path/to/data1.useq
807
808							Displaying simple intervals with the segments glyph.
809
810							[data1_segments]
811							database = data1
812							feature = region
813							glyph = segments
814							stranded = 1
815
816							Displaying scores using the wiggle_xyplot glyph.
817							You may set the bins to whatever number is appropriate (in
818							this example, 1000), or leave blank (not recommended,
819							defaults to 1 bp resolution).
820
821							[data1_xyplot]
822							database = data1
823							feature = wiggle:1000
824							glyph = wiggle_xyplot
825							graph_type = histogram
826							autoscale = chromosome
827
828							Displaying scores using the wiggle_whiskers glyph. Note that
829							generating statistical summaries are computationally more expensive
830							than simple bins of mean values as with the wiggle feature type.
831
832							[data1_whiskers]
833							database = data1
834							feature = summary
835							glyph = wiggle_whiskers
836							graph_type = histogram
837							autoscale = chromosome
838
839							=head1 PERFORMANCE
840
841							Because the Bio::DB::USeq is implemented as a Perl-only module,
842							performance is subject to the limitations of Perl execution itself and
843							the size of the data that needs to be parsed. In general when collecting
844							score data, requesting scores is the fastest mode of operation, followed
845							by wiggle feature types, and finally summary feature types.
846
847							In comparison to UCSC bigWig files, the USeq format is typically much
848							faster when viewing intervals where the entire interval is represented by
849							one or a few internal slices. This is especially true for repeated queries
850							over the same or neighboring intervals, as the slice contents are retained
851							in memory. As the number of internal slices that must be loaded into memory
852							increases, for example querying intervals of hundreds of kilobases in size,
853							performance will begin to lag as each internal slice must be parsed into
854							memory. This is where the UCSC bigWig file format with internal zoom levels
855							of summary statistics can outperform, at the cost of file complexity and size.
856
857							=cut
858
859
860							require 5.010000;
861	1			1		8795	use strict;
	1					3
	1					33
862	1			1		6	use Carp qw(carp cluck croak confess);
	1					2
	1					74
863	1			1		672	use Archive::Zip qw( :ERROR_CODES );
	1					95370
	1					106
864	1			1		448	use Set::IntervalTree;
	1					8057
	1					74
865	1			1		9	use File::Spec;
	1					2
	1					29
866
867	1			1		5	use base 'Exporter';
	1					2
	1					9010
868							our @EXPORT_OK = qw(binMean binVariance binStdev);
869
870							1;
871
872							#### USeq initialization ####
873
874							sub new {
875	2			2	1	221	my $class = shift;
876	2					22	my $self = {
877							'name' => undef,
878							'zip' => undef,
879							'stranded' => undef,
880							'seq_ids' => {},
881							'metadata' => {},
882							'coord2file' => {},
883							'file2attribute' => {},
884							'buffer' => {},
885							};
886	2					6	bless $self, $class;
887
888							# check for arguments
889	2					4	my %args;
890	2	50				8	if (@_) {
891	2	100				13	if ($_[0] =~ /^-/) {
892	1					5	%args = @_;
893							}
894							else {
895	1					15	$args{-file} = shift;
896							}
897							}
898
899							# open file
900	2		0			7	my $file = $args{-file} \|\| $args{-useq} \|\| undef;
901	2	50				8	if ($file) {
902	2	50				11	return unless ($self->open($file)); # open must return true
903							}
904
905							# done
906	2					8	return $self;
907							}
908
909							sub open {
910	2			2	1	4	my $self = shift;
911
912							# check file
913	2					5	my $filename = shift;
914	2	50				6	unless ($filename) {
915	0					0	cluck("no file name passed!\n");
916	0					0	return;
917							}
918	2	50				11	unless ($filename =~ /\.useq$/i) {
919	0					0	carp "'$filename' is not a .useq archive file!\n";
920	0					0	return;
921							}
922	2	50				9	if ($self->slices) {
923	0					0	cluck "Only load one useq file per object!\n";
924	0					0	return;
925							}
926
927							# open the archive
928	2					17	my $zip = Archive::Zip->new();
929	2					74	my $error = $zip->read($filename);
930	2	50				3114	unless ($error == AZ_OK) {
931	0					0	carp " unable to read USeq archive '$filename'! Error $error\n";
932	0					0	return;
933							}
934	2					6	$self->{'zip'} = $zip;
935	2					50	(undef, undef, $self->{'name'}) = File::Spec->splitpath($filename);
936
937							# parse the contents
938	2	50				10	return unless ($self->_parse_members);
939							# we delay parsing metadata unless it is requested
940
941							# success
942	2					6	return 1;
943							}
944
945							sub clone {
946	1			1	1	3	my $self = shift;
947	1	50				6	return unless $self->{'zip'};
948	1					4	my $file = $self->zip->fileName;
949	1					17	my $zip = Archive::Zip->new();
950	1					32	my $error = $zip->read($file);
951	1	50				1560	unless ($error == AZ_OK) {
952	0					0	carp " unable to read USeq archive '$file'! Error $error\n";
953	0					0	return;
954							}
955	1					29	$self->{'zip'} = $zip;
956	1					5	return 1;
957							}
958
959							sub zip {
960	9			9	1	57	return shift->{'zip'};
961							}
962
963							sub name {
964	4			4	0	40	return shift->{'name'};
965							}
966
967
968
969							#### General USeq information ####
970
971							sub seq_ids {
972	1			1	1	3	my $self = shift;
973	1					2	return sort {$a cmp $b} keys %{ $self->{'seq_ids'} };
	1					7
	1					7
974							}
975
976							sub length {
977	8			8	1	72	my $self = shift;
978	8	50				25	my $seq_id = shift or return;
979	8	50				34	if (exists $self->{'seq_ids'}{$seq_id}) {
980	8					38	return $self->{'seq_ids'}{$seq_id};
981							}
982							}
983
984							sub stranded {
985	8			8	1	53	return shift->{'stranded'};
986							}
987
988							sub attributes {
989	0			0	1	0	my $self = shift;
990	0	0				0	$self->_parse_metadata unless %{ $self->{'metadata'} };
	0					0
991	0					0	return (sort {$a cmp $b} keys %{ $self->{'metadata'} });
	0					0
	0					0
992							}
993
994							sub attribute {
995	1			1	1	2	my $self = shift;
996	1					3	my $key = shift;
997	1	50				3	return $self->attributes unless $key;
998	1	50				2	$self->_parse_metadata unless %{ $self->{'metadata'} };
	1					7
999	1	50				4	if (exists $self->{'metadata'}{$key}) {
1000	1					7	return $self->{'metadata'}{$key};
1001							}
1002	0					0	return;
1003							}
1004
1005							sub type {
1006	1			1	1	107	return shift->attribute('dataType');
1007							}
1008
1009							sub version {
1010	0			0	1	0	return shift->attribute('useqArchiveVersion');
1011							}
1012
1013							sub genome {
1014	0			0	1	0	return shift->attribute('versionedGenome');
1015							}
1016
1017							sub chr_stats {
1018	1			1	1	3	my $self = shift;
1019	1	50				4	my $seq_id = shift or return;
1020	1					3	my $delay_write = shift; # option to delay rewriting the metadata
1021	1	50				1	$self->_parse_metadata unless %{ $self->{'metadata'} };
	1					6
1022
1023							# return the chromosome stats
1024	1	50				12	if (exists $self->{metadata}{"chromStats_$seq_id"}) {
1025	0					0	my @data = split(',', $self->{metadata}{"chromStats_$seq_id"});
1026	0					0	my %stat = (
1027							'validCount' => $data[0],
1028							'sumData' => $data[1],
1029							'sumSquares' => $data[2],
1030							'minVal' => $data[3],
1031							'maxVal' => $data[4],
1032							);
1033	0					0	return \%stat;
1034							}
1035
1036							# chromosome stats must be generated
1037	1					14	my @slices = $self->_translate_coordinates_to_slices(
1038							$seq_id, 1, $self->length($seq_id), 0);
1039	1					5	$self->_clear_buffer(\@slices);
1040	1					9	my $stat = $self->_stat_summary(1, $self->length($seq_id), \@slices);
1041
1042							# then associate with the metadata
1043	1					7	$self->{'metadata'}{"chromStats_$seq_id"} = join(',', map { $stat->{$_} }
	5					44
1044							qw(validCount sumData sumSquares minVal maxVal) );
1045	1	50				9	$self->_rewrite_metadata unless $delay_write;
1046
1047	1					6	return $stat;
1048							}
1049
1050							sub chr_mean {
1051	1			1	1	142	my $self = shift;
1052	1	50				5	my $seq_id = shift or return;
1053	1					4	return Bio::DB::USeq::binMean( $self->chr_stats($seq_id) );
1054							}
1055
1056							sub chr_stdev {
1057	0			0	1	0	my $self = shift;
1058	0	0				0	my $seq_id = shift or return;
1059	0					0	return Bio::DB::USeq::binStdev( $self->chr_stats($seq_id) );
1060							}
1061
1062							sub global_stats {
1063							# this is an expensive proposition, because it must parse through every
1064							# slice in the archive
1065	0			0	1	0	my $self = shift;
1066	0	0				0	$self->_parse_metadata unless %{ $self->{'metadata'} };
	0					0
1067
1068							# return the chromosome stats
1069	0	0				0	if (exists $self->{metadata}{globalStats}) {
1070	0					0	my @data = split(',', $self->{metadata}{globalStats});
1071	0					0	my %stat = (
1072							'validCount' => $data[0],
1073							'sumData' => $data[1],
1074							'sumSquares' => $data[2],
1075							'minVal' => $data[3],
1076							'maxVal' => $data[4],
1077							);
1078	0					0	return \%stat;
1079							}
1080
1081							# calculate new genome-wide statistics from individual chromosome stats
1082	0					0	my $count = 0;
1083	0					0	my $sum;
1084							my $sum_squares;
1085	0					0	my $min;
1086	0					0	my $max;
1087	0					0	foreach my $seq_id ($self->seq_ids) {
1088	0					0	my $stats = $self->chr_stats($seq_id, 1); # delay writing metadata
1089	0					0	$count += $stats->{validCount};
1090	0					0	$sum += $stats->{sumData};
1091	0					0	$sum_squares += $stats->{sumSquares};
1092	0	0	0			0	$min = $stats->{minVal} if (!defined $min or $stats->{minVal} < $min);
1093	0	0	0			0	$max = $stats->{maxVal} if (!defined $max or $stats->{maxVal} < $max);
1094							}
1095
1096							# assemble the statistical summary hash
1097	0		0			0	my %stat = (
			0
			0
			0
1098							'validCount' => $count,
1099							'sumData' => $sum \|\| 0,
1100							'sumSquares' => $sum_squares \|\| 0,
1101							'minVal' => $min \|\| 0,
1102							'maxVal' => $max \|\| 0,
1103							);
1104
1105							# update metadata
1106	0					0	$self->{'metadata'}{'globalStats'} = join(',', map { $stat{$_} }
	0					0
1107							qw(validCount sumData sumSquares minVal maxVal) );
1108	0					0	$self->_rewrite_metadata;
1109
1110	0					0	return \%stat;
1111							}
1112
1113							sub global_mean {
1114	0			0	1	0	my $self = shift;
1115	0					0	return Bio::DB::USeq::binMean( $self->global_stats );
1116							}
1117
1118							sub global_stdev {
1119	0			0	1	0	my $self = shift;
1120	0					0	return Bio::DB::USeq::binStdev( $self->global_stats );
1121							}
1122
1123							#### slice information ####
1124
1125							sub slices {
1126	2			2	1	4	my $self = shift;
1127	2					3	return sort {$a cmp $b} keys %{ $self->{'file2attribute'} };
	0					0
	2					13
1128							}
1129
1130							sub slice_seq_id {
1131	0			0	1	0	my $self = shift;
1132	0	0				0	my $slice = shift or return;
1133	0					0	return $self->{'file2attribute'}{$slice}->[0];
1134							}
1135
1136							sub slice_start {
1137	1087			1087	1	1711	my $self = shift;
1138	1087	50				2189	my $slice = shift or return;
1139	1087					2424	return $self->{'file2attribute'}{$slice}->[1];
1140							}
1141
1142							sub slice_end {
1143	2020			2020	1	2655	my $self = shift;
1144	2020	50				3759	my $slice = shift or return;
1145	2020					5941	return $self->{'file2attribute'}{$slice}->[2];
1146							}
1147
1148							sub slice_strand {
1149	4			4	1	9	my $self = shift;
1150	4	50				16	my $slice = shift or return;
1151	4					23	return $self->{'file2attribute'}{$slice}->[3];
1152							}
1153
1154							sub slice_type {
1155	1020			1020	1	1425	my $self = shift;
1156	1020	50				2040	my $slice = shift or return;
1157	1020					4047	return $self->{'file2attribute'}{$slice}->[4];
1158							}
1159
1160							sub slice_obs_number {
1161	6			6	1	12	my $self = shift;
1162	6	50				15	my $slice = shift or return;
1163	6					19	return $self->{'file2attribute'}{$slice}->[5];
1164							}
1165
1166							sub slice_feature {
1167	0			0	1	0	my $self = shift;
1168	0	0				0	my $slice = shift or return;
1169							return Bio::DB::USeq::Segment->new(
1170							-seq_id => $self->{'file2attribute'}{$slice}->[0],
1171							-start => $self->{'file2attribute'}{$slice}->[1],
1172							-stop => $self->{'file2attribute'}{$slice}->[2],
1173							-strand => $self->{'file2attribute'}{$slice}->[3],
1174	0					0	-type => $self->{'file2attribute'}{$slice}->[4],
1175							-source => $self->name,
1176							-name => $slice,
1177							);
1178							}
1179
1180
1181
1182
1183
1184							#### Feature and data access ####
1185
1186							sub segment {
1187	1			1	1	3	my $self = shift;
1188
1189							# arguments can be chromo, start, stop, strand
1190	1					4	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1191	1	50				4	return unless $self->length($seq_id); # make sure chromosome is represented
1192
1193	1					12	return Bio::DB::USeq::Segment->new(
1194							-seq_id => $seq_id,
1195							-start => $start,
1196							-end => $stop,
1197							-strand => $strand,
1198							-type => 'segment',
1199							-source => $self->name,
1200							-useq => $self,
1201							);
1202							}
1203
1204							sub features {
1205	2			2	1	344	my $self = shift;
1206	2					10	my %args = @_;
1207
1208							# check for type
1209	2					5	my $type;
1210	2		0			7	$args{-type} \|\|= $args{-types} \|\| $args{-primary_tag} \|\| 'region';
			33
1211	2	50	33			11	if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1212	0		0			0	$type = $args{-type}->[0] \|\| 'region';
1213	0					0	$args{-type} = $type;
1214							}
1215							else {
1216	2					6	$type = $args{-type};
1217							}
1218
1219							# return an appropriate feature
1220	2	50				20	if ($type =~ /chromosome/) {
		50
1221							# compatibility to return chromosome features
1222							my @chromos = map {
1223	0					0	Bio::DB::USeq::Feature->new(
	0					0
1224							-seq_id => $_,
1225							-start => 1,
1226							-end => $self->length($_),
1227							-type => $type,
1228							-source => $self->name,
1229							-name => $_,
1230							)
1231							} $self->seq_ids;
1232	0	0				0	return wantarray ? @chromos : \@chromos;
1233							}
1234							elsif ($type =~ /region\|interval\|observation\|coverage\|wiggle\|summary/) {
1235							# region or segment are individual observation features
1236							# coverage or wiggle are for efficient score retrieval with
1237							# backwards compatibility with Bio::Graphics
1238							# summary are statistical summaries akin to Bio::DB::BigFile
1239
1240							# set up an iterator
1241	2					13	my $iterator = $self->get_seq_stream(%args);
1242	2	50				9	return unless $iterator;
1243
1244							# if user requested an iterator
1245	2	0	33			6	if (exists $args{-iterator} and $args{-iterator}) {
1246	0					0	return $iterator;
1247							}
1248
1249							# collect the features
1250	2					4	my @features;
1251	2					7	while (my $f = $iterator->next_seq) {
1252	2					10	push @features, $f;
1253							}
1254	2	50				22	return wantarray ? @features : \@features;
1255							}
1256							else {
1257	0					0	confess "unknown type request '$type'!\n";
1258							}
1259							}
1260
1261
1262							sub get_features_by_type {
1263							# does not work without location information
1264	0			0	0	0	cluck "please use features() method instead";
1265	0					0	return;
1266							}
1267
1268
1269							sub get_features_by_location {
1270	0			0	1	0	my $self = shift;
1271
1272							# arguments can be chromo, start, stop, strand
1273	0					0	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1274
1275	0					0	return $self->features($seq_id, $start, $stop, $strand);
1276							}
1277
1278
1279							sub get_feature_by_id {
1280							# much as Bio::DB::BigWig fakes the id, so we will too here
1281							# I don't know how necessary this really is
1282	0			0	1	0	my $self = shift;
1283
1284							# id will be encoded as chr:start:end ?
1285	0					0	my $id = shift;
1286	0					0	my ($seq_id, $start, $end, $type) = split /:/, $id;
1287
1288	0		0			0	my @list = $self->features(
1289							-seq_id => $seq_id,
1290							-start => $start,
1291							-end => $end,
1292							-type => $type \|\| undef,
1293							);
1294	0	0				0	return unless @list;
1295	0	0				0	return $list[0] if scalar @list == 1;
1296	0					0	foreach my $f (@list) {
1297	0	0	0			0	return $f if ($f->start == $start and $f->end == $end);
1298							}
1299	0					0	return;
1300							}
1301
1302
1303							sub get_feature_by_name {
1304	0			0	1	0	return shift->get_feature_by_id(@_);
1305							}
1306
1307
1308							sub get_seq_stream {
1309	3			3	1	95	my $self = shift;
1310
1311							# arguments can be chromo, start, stop, strand
1312	3					12	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1313	3	50				12	return unless $self->length($seq_id); # make sure chromosome is represented
1314
1315							# check for type
1316	3					10	my %args = @_;
1317	3					6	my $type;
1318	3		50			18	$args{-type} \|\|= $args{-types} \|\| $args{-primary_tag} \|\| 'region';
			66
1319	3	50	33			29	if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1320	0		0			0	$type = $args{-type}->[0] \|\| 'region';
1321							}
1322							else {
1323	3					10	$type = $args{-type};
1324							}
1325
1326	3					14	return Bio::DB::USeq::Iterator->new(
1327							-seq_id => $seq_id,
1328							-start => $start,
1329							-end => $stop,
1330							-strand => $strand,
1331							-type => $type,
1332							-source => $self->name,
1333							-useq => $self,
1334							);
1335							}
1336
1337
1338							sub scores {
1339	1			1	1	59	my $self = shift;
1340
1341							# arguments can be chromo, start, stop, strand
1342	1					4	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1343	1	50				4	return unless $self->length($seq_id); # make sure chromosome is represented
1344
1345							# determine which slices to retrieve
1346	1					4	my @slices = $self->_translate_coordinates_to_slices(
1347							$seq_id, $start, $stop, $strand);
1348	1	50				3	return unless @slices;
1349	1					5	$self->_clear_buffer(\@slices);
1350
1351							# collect the scores from each of the requested slices
1352	1					2	my @scores;
1353	1					3	foreach my $slice (@slices) {
1354
1355							# load and unpack the data
1356	1					4	$self->_load_slice($slice);
1357
1358							# find the overlapping observations
1359	1					66	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1360
1361							# record the scores
1362	1					7	foreach my $r (@$results) {
1363	31	50				65	push @scores, $r->[2] if defined $r->[2];
1364							}
1365							}
1366
1367	1	50				25	return wantarray ? @scores : \@scores;
1368							}
1369
1370							sub observations {
1371	0			0	1	0	my $self = shift;
1372
1373							# arguments can be chromo, start, stop, strand
1374	0					0	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1375	0	0				0	return unless $self->length($seq_id); # make sure chromosome is represented
1376
1377							# determine which slices to retrieve
1378	0					0	my @slices = $self->_translate_coordinates_to_slices(
1379							$seq_id, $start, $stop, $strand);
1380	0	0				0	return unless @slices;
1381	0					0	$self->_clear_buffer(\@slices);
1382
1383							# collect the scores from each of the requested slices
1384	0					0	my @observations;
1385	0					0	foreach my $slice (@slices) {
1386							# load and unpack the data
1387	0					0	$self->_load_slice($slice);
1388
1389							# find the overlapping observations
1390	0					0	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1391	0					0	push @observations, @$results;
1392							}
1393
1394	0	0				0	return wantarray ? @observations : \@observations;
1395							}
1396
1397
1398
1399
1400							#### Private methods ####
1401
1402							sub _parse_metadata {
1403	1			1		2	my $self = shift;
1404
1405							# the metadata file should always be present in a USeq file
1406	1					5	my $readMe = $self->{'zip'}->contents('archiveReadMe.txt');
1407	1	50				1366	unless ($readMe) {
1408	0					0	carp " USeq file is malformed! It does not contain an archiveReadMe.txt file\n";
1409	0					0	return;
1410							}
1411
1412							# parse the archive file
1413							# this is a simple text file, each line is key = value
1414	1					4	$self->{'metadata'}{'comments'} = [];
1415	1					6	foreach (split /\n/, $readMe) {
1416	5	50				15	if (/^#/) {
1417	0	0				0	push @{ $self->{'metadata'}{'comments'} }, $_ unless $_ =~ /validCount/;
	0					0
1418	0					0	next;
1419							}
1420	5	50				21	if (/^\s([^=\s]+)\s=\s(.+)\s$/) {
1421							# separate key = value pairs tolerating whitespace
1422							# key may contain anything excluding = and whitespace
1423	5					27	$self->{'metadata'}{$1} = $2;
1424							}
1425							}
1426	1					4	return 1;
1427							}
1428
1429
1430							sub _parse_members {
1431	2			2		5	my $self = shift;
1432
1433							# there is a lot of information encoded in each zip member file name
1434							# the chromosome, strand, coordinates of represented interval, and file type
1435							# this parses and indexes this metadata into a usable format
1436
1437	2					4	my @errors;
1438	2					8	foreach my $member ($self->{'zip'}->memberNames) {
1439
1440							# archive readMe
1441	8	100				95	next if ($member eq 'archiveReadMe.txt');
1442
1443							# data file
1444	6					52	my ($chromo, $strand, $start, $stop, $number, $extension) =
1445							($member =~ /^([\w\-\.]+)([\+\-\.])(\d+)\-(\d+)\-(\d+)\.(\w+)$/i);
1446
1447							# check extracted metadata
1448	6	50	33			64	unless ($chromo and $strand and defined $start and defined $stop and
			33
			33
			33
			33
1449							$number and $extension) {
1450	0					0	push @errors, " data slice $member";
1451	0					0	next;
1452							}
1453
1454							# check stranded data
1455	6	100				15	unless (defined $self->{'stranded'}) {
1456	2	100				7	if ($strand eq '.') {
1457	1					3	$self->{'stranded'} = 0;
1458							}
1459							else {
1460	1					4	$self->{'stranded'} = 1;
1461							}
1462							}
1463
1464							# check seq_ids
1465							# record the length for each seq_id, which may or may not be entirely
1466							# accurate, since it is just the last interval position
1467	6	100				14	if (exists $self->{'seq_ids'}{$chromo} ) {
1468	3	50				10	if ($stop > $self->{'seq_ids'}{$chromo}) {
1469	3					7	$self->{'seq_ids'}{$chromo} = $stop;
1470							}
1471							}
1472							else {
1473	3					9	$self->{'seq_ids'}{$chromo} = $stop;
1474							}
1475
1476							# convert to BioPerl convention
1477	6	50				29	$strand = $strand eq '+' ? 1 : $strand eq '.' ? 0 : $strand eq '-' ? -1 : 0;
		100
		100
1478	6					13	$start += 1;
1479
1480							# store the member details for each member slice
1481	6					24	$self->{'file2attribute'}{$member} =
1482							[$chromo, $start, $stop, $strand, $extension, $number];
1483
1484							# store the member into a chromosome-strand-specific interval tree
1485	6		66			49	$self->{'coord2file'}{$chromo}{$strand} \|\|= Set::IntervalTree->new();
1486	6					27	$self->{'coord2file'}{$chromo}{$strand}->insert($member, $start, $stop);
1487							}
1488
1489							# check parsing
1490	2	50				16	if (@errors) {
1491	0					0	carp "Errors parsing data slice filenames:\n" . join("\n", @errors) . "\n";
1492							}
1493	2	50				5	unless (%{ $self->{'coord2file'} }) {
	2					9
1494	0					0	carp "no data slices present in USeq archive!\n";
1495	0					0	return;
1496							}
1497
1498	2					7	return 1;
1499							}
1500
1501
1502							sub _get_coordinates {
1503	5			5		9	my $self = shift;
1504
1505	5					10	my ($seq_id, $start, $stop, $strand);
1506	5	50				26	if ($_[0] =~ /^\-/) {
1507	5					23	my %args = @_;
1508	5		0			19	$seq_id = $args{'-seq_id'} \|\| $args{'-chromo'} \|\| undef;
1509	5		0			16	$start = $args{'-start'} \|\| $args{'-pos'} \|\| undef;
1510	5		0			13	$stop = $args{'-end'} \|\| $args{'-stop'} \|\| undef;
1511	5		50			29	$strand = $args{'-strand'} \|\| '0'; # unstranded
1512							}
1513							else {
1514	0					0	($seq_id, $start, $stop, $strand) = @_;
1515							}
1516	5	50				11	unless ($seq_id) {
1517	0					0	cluck "no sequence ID provided!";
1518	0					0	return;
1519							}
1520	5		50			14	$start \|\|= 1;
1521	5		33			27	$stop \|\|= $self->length($seq_id);
1522	5		50			20	$strand \|\|= 0;
1523	5					19	return ($seq_id, $start, $stop, $strand);
1524							}
1525
1526
1527							sub _translate_coordinates_to_slices {
1528	6			6		13	my $self = shift;
1529	6					17	my ($seq_id, $start, $stop, $strand) = @_;
1530	6	50				29	return unless (exists $self->{'coord2file'}{$seq_id});
1531
1532							# check strand request
1533	6					11	my $both = 0;
1534	6	50				19	if ($strand == 0) {
1535							# strand was not specified,
1536							# but collect from both strands if we have stranded data
1537	6	100				15	$both = 1 if $self->stranded;
1538							}
1539							else {
1540							# strand was specified,
1541							# but convert it to unstranded if the data is not stranded
1542	0	0				0	$strand = 0 unless $self->stranded;
1543							}
1544
1545							# look for the overlapping slices
1546	6					13	my @slices;
1547	6	100				17	if ($both) {
1548							# need to collect from both strands
1549							# plus strand first, then minus strand
1550	1					9	my $results = $self->{'coord2file'}{$seq_id}{1}->fetch($start, $stop);
1551	1					5	my $results2 = $self->{'coord2file'}{$seq_id}{-1}->fetch($start, $stop);
1552	1					5	push @slices, @$results, @$results2;
1553							}
1554
1555							# specific strand
1556							else {
1557	5					56	my $results = $self->{'coord2file'}{$seq_id}{$strand}->fetch($start, $stop);
1558	5					16	push @slices, @$results;
1559							}
1560
1561	6					22	return @slices;
1562							}
1563
1564
1565							sub _clear_buffer {
1566	6			6		11	my $self = shift;
1567
1568							# make a quick hash of wanted slices
1569	6					12	my %wanted = map {$_ => 1} @{$_[0]};
	9					31
	6					25
1570
1571							# delete the existing buffers of slices we do not want
1572	6					11	foreach (keys %{ $self->{buffer} }) {
	6					28
1573	5	100				8151	delete $self->{buffer}{$_} unless exists $wanted{$_};
1574							}
1575							}
1576
1577							sub _load_slice {
1578	1016			1016		1396	my $self = shift;
1579	1016					1507	my $slice = shift;
1580	1016	100				2379	return if (exists $self->{'buffer'}{$slice});
1581
1582							# each slice is parsed into observations consisting of start, stop, and
1583							# optionally score and text depending on type
1584							# these are stored as anonymous arrays [start, stop, score, text]
1585							# for quick retrieval each observation interval is stored in a Set::IntervalTree
1586							# these operations maintain the original 0-based coordinate system
1587
1588	7					22	my $type = $self->slice_type($slice);
1589	7	50				59	if ($type eq 's') { $self->_load_s_slice($slice) }
	0	50				0
		50
		50
		100
		50
		50
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1590	0					0	elsif ($type eq 'sf') { $self->_load_sf_slice($slice) }
1591	0					0	elsif ($type eq 'st') { $self->_load_st_slice($slice) }
1592	0					0	elsif ($type eq 'ss') { $self->_load_ss_slice($slice) }
1593	6					39	elsif ($type eq 'ssf') { $self->_load_ssf_slice($slice) }
1594	0					0	elsif ($type eq 'sst') { $self->_load_sst_slice($slice) }
1595	1					5	elsif ($type eq 'ssft') { $self->_load_ssft_slice($slice) }
1596	0					0	elsif ($type eq 'i') { $self->_load_i_slice($slice) }
1597	0					0	elsif ($type eq 'if') { $self->_load_if_slice($slice) }
1598	0					0	elsif ($type eq 'it') { $self->_load_it_slice($slice) }
1599	0					0	elsif ($type eq 'ii') { $self->_load_ii_slice($slice) }
1600	0					0	elsif ($type eq 'iif') { $self->_load_iif_slice($slice) }
1601	0					0	elsif ($type eq 'iit') { $self->_load_iit_slice($slice) }
1602	0					0	elsif ($type eq 'iift') { $self->_load_iift_slice($slice) }
1603	0					0	elsif ($type eq 'is') { $self->_load_is_slice($slice) }
1604	0					0	elsif ($type eq 'isf') { $self->_load_isf_slice($slice) }
1605	0					0	elsif ($type eq 'ist') { $self->_load_ist_slice($slice) }
1606	0					0	elsif ($type eq 'isft') { $self->_load_isft_slice($slice) }
1607							else {
1608	0					0	croak "unable to load slice '$slice'! Unsupported slice type $type\n";
1609							}
1610							}
1611
1612
1613							sub _load_s_slice {
1614	0			0		0	my $self = shift;
1615	0					0	my $slice = shift;
1616	0					0	my $tree = Set::IntervalTree->new();
1617
1618							# unpack the data slice zip member
1619	0					0	my $number = $self->slice_obs_number($slice);
1620	0					0	my ($null, @data) = unpack("si>(s>)$number", $self->zip->contents($slice) );
1621
1622							# convert the unpacked data into start, stop, score
1623							# first observation
1624	0					0	my $position = shift @data;
1625	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1626
1627							# remaining observations
1628	0					0	while (@data) {
1629	0					0	$position += (shift @data) + 32768;
1630	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1631							}
1632
1633							# store Interval Tree buffer
1634	0					0	$self->{buffer}{$slice} = $tree;
1635							}
1636
1637							sub _load_sf_slice {
1638	0			0		0	my $self = shift;
1639	0					0	my $slice = shift;
1640	0					0	my $tree = Set::IntervalTree->new();
1641
1642							# unpack the data slice zip member
1643	0					0	my $number = $self->slice_obs_number($slice);
1644	0					0	my ($null, @data) = unpack("si>f>(s>f>)$number", $self->zip->contents($slice) );
1645
1646							# convert the unpacked data into start, stop, score
1647							# first observation
1648	0					0	my $position = shift @data;
1649	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1650
1651							# remaining observations
1652	0					0	while (@data) {
1653	0					0	$position += (shift @data) + 32768;
1654	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1655							}
1656
1657							# store Interval Tree buffer
1658	0					0	$self->{buffer}{$slice} = $tree;
1659							}
1660
1661							sub _load_st_slice {
1662	0			0		0	my $self = shift;
1663	0					0	my $slice = shift;
1664	0					0	my $tree = Set::IntervalTree->new();
1665
1666							# convert the unpacked data into start, stop, (score), text
1667
1668							# load the slice contents
1669	0					0	my $contents = $self->zip->contents($slice);
1670
1671							# first observation
1672	0					0	my $i = 8; # go ahead and set index up to first observation text
1673	0					0	my (undef, $position, $t) = unpack('si>s>', substr($contents, 0, $i));
1674							# initial null, position, text_length
1675	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1676	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1677	0					0	$i += $t;
1678
1679							# remaining observations
1680	0					0	my $p; # position offset
1681	0					0	my $len = CORE::length($contents);
1682	0					0	while ($i < $len) {
1683	0					0	my ($p, $t) = unpack('s>s>', substr($contents, $i, 4));
1684	0					0	$i += 4;
1685	0					0	$position += $p + 32768;
1686	0					0	$text = unpack("A$t", substr($contents, $i, $t));
1687	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1688	0					0	$i += $t;
1689							}
1690
1691							# store Interval Tree buffer
1692	0					0	$self->{buffer}{$slice} = $tree;
1693							}
1694
1695							sub _load_ss_slice {
1696	0			0		0	my $self = shift;
1697	0					0	my $slice = shift;
1698	0					0	my $tree = Set::IntervalTree->new();
1699
1700							# unpack the data slice zip member
1701	0					0	my $number = $self->slice_obs_number($slice);
1702	0					0	my ($null, @data) = unpack("si>s>(s>s>)$number", $self->zip->contents($slice) );
1703
1704							# convert the unpacked data into start, stop, no score
1705							# first observation
1706	0					0	my $position = @data;
1707	0					0	my $position2 = $position + (shift @data) + 32768;
1708	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1709
1710							# remaining observations
1711	0					0	while (@data) {
1712	0					0	$position += (shift @data) + 32768;
1713	0					0	$position2 = $position + (shift @data) + 32768;
1714	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1715							}
1716
1717							# store Interval Tree buffer
1718	0					0	$self->{buffer}{$slice} = $tree;
1719							}
1720
1721							sub _load_ssf_slice {
1722	6			6		15	my $self = shift;
1723	6					11	my $slice = shift;
1724	6					57	my $tree = Set::IntervalTree->new();
1725
1726							# unpack the data slice zip member
1727	6					21	my $number = $self->slice_obs_number($slice);
1728	6					31	my ($null, @data) = unpack("si>s>f>(s>s>f>)$number", $self->zip->contents($slice) );
1729
1730							# convert the unpacked data into start, stop, score
1731							# first observation
1732	6					35114	my $position = (shift @data);
1733	6					25	my $position2 = $position + (shift @data) + 32768;
1734	6					58	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1735
1736							# remaining observations
1737	6					28	while (@data) {
1738	58298					83341	$position += (shift @data) + 32768;
1739	58298					77348	$position2 = $position + (shift @data) + 32768;
1740	58298					175094	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1741							}
1742
1743							# store Interval Tree buffer
1744	6					51	$self->{buffer}{$slice} = $tree;
1745							}
1746
1747							sub _load_sst_slice {
1748	0			0		0	my $self = shift;
1749	0					0	my $slice = shift;
1750	0					0	my $tree = Set::IntervalTree->new();
1751	0					0	my $contents = $self->zip->contents($slice);
1752
1753							# first observation
1754	0					0	my $i = 10; # go ahead and set index up to first observation text
1755	0					0	my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
1756							# initial null, position, length, text_length
1757	0					0	my $position2 = $position + $l + 32768;
1758	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1759	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
1760	0					0	$i += $t;
1761
1762							# remaining observations
1763	0					0	my $p; # position offset
1764	0					0	my $len = CORE::length($contents);
1765	0					0	while ($i < $len) {
1766	0					0	($p, $l, $t) = unpack('s>s>s>', substr($contents, $i, 6));
1767	0					0	$i += 6;
1768	0					0	$position += $p + 32768;
1769	0					0	$position2 = $position + $l + 32768;
1770	0					0	$text = unpack("A$t", substr($contents, $i, $t));
1771	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
1772	0					0	$i += $t;
1773							}
1774
1775							# store Interval Tree buffer
1776	0					0	$self->{buffer}{$slice} = $tree;
1777							}
1778
1779							sub _load_ssft_slice {
1780	1			1		2	my $self = shift;
1781	1					2	my $slice = shift;
1782	1					5	my $tree = Set::IntervalTree->new();
1783	1					4	my $contents = $self->zip->contents($slice);
1784
1785							# first observation
1786	1					4361	my $i = 14; # go ahead and set index up to first observation text
1787	1					7	my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
1788							# initial null, position, length, score, text_length
1789	1					4	my $position2 = $position + $l + 32768;
1790	1					7	my $text = unpack("A$t", substr($contents, $i, $t));
1791	1					7	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
1792	1					1	$i += $t;
1793
1794							# remaining observations
1795	1					2	my $p; # position offset
1796	1					3	my $len = CORE::length($contents);
1797	1					4	while ($i < $len) {
1798	10000					24967	($p, $l, $s, $t) = unpack('s>s>f>s>', substr($contents, $i, 10));
1799	10000					15473	$i += 10;
1800	10000					12970	$position += $p + 32768;
1801	10000					12770	$position2 = $position + $l + 32768;
1802	10000					22424	$text = unpack("A$t", substr($contents, $i, $t));
1803	10000					36496	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
1804	10000					18421	$i += $t;
1805							}
1806
1807							# store Interval Tree buffer
1808	1					10	$self->{buffer}{$slice} = $tree;
1809							}
1810
1811							sub _load_i_slice {
1812	0			0		0	my $self = shift;
1813	0					0	my $slice = shift;
1814	0					0	my $tree = Set::IntervalTree->new();
1815
1816							# unpack the data slice zip member
1817	0					0	my $number = $self->slice_obs_number($slice);
1818	0					0	my ($null, @data) = unpack("si>(i>)$number", $self->zip->contents($slice) );
1819
1820							# convert the unpacked data into start, stop, score
1821							# first observation
1822	0					0	my $position = @data;
1823	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1824
1825							# remaining observations
1826	0					0	while (@data) {
1827	0					0	$position += (shift @data);
1828	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1829							}
1830
1831							# store Interval Tree buffer
1832	0					0	$self->{buffer}{$slice} = $tree;
1833							}
1834
1835							sub _load_if_slice {
1836	0			0		0	my $self = shift;
1837	0					0	my $slice = shift;
1838	0					0	my $tree = Set::IntervalTree->new();
1839
1840							# unpack the data slice zip member
1841	0					0	my $number = $self->slice_obs_number($slice);
1842	0					0	my ($null, @data) = unpack("si>f>(i>f>)$number", $self->zip->contents($slice) );
1843
1844							# convert the unpacked data into start, stop, score
1845							# first observation
1846	0					0	my $position = @data;
1847	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1848
1849							# remaining observations
1850	0					0	while (@data) {
1851	0					0	$position += (shift @data);
1852	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1853							}
1854
1855							# store Interval Tree buffer
1856	0					0	$self->{buffer}{$slice} = $tree;
1857							}
1858
1859							sub _load_it_slice {
1860	0			0		0	my $self = shift;
1861	0					0	my $slice = shift;
1862	0					0	my $tree = Set::IntervalTree->new();
1863
1864							# convert the unpacked data into start, stop, (score), text
1865
1866							# load the slice contents
1867	0					0	my $contents = $self->zip->contents($slice);
1868
1869							# first observation
1870	0					0	my $i = 10; # go ahead and set index up to first observation text
1871	0					0	my (undef, $position, $t) = unpack('si>i>', substr($contents, 0, $i));
1872							# initial null, position, text_length
1873	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1874	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1875	0					0	$i += $t;
1876
1877							# remaining observations
1878	0					0	my $p; # position offset
1879	0					0	my $len = CORE::length($contents);
1880	0					0	while ($i < $len) {
1881	0					0	($p, $t) = unpack('i>s>', substr($contents, $i, 6));
1882	0					0	$i += 6;
1883	0					0	$position += $p + 32768;
1884	0					0	$text = unpack("A$t", substr($contents, $i, $t));
1885	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1886	0					0	$i += $t;
1887							}
1888
1889							# store Interval Tree buffer
1890	0					0	$self->{buffer}{$slice} = $tree;
1891							}
1892
1893							sub _load_ii_slice {
1894	0			0		0	my $self = shift;
1895	0					0	my $slice = shift;
1896	0					0	my $tree = Set::IntervalTree->new();
1897
1898							# unpack the data slice zip member
1899	0					0	my $number = $self->slice_obs_number($slice);
1900	0					0	my ($null, @data) = unpack("si>i>(i>i>)$number", $self->zip->contents($slice) );
1901
1902							# convert the unpacked data into start, stop, score
1903							# first observation
1904	0					0	my $position = @data;
1905	0					0	my $position2 = $position + (shift @data);
1906	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1907
1908							# remaining observations
1909	0					0	while (@data) {
1910	0					0	$position += (shift @data);
1911	0					0	$position2 = $position + (shift @data);
1912	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1913							}
1914
1915							# store Interval Tree buffer
1916	0					0	$self->{buffer}{$slice} = $tree;
1917							}
1918
1919							sub _load_iif_slice {
1920	0			0		0	my $self = shift;
1921	0					0	my $slice = shift;
1922	0					0	my $tree = Set::IntervalTree->new();
1923
1924							# unpack the data slice zip member
1925	0					0	my $number = $self->slice_obs_number($slice);
1926	0					0	my ($null, @data) = unpack("si>i>f>(i>i>f>)$number", $self->zip->contents($slice) );
1927
1928							# convert the unpacked data into start, stop, score
1929							# first observation
1930	0					0	my $position = @data;
1931	0					0	my $position2 = $position + (shift @data);
1932	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1933
1934							# remaining observations
1935	0					0	while (@data) {
1936	0					0	$position += (shift @data);
1937	0					0	$position2 = $position + (shift @data);
1938	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1939							}
1940
1941							# store Interval Tree buffer
1942	0					0	$self->{buffer}{$slice} = $tree;
1943							}
1944
1945							sub _load_iit_slice {
1946	0			0		0	my $self = shift;
1947	0					0	my $slice = shift;
1948	0					0	my $tree = Set::IntervalTree->new();
1949	0					0	my $contents = $self->zip->contents($slice);
1950
1951							# first observation
1952	0					0	my $i = 12; # go ahead and set index up to first observation text
1953	0					0	my (undef, $position, $l, $t) = unpack('si>i>s>', substr($contents, 0, $i));
1954							# initial null, position, length, text_length
1955	0					0	my $position2 = $position + $l;
1956	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1957	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
1958	0					0	$i += $t;
1959
1960							# remaining observations
1961	0					0	my $p; # position offset
1962	0					0	my $len = CORE::length($contents);
1963	0					0	while ($i < $len) {
1964	0					0	($p, $l, $t) = unpack('i>i>s>', substr($contents, $i, 10));
1965	0					0	$i += 10;
1966	0					0	$position += $p;
1967	0					0	$position2 = $position + $l;
1968	0					0	$text = unpack("A$t", substr($contents, $i, $t));
1969	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
1970	0					0	$i += $t;
1971							}
1972
1973							# store Interval Tree buffer
1974	0					0	$self->{buffer}{$slice} = $tree;
1975							}
1976
1977							sub _load_iift_slice {
1978	0			0		0	my $self = shift;
1979	0					0	my $slice = shift;
1980	0					0	my $tree = Set::IntervalTree->new();
1981	0					0	my $contents = $self->zip->contents($slice);
1982
1983							# first observation
1984	0					0	my $i = 16; # go ahead and set index up to first observation text
1985	0					0	my (undef, $position, $l, $s, $t) = unpack('si>i>f>s>', substr($contents, 0, $i));
1986							# initial null, position, length, score, text_length
1987	0					0	my $position2 = $position + $l;
1988	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1989	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
1990	0					0	$i += $t;
1991
1992							# remaining observations
1993	0					0	my $p; # position offset
1994	0					0	my $len = CORE::length($contents);
1995	0					0	while ($i < $len) {
1996	0					0	($p, $l, $s, $t) = unpack('i>i>f>s>', substr($contents, $i, 14));
1997	0					0	$i += 14;
1998	0					0	$position += $p;
1999	0					0	$position2 = $position + $l;
2000	0					0	$text = unpack("A$t", substr($contents, $i, $t));
2001	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
2002	0					0	$i += $t;
2003							}
2004
2005							# store Interval Tree buffer
2006	0					0	$self->{buffer}{$slice} = $tree;
2007							}
2008
2009							sub _load_is_slice {
2010	0			0		0	my $self = shift;
2011	0					0	my $tree = Set::IntervalTree->new();
2012	0					0	my $slice = shift;
2013
2014							# unpack the data slice zip member
2015	0					0	my $number = $self->slice_obs_number($slice);
2016	0					0	my ($null, @data) = unpack("si>s>(i>s>)$number", $self->zip->contents($slice) );
2017
2018							# convert the unpacked data into start, stop, score
2019							# first observation
2020	0					0	my $position = @data;
2021	0					0	my $position2 = $position + (shift @data) + 32768;
2022	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
2023
2024							# remaining observations
2025	0					0	while (@data) {
2026	0					0	$position += (shift @data);
2027	0					0	$position2 = $position + (shift @data) + 32768;
2028	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
2029							}
2030
2031							# store Interval Tree buffer
2032	0					0	$self->{buffer}{$slice} = $tree;
2033							}
2034
2035							sub _load_isf_slice {
2036	0			0		0	my $self = shift;
2037	0					0	my $slice = shift;
2038	0					0	my $tree = Set::IntervalTree->new();
2039
2040							# unpack the data slice zip member
2041	0					0	my $number = $self->slice_obs_number($slice);
2042	0					0	my ($null, @data) = unpack("si>s>f>(i>s>f>)$number", $self->zip->contents($slice) );
2043
2044							# convert the unpacked data into start, stop, score
2045							# first observation
2046	0					0	my $position = @data;
2047	0					0	my $position2 = $position + (shift @data) + 32768;
2048	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2049
2050							# remaining observations
2051	0					0	while (@data) {
2052	0					0	$position += (shift @data);
2053	0					0	$position2 = $position + (shift @data) + 32768;
2054	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2055							}
2056
2057							# store Interval Tree buffer
2058	0					0	$self->{buffer}{$slice} = $tree;
2059							}
2060
2061							sub _load_ist_slice {
2062	0			0		0	my $self = shift;
2063	0					0	my $slice = shift;
2064	0					0	my $tree = Set::IntervalTree->new();
2065	0					0	my $contents = $self->zip->contents($slice);
2066
2067							# first observation
2068	0					0	my $i = 10; # go ahead and set index up to first observation text
2069	0					0	my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
2070							# initial null, position, length, text_length
2071	0					0	my $position2 = $position + $l + 32768;
2072	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
2073	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
2074	0					0	$i += $t;
2075
2076							# remaining observations
2077	0					0	my $p; # position offset
2078	0					0	my $len = CORE::length($contents);
2079	0					0	while ($i < $len) {
2080	0					0	($p, $l, $t) = unpack('i>s>s>', substr($contents, $i, 8));
2081	0					0	$i += 8;
2082	0					0	$position += $p;
2083	0					0	$position2 = $position + $l + 32768;
2084	0					0	$text = unpack("A$t", substr($contents, $i, $t));
2085	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
2086	0					0	$i += $t;
2087							}
2088
2089							# store Interval Tree buffer
2090	0					0	$self->{buffer}{$slice} = $tree;
2091							}
2092
2093							sub _load_isft_slice {
2094	0			0		0	my $self = shift;
2095	0					0	my $slice = shift;
2096	0					0	my $tree = Set::IntervalTree->new();
2097	0					0	my $contents = $self->zip->contents($slice);
2098
2099							# first observation
2100	0					0	my $i = 14; # go ahead and set index up to first observation text
2101	0					0	my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
2102							# initial null, position, length, score, text_length
2103	0					0	my $position2 = $position + $l + 32768;
2104	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
2105	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
2106	0					0	$i += $t;
2107
2108							# remaining observations
2109	0					0	my $p; # position offset
2110	0					0	while ($i < CORE::length($contents)) {
2111	0					0	($p, $l, $s, $t) = unpack('i>s>f>s>', substr($contents, $i, 12));
2112	0					0	$i += 12;
2113	0					0	$position += $p;
2114	0					0	$position2 = $position + $l + 32768;
2115	0					0	$text = unpack("A$t", substr($contents, $i, $t));
2116	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
2117	0					0	$i += $t;
2118							}
2119
2120							# store Interval Tree buffer
2121	0					0	$self->{buffer}{$slice} = $tree;
2122							}
2123
2124							sub _mean_score {
2125	1000			1000		1383	my $self = shift;
2126	1000					1864	my ($start, $stop, $slices) = @_;
2127	1000	50				1771	return unless @$slices;
2128
2129							# collect the scores from each of the requested slices
2130	1000					1259	my $sum;
2131	1000					1481	my $count = 0;
2132	1000					1482	foreach my $slice (@$slices) {
2133	1001	50				1773	next unless $self->slice_type($slice) =~ /f/;
2134							# no sense going through if no score present
2135
2136							# load and unpack the data
2137	1001					2672	$self->_load_slice($slice);
2138
2139							# find the overlapping observations and record values
2140	1001					111068	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2141	1001					2748	foreach my $r (@$results) {
2142	2276		100			5843	$sum += $r->[2] \|\| 0;
2143	2276					4221	$count++;
2144							}
2145							}
2146	1000	50				5129	return $count ? $sum / $count : undef;
2147							}
2148
2149							sub _stat_summary {
2150	11			11		20	my $self = shift;
2151	11					30	my ($start, $stop, $slices) = @_;
2152	11	50				27	return unless @$slices;
2153
2154							# initialize the statistical scores
2155	11					19	my $count = 0;
2156	11					30	my $sum;
2157							my $sum_squares;
2158	11					0	my $min;
2159	11					0	my $max;
2160
2161							# collect the scores from each of the requested slices
2162	11					20	foreach my $slice (@$slices) {
2163	12	50				45	next unless $self->slice_type($slice) =~ /f/;
2164							# no sense going through if no score present
2165
2166							# load and unpack the data
2167	12					44	$self->_load_slice($slice);
2168
2169							# find the overlapping observations
2170	12					3675	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2171	12					51	foreach my $r (@$results) {
2172							# each observation is [start, stop, score]
2173	21145	50				36885	if (defined $r->[2]) {
2174	21145					25897	$count++;
2175	21145					26920	$sum += $r->[2];
2176	21145					27491	$sum_squares += ($r->[2] * $r->[2]);
2177	21145	100	100			56032	$min = $r->[2] if (!defined $min or $r->[2] < $min);
2178	21145	100	100			57879	$max = $r->[2] if (!defined $max or $r->[2] > $max);
2179							}
2180							}
2181							}
2182
2183							# assemble the statistical summary hash
2184	11		50			159	my %summary = (
			50
			50
			50
2185							'validCount' => $count,
2186							'sumData' => $sum \|\| 0,
2187							'sumSquares' => $sum_squares \|\| 0,
2188							'minVal' => $min \|\| 0,
2189							'maxVal' => $max \|\| 0,
2190							);
2191	11					110	return \%summary;
2192							}
2193
2194							sub _rewrite_metadata {
2195	1			1		3	my $self = shift;
2196	1	50				4	return unless (-w $self->zip->fileName);
2197	1					58	my $md = $self->{'metadata'};
2198
2199							# generate new metadata as an array
2200	1					3	my @new_md;
2201	1					7	push @new_md, "useqArchiveVersion = " . $md->{useqArchiveVersion};
2202	1	50				4	push @new_md, @{ $md->{comments} } if exists $md->{comments};
	1					6
2203	1					4	push @new_md, "dataType = " . $md->{dataType};
2204	1					3	push @new_md, "versionedGenome = " . $md->{versionedGenome};
2205
2206							# additional keys that may be present
2207	1					8	foreach (keys %$md) {
2208	7	100				40	next if /useqArchiveVersion\|dataType\|versionedGenome\|comments\|chromStats\|globalStats/;
2209	2					8	push @new_md, "$_ = $md->{$_}";
2210							}
2211
2212							# global and chromosome stats
2213	1					3	push @new_md,
2214							"# Bio::DB::USeq statistics validCount,sumData,sumSquares,minVal,maxVal";
2215	1	50				5	push @new_md, "globalStats = $md->{globalStats}" if exists $md->{globalStats};
2216	1					8	foreach (sort {$a cmp $b} keys %$md) {
	13					22
2217	7	100				22	push @new_md, "$_ = $md->{$_}" if /chromStats/;
2218							}
2219
2220							# write the new metadata to the zip Archive
2221	1					11	$self->{'zip'}->contents('archiveReadMe.txt', join("\n", @new_md));
2222	1					713	$self->{'zip'}->overwrite;
2223	1					8302	$self->clone; # not sure if this is necessary, but just in case we reopen the zip
2224							}
2225
2226
2227							######## Exported Functions ########
2228							# these are borrowed from Bio::DB::BigWig from Lincoln Stein
2229
2230							sub binMean {
2231	2			2	1	126	my $score = shift;
2232	2	50				9	return 0 unless $score->{validCount};
2233	2					11	$score->{sumData}/$score->{validCount};
2234							}
2235
2236							sub binVariance {
2237	1			1	1	2	my $score = shift;
2238	1	50				5	return 0 unless $score->{validCount};
2239	1					8	my $var = $score->{sumSquares} - $score->{sumData}**2/$score->{validCount};
2240	1	50				3	if ($score->{validCount} > 1) {
2241	1					5	$var /= $score->{validCount}-1;
2242							}
2243	1	50				4	return 0 if $var < 0;
2244	1					4	return $var;
2245							}
2246
2247							sub binStdev {
2248	1			1	1	5	my $score = shift;
2249	1					5	return sqrt(binVariance($score));
2250							}
2251
2252
2253
2254
2255							######## Other Classes #############
2256
2257							package Bio::DB::USeq::Feature;
2258	1			1		14	use base 'Bio::SeqFeature::Lite';
	1					3
	1					702
2259
2260							sub new {
2261	13			13		42	my $class = shift;
2262	13					60	return $class->SUPER::new(@_);
2263							}
2264
2265							sub type {
2266							# Bio::SeqFeature::Lite mangles the type and returns
2267							# primary_tag:source if both are set
2268							# this may wreck havoc with parsers when the type already has a :
2269							# as in wiggle:1000
2270	16			16		161	my $self = shift;
2271	16					105	return $self->{type};
2272							}
2273
2274							sub chr_stats {
2275	0			0		0	my $self = shift;
2276	0	0				0	return unless exists $self->{useq};
2277	0					0	return $self->{useq}->chr_stats( $self->seq_id );
2278							}
2279
2280							sub chr_mean {
2281	0			0		0	my $self = shift;
2282	0	0				0	return unless exists $self->{useq};
2283	0					0	return $self->{useq}->chr_mean( $self->seq_id );
2284							}
2285
2286							sub chr_stdev {
2287	0			0		0	my $self = shift;
2288	0	0				0	return unless exists $self->{useq};
2289	0					0	return $self->{useq}->chr_stdev( $self->seq_id );
2290							}
2291
2292							sub global_stats {
2293	0			0		0	my $self = shift;
2294	0	0				0	return unless exists $self->{useq};
2295	0					0	return $self->{useq}->global_stats( $self->seq_id );
2296							}
2297
2298							sub global_mean {
2299	0			0		0	my $self = shift;
2300	0	0				0	return unless exists $self->{useq};
2301	0					0	return $self->{useq}->global_mean( $self->seq_id );
2302							}
2303
2304							sub global_stdev {
2305	0			0		0	my $self = shift;
2306	0	0				0	return unless exists $self->{useq};
2307	0					0	return $self->{useq}->global_stdev( $self->seq_id );
2308							}
2309
2310
2311
2312
2313							package Bio::DB::USeq::Segment;
2314	1			1		74120	use base 'Bio::DB::USeq::Feature';
	1					2
	1					907
2315
2316							sub new {
2317	1			1		2	my $class = shift;
2318	1					6	my %args = @_;
2319	1					10	my $segment = $class->SUPER::new(@_);
2320	1	50				90	$segment->{'useq'} = $args{'-useq'} if exists $args{'-useq'};
2321	1					5	return $segment;
2322							}
2323
2324							sub scores {
2325	1			1		141	my $self = shift;
2326	1					7	return $self->{'useq'}->scores(
2327							-seq_id => $self->seq_id,
2328							-start => $self->start,
2329							-end => $self->end,
2330							-strand => $self->strand,
2331							);
2332							}
2333
2334							sub features {
2335	0			0		0	my $self = shift;
2336	0					0	my $type = shift;
2337	0		0			0	$type \|\|= 'region';
2338	0					0	return $self->{'useq'}->features(
2339							-seq_id => $self->seq_id,
2340							-start => $self->start,
2341							-end => $self->end,
2342							-strand => $self->strand,
2343							-type => $type,
2344							);
2345							}
2346
2347							sub get_seq_stream {
2348	1			1		354	my $self = shift;
2349	1					2	my $type = shift;
2350	1		50			9	$type \|\|= 'region';
2351							return Bio::DB::USeq::Iterator->new(
2352							-seq_id => $self->seq_id,
2353							-start => $self->start,
2354							-end => $self->end,
2355							-strand => $self->strand,
2356							-source => $self->source,
2357							-type => $type,
2358							-useq => $self->{useq},
2359	1					8	);
2360							}
2361
2362							sub slices {
2363	0			0		0	my $self = shift;
2364	0					0	my @slices = $self->{'useq'}->_translate_coordinates_to_slices(
2365							$self->seq_id, $self->start, $self->end, $self->strand
2366							);
2367	0	0				0	return wantarray ? @slices : \@slices;
2368							}
2369
2370							sub coverage {
2371	0			0		0	my $self = shift;
2372	0					0	my $bins = shift;
2373	0		0			0	$bins \|\|= $self->length;
2374	0					0	return $self->features("coverage:$bins");
2375							}
2376
2377							sub wiggle {
2378	0			0		0	my $self = shift;
2379	0					0	my $bins = shift;
2380	0		0			0	$bins \|\|= $self->length;
2381	0					0	return $self->features("wiggle:$bins");
2382							}
2383
2384							sub statistical_summary {
2385	0			0		0	my $self = shift;
2386	0					0	my $bins = shift;
2387	0		0			0	$bins \|\|= 1;
2388	0					0	return $self->features("summary:$bins");
2389							}
2390
2391							sub observations {
2392	0			0		0	my $self = shift;
2393	0					0	return $self->{'useq'}->observations(
2394							-seq_id => $self->seq_id,
2395							-start => $self->start,
2396							-end => $self->end,
2397							-strand => $self->strand,
2398							);
2399							}
2400
2401
2402							package Bio::DB::USeq::Iterator;
2403	1			1		15	use base 'Bio::DB::USeq::Feature';
	1					3
	1					1590
2404
2405							sub new {
2406	4			4		74	my $class = shift;
2407
2408							# create object
2409	4					22	my %args = @_;
2410	4	50				15	my $useq = $args{'-useq'} or
2411							die "Bio::DB::USeq::Iterator cannot be created without a -useq argument";
2412	4					19	my $iterator = $class->SUPER::new(@_);
2413	4					289	$iterator->{'useq'} = $useq;
2414
2415							# determine which members to retrieve
2416							my @slices = $useq->_translate_coordinates_to_slices(
2417							$args{-seq_id},
2418							$args{-start},
2419							$args{-end},
2420							$args{-strand},
2421	4					20	);
2422	4					18	$useq->_clear_buffer(\@slices);
2423
2424							# sort the slices as necessary
2425	4	100				20	if (scalar(@slices) > 1) {
2426	6					17	@slices = map {$_->[1]}
2427	3					11	sort {$a->[0] <=> $b->[0]}
2428	3					21	map { [$useq->slice_start($_), $_] } @slices;
	6					21
2429							}
2430
2431							# how we set up the iterator depends on the feature type requested
2432							# we need to add specific information to iterator object
2433
2434	4	100				18	if ($iterator->type =~ /region\|interval\|observation/) {
2435							# useq observation features are simple
2436	2					8	$iterator->{'wiggle'} = 0;
2437
2438							# prepare specific iterator information
2439	2					6	$iterator->{'slices'} = \@slices;
2440	2					4	$iterator->{'current_results'} = undef;
2441	2					6	$iterator->{'current_strand'} = undef;
2442
2443	2					10	return $iterator;
2444							}
2445
2446							# otherwise we work with more complex wiggle or summary features
2447							# set the type of wiggle feature: 1 = wiggle, 2 = summary
2448	2	100				7	$iterator->{'wiggle'} = $iterator->type =~ /summary/ ? 2 : 1;
2449
2450							# normally the iterator will only return one wigggle\|summary feature, but
2451							# will return two if stranded data, per behavior for Bio::Grapics...
2452	2	50	33			15	if ($iterator->strand == 0 and $useq->stranded) {
2453							# we have stranded data, so need to return two features
2454							# separate the slices into each respective strands for each feature
2455	0					0	my (@f, @r);
2456	0					0	foreach (@slices) {
2457	0	0				0	if ($useq->slice_strand($_) == 1) {
2458	0					0	push @f, $_;
2459							}
2460							else {
2461	0					0	push @r, $_;
2462							}
2463							}
2464	0					0	$iterator->{slices} = [\@f, \@r];
2465							}
2466							else {
2467							# only need to return one feature
2468	2					8	$iterator->{slices} = [ \@slices ];
2469							}
2470
2471							# check for type and bins
2472	2					5	my ($bin, $step);
2473	2	100				6	if ($iterator->type =~ /:(\d+)$/i) {
2474	1					4	$bin = $1;
2475	1					7	my $length = $iterator->length;
2476	1	50				24	$bin = $length if $bin > $length;
2477	1					4	$step = $length/$bin;
2478							}
2479	2					7	$iterator->{bin} = $bin;
2480	2					5	$iterator->{step} = $step;
2481
2482	2					10	return $iterator;
2483							}
2484
2485							sub next_seq {
2486	10			10		409	my $self = shift;
2487
2488	10	100				40	if ($self->{'wiggle'} == 1) {
		100
2489	2					8	return $self->_next_wiggle;
2490							}
2491							elsif ($self->{'wiggle'} == 2) {
2492	2					7	return $self->_next_summary;
2493							}
2494							else {
2495	6					14	return $self->_next_region;
2496							}
2497							}
2498
2499	0			0		0	sub next_feature {return shift->next_seq}
2500
2501							sub _next_region {
2502							# this will keep returning observations as SeqFeature objects until we run out
2503							# of observations in the requested interval
2504	6			6		11	my $self = shift;
2505	6					11	my $useq = $self->{'useq'};
2506
2507							# get current results
2508	6	100				16	unless ($self->{'current_results'}) {
2509	2		50			5	my $slice = shift @{ $self->{'slices'} } \|\| undef;
2510	2	50				8	return unless $slice; # no more slices, we're done
2511
2512							# collect the observations
2513	2					15	$useq->_load_slice($slice);
2514	2					19	my $result = $useq->{buffer}{$slice}->fetch($self->start - 1, $self->end);
2515
2516							# sort the observations and store the results
2517	2	50				275	my @sorted = sort {$a->[0] <=> $b->[0] or $a->[1] <=> $b->[1]} @$result;
	303					522
2518	2					13	$self->{'current_results'} = \@sorted;
2519	2					11	$self->{'current_strand'} = $useq->slice_strand($slice);
2520							}
2521
2522							# return next observation as a Feature object
2523	6					10	my $obs = shift @{ $self->{'current_results'} };
	6					10
2524	6	50				17	unless (defined $obs) {
2525							# no more observations for current slice, try to move to next slice
2526	0					0	undef $self->{'current_results'};
2527	0					0	return $self->_next_region;
2528							}
2529							return Bio::DB::USeq::Feature->new(
2530							-seq_id => $self->seq_id,
2531							-start => $obs->[0] + 1,
2532							-end => $obs->[1],
2533	6	100				24	-strand => $self->{'current_strand'},
2534							-score => $obs->[2],
2535							-source => $self->source,
2536							-type => $self->type,
2537							-name => defined $obs->[3] ? $obs->[3] :
2538							join(':', $self->seq_id, $obs->[0], $obs->[1]),
2539							)
2540							}
2541
2542							sub _next_wiggle {
2543							# this will return one wiggle Feature, two if separate strands are requested
2544
2545	2			2		4	my $self = shift;
2546	2					6	my $useq = $self->{'useq'};
2547
2548							# determine which slices to retrieve
2549	2					3	my $slices = shift @{ $self->{slices} };
	2					6
2550	2	100				8	return unless $slices;
2551
2552							# more information
2553	1					3	my $start = $self->start;
2554	1					9	my $stop = $self->end;
2555	1					9	my $step = $self->{step};
2556
2557							# check whether we are working with binned wiggle or not
2558	1					2	my @scores;
2559	1	50	33			6	if ($self->{bin} and $step > 1) {
2560							# we will be collecting the mean score value in bins
2561
2562							# collect the scores for each bin
2563	1					4	for (my $begin = $start; $begin < $stop; $begin += $step) {
2564
2565							# round off coordinates to integers
2566							# beginning point and step may not be integers
2567	1000					2189	my $s = int($begin + 0.5);
2568	1000					1642	my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2569
2570							# collect the scores from each of the requested slices
2571	1000	50				1915	if (scalar @$slices > 1) {
2572							# more than one slice, identify which subset of slices to collect from
2573							# may or may not be all of the current slices
2574	1000					1412	my @sub_slices;
2575	1000					1583	foreach my $slice (@$slices) {
2576	2000	100				4135	next if $s > $useq->slice_end($slice);
2577	1061	100				1965	next if $e < $useq->slice_start($slice);
2578	1001					2254	push @sub_slices, $slice;
2579							}
2580	1000					2192	push @scores, $useq->_mean_score($s, $e, \@sub_slices);
2581							}
2582							else {
2583	0					0	push @scores, $useq->_mean_score($s, $e, $slices);
2584							}
2585							}
2586							}
2587							else {
2588							# otherwise we collect in one step and associate scores at bp resolution
2589							# collect the scores from each of the requested slices and
2590							# assemble them into a hash of values
2591
2592							# correlate scores with position
2593	0					0	my %pos2score;
2594	0					0	foreach my $slice (@$slices) {
2595
2596							# load and unpack the data
2597	0					0	$useq->_load_slice($slice);
2598
2599							# fetch the overlapping observations
2600							# each observation is [start, stop, score]
2601	0					0	my $result = $useq->{'buffer'}{$slice}->fetch($start - 1, $stop);
2602	0					0	foreach my $r (@$result) {
2603	0					0	foreach my $p ( $r->[0] + 1 .. $r->[1] ) {
2604	0					0	$pos2score{$p} = $r->[2];
2605							}
2606							}
2607							}
2608
2609							# convert positioned scores into an array
2610	0					0	foreach (my $s = $start; $s <= $stop; $s++) {
2611	0	0				0	push @scores, exists $pos2score{$s} ? $pos2score{$s} : undef;
2612							# for Bio::Graphics it is better to store undef than 0
2613							# which can do wonky things with graphs
2614							}
2615							}
2616
2617							# generate the wiggle object
2618	1		50			8	my $strand = $useq->slice_strand( $slices->[0] ) \|\| 0;
2619	1	50				11	return Bio::DB::USeq::Wiggle->new(
		50
2620							-seq_id => $self->seq_id,
2621							-start => $start,
2622							-end => $stop,
2623							-strand => $strand,
2624							-type => $self->type,
2625							-source => $self->source,
2626							-attributes => { 'coverage' => [ \@scores ] },
2627							-name => $strand == 1 ? 'Forward' :
2628							$strand == -1 ? 'Reverse' : q(),
2629							-useq => $useq,
2630							);
2631							}
2632
2633							sub _next_summary {
2634							# this will return one summary Feature, two if separate strands are requested
2635	2			2		4	my $self = shift;
2636
2637							# determine which slices to retrieve
2638	2					3	my $slices = shift @{ $self->{slices} };
	2					6
2639	2	100				7	return unless $slices;
2640
2641							# all of the real statistical work is done elsewhere
2642							# just return the summary object
2643	1		50			6	my $strand = $self->{useq}->slice_strand( $slices->[0] ) \|\| 0;
2644							return Bio::DB::USeq::Summary->new(
2645							-seq_id => $self->seq_id,
2646							-start => $self->start,
2647							-end => $self->end,
2648							-strand => $strand,
2649							-type => $self->type,
2650							-source => $self->source,
2651							-name => $strand == 1 ? 'Forward' :
2652							$strand == -1 ? 'Reverse' : q(),
2653							-useq => $self->{'useq'},
2654							-slices => $slices,
2655							-bin => $self->{bin},
2656							-step => $self->{step},
2657	1	50				5	);
		50
2658							}
2659
2660
2661
2662							package Bio::DB::USeq::Wiggle;
2663	1			1		12	use base 'Bio::DB::USeq::Feature';
	1					3
	1					643
2664							# Wiggle scores are stored in the coverage attribute for backwards
2665							# compatibility with Bio::Graphics.
2666
2667							sub new {
2668	1			1		32	my $class = shift;
2669	1					15	my $wig = $class->SUPER::new(@_);
2670	1					130	my %args = @_;
2671	1	50				5	my $useq = $args{'-useq'} or
2672							die "Bio::DB::USeq::Wiggle cannot be created without a -useq argument";
2673	1					4	$wig->{useq} = $useq;
2674	1					9	return $wig;
2675							}
2676
2677							sub coverage {
2678	1			1		2	my $self = shift;
2679	1					10	my ($coverage) = $self->get_tag_values('coverage');
2680	1	50				120	return wantarray ? @$coverage : $coverage;
2681							}
2682
2683							sub wiggle {
2684	1			1		131	return shift->coverage;
2685							}
2686
2687							# Borrowed from Bio::SeqFeature::Coverage from Bio::DB::Sam
2688							sub gff3_string {
2689	0			0		0	my $self = shift;
2690	0					0	my $gff3 = $self->SUPER::gff3_string(@_);
2691	0					0	my $coverage = $self->escape(join(',',$self->coverage));
2692	0					0	$gff3 =~ s/coverage=[^;]+/coverage=$coverage/g;
2693	0					0	return $gff3;
2694							}
2695
2696							sub statistical_summary {
2697							# this is just for the wiggle scores, not original data
2698							# This is a fake statistical_summary to fool Bio::Graphics into
2699							# calculating chromosome or global statistics like a BigWig adaptor.
2700							# A real statistical_summary call would provide the number of bins,
2701							# so return null if bins is present.
2702							# We could calculate a real statistical summary, but that would be an
2703							# expensive circuitous route after we just collected wiggle scores.
2704							# Better to request the correct feature type in the first place.
2705	0			0		0	my $self = shift;
2706	0					0	my $bins = shift;
2707	0	0	0			0	return if $bins && $bins > 1;
2708
2709							# initialize the statistical scores
2710	0					0	my $count = 0;
2711	0					0	my $sum;
2712							my $sum_squares;
2713	0					0	my $min;
2714	0					0	my $max;
2715
2716							# generate a statistical summary of just the wiggle scores
2717	0					0	foreach my $s ($self->coverage) {
2718	0					0	$count++;
2719	0	0				0	next unless defined $s;
2720	0					0	$sum += $s;
2721	0					0	$sum_squares += $s * $s;
2722	0	0	0			0	$min = $s if (!defined $min or $s < $min);
2723	0	0	0			0	$max = $s if (!defined $max or $s > $max);
2724							}
2725
2726							# return the statistical hash
2727	0		0			0	my %stat = (
			0
			0
			0
2728							'validCount' => $count,
2729							'sumData' => $sum \|\| 0,
2730							'sumSquares' => $sum_squares \|\| 0,
2731							'minVal' => $min \|\| 0,
2732							'maxVal' => $max \|\| 0,
2733							);
2734	0					0	return \%stat;
2735							}
2736
2737							sub get_seq_stream {
2738	0			0		0	my $self = shift;
2739							return Bio::DB::USeq::Iterator->new(
2740							-seq_id => $self->seq_id,
2741							-start => $self->start,
2742							-end => $self->end,
2743							-strand => $self->strand,
2744							-type => 'region',
2745							-source => $self->source,
2746							-useq => $self->{useq},
2747	0					0	);
2748							}
2749
2750
2751
2752
2753							package Bio::DB::USeq::Summary;
2754	1			1		8	use base 'Bio::DB::USeq::Feature';
	1					2
	1					786
2755
2756							sub new {
2757	1			1		27	my $class = shift;
2758	1					11	my $summary = $class->SUPER::new(@_);
2759	1					87	my %args = @_;
2760	1	50				5	my $useq = $args{'-useq'} or
2761							die "Bio::DB::USeq::Summary cannot be created without a -useq argument";
2762	1					3	$summary->{useq} = $useq;
2763	1	50				4	$summary->{slices} = $args{-slices} if exists $args{-slices};
2764	1	50				5	$summary->{bin} = $args{-bin} if exists $args{-bin};
2765	1	50				4	$summary->{step} = $args{-step} if exists $args{-step};
2766	1					7	return $summary;
2767							}
2768
2769							sub statistical_summary {
2770	1			1		171	my $self = shift;
2771	1					3	my $useq = $self->{useq};
2772
2773							# get the number of bins to calculate the statistical summaries
2774	1					3	my $bin = shift;
2775	1	50	33			5	$bin \|\|= $self->{bin} if exists $self->{bin};
2776	1		50			4	$bin \|\|= 1;
2777	1	50				4	my $step = $self->{step} if exists $self->{step};
2778	1		33			13	$step \|\|= $self->length / $bin;
2779
2780							# get the slices
2781	1	50				30	my $slices = $self->{slices} if exists $self->{slices};
2782	1	50				4	unless ($slices) {
2783							# this should already be established, but just in case
2784	0					0	my @a = $useq->_translate_coordinates_to_slices($self->seq_id,
2785							$self->start, $self->end, $self->strand);
2786	0					0	$useq->_clear_buffer(\@a);
2787	0					0	$slices = \@a;
2788							}
2789
2790							# collect the statistical summaries for each bin
2791	1					378	my @summaries;
2792	1					6	for (my $begin = $self->start; $begin < $self->end; $begin += $step) {
2793
2794							# round off coordinates to integers
2795							# beginning point and step may not be integers
2796	10					131	my $s = int($begin + 0.5);
2797	10					25	my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2798
2799							# collect the scores from each of the requested slices
2800	10	50				31	if (scalar @$slices > 1) {
2801							# more than one slice, identify which subset of slices to collect from
2802							# may or may not be all of the current slices
2803	10					17	my @sub_slices;
2804	10					22	foreach my $slice (@$slices) {
2805	20	50				47	next if $s > $useq->slice_end($slice);
2806	20	100				42	next if $e < $useq->slice_start($slice);
2807	11					27	push @sub_slices, $slice;
2808							}
2809	10					50	push @summaries, $useq->_stat_summary($s, $e, \@sub_slices);
2810							}
2811							else {
2812	0					0	push @summaries, $useq->_stat_summary($s, $e, $slices);
2813							}
2814							}
2815
2816							# return the reference to the statistical summaries
2817	1					19	return \@summaries;
2818							}
2819
2820							sub score {
2821	0			0			my $self = shift;
2822	0						my $a = $self->statistical_summary(1);
2823	0						return $a->[0];
2824							}
2825
2826							sub gff3_string {
2827							# this is going to be a little convoluted, since what we
2828							# really want here is coverage, which is easier to calculate with means,
2829							# rather than doing statistical summaries and calculate means from those
2830	0			0			my $self = shift;
2831							my ($wig) = $self->{useq}->features(
2832							# this should only return one feature because we have specific strand
2833	0						-seq_id => $self->seq_id,
2834							-start => $self->start,
2835							-end => $self->end,
2836							-strand => $self->strand,
2837							-type => 'coverage:1000',
2838							);
2839	0						return $wig->gff3_string;
2840							}
2841
2842							sub get_seq_stream {
2843	0			0			my $self = shift;
2844							return Bio::DB::USeq::Iterator->new(
2845							-seq_id => $self->seq_id,
2846							-start => $self->start,
2847							-end => $self->end,
2848							-strand => $self->strand,
2849							-type => 'region',
2850							-source => $self->source,
2851							-useq => $self->{useq},
2852	0						);
2853							}
2854
2855							=head1 AUTHOR
2856
2857							Timothy J. Parnell, PhD
2858							Dept of Oncological Sciences
2859							Huntsman Cancer Institute
2860							University of Utah
2861							Salt Lake City, UT, 84112
2862
2863							This package is free software; you can redistribute it and/or modify
2864							it under the terms of the Artistic License 2.0.