File Coverage

lib/Bio/SAGE/DataProcessing.pm
Criterion Covered Total %
statement 254 340 74.7
branch 62 106 58.4
condition 15 52 28.8
subroutine 18 23 78.2
pod 14 16 87.5
total 363 537 67.6


line stmt bran cond sub pod time code
1             # *%) $Id: DataProcessing.pm,v 1.26 2004/10/15 22:30:46 scottz Exp $
2             #
3             # Copyright (c) 2004 Scott Zuyderduyn .
4             # All rights reserved. This program is free software; you
5             # can redistribute it and/or modify it under the same
6             # terms as Perl itself.
7              
8             package Bio::SAGE::DataProcessing;
9              
10             =pod
11              
12             =head1 NAME
13              
14             Bio::SAGE::DataProcessing - Processes raw serial analysis of gene expression (SAGE) data.
15              
16             =head1 SYNOPSIS
17              
18             use Bio::SAGE::DataProcessing;
19             $sage = Bio::SAGE::DataProcessing->new();
20              
21             # open sequence and quality files
22             open( READS, "library.fasta" );
23             open( QUAL, "library.qual.fasta" );
24              
25             # collect ditags and statistics from reads
26             $sage->process_library( *READS, *QUAL );
27              
28             # close files
29             close( READS );
30             close( QUAL );
31              
32             # output tags in descending order of expression
33             my %tags = %{$sage->get_tagcounts()};
34             open( TAGS, ">library.tags" );
35             map { print TAGS join( "\t", $_, $tags{$_} ) . "\n" } sort { $tags{$b} <=> $tags{$a} } keys %tags;
36             close( TAGS );
37              
38             # tag AAACCGGGTT matches two different genes
39             # so 15th base pair may help resolve this
40             $sage->print_extra_base_calculation( $sage->get_extract_base_calculation( "AAACCGGGTT" ) );
41              
42             =head1 DESCRIPTION
43              
44             This module provides several tools for processing and
45             analyzing serial analysis of gene expression (SAGE)
46             libraries.
47              
48             =head1 README
49              
50             B
51              
52             Serial analysis of gene expression (SAGE) is a molecular
53             technique for generating a near-global snapshot of a
54             cell population’s transcriptome. Briefly, the technique
55             extracts short sequences at defined positions of
56             transcribed mRNA. These short sequences are then paired
57             to form ditags. The ditags are concatamerized to form
58             long sequences that are then cloned. The cloned DNA is
59             then sequenced. Bioinformatic techniques are then
60             employed to determine the original short tag sequences,
61             and to derive their progenitor mRNA. The number of times
62             a particular tag is observed can be used to quantitate
63             the amount of a particular transcript. The original
64             technique was described by Velculescu et al. (1995) and
65             utilized an ~14bp sequence tag. A modified protocol
66             was introduced by Saha et al. (2002) that produced ~21bp
67             tags.
68              
69             B
70              
71             This module facilitates the processing of SAGE data.
72             Specifically:
73              
74             1. extracting ditags from raw sequence reads.
75             2. extracting tags from ditags, with the option to
76             exclude tags if the Phred scores (described by
77             Ewing and Green, 1998a and Ewing et al., 1998b)
78             do not meet a minimum cutoff value.
79             3. calculating descriptive values
80             4. statistical analysis to determine, where possible,
81             additional nucleotides to extend the length of the
82             SAGE tag (thus facilitating more accurate tag to
83             gene mapping).
84              
85             Both regular SAGE (14mer tag) and LongSAGE (21mer tag)
86             are supported by this module. Future protocols should
87             be configurable with this module.
88              
89             B
90              
91             Velculescu V, Zhang L, Vogelstein B, Kinzler KW. (1995)
92             Serial analysis of gene expression. Science. 270:484-487.
93              
94             Saha S, Sparks AB, Rago C, Akmaev V, Wang CJ, Vogelstein B,
95             Kinzler KW, Velculescu V. (2002) Using the transcriptome
96             to annotate the genome. Nat. Biotechnol. 20:508-512.
97              
98             Ewing B, Hillier L, Wendl MC, Green P. (1998a) Base-calling
99             of automated sequencer traces using phred. I. Accuracy
100             assessment. Genome Res. 8:175-185.
101              
102             Ewing B, Green P. (1998b) Base-calling of automated
103             sequencer traces using phred. II. Error probabilities.
104             Genome Res. 8:186-194.
105              
106             =head1 INSTALLATION
107              
108             Follow the usual steps for installing any Perl module:
109              
110             perl Makefile.PL
111             make test
112             make install
113              
114             =head1 PREREQUISITES
115              
116             None (this module used to require the C package).
117              
118             =head1 CHANGES
119              
120             1.20 2004.10.15 - Minor spelling errors and misuse of terminology fixed in docs.
121             - Module now allows FASTA files with a blank line folling the
122             header (denoting an attempted read with no sequence), but prints
123             a warning to STDERR that this has happened. Module died previously.
124             1.11 2004.06.20 - Added flag in constructor to keep duplicate ditags.
125             1.10 2004.06.02 - Wrote new documentation and modified several methods to use the read-by-read
126             processing approach (see line below).
127             - Revamped the module to conserve memory. Reads are now processed one at a time
128             and then discarded. The memory requirements in the previous versions were
129             prohibitive to those with regular desktop machines.
130             - The Bio::SAGE::DataProcessing::Filter package can be subclassed to create
131             custom filters at the ditag and tag processing steps (previous versions only
132             allowed one approach to ditag/tag filtering).
133             1.01 2004.05.27 - Fixed bug where extract_tag_counts didn't work with quality cutoff defined.
134             - extract_tags was not applying the get_quality_cutoff value (was returning all data)
135             - Duplicate ditags are now removed by default.
136             1.00 2004.05.23 - Initial release.
137              
138             =cut
139              
140 1     1   983 use strict;
  1         2  
  1         43  
141 1     1   1004 use diagnostics;
  1         242296  
  1         16  
142 1     1   676 use vars qw( $VERSION @ISA @EXPORT @EXPORT_OK $PROTOCOL_SAGE $PROTOCOL_LONGSAGE $DEBUG $ENZYME_NLAIII $ENZYME_SAU3A $DEFAULT_DITAG_FILTER $DEFAULT_TAG_FILTER );
  1         2  
  1         232  
143              
144             require Exporter;
145             require AutoLoader;
146              
147             @ISA = qw( Exporter AutoLoader );
148             @EXPORT = qw();
149             $VERSION = "1.11";
150              
151             #use Statistics::Distributions;
152 1     1   777 use Bio::SAGE::DataProcessing::Filter;
  1         3  
  1         58  
153 1     1   426 use Bio::SAGE::DataProcessing::AveragePhredFilter;
  1         2  
  1         35  
154 1     1   492 use Bio::SAGE::DataProcessing::MinimumPhredFilter;
  1         2  
  1         3638  
155              
156             my $PACKAGE = "Bio::SAGE::DataProcessing";
157              
158             =pod
159              
160             =head1 VARIABLES
161              
162             B
163              
164             =over 2
165              
166             I<$PROTOCOL_SAGE>
167              
168             Hashref containing default protocol parameters for the
169             regular/original SAGE protocol (see set_protocol
170             documentation for more information).
171              
172             I<$PROTOCOL_LONGSAGE>
173              
174             Hashref containing default protocol parameters for the
175             LongSAGE protocol (see set_protocol documentation
176             for more information).
177              
178             I<$ENZYME_NLAIII> = 'CATG'
179              
180             Constant denoting the recognition sequence for NlaIII.
181              
182             I<$ENZYME_SAU3A> = 'GATC'
183              
184             Constant denoting the recognition sequence for Sau3a.
185              
186             I<$DEFAULT_TAG_FILTER>
187              
188             A default tag filter used when none is specified.
189             This filter rejects tags that contain any base pair
190             with a Phred quality score < 15 or an average
191             Phred quality score over all bases < 30.
192              
193             I<$DEFAULT_DITAG_FILTER>
194              
195             A default ditag filter used when none is specified.
196             This filter rejects ditags that contain any base pair
197             with a Phred quality score < 20.
198              
199             =back
200              
201             B
202              
203             =over 2
204              
205             I<$DEBUG = 0>
206              
207             Prints debugging output if value if >= 1.
208              
209             =back
210              
211             =cut
212              
213             my @ignoreTags = ( 'TCCCTATTAA', 'TCCCCGTACA' ); # linker derived sequences
214             my @ignoreLongTags = ( 'TCGGACGTACATCGTTA', 'TCGGATATTAAGCCTAG' ); # linker derived sequences
215              
216             my %params_sage = ( 'MINIMUM_DITAG_LENGTH' => 29,
217             'MAXIMUM_DITAG_LENGTH' => 32,
218             'TAG_LENGTH' => 10,
219             'IGNORE_TAGS' => \@ignoreTags );
220             my %params_longsage = ( 'MINIMUM_DITAG_LENGTH' => 40,
221             'MAXIMUM_DITAG_LENGTH' => 46,
222             'TAG_LENGTH' => 17,
223             'IGNORE_TAGS' => \@ignoreLongTags );
224              
225             $PROTOCOL_SAGE = \%params_sage; # regular SAGE (14-mer tags)
226             $PROTOCOL_LONGSAGE = \%params_longsage; # LongSAGE (21-mer tags)
227              
228             $ENZYME_NLAIII = "CATG";
229             $ENZYME_SAU3A = "GATC";
230              
231             $DEFAULT_TAG_FILTER = Bio::SAGE::DataProcessing::AveragePhredFilter->new( 30, 15 );
232             $DEFAULT_DITAG_FILTER = Bio::SAGE::DataProcessing::MinimumPhredFilter->new( 20 );
233              
234             $DEBUG = 0; # set this flag to non-zero to enable debugging messages
235              
236             =pod
237              
238             =head1 CLASS METHODS
239              
240             =cut
241              
242             #######################################################
243             sub new {
244             #######################################################
245             =pod
246              
247             =head2 new <$enzyme>, <\%protocol>, <$bKeepDuplicates>
248              
249             Constructor for a new Bio::SAGE::DataProcessing object.
250              
251             B
252              
253             I<$enzyme> (optional)
254              
255             The anchoring enzyme recognition site. This is
256             typically NlaIII (CATG) or Sau3a (GATC). The
257             default is the recognition sequence "CATG" (NlaIII).
258              
259             I<\%protocol> (optional)
260              
261             A hashref containing specifics of the protocol. Two
262             pre-made parameter sets are available: $PROTOCOL_SAGE
263             (regular SAGE) and $PROTOCOL_LONGSAGE (LongSAGE).
264              
265             The required hash keys:
266              
267             MINIMUM_DITAG_LENGTH | The shortest length a valid
268             ditag can be (the length
269             should include the anchoring
270             enzyme site sequences).
271             MAXIMUM_DITAG_LENGTH | The longest length a valid
272             ditag can be (the length
273             should include the anchoring
274             enzyme site sequences).
275             TAG_LENGTH | The expected tag length (the
276             length should NOT include the
277             anchoring enzyme site sequence).
278             IGNORE_TAGS | An arrayref listing tag
279             sequences that should be
280             ignored during tag extraction
281             (i.e. linker-derived sequences).
282              
283             The parameters for the default configurations are:
284              
285             +----------------+--------------------+
286             | $PROTOCOL_SAGE | $PROTOCOL_LONGSAGE |
287             +----------------------+----------------+--------------------+
288             | MINIMUM_DITAG_LENGTH | 29 | 40 |
289             +----------------------+----------------+--------------------+
290             | MAXIMUM_DITAG_LENGTH | 32 | 46 |
291             +----------------------+----------------+--------------------+
292             | TAG_LENGTH | 10 | 17 |
293             +----------------------+----------------+--------------------+
294             | IGNORE_TAGS | TCCCTATTAA | TCGGACGTACATCGTTA |
295             | | TCCCCGTACA | TCGGATATTAAGCCTAG |
296             +----------------------+----------------+--------------------+
297              
298             Once the Bio::SAGE::DataProcessing object has been instantiated,
299             the enzyme and protocol CANNOT be altered.
300              
301             I<$bKeepDuplicates> (optional)
302              
303             If this argument is TRUE (non-zero) then ditags with
304             identical sequence will be kept. The default behavior is
305             to discard these ditags as they likely represent
306             preferential PCR amplification.
307              
308             B
309              
310             my $sage = Bio::SAGE::DataProcessing->new( $Bio::SAGE::DataProcessing::ENZYME_NLAIII,
311             $Bio::SAGE::DataProcessing::PROTOCOL_LONGSAGE );
312              
313             # alternatively:
314             my $sage = Bio::SAGE::DataProcessing->new( "CATG", { 'MINIMUM_DITAG_LENGTH' => 31,
315             'MAXIMUM_DITAG_LENGTH' => 34,
316             'TAG_LENGTH' => 11,
317             'IGNORE_TAGS' => \( 'TCCCTATTAA', 'TCCCCGTACA' ) } );
318              
319             =cut
320              
321 1     1 1 427 my $this = shift;
322 1   50     8 my $enzyme = shift || "CATG";
323 1   33     4 my $protocol = shift || $PROTOCOL_SAGE;
324 1   50     6 my $bKeepDupes = shift || 0;
325 1   33     5 my $class = ref( $this ) || $this;
326 1         2 my $self = {};
327 1         3 bless( $self, $class );
328              
329             # set instance variables
330 1         6 $self->{'enzyme'} = $enzyme;
331 1         2 $self->{'protocol'} = $protocol;
332 1         2 $self->{'keep_duplicates'} = $bKeepDupes;
333             # $self->set_protocol( $protocol );
334              
335 1         2 $self->{'ditag_filter'} = $DEFAULT_DITAG_FILTER;
336 1         2 $self->{'tag_filter'} = $DEFAULT_TAG_FILTER;
337              
338 1         4 return $self;
339              
340             }
341              
342             =pod
343              
344             =head1 INSTANCE METHODS
345              
346             =cut
347              
348             #######################################################
349             sub set_ditag_filter {
350             #######################################################
351             =pod
352              
353             =head2 set_ditag_filter $filterObject
354              
355             Sets a new filter object (a concrete subclass of
356             Bio::SAGE::DataProcessing::Filter) that is applied
357             to ditags as they're extracted from sequence reads.
358              
359             The default filter uses a Bio::SAGE::DataProcessing::MinimumPhredFilter
360             instance that is set to reject any ditags that do not
361             have at least Phred 20 (p<=0.01) for all nucleotides.
362              
363             B
364              
365             I<$filterObject>
366              
367             An object ref to a concrete subclass of
368             Bio::SAGE::DataProcessing::Filter.
369              
370             =cut
371              
372 0     0 1 0 my $self = shift;
373 0   0     0 $self->{'ditag_filter'} = shift || die( $PACKAGE . "::set_ditag_filter no filter specified." );
374              
375             }
376              
377             #######################################################
378             sub set_tag_filter {
379             #######################################################
380             =pod
381              
382             =head2 set_tag_filter $filterObject
383              
384             Sets a new filter object (a concrete subclass of
385             Bio::SAGE::DataProcessing::Filter) that is applied
386             to tags as they're extracted from ditags.
387              
388             The default filter uses a Bio::SAGE::DataProcessing::AveragePhredFilter
389             instance that is set to reject any tags that do not
390             have an average Phred 30 score (p<=0.001) over all
391             nucleotides and at least Phred 15 (p<=0.0316) at
392             each nucleotide.
393              
394             B
395              
396             I<$filterObject>
397              
398             An object ref to a concrete subclass of
399             Bio::SAGE::DataProcessing::Filter.
400              
401             =cut
402              
403 0     0 1 0 my $self = shift;
404 0   0     0 $self->{'tag_filter'} = shift || die( $PACKAGE . "::set_tag_filter no filter specified." );
405              
406             }
407              
408             #######################################################
409             sub get_enzyme {
410             #######################################################
411             =pod
412              
413             =head2 get_enzyme
414              
415             Gets the current anchoring enzyme recognition site.
416              
417             B
418              
419             None.
420              
421             B
422              
423             The current anchoring enzyme recognition site. By
424             default, this will be 'CATG', the NlaIII recognition
425             site.
426              
427             B
428              
429             my $sage = Bio::SAGE::DataProcessing->new();
430             print "ENZYME_SITE: " . $sage->get_enzyme() . "\n";
431              
432             =cut
433              
434 1     1 1 1823 my $this = shift;
435              
436 1         7 return $this->{'enzyme'};
437              
438             }
439              
440             #######################################################
441             sub get_protocol {
442             #######################################################
443             =pod
444              
445             =head2 get_protocol
446              
447             Gets a copy (in other words, modifying the
448             returned hash does not affect the object's settings)
449             of the hash describing the protocol for this
450             instance.
451              
452             B
453              
454             None.
455              
456             B
457              
458             A hashref of the current protocol settings. See the
459             documentation for new (constructor) for more details
460             on the contents of this hash.
461              
462             B
463              
464             my $sage = Bio::SAGE::DataProcessing->new();
465             print "Default protocol:\n";
466             my %protocol = %{$sage->get_protocol()};
467             map { print $_ . "=" . $protocol{$_} . "\n" } keys %protocol;
468              
469             =cut
470              
471 1     1 1 4 my $this = shift;
472 1         3 my %protocol = %{$this->{'protocol'}};
  1         10  
473              
474 1         4 my %copy;
475 1         4 foreach my $key ( keys %protocol ) {
476 4         7 my $val = $protocol{$key};
477 4 100       11 if( $key eq 'IGNORE_TAGS' ) {
478 1         4 my @arr = @$val;
479 1         3 $val = \@arr;
480             }
481 4         11 $copy{$key} = $val;
482             }
483              
484 1         8 return \%copy;
485              
486             }
487              
488             #######################################################
489             sub process_library {
490             #######################################################
491             =pod
492              
493             =head2 process_library $sequence_handle, <$scores_handle>
494              
495             Processes reads from a Perl handle to FASTA format
496             sequence data. Phred scores in matching FASTA format
497             can be read concurrently.
498              
499             An example of FASTA format sequence:
500              
501             >SEQUENCE0001
502             ACAGATAGACAGAGATATAGAGACATATTTAGAGACAAATCGCGCAGGCGCGCGACATA
503             TGACTAGTTTATATCATCAGTATTAGCGATTATGACTATTATATATTACTGATTGATCT
504             ATAGCGCGATTATATCTATCTATGCATTCGATCATGCTATTATCGTATACTACTGCTAG
505             AGAGGACGACGCAGGCAGCGATTATATCTATTTATA
506             >SEQUENCE0002
507             CGCGACGCATGTCAGTAGCTAGCTGCGCCCGAATATATATCGTCATACGGATTCGTAGC
508             CCCCCGCGGAGTCTGATTATATCTGATT
509              
510             An example of FASTA format quality data:
511              
512             >SEQUENCE0001
513             11 17 18 16 19 17 19 19 16 19 19 16 11 10 9 15 10 12 24 24 35 29 29 39 40 40 40 40 37 37 46 46 40 40 40 40 56 56 56 56 35 35 35 35 35 35 56 40 40 46 40 40 39 39 35 39 56 56 51 51
514             51 51 51 56 35 35 35 35 35 35 40 40 51 45 51 51 39 39 39 39 39 39 40 40 40 40 40 40 56 56 56 56 56 46 46 40 39 39 39 45 45 45 56 56 56 56 56 56 56 56 40 39 39 39 39 35 39 39 39 39
515             45 56 45 45 45 45 51 35 39 39 39 39 39 40 40 40 40 40 51 56 56 40 40 40 40 40 43 56 56 56 43 35 35 35 35 35 43 45 45 45 45 45 45 51 51 51 51 51 51 56 56 56 56 56 56 51 51 51 56 56
516             7 7 9 9 11 10 13 11 10 8 10 10 8 8 8 10 10
517             >SEQUENCE0002
518             12 15 17 17 19 15 15 15 13 19 17 17 12 16 11 19 13 24 24 35 35 35 37 37 39 56 56 56 56 56 51 39 32 29 29 29 29 32 56 56 56 35 35 35 35 35 35 56 56 56 56 56 56 56 56 56 56 56 56 40
519             40 40 46 46 40 51 40 40 40 40 40 40 51 39 39 35 35 35 35 40 40 51 45 45 45 45 51 51 56 56 56 56 56 45 45 45 45 51 51 45 45 45 40 40 40 40 40 40 40 40 40 40 56 56 56 56 56 56 51 51
520             15 13 19 17 17 12 16 11 19 13
521              
522             B
523              
524             I<$sequence_handle>
525              
526             A Perl handle to sequence data in FASTA format.
527              
528             I<$scores_handle> (optional)
529              
530             A Perl handle to Phred scores in FASTA format. The order
531             of entries must match the $sequence_handle data. NOTE:
532             many implementations of Bio::SAGE::DataProcessing::Filter
533             require the scores to carry out their function. In the
534             default implementations, no filtering is done when scores
535             are not defined.
536              
537             B
538              
539             The number of sequences read.
540              
541             B
542              
543             my $sage = Bio::SAGE::DataProcessing->new();
544             open( SEQDATA, "sequences.fasta" );
545             my $nReads = $sage->process_library( *SEQDATA );
546             print "NUMBER_READS: $nReads\n";
547              
548             =cut
549              
550 1   50 1 1 876 my $this = shift || die( $PACKAGE . "::process_library can't be called statically." );
551 1   50     5 my $fh1 = shift || die( $PACKAGE . "::process_library no sequence data handle provided." );
552 1   50     9 my $fh2 = shift || undef;
553              
554 1         2 my $nRead = 0;
555              
556 1         2 my $currid = '';
557 1         1 my $currseq = '';
558              
559 1         2 my $currscoreid = '';
560              
561 1         23 while( my $line = <$fh1> ) {
562 75         135 chomp( $line ); $line =~ s/\r//g;
  75         127  
563 75 100       176 if( $line =~ /^>(.*?)$/ ) {
564 5         21 my $thisid = $1;
565 5 100       20 if( $currid ne '' ) {
566              
567             # do we have scores too?
568              
569 4         11 my $scores = '';
570 4 50       15 if( defined( $fh2 ) ) {
571 4         448 while( my $line2 = <$fh2> ) {
572 65         91 chomp( $line2 ); $line2 =~ s/\r//g;
  65         111  
573 65 100       151 if( $line2 =~ /^>(.*?)$/ ) {
574 5         14 my $thisscoreid = $1;
575 5 100       12 if( $currscoreid eq '' ) { $currscoreid = $thisscoreid; next; }
  1         2  
  1         7  
576 4 50       15 if( $currid ne $currscoreid ) { die( $PACKAGE . "::process_library sequence and score data don't match." ); }
  0         0  
577 4         6 $currscoreid = $thisscoreid;
578 4         11 last;
579             }
580 60 100       139 if( $scores ne '' ) { $scores .= ' '; }
  56         88  
581 60         294 $scores .= $line2;
582             }
583             }
584              
585 4 50       19 if( $scores eq '' ) { $scores = undef; }
  0         0  
586 4 50       21 if( $this->process_read( $currseq, $scores ) == 0 ) {
587 0         0 print STDERR "Non-fatal error on read >".$currid." ...\n";
588 0         0 print STDERR " Sequence: $currseq\n";
589 0 0       0 print STDERR " Scores : " . ( defined($scores) ? $scores : '' ) . "\n";
590 0         0 print STDERR "...continuing.\n";
591             }
592 4         10 $currseq = '';
593 4         10 $nRead++;
594              
595             }
596 5         13 $currid = $thisid;
597 5         52 next;
598             }
599 70         778 $currseq .= $line;
600              
601             }
602              
603 1 50       9 if( $currid ne '' ) {
604              
605             # do we have scores too?
606              
607 1         4 my $scores = '';
608 1 50       4 if( defined( $fh2 ) ) {
609 1         275 while( my $line2 = <$fh2> ) {
610 10         20 chomp( $line2 ); $line2 =~ s/\r//g;
  10         21  
611 10 50       27 if( $line2 =~ /^>(.*?)$/ ) {
612 0         0 my $thisscoreid = $1;
613 0 0       0 if( $currscoreid eq '' ) { $currscoreid = $thisscoreid; next; }
  0         0  
  0         0  
614 0 0       0 if( $currid ne $currscoreid ) { die( $PACKAGE . "::process_library sequence and score data don't match." ); }
  0         0  
615 0         0 $currscoreid = $thisscoreid;
616 0         0 last;
617             }
618 10 100       32 if( $scores ne '' ) { $scores .= ' '; }
  9         15  
619 10         71 $scores .= $line2;
620             }
621             }
622              
623 1 50       8 if( $scores eq '' ) { $scores = undef; }
  0         0  
624 1 50       8 if( $this->process_read( $currseq, $scores ) == 0 ) {
625 0         0 print STDERR "Error: " . $currid . "\n";
626 0         0 print STDERR $currseq . "\n";
627 0         0 print STDERR $scores . "\n";
628             }
629 1         4 $currseq = '';
630 1         4 $currid = '';
631 1         5 $nRead++;
632              
633             }
634              
635 1         10 return $nRead;
636              
637             }
638              
639             #######################################################
640             sub process_read {
641             #######################################################
642             =pod
643              
644             =head2 process_read $sequence, <$scores>
645              
646             Extracts and filters ditags from the given sequence read
647             (and optionally supplied Phred scores). The default ditag
648             filter (or a filter supplied to set_ditag_filter) is
649             applied during this procedure. The resulting ditags are
650             added to the list of currently processed ditags collected
651             from previous calls to this method.
652              
653             Ditags with identical sequence (duplicate ditags) are
654             considered the result of PCR artifacts and only the
655             ditag with the highest "score" (as defined by the current
656             ditag Bio::SAGE::DataProcessing::Filter) is retained.
657              
658             B
659              
660             I<$sequence>
661              
662             The nucleotide sequence of the sequence read to process.
663              
664             I<$scores> (optional)
665              
666             The Phred scores corresponding to the sequence read
667             supplied to the method. The method expects space-separated
668             Phred scores (ie. "20 24 54 32" etc.).
669              
670             B
671              
672             TRUE (1) if the method was successful, FALSE (0) otherwise.
673              
674             B
675              
676             my $sage = Bio::SAGE::DataProcessing->new();
677             my $sequence = "ACGTACGT";
678             my $scores = "20 25 34 12 23 45 51 23";
679             if( $sage->process_read( $sequence, $scores ) ) {
680             print "Successful!\n";
681             }
682              
683             =cut
684              
685 5     5 1 10 my $this = shift;
686 5         16 my $read_sequence = shift; # || die( $PACKAGE . "::process_read no sequence specified." );
687 5         27 my $read_scores = shift;
688              
689 5 50       21 if( !defined( $read_sequence ) ) { return 0; }
  0         0  
690 5 50       22 if( $read_sequence eq '' ) { return 0; }
  0         0  
691              
692 5 50       19 if( $DEBUG >= 1 ) {
693 0         0 print STDERR $PACKAGE . "::process_read\n";
694 0         0 print STDERR " \$read_sequence = $read_sequence\n";
695 0         0 print STDERR " \$read_scores = $read_scores\n";
696             }
697              
698 5         17 $this->{'stats'}->{'total_reads'}++;
699 5         23 $this->{'stats'}->{'total_bps'} += length( $read_sequence );
700              
701 5         18 $this->__extract_ditags( $read_sequence, $read_scores );
702              
703 5         33 return 1;
704              
705             }
706              
707             #######################################################
708             sub get_ditags {
709             #######################################################
710             =pod
711              
712             =head2 get_ditags
713              
714             Gets the list of currently extracted and valid ditags
715             stored in this object through calls to process_read.
716              
717             B
718              
719             None.
720              
721             B
722              
723             An array of ditag sequences.
724              
725             B
726              
727             my $sage = Bio::SAGE::DataProcessing->new();
728             my $sequence = "ACGTACGT";
729             my $scores = "20 25 34 12 23 45 51 23";
730             if( $sage->process_read( $sequence, $scores ) ) {
731             my @ditags = $sage->get_ditags();
732             print "Current ditags:\n";
733             map { print " ".$_."\n" } @ditags;
734             }
735              
736             =cut
737              
738 2     2 1 5 my $this = shift;
739 2 50       12 if( $this->{'keep_duplicates'} == 1 ) {
740 0         0 my @arr;
741 0         0 foreach my $ditag ( keys %{$this->{'ditags'}} ) {
  0         0  
742 0         0 for( my $i = 0; $i < scalar( @{$this->{'ditags'}{$ditag}} ); $i++ ) {
  0         0  
743 0         0 push( @arr, $ditag );
744             }
745             }
746 0         0 return @arr;
747             }
748 2         5 return keys %{$this->{'ditags'}};
  2         29  
749              
750             }
751              
752             #######################################################
753             sub get_tags {
754             #######################################################
755             =pod
756              
757             =head2 get_tags
758              
759             Uses the list of currently extracted and valid ditags
760             to generate a list of valid SAGE tags. The default
761             tag filter (or a filter supplied to set_tag_filter) is
762             applied during this procedure.
763              
764             B
765              
766             None.
767              
768             B
769              
770             An array of tag sequences.
771              
772             B
773              
774             my $sage = Bio::SAGE::DataProcessing->new();
775             my $sequence = "ACGTACGT";
776             my $scores = "20 25 34 12 23 45 51 23";
777             if( $sage->process_read( $sequence, $scores ) ) {
778             my @tags = $sage->get_tags();
779             print "Current tags:\n";
780             map { print " ".$_."\n" } @tags;
781             }
782              
783             =cut
784              
785 2     2 1 4 my $this = shift;
786              
787 2         7 my $enzymeLength = length( $this->{'enzyme'} );
788 2         8 my $tagLength = $this->{'protocol'}->{'TAG_LENGTH'};
789 2         5 my @ignoreTags = @{$this->{'protocol'}->{'IGNORE_TAGS'}};
  2         9  
790              
791 2         4 my @tags;
792              
793 2         4 foreach my $ditag ( keys %{$this->{'ditags'}} ) {
  2         43  
794              
795 122         222 my @scoresList;
796 122 50       288 if( $this->{'keep_duplicates'} == 1 ) {
797 0         0 @scoresList = @{$this->{'ditags'}{$ditag}};
  0         0  
798             } else {
799 122         134 $scoresList[0] = ${$this->{'ditags'}}{$ditag};
  122         459  
800             }
801              
802             #my $scores = ${$this->{'ditags'}}{$ditag};
803 122         213 foreach my $scores ( @scoresList ) {
804              
805 122 50       328 if( $scores eq '' ) {
806             # no scores were provided
807             }
808              
809 122         303 my $tag1 = substr( $ditag, $enzymeLength, $tagLength );
810 122         158 my $bIgnore = 0;
811 122         188 foreach my $ignoreTag ( @ignoreTags ) {
812 244 50       644 if( $ignoreTag eq $tag1 ) {
813 0         0 $bIgnore = 1;
814 0         0 last;
815             }
816             }
817 122 50       253 if( $bIgnore == 1 ) {
818 0         0 $this->{'stats'}->{'ignored_tags'}++;
819             } else {
820 122 50       467 if( $tag1 !~ /^[ACGT]+$/ ) {
821 0         0 $this->{'stats'}->{'bad_tags'}++;
822             } else {
823 122         171 my $tagScores = '';
824 122 50       349 if( $scores ne '' ) {
825 122         427 $tagScores = substr( $scores, 0, ( $tagLength+$enzymeLength )*3 );
826             }
827 122 100       422 if( $this->{'tag_filter'}->is_valid( $tag1, $tagScores ) ) {
828 120         234 $this->{'stats'}->{'good_tags'}++;
829 120         306 push( @tags, $tag1 );
830             } else {
831 2         9 $this->{'stats'}->{'lowquality_tags'}++;
832             }
833             }
834             }
835              
836 122         290 my $tag2 = substr( $ditag, length( $ditag ) - $tagLength - $enzymeLength, $tagLength );
837 122         219 $tag2 = reverse( $tag2 );
838 122         188 $tag2 =~ tr/ACGT/TGCA/;
839 122         169 $bIgnore = 0;
840 122         197 foreach my $ignoreTag ( @ignoreTags ) {
841 244 50       691 if( $ignoreTag eq $tag2 ) {
842 0         0 $bIgnore = 1;
843 0         0 last;
844             }
845             }
846 122 50       266 if( $bIgnore == 1 ) {
847 0         0 $this->{'stats'}->{'ignored_tags'}++;
848             } else {
849 122 50       459 if( $tag2 !~ /^[ACGT]+$/ ) {
850 0         0 $this->{'stats'}->{'bad_tags'}++;
851             } else {
852 122         161 my $tagScores = '';
853 122 50       352 if( $scores ne '' ) {
854 122         523 $tagScores = substr( $scores, (length( $ditag )-$tagLength-$enzymeLength)*3, ($tagLength+$enzymeLength)*3 );
855 122         1404 $tagScores = join( " ", map { sprintf( "%2i", $_ ) } reverse split( /\s/, $tagScores ) );
  1708         5691  
856             }
857 122 50       1034 if( $this->{'tag_filter'}->is_valid( $tag2, $tagScores ) ) {
858 122         240 $this->{'stats'}->{'good_tags'}++;
859 122         664 push( @tags, $tag2 );
860             } else {
861 0         0 $this->{'stats'}->{'lowquality_tags'}++;
862             }
863             }
864             }
865             }
866             }
867              
868 2         146 return @tags;
869              
870             }
871              
872             #######################################################
873             sub get_tagcounts {
874             #######################################################
875             =pod
876              
877             =head2 get_tagcounts
878              
879             Extracts valid tags from ditags and returns a hashref
880             containing tag sequences paired with their respective
881             counts.
882              
883             B
884              
885             None.
886              
887             B
888              
889             A hashref where the tag sequence is paired
890             with its observed number.
891              
892             B
893              
894             my $sage = Bio::SAGE::DataProcessing->new();
895             my @reads = ( 'ACGTAGACATAGACAAGAGATATAGA',
896             'GATAGACAAAGGAAGATTACAAGATTAT' );
897              
898             foreach my $read ( @reads ) {
899             $sage->process_read( $read );
900             }
901              
902             # get tag counts
903             my %counts = %{$sage->get_tagcounts()};
904              
905             # print tag counts
906             map { print join( "\t", $_, $counts{$_} ) . "\n" } keys %counts;
907              
908             =cut
909              
910 1     1 1 832 my $this = shift;
911              
912 1         2 my %counts;
913              
914 1         6 my @tags = $this->get_tags();
915 1         12 map { $counts{$_}++ } @tags;
  121         292  
916              
917 1         19 return \%counts;
918              
919             }
920              
921             #######################################################
922             sub get_ditag_base_distribution {
923             #######################################################
924             =pod
925              
926             =head2 get_ditag_base_distribution [$minLength], [$maxLength]
927              
928             Calculates the distribution of bases at each position and both
929             orientations of a set of ditags. This distribution is used
930             for calculating the 'expected' nucleotide count when determining
931             extra bases using get_extra_base_calculation.
932              
933             For example:
934              
935             CATGAAACCGTATGTAGAGAGGGACACATG
936             CATGTAGACAGATAGACACAGATACCATG
937              
938             has a distribution of:
939              
940             +---------------+---------------+
941             | forward | reverse |
942             +-----+---+---+---+---+---+---+---+---+
943             | pos | A | C | G | T | A | C | G | T |
944             +-----+---+---+---+---+---+---+---+---+
945             | 0 | 0 | 2 | 0 | 0 | 0 | 2 | 0 | 0 |
946             | 1 | 2 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
947             | 2 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 2 |
948             | 3 | 0 | 0 | 2 | 0 | 0 | 0 | 2 | 0 |
949             | 4 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
950             | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
951             | 6 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
952             | ... |
953             | 28 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 |
954             | 29 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
955             +-----+---+---+---+---+---+---+---+---+
956              
957             B
958              
959             I<$minLength> (optional)
960              
961             Ignore ditags that are smaller than this minimum length.
962             If the argument is not supplied, then the minimum
963             ditag length for the currently selected protocol
964             is used.
965              
966             I<$maxLength> (optional)
967              
968             Ignore ditags that are larger than this maximum length.
969             If the argument is not supplied, then the maximum
970             ditag length for the currently selected protocol
971             is used.
972              
973             B
974              
975             A hashref where the key is the zero-based base position
976             index, and the value is a hashref where the key is the
977             nucleotide and the value is a hashref where the key is
978             either 'fwd' or 'rev' and the value is the count of that
979             nucleotide (whew!).
980              
981             i.e. $HASH{$idx}->{'A'}->{'fwd'} = 23;
982              
983             B
984              
985             my $sage = Bio::SAGE::DataProcessing->new();
986             my @ditags = ( 'CATGAAACCGTATGTAGAGAGGGACACATG',
987             'CATGTAGACAGATAGACACAGATACCATG' );
988              
989             my %DIST = %{$sage->get_ditag_base_distribution()};
990              
991             # print distribution table
992             foreach my $idx ( sort { $a <=> $b } keys %DIST ) {
993             print $idx . " ";
994             print join( " ", map { defined( $DIST{$idx}->{$_} ) ? $DIST{$idx}->{$_} : 0 } ( 'A','C','G','T' ) );
995             print "\n";
996             }
997              
998             =cut
999              
1000 1     1 1 778 my $this = shift;
1001 1   33     11 my $minLength = shift || $this->{'protocol'}->{'MINIMUM_DITAG_LENGTH'};
1002 1   33     12 my $maxLength = shift || $this->{'protocol'}->{'MAXIMUM_DITAG_LENGTH'};
1003              
1004 1         2 my %distribution;
1005              
1006             # get forward distribution
1007 1         3 foreach my $ditag ( keys %{$this->{'ditags'}} ) {
  1         19  
1008 61         540 my @bps = split( //, $ditag );
1009 61 50 33     354 if( $minLength <= length( $ditag ) && length( $ditag ) <= $maxLength ) {
1010 61         147 for( my $i = 0; $i < length( $ditag ); $i++ ) {
1011 1866         6113 $distribution{$i}->{$bps[$i]}->{'fwd'}++;
1012             }
1013             }
1014             }
1015              
1016             # get reverse distribution
1017 1         14 foreach my $ditag ( keys %{$this->{'ditags'}} ) {
  1         17  
1018 61         112 $ditag = reverse( $ditag );
1019 61         92 $ditag =~ tr/ACGT/TGCA/;
1020 61         498 my @bps = split( //, $ditag );
1021 61 50 33     314 if( $minLength <= length( $ditag ) && length( $ditag ) <= $maxLength ) {
1022 61         127 for( my $i = 0; $i < length( $ditag ); $i++ ) {
1023 1866         5362 $distribution{$i}->{$bps[$i]}->{'fwd'}++;
1024             }
1025             }
1026             }
1027              
1028 1         19 return \%distribution;
1029              
1030             }
1031              
1032             #######################################################
1033             sub get_ditag_length_distribution {
1034             #######################################################
1035             =pod
1036              
1037             =head2 get_ditag_length_distribution
1038              
1039             Calculates the distribution of ditag lengths for a set
1040             of ditags.
1041              
1042             For example:
1043              
1044             CATGAAACCGTATGTAGAGAGGGACACATG
1045             CATGTAGACAGATAGACACAGATACCATG
1046             CATGTATCGCGGCATTACGATCTAGAACATG
1047             CATGACGACTATATCGATAGCTAACCATG
1048              
1049             has a distribution of:
1050              
1051             +-----+---+
1052             | len | N |
1053             +-----+---+
1054             | 29 | 2 |
1055             | 30 | 1 |
1056             | 31 | 1 |
1057             +-----+---+
1058              
1059             B
1060              
1061             None.
1062              
1063             B
1064              
1065             A hashref where the key is the ditag length, and
1066             the value is the number of ditags that have this
1067             length.
1068              
1069             i.e. $HASH{$len} = 1024;
1070              
1071             B
1072              
1073             my %DIST = %{$sage->get_ditag_length_distribution()};
1074              
1075             # print distribution table
1076             foreach my $idx ( sort { $a <=> $b } keys %DIST ) {
1077             print join( "\t", $idx, $DIST{$idx} ) . "\n";
1078             }
1079              
1080             =cut
1081              
1082 1     1 1 797 my $this = shift;
1083              
1084 1         4 my %dist;
1085 1         3 foreach my $ditag ( keys %{$this->{'ditags'}} ) {
  1         22  
1086 61         85 $dist{length($ditag)}++;
1087             }
1088              
1089 1         10 return \%dist;
1090              
1091             }
1092              
1093             #######################################################
1094             sub get_extra_base_calculation {
1095             #######################################################
1096             =pod
1097              
1098             =head2 get_extra_base_calculation $tag
1099              
1100             Calculates the number of times a given nucleotide is
1101             seen at additional positions of a SAGE tag. This method
1102             is only supported in methods where there the range of
1103             ditag size allows for extra bases to be included in
1104             some (or all) population of ditags.
1105              
1106             For example, the ditag sequence:
1107              
1108             CATGTATCGCGGCATTACGATCTAGAACATG
1109              
1110             becomes:
1111              
1112             CATGTATCGCGGCA and CATGTTCTAGATCG
1113              
1114             but an additonal TTA sequence is stored in the middle.
1115             Some or all of these extra bases may arise from one or
1116             both of the tags.
1117              
1118             B
1119              
1120             I<$tag>
1121              
1122             The tag sequence that is the focus of the extra base
1123             calculation. Only ditags that have this tag sequence
1124             are considered in the calculation.
1125              
1126             The method checks the length of specified tag and
1127             checks whether it begins with the expected anchoring
1128             enzyme site. If the tag appears to be missing just
1129             the anchoring enzyme site, it will append this
1130             automatically. Otherwise, the method will die.
1131              
1132             B
1133              
1134             A hashref that is several keys deep in the order:
1135             extra base position, ditag length, nucleotide. The
1136             key is the number of times the keyed event occured.
1137              
1138             In other words:
1139             $hash->{$position}->{$ditag_length}->{$nucleotide} = $obs;
1140              
1141             B
1142              
1143             my $dist = $sage->get_extra_base_calculation( "AAACGAATTA" );
1144              
1145             # dump results
1146             foreach my $ditag_length ( keys %$dist ) {
1147             foreach my $position ( keys %{$dist->{$ditag_length}} ) {
1148             foreach my $nucleotide ( keys %{$dist->{$ditag_length}->{$position}} ) {
1149             print join( ",", $ditag_length, $position, $nucleotide ) .
1150             "\t" .
1151             $dist->{$ditag_length}->{$position}->{$nucleotide} .
1152             "\n";
1153             }
1154             }
1155             }
1156              
1157             =cut
1158              
1159 1     1 1 419 my $this = shift;
1160 1   50     6 my $tag = shift || die( $PACKAGE . "::get_extra_base_calculation no tag argument specified." );
1161              
1162 1         4 $tag = uc( $tag );
1163              
1164 1         3 my $enzyme = $this->{'enzyme'};
1165 1 50 33     20 if( !( $tag =~ /^$enzyme/ && length( $tag ) == $this->{'protocol'}->{'TAG_LENGTH'}+length($enzyme) ) ) {
1166 1 50       10 if( length($tag) == $this->{'protocol'}->{'TAG_LENGTH'} ) {
1167 1         3 $tag = $enzyme . $tag;
1168             } else {
1169 0         0 die( $PACKAGE . "::get_extra_base_calculation tag '$tag' is not valid." );
1170             }
1171             }
1172 1         3 my $revtag = reverse( $tag );
1173 1         4 $revtag =~ tr/ACGT/TGCA/;
1174              
1175 1         6 my @ditags = $this->get_ditags();
1176              
1177 1         10 my $minDitagLength = 2 * ( length( $enzyme ) + $this->{'protocol'}->{'TAG_LENGTH'} );
1178              
1179 1         1 my @data;
1180 1         3 foreach my $ditag ( @ditags ) {
1181              
1182 61 50       135 if( $minDitagLength >= length($ditag) ) {
1183 0         0 next; # ignored because 0 extra bases
1184             }
1185              
1186 61 100       217 if( $ditag =~ /^$tag/ ) {
1187 1         3 push( @data, $ditag );
1188 1         4 next;
1189             }
1190 60 100       250 if( $ditag =~ /$revtag$/ ) {
1191 1         3 $ditag = reverse( $ditag );
1192 1         3 $ditag =~ tr/ACGT/TGCA/;
1193 1         3 push( @data, $ditag );
1194 1         3 next;
1195             }
1196              
1197             }
1198              
1199 1         5 my $taglength = $this->{'protocol'}->{'TAG_LENGTH'} + length($enzyme);
1200              
1201             # TODO: check unlikely event that same tag is sticking together
1202             # if so, we need to flip the tag in both directions to get extra bases from both
1203             # directions
1204              
1205 1         4 my %results;
1206              
1207              
1208 1         4 foreach my $ditag ( @data ) {
1209 2         44 $ditag =~ /^.{$taglength}(.*?).{$taglength}$/;
1210              
1211 2         12 my @extra_bases = split( //, $1 );
1212              
1213 2         11 for( my $i = 0; $i < scalar( @extra_bases ); $i++ ) {
1214 5         33 $results{$i}->{length($ditag)}->{$extra_bases[$i]}++;
1215             }
1216             }
1217              
1218 1         11 return \%results;
1219              
1220             }
1221              
1222             #######################################################
1223             sub print_extra_base_calculation {
1224             #######################################################
1225             =pod
1226              
1227             =head2 print_extra_base_calculation $resultRef, [$handle]
1228              
1229             Prints a formatted report to the specified handle.
1230              
1231             An example output looks like:
1232              
1233             +------+------+------+------+
1234             | A | C | G | T |
1235             +----+---+------+------+------+------+
1236             | 29 | 0 | 183 | 43 | 31 | 68 |
1237             | 30 | 0 | 2637 | 23 | 23 | 37 |
1238             | 31 | 0 | 2624 | 0 | 14 | 0 |
1239             | 32 | 0 | 665 | 0 | 1 | 0 |
1240             +----+---+------+------+------+------+
1241             | 30 | 1 | 639 | 784 | 435 | 862 |
1242             | 31 | 1 | 188 | 1875 | 198 | 377 |
1243             | 32 | 1 | 4 | 658 | 0 | 4 |
1244             +----+---+------+------+------+------+
1245             | 31 | 2 | 609 | 588 | 355 | 1086 |
1246             | 32 | 2 | 100 | 204 | 106 | 256 |
1247             +----+---+------+------+------+------+
1248             | 32 | 3 | 199 | 95 | 88 | 284 |
1249             +----+---+------+------+------+------+
1250              
1251             The first two columns are the ditag size and the extra
1252             base's relative 0-indexed position, respectively.
1253             The remaining columns are the four nucleotides and the
1254             number of ditags that met the condition described in
1255             the first two columns.
1256              
1257             In this example, the investigator could strongly reason
1258             that the extra nucleotides are AC.
1259              
1260             B
1261              
1262             I<$resultRef>
1263              
1264             A properly formed hashref containing the results of
1265             an extra base calculation. This data structure can
1266             be obtained with the get_extra_base_calculation method
1267             (see the documentation for that method for more details).
1268              
1269             I<$handle> (optional)
1270              
1271             A Perl handle to output the results to. If this
1272             argument is not specified, STDOUT is used by default.
1273              
1274             B
1275              
1276             my $dist = $sage->get_extra_base_calculation( "AAACGAATTA" );
1277              
1278             $sage->print_extra_base_calculation( $dist, *STDERR );
1279              
1280             =cut
1281              
1282 0     0 1 0 my $this = shift;
1283 0   0     0 my $pResults = shift || die( $PACKAGE . "::print_extra_base_calculation result hashref not provided." );
1284 0   0     0 my $handle = shift || *STDOUT;
1285              
1286 0         0 print $handle " +------+------+------+------+\n";
1287 0         0 my @bps = ('A','C','G','T');
1288 0         0 print $handle join( " | ", " ", map { " ".$_." " } @bps ) . " |\n";
  0         0  
1289 0         0 print $handle "+----+---+------+------+------+------+\n";
1290              
1291 0         0 foreach my $position ( sort { $a <=> $b } keys %$pResults ) {
  0         0  
1292 0         0 foreach my $ditag_length ( sort { $a <=> $b } keys %{$pResults->{$position}} ) {
  0         0  
  0         0  
1293             #my @bps = keys %{$pResults->{$ditag_length}->{$position}};
1294 0   0     0 print $handle "| " . join( " | ", $ditag_length,
1295             $position,
1296             join( " | ",
1297 0         0 map { sprintf( "%4i", $pResults->{$position}->{$ditag_length}->{$_} || 0 ) } @bps ) )
1298             . " |\n";
1299             }
1300 0         0 print $handle "+----+---+------+------+------+------+\n";
1301             }
1302              
1303             }
1304              
1305             sub __extract_ditags {
1306              
1307 5     5   9 my $this = shift;
1308 5   50     24 my $read_sequence = shift || die( $PACKAGE . "::__extract_ditags no sequence specified." );
1309 5         17 my $read_scores = shift;
1310              
1311 5         28 $read_sequence = uc( $read_sequence );
1312              
1313 5 50       21 if( defined( $read_scores ) ) {
1314             # make sure scores are padded to two digits
1315 5         1572 $read_scores = join( " ", map { sprintf( "%02i", $_ ) } split( /\s+/, $read_scores ) );
  4003         13735  
1316             }
1317              
1318 5 50       1773 if( $DEBUG >= 1 ) {
1319 0         0 print STDERR $PACKAGE . "::__extract_ditags\n";
1320 0         0 print STDERR " \$read_sequence = $read_sequence\n";
1321 0         0 print STDERR " \$read_scores = $read_scores\n";
1322             }
1323              
1324 5         20 my $enzyme = $this->{'enzyme'};
1325 5         19 my $minLength = $this->{'protocol'}->{'MINIMUM_DITAG_LENGTH'};
1326 5         16 my $maxLength = $this->{'protocol'}->{'MAXIMUM_DITAG_LENGTH'};
1327              
1328             # get position(s) of anchoring enzyme sites
1329 5         7 my $pos = -1;
1330 5         11 my @positions;
1331 5         48 while( ( $pos = index( $read_sequence, $enzyme, $pos ) ) > -1 ) {
1332 106         224 push( @positions, $pos );
1333 106         430 $pos++;
1334             }
1335              
1336 5         28 for( my $i = 0; $i < scalar( @positions )-1; $i++ ) {
1337              
1338 101         443 my $ditag_sequence = substr( $read_sequence, $positions[$i], $positions[($i+1)]+length( $enzyme )-$positions[$i] );
1339 101         205 $this->{'stats'}->{'total_ditags'}++;
1340 101 100       425 if( $ditag_sequence !~ /^[ACGT]+$/ ) {
1341 3         11 $this->{'stats'}->{'badseq_ditags'}++;
1342 3         22 next;
1343             }
1344 98 100       271 if( length( $ditag_sequence ) < $minLength ) {
1345 4         12 $this->{'stats'}->{'short_ditags'}++;
1346 4         15 next;
1347             }
1348 94 100       459 if( length( $ditag_sequence ) > $maxLength ) {
1349 4         9 $this->{'stats'}->{'long_ditags'}++;
1350 4         18 next;
1351             }
1352 90         122 my $ditag_scores = undef;
1353 90 50       224 if( defined( $read_scores ) ) {
1354 90         613 $ditag_scores = substr( $read_scores, $positions[$i]*3, ($positions[($i+1)]+length( $enzyme )-$positions[$i])*3 );
1355             }
1356 90 100       319 if( !$this->{'ditag_filter'}->is_valid( $ditag_sequence, $ditag_scores ) ) {
1357 29         57 $this->{'stats'}->{'lowquality_ditags'}++;
1358 29         495 next;
1359             }
1360             # }
1361              
1362 61 50 33     80 if( defined( ${$this->{'ditags'}}{$ditag_sequence} ) && $this->{'keep_duplicates'} == 0 ) {
  61         283  
1363             # we already have this ditag, which one is better?
1364 0 0       0 if( defined( $read_scores ) ) {
1365 0         0 my $result = $this->{'ditag_filter'}->compare( $ditag_scores, $this->{'ditags'}{$ditag_sequence} );
1366 0 0       0 if( $result <= -1 ) {
1367             # new one is better
1368 0         0 $this->{'ditags'}{$ditag_sequence} = $ditag_scores;
1369             }
1370             }
1371 0         0 $this->{'stats'}->{'duplicate_ditags'}++;
1372 0         0 next;
1373             }
1374              
1375 61 50       166 if( $this->{'keep_duplicates'} == 1 ) {
1376 0         0 $this->{'stats'}->{'good_ditags'}++;
1377 0         0 push( @{$this->{'ditags'}{$ditag_sequence}}, $ditag_scores );
  0         0  
1378 0         0 next;
1379             }
1380              
1381 61         108 $this->{'stats'}->{'good_ditags'}++;
1382 61         586 $this->{'ditags'}{$ditag_sequence} = $ditag_scores;
1383              
1384             }
1385              
1386             }
1387              
1388             sub save {
1389              
1390 0     0 0   my $this = shift;
1391 0   0       my $handle = shift || *STDOUT;
1392              
1393 0           print $handle '' . "\n";
1394 0           print $handle "\n";
1395              
1396 0           print $handle " \n";
1397              
1398 0           print $handle " " . $this->{'enzyme'} . "\n";
1399 0           print $handle " " . $this->{'keep_duplicates'} . "\n";
1400              
1401              
1402 0           print $handle " \n";
1403              
1404 0           print $handle "\n";
1405              
1406             # $self->{'enzyme'} = $enzyme;
1407             #$self->{'protocol'} = $protocol;
1408             #$self->{'keep_duplicates'} = $bKeepDupes;
1409             # $self->set_protocol( $protocol );
1410              
1411             # $self->{'ditag_filter'} = $DEFAULT_DITAG_FILTER;
1412             #$self->{'tag_filter'} = $DEFAULT_TAG_FILTER;
1413              
1414              
1415              
1416             }
1417              
1418 0     0 0   sub load {
1419             # make static
1420             }
1421              
1422              
1423             1;
1424              
1425             __END__