File Coverage

lib/Bio/SAGE/DataProcessing.pm

Criterion	Covered	Total	%
statement	254	340	74.7
branch	62	106	58.4
condition	15	52	28.8
subroutine	18	23	78.2
pod	14	16	87.5
total	363	537	67.6

line	stmt	bran	cond	sub	pod	time	code
1							# *%) $Id: DataProcessing.pm,v 1.26 2004/10/15 22:30:46 scottz Exp $
2							#
3							# Copyright (c) 2004 Scott Zuyderduyn .
4							# All rights reserved. This program is free software; you
5							# can redistribute it and/or modify it under the same
6							# terms as Perl itself.
7
8							package Bio::SAGE::DataProcessing;
9
10							=pod
11
12							=head1 NAME
13
14							Bio::SAGE::DataProcessing - Processes raw serial analysis of gene expression (SAGE) data.
15
16							=head1 SYNOPSIS
17
18							use Bio::SAGE::DataProcessing;
19							$sage = Bio::SAGE::DataProcessing->new();
20
21							# open sequence and quality files
22							open( READS, "library.fasta" );
23							open( QUAL, "library.qual.fasta" );
24
25							# collect ditags and statistics from reads
26							$sage->process_library( READS, QUAL );
27
28							# close files
29							close( READS );
30							close( QUAL );
31
32							# output tags in descending order of expression
33							my %tags = %{$sage->get_tagcounts()};
34							open( TAGS, ">library.tags" );
35							map { print TAGS join( "\t", $_, $tags{$_} ) . "\n" } sort { $tags{$b} <=> $tags{$a} } keys %tags;
36							close( TAGS );
37
38							# tag AAACCGGGTT matches two different genes
39							# so 15th base pair may help resolve this
40							$sage->print_extra_base_calculation( $sage->get_extract_base_calculation( "AAACCGGGTT" ) );
41
42							=head1 DESCRIPTION
43
44							This module provides several tools for processing and
45							analyzing serial analysis of gene expression (SAGE)
46							libraries.
47
48							=head1 README
49
50							B
51
52							Serial analysis of gene expression (SAGE) is a molecular
53							technique for generating a near-global snapshot of a
54							cell population�s transcriptome. Briefly, the technique
55							extracts short sequences at defined positions of
56							transcribed mRNA. These short sequences are then paired
57							to form ditags. The ditags are concatamerized to form
58							long sequences that are then cloned. The cloned DNA is
59							then sequenced. Bioinformatic techniques are then
60							employed to determine the original short tag sequences,
61							and to derive their progenitor mRNA. The number of times
62							a particular tag is observed can be used to quantitate
63							the amount of a particular transcript. The original
64							technique was described by Velculescu et al. (1995) and
65							utilized an ~14bp sequence tag. A modified protocol
66							was introduced by Saha et al. (2002) that produced ~21bp
67							tags.
68
69							B
70
71							This module facilitates the processing of SAGE data.
72							Specifically:
73
74							1. extracting ditags from raw sequence reads.
75							2. extracting tags from ditags, with the option to
76							exclude tags if the Phred scores (described by
77							Ewing and Green, 1998a and Ewing et al., 1998b)
78							do not meet a minimum cutoff value.
79							3. calculating descriptive values
80							4. statistical analysis to determine, where possible,
81							additional nucleotides to extend the length of the
82							SAGE tag (thus facilitating more accurate tag to
83							gene mapping).
84
85							Both regular SAGE (14mer tag) and LongSAGE (21mer tag)
86							are supported by this module. Future protocols should
87							be configurable with this module.
88
89							B
90
91							Velculescu V, Zhang L, Vogelstein B, Kinzler KW. (1995)
92							Serial analysis of gene expression. Science. 270:484-487.
93
94							Saha S, Sparks AB, Rago C, Akmaev V, Wang CJ, Vogelstein B,
95							Kinzler KW, Velculescu V. (2002) Using the transcriptome
96							to annotate the genome. Nat. Biotechnol. 20:508-512.
97
98							Ewing B, Hillier L, Wendl MC, Green P. (1998a) Base-calling
99							of automated sequencer traces using phred. I. Accuracy
100							assessment. Genome Res. 8:175-185.
101
102							Ewing B, Green P. (1998b) Base-calling of automated
103							sequencer traces using phred. II. Error probabilities.
104							Genome Res. 8:186-194.
105
106							=head1 INSTALLATION
107
108							Follow the usual steps for installing any Perl module:
109
110							perl Makefile.PL
111							make test
112							make install
113
114							=head1 PREREQUISITES
115
116							None (this module used to require the C package).
117
118							=head1 CHANGES
119
120							1.20 2004.10.15 - Minor spelling errors and misuse of terminology fixed in docs.
121							- Module now allows FASTA files with a blank line folling the
122							header (denoting an attempted read with no sequence), but prints
123							a warning to STDERR that this has happened. Module died previously.
124							1.11 2004.06.20 - Added flag in constructor to keep duplicate ditags.
125							1.10 2004.06.02 - Wrote new documentation and modified several methods to use the read-by-read
126							processing approach (see line below).
127							- Revamped the module to conserve memory. Reads are now processed one at a time
128							and then discarded. The memory requirements in the previous versions were
129							prohibitive to those with regular desktop machines.
130							- The Bio::SAGE::DataProcessing::Filter package can be subclassed to create
131							custom filters at the ditag and tag processing steps (previous versions only
132							allowed one approach to ditag/tag filtering).
133							1.01 2004.05.27 - Fixed bug where extract_tag_counts didn't work with quality cutoff defined.
134							- extract_tags was not applying the get_quality_cutoff value (was returning all data)
135							- Duplicate ditags are now removed by default.
136							1.00 2004.05.23 - Initial release.
137
138							=cut
139
140	1			1		983	use strict;
	1					2
	1					43
141	1			1		1004	use diagnostics;
	1					242296
	1					16
142	1			1		676	use vars qw( $VERSION @ISA @EXPORT @EXPORT_OK $PROTOCOL_SAGE $PROTOCOL_LONGSAGE $DEBUG $ENZYME_NLAIII $ENZYME_SAU3A $DEFAULT_DITAG_FILTER $DEFAULT_TAG_FILTER );
	1					2
	1					232
143
144							require Exporter;
145							require AutoLoader;
146
147							@ISA = qw( Exporter AutoLoader );
148							@EXPORT = qw();
149							$VERSION = "1.11";
150
151							#use Statistics::Distributions;
152	1			1		777	use Bio::SAGE::DataProcessing::Filter;
	1					3
	1					58
153	1			1		426	use Bio::SAGE::DataProcessing::AveragePhredFilter;
	1					2
	1					35
154	1			1		492	use Bio::SAGE::DataProcessing::MinimumPhredFilter;
	1					2
	1					3638
155
156							my $PACKAGE = "Bio::SAGE::DataProcessing";
157
158							=pod
159
160							=head1 VARIABLES
161
162							B
163
164							=over 2
165
166							I<$PROTOCOL_SAGE>
167
168							Hashref containing default protocol parameters for the
169							regular/original SAGE protocol (see set_protocol
170							documentation for more information).
171
172							I<$PROTOCOL_LONGSAGE>
173
174							Hashref containing default protocol parameters for the
175							LongSAGE protocol (see set_protocol documentation
176							for more information).
177
178							I<$ENZYME_NLAIII> = 'CATG'
179
180							Constant denoting the recognition sequence for NlaIII.
181
182							I<$ENZYME_SAU3A> = 'GATC'
183
184							Constant denoting the recognition sequence for Sau3a.
185
186							I<$DEFAULT_TAG_FILTER>
187
188							A default tag filter used when none is specified.
189							This filter rejects tags that contain any base pair
190							with a Phred quality score < 15 or an average
191							Phred quality score over all bases < 30.
192
193							I<$DEFAULT_DITAG_FILTER>
194
195							A default ditag filter used when none is specified.
196							This filter rejects ditags that contain any base pair
197							with a Phred quality score < 20.
198
199							=back
200
201							B
202
203							=over 2
204
205							I<$DEBUG = 0>
206
207							Prints debugging output if value if >= 1.
208
209							=back
210
211							=cut
212
213							my @ignoreTags = ( 'TCCCTATTAA', 'TCCCCGTACA' ); # linker derived sequences
214							my @ignoreLongTags = ( 'TCGGACGTACATCGTTA', 'TCGGATATTAAGCCTAG' ); # linker derived sequences
215
216							my %params_sage = ( 'MINIMUM_DITAG_LENGTH' => 29,
217							'MAXIMUM_DITAG_LENGTH' => 32,
218							'TAG_LENGTH' => 10,
219							'IGNORE_TAGS' => \@ignoreTags );
220							my %params_longsage = ( 'MINIMUM_DITAG_LENGTH' => 40,
221							'MAXIMUM_DITAG_LENGTH' => 46,
222							'TAG_LENGTH' => 17,
223							'IGNORE_TAGS' => \@ignoreLongTags );
224
225							$PROTOCOL_SAGE = \%params_sage; # regular SAGE (14-mer tags)
226							$PROTOCOL_LONGSAGE = \%params_longsage; # LongSAGE (21-mer tags)
227
228							$ENZYME_NLAIII = "CATG";
229							$ENZYME_SAU3A = "GATC";
230
231							$DEFAULT_TAG_FILTER = Bio::SAGE::DataProcessing::AveragePhredFilter->new( 30, 15 );
232							$DEFAULT_DITAG_FILTER = Bio::SAGE::DataProcessing::MinimumPhredFilter->new( 20 );
233
234							$DEBUG = 0; # set this flag to non-zero to enable debugging messages
235
236							=pod
237
238							=head1 CLASS METHODS
239
240							=cut
241
242							#######################################################
243							sub new {
244							#######################################################
245							=pod
246
247							=head2 new <$enzyme>, <\%protocol>, <$bKeepDuplicates>
248
249							Constructor for a new Bio::SAGE::DataProcessing object.
250
251							B
252
253							I<$enzyme> (optional)
254
255							The anchoring enzyme recognition site. This is
256							typically NlaIII (CATG) or Sau3a (GATC). The
257							default is the recognition sequence "CATG" (NlaIII).
258
259							I<\%protocol> (optional)
260
261							A hashref containing specifics of the protocol. Two
262							pre-made parameter sets are available: $PROTOCOL_SAGE
263							(regular SAGE) and $PROTOCOL_LONGSAGE (LongSAGE).
264
265							The required hash keys:
266
267							MINIMUM_DITAG_LENGTH \| The shortest length a valid
268							ditag can be (the length
269							should include the anchoring
270							enzyme site sequences).
271							MAXIMUM_DITAG_LENGTH \| The longest length a valid
272							ditag can be (the length
273							should include the anchoring
274							enzyme site sequences).
275							TAG_LENGTH \| The expected tag length (the
276							length should NOT include the
277							anchoring enzyme site sequence).
278							IGNORE_TAGS \| An arrayref listing tag
279							sequences that should be
280							ignored during tag extraction
281							(i.e. linker-derived sequences).
282
283							The parameters for the default configurations are:
284
285							+----------------+--------------------+
286							\| $PROTOCOL_SAGE \| $PROTOCOL_LONGSAGE \|
287							+----------------------+----------------+--------------------+
288							\| MINIMUM_DITAG_LENGTH \| 29 \| 40 \|
289							+----------------------+----------------+--------------------+
290							\| MAXIMUM_DITAG_LENGTH \| 32 \| 46 \|
291							+----------------------+----------------+--------------------+
292							\| TAG_LENGTH \| 10 \| 17 \|
293							+----------------------+----------------+--------------------+
294							\| IGNORE_TAGS \| TCCCTATTAA \| TCGGACGTACATCGTTA \|
295							\| \| TCCCCGTACA \| TCGGATATTAAGCCTAG \|
296							+----------------------+----------------+--------------------+
297
298							Once the Bio::SAGE::DataProcessing object has been instantiated,
299							the enzyme and protocol CANNOT be altered.
300
301							I<$bKeepDuplicates> (optional)
302
303							If this argument is TRUE (non-zero) then ditags with
304							identical sequence will be kept. The default behavior is
305							to discard these ditags as they likely represent
306							preferential PCR amplification.
307
308							B
309
310							my $sage = Bio::SAGE::DataProcessing->new( $Bio::SAGE::DataProcessing::ENZYME_NLAIII,
311							$Bio::SAGE::DataProcessing::PROTOCOL_LONGSAGE );
312
313							# alternatively:
314							my $sage = Bio::SAGE::DataProcessing->new( "CATG", { 'MINIMUM_DITAG_LENGTH' => 31,
315							'MAXIMUM_DITAG_LENGTH' => 34,
316							'TAG_LENGTH' => 11,
317							'IGNORE_TAGS' => \( 'TCCCTATTAA', 'TCCCCGTACA' ) } );
318
319							=cut
320
321	1			1	1	427	my $this = shift;
322	1		50			8	my $enzyme = shift \|\| "CATG";
323	1		33			4	my $protocol = shift \|\| $PROTOCOL_SAGE;
324	1		50			6	my $bKeepDupes = shift \|\| 0;
325	1		33			5	my $class = ref( $this ) \|\| $this;
326	1					2	my $self = {};
327	1					3	bless( $self, $class );
328
329							# set instance variables
330	1					6	$self->{'enzyme'} = $enzyme;
331	1					2	$self->{'protocol'} = $protocol;
332	1					2	$self->{'keep_duplicates'} = $bKeepDupes;
333							# $self->set_protocol( $protocol );
334
335	1					2	$self->{'ditag_filter'} = $DEFAULT_DITAG_FILTER;
336	1					2	$self->{'tag_filter'} = $DEFAULT_TAG_FILTER;
337
338	1					4	return $self;
339
340							}
341
342							=pod
343
344							=head1 INSTANCE METHODS
345
346							=cut
347
348							#######################################################
349							sub set_ditag_filter {
350							#######################################################
351							=pod
352
353							=head2 set_ditag_filter $filterObject
354
355							Sets a new filter object (a concrete subclass of
356							Bio::SAGE::DataProcessing::Filter) that is applied
357							to ditags as they're extracted from sequence reads.
358
359							The default filter uses a Bio::SAGE::DataProcessing::MinimumPhredFilter
360							instance that is set to reject any ditags that do not
361							have at least Phred 20 (p<=0.01) for all nucleotides.
362
363							B
364
365							I<$filterObject>
366
367							An object ref to a concrete subclass of
368							Bio::SAGE::DataProcessing::Filter.
369
370							=cut
371
372	0			0	1	0	my $self = shift;
373	0		0			0	$self->{'ditag_filter'} = shift \|\| die( $PACKAGE . "::set_ditag_filter no filter specified." );
374
375							}
376
377							#######################################################
378							sub set_tag_filter {
379							#######################################################
380							=pod
381
382							=head2 set_tag_filter $filterObject
383
384							Sets a new filter object (a concrete subclass of
385							Bio::SAGE::DataProcessing::Filter) that is applied
386							to tags as they're extracted from ditags.
387
388							The default filter uses a Bio::SAGE::DataProcessing::AveragePhredFilter
389							instance that is set to reject any tags that do not
390							have an average Phred 30 score (p<=0.001) over all
391							nucleotides and at least Phred 15 (p<=0.0316) at
392							each nucleotide.
393
394							B
395
396							I<$filterObject>
397
398							An object ref to a concrete subclass of
399							Bio::SAGE::DataProcessing::Filter.
400
401							=cut
402
403	0			0	1	0	my $self = shift;
404	0		0			0	$self->{'tag_filter'} = shift \|\| die( $PACKAGE . "::set_tag_filter no filter specified." );
405
406							}
407
408							#######################################################
409							sub get_enzyme {
410							#######################################################
411							=pod
412
413							=head2 get_enzyme
414
415							Gets the current anchoring enzyme recognition site.
416
417							B
418
419							None.
420
421							B
422
423							The current anchoring enzyme recognition site. By
424							default, this will be 'CATG', the NlaIII recognition
425							site.
426
427							B
428
429							my $sage = Bio::SAGE::DataProcessing->new();
430							print "ENZYME_SITE: " . $sage->get_enzyme() . "\n";
431
432							=cut
433
434	1			1	1	1823	my $this = shift;
435
436	1					7	return $this->{'enzyme'};
437
438							}
439
440							#######################################################
441							sub get_protocol {
442							#######################################################
443							=pod
444
445							=head2 get_protocol
446
447							Gets a copy (in other words, modifying the
448							returned hash does not affect the object's settings)
449							of the hash describing the protocol for this
450							instance.
451
452							B
453
454							None.
455
456							B
457
458							A hashref of the current protocol settings. See the
459							documentation for new (constructor) for more details
460							on the contents of this hash.
461
462							B
463
464							my $sage = Bio::SAGE::DataProcessing->new();
465							print "Default protocol:\n";
466							my %protocol = %{$sage->get_protocol()};
467							map { print $_ . "=" . $protocol{$_} . "\n" } keys %protocol;
468
469							=cut
470
471	1			1	1	4	my $this = shift;
472	1					3	my %protocol = %{$this->{'protocol'}};
	1					10
473
474	1					4	my %copy;
475	1					4	foreach my $key ( keys %protocol ) {
476	4					7	my $val = $protocol{$key};
477	4	100				11	if( $key eq 'IGNORE_TAGS' ) {
478	1					4	my @arr = @$val;
479	1					3	$val = \@arr;
480							}
481	4					11	$copy{$key} = $val;
482							}
483
484	1					8	return \%copy;
485
486							}
487
488							#######################################################
489							sub process_library {
490							#######################################################
491							=pod
492
493							=head2 process_library $sequence_handle, <$scores_handle>
494
495							Processes reads from a Perl handle to FASTA format
496							sequence data. Phred scores in matching FASTA format
497							can be read concurrently.
498
499							An example of FASTA format sequence:
500
501							>SEQUENCE0001
502							ACAGATAGACAGAGATATAGAGACATATTTAGAGACAAATCGCGCAGGCGCGCGACATA
503							TGACTAGTTTATATCATCAGTATTAGCGATTATGACTATTATATATTACTGATTGATCT
504							ATAGCGCGATTATATCTATCTATGCATTCGATCATGCTATTATCGTATACTACTGCTAG
505							AGAGGACGACGCAGGCAGCGATTATATCTATTTATA
506							>SEQUENCE0002
507							CGCGACGCATGTCAGTAGCTAGCTGCGCCCGAATATATATCGTCATACGGATTCGTAGC
508							CCCCCGCGGAGTCTGATTATATCTGATT
509
510							An example of FASTA format quality data:
511
512							>SEQUENCE0001
513							11 17 18 16 19 17 19 19 16 19 19 16 11 10 9 15 10 12 24 24 35 29 29 39 40 40 40 40 37 37 46 46 40 40 40 40 56 56 56 56 35 35 35 35 35 35 56 40 40 46 40 40 39 39 35 39 56 56 51 51
514							51 51 51 56 35 35 35 35 35 35 40 40 51 45 51 51 39 39 39 39 39 39 40 40 40 40 40 40 56 56 56 56 56 46 46 40 39 39 39 45 45 45 56 56 56 56 56 56 56 56 40 39 39 39 39 35 39 39 39 39
515							45 56 45 45 45 45 51 35 39 39 39 39 39 40 40 40 40 40 51 56 56 40 40 40 40 40 43 56 56 56 43 35 35 35 35 35 43 45 45 45 45 45 45 51 51 51 51 51 51 56 56 56 56 56 56 51 51 51 56 56
516							7 7 9 9 11 10 13 11 10 8 10 10 8 8 8 10 10
517							>SEQUENCE0002
518							12 15 17 17 19 15 15 15 13 19 17 17 12 16 11 19 13 24 24 35 35 35 37 37 39 56 56 56 56 56 51 39 32 29 29 29 29 32 56 56 56 35 35 35 35 35 35 56 56 56 56 56 56 56 56 56 56 56 56 40
519							40 40 46 46 40 51 40 40 40 40 40 40 51 39 39 35 35 35 35 40 40 51 45 45 45 45 51 51 56 56 56 56 56 45 45 45 45 51 51 45 45 45 40 40 40 40 40 40 40 40 40 40 56 56 56 56 56 56 51 51
520							15 13 19 17 17 12 16 11 19 13
521
522							B
523
524							I<$sequence_handle>
525
526							A Perl handle to sequence data in FASTA format.
527
528							I<$scores_handle> (optional)
529
530							A Perl handle to Phred scores in FASTA format. The order
531							of entries must match the $sequence_handle data. NOTE:
532							many implementations of Bio::SAGE::DataProcessing::Filter
533							require the scores to carry out their function. In the
534							default implementations, no filtering is done when scores
535							are not defined.
536
537							B
538
539							The number of sequences read.
540
541							B
542
543							my $sage = Bio::SAGE::DataProcessing->new();
544							open( SEQDATA, "sequences.fasta" );
545							my $nReads = $sage->process_library( *SEQDATA );
546							print "NUMBER_READS: $nReads\n";
547
548							=cut
549
550	1		50	1	1	876	my $this = shift \|\| die( $PACKAGE . "::process_library can't be called statically." );
551	1		50			5	my $fh1 = shift \|\| die( $PACKAGE . "::process_library no sequence data handle provided." );
552	1		50			9	my $fh2 = shift \|\| undef;
553
554	1					2	my $nRead = 0;
555
556	1					2	my $currid = '';
557	1					1	my $currseq = '';
558
559	1					2	my $currscoreid = '';
560
561	1					23	while( my $line = <$fh1> ) {
562	75					135	chomp( $line ); $line =~ s/\r//g;
	75					127
563	75	100				176	if( $line =~ /^>(.*?)$/ ) {
564	5					21	my $thisid = $1;
565	5	100				20	if( $currid ne '' ) {
566
567							# do we have scores too?
568
569	4					11	my $scores = '';
570	4	50				15	if( defined( $fh2 ) ) {
571	4					448	while( my $line2 = <$fh2> ) {
572	65					91	chomp( $line2 ); $line2 =~ s/\r//g;
	65					111
573	65	100				151	if( $line2 =~ /^>(.*?)$/ ) {
574	5					14	my $thisscoreid = $1;
575	5	100				12	if( $currscoreid eq '' ) { $currscoreid = $thisscoreid; next; }
	1					2
	1					7
576	4	50				15	if( $currid ne $currscoreid ) { die( $PACKAGE . "::process_library sequence and score data don't match." ); }
	0					0
577	4					6	$currscoreid = $thisscoreid;
578	4					11	last;
579							}
580	60	100				139	if( $scores ne '' ) { $scores .= ' '; }
	56					88
581	60					294	$scores .= $line2;
582							}
583							}
584
585	4	50				19	if( $scores eq '' ) { $scores = undef; }
	0					0
586	4	50				21	if( $this->process_read( $currseq, $scores ) == 0 ) {
587	0					0	print STDERR "Non-fatal error on read >".$currid." ...\n";
588	0					0	print STDERR " Sequence: $currseq\n";
589	0	0				0	print STDERR " Scores : " . ( defined($scores) ? $scores : '' ) . "\n";
590	0					0	print STDERR "...continuing.\n";
591							}
592	4					10	$currseq = '';
593	4					10	$nRead++;
594
595							}
596	5					13	$currid = $thisid;
597	5					52	next;
598							}
599	70					778	$currseq .= $line;
600
601							}
602
603	1	50				9	if( $currid ne '' ) {
604
605							# do we have scores too?
606
607	1					4	my $scores = '';
608	1	50				4	if( defined( $fh2 ) ) {
609	1					275	while( my $line2 = <$fh2> ) {
610	10					20	chomp( $line2 ); $line2 =~ s/\r//g;
	10					21
611	10	50				27	if( $line2 =~ /^>(.*?)$/ ) {
612	0					0	my $thisscoreid = $1;
613	0	0				0	if( $currscoreid eq '' ) { $currscoreid = $thisscoreid; next; }
	0					0
	0					0
614	0	0				0	if( $currid ne $currscoreid ) { die( $PACKAGE . "::process_library sequence and score data don't match." ); }
	0					0
615	0					0	$currscoreid = $thisscoreid;
616	0					0	last;
617							}
618	10	100				32	if( $scores ne '' ) { $scores .= ' '; }
	9					15
619	10					71	$scores .= $line2;
620							}
621							}
622
623	1	50				8	if( $scores eq '' ) { $scores = undef; }
	0					0
624	1	50				8	if( $this->process_read( $currseq, $scores ) == 0 ) {
625	0					0	print STDERR "Error: " . $currid . "\n";
626	0					0	print STDERR $currseq . "\n";
627	0					0	print STDERR $scores . "\n";
628							}
629	1					4	$currseq = '';
630	1					4	$currid = '';
631	1					5	$nRead++;
632
633							}
634
635	1					10	return $nRead;
636
637							}
638
639							#######################################################
640							sub process_read {
641							#######################################################
642							=pod
643
644							=head2 process_read $sequence, <$scores>
645
646							Extracts and filters ditags from the given sequence read
647							(and optionally supplied Phred scores). The default ditag
648							filter (or a filter supplied to set_ditag_filter) is
649							applied during this procedure. The resulting ditags are
650							added to the list of currently processed ditags collected
651							from previous calls to this method.
652
653							Ditags with identical sequence (duplicate ditags) are
654							considered the result of PCR artifacts and only the
655							ditag with the highest "score" (as defined by the current
656							ditag Bio::SAGE::DataProcessing::Filter) is retained.
657
658							B
659
660							I<$sequence>
661
662							The nucleotide sequence of the sequence read to process.
663
664							I<$scores> (optional)
665
666							The Phred scores corresponding to the sequence read
667							supplied to the method. The method expects space-separated
668							Phred scores (ie. "20 24 54 32" etc.).
669
670							B
671
672							TRUE (1) if the method was successful, FALSE (0) otherwise.
673
674							B
675
676							my $sage = Bio::SAGE::DataProcessing->new();
677							my $sequence = "ACGTACGT";
678							my $scores = "20 25 34 12 23 45 51 23";
679							if( $sage->process_read( $sequence, $scores ) ) {
680							print "Successful!\n";
681							}
682
683							=cut
684
685	5			5	1	10	my $this = shift;
686	5					16	my $read_sequence = shift; # \|\| die( $PACKAGE . "::process_read no sequence specified." );
687	5					27	my $read_scores = shift;
688
689	5	50				21	if( !defined( $read_sequence ) ) { return 0; }
	0					0
690	5	50				22	if( $read_sequence eq '' ) { return 0; }
	0					0
691
692	5	50				19	if( $DEBUG >= 1 ) {
693	0					0	print STDERR $PACKAGE . "::process_read\n";
694	0					0	print STDERR " \$read_sequence = $read_sequence\n";
695	0					0	print STDERR " \$read_scores = $read_scores\n";
696							}
697
698	5					17	$this->{'stats'}->{'total_reads'}++;
699	5					23	$this->{'stats'}->{'total_bps'} += length( $read_sequence );
700
701	5					18	$this->__extract_ditags( $read_sequence, $read_scores );
702
703	5					33	return 1;
704
705							}
706
707							#######################################################
708							sub get_ditags {
709							#######################################################
710							=pod
711
712							=head2 get_ditags
713
714							Gets the list of currently extracted and valid ditags
715							stored in this object through calls to process_read.
716
717							B
718
719							None.
720
721							B
722
723							An array of ditag sequences.
724
725							B
726
727							my $sage = Bio::SAGE::DataProcessing->new();
728							my $sequence = "ACGTACGT";
729							my $scores = "20 25 34 12 23 45 51 23";
730							if( $sage->process_read( $sequence, $scores ) ) {
731							my @ditags = $sage->get_ditags();
732							print "Current ditags:\n";
733							map { print " ".$_."\n" } @ditags;
734							}
735
736							=cut
737
738	2			2	1	5	my $this = shift;
739	2	50				12	if( $this->{'keep_duplicates'} == 1 ) {
740	0					0	my @arr;
741	0					0	foreach my $ditag ( keys %{$this->{'ditags'}} ) {
	0					0
742	0					0	for( my $i = 0; $i < scalar( @{$this->{'ditags'}{$ditag}} ); $i++ ) {
	0					0
743	0					0	push( @arr, $ditag );
744							}
745							}
746	0					0	return @arr;
747							}
748	2					5	return keys %{$this->{'ditags'}};
	2					29
749
750							}
751
752							#######################################################
753							sub get_tags {
754							#######################################################
755							=pod
756
757							=head2 get_tags
758
759							Uses the list of currently extracted and valid ditags
760							to generate a list of valid SAGE tags. The default
761							tag filter (or a filter supplied to set_tag_filter) is
762							applied during this procedure.
763
764							B
765
766							None.
767
768							B
769
770							An array of tag sequences.
771
772							B
773
774							my $sage = Bio::SAGE::DataProcessing->new();
775							my $sequence = "ACGTACGT";
776							my $scores = "20 25 34 12 23 45 51 23";
777							if( $sage->process_read( $sequence, $scores ) ) {
778							my @tags = $sage->get_tags();
779							print "Current tags:\n";
780							map { print " ".$_."\n" } @tags;
781							}
782
783							=cut
784
785	2			2	1	4	my $this = shift;
786
787	2					7	my $enzymeLength = length( $this->{'enzyme'} );
788	2					8	my $tagLength = $this->{'protocol'}->{'TAG_LENGTH'};
789	2					5	my @ignoreTags = @{$this->{'protocol'}->{'IGNORE_TAGS'}};
	2					9
790
791	2					4	my @tags;
792
793	2					4	foreach my $ditag ( keys %{$this->{'ditags'}} ) {
	2					43
794
795	122					222	my @scoresList;
796	122	50				288	if( $this->{'keep_duplicates'} == 1 ) {
797	0					0	@scoresList = @{$this->{'ditags'}{$ditag}};
	0					0
798							} else {
799	122					134	$scoresList[0] = ${$this->{'ditags'}}{$ditag};
	122					459
800							}
801
802							#my $scores = ${$this->{'ditags'}}{$ditag};
803	122					213	foreach my $scores ( @scoresList ) {
804
805	122	50				328	if( $scores eq '' ) {
806							# no scores were provided
807							}
808
809	122					303	my $tag1 = substr( $ditag, $enzymeLength, $tagLength );
810	122					158	my $bIgnore = 0;
811	122					188	foreach my $ignoreTag ( @ignoreTags ) {
812	244	50				644	if( $ignoreTag eq $tag1 ) {
813	0					0	$bIgnore = 1;
814	0					0	last;
815							}
816							}
817	122	50				253	if( $bIgnore == 1 ) {
818	0					0	$this->{'stats'}->{'ignored_tags'}++;
819							} else {
820	122	50				467	if( $tag1 !~ /^[ACGT]+$/ ) {
821	0					0	$this->{'stats'}->{'bad_tags'}++;
822							} else {
823	122					171	my $tagScores = '';
824	122	50				349	if( $scores ne '' ) {
825	122					427	$tagScores = substr( $scores, 0, ( $tagLength+$enzymeLength )*3 );
826							}
827	122	100				422	if( $this->{'tag_filter'}->is_valid( $tag1, $tagScores ) ) {
828	120					234	$this->{'stats'}->{'good_tags'}++;
829	120					306	push( @tags, $tag1 );
830							} else {
831	2					9	$this->{'stats'}->{'lowquality_tags'}++;
832							}
833							}
834							}
835
836	122					290	my $tag2 = substr( $ditag, length( $ditag ) - $tagLength - $enzymeLength, $tagLength );
837	122					219	$tag2 = reverse( $tag2 );
838	122					188	$tag2 =~ tr/ACGT/TGCA/;
839	122					169	$bIgnore = 0;
840	122					197	foreach my $ignoreTag ( @ignoreTags ) {
841	244	50				691	if( $ignoreTag eq $tag2 ) {
842	0					0	$bIgnore = 1;
843	0					0	last;
844							}
845							}
846	122	50				266	if( $bIgnore == 1 ) {
847	0					0	$this->{'stats'}->{'ignored_tags'}++;
848							} else {
849	122	50				459	if( $tag2 !~ /^[ACGT]+$/ ) {
850	0					0	$this->{'stats'}->{'bad_tags'}++;
851							} else {
852	122					161	my $tagScores = '';
853	122	50				352	if( $scores ne '' ) {
854	122					523	$tagScores = substr( $scores, (length( $ditag )-$tagLength-$enzymeLength)3, ($tagLength+$enzymeLength)3 );
855	122					1404	$tagScores = join( " ", map { sprintf( "%2i", $_ ) } reverse split( /\s/, $tagScores ) );
	1708					5691
856							}
857	122	50				1034	if( $this->{'tag_filter'}->is_valid( $tag2, $tagScores ) ) {
858	122					240	$this->{'stats'}->{'good_tags'}++;
859	122					664	push( @tags, $tag2 );
860							} else {
861	0					0	$this->{'stats'}->{'lowquality_tags'}++;
862							}
863							}
864							}
865							}
866							}
867
868	2					146	return @tags;
869
870							}
871
872							#######################################################
873							sub get_tagcounts {
874							#######################################################
875							=pod
876
877							=head2 get_tagcounts
878
879							Extracts valid tags from ditags and returns a hashref
880							containing tag sequences paired with their respective
881							counts.
882
883							B
884
885							None.
886
887							B
888
889							A hashref where the tag sequence is paired
890							with its observed number.
891
892							B
893
894							my $sage = Bio::SAGE::DataProcessing->new();
895							my @reads = ( 'ACGTAGACATAGACAAGAGATATAGA',
896							'GATAGACAAAGGAAGATTACAAGATTAT' );
897
898							foreach my $read ( @reads ) {
899							$sage->process_read( $read );
900							}
901
902							# get tag counts
903							my %counts = %{$sage->get_tagcounts()};
904
905							# print tag counts
906							map { print join( "\t", $_, $counts{$_} ) . "\n" } keys %counts;
907
908							=cut
909
910	1			1	1	832	my $this = shift;
911
912	1					2	my %counts;
913
914	1					6	my @tags = $this->get_tags();
915	1					12	map { $counts{$_}++ } @tags;
	121					292
916
917	1					19	return \%counts;
918
919							}
920
921							#######################################################
922							sub get_ditag_base_distribution {
923							#######################################################
924							=pod
925
926							=head2 get_ditag_base_distribution [$minLength], [$maxLength]
927
928							Calculates the distribution of bases at each position and both
929							orientations of a set of ditags. This distribution is used
930							for calculating the 'expected' nucleotide count when determining
931							extra bases using get_extra_base_calculation.
932
933							For example:
934
935							CATGAAACCGTATGTAGAGAGGGACACATG
936							CATGTAGACAGATAGACACAGATACCATG
937
938							has a distribution of:
939
940							+---------------+---------------+
941							\| forward \| reverse \|
942							+-----+---+---+---+---+---+---+---+---+
943							\| pos \| A \| C \| G \| T \| A \| C \| G \| T \|
944							+-----+---+---+---+---+---+---+---+---+
945							\| 0 \| 0 \| 2 \| 0 \| 0 \| 0 \| 2 \| 0 \| 0 \|
946							\| 1 \| 2 \| 0 \| 0 \| 0 \| 2 \| 0 \| 0 \| 0 \|
947							\| 2 \| 0 \| 0 \| 0 \| 2 \| 0 \| 0 \| 0 \| 2 \|
948							\| 3 \| 0 \| 0 \| 2 \| 0 \| 0 \| 0 \| 2 \| 0 \|
949							\| 4 \| 1 \| 0 \| 0 \| 1 \| 0 \| 0 \| 1 \| 1 \|
950							\| 5 \| 2 \| 0 \| 0 \| 0 \| 0 \| 0 \| 1 \| 1 \|
951							\| 6 \| 1 \| 0 \| 1 \| 0 \| 1 \| 0 \| 0 \| 1 \|
952							\| ... \|
953							\| 28 \| 0 \| 0 \| 1 \| 1 \| 0 \| 0 \| 1 \| 1 \|
954							\| 29 \| 0 \| 0 \| 1 \| 0 \| 0 \| 0 \| 1 \| 0 \|
955							+-----+---+---+---+---+---+---+---+---+
956
957							B
958
959							I<$minLength> (optional)
960
961							Ignore ditags that are smaller than this minimum length.
962							If the argument is not supplied, then the minimum
963							ditag length for the currently selected protocol
964							is used.
965
966							I<$maxLength> (optional)
967
968							Ignore ditags that are larger than this maximum length.
969							If the argument is not supplied, then the maximum
970							ditag length for the currently selected protocol
971							is used.
972
973							B
974
975							A hashref where the key is the zero-based base position
976							index, and the value is a hashref where the key is the
977							nucleotide and the value is a hashref where the key is
978							either 'fwd' or 'rev' and the value is the count of that
979							nucleotide (whew!).
980
981							i.e. $HASH{$idx}->{'A'}->{'fwd'} = 23;
982
983							B
984
985							my $sage = Bio::SAGE::DataProcessing->new();
986							my @ditags = ( 'CATGAAACCGTATGTAGAGAGGGACACATG',
987							'CATGTAGACAGATAGACACAGATACCATG' );
988
989							my %DIST = %{$sage->get_ditag_base_distribution()};
990
991							# print distribution table
992							foreach my $idx ( sort { $a <=> $b } keys %DIST ) {
993							print $idx . " ";
994							print join( " ", map { defined( $DIST{$idx}->{$_} ) ? $DIST{$idx}->{$_} : 0 } ( 'A','C','G','T' ) );
995							print "\n";
996							}
997
998							=cut
999
1000	1			1	1	778	my $this = shift;
1001	1		33			11	my $minLength = shift \|\| $this->{'protocol'}->{'MINIMUM_DITAG_LENGTH'};
1002	1		33			12	my $maxLength = shift \|\| $this->{'protocol'}->{'MAXIMUM_DITAG_LENGTH'};
1003
1004	1					2	my %distribution;
1005
1006							# get forward distribution
1007	1					3	foreach my $ditag ( keys %{$this->{'ditags'}} ) {
	1					19
1008	61					540	my @bps = split( //, $ditag );
1009	61	50	33			354	if( $minLength <= length( $ditag ) && length( $ditag ) <= $maxLength ) {
1010	61					147	for( my $i = 0; $i < length( $ditag ); $i++ ) {
1011	1866					6113	$distribution{$i}->{$bps[$i]}->{'fwd'}++;
1012							}
1013							}
1014							}
1015
1016							# get reverse distribution
1017	1					14	foreach my $ditag ( keys %{$this->{'ditags'}} ) {
	1					17
1018	61					112	$ditag = reverse( $ditag );
1019	61					92	$ditag =~ tr/ACGT/TGCA/;
1020	61					498	my @bps = split( //, $ditag );
1021	61	50	33			314	if( $minLength <= length( $ditag ) && length( $ditag ) <= $maxLength ) {
1022	61					127	for( my $i = 0; $i < length( $ditag ); $i++ ) {
1023	1866					5362	$distribution{$i}->{$bps[$i]}->{'fwd'}++;
1024							}
1025							}
1026							}
1027
1028	1					19	return \%distribution;
1029
1030							}
1031
1032							#######################################################
1033							sub get_ditag_length_distribution {
1034							#######################################################
1035							=pod
1036
1037							=head2 get_ditag_length_distribution
1038
1039							Calculates the distribution of ditag lengths for a set
1040							of ditags.
1041
1042							For example:
1043
1044							CATGAAACCGTATGTAGAGAGGGACACATG
1045							CATGTAGACAGATAGACACAGATACCATG
1046							CATGTATCGCGGCATTACGATCTAGAACATG
1047							CATGACGACTATATCGATAGCTAACCATG
1048
1049							has a distribution of:
1050
1051							+-----+---+
1052							\| len \| N \|
1053							+-----+---+
1054							\| 29 \| 2 \|
1055							\| 30 \| 1 \|
1056							\| 31 \| 1 \|
1057							+-----+---+
1058
1059							B
1060
1061							None.
1062
1063							B
1064
1065							A hashref where the key is the ditag length, and
1066							the value is the number of ditags that have this
1067							length.
1068
1069							i.e. $HASH{$len} = 1024;
1070
1071							B
1072
1073							my %DIST = %{$sage->get_ditag_length_distribution()};
1074
1075							# print distribution table
1076							foreach my $idx ( sort { $a <=> $b } keys %DIST ) {
1077							print join( "\t", $idx, $DIST{$idx} ) . "\n";
1078							}
1079
1080							=cut
1081
1082	1			1	1	797	my $this = shift;
1083
1084	1					4	my %dist;
1085	1					3	foreach my $ditag ( keys %{$this->{'ditags'}} ) {
	1					22
1086	61					85	$dist{length($ditag)}++;
1087							}
1088
1089	1					10	return \%dist;
1090
1091							}
1092
1093							#######################################################
1094							sub get_extra_base_calculation {
1095							#######################################################
1096							=pod
1097
1098							=head2 get_extra_base_calculation $tag
1099
1100							Calculates the number of times a given nucleotide is
1101							seen at additional positions of a SAGE tag. This method
1102							is only supported in methods where there the range of
1103							ditag size allows for extra bases to be included in
1104							some (or all) population of ditags.
1105
1106							For example, the ditag sequence:
1107
1108							CATGTATCGCGGCATTACGATCTAGAACATG
1109
1110							becomes:
1111
1112							CATGTATCGCGGCA and CATGTTCTAGATCG
1113
1114							but an additonal TTA sequence is stored in the middle.
1115							Some or all of these extra bases may arise from one or
1116							both of the tags.
1117
1118							B
1119
1120							I<$tag>
1121
1122							The tag sequence that is the focus of the extra base
1123							calculation. Only ditags that have this tag sequence
1124							are considered in the calculation.
1125
1126							The method checks the length of specified tag and
1127							checks whether it begins with the expected anchoring
1128							enzyme site. If the tag appears to be missing just
1129							the anchoring enzyme site, it will append this
1130							automatically. Otherwise, the method will die.
1131
1132							B
1133
1134							A hashref that is several keys deep in the order:
1135							extra base position, ditag length, nucleotide. The
1136							key is the number of times the keyed event occured.
1137
1138							In other words:
1139							$hash->{$position}->{$ditag_length}->{$nucleotide} = $obs;
1140
1141							B
1142
1143							my $dist = $sage->get_extra_base_calculation( "AAACGAATTA" );
1144
1145							# dump results
1146							foreach my $ditag_length ( keys %$dist ) {
1147							foreach my $position ( keys %{$dist->{$ditag_length}} ) {
1148							foreach my $nucleotide ( keys %{$dist->{$ditag_length}->{$position}} ) {
1149							print join( ",", $ditag_length, $position, $nucleotide ) .
1150							"\t" .
1151							$dist->{$ditag_length}->{$position}->{$nucleotide} .
1152							"\n";
1153							}
1154							}
1155							}
1156
1157							=cut
1158
1159	1			1	1	419	my $this = shift;
1160	1		50			6	my $tag = shift \|\| die( $PACKAGE . "::get_extra_base_calculation no tag argument specified." );
1161
1162	1					4	$tag = uc( $tag );
1163
1164	1					3	my $enzyme = $this->{'enzyme'};
1165	1	50	33			20	if( !( $tag =~ /^$enzyme/ && length( $tag ) == $this->{'protocol'}->{'TAG_LENGTH'}+length($enzyme) ) ) {
1166	1	50				10	if( length($tag) == $this->{'protocol'}->{'TAG_LENGTH'} ) {
1167	1					3	$tag = $enzyme . $tag;
1168							} else {
1169	0					0	die( $PACKAGE . "::get_extra_base_calculation tag '$tag' is not valid." );
1170							}
1171							}
1172	1					3	my $revtag = reverse( $tag );
1173	1					4	$revtag =~ tr/ACGT/TGCA/;
1174
1175	1					6	my @ditags = $this->get_ditags();
1176
1177	1					10	my $minDitagLength = 2 * ( length( $enzyme ) + $this->{'protocol'}->{'TAG_LENGTH'} );
1178
1179	1					1	my @data;
1180	1					3	foreach my $ditag ( @ditags ) {
1181
1182	61	50				135	if( $minDitagLength >= length($ditag) ) {
1183	0					0	next; # ignored because 0 extra bases
1184							}
1185
1186	61	100				217	if( $ditag =~ /^$tag/ ) {
1187	1					3	push( @data, $ditag );
1188	1					4	next;
1189							}
1190	60	100				250	if( $ditag =~ /$revtag$/ ) {
1191	1					3	$ditag = reverse( $ditag );
1192	1					3	$ditag =~ tr/ACGT/TGCA/;
1193	1					3	push( @data, $ditag );
1194	1					3	next;
1195							}
1196
1197							}
1198
1199	1					5	my $taglength = $this->{'protocol'}->{'TAG_LENGTH'} + length($enzyme);
1200
1201							# TODO: check unlikely event that same tag is sticking together
1202							# if so, we need to flip the tag in both directions to get extra bases from both
1203							# directions
1204
1205	1					4	my %results;
1206
1207
1208	1					4	foreach my $ditag ( @data ) {
1209	2					44	$ditag =~ /^.{$taglength}(.*?).{$taglength}$/;
1210
1211	2					12	my @extra_bases = split( //, $1 );
1212
1213	2					11	for( my $i = 0; $i < scalar( @extra_bases ); $i++ ) {
1214	5					33	$results{$i}->{length($ditag)}->{$extra_bases[$i]}++;
1215							}
1216							}
1217
1218	1					11	return \%results;
1219
1220							}
1221
1222							#######################################################
1223							sub print_extra_base_calculation {
1224							#######################################################
1225							=pod
1226
1227							=head2 print_extra_base_calculation $resultRef, [$handle]
1228
1229							Prints a formatted report to the specified handle.
1230
1231							An example output looks like:
1232
1233							+------+------+------+------+
1234							\| A \| C \| G \| T \|
1235							+----+---+------+------+------+------+
1236							\| 29 \| 0 \| 183 \| 43 \| 31 \| 68 \|
1237							\| 30 \| 0 \| 2637 \| 23 \| 23 \| 37 \|
1238							\| 31 \| 0 \| 2624 \| 0 \| 14 \| 0 \|
1239							\| 32 \| 0 \| 665 \| 0 \| 1 \| 0 \|
1240							+----+---+------+------+------+------+
1241							\| 30 \| 1 \| 639 \| 784 \| 435 \| 862 \|
1242							\| 31 \| 1 \| 188 \| 1875 \| 198 \| 377 \|
1243							\| 32 \| 1 \| 4 \| 658 \| 0 \| 4 \|
1244							+----+---+------+------+------+------+
1245							\| 31 \| 2 \| 609 \| 588 \| 355 \| 1086 \|
1246							\| 32 \| 2 \| 100 \| 204 \| 106 \| 256 \|
1247							+----+---+------+------+------+------+
1248							\| 32 \| 3 \| 199 \| 95 \| 88 \| 284 \|
1249							+----+---+------+------+------+------+
1250
1251							The first two columns are the ditag size and the extra
1252							base's relative 0-indexed position, respectively.
1253							The remaining columns are the four nucleotides and the
1254							number of ditags that met the condition described in
1255							the first two columns.
1256
1257							In this example, the investigator could strongly reason
1258							that the extra nucleotides are AC.
1259
1260							B
1261
1262							I<$resultRef>
1263
1264							A properly formed hashref containing the results of
1265							an extra base calculation. This data structure can
1266							be obtained with the get_extra_base_calculation method
1267							(see the documentation for that method for more details).
1268
1269							I<$handle> (optional)
1270
1271							A Perl handle to output the results to. If this
1272							argument is not specified, STDOUT is used by default.
1273
1274							B
1275
1276							my $dist = $sage->get_extra_base_calculation( "AAACGAATTA" );
1277
1278							$sage->print_extra_base_calculation( $dist, *STDERR );
1279
1280							=cut
1281
1282	0			0	1	0	my $this = shift;
1283	0		0			0	my $pResults = shift \|\| die( $PACKAGE . "::print_extra_base_calculation result hashref not provided." );
1284	0		0			0	my $handle = shift \|\| *STDOUT;
1285
1286	0					0	print $handle " +------+------+------+------+\n";
1287	0					0	my @bps = ('A','C','G','T');
1288	0					0	print $handle join( " \| ", " ", map { " ".$_." " } @bps ) . " \|\n";
	0					0
1289	0					0	print $handle "+----+---+------+------+------+------+\n";
1290
1291	0					0	foreach my $position ( sort { $a <=> $b } keys %$pResults ) {
	0					0
1292	0					0	foreach my $ditag_length ( sort { $a <=> $b } keys %{$pResults->{$position}} ) {
	0					0
	0					0
1293							#my @bps = keys %{$pResults->{$ditag_length}->{$position}};
1294	0		0			0	print $handle "\| " . join( " \| ", $ditag_length,
1295							$position,
1296							join( " \| ",
1297	0					0	map { sprintf( "%4i", $pResults->{$position}->{$ditag_length}->{$_} \|\| 0 ) } @bps ) )
1298							. " \|\n";
1299							}
1300	0					0	print $handle "+----+---+------+------+------+------+\n";
1301							}
1302
1303							}
1304
1305							sub __extract_ditags {
1306
1307	5			5		9	my $this = shift;
1308	5		50			24	my $read_sequence = shift \|\| die( $PACKAGE . "::__extract_ditags no sequence specified." );
1309	5					17	my $read_scores = shift;
1310
1311	5					28	$read_sequence = uc( $read_sequence );
1312
1313	5	50				21	if( defined( $read_scores ) ) {
1314							# make sure scores are padded to two digits
1315	5					1572	$read_scores = join( " ", map { sprintf( "%02i", $_ ) } split( /\s+/, $read_scores ) );
	4003					13735
1316							}
1317
1318	5	50				1773	if( $DEBUG >= 1 ) {
1319	0					0	print STDERR $PACKAGE . "::__extract_ditags\n";
1320	0					0	print STDERR " \$read_sequence = $read_sequence\n";
1321	0					0	print STDERR " \$read_scores = $read_scores\n";
1322							}
1323
1324	5					20	my $enzyme = $this->{'enzyme'};
1325	5					19	my $minLength = $this->{'protocol'}->{'MINIMUM_DITAG_LENGTH'};
1326	5					16	my $maxLength = $this->{'protocol'}->{'MAXIMUM_DITAG_LENGTH'};
1327
1328							# get position(s) of anchoring enzyme sites
1329	5					7	my $pos = -1;
1330	5					11	my @positions;
1331	5					48	while( ( $pos = index( $read_sequence, $enzyme, $pos ) ) > -1 ) {
1332	106					224	push( @positions, $pos );
1333	106					430	$pos++;
1334							}
1335
1336	5					28	for( my $i = 0; $i < scalar( @positions )-1; $i++ ) {
1337
1338	101					443	my $ditag_sequence = substr( $read_sequence, $positions[$i], $positions[($i+1)]+length( $enzyme )-$positions[$i] );
1339	101					205	$this->{'stats'}->{'total_ditags'}++;
1340	101	100				425	if( $ditag_sequence !~ /^[ACGT]+$/ ) {
1341	3					11	$this->{'stats'}->{'badseq_ditags'}++;
1342	3					22	next;
1343							}
1344	98	100				271	if( length( $ditag_sequence ) < $minLength ) {
1345	4					12	$this->{'stats'}->{'short_ditags'}++;
1346	4					15	next;
1347							}
1348	94	100				459	if( length( $ditag_sequence ) > $maxLength ) {
1349	4					9	$this->{'stats'}->{'long_ditags'}++;
1350	4					18	next;
1351							}
1352	90					122	my $ditag_scores = undef;
1353	90	50				224	if( defined( $read_scores ) ) {
1354	90					613	$ditag_scores = substr( $read_scores, $positions[$i]3, ($positions[($i+1)]+length( $enzyme )-$positions[$i])3 );
1355							}
1356	90	100				319	if( !$this->{'ditag_filter'}->is_valid( $ditag_sequence, $ditag_scores ) ) {
1357	29					57	$this->{'stats'}->{'lowquality_ditags'}++;
1358	29					495	next;
1359							}
1360							# }
1361
1362	61	50	33			80	if( defined( ${$this->{'ditags'}}{$ditag_sequence} ) && $this->{'keep_duplicates'} == 0 ) {
	61					283
1363							# we already have this ditag, which one is better?
1364	0	0				0	if( defined( $read_scores ) ) {
1365	0					0	my $result = $this->{'ditag_filter'}->compare( $ditag_scores, $this->{'ditags'}{$ditag_sequence} );
1366	0	0				0	if( $result <= -1 ) {
1367							# new one is better
1368	0					0	$this->{'ditags'}{$ditag_sequence} = $ditag_scores;
1369							}
1370							}
1371	0					0	$this->{'stats'}->{'duplicate_ditags'}++;
1372	0					0	next;
1373							}
1374
1375	61	50				166	if( $this->{'keep_duplicates'} == 1 ) {
1376	0					0	$this->{'stats'}->{'good_ditags'}++;
1377	0					0	push( @{$this->{'ditags'}{$ditag_sequence}}, $ditag_scores );
	0					0
1378	0					0	next;
1379							}
1380
1381	61					108	$this->{'stats'}->{'good_ditags'}++;
1382	61					586	$this->{'ditags'}{$ditag_sequence} = $ditag_scores;
1383
1384							}
1385
1386							}
1387
1388							sub save {
1389
1390	0			0	0		my $this = shift;
1391	0		0				my $handle = shift \|\| *STDOUT;
1392
1393	0						print $handle '' . "\n";
1394	0						print $handle "\n";
1395
1396	0						print $handle " \n";
1397
1398	0						print $handle " " . $this->{'enzyme'} . "\n";
1399	0						print $handle " " . $this->{'keep_duplicates'} . "\n";
1400
1401
1402	0						print $handle " \n";
1403
1404	0						print $handle "\n";
1405
1406							# $self->{'enzyme'} = $enzyme;
1407							#$self->{'protocol'} = $protocol;
1408							#$self->{'keep_duplicates'} = $bKeepDupes;
1409							# $self->set_protocol( $protocol );
1410
1411							# $self->{'ditag_filter'} = $DEFAULT_DITAG_FILTER;
1412							#$self->{'tag_filter'} = $DEFAULT_TAG_FILTER;
1413
1414
1415
1416							}
1417
1418	0			0	0		sub load {
1419							# make static
1420							}
1421
1422
1423							1;
1424
1425							__END__