line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Tradis::FilterTags; |
2
|
|
|
|
|
|
|
$Bio::Tradis::FilterTags::VERSION = '1.3.2'; |
3
|
|
|
|
|
|
|
# ABSTRACT: Filter tags in a fastq file |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
3
|
|
|
3
|
|
99652
|
use Moose; |
|
3
|
|
|
|
|
389691
|
|
|
3
|
|
|
|
|
19
|
|
7
|
3
|
|
|
3
|
|
18929
|
use Bio::Tradis::Parser::Fastq; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
1754
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Path to the input fastq file. A name ending in ".gz" is treated as
# gzip-compressed (see _is_gz).
has 'fastqfile' => ( is => 'rw', isa => 'Str', required => 1 );

# Path of the plain-text fastq that is actually parsed. Built on first use by
# _build__unzipped_fastq.
has '_unzipped_fastq' =>
  ( is => 'rw', isa => 'Str', lazy => 1, builder => '_build__unzipped_fastq' );

# Tag expected at the start of each read.
has 'tag' => ( is => 'rw', isa => 'Str', required => 1 );

# Maximum number of mismatching positions tolerated when matching the tag.
# Defaults to 0 (exact match), as documented in the POD; without the default
# the attribute was undef and numeric comparisons against it warned.
has 'mismatch' => ( is => 'rw', isa => 'Int', required => 0, default => 0 );

# Output file name. When not supplied it is derived from fastqfile by
# replacing the first ".fastq" with ".tag.fastq".
has 'outfile' => (
    is       => 'rw',
    isa      => 'Str',
    required => 0,

    # The default reads another attribute, so it must be lazy: Moose does
    # not guarantee that fastqfile is already set when a non-lazy default
    # runs during construction.
    lazy    => 1,
    default => sub {
        my ($self) = @_;
        my $o = $self->fastqfile;
        $o =~ s/\.fastq/\.tag\.fastq/;
        return $o;
    }
);

# Array reference holding the read currently being examined by filter_tags
# (index 0 = id, 1 = sequence, 2 = quality string). Set via _set_currentread.
has '_currentread' => (
    is       => 'rw',
    isa      => 'ArrayRef',
    required => 0,
    writer   => '_set_currentread'
);
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Report whether the input file name marks it as gzip-compressed.
#
# Returns 1 when fastqfile ends in ".gz" and 0 otherwise. Only the file name
# is inspected; the file contents are never read.
sub _is_gz {
    my ($self) = @_;

    # Anchor at the end of the name so that ".gz" appearing elsewhere in the
    # path (for example a directory called "run.gz_data") is not mistaken
    # for a compressed input.
    return $self->fastqfile =~ /\.gz\z/ ? 1 : 0;
}
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# Builder for the _unzipped_fastq attribute.
#
# For a plain input the fastqfile path is returned unchanged. For a gzipped
# input the file is decompressed into the current working directory, under
# the input's base name with the ".gz" removed, and that name is returned.
# An already existing file of that name is reused as-is.
#
# Dies if the target name cannot be derived, if gunzip cannot be started or
# fails, or if the decompressed file cannot be written.
sub _build__unzipped_fastq {
    my ($self) = @_;
    my $fq = $self->fastqfile;

    return $fq unless $self->_is_gz;

    # Take the last path component; check the match instead of trusting $1.
    my ($newfq) = $fq =~ m{([^/]+)$};
    die "Cannot derive an unzipped file name from '$fq'\n"
      unless defined $newfq;

    # Prefer stripping a trailing ".gz"; fall back to the first ".gz" so
    # unusual names keep resolving the way they always did.
    $newfq =~ s/\.gz\z// or $newfq =~ s/\.gz//;

    if ( !-e $newfq ) {

        # List-form pipe open: the file name is passed to gunzip directly
        # and never interpreted by a shell.
        open( my $in, '-|', 'gunzip', '-c', $fq )
          or die "Cannot run gunzip on '$fq': $!\n";
        open( my $out, '>', $newfq )
          or die "Cannot open '$newfq' for writing: $!\n";
        binmode $in;
        binmode $out;

        my $buffer;
        while ( read( $in, $buffer, 65536 ) ) {
            print {$out} $buffer
              or die "Cannot write to '$newfq': $!\n";
        }
        close $out or die "Cannot close '$newfq': $!\n";

        # close on a pipe reports the child's exit status; do not leave a
        # partial file behind that a later run would silently reuse.
        if ( !close $in ) {
            unlink $newfq;
            die "gunzip failed on '$fq' (exit status " . ( $? >> 8 ) . ")\n";
        }
    }
    return $newfq;
}
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Write every read that starts with the tag to outfile, in fastq format.
#
# With mismatch 0 (or unset) a read is kept when its sequence matches the
# upper-cased tag at the start; the tag is interpolated as a regex pattern.
# Otherwise a read is kept when _tag_mismatch reports at most `mismatch`
# differing positions. When the input was gzipped, the temporary unzipped
# copy is removed afterwards.
#
# Returns 1 on completion. Dies if outfile would overwrite the input, or if
# outfile cannot be opened or closed.
sub filter_tags {
    my ($self)  = @_;
    my $tag     = uc( $self->tag );
    my $outfile = $self->outfile;

    # An unset mismatch means exact matching; avoid comparing undef.
    my $max_mismatch = $self->mismatch;
    $max_mismatch = 0 unless defined $max_mismatch;

    # Opening outfile for writing truncates it, so refuse to clobber the
    # file we are about to read (happens when the default outfile name could
    # not be derived because the input name contains no ".fastq").
    die "Output file '$outfile' is the same as the input file\n"
      if $outfile eq $self->fastqfile;

    #set up fastq parser
    my $filename = $self->_unzipped_fastq;
    die "Output file '$outfile' is the same as the unzipped input file\n"
      if $outfile eq $filename;
    my $pars = Bio::Tradis::Parser::Fastq->new( file => $filename );

    open( my $out_fh, '>', $outfile )
      or die "Cannot open '$outfile' for writing: $!\n";

    # The pattern does not change between reads, so compile it once.
    my $tag_re = qr/^$tag/;

    while ( $pars->next_read ) {
        my @read = $pars->read_info;
        $self->_set_currentread( \@read );
        my ( $id, $seq_string, $qual_string ) = @read[ 0 .. 2 ];

        my $print_out = 0;
        if ( $max_mismatch == 0 ) {
            $print_out = 1 if $seq_string =~ $tag_re;
        }
        else {
            my $mm = $self->_tag_mismatch($seq_string);
            $print_out = 1 if $mm <= $max_mismatch;
        }
        next unless $print_out;

        print {$out_fh} "\@$id\n";
        print {$out_fh} $seq_string . "\n+\n";
        print {$out_fh} $qual_string . "\n";
    }

    if ( $self->_is_gz ) {
        unlink( $self->_unzipped_fastq );
    }

    # Buffered write errors only surface at close, so check it.
    close $out_fh or die "Cannot close '$outfile': $!\n";
    return 1;
}
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Count the positions at which the start of a read differs from the tag.
#
# Parameters:
#   $seq_string - (optional) read sequence to test; when omitted, the
#                 sequence of the current read (_currentread index 1) is used.
#
# The tag is upper-cased first, matching what filter_tags does for exact
# matches. Tag positions that lie beyond the end of a short read count as
# mismatches.
#
# Returns the number of mismatching positions (0 .. length of tag).
sub _tag_mismatch {
    my ( $self, $seq_string ) = @_;

    # filter_tags passes the sequence explicitly; keep the stored read as a
    # fallback for callers that do not.
    $seq_string = ${ $self->_currentread }[1] unless defined $seq_string;

    my $tag     = uc( $self->tag );
    my $tag_len = length($tag);
    my $prefix  = substr( $seq_string, 0, $tag_len );
    my $have    = length($prefix);

    my $mismatches = 0;
    foreach my $i ( 0 .. ( $tag_len - 1 ) ) {

        # Check the bound first so a short read never yields an undefined
        # comparison.
        if ( $i >= $have || substr( $tag, $i, 1 ) ne substr( $prefix, $i, 1 ) )
        {
            $mismatches++;
        }
    }
    return $mismatches;
}
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
123
|
3
|
|
|
3
|
|
23
|
no Moose; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
17
|
|
124
|
|
|
|
|
|
|
1; |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
__END__ |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=pod |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=encoding UTF-8 |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head1 NAME |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
Bio::Tradis::FilterTags - Filter tags in a fastq file |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head1 VERSION |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
version 1.3.2 |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=head1 SYNOPSIS |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Reads in a fastq file with tradis tags already attached to the start of the sequence |
143
|
|
|
|
|
|
|
Keeps only the reads whose sequence starts with the provided tag |
144
|
|
|
|
|
|
|
Outputs a file *.tag.fastq unless an alternative outfile name is specified |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
use Bio::Tradis::FilterTags; |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
my $pipeline = Bio::Tradis::FilterTags->new(fastqfile => 'abc', tag => 'abc'); |
149
|
|
|
|
|
|
|
$pipeline->filter_tags(); |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head1 PARAMETERS |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=head2 Required |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=over |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=item * C<fastqfile> - path to/name of file to filter. This may be a gzipped fastq file, in which case a temporary unzipped version is used and removed on completion. |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=item * C<tag> - TraDIS tag to match |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=back |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head2 Optional |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=over |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=item * C<mismatch> - number of mismatches to allow when matching the tag. Default = 0 |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item * C<outfile> - output file name. Defaults to C<file.tag.fastq> for an input file named C<file.fastq> |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=back |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=head1 METHODS |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
C<filter_tags> - outputs all reads containing the provided tag to C<outfile> |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head1 AUTHOR |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Carla Cummins <path-help@sanger.ac.uk> |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
This is free software, licensed under: |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=cut |