| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package FASTX::Reader; |
|
2
|
39
|
|
|
39
|
|
2559181
|
use 5.012; |
|
|
39
|
|
|
|
|
507
|
|
|
3
|
39
|
|
|
39
|
|
210
|
use warnings; |
|
|
39
|
|
|
|
|
74
|
|
|
|
39
|
|
|
|
|
1427
|
|
|
4
|
39
|
|
|
39
|
|
219
|
use Carp qw(confess); |
|
|
39
|
|
|
|
|
75
|
|
|
|
39
|
|
|
|
|
1773
|
|
|
5
|
39
|
|
|
39
|
|
19528
|
use Data::Dumper; |
|
|
39
|
|
|
|
|
204255
|
|
|
|
39
|
|
|
|
|
2141
|
|
|
6
|
39
|
|
|
39
|
|
18002
|
use PerlIO::encoding; |
|
|
39
|
|
|
|
|
438915
|
|
|
|
39
|
|
|
|
|
1488
|
|
|
7
|
|
|
|
|
|
|
$Data::Dumper::Sortkeys = 1; |
|
8
|
39
|
|
|
39
|
|
18215
|
use FASTX::Seq; |
|
|
39
|
|
|
|
|
110
|
|
|
|
39
|
|
|
|
|
1820
|
|
|
9
|
39
|
|
|
39
|
|
258
|
use File::Basename; |
|
|
39
|
|
|
|
|
84
|
|
|
|
39
|
|
|
|
|
3674
|
|
|
10
|
|
|
|
|
|
|
$FASTX::Reader::VERSION = '1.12.1'; |
|
11
|
|
|
|
|
|
|
require Exporter; |
|
12
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
#ABSTRACT: A simple module to parse FASTA and FASTQ files, supporting compressed files and paired-ends. |
|
15
|
|
|
|
|
|
|
|
|
16
|
39
|
|
|
39
|
|
252
|
use constant GZIP_SIGNATURE => pack('C3', 0x1f, 0x8b, 0x08); |
|
|
39
|
|
|
|
|
106
|
|
|
|
39
|
|
|
|
|
115251
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
sub new { |
|
20
|
|
|
|
|
|
|
# Instantiate object |
|
21
|
73
|
|
|
73
|
1
|
28432
|
my $class = shift @_; |
|
22
|
73
|
|
|
|
|
241
|
my $self = bless {} => $class; |
|
23
|
73
|
|
|
|
|
171
|
my $args = {}; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Named parameters: undefined $_[0] will read STDIN! |
|
26
|
73
|
100
|
100
|
|
|
792
|
if (defined $_[0] and substr($_[0], 0, 1) eq '-') { |
|
27
|
9
|
|
|
|
|
34
|
my %data = @_; |
|
28
|
|
|
|
|
|
|
# Try parsing |
|
29
|
9
|
|
|
|
|
35
|
for my $i (keys %data) { |
|
30
|
13
|
100
|
|
|
|
97
|
if ($i =~ /^-(file|filename)/i) { |
|
|
|
100
|
|
|
|
|
|
|
31
|
8
|
|
|
|
|
33
|
$args->{filename} = $data{$i}; |
|
32
|
|
|
|
|
|
|
} elsif ($i =~ /^-(loadseqs)/i) { |
|
33
|
4
|
|
|
|
|
13
|
$args->{loadseqs} = $data{$i}; |
|
34
|
|
|
|
|
|
|
} else { |
|
35
|
1
|
|
|
|
|
250
|
confess "[FASTX::Reader]: Unknown parameter `$i`\n"; |
|
36
|
|
|
|
|
|
|
} |
|
37
|
|
|
|
|
|
|
} |
|
38
|
|
|
|
|
|
|
} else { |
|
39
|
|
|
|
|
|
|
# Legacy instantiation |
|
40
|
64
|
|
|
|
|
221
|
($args) = @_; |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
|
|
45
|
72
|
100
|
|
|
|
265
|
if (defined $args->{loadseqs}) { |
|
46
|
6
|
100
|
66
|
|
|
46
|
if ($args->{loadseqs} eq 'name' or $args->{loadseqs} eq 'names' ) { |
|
|
|
100
|
66
|
|
|
|
|
|
|
|
50
|
66
|
|
|
|
|
|
47
|
4
|
|
|
|
|
9
|
$args->{loadseqs} = 'name'; |
|
48
|
|
|
|
|
|
|
} elsif ($args->{loadseqs} eq 'seq' or $args->{loadseqs} eq 'seqs' or $args->{loadseqs} eq "1") { |
|
49
|
1
|
|
|
|
|
3
|
$args->{loadseqs} = 'seq'; |
|
50
|
|
|
|
|
|
|
} elsif ($args->{loadseqs} eq 'records') { |
|
51
|
1
|
|
|
|
|
2
|
$args->{loadseqs} = 'records'; |
|
52
|
|
|
|
|
|
|
} else { |
|
53
|
0
|
|
|
|
|
0
|
confess("attribute should be 'name' or 'seq' to specify the key of the hash."); |
|
54
|
|
|
|
|
|
|
} |
|
55
|
|
|
|
|
|
|
} |
|
56
|
|
|
|
|
|
|
|
|
57
|
72
|
|
|
|
|
360
|
$self->{filename} = $args->{filename}; |
|
58
|
72
|
|
|
|
|
196
|
$self->{loadseqs} = $args->{loadseqs}; |
|
59
|
72
|
|
|
|
|
216
|
$self->{aux} = [undef]; |
|
60
|
72
|
|
|
|
|
171
|
$self->{compressed} = 0; |
|
61
|
72
|
|
|
|
|
141
|
$self->{fh} = undef; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Check if a filename was provided and not {{STDIN}} |
|
65
|
|
|
|
|
|
|
# uncoverable branch false |
|
66
|
|
|
|
|
|
|
|
|
67
|
72
|
50
|
66
|
|
|
1587
|
if ( defined $self->{filename} and -d $self->{filename} ) { |
|
68
|
0
|
|
|
|
|
0
|
confess __PACKAGE__, " Directory provide where file was expected\n"; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
72
|
100
|
66
|
|
|
631
|
if (defined $self->{filename} and $self->{filename} ne '{{STDIN}}') { |
|
72
|
70
|
100
|
|
|
|
4167
|
open my $initial_fh, '<', $self->{filename} or confess "Unable to read file ", $self->{filename}, "\n"; |
|
73
|
66
|
|
|
|
|
1915
|
read( $initial_fh, my $magic_byte, 4 ); |
|
74
|
66
|
|
|
|
|
1026
|
close $initial_fh; |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# See: __BioX::Seq::Stream__ for GZIP (and other) compressed file reader |
|
77
|
66
|
100
|
|
|
|
1927
|
if (substr($magic_byte,0,3) eq GZIP_SIGNATURE) { |
|
|
|
50
|
|
|
|
|
|
|
78
|
33
|
|
|
|
|
98
|
$self->{compressed} = 1; |
|
79
|
33
|
|
|
|
|
205
|
our $GZIP_BIN = _which('pigz', 'gzip'); |
|
80
|
|
|
|
|
|
|
#close $fh; |
|
81
|
33
|
50
|
|
|
|
11782
|
if (! defined $GZIP_BIN) { |
|
82
|
0
|
|
|
|
|
0
|
$self->{decompressor} = 'IO::Uncompress::Gunzip'; |
|
83
|
0
|
|
|
|
|
0
|
require IO::Uncompress::Gunzip; |
|
84
|
0
|
|
|
|
|
0
|
my $fh = IO::Uncompress::Gunzip->new($self->{filename}, MultiStream => 1); |
|
85
|
0
|
|
|
|
|
0
|
$self->{fh} = $fh; |
|
86
|
|
|
|
|
|
|
} else { |
|
87
|
33
|
|
|
|
|
199
|
$self->{decompressor} = $GZIP_BIN; |
|
88
|
33
|
50
|
|
|
|
84730
|
open my $fh, '-|', "$GZIP_BIN -dc $self->{filename}" or confess "Error opening gzip file ", $self->{filename}, ": $!\n"; |
|
89
|
33
|
|
|
|
|
1686
|
$self->{fh} = $fh; |
|
90
|
|
|
|
|
|
|
} |
|
91
|
|
|
|
|
|
|
} elsif (-B $self->{filename}) { |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# BINARY FILE NOT SUPPORTED? |
|
94
|
|
|
|
|
|
|
#close $fh; |
|
95
|
0
|
|
|
|
|
0
|
$self->{fh} = undef; |
|
96
|
0
|
|
|
|
|
0
|
$self->{status} = 1; |
|
97
|
0
|
|
|
|
|
0
|
$self->{message} = 'Binary file not supported'; |
|
98
|
|
|
|
|
|
|
} else { |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
#close $fh; |
|
101
|
33
|
50
|
|
|
|
1398
|
open (my $fh, '<:encoding(utf8):crlf', $self->{filename}) or confess "Unable to read file ", $self->{filename}, ": ", $!, "\n"; |
|
102
|
33
|
|
|
|
|
2870
|
$self->{fh} = $fh; |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
} else { |
|
107
|
|
|
|
|
|
|
# Filename not provided, use STDIN |
|
108
|
2
|
|
|
|
|
6
|
$self->{fh} = \*STDIN; |
|
109
|
2
|
50
|
|
|
|
10
|
if ($self->{loadseqs}) { |
|
110
|
0
|
|
|
|
|
0
|
confess("Load sequences not supported for STDIN"); |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
} |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
|
|
117
|
68
|
100
|
|
|
|
524
|
if ($self->{loadseqs}) { |
|
118
|
6
|
|
|
|
|
39
|
_load_seqs($self); |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
68
|
|
|
|
|
1545
|
return $self; |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
} |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub records { |
|
127
|
1
|
|
|
1
|
1
|
1209
|
my $self = shift; |
|
128
|
1
|
50
|
|
|
|
5
|
confess "No records loaded with -loadseqs => records!\n" unless $self->{loadseqs} eq 'records'; |
|
129
|
1
|
|
|
|
|
3
|
return $self->{seqs}; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub getRead { |
|
134
|
181
|
|
|
181
|
1
|
17002
|
my $self = shift; |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
#@::::::: ::: |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
|
|
139
|
181
|
50
|
33
|
|
|
628
|
return if (defined $self->{status} and $self->{status} == 0); |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
#my $aux = $self->{aux}; |
|
142
|
181
|
|
|
|
|
298
|
my $sequence_data; |
|
143
|
181
|
50
|
|
|
|
254
|
@{ $self->{aux} } = [undef, 0] if (!(@{ $self->{aux} })); |
|
|
0
|
|
|
|
|
0
|
|
|
|
181
|
|
|
|
|
546
|
|
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
|
146
|
181
|
100
|
|
|
|
504
|
if ($self->{aux}->[1]) { |
|
147
|
15
|
|
|
|
|
38
|
$self->{return_0}++; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
181
|
100
|
|
|
|
493
|
if (!defined($self->{aux}->[0])) { |
|
151
|
117
|
|
|
|
|
12663
|
while ($self->{line} = readline($self->{fh})) { |
|
152
|
|
|
|
|
|
|
|
|
153
|
101
|
|
|
|
|
717
|
chomp($self->{line}); |
|
154
|
101
|
100
|
100
|
|
|
908
|
if (substr($self->{line}, 0, 1) eq '>' || substr($self->{line}, 0, 1) eq '@') { |
|
155
|
95
|
|
|
|
|
318
|
$self->{aux}->[0] = $self->{line}; |
|
156
|
95
|
|
|
|
|
232
|
last; |
|
157
|
|
|
|
|
|
|
} |
|
158
|
|
|
|
|
|
|
} |
|
159
|
117
|
100
|
|
|
|
375
|
if (!defined($self->{aux}->[0])) { |
|
160
|
22
|
|
|
|
|
122
|
$self->{aux}->[1] = 1; |
|
161
|
22
|
|
|
|
|
47
|
$self->{return_1}++; |
|
162
|
22
|
|
|
|
|
75
|
return; |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Comments can have more spaces: |
|
168
|
159
|
50
|
|
|
|
384
|
return unless defined $self->{line}; |
|
169
|
|
|
|
|
|
|
my ($name, $comm) = $self->{line}=~/^.(\S+)(?:\s+)(.+)/ ? ($1, $2) : |
|
170
|
159
|
50
|
|
|
|
2223
|
$self->{line}=~/^.(\S+)/ ? ($1, '') : ('?', ''); |
|
|
|
100
|
|
|
|
|
|
|
171
|
159
|
|
|
|
|
383
|
my $seq = ''; |
|
172
|
159
|
|
|
|
|
246
|
my $c; |
|
173
|
159
|
|
|
|
|
307
|
$self->{aux}->[0] = undef; |
|
174
|
159
|
|
|
|
|
742
|
while ($self->{line} = readline($self->{fh})) { |
|
175
|
|
|
|
|
|
|
# PARSE SEQx |
|
176
|
361
|
|
|
|
|
666
|
chomp($self->{line}); |
|
177
|
361
|
|
|
|
|
745
|
$c = substr($self->{line}, 0, 1); |
|
178
|
361
|
100
|
66
|
|
|
1816
|
last if ($c eq '>' || $c eq '@' || $c eq '+'); |
|
|
|
|
100
|
|
|
|
|
|
179
|
220
|
|
|
|
|
1067
|
$seq .= $self->{line}; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
159
|
|
|
|
|
360
|
$self->{aux}->[0] = $self->{line}; |
|
182
|
159
|
100
|
|
|
|
392
|
$self->{aux}->[1] = 1 if (!defined($self->{aux}->[0])); |
|
183
|
159
|
|
|
|
|
561
|
$sequence_data->{name} = $name; |
|
184
|
159
|
|
|
|
|
459
|
$sequence_data->{comment} = $comm; |
|
185
|
159
|
|
|
|
|
482
|
$sequence_data->{seq} = $seq; |
|
186
|
159
|
|
|
|
|
504
|
$self->{counter}++; |
|
187
|
|
|
|
|
|
|
# Return FASTA |
|
188
|
159
|
100
|
|
|
|
439
|
if ($c ne '+') { |
|
189
|
94
|
|
|
|
|
213
|
$self->{return_fasta1}++; |
|
190
|
94
|
|
|
|
|
285
|
return $sequence_data; |
|
191
|
|
|
|
|
|
|
} |
|
192
|
65
|
|
|
|
|
162
|
my $qual = ''; |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
|
195
|
65
|
|
|
|
|
283
|
while ($self->{line} = readline($self->{fh})) { |
|
196
|
|
|
|
|
|
|
# PARSE QUALITY |
|
197
|
65
|
|
|
|
|
137
|
chomp($self->{line}); |
|
198
|
65
|
|
|
|
|
168
|
$qual .= $self->{line}; |
|
199
|
65
|
50
|
|
|
|
250
|
if (length($qual) >= length($seq)) { |
|
200
|
65
|
|
|
|
|
130
|
$self->{aux}->[0] = undef; |
|
201
|
65
|
|
|
|
|
151
|
$sequence_data->{name} = $name; |
|
202
|
65
|
|
|
|
|
257
|
$sequence_data->{seq} = $seq; |
|
203
|
65
|
|
|
|
|
235
|
$sequence_data->{comment} = $comm; |
|
204
|
65
|
|
|
|
|
217
|
$sequence_data->{qual} = $qual; |
|
205
|
|
|
|
|
|
|
# return FASTQ |
|
206
|
65
|
|
|
|
|
204
|
$self->{return_fastq}++; |
|
207
|
65
|
|
|
|
|
281
|
return $sequence_data; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
# PROCH |
|
211
|
0
|
|
|
|
|
0
|
close $self->{fh}; |
|
212
|
|
|
|
|
|
|
|
|
213
|
0
|
|
|
|
|
0
|
$self->{aux}->[1] = 1; |
|
214
|
0
|
|
|
|
|
0
|
$sequence_data->{name} = $name; |
|
215
|
0
|
|
|
|
|
0
|
$sequence_data->{seq} = $seq; |
|
216
|
0
|
|
|
|
|
0
|
$sequence_data->{comment} = $comm; |
|
217
|
0
|
|
|
|
|
0
|
$self->{counter}++; |
|
218
|
|
|
|
|
|
|
# return FASTA |
|
219
|
0
|
|
|
|
|
0
|
$self->{return_fasta2}++; |
|
220
|
0
|
|
|
|
|
0
|
return $sequence_data; |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
} |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
sub next { |
|
226
|
47
|
|
|
47
|
1
|
11404
|
my $self = shift; |
|
227
|
47
|
|
|
|
|
181
|
my $scalar_read = $self->getRead(); |
|
228
|
47
|
100
|
|
|
|
121
|
return unless defined $scalar_read; |
|
229
|
|
|
|
|
|
|
return FASTX::Seq->new( $scalar_read->{seq} // '', |
|
230
|
|
|
|
|
|
|
$scalar_read->{name} // undef, |
|
231
|
|
|
|
|
|
|
$scalar_read->{comment} // undef, |
|
232
|
42
|
|
50
|
|
|
762
|
$scalar_read->{qual} // undef); |
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub getFastqRead { |
|
238
|
7
|
|
|
7
|
1
|
2782
|
my $self = shift; |
|
239
|
7
|
|
|
|
|
18
|
my $seq_object = undef; |
|
240
|
|
|
|
|
|
|
|
|
241
|
7
|
50
|
66
|
|
|
44
|
return if (defined $self->{status} and $self->{status} == 0); |
|
242
|
|
|
|
|
|
|
|
|
243
|
7
|
|
|
|
|
21
|
$self->{status} = 1; |
|
244
|
7
|
|
|
|
|
734
|
my $header = readline($self->{fh}); |
|
245
|
7
|
|
|
|
|
73
|
my $seq = readline($self->{fh}); |
|
246
|
7
|
|
|
|
|
23
|
my $check = readline($self->{fh}); |
|
247
|
7
|
|
|
|
|
30
|
my $qual = readline($self->{fh}); |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Check 4 lines were found (FASTQ) |
|
251
|
|
|
|
|
|
|
|
|
252
|
7
|
100
|
|
|
|
26
|
unless (defined $qual) { |
|
253
|
1
|
50
|
|
|
|
5
|
if (defined $header) { |
|
254
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: FASTQ truncated at " . $header . "?"; |
|
255
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
|
256
|
|
|
|
|
|
|
} |
|
257
|
1
|
|
|
|
|
4
|
return; |
|
258
|
|
|
|
|
|
|
} |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# Fast format control: header and separator |
|
261
|
6
|
100
|
66
|
|
|
85
|
if ( (substr($header, 0, 1) eq '@') and (substr($check, 0, 1) eq '+') ) { |
|
262
|
5
|
|
|
|
|
26
|
chomp($header); |
|
263
|
5
|
|
|
|
|
22
|
chomp($seq); |
|
264
|
5
|
|
|
|
|
9
|
chomp($qual); |
|
265
|
|
|
|
|
|
|
# Also control sequence integrity |
|
266
|
5
|
100
|
66
|
|
|
79
|
if ($seq=~/^[ACGTNacgtn]+$/ and length($seq) == length($qual) ) { |
|
267
|
4
|
|
|
|
|
31
|
my ($name, $comments) = split /\s+/, substr($header, 1); |
|
268
|
4
|
|
|
|
|
17
|
$seq_object->{name} = $name; |
|
269
|
4
|
|
|
|
|
9
|
$seq_object->{comments} = $comments; |
|
270
|
4
|
|
|
|
|
14
|
$seq_object->{seq} = $seq; |
|
271
|
4
|
|
|
|
|
17
|
$seq_object->{qual} = $qual; |
|
272
|
4
|
|
|
|
|
17
|
$self->{counter}++; |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
} else { |
|
275
|
|
|
|
|
|
|
# Return error (corrupted FASTQ) |
|
276
|
1
|
|
|
|
|
3
|
$self->{message} = "Unknown format: expecting FASTQ (corrupted?)"; |
|
277
|
1
|
|
|
|
|
2
|
$self->{status} = 0; |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
} |
|
280
|
|
|
|
|
|
|
} else { |
|
281
|
|
|
|
|
|
|
# Return error (not a FASTQ) |
|
282
|
1
|
|
|
|
|
2
|
$self->{message} = "Unknown format: expecting FASTQ but @ header not found"; |
|
283
|
|
|
|
|
|
|
|
|
284
|
1
|
50
|
|
|
|
4
|
if (substr($header, 0,1 ) eq '>' ) { |
|
285
|
|
|
|
|
|
|
# HINT: is FASTA? |
|
286
|
1
|
|
|
|
|
3
|
$self->{message} .= " (might be FASTA instead)"; |
|
287
|
|
|
|
|
|
|
} |
|
288
|
1
|
|
|
|
|
3
|
$self->{status} = 0; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
6
|
|
|
|
|
26
|
return $seq_object; |
|
292
|
|
|
|
|
|
|
} |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
sub getIlluminaRead { |
|
296
|
16
|
|
|
16
|
1
|
18385
|
my $self = shift; |
|
297
|
16
|
|
|
|
|
32
|
my $seq_object = undef; |
|
298
|
|
|
|
|
|
|
|
|
299
|
16
|
50
|
66
|
|
|
84
|
return if (defined $self->{status} and $self->{status} == 0); |
|
300
|
|
|
|
|
|
|
|
|
301
|
16
|
|
|
|
|
34
|
$self->{status} = 1; |
|
302
|
16
|
|
|
|
|
886
|
my $header = readline($self->{fh}); |
|
303
|
16
|
|
|
|
|
93
|
my $seq = readline($self->{fh}); |
|
304
|
16
|
|
|
|
|
49
|
my $check = readline($self->{fh}); |
|
305
|
16
|
|
|
|
|
52
|
my $qual = readline($self->{fh}); |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Check 4 lines were found (FASTQ) |
|
309
|
|
|
|
|
|
|
|
|
310
|
16
|
100
|
|
|
|
55
|
unless (defined $qual) { |
|
311
|
2
|
50
|
|
|
|
9
|
if (defined $header) { |
|
312
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: FASTQ truncated at " . $header . "?"; |
|
313
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
|
314
|
|
|
|
|
|
|
} |
|
315
|
2
|
|
|
|
|
9
|
return; |
|
316
|
|
|
|
|
|
|
} |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# Fast format control: header and separator |
|
319
|
14
|
50
|
33
|
|
|
121
|
if ( (substr($header, 0, 1) eq '@') and (substr($check, 0, 1) eq '+') ) { |
|
320
|
14
|
|
|
|
|
33
|
chomp($header); |
|
321
|
14
|
|
|
|
|
22
|
chomp($seq); |
|
322
|
14
|
|
|
|
|
21
|
chomp($qual); |
|
323
|
|
|
|
|
|
|
# Also control sequence integrity |
|
324
|
14
|
50
|
33
|
|
|
139
|
if ($seq=~/^[ACGTNacgtn]+$/ and length($seq) == length($qual) ) { |
|
325
|
14
|
|
|
|
|
91
|
my ($name, $comments) = split /\s+/, substr($header, 1); |
|
326
|
|
|
|
|
|
|
#@::::::: ::: |
|
327
|
14
|
|
|
|
|
70
|
my ($instrument, $run, $flowcell, $lane, $tile, $x, $y, $umi) = split /:/, $name; |
|
328
|
14
|
|
|
|
|
32
|
my ($read, $filtered, $ctrl, $index1, $index2); |
|
329
|
|
|
|
|
|
|
|
|
330
|
14
|
50
|
|
|
|
31
|
if (not defined $y) { |
|
331
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: not Illumina naming: ::::::"; |
|
332
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
14
|
|
|
|
|
47
|
$seq_object->{name} = $name; |
|
336
|
14
|
|
|
|
|
51
|
$seq_object->{comments} = $comments; |
|
337
|
14
|
|
|
|
|
35
|
$seq_object->{seq} = $seq; |
|
338
|
14
|
|
|
|
|
32
|
$seq_object->{qual} = $qual; |
|
339
|
|
|
|
|
|
|
|
|
340
|
14
|
|
|
|
|
25
|
$seq_object->{instrument} = $instrument; |
|
341
|
14
|
|
|
|
|
32
|
$seq_object->{run} = $run; |
|
342
|
14
|
|
|
|
|
21
|
$seq_object->{flowcell} = $flowcell; |
|
343
|
14
|
|
|
|
|
32
|
$seq_object->{lane} = $lane; |
|
344
|
14
|
|
|
|
|
26
|
$seq_object->{tile} = $tile; |
|
345
|
14
|
|
|
|
|
36
|
$seq_object->{x} = $x; |
|
346
|
14
|
|
|
|
|
35
|
$seq_object->{y} = $y; |
|
347
|
14
|
|
|
|
|
25
|
$seq_object->{umi} = $umi; |
|
348
|
14
|
|
|
|
|
28
|
$seq_object->{instrument} = $instrument; |
|
349
|
14
|
|
|
|
|
17
|
$seq_object->{run} = $run; |
|
350
|
14
|
|
|
|
|
25
|
$seq_object->{flowcell} = $flowcell; |
|
351
|
|
|
|
|
|
|
|
|
352
|
14
|
100
|
|
|
|
30
|
if (defined $comments) { |
|
353
|
7
|
|
|
|
|
40
|
($read, $filtered, $ctrl, $index1, $index2) = split /[:+]/, $comments; |
|
354
|
7
|
50
|
|
|
|
21
|
if ( defined $ctrl ) { |
|
355
|
7
|
|
|
|
|
19
|
$seq_object->{read} = $read; |
|
356
|
7
|
50
|
|
|
|
19
|
$filtered eq 'N' ? $seq_object->{filtered} = 0 : $seq_object->{filtered} = 1; |
|
357
|
7
|
|
|
|
|
14
|
$seq_object->{control} = $ctrl; |
|
358
|
7
|
50
|
|
|
|
9
|
if ($read eq '1') { |
|
359
|
7
|
|
|
|
|
22
|
$seq_object->{index} = $index1; |
|
360
|
7
|
|
|
|
|
17
|
$seq_object->{paired_index} = $index2; |
|
361
|
|
|
|
|
|
|
} else { |
|
362
|
0
|
|
|
|
|
0
|
$seq_object->{paired_index} = $index1; |
|
363
|
0
|
|
|
|
|
0
|
$seq_object->{index} = $index2; |
|
364
|
|
|
|
|
|
|
} |
|
365
|
|
|
|
|
|
|
} |
|
366
|
|
|
|
|
|
|
} |
|
367
|
14
|
|
|
|
|
36
|
$self->{counter}++; |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
} else { |
|
370
|
|
|
|
|
|
|
# Return error (corrupted FASTQ) |
|
371
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: expecting FASTQ (corrupted?)"; |
|
372
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
} else { |
|
376
|
|
|
|
|
|
|
# Return error (not a FASTQ) |
|
377
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: expecting FASTQ but @ header not found"; |
|
378
|
|
|
|
|
|
|
|
|
379
|
0
|
0
|
|
|
|
0
|
if (substr($header, 0,1 ) eq '>' ) { |
|
380
|
|
|
|
|
|
|
# HINT: is FASTA? |
|
381
|
0
|
|
|
|
|
0
|
$self->{message} .= " (might be FASTA instead)"; |
|
382
|
|
|
|
|
|
|
} |
|
383
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
|
384
|
|
|
|
|
|
|
} |
|
385
|
|
|
|
|
|
|
|
|
386
|
14
|
|
|
|
|
45
|
return $seq_object; |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
sub getFileFormat { |
|
390
|
4
|
|
|
4
|
1
|
1313
|
my $self = shift; |
|
391
|
4
|
|
|
|
|
8
|
my ($filename) = shift; |
|
392
|
4
|
50
|
|
|
|
11
|
return 0 if (not defined $filename); |
|
393
|
|
|
|
|
|
|
|
|
394
|
4
|
50
|
|
|
|
171
|
open my $fh, '<', $filename or confess "Unable to read file ", $filename, "\n"; |
|
395
|
4
|
|
|
|
|
109
|
read( $fh, my $magic_byte, 4 ); |
|
396
|
4
|
|
|
|
|
88
|
close $fh; |
|
397
|
|
|
|
|
|
|
|
|
398
|
4
|
50
|
|
|
|
26
|
if (substr($magic_byte,0,3) eq GZIP_SIGNATURE) { |
|
399
|
|
|
|
|
|
|
# GZIPPED FILE |
|
400
|
0
|
0
|
|
|
|
0
|
if (! defined $self->{GZIP_BIN}) { |
|
401
|
0
|
|
|
|
|
0
|
require IO::Uncompress::Gunzip; |
|
402
|
0
|
|
|
|
|
0
|
$fh = IO::Uncompress::Gunzip->new($filename, MultiStream => 1); |
|
403
|
|
|
|
|
|
|
} else { |
|
404
|
0
|
0
|
|
|
|
0
|
open $fh, '-|', "$self->{GZIP_BIN} -dc $filename" or confess "Error opening gzip file ", $filename, ": $!\n"; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
|
|
|
|
|
|
} else { |
|
407
|
|
|
|
|
|
|
# NOT COMPRESSED |
|
408
|
4
|
|
33
|
|
|
144
|
open $fh, '<:encoding(utf8)', "$filename" || confess "Unable to read $filename\n$!\n"; |
|
409
|
|
|
|
|
|
|
} |
|
410
|
4
|
|
|
|
|
312
|
my $first = readline($fh); |
|
411
|
4
|
100
|
|
|
|
71
|
if (substr($first, 0,1) eq '>') { |
|
|
|
100
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
#should be FASTA |
|
413
|
1
|
|
|
|
|
19
|
return 'fasta'; |
|
414
|
|
|
|
|
|
|
} elsif (substr($first, 0, 1) eq '@') { |
|
415
|
|
|
|
|
|
|
#should be fastq |
|
416
|
2
|
|
|
|
|
6
|
readline($fh); |
|
417
|
2
|
|
|
|
|
5
|
my $sep = readline($fh); |
|
418
|
2
|
50
|
|
|
|
8
|
if ( substr($sep, 0, 1) eq '+' ) { |
|
419
|
|
|
|
|
|
|
#second check for fastq |
|
420
|
2
|
|
|
|
|
33
|
return 'fastq'; |
|
421
|
|
|
|
|
|
|
} |
|
422
|
|
|
|
|
|
|
} else { |
|
423
|
|
|
|
|
|
|
#unknown format |
|
424
|
1
|
|
|
|
|
17
|
return; |
|
425
|
|
|
|
|
|
|
} |
|
426
|
|
|
|
|
|
|
} |
|
427
|
|
|
|
|
|
|
sub _load_seqs { |
|
428
|
6
|
|
|
6
|
|
26
|
my ($self) = @_; |
|
429
|
6
|
50
|
|
|
|
29
|
return 0 unless (defined $self->{loadseqs}); |
|
430
|
|
|
|
|
|
|
|
|
431
|
6
|
|
|
|
|
14
|
my $seqs = undef; |
|
432
|
6
|
|
|
|
|
23
|
while (my $s = $self->getRead() ) { |
|
433
|
22
|
|
|
|
|
52
|
my ($name, $seq) = ($s->{name}, $s->{seq}); |
|
434
|
22
|
100
|
|
|
|
58
|
if ($self->{loadseqs} eq 'name') { |
|
|
|
100
|
|
|
|
|
|
|
435
|
12
|
|
|
|
|
55
|
$seqs->{$name} = $seq; |
|
436
|
|
|
|
|
|
|
} elsif ($self->{loadseqs} eq 'seq') { |
|
437
|
3
|
|
|
|
|
14
|
$seqs->{$seq} = $name; |
|
438
|
|
|
|
|
|
|
} else { |
|
439
|
|
|
|
|
|
|
my $r = FASTX::Seq->new( |
|
440
|
|
|
|
|
|
|
-name => $name, |
|
441
|
|
|
|
|
|
|
-seq => $seq, |
|
442
|
|
|
|
|
|
|
-comment => $s->{comments}, |
|
443
|
|
|
|
|
|
|
-qual => $s->{qual}, |
|
444
|
7
|
|
|
|
|
81
|
); |
|
445
|
7
|
|
|
|
|
18
|
push(@{$seqs}, $r); |
|
|
7
|
|
|
|
|
26
|
|
|
446
|
|
|
|
|
|
|
} |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
} |
|
449
|
6
|
|
|
|
|
20
|
$self->{seqs} = $seqs; |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
sub _which { |
|
454
|
33
|
50
|
|
33
|
|
274
|
return if ($^O eq 'MSWin32'); |
|
455
|
33
|
|
|
|
|
87
|
my $has_which = eval { require File::Which; File::Which->import(); 1}; |
|
|
33
|
|
|
|
|
7910
|
|
|
|
33
|
|
|
|
|
19162
|
|
|
|
33
|
|
|
|
|
139
|
|
|
456
|
33
|
50
|
|
|
|
93
|
if ($has_which) { |
|
457
|
33
|
|
|
|
|
204
|
foreach my $cmd (@_) { |
|
458
|
66
|
100
|
|
|
|
8210
|
return which($cmd) if (which($cmd)); |
|
459
|
|
|
|
|
|
|
} |
|
460
|
|
|
|
|
|
|
} else { |
|
461
|
0
|
|
|
|
|
|
foreach my $cmd (@_) { |
|
462
|
0
|
|
|
|
|
|
my $exit; |
|
463
|
0
|
|
|
|
|
|
eval { |
|
464
|
0
|
|
|
|
|
|
`which $cmd 2> /dev/null`; |
|
465
|
0
|
|
|
|
|
|
$exit = $?; |
|
466
|
|
|
|
|
|
|
}; |
|
467
|
0
|
0
|
0
|
|
|
|
return $cmd if ($exit == 0 and not $@); |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
} |
|
470
|
0
|
|
|
|
|
|
return; |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
1; |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
__END__ |