File Coverage

blib/lib/FASTX/Abi.pm
Criterion Covered Total %
statement 169 175 96.5
branch 54 66 81.8
condition 8 15 53.3
subroutine 20 20 100.0
pod 5 5 100.0
total 256 281 91.1


line stmt bran cond sub pod time code
1             package FASTX::Abi;
2 12     12   8633 use 5.016;
  12         39  
3 12     12   55 use warnings;
  12         18  
  12         339  
4 12     12   55 use Carp qw(confess);
  12         19  
  12         631  
5 12     12   8961 use Bio::Trace::ABIF;
  12         174032  
  12         735  
6 12     12   2240 use Data::Dumper;
  12         21464  
  12         716  
7 12     12   77 use File::Basename;
  12         22  
  12         663  
8 12     12   5725 use FASTX::sw 'align';
  12         31  
  12         1060  
9             $FASTX::Abi::VERSION = '1.0.1';
10              
11 12         627 use constant DEFAULT_MATRIX => { 'wildcard_match' => 0,
12             'match' => 1,
13             'mismatch' => -1,
14             'gap' => -2,
15             'gap_extend' => 0,
16             'wildcard' => 'N',
17 12     12   75 };
  12         18  
18            
19 12     12   62 use constant SCORE => 0;
  12         30  
  12         534  
20 12     12   59 use constant EVENT => 1;
  12         21  
  12         414  
21 12     12   64 use constant EXTEND => 0;
  12         37  
  12         508  
22 12     12   72 use constant GAP_SRC => 1;
  12         38  
  12         474  
23 12     12   69 use constant GAP_TGT => 2;
  12         18  
  12         20005  
24              
25              
26              
27             #ABSTRACT: Read Sanger trace file (chromatograms) in FASTQ format. For traces called with I option, the ambiguities will be split into two sequences to allow usage from NGS tools that usually do not understand IUPAC ambiguities.
28              
29             our @valid_new_attributes = (
30             'filename', # *REQUIRED* input trace filepath
31             'trim_ends', # bool (default: 1) trim low quality ends
32             'wnd', # int (default: 16) sliding window for quality trim
33             'min_qual', # int (default: 22) threshold for low quality calls
34             'bad_bases', # int (default: 2) maximum number of low quality bases per window
35             'keep_abi', # bool (default: 0) import the Bio::Trace::ABIF object in FASTX::Abi (otherwise deleted after import)
36             );
37             our @valid_obj_attributes = (
38             'diff', # number of ambiguous bases
39             'diff_array', # array of ambiguous bases position
40             'sequence_name', # sequence name from filename
41             'instrument', # Instrument
42             'avg_peak_spacing', # Avg Peak Spacing in chromas
43             'version', # version chromatograms
44             'chromas', # Bio::Trace::ABIF object
45              
46             'hetero', # ambiguity
47             'seq1', # Sequence 1 (non ambiguous, allele1)
48             'seq2', # Sequence 2 (non ambiguous, allele2)
49             'sequence', # Sequence, trimmed
50             'quality', # Quality, trimmed
51             'raw_sequence', # Raw sequence
52             'raw_quality', # Raw quality
53             'iso_seq', # Sequence are equal
54             'discard', # Low quality sequence
55             );
56              
57             our %iupac = (
58             'R' => 'AG',
59             'Y' => 'CT',
60             'M' => 'CA',
61             'K' => 'TG',
62             'W' => 'TA',
63             'S' => 'CG'
64             );
65              
66              
67             sub new {
68             # Instantiate object
69 45     45 1 33930 my ($class, $args) = @_;
70              
71             my $self = {
72             filename => $args->{filename}, # Chromatogram file name
73             trim_ends => $args->{trim_ends}, # Trim low quality ends (bool)
74             min_qual => $args->{min_qual}, # Minimum quality
75             wnd => $args->{wnd}, # Window for end trimming
76             bad_bases => $args->{bad_bases}, # Number of low qual bases per $window_width
77             keep_abi => $args->{keep_abi}, # Do not destroy $self->{chromas} after use
78 45         222 };
79              
80             #check valid inputs:
81 45         85 for my $input (sort keys %{ $args } ) {
  45         201  
82 67 100       1046 if ( ! grep( /^$input$/, @valid_new_attributes ) ) {
83 1         214 confess("Method new() does not accept \"$input\" attribute. Valid attributes are:\n", join(', ', @valid_new_attributes));
84             }
85             }
86              
87             # CHECK INPUT FILE
88             # -----------------------------------
89 44 50       150 if (not defined $self->{filename}) {
90 0         0 confess("ABI file must be provided when creating new object");
91             }
92              
93 44 100       849 if (not -e $self->{filename}) {
94 1         114 confess("ABI file not found: ", $self->{filename});
95             }
96 43         113 my $abif;
97             my $try = eval
98 43         78 {
99 43         343 $abif = Bio::Trace::ABIF->new();
100 43 50       816 $abif->open_abif($self->{filename}) or confess "Error in file: ", $self->{filename};
101 43         164171 1;
102             };
103              
104 43 50       125 if (not $try) {
105 0         0 confess("Bio::Trace::ABIF was unable to read: ", $self->{filename});
106             }
107 43         189 my $object = bless $self, $class;
108 43         136 $object->{chromas} = $abif;
109              
110 43         149 my @ext = ('.abi','.ab1','.ABI','.abI','.AB1','.ab');
111 43         4491 my ($seqname) = basename($self->{filename}, @ext);
112 43         308 $object->{sequence_name} = $abif->sample_name();
113              
114             # DEFAULTS
115             # -----------------------------------
116 43 100       3584 $object->{trim_ends} = 1 unless defined $object->{trim_ends};
117 43 100       137 $object->{wnd} = 10 unless defined $object->{wnd};
118 43 100       144 $object->{min_qual} = 20 unless defined $object->{min_qual};
119 43 100       121 $object->{bad_bases} = 4 unless defined $object->{bad_bases};
120 43 100       107 $object->{keep_abi} = 0 unless defined $object->{keep_abi};
121 43         69 $object->{discard} = 0;
122              
123             # GET SEQUENCE FROM AB1 FILE
124             # -----------------------------------
125 43         139 my $seq = _get_sequence($self);
126 43 100       5330 if ($self->{keep_abi} == 0) {
127 42         114 $self->{chromas} = undef;
128             }
129              
130             #check valid attributes:
131 43         63 for my $input (sort keys %{ $self} ) {
  43         531  
132             # [this is a developer's safety net]
133             # uncoverable condition false
134 946 50       16486 if ( ! grep( /^$input$/, @valid_new_attributes, @valid_obj_attributes ) ) {
135 0         0 confess("Method new() does not accept \"$input\" attribute. Valid attributes are:\n", join(', ', @valid_new_attributes, @valid_obj_attributes));
136             }
137             }
138              
139              
140 43         356 return $object;
141             }
142              
143              
144             sub get_fastq {
145 16     16 1 1072 my ($self, $name, $quality_value) = @_;
146              
147 16 50       91 if (not defined $name) {
    50          
148 0         0 $name = $self->{sequence_name};
149             } elsif ($name=~/\s+/) {
150 0         0 $name =~s/\s+/_/g;
151             }
152              
153 16         29 my $quality = $self->{quality};
154 16 100       35 if (defined $quality_value) {
155 12 100 66     74 if ($quality_value =~/^\d+$/ and $quality_value >= 10) {
    100          
156 4 50       14 my $q = chr(($quality_value <= 93 ? $quality_value : 93) + 33);
157 4         14 $quality = $q x length($quality);
158             } elsif (length($quality_value) == 1) {
159 4         17 $quality = $quality_value x length($quality);
160             } else {
161 4         481 confess("Supplied quality is neither a valid integer or a single char: <$quality_value>\n");
162             }
163             }
164              
165 12         22 my $output = '';
166 12 100       52 if ( $self->{iso_seq} ) {
167             $output .= '@' . $name . "\n" .
168 3         62 $self->{seq1} . "\n+\n" .
169             $quality . "\n";
170             } else {
171             $output .= '@' . $name . "_1\n" .
172 9         96 $self->{seq1} . "\n+\n" .
173             $quality . "\n";
174             $output .= '@' . $name . "_2\n" .
175 9         49 $self->{seq2} . "\n+\n" .
176             $quality . "\n";
177             }
178 12         55 return $output;
179             }
180              
181              
182             sub get_trace_info {
183 1     1 1 85 my $self = shift;
184 1         2 my $data;
185 1         3 $data->{instrument} = $self->{instrument};
186 1         2 $data->{version} = $self->{version};
187 1         2 $data->{avg_peak_spacing} = $self->{avg_peak_spacing};
188              
189 1         3 return $data;
190             }
191              
192              
193             sub rc {
194 1     1 1 2 my $self = shift;
195 1         4 $self->{seq1} = reverse $self->{seq1};
196 1         3 $self->{seq2} = reverse $self->{seq2};
197 1         5 $self->{seq1} =~ tr/ACGTacgt/TGCAtgca/;
198 1         24 $self->{seq2} =~ tr/ACGTacgt/TGCAtgca/;
199 1         3 return $self;
200             }
201              
202              
203             sub merge {
204 1     1 1 563 my ($self, $other) = @_;
205 1         5 $other->rc();
206              
207 1         16 my $aligner = FASTX::sw->new($self->{seq1}, $other->{seq1});
208 1         11 my ($top, $bars, $bottom) = $aligner->pads;
209 1         4 my $consensus = "";
210 1         2 my $pos1 = 0;
211 1         3 my $pos2 = 0;
212 1         6 for (my $pos = 0; $pos < length($top); $pos++) {
213 807         803 my $base1 = substr($top, $pos, 1);
214 807         806 my $base2 = substr($bottom, $pos, 1);
215              
216 807 100       1049 my $pos1++ if ($base1 ne '-');
217 807 100       989 my $pos2++ if ($base2 ne '-');
218              
219 807 100 66     1263 if ($base1 eq $base2) {
    100 66        
    100 33        
    50          
220 481         731 $consensus .= $base1;
221             } elsif ($base1 eq '-' and $base2 ne '-') {
222 195         310 $consensus .= $base2;
223             } elsif ($base1 ne '-' and $base2 eq '-') {
224 130         202 $consensus .= $base1;
225             } elsif ($base1 ne '-' and $base2 ne '-') {
226 1         6 my $qual1 = substr($self->{quality}, $pos1, 1);
227 1         5 my $qual2 = substr($other->{quality}, $pos2, 1);
228 1 50       64 $consensus .= _ascii_qual($qual1) > _ascii_qual($qual2) ? lc($base1) : lc($base2);
229             }
230             }
231             # evaluate longest stretch of "|" in $m
232             # my $longest = 0;
233             # my $longest_start = 0;
234             # my $longest_end = 0;
235             # my $longest_str = '';
236             # my $i = 0;
237             # while ($i < length($m)) {
238             # my $str = substr($m, $i, 1);
239             # if ($str eq '|') {
240             # my $start = $i;
241             # my $end = $i;
242             # while ($str eq '|') {
243             # $end++;
244             # $str = substr($m, $end, 1);
245             # }
246             # if ($end - $start > $longest) {
247             # $longest = $end - $start;
248             # $longest_start = $start;
249             # $longest_end = $end;
250             # $longest_str = $str;
251             # }
252             # }
253             # $i++;
254             # }
255            
256 1         7 $self->{merge} = $aligner;
257 1         3 $self->{top} = $top;
258 1         3 $self->{bottom} = $bottom;
259 1         2 $self->{consensus} = $consensus;
260 1         3 $self->{bars} = $bars;
261 1         10 return $consensus;
262              
263             }
264              
265              
266             sub _get_sequence {
267 43     43   86 my $self = shift;
268 43         77 my $abif = $self->{chromas};
269              
270 43         170 $self->{raw_sequence} = $abif->sequence();
271              
272             # Get quality values
273 43         3818 my @qv = $abif->quality_values();
274             # Encode quality in FASTQ chars
275 43 50       8778 my @fqv = map {chr(int(($_<=93? $_ : 93)*4/6) + 33)} @qv;
  32692         51786  
276              
277             # FASTQ
278 43         958 my $q = join('', @fqv);
279              
280              
281 43         145 $self->{raw_quality} = $q;
282              
283 43         86 $self->{sequence} = $self->{raw_sequence};
284 43         91 $self->{quality} = $self->{raw_quality};
285              
286             # Trim
287 43 100       124 if ($self->{trim_ends}) {
288             #The Sequencing Analysis program determines the clear range of the sequence by trimming bases from the 5' to 3'
289             #ends until fewer than 4 bases out of 20 have a quality value less than 20.
290             #You can change these parameters by explicitly passing arguments to this method
291             #(the default values are $window_width = 20, $bad_bases_threshold = 4, $quality_threshold = 20).
292             # Note that Sequencing Analysis counts the bases starting from one, so you have to add one to the return values to get consistent results.
293              
294             my ($begin_pos, $end_pos) = $abif->clear_range(
295             $self->{wnd},
296             $self->{bad_bases},
297             $self->{min_qual},
298              
299 39         244 );
300              
301             # This can be tested with low quality chromatograms
302             # *TODO* to ask for some bad trace
303              
304             # uncoverable branch false
305             # uncoverable condition left
306             # uncoverable condition right
307              
308 39 50 33     28058 if ($begin_pos>0 and $end_pos>0) {
309 39         83 my $hi_qual_length = $end_pos-$begin_pos+1;
310 39         184 $self->{sequence} = substr($self->{sequence}, $begin_pos, $hi_qual_length);
311 39         154 $self->{quality} = substr($self->{quality} , $begin_pos, $hi_qual_length);
312             } else {
313 0         0 $self->{discard} = 1;
314             }
315             }
316              
317             # Check hetero bases
318 43 100       1161 if ($self->{sequence}!~/[ACGT][RYMKWS]+[ACGT]/i) {
319 14         46 $self->{hetero} = 0;
320             } else {
321 29         58 $self->{hetero} = 1;
322             }
323              
324             # Check
325 43         85 $self->{diff_array} = ();
326 43         168 $self->{diff} = 0;
327 43         88 my $seq1 = '';
328 43         106 my $seq2 = '';
329 43         141 for (my $i = 0; $i{sequence}); $i++) {
330 27280         31003 my $q0 = substr($self->{quality}, $i, 1);
331 27280         29455 my $s0 = substr($self->{sequence}, $i,1);
332              
333             # Ambiguity detected:
334 27280 100       32119 if ($iupac{$s0}) {
335 74         216 my ($base1, $base2) = split //, $iupac{$s0};
336 74         106 $seq1.=$base1;
337 74         91 $seq2.=$base2;
338 74         94 $self->{diff}++;
339 74         92 push(@{ $self->{diff_array} }, $i);
  74         196  
340             } else {
341 27206         26827 $seq1.=$s0;
342 27206         38608 $seq2.=$s0;
343              
344             }
345             }
346 43         205 $self->{seq1} = $seq1;
347 43         121 $self->{seq2} = $seq2;
348              
349 43 100       131 if ($seq1 eq $seq2) {
350 14         69 $self->{iso_seq} = 1
351             } else {
352 29         67 $self->{iso_seq} = 0;
353             }
354              
355              
356 43         293 $self->{instrument} = $self->{chromas}->official_instrument_name();
357 43         4587 $self->{version} = $self->{chromas}->abif_version();
358 43         1616 $self->{avg_peak_spacing} = $self->{chromas}->avg_peak_spacing();
359              
360             }
361              
362             sub _ascii_qual {
363 2     2   8 my ($qual, $offset) = @_;
364 2 50       6 $offset = 33 unless defined $offset;
365 2         12 return ord($qual) - $offset;
366             }
367             1;
368              
369             __END__