File Coverage

blib/lib/Bio/Graphics/Browser2/Realign.pm
Criterion Covered Total %
statement 46 138 33.3
branch 2 42 4.7
condition 2 18 11.1
subroutine 14 22 63.6
pod 9 9 100.0
total 73 229 31.8


line stmt bran cond sub pod time code
1             package Bio::Graphics::Browser2::Realign;
2              
3             =head1 NAME
4              
5             Bio::Graphics::Browser2::Realign - Perl extension for Smith-Waterman alignments
6              
7             =head1 SYNOPSIS
8              
9             use Bio::Graphics::Browser2::Realign 'align';
10             my ($top,$middle,$bottom) = align('gattttttc','gattttccc');
11             print join "\n",$top,$middle,$bottom,"\n";
12              
13             # produces:
14             gatttttt--c
15             |||||| |
16             gatttt--ccc
17              
18              
19             =head1 DESCRIPTION
20              
21             This is a helper utility used by gbrowse to produce global alignments.
22             It uses slow Smith-Waterman, so is only appropriate for short segments
23             that are mostly aligned already.
24              
25             It can be speeded up significantly by compiling
26             Bio::Graphics::Browser2::CAlign, an XS extension. To do this, build
27             gbrowse with the DO_XS=1 option:
28              
29             cd Generic-Genome-Browser
30             perl Makefile.PL DO_XS=1
31              
32             =head2 METHODS
33              
34             =over 4
35              
36             =cut
37              
38             # file: Sequence/Alignment.pm
39              
40 1     1   125282 use strict;
  1         2  
  1         23  
41 1     1   3 use Carp;
  1         5  
  1         47  
42 1     1   4 use vars '@DEFAULTS';
  1         1  
  1         36  
43 1     1   3 use constant DEBUG=>0;
  1         1  
  1         62  
44              
45 1     1   3 use vars qw(@ISA @EXPORT @EXPORT_OK);
  1         1  
  1         77  
46              
47             require Exporter;
48             @ISA = 'Exporter';
49             @EXPORT = ();
50             @EXPORT_OK = qw(align align_segs);
51              
52             # The default scoring matrix introduces a penalty of -2 for opening
53             # a gap in the sequence, but no penalty for extending an already opened
54             # gap. A nucleotide mismatch has a penalty of -1, and a match has a
55             # positive score of +1. An ambiguous nucleotide ("N") can match
56             # anything with no penalty.
57 1         41 use constant DEFAULT_MATRIX => { 'wildcard_match' => 0,
58             'match' => 1,
59             'mismatch' => -1,
60             'gap' => -2,
61             'gap_extend' => 0,
62             'wildcard' => 'N',
63 1     1   3 };
  1         2  
64              
65 1     1   3 use constant SCORE => 0;
  1         5  
  1         33  
66 1     1   3 use constant EVENT => 1;
  1         1  
  1         41  
67 1     1   3 use constant EXTEND => 0;
  1         1  
  1         31  
68 1     1   11 use constant GAP_SRC => 1;
  1         1  
  1         32  
69 1     1   3 use constant GAP_TGT => 2;
  1         1  
  1         1118  
70              
71             my @EVENTS = qw(extend gap_src gap_tgt);
72              
73             =item $aligner = Bio::Graphics::Browser2::Realign->new($src,$target [,\%matrix])
74              
75             The new() method takes two the two sequence strings to be aligned and
76             an optional weight matrix. Legal weight matrix keys and their default
77             values are shown here:
78              
79             Key name Default Description
80             -------- ------- -----------
81              
82             match 1 Award one point for an exact match.
83             mismatch -1 Penalize one point for a mismatch.
84             wildcard_match 0 No penalty for a match to a wildcard (e.g. "n").
85             gap -1 Penalize one point to create a gap.
86             gap_extend 0 No penalty for extending an existing gap.
87             wildcard 'N' The wildcard character.
88              
89             The alignment algorithm is run when new() is called.
90              
91             =cut
92              
93             # Construct a new alignment object. May be time consuming.
94             sub new {
95 1     1 1 11 my ($class,$src,$tgt,$matrix) = @_;
96 1 50 33     7 croak 'Usage: Realign->new($src,$tgt [,\%matrix])'
97             unless $src && $tgt;
98 1   50     7 my $self = bless {
99             src => $src,
100             target => $tgt,
101             matrix => $matrix || {},
102             },$class;
103 1         1 my $implementor = $class;
104 1 50       1 if (eval {require Bio::Graphics::Browser2::CAlign}) {
  1         356  
105 1         1 $implementor = 'Bio::Graphics::Browser2::CAlign';
106             }
107 1         15 my ($score,$alignment) = $implementor->_do_alignment($src,$tgt,$self->{matrix});
108 1         2 $self->{score} = $score;
109 1         1 $self->{alignment} = $alignment;
110 1         3 return $self;
111             }
112              
113             =item $score = $aligner->score
114              
115             Return the score from the alignment.
116              
117             =cut
118              
119             # return the score of the aligned region
120 1     1 1 90 sub score { return shift()->{'score'}; }
121              
122             =item $start = $aligner->start
123              
124             Return the start of the aligned region, in source sequence
125             coordinates.
126              
127             =cut
128              
129             # return the start of the aligned region
130             sub start {
131 0     0 1 0 return shift()->{'alignment'}->[0];
132             }
133              
134             =item $end = $aligner->end
135              
136             Return the end of the aligned region, in source sequence
137             coordinates.
138              
139             =cut
140              
141             # return the end of the aligned region
142             sub end {
143 0     0 1 0 my $alignment = shift()->{'alignment'};
144 0         0 return $alignment->[$#$alignment];
145             }
146              
147             =item $arrayref = $aligner->alignment
148              
149             Return an arrayref representing the alignment. The array will be
150             exactly as long as the source sequence. Its indexes correspond to
151             positions on the source sequence, and its values correspond to
152             positions on the target sequence. An unaligned base is indicated as
153             undef. Indexes are zero-based.
154              
155             For example, this alignment:
156              
157             gatttttt--c
158             |||||| |
159             gatttt--ccc
160              
161             corresponds to this arrayref:
162              
163             index value
164             0[g] 0[g]
165             1[a] 1[a]
166             2[t] 2[t]
167             3[t] 3[t]
168             4[t] 4[t]
169             5[t] 5[t]
170             6[t] undef
171             7[t] undef
172             8[c] 8[c]
173              
174             =cut
175              
176             # return the alignment as an array
177 1     1 1 4 sub alignment { shift()->{'alignment'}; }
178              
179             =item ($top,$middle,$bottom) = $aligner->pads
180              
181             Returns the alignment as three padded strings indicating the top,
182             middle and bottom lines of a pretty-printed representation.
183              
184             For example:
185              
186             print join "\n",$aligner->pads;
187              
188             Will produce this output:
189              
190             gatttttt--c
191             |||||| |
192             gatttt--ccc
193              
194             =cut
195              
196             # return the alignment as three padded strings for pretty-printing, etc.
197             sub pads {
198 0     0 1   my ($align,$src,$tgt) = @{shift()}{'alignment','src','target'};
  0            
199 0           my ($ps,$pt,$last);
200 0 0         $ps = '-' x ($align->[0]) if defined $align->[0]; # pad up the source
201 0 0         $pt = substr($tgt,0,$align->[0]) if defined $align->[0];
202 0   0       $last = $align->[0] || 0;
203 0           for (my $i=0;$i<@$align;$i++) {
204 0           my $t = $align->[$i];
205 0 0         if (defined $t) {
206 0 0         $pt .= $t-$last > 1 ? substr($tgt,$last+1,$t-$last): substr($tgt,$t,1);
207 0           $ps .= '-' x ($t-$last-1);
208 0           $last = $t;
209             } else {
210 0           $pt .= '-';
211             }
212 0           $ps .= substr($src,$i,1);
213             }
214             # clean up the ends
215 0           $ps .= substr($src,@$align);
216 0           $pt .= substr($tgt,$last+1);
217 0 0         $pt .= '-' x (length($ps) - length($pt)) if length($ps) > length($pt);
218 0 0         $ps .= '-' x (length($pt) - length($ps)) unless length($ps) > length($pt);
219             my $match = join('',
220 0 0         map { uc substr($ps,$_,1) eq uc substr($pt,$_,1) ? '|' : ' ' }
  0            
221             (0..length($pt)-1));
222 0           return ($ps,$match,$pt);
223             }
224              
225             =back
226              
227             =head2 EXPORTED METHODS
228              
229             No functions are exported by default, but the following two methods
230             can be imported explicitly.
231              
232             =over 4
233              
234             =cut
235              
236             =item ($top,$middle,$bottom) = align($source,$target [,\%matrix])
237              
238             Align the source and target sequences and return the padded strings
239             representing the alignment. It is exactly equivalent to calling:
240              
241             Bio::Graphics::Browser2::Realign->new($source,$target)->pads;
242              
243             =cut
244              
245             # take two sequences as strings, align them and return
246             # a three element array consisting of gapped seq1, match string, and
247             # gapped seq2.
248             sub align {
249 0     0 1   my ($seq1,$seq2,$matrix) = @_;
250 0           my $align = __PACKAGE__->new($seq1,$seq2,$matrix);
251 0           return $align->pads;
252             }
253              
254              
255             =item $segs_arrayref = align_segs($source,$target [,\%matrix])
256              
257             The align_segs() function aligns $source and $target and returns an
258             array of non-gapped segments. Each element of the array corresponds
259             to a contiguous nongapped alignment in the format
260             [src_start,src_end,tgt_start,tgt_end].
261              
262             This is useful for converting a gapped alignment into a series of
263             nongapped alignments.
264              
265             In a list context this function will return a list of non-gapped
266             segments.
267              
268             =cut
269              
270             sub align_segs {
271 0     0 1   my ($gap1,$align,$gap2) = align(@_);
272 0           return __PACKAGE__->pads_to_segments($gap1,$align,$gap2);
273             }
274              
275             =item $segs_arrayref = Bio::Graphics::Browser2::Realign->pads_to_segments($seq1,$pads,$seq2)
276              
277             This class method takes two padded sequence strings and the alignment
278             string that relates them and returns an array ref of non-gapped aligned
279             sequence in the format:
280              
281             [src_start,src_end,tgt_start,tgt_end]
282              
283             The 3 strings look like this CA-ACCCCCTTGCAACAACCTTGAGAACCCCAGGGA
284             | |||||||||||||||||||||||||||||||||
285             AAGACCCCCTTGCAACAACCTTGAGAACCCCAGGGA
286              
287             =cut
288              
289             sub pads_to_segments {
290 0     0 1   my $self = shift;
291 0           my ($gap1,$align,$gap2) = @_;
292              
293             # create arrays that map residue positions to gap positions
294 0           my @maps;
295 0           for my $seq ($gap1,$gap2) {
296 0           my @seq = split '',$seq;
297 0           my @map;
298 0           my $residue = 0;
299 0           for (my $i=0;$i<@seq;$i++) {
300 0           $map[$i] = $residue;
301 0 0         $residue++ if $seq[$i] ne '-';
302             }
303 0           push @maps,\@map;
304             }
305              
306 0           my @result;
307 0           while ($align =~ /(\S+)/g) {
308 0           my $align_end = pos($align) - 1;
309 0           my $align_start = $align_end - length($1) + 1;
310 0           push @result,[@{$maps[0]}[$align_start,$align_end],
311 0           @{$maps[1]}[$align_start,$align_end]];
  0            
312             }
313 0 0         return wantarray ? @result : \@result;
314             }
315              
316             sub _do_alignment {
317 0     0     my $class = shift;
318 0           local $^W = 0;
319 0           my($src,$tgt,$custom_matrix) = @_;
320 0           my @alignment;
321 0   0       $custom_matrix ||= {};
322 0           my %matrix = (%{DEFAULT_MATRIX()},%$custom_matrix);
  0            
323              
324 0           my ($max_score,$max_row,$max_col);
325 0           my $scores = [([0,EXTEND])x(length($tgt)+1)];
326              
327 0           print join(' ',map {sprintf("%-4s",$_)} (' ',split('',$tgt))),"\n" if DEBUG;
328 0           my $wildcard = $matrix{wildcard};
329 0           for (my $row=0;$row
330 0           my $s = uc substr($src,$row,1);
331 0           my @row = ([0,EXTEND]);
332 0           for (my $col=0;$col
333 0           my $t = uc substr($tgt,$col,1);
334              
335             # what happens if we extend the both strands one character?
336 0           my $extend = $scores->[$col][SCORE];
337             $extend += ($t eq $wildcard || $s eq $wildcard) ? $matrix{wildcard_match} :
338             ($t eq $s) ? $matrix{match} :
339 0 0 0       $matrix{mismatch};
    0          
340              
341             # what happens if we extend the src strand one character, gapping the tgt?
342             my $gap_tgt = $row[$#row][SCORE] + (($row[$#row][EVENT]==GAP_TGT) ? $matrix{gap_extend}
343 0 0         : $matrix{gap});
344              
345             # what happens if we extend the tgt strand one character, gapping the src?
346             my $gap_src = $scores->[$col+1][SCORE] + (($scores->[$col+1][EVENT] == GAP_SRC) ? $matrix{gap_extend}
347 0 0         : $matrix{gap});
348              
349             # find the best score among the possibilities
350 0           my $score;
351 0 0 0       if ($gap_src >= $gap_tgt && $gap_src >= $extend) {
    0          
352 0           $score = [$gap_src,GAP_SRC];
353             }
354             elsif ($gap_tgt >= $extend) {
355 0           $score = [$gap_tgt,GAP_TGT];
356             }
357             else {
358 0           $score = [$extend,EXTEND];
359             }
360              
361             # save it for posterity
362 0           push(@row,$score);
363 0 0         ($max_score,$max_row,$max_col) = ($score->[SCORE],$row,$col) if $score->[SCORE] >= $max_score;
364             }
365 0           print join(' ',($s,map {sprintf("%4d",$_->[SCORE])} @row[1..$#row])),"\n" if DEBUG;
366 0           $scores = \@row;
367 0           push(@alignment,[@row[1..$#row]]);
368             }
369 0           my $alignment = $class->_trace_back($max_row,$max_col,\@alignment);
370 0           if (DEBUG) {
371             for (my $i=0;$i<@$alignment;$i++) {
372             printf STDERR ("%3d %1s %3d %1s\n",
373             $i,substr($src,$i,1),
374             $alignment->[$i],defined $alignment->[$i] ? substr($tgt,$alignment->[$i],1):'');
375             }
376             }
377              
378 0           return ($max_score,$alignment);
379             }
380              
381             sub _trace_back {
382 0     0     my $self = shift;
383 0           my ($row,$col,$m) = @_;
384 0           my @alignment;
385 0   0       while ($row >= 0 && $col >= 0) {
386 0           printf STDERR "row=%d, col=%d score=%d event=%s\n",$row,$col,$m->[$row][$col][SCORE],$EVENTS[$m->[$row][$col][EVENT]] if DEBUG;
387 0           $alignment[$row] = $col;
388 0           my $score = $m->[$row][$col];
389 0 0         $row--,$col--,next if $score->[EVENT] == EXTEND;
390 0 0         $col--, next if $score->[EVENT] == GAP_TGT;
391 0 0         undef($alignment[$row]),$row--,next if $score->[EVENT] == GAP_SRC;
392             }
393 0           return \@alignment;
394             }
395              
396             1;
397              
398             __END__