File Coverage

blib/lib/Chemistry/MidasPattern.pm
Criterion Covered Total %
statement 122 145 84.1
branch 45 66 68.1
condition 9 14 64.2
subroutine 18 22 81.8
pod 5 8 62.5
total 199 255 78.0


line stmt bran cond sub pod time code
1             package Chemistry::MidasPattern;
2              
3             $VERSION = '0.11';
4              
5             # $Id: MidasPattern.pm,v 1.3 2005/05/16 23:03:37 itubert Exp $
6              
7             =head1 NAME
8              
9             Chemistry::MidasPattern - Select atoms in macromolecules
10              
11             =head1 SYNOPSIS
12              
13             use Chemistry::MidasPattern;
14             use Chemistry::File::PDB;
15              
16             # read a molecule
17             my $mol = Chemistry::MacroMol->read("test.pdb");
18              
19             # define a pattern matching carbons alpha and beta
20             # in all valine residues
21             my $str = ':VAL@CA,CB';
22             my $patt = Chemistry::MidasPattern->new($str);
23              
24             # apply the pattern to the molecule
25             $patt->match($mol);
26              
27             # extract the results
28             for my $atom ($patt->atom_map) {
29             printf "%s\t%s\n", $atom->attr("pdb/residue_name"), $atom->name;
30             }
31             printf "FOUND %d atoms\n", scalar($patt->atom_map);
32              
33             =head1 DESCRIPTION
34              
35             This module partially implements a pattern matching engine for selecting atoms
36             in macromolecules by using Midas/Chimera patterns. See
37             L
38             for a detailed description of this language.
39              
40             This module shares the same interface as L; to perform a
41             pattern matching operation on a molecule, follow these steps.
42              
43             1) Create a pattern object, by parsing a string. Let's assume that the pattern
44             object is stored in $patt and that the molecule is $mol.
45              
46             2) Execute the pattern on the molecule by calling $patt->match($mol).
47              
48             3) If $patt->match() returns true, extract the "map" that relates the pattern
49             to the molecule by calling $patt->atom_map. These method returns a list of the
50             atoms in the molecule that are matched by the pattern. Thus $patt->atom_map(1)
51             would be analogous to the $1 special variable used for regular expresion
52             matching. The difference between Chemistry::Pattern and Perl regular
53             expressions is that atoms are always captured, and that each atom always uses
54             one "slot".
55              
56             =head1 MIDAS ATOM SPECIFICATION LANGUAGE QUICK SUMMARY
57              
58             The current implementation does not have the concept of a model, only of
59             residues and atoms.
60              
61             What follows is not exactly a formal grammar specification, but it should give
62             a general idea:
63              
64             SELECTOR = ((:RESIDUE(.CHAIN)?)*(@ATOM)*)*
65              
66             The star here means "zero or more", the question mark means "zero or one", and
67             the parentheses are used to delimit the effect of the star. All other
68             characters are used verbatim.
69              
70             RESIDUE can be a name (e.g., LYS), a sequence number (e.g., 108), a range
71             (e.g., 1-10), or a comma-separated list of RESIDUEs (e.g. 1-10,6,LYS).
72              
73             ATOM is an atom name, a serial number (this is a non-standard extension) or a
74             comma-separated list of ATOMs.
75              
76             Names can have wildcards: * matches the whole name; ? matches one character;
77             and = matches zero or more characters. An @ATOM specification is asociated with
78             the closest preceding residue specification.
79              
80             DISTANCE_SELECTOR = SELECTOR za< DISTANCE
81              
82             Atoms within a certain distance of those that are matched by a selector can be
83             selected by using the za< operator, where DISTANCE is a number in Angstroms.
84              
85             EXPR = ( SELECTOR | DISTANCE_SELECTOR ) (& (SELECTOR | DISTANCE_SELECTOR))*
86              
87             The result of two or more selectors can be intersected using the & operator.
88              
89             =head1 EXAMPLES
90              
91             :ARG All arginine atoms
92             :ARG.A All arginine atoms in chain 'A'
93             :ARG@* All arginine atoms
94             @CA All alpha carbons
95             :*@CA All alpha carbons
96             :ARG@CA Arginine alpha carbons
97             :VAL@C= Valine carbons
98             :VAL@C? Valine carbons with two-letter names
99             :ARG,VAL@CA Arginine and valine alpha carbons
100             :ARG:VAL@CA All arginine atoms and valine alpha carbons
101             :ARG@CA,CB Arginine alpha and beta carbons
102             :ARG@CA@CB Arginine alpha and beta carbons
103             :1-10 Atoms in residues 1 to 10
104             :48-* Atoms in residues 11 to the last one
105             :30-40@CA & :ARG Alpha carbons in residues 1-10 which are
106             also arginines.
107             @123 Atom 123
108             @123 za<5.0 Atoms within 5.0 Angstroms of atom 123
109             @123 za>30.0 Atoms not within 30.0 Angstroms of atom 123
110             @CA & @123 za<5.0 Alpha carbons within 5.0 Angstroms of atom 123
111              
112             =cut
113              
114              
115 1     1   6 use strict;
  1         2  
  1         39  
116 1     1   5 use warnings;
  1         2  
  1         32  
117 1     1   991 use Chemistry::MacroMol;
  1         83770  
  1         76  
118 1     1   15 use Carp;
  1         2  
  1         77  
119 1     1   6 use base "Chemistry::Pattern";
  1         2  
  1         1262  
120              
121             our $DEBUG = 0;
122              
123             sub new {
124 18     18 1 42 my ($class, $str) = @_;
125 18   33     227 my $self = bless {
126             atom_map => [],
127             options => {},
128             matched => 0,
129             name => $str,
130             }, ref $class || $class;
131 18         62 $self->parse_midas_pattern($str);
132 18         82 $self;
133             }
134              
135             sub atom_map {
136 36     36 1 6391 my ($self) = @_;
137 36         43 @{$self->{atom_map}};
  36         173  
138             }
139              
140 0     0 1 0 sub bond_map { () }
141              
142             sub options {
143 18     18 1 25 my $self = shift;
144 18         64 $self->{options} = {@_};
145 18         45 $self;
146             }
147              
148             # residue specifier: name | range | number | *
149             # atom specifier: name | number | *
150             # names may have ? and = as wildcards (equivalent to . and .*)
151             sub parse_midas_pattern {
152 18     18 0 30 my ($self, $string) = @_;
153 18         28 my $tree = [];
154 18         236 $self->{tree} = $tree;
155              
156 18         84 for my $str (split / +& +/, $string) {
157 20         83 my $node = { type => 'normal' };
158 20 100       94 if ($str =~ s/ +z(.?)([<>])\s*(\d+(\.\d+)?)//) {
159 3   50     21 $node->{type} = 'zone_' . ($1 || 'r');
160 3         10 $node->{zone_op} = $2;
161 3         14 $node->{zone_val} = $3;
162             }
163 20         36 push @$tree, $node;
164 20         36 my $conditions = [];
165 20         39 $node->{conditions} = $conditions;
166              
167 20         43 $str =~ s/\s+//g; # ignore whitespace
168 20 100       79 $str = ":*$str" unless $str =~ /^:/; # add implicit "every residue"
169 20         75 my (undef, @residues) = split ':', $str;# =~ /:([^:]+)/g;
170 20         43 for my $residue (@residues) {
171 21         61 my ($res_base, $atom_base) = split '@', $residue, 2;
172             #print "res_base = $res_base\n";
173 21         46 my @res = map { parse_res($_) } split ',', $res_base;
  22         59  
174 21         75 my $res_node = { residue => \@res };
175 21         49 push @$conditions, $res_node;
176              
177 21   100     90 my (@atoms) = map { parse_atom($_) } split /[\@,]/, $atom_base||'*';
  23         52  
178 21         159 $res_node->{atoms} = \@atoms;
179             }
180             }
181             }
182              
183             sub parse_res {
184 22     22 0 36 my ($s) = @_;
185 22         43 $s = lc $s;
186 22         29 my $chain_id;
187             my $sub;
188 22 50       68 if ($s =~ s/\.(.)$//) { # chain id
189 0         0 $chain_id = $1;
190             }
191 22 50       188 if ($s =~ /^\d+$/) {
    100          
    50          
    100          
    50          
192 0     0   0 $sub = sub {$_->attr('pdb/sequence_number') == $s};
  0         0  
193             } elsif ($s =~ /^[a-zA-Z]+$/) {
194 12     600   48 $sub = sub {lc $_->type eq $s};
  600         4318  
195             } elsif ($s =~ /^[a-zA-Z=?]+$/) {
196 0         0 $s =~ s/\?/./g;
197 0         0 $s =~ s/=/.*/g;
198 0     0   0 $sub = sub {$_->type =~ qr/$s/i};
  0         0  
199             } elsif ($s eq '*') {
200 7     350   34 $sub = sub {1};
  350         737  
201             } elsif ($s =~ /^(\d+|\*)-(\d+|\*)/) {
202 3         13 my ($from, $to) = ($1, $2);
203 3 50       11 $from = 0 if $from eq '*';
204 3 100       11 $to = 1e9 if $to eq '*';
205            
206             $sub = sub {
207 150 100   150   441 my $n = $_->attr('pdb/sequence_number'); $n >= $from and $n <= $to
  150         1605  
208 3         15 };
209             } else {
210 0         0 croak "Invalid residue specification '$s'\n";
211             }
212              
213 0 0   0   0 return $chain_id ?
214             sub {lc $_->attr('pdb/chain_id') eq $chain_id and $sub->()}
215 22 50       97 : $sub;
216             }
217              
218             sub parse_atom {
219 23     23 0 41 my ($s) = @_;
220 23         43 $s = lc $s;
221 23 100       123 if ($s =~ /^\d+$/) {
    100          
    50          
222 4     3164   23 return sub {$_->attr('pdb/serial_number') == $s};
  3164         27701  
223             } elsif ($s =~ /^[a-zA-Z0-9='"?]+$/) {
224 13         26 $s =~ s/\?/./g;
225 13         21 $s =~ s/=/.*/g;
226 13     3093   64 return sub {$_->name =~ qr/^$s$/i};
  3093         34990  
227             } elsif ($s eq '*') {
228 6     387   33 return sub {1};
  387         938  
229             } else {
230 0         0 croak "Invalid atom specification '$s'\n";
231             }
232             }
233              
234             sub match {
235 18     18 1 89 my ($self, $mol) = @_;
236 18         36 my $tree = $self->{tree};
237 18         25 my @ret;
238 18         25 my $first = 1;
239              
240 18 50 33     64 if ($self->{matched} and $self->{matched} eq $mol) {
241 0         0 $self->{matched} = 0;
242 0         0 return 0;
243             }
244              
245 18         36 for my $term (@$tree) {
246 20 50       48 print "term\n" if $DEBUG;
247 20         27 my @atoms;
248 20         26 for my $residue_spec (@{$term->{conditions}}) {
  20         50  
249 21 50       47 print "\tresidue_spec\n" if $DEBUG;
250 21         25 my @residues;
251            
252             # get the matching residues
253 21         28 for my $cond (@{$residue_spec->{residue}}) {
  21         46  
254 22 50       52 print "\t\tresidue condition\n" if $DEBUG;
255 22         94 push @residues, grep $cond->(), $mol->domains;
256             }
257 21 50       172 if ($self->{options}{unique}) {
258 0         0 my %seen;
259 0         0 @seen{@residues} = @residues;
260 0         0 @residues = values %seen;
261             }
262 21 50       47 printf "got %d residues\n", scalar @residues if $DEBUG;
263              
264             # get the matching atoms
265 21         40 for my $res (@residues) {
266 406         3991 for my $cond (@{$residue_spec->{atoms}}) {
  406         796  
267 410 50       895 print "atom_cond\n" if $DEBUG;
268 410         1070 push @atoms, grep $cond->(), $res->atoms;
269             }
270             }
271             }
272 20         263 my %seen;
273 20 50       81 if ($self->{options}{unique}) {
274 0         0 @seen{@atoms} = @atoms;
275 0         0 @atoms = values %seen;
276             }
277 20 100       79 if ($term->{type} eq 'normal') {
    50          
    50          
278 17         96 @seen{@atoms} = @atoms;
279             } elsif ($term->{type} eq 'zone_r') {
280 0         0 croak "zr operator not implemented\n";
281             } elsif ($term->{type} eq 'zone_a') {
282 3         9 my $d = $term->{zone_val};
283 3         9 my $gt = $term->{zone_op} eq '>';
284             #print "zone_a ret(@ret)\n" unless $first;
285 1632         2437 @atoms = grep {
286 3 100       27 my $found = 0;
287 1632         2449 for my $a (@atoms) {
288 1632 100       8191 $found = 1, last if ($a->coords - $_->coords)->length < $d;
289             }
290 1632   100     142657 $found xor $gt;
291             } $first ? $mol->atoms : @ret;
292             #print "zone_a atoms(@atoms)\n" unless $first;
293 3         277 @seen{@atoms} = @atoms;
294             }
295 20 100       3492 if ($first) {
296             # keep all the atoms from this term
297 18         97 @ret = @atoms;
298 18         34 $first = 0;
299             } else {
300             # keep the intersection of the previous terms with the
301             # atoms from the current term
302 2         17 my @tmp = @ret;
303 2         15 @ret = ();
304 2         5 for (@tmp) {
305 61 100       502 push @ret, $_ if $seen{$_};
306             }
307             }
308 20 50       195 last unless @ret; # don't bother evaluating the other terms
309             }
310             # optionally sort the result
311 18 50       72 if ($self->{options}{sort}) {
312 0         0 @ret = map { $_->[0] } sort { $a->[1] <=> $b->[1] }
  0         0  
  0         0  
313 0         0 map { [$_, $_->attr("pdb/serial_number")] } @ret;
314             }
315 18         36 $self->{atom_map} = \@ret;
316 18         71 $self->{matched} = $mol;
317             }
318              
319             1;
320              
321              
322              
323             =head1 CAVEATS
324              
325             If a feature does not appear in any of the examples, it is probably not
326             implemented. For example, the zr< zone specifier, atom properties, and
327             various Chimera extensions.
328              
329             The zone specifiers (selection by distance) currently use a brute-force N^2
330             algorithm. You can optimize an & expression by putting the most unlikely
331             selectors first; for example
332              
333             :1-20 zr<10.0 & :38 atoms in residue 38 within 10 A of atoms
334             in residues 1-20 (slow)
335              
336             :38 & :1-20 zr<10.0 atoms in residue 38 within 10 A of atoms
337             in residues 1-20 (not so slow)
338            
339             In the first case, the N^2 search measures the distance between every atom in
340             the molecule and every atom in residues 1-20, and then intersects the results
341             with the atom list of residue 28; the second case only measures the distance
342             between every atom in residue 38 with every atom in residues 1-20. The second
343             way is much, much faster for large systems.
344              
345             Some day, a future version may implement a smarter algorithm...
346              
347             =head1 VERSION
348              
349             0.11
350              
351             =head1 SEE ALSO
352              
353             L, L
354              
355             The PerlMol website L
356              
357             =head1 AUTHOR
358              
359             Ivan Tubert Eitub@cpan.orgE
360              
361             =head1 COPYRIGHT
362              
363             Copyright (c) 2005 Ivan Tubert. All rights reserved. This program is free
364             software; you can redistribute it and/or modify it under the same terms as
365             Perl itself.
366              
367             =cut
368              
369              
370