File Coverage

blib/lib/Chemistry/FormulaPattern.pm
Criterion Covered Total %
statement 99 112 88.3
branch 29 46 63.0
condition 12 20 60.0
subroutine 14 17 82.3
pod 5 10 50.0
total 159 205 77.5


line stmt bran cond sub pod time code
1             package Chemistry::FormulaPattern;
2              
3             $VERSION = '0.10';
4              
5             # $Id: FormulaPattern.pm,v 1.1.1.1 2004/08/11 23:03:04 itubert Exp $
6              
7             =head1 NAME
8              
9             Chemistry::FormulaPattern - Match molecule by formula
10              
11             =head1 SYNOPSIS
12              
13             use Chemistry::FormulaPattern;
14              
15             # somehow get a bunch of molecules...
16             use Chemistry::File::SDF;
17             my @mols = Chemistry::Mol->read("file.sdf");
18              
19             # we want molecules with six carbons and 8 or more hydrogens
20             my $patt = Chemistry::FormulaPattern->new("C6H8-");
21              
22             for my $mol (@mols) {
23             if ($patt->match($mol)) {
24             print $mol->name, " has a nice formula!\n";
25             }
26             }
27              
28             # a concise way of selecting molecules with grep
29             my @matches = grep { $patt->match($mol) } @mols;
30              
31             =head1 DESCRIPTION
32              
33             This module implements a simple language for describing a range of
34             molecular formulas and allows one to find out whether a molecule matches
35             the formula specification. It can be used for searching for molecules by
36             formula, in a way similar to the NIST WebBook formula search
37             (L). Note however that the
38             language used by this module is different from the one used by the WebBook!
39              
40             Chemistry::FormulaPattern shares the same interface as L.
41             To perform a pattern matching operation on a molecule, follow these steps.
42              
43             1) Create a pattern object, by parsing a string. Let's assume that the pattern
44             object is stored in $patt and that the molecule is $mol.
45              
46             2) Execute the pattern on the molecule by calling $patt->match($mol).
47              
48             If $patt->match returns true, there was a match. If $patt->match is called two
49             consecutive times with the same molecule, it returns false; then true (if there
50             is a match), then false, etc. This is because the Chemistry::Pattern interface
51             is designed to allow multiple matches for a given molecule, and then returns
52             false when there are no further matches; in the case of a formula pattern,
53             there is only one possible match.
54              
55             $patt->match($mol); # may return true
56             $patt->match($mol); # always false
57             $patt->match($mol); # may return true
58             $patt->match($mol); # always false
59             # ...
60              
61             This allows one two use the standard while loop for all kinds of patterns
62             without having to worry about endless loops:
63              
64             # $patt might be a Chemistry::Pattern, Chemistry::FormulaPattern,
65             # or Chemistry::MidasPattern object
66             while ($patt->match($mol)) {
67             # do something
68             }
69              
70             Also note that formula patterns don't really have the concept of an atom map,
71             so $patt->atom_map and $patt->bond_map always return the empty list.
72              
73             =head1 FORMULA PATTERN LANGUAGE
74              
75             In the simplest case, a formula pattern may be just a regular formula, as
76             used by the L module. For example, the pattern
77             "C6H6" will only match molecules with six carbons, six hydrogens, and no other
78             atoms.
79              
80             The interesting thing is that one can also specify ranges for the elements,
81             as two hyphen-separated numbers. "C6H8-10" will match molecules with six
82             carbons and eight to ten hydrogens.
83              
84             Ranges may also be open, by omitting the upper part of the range. "C6H0-" will
85             match molecules with six carbons and any number of hydrogens (i.e., zero or
86             more).
87              
88             A formula pattern may also allow for unspecified elements by means of the
89             asterisk special character, which can be placed anywhere in the formula
90             pattern. For example, "C2H6*" (or "C2*H6, etc.) will match C2H6, and also
91             C2H6O, C2H6S, C2H6SO, etc.
92              
93             Ranges can also be used after a subformula in parentheses: "(CH2)1-2" will
94             match molecules with one or two carbons and two to four hydrogens. Note,
95             however, that the "structure" of the bracketed part of the formula is
96             forgotten, i.e., the multiplier applies to each element individually and does
97             not have to be an integer. That is, the above pattern will match CH2, CH3, CH4,
98             C2H2, C2H3, and C2H4.
99              
100             =cut
101              
102              
103 2     2   42588 use strict;
  2         4  
  2         88  
104 2     2   12 use warnings;
  2         5  
  2         66  
105 2     2   2520 use Chemistry::Mol;
  2         125028  
  2         138  
106 2     2   27 use Carp;
  2         3  
  2         138  
107 2     2   11 use base "Chemistry::Pattern";
  2         4  
  2         2963  
108 2     2   25635 use Text::Balanced qw(extract_bracketed);
  2         48155  
  2         1078  
109              
110             our $DEBUG = 0;
111              
112             sub new {
113 15     15 1 30646 my ($class, $str) = @_;
114 15   33     160 my $self = bless {
115             atom_map => [],
116             options => {},
117             matched => 0,
118             }, ref $class || $class;
119 15         56 $self->parse_formula_pattern($str);
120 15         51 $self;
121             }
122              
123 0     0 1 0 sub atom_map { () }
124              
125 0     0 1 0 sub bond_map { () }
126              
127             sub options {
128 0     0 1 0 my $self = shift;
129 0         0 $self->{options} = {%{$self->{options}}, @_};
  0         0  
130 0         0 $self;
131             }
132              
133             sub parse_formula_pattern {
134 15     15 0 237 my ($self, $string) = @_;
135 15 100       64 if ($string =~ s/\*//g) {
136 2         9 $self->{options}{allow_others} = 1;
137             }
138 15         43 my %formula = parse_formula($string);
139 15         447 $self->{formula_pattern} = \%formula;
140             }
141              
142             sub match {
143 16     16 1 12740 my ($self, $mol) = @_;
144 16         29 my $tree = $self->{tree};
145 16         21 my @ret;
146 16         20 my $first = 1;
147              
148 16 100 66     65 if ($self->{matched} and $self->{matched} eq $mol) {
149 5         75 $self->{matched} = 0;
150 5         16 return 0;
151             }
152              
153 11         132 my $want_formula = $self->{formula_pattern};
154 11         36 my $have_formula = $mol->formula_hash;
155 11         686 my $good = 0;
156 11         40 my @want_keys = keys %$want_formula;
157 11         23 for my $symbol (@want_keys) {
158 2     2   24 no warnings 'uninitialized';
  2         5  
  2         2940  
159 15   50     68 $have_formula->{$symbol} ||= 0;
160 15 100 66     80 if ($have_formula->{$symbol} >= $want_formula->{$symbol}{low}
161             and $have_formula->{$symbol} <= $want_formula->{$symbol}{high}) {
162 9         20 $good++;
163             } else {
164 6         13 last;
165             }
166             }
167 11         32 my @have_keys = keys %$have_formula;
168 11 50 66     56 if ($good == @want_keys
      66        
169             and ($good == @have_keys or $self->{options}{allow_others})) {
170 5         9 $self->{matched} = $mol;
171 5         25 return 1;
172             }
173 6         29 0;
174             }
175              
176             ### Code derived from formula.pl by Brent Gregersen follows
177              
178             my %macros = (
179             Me => '(CH3)',
180             Et => '(CH3CH2)',
181             Bu => '(C4H9)',
182             Bn => '(C6H5CH2)',
183             Cp => '(C5H5)',
184             Ph => '(C6H5)',
185             Bz => '(C6H5CO)',
186             # Ac is an element
187             # Pr is an element
188             );
189              
190              
191             sub parse_formula {
192 15     15 0 30 my ($formula) = @_;
193 15         25 my (%elements);
194              
195             #check balancing
196 15 50       280 return %elements if (!ParensBalanced($formula));
197              
198             # replace other grouping with normal parens
199 15         35 $formula =~ tr/<>{}[]/()()()/;
200              
201             # get rid of any spaces
202 15         106 $formula =~ s/\s+//g;
203              
204             # perform macro expansion
205 15         69 foreach (keys(%macros)) {
206 105         1266 $formula =~ s/$_/$macros{$_}/g;
207             }
208              
209             # determine initial compound coeficent
210 15         44 my ($low, $high, $is_range) = (1,1);
211 15 50       72 if ($formula =~ s/^(\d+)(-?)(\d*)//) {
212 0         0 ($low, $is_range, $high) = ($1, $2, $3);
213 0 0       0 if ($is_range) {
214 0 0       0 $high = 1E10 unless length $high;
215             } else {
216 0         0 $high = $low;
217             }
218             }
219              
220             # recursively process rest of formula
221 15         47 return internal_formula_parser($formula, $low, $high, %elements);
222             }
223              
224             sub internal_formula_parser {
225 17     17 0 59 my ($formula, $low, $high, %form) = @_;
226 17         23 my ($tmp_low, $tmp_high, $tmp_is_range);
227              
228 17         74 my ($extract, $remainder, $prefix) =
229             extract_bracketed($formula, '()', '[^(]*');
230              
231 17 100 66     1493 if (defined($extract) and $extract ne '') {
232 2         14 $extract =~ s/^\((.*)\)$/$1/;
233 2 50       13 if ($remainder =~ s/^(\d+)(-?)(\d*)//) {
234 2         10 ($tmp_low, $tmp_is_range, $tmp_high) = ($1, $2, $3);
235 2 50       6 if ($tmp_is_range) {
236 2 50       7 $tmp_high = 1E10 unless length $tmp_high;
237             } else {
238 0         0 $tmp_high = $tmp_low;
239             }
240 2         5 $tmp_low = $tmp_low * $low;
241 2         3 $tmp_high = $tmp_high * $high;
242             } else {
243 0         0 $tmp_low = $low;
244 0         0 $tmp_high = $high;
245             }
246              
247             # get formula of prefix ( it has no parens)
248 2 50       9 %form = add_formula_strings($prefix, $low, $high, %form) if ($prefix ne '');
249              
250             # check remainder for more parens
251 2 50       9 %form = internal_formula_parser($remainder, $low, $high, %form)
252             if ($remainder ne '');
253              
254             # check extract for more parens
255 2         11 %form =
256             internal_formula_parser($extract, $tmp_low, $tmp_high, %form);
257             ## we already know this is ne ''
258             } else { # get formula of complete string
259 15 50       153 %form = add_formula_strings($remainder, $low, $high, %form)
260             if ($remainder ne '');
261             }
262 17         150 return %form;
263             }
264              
265             sub add_formula_strings {
266 15     15 0 37 my ($formula, $low_coef, $high_coef, %elements) = @_;
267              
268             # print "Getting Formula of $formula\n";
269 15 50       125 $formula =~ /^(?:([A-Z][a-z]*)(\d+-?\d*)?)+$/o
270             or croak "Invalid Portion of Formula $formula";
271 15         106 while ($formula =~ m/([A-Z][a-z]*)(?:(\d+)(-?)(\d*))?/go) {
272 33         202 my ($elm, $low, $is_range, $high) = ($1, $2, $3, $4);
273 33 100       90 $low = 1 unless defined $low;
274 33 100       56 if ($is_range) {
275 10 100       33 $high = 1E10 unless length $high;
276             } else {
277 23         52 $high = $low;
278             }
279 33         123 $elements{$elm}{low} += $low_coef * $low;
280 33         163 $elements{$elm}{high} += $high_coef * $high;
281             }
282 15         83 return %elements;
283             }
284              
285             sub ParensBalanced {
286 15     15 0 29 my ($form) = @_;
287 15         27 my @stack = ();
288 15         85 my %pairs = (
289             '<' => '>',
290             '{' => '}',
291             '[' => ']',
292             '(' => ')'
293             );
294              
295 15         102 while ($form =~ m/([<>(){}\]\[])/go) {
296 4         11 my $current = $1;
297 4 100       15 if ($current =~ /[<({\[]/) {
298 2         6 push(@stack, $current);
299 2         9 next;
300             }
301 2 50       7 return 0 if (scalar(@stack) == 0);
302 2 50       14 return 0 if ($current ne $pairs{ pop @stack});
303             }
304 15 50       96 return @stack ? 0 : 1;
305             }
306              
307              
308              
309              
310             1;
311              
312              
313              
314             =head1 VERSION
315              
316             0.10
317              
318             =head1 SEE ALSO
319              
320             L
321              
322             The PerlMol website L
323              
324             =head1 AUTHOR
325              
326             Ivan Tubert-Brohman Eitub@cpan.orgE
327              
328             =head1 COPYRIGHT
329              
330             Copyright (c) 2004 Ivan Tubert-Brohman. All rights reserved. This program is
331             free software; you can redistribute it and/or modify it under the same terms as
332             Perl itself.
333              
334             =cut
335              
336