File Coverage

blib/lib/Chemistry/File/Formula.pm
Criterion Covered Total %
statement 98 99 98.9
branch 31 38 81.5
condition 7 11 63.6
subroutine 12 13 92.3
pod 3 8 37.5
total 151 169 89.3


line stmt bran cond sub pod time code
1             package Chemistry::File::Formula;
2             $VERSION = '0.37';
3             # $Id: Formula.pm,v 1.16 2009/05/10 19:37:59 itubert Exp $
4              
5 3     3   2916 use strict;
  3         7  
  3         178  
6 3     3   23 use base "Chemistry::File";
  3         7  
  3         1599  
7 3     3   889 use Chemistry::Mol;
  3         10  
  3         159  
8 3     3   21 use Carp;
  3         6  
  3         293  
9 3     3   4077 use Text::Balanced qw(extract_bracketed);
  3         49007  
  3         5293  
10              
11             =head1 NAME
12              
13             Chemistry::File::Formula - Molecular formula reader/formatter
14              
15             =head1 SYNOPSIS
16              
17             use Chemistry::File::Formula;
18              
19             my $mol = Chemistry::Mol->parse("H2O");
20             print $mol->print(format => formula);
21             print $mol->formula; # this is a shorthand for the above
22             print $mol->print(format => formula,
23             formula_format => "%s%d{%d});
24              
25             =cut
26              
27             Chemistry::Mol->register_format('formula');
28              
29             =head1 DESCRIPTION
30              
31             This module converts a molecule object to a string with the formula and back.
32             It registers the 'formula' format with Chemistry::Mol. Besides its obvious
33             use, it is included in the Chemistry::Mol distribution because it is a very
34             simple example of a Chemistry::File derived I/O module.
35              
36             =head2 Writing formulas
37              
38             The format can be specified as a printf-like string with the following control
39             sequences, which are specified with the formula_format parameter to $mol->print
40             or $mol->write.
41              
42             =over
43              
44             =item %s symbol
45              
46             =item %D number of atoms
47              
48             =item %d number of atoms, included only when it is greater than one
49              
50             =item %d{substr} substr is only included when number of atoms is greater than
51             one
52              
53             =item %j{substr} substr is inserted between the formatted string for each
54             element. (The 'j' stands for 'joiner'.) The format should have only one joiner,
55             but its location in the format string doesn't matter.
56              
57             =item %% a percent sign
58              
59             =back
60              
61             If no format is specified, the default is "%s%d". Some examples follow. Let's
62             assume that the formula is C2H6O, as it would be formatted by default.
63              
64             =over
65              
66             =item C<< %s%D >>
67              
68             Like the default, but include explicit indices for all atoms.
69             The formula would be formatted as "C2H6O1"
70              
71             =item C<< %s%d{EsubE%dE/subE} >>
72              
73             HTML format. The output would be
74             "CEsubE2E/subEHEsubE6E/subEO".
75              
76             =item C<< %D %s%j{, } >>
77              
78             Use a comma followed by a space as a joiner. The output would be
79             "2 C, 6 H, 1 O".
80              
81             =back
82              
83             =head3 Symbol Sort Order
84              
85             The elements in the formula are sorted by default in the "Hill order", which
86             means that:
87              
88             1) if the formula contains carbon, C goes first, followed by H,
89             and the rest of the symbols in alphabetical order. For example, "CH2BrF".
90              
91             2) if there is no carbon, all the symbols (including H) are listed
92             alphabetically. For example, "BrH".
93              
94             It is possible to supply a custom sorting subroutine with the 'formula_sort'
95             option. It expects a subroutine reference that takes a hash reference
96             describing the formula (similar to what is returned by parse_formula, discussed
97             below), and that returns a list of symbols in the desired order.
98              
99             For example, this will sort the symbols in reverse asciibetical order:
100              
101             my $formula = $mol->print(
102             format => 'formula',
103             formula_sort => sub {
104             my $formula_hash = shift;
105             return reverse sort keys %$formula_hash;
106             }
107             );
108              
109             =head2 Parsing Formulas
110              
111             Formulas can also be parsed back into Chemistry::Mol objects.
112             The formula may have parentheses and square or triangular brackets, and
113             it may have the following abbreviations:
114              
115             Me => '(CH3)',
116             Et => '(CH3CH2)',
117             Bu => '(C4H9)',
118             Bn => '(C6H5CH2)',
119             Cp => '(C5H5)',
120             Ph => '(C6H5)',
121             Bz => '(C6H5CO)',
122              
123             The formula may also be preceded by a number, which multiplies the whole
124             formula. Some examples of valid formulas:
125              
126             =over
127              
128             Formula Equivalent to
129             --------------------------------------------------------------
130             CH3(CH2)3CH3 C5H12
131             C6H3Me3 C9H12
132             2Cu[NH3]4(NO3)2 Cu2H24N12O12
133             2C(C[C5]4)3 C152
134             2C(C(C(C)5)4)3 C152
135             C 1 0 H 2 2 C10H22 (whitespace is completely ignored)
136              
137             =back
138              
139             When a formula is parsed, a molecule object is created which consists of
140             the set of the atoms in the formula (no bonds or coordinates, of course).
141             The atoms are created in alphabetical order, so the molecule object for C2H5Br
142             would have the atoms in the following sequence: Br, C, C, H, H, H, H, H.
143              
144             If you don't want to create a molecule object, but would rather have a simple
145             hash with the number of atoms for each element, use the C
146             method:
147              
148             my %formula = Chemistry::File::Formula->parse_formula("C2H6O");
149             use Data::Dumper;
150             print Dumper \%formula;
151              
152             which prints something like
153              
154             $VAR1 = {
155             'H' => 6,
156             'O' => 1,
157             'C' => 2
158             };
159              
160             The C method is called internally by the C method.
161              
162             =head3 Non-integer numbers in formulas
163              
164             The C method can also accept formulas that contain
165             floating-point numbers, such as H1.5N0.5. The numbers must be positive, and
166             numbers smaller than one should include a leading zero (e.g., 0.9, not .9).
167              
168             When formulas with non-integer numbers of atoms are turned into molecule
169             objects as described in the previous section, the number of atoms is always
170             B. For example, H1.5N0.5 will produce a molecule object with two
171             hydrogen atoms and one nitrogen atom.
172              
173             There is currently no way of I formulas with non-integer numbers;
174             perhaps a future version will include an "occupancy" property for atoms that
175             will result in non-integer formulas.
176              
177             =cut
178              
179             sub parse_string {
180 14     14 1 41 my ($self, $string, %opts) = @_;
181 14   50     41 my $mol_class = $opts{mol_class} || "Chemistry::Mol";
182 14   50     77 my $atom_class = $opts{atom_class} || "Chemistry::Atom";
183 14   50     84 my $bond_class = $opts{bond_class} || "Chemistry::Bond";
184              
185 14         53 my $mol = $mol_class->new;
186 14         45 my %formula = $self->parse_formula($string);
187 14         63 for my $sym (sort keys %formula) {
188 31         92 for (my $i = 0; $i < $formula{$sym}; ++$i) {
189 516         1992 $mol->add_atom($atom_class->new(symbol => $sym));
190             }
191             }
192 14         126 return $mol;
193             }
194              
195             sub write_string {
196 16     16 1 43 my ($self, $mol, %opts) = @_;
197 16         21 my @formula_parts;
198              
199 16   100     75 my $format = $opts{formula_format} || "%s%d"; # default format
200 16         59 my $fh = $mol->formula_hash;
201 16         39 $format =~ s/%%/\\%/g; # escape %% with a \
202 16         27 my $joiner = "";
203 16 50       49 $joiner = $1 if $format =~ s/(?
204              
205 16         20 my @symbols;
206 16 100       38 if ($opts{formula_sort}) {
207 1         5 @symbols = $opts{formula_sort}($fh);
208             } else {
209 15         47 @symbols = $self->sort_symbols($fh);
210             }
211              
212 16         57 for my $sym (@symbols) {
213 42         56 my $s = $format;
214 42         58 my $n = $fh->{$sym};
215 42         141 $s =~ s/(?
216 42         62 $s =~ s/(?
217 42 100       64 $s =~ s/(? 1 ? $1 : ''/eg; # %d{}
  3         9  
218 42 100       131 $s =~ s/(? 1 ? $n : ''/eg; # %d
  40         186  
219 42         68 $s =~ s/\\(.)/$1/g; # other \ escapes
220 42         109 push @formula_parts, $s;
221             }
222 16         132 return join($joiner, @formula_parts);
223             }
224              
225             sub sort_symbols {
226 15     15 0 22 my ($self, $formula_hash) = @_;
227 15         55 my @symbols = keys %$formula_hash;
228 15 100       41 if ($formula_hash->{C}) {
229             # C and H first, followed by alphabetical order
230 13         517 s/^([CH])$/\0$1/ for @symbols;
231 13         45 @symbols = sort @symbols;
232 13         102 s/^\0([CH])$/$1/ for @symbols;
233 13         69 return @symbols;
234             } else {
235             # simple alphabetical order
236 2         11 return sort @symbols;
237             }
238             }
239              
240             sub file_is {
241 0     0 1 0 return 0; # no files are identified automatically as having this format
242             }
243              
244             ### Code derived from formula.pl by Brent Gregersen follows
245              
246             my %macros = (
247             Me => '(CH3)',
248             Et => '(CH3CH2)',
249             Bu => '(C4H9)',
250             Bn => '(C6H5CH2)',
251             Cp => '(C5H5)',
252             Ph => '(C6H5)',
253             Bz => '(C6H5CO)',
254             # Ac is an element
255             # Pr is an element
256             );
257              
258              
259             sub parse_formula {
260 16     16 0 1876 my ($self, $formula) = @_;
261 16         19 my (%elements);
262              
263             #check balancing
264 16 50       42 return %elements if (!ParensBalanced($formula));
265              
266             # replace other grouping with normal parens
267 16         39 $formula =~ tr/<>{}[]/()()()/;
268              
269             # get rid of any spaces
270 16         40 $formula =~ s/\s+//g;
271              
272             # perform macro expansion
273 16         57 foreach (keys(%macros)) {
274 112         1043 $formula =~ s/$_/$macros{$_}/g;
275             }
276              
277             # determine initial compound coeficent
278 16 100       87 my $coef = ($formula =~ s/^(\d+\.?\d*)//) ? $1 : 1.0;
279              
280             # recursively process rest of formula
281 16         43 return internal_formula_parser($formula, $coef, %elements);
282             }
283              
284             sub internal_formula_parser {
285 35     35 0 77 my ($formula, $coef, %form) = @_;
286 35         33 my $tmp_coef;
287              
288 35         367 my ($extract, $remainder, $prefix) =
289             extract_bracketed($formula, '()', '[^(]*');
290              
291 35 100 66     5374 if (defined($extract) and $extract ne '') {
292 15         68 $extract =~ s/^\((.*)\)$/$1/;
293 15 100       73 if ($remainder =~ s/^(\d+\.?\d*)(.*)$/$2/) {
294 13         31 $tmp_coef = $1 * $coef;
295             } else {
296 2         3 $tmp_coef = $coef;
297             }
298              
299             # get formula of prefix ( it has no parens)
300 15 100       54 %form = add_formula_strings($prefix, $coef, %form) if ($prefix ne '');
301              
302             # check remainder for more parens
303 15 100       40 %form = internal_formula_parser($remainder, $coef, %form)
304             if ($remainder ne '');
305              
306             # check extract for more parens
307 15         116 %form =
308             internal_formula_parser($extract, $tmp_coef, %form);
309             ## we already know this is ne ''
310             } else { # get formula of complete string
311 20 50       77 %form = add_formula_strings($remainder, $coef, %form)
312             if ($remainder ne '');
313             }
314 35         167 return %form;
315             }
316              
317             sub add_formula_strings {
318 30     30 0 90 my ($formula, $coef, %elements) = @_;
319              
320             # print "Getting Formula of $formula\n";
321 30 50       383 $formula =~ /^(?:([A-Z][a-z]*)(\d+\.?\d*)?)+$/o # XXX new
322             or croak "Invalid Portion of Formula $formula";
323 30         130 while ($formula =~ m/([A-Z][a-z]*)(\d+\.?\d*)?/go) { # XXX new
324 55         113 my ($elm, $count) = ($1, $2);
325 55 100       100 $count = 1 unless defined $count;
326 55 100       128 if (defined $elements{$elm}) {
327 18         66 $elements{$elm} += $count * $coef;
328             } else {
329 37         245 $elements{$elm} = $count * $coef;
330             }
331             }
332 30         140 return %elements;
333             }
334              
335             sub ParensBalanced {
336 16     16 0 24 my ($form) = @_;
337 16         23 my @stack = ();
338 16         75 my %pairs = (
339             '<' => '>',
340             '{' => '}',
341             '[' => ']',
342             '(' => ')'
343             );
344              
345 16         74 while ($form =~ m/([<>(){}\]\[])/go) {
346 24         41 my $current = $1;
347 24 100       698 if ($current =~ /[<({\[]/) {
348 12         19 push(@stack, $current);
349 12         46 next;
350             }
351 12 50       27 return 0 if (scalar(@stack) == 0);
352 12 50       57 return 0 if ($current ne $pairs{ pop @stack});
353             }
354 16 50       90 return @stack ? 0 : 1;
355             }
356              
357             1;
358              
359             =head1 VERSION
360              
361             0.37
362              
363             =head1 SEE ALSO
364              
365             L, L
366              
367             For discussion about Hill order, just search the web for C
368             order">. The original reference is I B<1900>, I<22>,
369             478-494. L.
370              
371             The PerlMol website L
372              
373             =head1 AUTHOR
374              
375             Ivan Tubert-Brohman .
376              
377             Formula parsing code contributed by Brent Gregersen.
378              
379             Patch for non-integer formulas by Daniel Scott.
380              
381             =head1 COPYRIGHT
382              
383             Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
384             free software; you can redistribute it and/or modify it under the same terms as
385             Perl itself.
386              
387             =cut
388