File Coverage

blib/lib/Chemistry/File/Formula.pm

Criterion	Covered	Total	%
statement	98	99	98.9
branch	31	38	81.5
condition	7	11	63.6
subroutine	12	13	92.3
pod	3	8	37.5
total	151	169	89.3

line	stmt	bran	cond	sub	pod	time	code
1							package Chemistry::File::Formula;
2
3							our $VERSION = '0.39'; # VERSION
4							# $Id$
5
6	3			3		2127	use strict;
	3					6
	3					76
7	3			3		13	use base "Chemistry::File";
	3					4
	3					954
8	3			3		467	use Chemistry::Mol;
	3					5
	3					101
9	3			3		14	use Carp;
	3					5
	3					131
10	3			3		1639	use Text::Balanced qw(extract_bracketed);
	3					24657
	3					3452
11
12							=head1 NAME
13
14							Chemistry::File::Formula - Molecular formula reader/formatter
15
16							=head1 SYNOPSIS
17
18							use Chemistry::File::Formula;
19
20							my $mol = Chemistry::Mol->parse("H2O");
21							print $mol->print(format => formula);
22							print $mol->formula; # this is a shorthand for the above
23							print $mol->print(format => formula,
24							formula_format => "%s%d{_%d});
25
26							=cut
27
28							Chemistry::Mol->register_format('formula');
29
30							=head1 DESCRIPTION
31
32							This module converts a molecule object to a string with the formula and back.
33							It registers the 'formula' format with Chemistry::Mol. Besides its obvious
34							use, it is included in the Chemistry::Mol distribution because it is a very
35							simple example of a Chemistry::File derived I/O module.
36
37							=head2 Writing formulas
38
39							The format can be specified as a printf-like string with the following control
40							sequences, which are specified with the formula_format parameter to $mol->print
41							or $mol->write.
42
43							=over
44
45							=item %s symbol
46
47							=item %D number of atoms
48
49							=item %d number of atoms, included only when it is greater than one
50
51							=item %d{substr} substr is only included when number of atoms is greater than
52							one
53
54							=item %j{substr} substr is inserted between the formatted string for each
55							element. (The 'j' stands for 'joiner'.) The format should have only one joiner,
56							but its location in the format string doesn't matter.
57
58							=item %% a percent sign
59
60							=back
61
62							If no format is specified, the default is "%s%d". Some examples follow. Let's
63							assume that the formula is C2H6O, as it would be formatted by default.
64
65							=over
66
67							=item C<< %s%D >>
68
69							Like the default, but include explicit indices for all atoms.
70							The formula would be formatted as "C2H6O1"
71
72							=item C<< %s%d{EsubE%dE/subE} >>
73
74							HTML format. The output would be
75							"CEsubE2E/subEHEsubE6E/subEO".
76
77							=item C<< %D %s%j{, } >>
78
79							Use a comma followed by a space as a joiner. The output would be
80							"2 C, 6 H, 1 O".
81
82							=back
83
84							=head3 Symbol Sort Order
85
86							The elements in the formula are sorted by default in the "Hill order", which
87							means that:
88
89							1) if the formula contains carbon, C goes first, followed by H,
90							and the rest of the symbols in alphabetical order. For example, "CH2BrF".
91
92							2) if there is no carbon, all the symbols (including H) are listed
93							alphabetically. For example, "BrH".
94
95							It is possible to supply a custom sorting subroutine with the 'formula_sort'
96							option. It expects a subroutine reference that takes a hash reference
97							describing the formula (similar to what is returned by parse_formula, discussed
98							below), and that returns a list of symbols in the desired order.
99
100							For example, this will sort the symbols in reverse asciibetical order:
101
102							my $formula = $mol->print(
103							format => 'formula',
104							formula_sort => sub {
105							my $formula_hash = shift;
106							return reverse sort keys %$formula_hash;
107							}
108							);
109
110							=head2 Parsing Formulas
111
112							Formulas can also be parsed back into Chemistry::Mol objects.
113							The formula may have parentheses and square or triangular brackets, and
114							it may have the following abbreviations:
115
116							Me => '(CH3)',
117							Et => '(CH3CH2)',
118							Bu => '(C4H9)',
119							Bn => '(C6H5CH2)',
120							Cp => '(C5H5)',
121							Ph => '(C6H5)',
122							Bz => '(C6H5CO)',
123
124							The formula may also be preceded by a number, which multiplies the whole
125							formula. Some examples of valid formulas:
126
127							=over
128
129							Formula Equivalent to
130							--------------------------------------------------------------
131							CH3(CH2)3CH3 C5H12
132							C6H3Me3 C9H12
133							2Cu[NH3]4(NO3)2 Cu2H24N12O12
134							2C(C[C5]4)3 C152
135							2C(C(C(C)5)4)3 C152
136							C 1 0 H 2 2 C10H22 (whitespace is completely ignored)
137
138							=back
139
140							When a formula is parsed, a molecule object is created which consists of
141							the set of the atoms in the formula (no bonds or coordinates, of course).
142							The atoms are created in alphabetical order, so the molecule object for C2H5Br
143							would have the atoms in the following sequence: Br, C, C, H, H, H, H, H.
144
145							If you don't want to create a molecule object, but would rather have a simple
146							hash with the number of atoms for each element, use the C
147							method:
148
149							my %formula = Chemistry::File::Formula->parse_formula("C2H6O");
150							use Data::Dumper;
151							print Dumper \%formula;
152
153							which prints something like
154
155							$VAR1 = {
156							'H' => 6,
157							'O' => 1,
158							'C' => 2
159							};
160
161							The C method is called internally by the C method.
162
163							=head3 Non-integer numbers in formulas
164
165							The C method can also accept formulas that contain
166							floating-point numbers, such as H1.5N0.5. The numbers must be positive, and
167							numbers smaller than one should include a leading zero (e.g., 0.9, not .9).
168
169							When formulas with non-integer numbers of atoms are turned into molecule
170							objects as described in the previous section, the number of atoms is always
171							B. For example, H1.5N0.5 will produce a molecule object with two
172							hydrogen atoms and one nitrogen atom.
173
174							There is currently no way of I formulas with non-integer numbers;
175							perhaps a future version will include an "occupancy" property for atoms that
176							will result in non-integer formulas.
177
178							=cut
179
180							sub parse_string {
181	14			14	1	31	my ($self, $string, %opts) = @_;
182	14		50			29	my $mol_class = $opts{mol_class} \|\| "Chemistry::Mol";
183	14		50			49	my $atom_class = $opts{atom_class} \|\| "Chemistry::Atom";
184	14		50			35	my $bond_class = $opts{bond_class} \|\| "Chemistry::Bond";
185
186	14					28	my $mol = $mol_class->new;
187	14					28	my %formula = $self->parse_formula($string);
188	14					45	for my $sym (sort keys %formula) {
189	31					63	for (my $i = 0; $i < $formula{$sym}; ++$i) {
190	516					915	$mol->add_atom($atom_class->new(symbol => $sym));
191							}
192							}
193	14					115	return $mol;
194							}
195
196							sub write_string {
197	16			16	1	30	my ($self, $mol, %opts) = @_;
198	16					21	my @formula_parts;
199
200	16		100			53	my $format = $opts{formula_format} \|\| "%s%d"; # default format
201	16					32	my $fh = $mol->formula_hash;
202	16					35	$format =~ s/%%/\\%/g; # escape %% with a \
203	16					25	my $joiner = "";
204	16	50				46	$joiner = $1 if $format =~ s/(?
205
206	16					16	my @symbols;
207	16	100				23	if ($opts{formula_sort}) {
208	1					3	@symbols = $opts{formula_sort}($fh);
209							} else {
210	15					34	@symbols = $self->sort_symbols($fh);
211							}
212
213	16					30	for my $sym (@symbols) {
214	42					50	my $s = $format;
215	42					53	my $n = $fh->{$sym};
216	42					104	$s =~ s/(?
217	42					76	$s =~ s/(?
218	42	100				52	$s =~ s/(? 1 ? $1 : ''/eg; # %d{}
	3					11
219	42	100				104	$s =~ s/(? 1 ? $n : ''/eg; # %d
	40					89
220	42					65	$s =~ s/\\(.)/$1/g; # other \ escapes
221	42					83	push @formula_parts, $s;
222							}
223	16					89	return join($joiner, @formula_parts);
224							}
225
226							sub sort_symbols {
227	15			15	0	23	my ($self, $formula_hash) = @_;
228	15					36	my @symbols = keys %$formula_hash;
229	15	100				29	if ($formula_hash->{C}) {
230							# C and H first, followed by alphabetical order
231	13					114	s/^([CH])$/\0$1/ for @symbols;
232	13					37	@symbols = sort @symbols;
233	13					61	s/^\0([CH])$/$1/ for @symbols;
234	13					44	return @symbols;
235							} else {
236							# simple alphabetical order
237	2					7	return sort @symbols;
238							}
239							}
240
241							sub file_is {
242	0			0	1	0	return 0; # no files are identified automatically as having this format
243							}
244
245							### Code derived from formula.pl by Brent Gregersen follows
246
247							my %macros = (
248							Me => '(CH3)',
249							Et => '(CH3CH2)',
250							Bu => '(C4H9)',
251							Bn => '(C6H5CH2)',
252							Cp => '(C5H5)',
253							Ph => '(C6H5)',
254							Bz => '(C6H5CO)',
255							# Ac is an element
256							# Pr is an element
257							);
258
259
260							sub parse_formula {
261	16			16	0	1124	my ($self, $formula) = @_;
262	16					19	my (%elements);
263
264							#check balancing
265	16	50				23	return %elements if (!ParensBalanced($formula));
266
267							# replace other grouping with normal parens
268	16					29	$formula =~ tr/<>{}[]/()()()/;
269
270							# get rid of any spaces
271	16					32	$formula =~ s/\s+//g;
272
273							# perform macro expansion
274	16					39	foreach (keys(%macros)) {
275	112					638	$formula =~ s/$_/$macros{$_}/g;
276							}
277
278							# determine initial compound coeficent
279	16	100				62	my $coef = ($formula =~ s/^(\d+\.?\d*)//) ? $1 : 1.0;
280
281							# recursively process rest of formula
282	16					35	return internal_formula_parser($formula, $coef, %elements);
283							}
284
285							sub internal_formula_parser {
286	35			35	0	65	my ($formula, $coef, %form) = @_;
287	35					40	my $tmp_coef;
288
289	35					75	my ($extract, $remainder, $prefix) =
290							extract_bracketed($formula, '()', '[^(]*');
291
292	35	100	66			2626	if (defined($extract) and $extract ne '') {
293	15					57	$extract =~ s/^$(.*)$$/$1/;
294	15	100				57	if ($remainder =~ s/^(\d+\.?\d)(.)$/$2/) {
295	13					24	$tmp_coef = $1 * $coef;
296							} else {
297	2					9	$tmp_coef = $coef;
298							}
299
300							# get formula of prefix ( it has no parens)
301	15	100				40	%form = add_formula_strings($prefix, $coef, %form) if ($prefix ne '');
302
303							# check remainder for more parens
304	15	100				41	%form = internal_formula_parser($remainder, $coef, %form)
305							if ($remainder ne '');
306
307							# check extract for more parens
308	15					33	%form =
309							internal_formula_parser($extract, $tmp_coef, %form);
310							## we already know this is ne ''
311							} else { # get formula of complete string
312	20	50				51	%form = add_formula_strings($remainder, $coef, %form)
313							if ($remainder ne '');
314							}
315	35					103	return %form;
316							}
317
318							sub add_formula_strings {
319	30			30	0	67	my ($formula, $coef, %elements) = @_;
320
321							# print "Getting Formula of $formula\n";
322	30	50				140	$formula =~ /^(?:([A-Z][a-z])(\d+\.?\d)?)+$/o # XXX new
323							or croak "Invalid Portion of Formula $formula";
324	30					109	while ($formula =~ m/([A-Z][a-z])(\d+\.?\d)?/go) { # XXX new
325	55					108	my ($elm, $count) = ($1, $2);
326	55	100				87	$count = 1 unless defined $count;
327	55	100				89	if (defined $elements{$elm}) {
328	18					44	$elements{$elm} += $count * $coef;
329							} else {
330	37					117	$elements{$elm} = $count * $coef;
331							}
332							}
333	30					99	return %elements;
334							}
335
336							sub ParensBalanced {
337	16			16	0	27	my ($form) = @_;
338	16					24	my @stack = ();
339	16					46	my %pairs = (
340							'<' => '>',
341							'{' => '}',
342							'[' => ']',
343							'(' => ')'
344							);
345
346	16					59	while ($form =~ m/([<>(){}\]\[])/go) {
347	24					39	my $current = $1;
348	24	100				44	if ($current =~ /[<({\[]/) {
349	12					16	push(@stack, $current);
350	12					23	next;
351							}
352	12	50				21	return 0 if (scalar(@stack) == 0);
353	12	50				35	return 0 if ($current ne $pairs{ pop @stack});
354							}
355	16	50				55	return @stack ? 0 : 1;
356							}
357
358							1;
359
360							=head1 SOURCE CODE REPOSITORY
361
362							L
363
364							=head1 SEE ALSO
365
366							L, L
367
368							For discussion about Hill order, just search the web for C
369							order">. The original reference is I B<1900>, I<22>,
370							478-494. L.
371
372							=head1 AUTHOR
373
374							Ivan Tubert-Brohman .
375
376							Formula parsing code contributed by Brent Gregersen.
377
378							Patch for non-integer formulas by Daniel Scott.
379
380							=head1 COPYRIGHT
381
382							Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
383							free software; you can redistribute it and/or modify it under the same terms as
384							Perl itself.
385
386							=cut
387