File Coverage

blib/lib/Chemistry/File/MDLMol.pm
Criterion Covered Total %
statement 172 177 97.1
branch 47 64 73.4
condition 15 23 65.2
subroutine 19 20 95.0
pod 4 10 40.0
total 257 294 87.4


line stmt bran cond sub pod time code
1             package Chemistry::File::MDLMol;
2              
3             our $VERSION = '0.24'; # VERSION
4             # $Id$
5              
6 3     3   147199 use base "Chemistry::File";
  3         16  
  3         1169  
7 3     3   31538 use Chemistry::Mol;
  3         62354  
  3         136  
8 3     3   19 use Carp;
  3         5  
  3         128  
9 3     3   14 use List::Util;
  3         8  
  3         114  
10 3     3   15 use strict;
  3         4  
  3         49  
11 3     3   12 use warnings;
  3         6  
  3         1339  
12              
13             our $DEBUG = 0;
14              
15             =head1 NAME
16              
17             Chemistry::File::MDLMol - MDL molfile reader/writer
18              
19             =head1 SYNOPSIS
20              
21             use Chemistry::File::MDLMol;
22              
23             # read a molecule
24             my $mol = Chemistry::Mol->read('myfile.mol');
25              
26             # write a molecule
27             $mol->write("myfile.mol");
28              
29             # use a molecule as a query for substructure matching
30             use Chemistry::Pattern;
31             use Chemistry::Ring;
32             Chemistry::Ring::aromatize_mol($mol);
33              
34             my $patt = Chemistry::Pattern->read('query.mol');
35             if ($patt->match($mol)) {
36             print "it matches!\n";
37             }
38              
39             =cut
40              
41             Chemistry::Mol->register_format(mdl => __PACKAGE__);
42              
43             =head1 DESCRIPTION
44              
45             MDL Molfile (V2000) reader/writer.
46              
47             This module automatically registers the 'mdl' format with Chemistry::Mol.
48              
49             The first three lines of the molfile are stored as $mol->name,
50             $mol->attr("mdlmol/line2"), and $mol->attr("mdlmol/comment").
51              
52             This version only reads and writes some of the information available in a
53             molfile: it reads coordinates, atom and bond types, charges, isotopes,
54             radicals, and atom lists. It does not read other things such as
55             stereochemistry, 3d properties, etc.
56              
57             This module is part of the PerlMol project, L.
58              
59             =head2 Query properties
60              
61             The MDL molfile format supports query properties such as atom lists, and
62             special bond types such as "single or double", "single or aromatic", "double or
63             aromatic", "ring bond", or "any". These properties are supported by this module
64             in conjunction with L. However, support for query properties
65             is currently read-only, and the other properties listed in the specification
66             are not supported yet.
67              
68             So that atom and bond objects can use these special query options, the
69             conditions are represented as Perl subroutines. The generated code can be
70             read from the 'mdlmol/test_sub' attribute:
71              
72             $atom->attr('mdlmol/test_sub');
73             $bond->attr('mdlmol/test_sub');
74            
75             This may be useful for debugging, such as when an atom doesn't seem to match as
76             expected.
77              
78             =head2 Aromatic Queries
79              
80             To be able to search for aromatic substructures are represented by Kekule
81             structures, molfiles that are read as patterns (with
82             Cread) are aromatized automatically by using the
83             L module. The default bond test from Chemistry::Pattern::Bond
84             is overridden by one that checks the aromaticity in addition to the bond order.
85             The test is,
86              
87             $patt->aromatic ? $bond->aromatic
88             : (!$bond->aromatic && $patt->order == $bond->order);
89              
90             That is, aromatic pattern bonds match aromatic bonds, and aliphatic pattern
91             bonds match aliphatic bonds with the same bond order.
92            
93              
94             =cut
95              
96              
97             # some constants, based on tables from the file format specification
98              
99             my %OLD_CHARGE_MAP = (
100             1 => 3,
101             2 => 2,
102             3 => 1,
103             4 => 0,
104             5 => -1,
105             6 => -2,
106             7 => -3,
107             );
108              
109             my %BOND_TYPE_EXPR = (
110             4 => '($bond->aromatic)',
111             5 => '($bond->order == 1 or $bond->order == 2)',
112             6 => '($bond->order == 1 or $bond->aromatic)',
113             7 => '($bond->order == 2 or $bond->aromatic)',
114             8 => '(1)', # any bond
115             );
116              
117             my %BOND_TOPOLOGY_EXPR = (
118             1 => '@{$bond->attr("ring/rings")||[]}',
119             2 => '! @{$bond->attr("ring/rings")||[]}',
120             );
121              
122              
123             sub read_mol {
124 66     66 1 6619 my ($self, $fh, %opts) = @_;
125 66 100       246 return if $fh->eof;
126              
127 33         744 %opts = ( slurp => 1, %opts );
128 33   100     94 my $mol_class = $opts{mol_class} || "Chemistry::Mol";
129 33   33     201 my $atom_class = $opts{atom_class} || $mol_class->atom_class;
130 33   33     202 my $bond_class = $opts{bond_class} || $mol_class->bond_class;
131 33         117 local $_;
132              
133 33         79 my $mol = $mol_class->new();
134              
135             # header
136 33         758 my $name = <$fh>; chomp $name;
  33         61  
137 33         74 my $line2 = <$fh>; chomp $line2;
  33         43  
138 33         48 my $comment = <$fh>; chomp $comment;
  33         54  
139 33         105 $mol->name($name);
140 33         358 $mol->attr("mdlmol/line2", $line2);
141 33         418 $mol->attr("mdlmol/comment", $comment);
142              
143             # counts line
144 33 50       336 defined ($_ = <$fh>) or croak "unexpected end of file";
145 33         184 my ($na, $nb, undef, undef, $is_chiral) = unpack("A3A3A3A3A3", $_);
146              
147 33         138 $mol->attr("mdlmol/chiral", int $is_chiral);
148              
149 33         309 my %old_charges;
150             my %old_radicals;
151              
152             # atom block
153 33         115 for my $i (1 .. $na) { # for each atom...
154 439 50       32414 defined (my $line = <$fh>) or croak "unexpected end of file";
155 439         1632 my ($x, $y, $z, $symbol, $mass, $charge)
156             = unpack("A10A10A10xA3A2A3", $line);
157            
158             $old_charges{$i} = $OLD_CHARGE_MAP{$charge}
159 439 50       869 if $OLD_CHARGE_MAP{$charge};
160 439 50 33     1311 $old_radicals{$i} = 2
161             if $charge && $charge == 4;
162 439         505 my $mass_number;
163 439 100 66     988 if( int $mass && eval { require Chemistry::Isotope } ) {
  1 50       10  
164 1         5 my $abundance =
165             Chemistry::Isotope::isotope_abundance($symbol);
166 1         15 ($mass_number) = sort { $abundance->{$b} cmp
167 1         5 $abundance->{$a} }
168             sort keys %$abundance;
169 1         4 $mass_number += $mass;
170             } elsif( int $mass ) {
171 0         0 warn "no Chemistry::Isotope, cannot read mass number " .
172             "from atom block\n";
173             }
174             $mol->new_atom(
175 439         1489 symbol => $symbol,
176             coords => [$x*1, $y*1, $z*1],
177             mass_number => $mass_number,
178             );
179             }
180              
181             # bond block
182 33         2431 for my $i (1 .. $nb) { # for each bond...
183 3     3   21 no warnings 'numeric';
  3         6  
  3         1651  
184 451 50       1265 defined ($_ = <$fh>) or croak "unexpected end of file";
185             my ($a1, $a2, $type, $stereo, $topology, $rxn)
186 451         1515 = map {$_*1} unpack("A3A3A3A3x3A3A3", $_);
  2706         4225  
187 451 100       1713 my $order = $type =~ /^[123]$/ ? $type : 1;
188 451         986 my $bond = $mol->new_bond(
189             type => $type,
190             atoms => [$mol->atoms($a1,$a2)],
191             order => $order,
192             );
193 451 100       49912 if ($mol->isa('Chemistry::Pattern')) {
194 9         22 $self->bond_expr($bond, $i, $type, $topology);
195             }
196             }
197              
198             # properties block
199 33         118 while (<$fh>) {
200 46 100 66     477 if (/^M END/ or /^\$\$\$\$/) {
    100          
201 33         53 last;
202             } elsif (/^M (...)/) { # generic extended property handler
203 12 100 100     58 if ($1 eq 'CHG' or $1 eq 'RAD'){ # XXX
204             # clear old-style info
205 10         21 %old_radicals = ();
206 10         14 %old_charges = ();
207             }
208              
209 12         29 my $method = "M_$1";
210 12 50       102 $self->$method($mol, $_) if $self->can($method);
211             }
212             }
213              
214             # add old-style charges and radicals if they still apply
215 33         100 while (my ($key, $val) = each %old_charges) {
216 0         0 $mol->atoms($key)->formal_charge($val);
217             }
218 33         71 while (my ($key, $val) = each %old_radicals) {
219 0         0 $mol->atoms($key)->formal_radical($val);
220             }
221              
222             # make sure we get to the end of the file
223 33 50       66 if ($opts{slurp}) {
224 33         658 1 while <$fh>;
225             }
226              
227 33         149 $mol->add_implicit_hydrogens;
228              
229 33 100       21499 if ($mol->isa('Chemistry::Pattern')) {
230 2         500 require Chemistry::Ring;
231 2         5852 Chemistry::Ring::aromatize_mol($mol);
232             }
233              
234 33         4408 return $mol;
235             }
236              
237             sub bond_expr {
238 9     9 0 20 my ($self, $bond, $i, $type, $topology) = @_;
239 9         11 my @bond_exprs;
240 9         17 my $s = $BOND_TOPOLOGY_EXPR{$topology};
241 9 100       15 push @bond_exprs, $s if $s;
242 9         14 $s = $BOND_TYPE_EXPR{$type};
243 9 100       16 push @bond_exprs, $s if $s;
244 9 100       15 if (@bond_exprs) {
245 1         4 my $expr = join " and ", @bond_exprs;
246 1         5 my $sub_txt = <
247             sub {
248             no warnings;
249             my (\$patt, \$bond) = \@_;
250             $expr;
251             };
252             SUB
253 1 50       4 print "MDLMol bond($i) sub: <<<<$sub_txt>>>\n" if $DEBUG;
254 1         3 $bond->attr('mdlmol/test_sub' => $sub_txt);
255 1     1   113 $bond->test_sub(eval $sub_txt);
  1         10  
  1         2  
  1         119  
256             } else { # default bond sub
257 8         19 $bond->test_sub(\&default_bond_test);
258             }
259             }
260              
261             sub default_bond_test {
262 3     3   23 no warnings;
  3         12  
  3         2083  
263 138     138 0 41732 my ($patt, $bond) = @_;
264 138 100 100     241 $patt->aromatic ? $bond->aromatic
265             : (!$bond->aromatic && $patt->order == $bond->order);
266             }
267              
268              
269             sub M_CHG {
270 6     6 0 20 my ($self, $mol, $line) = @_;
271 6         49 my ($m, $type, $n, %data) = split " ", $line;
272 6         41 while (my ($key, $val) = each %data) {
273 10         86 $mol->atoms($key)->formal_charge($val);
274             }
275             }
276              
277             sub M_RAD {
278 4     4 0 13 my ($self, $mol, $line) = @_;
279 4         28 my ($m, $type, $n, %data) = split " ", $line;
280 4         18 while (my ($key, $val) = each %data) {
281 4         12 $mol->atoms($key)->formal_radical($val);
282             }
283             }
284              
285             sub M_ISO {
286 1     1 0 3 my ($self, $mol, $line) = @_;
287 1         5 my ($m, $type, $n, %data) = split " ", $line;
288 1         6 while (my ($key, $val) = each %data) {
289 1         4 $mol->atoms($key)->mass_number($val);
290             }
291             }
292              
293             sub M_ALS {
294 1     1 0 5 my ($self, $mol, $line) = @_;
295              
296 1 50       6 return unless $line =~ /^M ALS (...)(...) (.)/;
297 1         6 my ($n, $cnt, $exclude) = ($1, $2, $3);
298 1         2 my @symbols;
299 1 50       4 $exclude = $exclude =~ /^[Tt]$/ ? '!' : '';
300              
301             # parse the symbols out of the atom list line
302 1         5 for my $i (0 .. $cnt-1) {
303 4         10 my $s = substr $_, 16+$i*4, 4;
304 4         10 $s =~ s/\s+//g;
305 4         9 push @symbols, $s;
306             }
307              
308             # save attr
309 1         6 $mol->atoms($n)->attr('mdlmol/atom_list' => $_);
310              
311             # create test sub
312 1 50       28 if ($mol->isa('Chemistry::Pattern')) {
313 1         4 my $sub_txt = <
314             sub{
315             my (\$patt, \$atom) = \@_;
316             my \$sym = \$atom->symbol;
317             $exclude (List::Util::first {\$sym eq \$_} \@symbols);
318             };
319             SUB
320 1 50       3 print "MDLMol atom($n) sub: <<<<$sub_txt>>>\n" if $DEBUG;
321 1 50       3 print "MDLMol symbol list: (@symbols)\n" if $DEBUG;
322 1         8 $mol->atoms($n)->attr('mdlmol/test_sub' => $sub_txt);
323 1         20 $mol->atoms($n)->test_sub(eval $sub_txt);
324             }
325             }
326              
327             sub name_is {
328 0     0 1 0 my ($self, $fname) = @_;
329 0         0 $fname =~ /\.mol$/i;
330             }
331              
332              
333             sub file_is {
334 17     17 1 40759 my ($self, $fname) = @_;
335 17         82 $fname =~ /\.mol$/i;
336             }
337              
338              
339             sub write_string {
340 10     10 1 1700 my ($self, $mol, %opts) = @_;
341              
342 3     3   21 no warnings 'uninitialized';
  3         12  
  3         1387  
343 10         27 my $s = sprintf "%s\n perlmol \n\n", $mol->name;
344 10         152 $s .= sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s\n",
345             0+$mol->atoms, 0+$mol->bonds,
346             0, 0, 0, 0, 0, 0, 0, 0, 999, "V2000"; # "counts" line
347              
348 10         136 my $i = 1;
349 10         28 my %idx_map;
350             my @charged_atoms;
351 10         0 my @isotope_atoms;
352 10         0 my @radical_atoms;
353 10         20 for my $atom ($mol->atoms) {
354 176         842 my ($x, $y, $z) = $atom->coords->array;
355              
356 176         1507 $s .= sprintf
357             "%10.4f%10.4f%10.4f %-3s%2i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i\n",
358             $x, $y, $z, $atom->symbol,
359             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0;
360 176 100       1555 push @charged_atoms, $i if $atom->formal_charge;
361 176 100       819 push @isotope_atoms, $i if $atom->mass_number;
362 176 100       741 push @radical_atoms, $i if $atom->formal_radical;
363 176         719 $idx_map{$atom->id} = $i++;
364             }
365              
366 10         63 for my $bond ($mol->bonds) {
367 184         1081 my ($a1, $a2) = map {$idx_map{$_->id}} $bond->atoms;
  368         1829  
368 184         896 $s .= sprintf "%3i%3i%3i%3i%3i%3i%3i\n",
369             $a1, $a2, $bond->order,
370             0, 0, 0, 0;
371             }
372            
373 10         84 while (@charged_atoms) {
374 1 50       4 my $n = @charged_atoms > 8 ? 8 : @charged_atoms;
375 1         4 $s .= "M CHG $n";
376 1         3 for my $key (splice @charged_atoms, 0, $n) {
377 1         3 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->formal_charge;
378             }
379 1         15 $s .= "\n";
380             }
381 10         21 while (@isotope_atoms) {
382 1 50       72 my $n = @isotope_atoms > 8 ? 8 : @isotope_atoms;
383 1         6 $s .= "M ISO $n";
384 1         4 for my $key (splice @isotope_atoms, 0, $n) {
385 2         26 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->mass_number;
386             }
387 1         15 $s .= "\n";
388             }
389 10         22 while (@radical_atoms) {
390 1 50       3 my $n = @radical_atoms > 8 ? 8 : @radical_atoms;
391 1         3 $s .= "M RAD $n";
392 1         3 for my $key (splice @radical_atoms, 0, $n) {
393 1         3 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->formal_radical;
394             }
395 1         27 $s .= "\n";
396             }
397              
398 10         20 $s .= "M END\n";
399 10         102 $s;
400             }
401              
402              
403              
404             1;
405              
406             =head1 SOURCE CODE REPOSITORY
407              
408             L
409              
410             =head1 SEE ALSO
411              
412             L
413              
414             The MDL file format specification.
415             L,
416             L,
417             L,
418             or Arthur Dalby et al., J. Chem. Inf. Comput. Sci, 1992, 32, 244-255.
419             L.
420              
421             =head1 AUTHOR
422              
423             Ivan Tubert-Brohman
424              
425             =head1 COPYRIGHT
426              
427             Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is
428             free software; you can redistribute it and/or modify it under the same terms as
429             Perl itself.
430              
431             =cut
432