File Coverage

blib/lib/Chemistry/File/MDLMol.pm
Criterion Covered Total %
statement 171 176 97.1
branch 47 64 73.4
condition 15 23 65.2
subroutine 19 20 95.0
pod 4 10 40.0
total 256 293 87.3


line stmt bran cond sub pod time code
1             package Chemistry::File::MDLMol;
2              
3             our $VERSION = '0.22'; # VERSION
4             # $Id$
5              
6 3     3   156708 use base "Chemistry::File";
  3         18  
  3         1503  
7 3     3   39441 use Chemistry::Mol;
  3         73271  
  3         231  
8 3     3   35 use Carp;
  3         9  
  3         197  
9 3     3   19 use List::Util;
  3         10  
  3         168  
10 3     3   20 use strict;
  3         15  
  3         90  
11 3     3   18 use warnings;
  3         7  
  3         1514  
12              
13             our $DEBUG = 0;
14              
15             =head1 NAME
16              
17             Chemistry::File::MDLMol - MDL molfile reader/writer
18              
19             =head1 SYNOPSIS
20              
21             use Chemistry::File::MDLMol;
22              
23             # read a molecule
24             my $mol = Chemistry::Mol->read('myfile.mol');
25              
26             # write a molecule
27             $mol->write("myfile.mol");
28              
29             # use a molecule as a query for substructure matching
30             use Chemistry::Pattern;
31             use Chemistry::Ring;
32             Chemistry::Ring::aromatize_mol($mol);
33              
34             my $patt = Chemistry::Pattern->read('query.mol');
35             if ($patt->match($mol)) {
36             print "it matches!\n";
37             }
38              
39             =cut
40              
41             Chemistry::Mol->register_format(mdl => __PACKAGE__);
42              
43             =head1 DESCRIPTION
44              
45             MDL Molfile (V2000) reader/writer.
46              
47             This module automatically registers the 'mdl' format with Chemistry::Mol.
48              
49             The first three lines of the molfile are stored as $mol->name,
50             $mol->attr("mdlmol/line2"), and $mol->attr("mdlmol/comment").
51              
52             This version only reads and writes some of the information available in a
53             molfile: it reads coordinates, atom and bond types, charges, isotopes,
54             radicals, and atom lists. It does not read other things such as
55             stereochemistry, 3d properties, etc.
56              
57             This module is part of the PerlMol project, L.
58              
59             =head2 Query properties
60              
61             The MDL molfile format supports query properties such as atom lists, and
62             special bond types such as "single or double", "single or aromatic", "double or
63             aromatic", "ring bond", or "any". These properties are supported by this module
64             in conjunction with L. However, support for query properies
65             is currently read-only, and the other properties listed in the specification
66             are not supported yet.
67              
68             So that atom and bond objects can use these special query options, the
69             conditions are represented as Perl subroutines. The generated code can be
70             read from the 'mdlmol/test_sub' attribute:
71              
72             $atom->attr('mdlmol/test_sub');
73             $bond->attr('mdlmol/test_sub');
74            
75             This may be useful for debugging, such as when an atom doesn't seem to match as
76             expected.
77              
78             =head2 Aromatic Queries
79              
80             To be able to search for aromatic substructures are represented by Kekule
81             structures, molfiles that are read as patterns (with
82             Cread) are aromatized automatically by using the
83             L module. The default bond test from Chemistry::Pattern::Bond
84             is overriden by one that checks the aromaticity in addition to the bond order.
85             The test is,
86              
87             $patt->aromatic ? $bond->aromatic
88             : (!$bond->aromatic && $patt->order == $bond->order);
89              
90             That is, aromatic pattern bonds match aromatic bonds, and aliphatic pattern
91             bonds match aliphatic bonds with the same bond order.
92            
93              
94             =cut
95              
96              
97             # some constants, based on tables from the file format specification
98              
99             my %OLD_CHARGE_MAP = (
100             1 => 3,
101             2 => 2,
102             3 => 1,
103             4 => 0,
104             5 => -1,
105             6 => -2,
106             7 => -3,
107             );
108              
109             my %BOND_TYPE_EXPR = (
110             4 => '($bond->aromatic)',
111             5 => '($bond->order == 1 or $bond->order == 2)',
112             6 => '($bond->order == 1 or $bond->aromatic)',
113             7 => '($bond->order == 2 or $bond->aromatic)',
114             8 => '(1)', # any bond
115             );
116              
117             my %BOND_TOPOLOGY_EXPR = (
118             1 => '@{$bond->attr("ring/rings")||[]}',
119             2 => '! @{$bond->attr("ring/rings")||[]}',
120             );
121              
122              
123             sub read_mol {
124 66     66 1 9221 my ($self, $fh, %opts) = @_;
125 66 100       322 return if $fh->eof;
126              
127 33         786 %opts = ( slurp => 1, %opts );
128 33   100     140 my $mol_class = $opts{mol_class} || "Chemistry::Mol";
129 33   33     273 my $atom_class = $opts{atom_class} || $mol_class->atom_class;
130 33   33     290 my $bond_class = $opts{bond_class} || $mol_class->bond_class;
131 33         179 local $_;
132              
133 33         168 my $mol = $mol_class->new();
134              
135             # header
136 33         1098 my $name = <$fh>; chomp $name;
  33         97  
137 33         98 my $line2 = <$fh>; chomp $line2;
  33         68  
138 33         69 my $comment = <$fh>; chomp $comment;
  33         59  
139 33         151 $mol->name($name);
140 33         464 $mol->attr("mdlmol/line2", $line2);
141 33         569 $mol->attr("mdlmol/comment", $comment);
142              
143             # counts line
144 33 50       456 defined ($_ = <$fh>) or croak "unexpected end of file";
145 33         210 my ($na, $nb) = unpack("A3A3", $_);
146              
147 33         94 my %old_charges;
148             my %old_radicals;
149              
150             # atom block
151 33         155 for my $i (1 .. $na) { # for each atom...
152 439 50       39793 defined (my $line = <$fh>) or croak "unexpected end of file";
153 439         1935 my ($x, $y, $z, $symbol, $mass, $charge)
154             = unpack("A10A10A10xA3A2A3", $line);
155            
156             $old_charges{$i} = $OLD_CHARGE_MAP{$charge}
157 439 50       1093 if $OLD_CHARGE_MAP{$charge};
158 439 50 33     1699 $old_radicals{$i} = 2
159             if $charge && $charge == 4;
160 439         585 my $mass_number;
161 439 100 66     1283 if( int $mass && eval { require Chemistry::Isotope } ) {
  1 50       14  
162 1         6 my $abundance =
163             Chemistry::Isotope::isotope_abundance($symbol);
164 1         22 ($mass_number) = sort { $abundance->{$b} cmp
165 1         7 $abundance->{$a} }
166             sort keys %$abundance;
167 1         5 $mass_number += $mass;
168             } elsif( int $mass ) {
169 0         0 warn "no Chemistry::Isotope, cannot read mass number " .
170             "from atom block\n";
171             }
172             $mol->new_atom(
173 439         1809 symbol => $symbol,
174             coords => [$x*1, $y*1, $z*1],
175             mass_number => $mass_number,
176             );
177             }
178              
179             # bond block
180 33         3095 for my $i (1 .. $nb) { # for each bond...
181 3     3   26 no warnings 'numeric';
  3         6  
  3         2038  
182 451 50       1550 defined ($_ = <$fh>) or croak "unexpected end of file";
183             my ($a1, $a2, $type, $stereo, $topology, $rxn)
184 451         1832 = map {$_*1} unpack("A3A3A3A3x3A3A3", $_);
  2706         5233  
185 451 100       2127 my $order = $type =~ /^[123]$/ ? $type : 1;
186 451         1262 my $bond = $mol->new_bond(
187             type => $type,
188             atoms => [$mol->atoms($a1,$a2)],
189             order => $order,
190             );
191 451 100       61260 if ($mol->isa('Chemistry::Pattern')) {
192 9         23 $self->bond_expr($bond, $i, $type, $topology);
193             }
194             }
195              
196             # properties block
197 33         168 while (<$fh>) {
198 46 100 66     597 if (/^M END/ or /^\$\$\$\$/) {
    100          
199 33         91 last;
200             } elsif (/^M (...)/) { # generic extended property handler
201 12 100 100     104 if ($1 eq 'CHG' or $1 eq 'RAD'){ # XXX
202             # clear old-style info
203 10         34 %old_radicals = ();
204 10         24 %old_charges = ();
205             }
206              
207 12         40 my $method = "M_$1";
208 12 50       139 $self->$method($mol, $_) if $self->can($method);
209             }
210             }
211              
212             # add old-style charges and radicals if they still apply
213 33         149 while (my ($key, $val) = each %old_charges) {
214 0         0 $mol->atoms($key)->formal_charge($val);
215             }
216 33         109 while (my ($key, $val) = each %old_radicals) {
217 0         0 $mol->atoms($key)->formal_radical($val);
218             }
219              
220             # make sure we get to the end of the file
221 33 50       98 if ($opts{slurp}) {
222 33         819 1 while <$fh>;
223             }
224              
225 33         189 $mol->add_implicit_hydrogens;
226              
227 33 100       26873 if ($mol->isa('Chemistry::Pattern')) {
228 2         771 require Chemistry::Ring;
229 2         6929 Chemistry::Ring::aromatize_mol($mol);
230             }
231              
232 33         5585 return $mol;
233             }
234              
235             sub bond_expr {
236 9     9 0 21 my ($self, $bond, $i, $type, $topology) = @_;
237 9         17 my @bond_exprs;
238 9         15 my $s = $BOND_TOPOLOGY_EXPR{$topology};
239 9 100       21 push @bond_exprs, $s if $s;
240 9         16 $s = $BOND_TYPE_EXPR{$type};
241 9 100       17 push @bond_exprs, $s if $s;
242 9 100       20 if (@bond_exprs) {
243 1         5 my $expr = join " and ", @bond_exprs;
244 1         4 my $sub_txt = <
245             sub {
246             no warnings;
247             my (\$patt, \$bond) = \@_;
248             $expr;
249             };
250             SUB
251 1 50       5 print "MDLMol bond($i) sub: <<<<$sub_txt>>>\n" if $DEBUG;
252 1         4 $bond->attr('mdlmol/test_sub' => $sub_txt);
253 1     1   120 $bond->test_sub(eval $sub_txt);
  1         12  
  1         3  
  1         116  
254             } else { # default bond sub
255 8         22 $bond->test_sub(\&default_bond_test);
256             }
257             }
258              
259             sub default_bond_test {
260 3     3   27 no warnings;
  3         6  
  3         2393  
261 138     138 0 53765 my ($patt, $bond) = @_;
262 138 100 100     296 $patt->aromatic ? $bond->aromatic
263             : (!$bond->aromatic && $patt->order == $bond->order);
264             }
265              
266              
267             sub M_CHG {
268 6     6 0 25 my ($self, $mol, $line) = @_;
269 6         57 my ($m, $type, $n, %data) = split " ", $line;
270 6         42 while (my ($key, $val) = each %data) {
271 10         110 $mol->atoms($key)->formal_charge($val);
272             }
273             }
274              
275             sub M_RAD {
276 4     4 0 17 my ($self, $mol, $line) = @_;
277 4         22 my ($m, $type, $n, %data) = split " ", $line;
278 4         23 while (my ($key, $val) = each %data) {
279 4         15 $mol->atoms($key)->formal_radical($val);
280             }
281             }
282              
283             sub M_ISO {
284 1     1 0 4 my ($self, $mol, $line) = @_;
285 1         7 my ($m, $type, $n, %data) = split " ", $line;
286 1         7 while (my ($key, $val) = each %data) {
287 1         8 $mol->atoms($key)->mass_number($val);
288             }
289             }
290              
291             sub M_ALS {
292 1     1 0 4 my ($self, $mol, $line) = @_;
293              
294 1 50       7 return unless $line =~ /^M ALS (...)(...) (.)/;
295 1         6 my ($n, $cnt, $exclude) = ($1, $2, $3);
296 1         2 my @symbols;
297 1 50       5 $exclude = $exclude =~ /^[Tt]$/ ? '!' : '';
298              
299             # parse the symbols out of the atom list line
300 1         5 for my $i (0 .. $cnt-1) {
301 4         11 my $s = substr $_, 16+$i*4, 4;
302 4         14 $s =~ s/\s+//g;
303 4         10 push @symbols, $s;
304             }
305              
306             # save attr
307 1         4 $mol->atoms($n)->attr('mdlmol/atom_list' => $_);
308              
309             # create test sub
310 1 50       32 if ($mol->isa('Chemistry::Pattern')) {
311 1         5 my $sub_txt = <
312             sub{
313             my (\$patt, \$atom) = \@_;
314             my \$sym = \$atom->symbol;
315             $exclude (List::Util::first {\$sym eq \$_} \@symbols);
316             };
317             SUB
318 1 50       5 print "MDLMol atom($n) sub: <<<<$sub_txt>>>\n" if $DEBUG;
319 1 50       5 print "MDLMol symbol list: (@symbols)\n" if $DEBUG;
320 1         4 $mol->atoms($n)->attr('mdlmol/test_sub' => $sub_txt);
321 1         26 $mol->atoms($n)->test_sub(eval $sub_txt);
322             }
323             }
324              
325             sub name_is {
326 0     0 1 0 my ($self, $fname) = @_;
327 0         0 $fname =~ /\.mol$/i;
328             }
329              
330              
331             sub file_is {
332 17     17 1 55987 my ($self, $fname) = @_;
333 17         123 $fname =~ /\.mol$/i;
334             }
335              
336              
337             sub write_string {
338 10     10 1 2362 my ($self, $mol, %opts) = @_;
339              
340 3     3   112 no warnings 'uninitialized';
  3         11  
  3         1581  
341 10         41 my $s = sprintf "%s\n perlmol \n\n", $mol->name;
342 10         118 $s .= sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s\n",
343             0+$mol->atoms, 0+$mol->bonds,
344             0, 0, 0, 0, 0, 0, 0, 0, 999, "V2000"; # "counts" line
345              
346 10         186 my $i = 1;
347 10         39 my %idx_map;
348             my @charged_atoms;
349 10         0 my @isotope_atoms;
350 10         0 my @radical_atoms;
351 10         105 for my $atom ($mol->atoms) {
352 176         1080 my ($x, $y, $z) = $atom->coords->array;
353              
354 176         2016 $s .= sprintf
355             "%10.4f%10.4f%10.4f %-3s%2i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i\n",
356             $x, $y, $z, $atom->symbol,
357             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0;
358 176 100       1975 push @charged_atoms, $i if $atom->formal_charge;
359 176 100       980 push @isotope_atoms, $i if $atom->mass_number;
360 176 100       891 push @radical_atoms, $i if $atom->formal_radical;
361 176         985 $idx_map{$atom->id} = $i++;
362             }
363              
364 10         78 for my $bond ($mol->bonds) {
365 184         1407 my ($a1, $a2) = map {$idx_map{$_->id}} $bond->atoms;
  368         2671  
366 184         1860 $s .= sprintf "%3i%3i%3i%3i%3i%3i%3i\n",
367             $a1, $a2, $bond->order,
368             0, 0, 0, 0;
369             }
370            
371 10         100 while (@charged_atoms) {
372 1 50       5 my $n = @charged_atoms > 8 ? 8 : @charged_atoms;
373 1         3 $s .= "M CHG $n";
374 1         5 for my $key (splice @charged_atoms, 0, $n) {
375 1         5 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->formal_charge;
376             }
377 1         18 $s .= "\n";
378             }
379 10         50 while (@isotope_atoms) {
380 1 50       4 my $n = @isotope_atoms > 8 ? 8 : @isotope_atoms;
381 1         5 $s .= "M ISO $n";
382 1         5 for my $key (splice @isotope_atoms, 0, $n) {
383 2         24 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->mass_number;
384             }
385 1         17 $s .= "\n";
386             }
387 10         33 while (@radical_atoms) {
388 1 50       4 my $n = @radical_atoms > 8 ? 8 : @radical_atoms;
389 1         6 $s .= "M RAD $n";
390 1         3 for my $key (splice @radical_atoms, 0, $n) {
391 1         4 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->formal_radical;
392             }
393 1         19 $s .= "\n";
394             }
395              
396 10         20 $s .= "M END\n";
397 10         134 $s;
398             }
399              
400              
401              
402             1;
403              
404             =head1 SOURCE CODE REPOSITORY
405              
406             L
407              
408             =head1 SEE ALSO
409              
410             L
411              
412             The MDL file format specification.
413             L or
414             Arthur Dalby et al., J. Chem. Inf. Comput. Sci, 1992, 32, 244-255.
415              
416             =head1 AUTHOR
417              
418             Ivan Tubert-Brohman
419              
420             =head1 COPYRIGHT
421              
422             Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is
423             free software; you can redistribute it and/or modify it under the same terms as
424             Perl itself.
425              
426             =cut
427