File Coverage

blib/lib/Chemistry/File/MDLMol.pm
Criterion Covered Total %
statement 171 176 97.1
branch 47 64 73.4
condition 15 23 65.2
subroutine 19 20 95.0
pod 4 10 40.0
total 256 293 87.3


line stmt bran cond sub pod time code
1             package Chemistry::File::MDLMol;
2              
3             our $VERSION = '0.23'; # VERSION
4             # $Id$
5              
6 3     3   142692 use base "Chemistry::File";
  3         15  
  3         1424  
7 3     3   37658 use Chemistry::Mol;
  3         69399  
  3         235  
8 3     3   26 use Carp;
  3         8  
  3         160  
9 3     3   19 use List::Util;
  3         7  
  3         152  
10 3     3   18 use strict;
  3         15  
  3         76  
11 3     3   15 use warnings;
  3         7  
  3         1429  
12              
13             our $DEBUG = 0;
14              
15             =head1 NAME
16              
17             Chemistry::File::MDLMol - MDL molfile reader/writer
18              
19             =head1 SYNOPSIS
20              
21             use Chemistry::File::MDLMol;
22              
23             # read a molecule
24             my $mol = Chemistry::Mol->read('myfile.mol');
25              
26             # write a molecule
27             $mol->write("myfile.mol");
28              
29             # use a molecule as a query for substructure matching
30             use Chemistry::Pattern;
31             use Chemistry::Ring;
32             Chemistry::Ring::aromatize_mol($mol);
33              
34             my $patt = Chemistry::Pattern->read('query.mol');
35             if ($patt->match($mol)) {
36             print "it matches!\n";
37             }
38              
39             =cut
40              
41             Chemistry::Mol->register_format(mdl => __PACKAGE__);
42              
43             =head1 DESCRIPTION
44              
45             MDL Molfile (V2000) reader/writer.
46              
47             This module automatically registers the 'mdl' format with Chemistry::Mol.
48              
49             The first three lines of the molfile are stored as $mol->name,
50             $mol->attr("mdlmol/line2"), and $mol->attr("mdlmol/comment").
51              
52             This version only reads and writes some of the information available in a
53             molfile: it reads coordinates, atom and bond types, charges, isotopes,
54             radicals, and atom lists. It does not read other things such as
55             stereochemistry, 3d properties, etc.
56              
57             This module is part of the PerlMol project, L.
58              
59             =head2 Query properties
60              
61             The MDL molfile format supports query properties such as atom lists, and
62             special bond types such as "single or double", "single or aromatic", "double or
63             aromatic", "ring bond", or "any". These properties are supported by this module
64             in conjunction with L. However, support for query properies
65             is currently read-only, and the other properties listed in the specification
66             are not supported yet.
67              
68             So that atom and bond objects can use these special query options, the
69             conditions are represented as Perl subroutines. The generated code can be
70             read from the 'mdlmol/test_sub' attribute:
71              
72             $atom->attr('mdlmol/test_sub');
73             $bond->attr('mdlmol/test_sub');
74            
75             This may be useful for debugging, such as when an atom doesn't seem to match as
76             expected.
77              
78             =head2 Aromatic Queries
79              
80             To be able to search for aromatic substructures are represented by Kekule
81             structures, molfiles that are read as patterns (with
82             Cread) are aromatized automatically by using the
83             L module. The default bond test from Chemistry::Pattern::Bond
84             is overriden by one that checks the aromaticity in addition to the bond order.
85             The test is,
86              
87             $patt->aromatic ? $bond->aromatic
88             : (!$bond->aromatic && $patt->order == $bond->order);
89              
90             That is, aromatic pattern bonds match aromatic bonds, and aliphatic pattern
91             bonds match aliphatic bonds with the same bond order.
92            
93              
94             =cut
95              
96              
97             # some constants, based on tables from the file format specification
98              
99             my %OLD_CHARGE_MAP = (
100             1 => 3,
101             2 => 2,
102             3 => 1,
103             4 => 0,
104             5 => -1,
105             6 => -2,
106             7 => -3,
107             );
108              
109             my %BOND_TYPE_EXPR = (
110             4 => '($bond->aromatic)',
111             5 => '($bond->order == 1 or $bond->order == 2)',
112             6 => '($bond->order == 1 or $bond->aromatic)',
113             7 => '($bond->order == 2 or $bond->aromatic)',
114             8 => '(1)', # any bond
115             );
116              
117             my %BOND_TOPOLOGY_EXPR = (
118             1 => '@{$bond->attr("ring/rings")||[]}',
119             2 => '! @{$bond->attr("ring/rings")||[]}',
120             );
121              
122              
123             sub read_mol {
124 66     66 1 7854 my ($self, $fh, %opts) = @_;
125 66 100       258 return if $fh->eof;
126              
127 33         618 %opts = ( slurp => 1, %opts );
128 33   100     138 my $mol_class = $opts{mol_class} || "Chemistry::Mol";
129 33   33     231 my $atom_class = $opts{atom_class} || $mol_class->atom_class;
130 33   33     255 my $bond_class = $opts{bond_class} || $mol_class->bond_class;
131 33         150 local $_;
132              
133 33         98 my $mol = $mol_class->new();
134              
135             # header
136 33         902 my $name = <$fh>; chomp $name;
  33         102  
137 33         94 my $line2 = <$fh>; chomp $line2;
  33         54  
138 33         54 my $comment = <$fh>; chomp $comment;
  33         53  
139 33         133 $mol->name($name);
140 33         437 $mol->attr("mdlmol/line2", $line2);
141 33         489 $mol->attr("mdlmol/comment", $comment);
142              
143             # counts line
144 33 50       412 defined ($_ = <$fh>) or croak "unexpected end of file";
145 33         178 my ($na, $nb) = unpack("A3A3", $_);
146              
147 33         70 my %old_charges;
148             my %old_radicals;
149              
150             # atom block
151 33         141 for my $i (1 .. $na) { # for each atom...
152 439 50       38497 defined (my $line = <$fh>) or croak "unexpected end of file";
153 439         2089 my ($x, $y, $z, $symbol, $mass, $charge)
154             = unpack("A10A10A10xA3A2A3", $line);
155            
156             $old_charges{$i} = $OLD_CHARGE_MAP{$charge}
157 439 50       1035 if $OLD_CHARGE_MAP{$charge};
158 439 50 33     1631 $old_radicals{$i} = 2
159             if $charge && $charge == 4;
160 439         589 my $mass_number;
161 439 100 66     1279 if( int $mass && eval { require Chemistry::Isotope } ) {
  1 50       13  
162 1         7 my $abundance =
163             Chemistry::Isotope::isotope_abundance($symbol);
164 1         20 ($mass_number) = sort { $abundance->{$b} cmp
165 1         6 $abundance->{$a} }
166             sort keys %$abundance;
167 1         6 $mass_number += $mass;
168             } elsif( int $mass ) {
169 0         0 warn "no Chemistry::Isotope, cannot read mass number " .
170             "from atom block\n";
171             }
172             $mol->new_atom(
173 439         1864 symbol => $symbol,
174             coords => [$x*1, $y*1, $z*1],
175             mass_number => $mass_number,
176             );
177             }
178              
179             # bond block
180 33         2943 for my $i (1 .. $nb) { # for each bond...
181 3     3   26 no warnings 'numeric';
  3         7  
  3         1991  
182 451 50       1501 defined ($_ = <$fh>) or croak "unexpected end of file";
183             my ($a1, $a2, $type, $stereo, $topology, $rxn)
184 451         1827 = map {$_*1} unpack("A3A3A3A3x3A3A3", $_);
  2706         5182  
185 451 100       2086 my $order = $type =~ /^[123]$/ ? $type : 1;
186 451         1317 my $bond = $mol->new_bond(
187             type => $type,
188             atoms => [$mol->atoms($a1,$a2)],
189             order => $order,
190             );
191 451 100       60915 if ($mol->isa('Chemistry::Pattern')) {
192 9         26 $self->bond_expr($bond, $i, $type, $topology);
193             }
194             }
195              
196             # properties block
197 33         146 while (<$fh>) {
198 46 100 66     575 if (/^M END/ or /^\$\$\$\$/) {
    100          
199 33         73 last;
200             } elsif (/^M (...)/) { # generic extended property handler
201 12 100 100     98 if ($1 eq 'CHG' or $1 eq 'RAD'){ # XXX
202             # clear old-style info
203 10         20 %old_radicals = ();
204 10         20 %old_charges = ();
205             }
206              
207 12         36 my $method = "M_$1";
208 12 50       125 $self->$method($mol, $_) if $self->can($method);
209             }
210             }
211              
212             # add old-style charges and radicals if they still apply
213 33         130 while (my ($key, $val) = each %old_charges) {
214 0         0 $mol->atoms($key)->formal_charge($val);
215             }
216 33         90 while (my ($key, $val) = each %old_radicals) {
217 0         0 $mol->atoms($key)->formal_radical($val);
218             }
219              
220             # make sure we get to the end of the file
221 33 50       87 if ($opts{slurp}) {
222 33         752 1 while <$fh>;
223             }
224              
225 33         158 $mol->add_implicit_hydrogens;
226              
227 33 100       26427 if ($mol->isa('Chemistry::Pattern')) {
228 2         560 require Chemistry::Ring;
229 2         6494 Chemistry::Ring::aromatize_mol($mol);
230             }
231              
232 33         5244 return $mol;
233             }
234              
235             sub bond_expr {
236 9     9 0 22 my ($self, $bond, $i, $type, $topology) = @_;
237 9         11 my @bond_exprs;
238 9         18 my $s = $BOND_TOPOLOGY_EXPR{$topology};
239 9 100       20 push @bond_exprs, $s if $s;
240 9         16 $s = $BOND_TYPE_EXPR{$type};
241 9 100       21 push @bond_exprs, $s if $s;
242 9 100       17 if (@bond_exprs) {
243 1         6 my $expr = join " and ", @bond_exprs;
244 1         4 my $sub_txt = <
245             sub {
246             no warnings;
247             my (\$patt, \$bond) = \@_;
248             $expr;
249             };
250             SUB
251 1 50       4 print "MDLMol bond($i) sub: <<<<$sub_txt>>>\n" if $DEBUG;
252 1         5 $bond->attr('mdlmol/test_sub' => $sub_txt);
253 1     1   99 $bond->test_sub(eval $sub_txt);
  1         11  
  1         2  
  1         109  
254             } else { # default bond sub
255 8         22 $bond->test_sub(\&default_bond_test);
256             }
257             }
258              
259             sub default_bond_test {
260 3     3   24 no warnings;
  3         7  
  3         2413  
261 138     138 0 51542 my ($patt, $bond) = @_;
262 138 100 100     293 $patt->aromatic ? $bond->aromatic
263             : (!$bond->aromatic && $patt->order == $bond->order);
264             }
265              
266              
267             sub M_CHG {
268 6     6 0 21 my ($self, $mol, $line) = @_;
269 6         54 my ($m, $type, $n, %data) = split " ", $line;
270 6         38 while (my ($key, $val) = each %data) {
271 10         106 $mol->atoms($key)->formal_charge($val);
272             }
273             }
274              
275             sub M_RAD {
276 4     4 0 30 my ($self, $mol, $line) = @_;
277 4         22 my ($m, $type, $n, %data) = split " ", $line;
278 4         20 while (my ($key, $val) = each %data) {
279 4         14 $mol->atoms($key)->formal_radical($val);
280             }
281             }
282              
283             sub M_ISO {
284 1     1 0 4 my ($self, $mol, $line) = @_;
285 1         7 my ($m, $type, $n, %data) = split " ", $line;
286 1         7 while (my ($key, $val) = each %data) {
287 1         7 $mol->atoms($key)->mass_number($val);
288             }
289             }
290              
291             sub M_ALS {
292 1     1 0 5 my ($self, $mol, $line) = @_;
293              
294 1 50       6 return unless $line =~ /^M ALS (...)(...) (.)/;
295 1         5 my ($n, $cnt, $exclude) = ($1, $2, $3);
296 1         3 my @symbols;
297 1 50       4 $exclude = $exclude =~ /^[Tt]$/ ? '!' : '';
298              
299             # parse the symbols out of the atom list line
300 1         6 for my $i (0 .. $cnt-1) {
301 4         12 my $s = substr $_, 16+$i*4, 4;
302 4         12 $s =~ s/\s+//g;
303 4         10 push @symbols, $s;
304             }
305              
306             # save attr
307 1         5 $mol->atoms($n)->attr('mdlmol/atom_list' => $_);
308              
309             # create test sub
310 1 50       32 if ($mol->isa('Chemistry::Pattern')) {
311 1         5 my $sub_txt = <
312             sub{
313             my (\$patt, \$atom) = \@_;
314             my \$sym = \$atom->symbol;
315             $exclude (List::Util::first {\$sym eq \$_} \@symbols);
316             };
317             SUB
318 1 50       5 print "MDLMol atom($n) sub: <<<<$sub_txt>>>\n" if $DEBUG;
319 1 50       3 print "MDLMol symbol list: (@symbols)\n" if $DEBUG;
320 1         4 $mol->atoms($n)->attr('mdlmol/test_sub' => $sub_txt);
321 1         24 $mol->atoms($n)->test_sub(eval $sub_txt);
322             }
323             }
324              
325             sub name_is {
326 0     0 1 0 my ($self, $fname) = @_;
327 0         0 $fname =~ /\.mol$/i;
328             }
329              
330              
331             sub file_is {
332 17     17 1 46948 my ($self, $fname) = @_;
333 17         96 $fname =~ /\.mol$/i;
334             }
335              
336              
337             sub write_string {
338 10     10 1 2013 my ($self, $mol, %opts) = @_;
339              
340 3     3   26 no warnings 'uninitialized';
  3         8  
  3         1538  
341 10         32 my $s = sprintf "%s\n perlmol \n\n", $mol->name;
342 10         101 $s .= sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s\n",
343             0+$mol->atoms, 0+$mol->bonds,
344             0, 0, 0, 0, 0, 0, 0, 0, 999, "V2000"; # "counts" line
345              
346 10         166 my $i = 1;
347 10         35 my %idx_map;
348             my @charged_atoms;
349 10         0 my @isotope_atoms;
350 10         0 my @radical_atoms;
351 10         96 for my $atom ($mol->atoms) {
352 176         1037 my ($x, $y, $z) = $atom->coords->array;
353              
354 176         1948 $s .= sprintf
355             "%10.4f%10.4f%10.4f %-3s%2i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i\n",
356             $x, $y, $z, $atom->symbol,
357             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0;
358 176 100       2015 push @charged_atoms, $i if $atom->formal_charge;
359 176 100       1031 push @isotope_atoms, $i if $atom->mass_number;
360 176 100       881 push @radical_atoms, $i if $atom->formal_radical;
361 176         876 $idx_map{$atom->id} = $i++;
362             }
363              
364 10         72 for my $bond ($mol->bonds) {
365 184         1248 my ($a1, $a2) = map {$idx_map{$_->id}} $bond->atoms;
  368         2321  
366 184         1030 $s .= sprintf "%3i%3i%3i%3i%3i%3i%3i\n",
367             $a1, $a2, $bond->order,
368             0, 0, 0, 0;
369             }
370            
371 10         83 while (@charged_atoms) {
372 1 50       5 my $n = @charged_atoms > 8 ? 8 : @charged_atoms;
373 1         4 $s .= "M CHG $n";
374 1         4 for my $key (splice @charged_atoms, 0, $n) {
375 1         4 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->formal_charge;
376             }
377 1         20 $s .= "\n";
378             }
379 10         42 while (@isotope_atoms) {
380 1 50       6 my $n = @isotope_atoms > 8 ? 8 : @isotope_atoms;
381 1         4 $s .= "M ISO $n";
382 1         5 for my $key (splice @isotope_atoms, 0, $n) {
383 2         23 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->mass_number;
384             }
385 1         17 $s .= "\n";
386             }
387 10         24 while (@radical_atoms) {
388 1 50       4 my $n = @radical_atoms > 8 ? 8 : @radical_atoms;
389 1         5 $s .= "M RAD $n";
390 1         3 for my $key (splice @radical_atoms, 0, $n) {
391 1         4 $s .= sprintf "%4d%4d", $key, $mol->atoms($key)->formal_radical;
392             }
393 1         18 $s .= "\n";
394             }
395              
396 10         17 $s .= "M END\n";
397 10         117 $s;
398             }
399              
400              
401              
402             1;
403              
404             =head1 SOURCE CODE REPOSITORY
405              
406             L
407              
408             =head1 SEE ALSO
409              
410             L
411              
412             The MDL file format specification.
413             L or
414             Arthur Dalby et al., J. Chem. Inf. Comput. Sci, 1992, 32, 244-255.
415              
416             =head1 AUTHOR
417              
418             Ivan Tubert-Brohman
419              
420             =head1 COPYRIGHT
421              
422             Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is
423             free software; you can redistribute it and/or modify it under the same terms as
424             Perl itself.
425              
426             =cut
427