File Coverage

blib/lib/Chemistry/Smiles.pm
Criterion Covered Total %
statement 56 62 90.3
branch 12 16 75.0
condition 10 21 47.6
subroutine 11 13 84.6
pod 0 8 0.0
total 89 120 74.1


line stmt bran cond sub pod time code
1             package Chemistry::Smiles;
2              
3             $VERSION = "0.13";
4 1     1   12803 use 5.006001;
  1         5  
  1         54  
5 1     1   6 use strict;
  1         2  
  1         96  
6 1     1   7 use warnings;
  1         7  
  1         1145  
7              
8             =head1 NAME
9              
10             Chemistry::Smiles - SMILES parser (deprecated)
11              
12             =head1 SYNOPSYS
13              
14              
15             #!/usr/bin/perl
16             use Chemistry::Smiles;
17              
18             my $s = 'C1C[13C]1(=O)[O-]';
19              
20             # Default use - Requires Chemistry::Mol
21             my $default_parser = new Chemistry::Smiles;
22             my $mol = $default_parser->parse($s, new Chemistry::Mol);
23             print $mol->print;
24              
25             # Callback use
26             my $i = 0;
27             my $callback_parser = new Chemistry::Smiles(
28             add_atom => sub {print "ATOM(@_)\n"; ++$i},
29             add_bond => sub {print "BOND(@_)\n"}
30             );
31             $callback_parser->parse($s, 'mol');
32              
33             =head1 DESCRIPTION
34              
35             NOTE: THIS MODULE IS DEPRECATED! Use Chemistry::File::SMILES instead.
36              
37             This object-oriented module parses a SMILES (Simplified Molecular Input Line
38             Entry Specification) string. It can either return the molecule as a Chemistry::Mol
39             object or be used via callback functions.
40              
41             =head1 METHODS
42              
43             =over 4
44              
45             =cut
46              
47             my $Symbol = qr/
48             s|p|o|n|c|b|Zr|Zn|Yb|Y|Xe|W|V|U|Tm|Tl|Ti|Th|
49             Te|Tc|Tb|Ta|Sr|Sn|Sm|Si|Sg|SeSc|Sb|S|Ru|Rn|Rh|Rf|Re|Rb|Ra|
50             Pu|Pt|Pr|Po|Pm|Pd|Pb|Pa|P|Os|O|Np|No|Ni|Ne|NdNb|Na|N|Mt|Mt|
51             Mo|Mn|Mg|Md|Lu|Lr|Li|La|Kr|K|Ir|In|I|Hs|Hs|Ho|Hg|Hf|He|H|Ge
52             Gd|Ga|Fr|Fm|Fe|F|Eu|Es|Er|Dy|Ds|Db|Cu|Cs|Cr|Co|Cm|Cl|Cf|Ce|
53             Cd|Ca|C|Br|Bk|BiBh|Be|Ba|B|Au|At|As|Ar|Am|Al|Ag|Ac|\*
54             /x; # Order is reverse alphabetical to ensure longest match
55              
56             my $Simple_symbol = qr/Br|Cl|B|C|N|O|P|S|F|I|s|p|o|n|c|b/;
57              
58             my $Bond = qr/(?:[-=#:.\/\\])?/;
59             my $Simple_atom = qr/($Simple_symbol)/; #3
60             my $Complex_atom = qr/
61             (?:
62             \[ #begin atom
63             (\d*) #4 isotope
64             ($Symbol) #5 symbol
65             (\@{0,2}) #6 chirality
66             (?:H(\d*))? #7 H-count
67             (\+{2,}|-{2,}|\+\d*|-\d*)? #8 charge
68             \] #end atom
69             )
70             /x;
71              
72             my $Digits = qr/(?:($Bond)(?:\d|%\d\d))*/;
73             my $Chain = qr/
74             \G( #1
75             (?:
76             ($Bond) #2
77             (?:$Simple_atom|$Complex_atom) #3-8
78             ($Digits) #9
79             )
80             |\(
81             |\)
82             |.+
83             )
84             /x;
85              
86             my $digits_re = qr/($Bond)(\%\d\d|\d)/;
87              
88             =item Chemistry::Smiles->new([add_atom => \&sub1, add_bond => \&sub2])
89              
90             Create a SMILES parser. If the add_atom and add_bond subroutine references
91             are given, they will be called whenever an atom or a bond needs to be added
92             to the molecule. If they are not specified, default methods, which
93             create a Chemistry::Mol object, will be used.
94              
95             =cut
96              
97             sub new {
98 1     1 0 208 my $class = shift;
99 1         6 my %opts = @_;
100 1 50 33     12 require Chemistry::Mol unless $opts{add_atom} && $opts{add_bond};
101 1   50     30 my $self = bless {
      50        
102             add_atom => $opts{add_atom} || \&add_atom,
103             add_bond => $opts{add_bond} || \&add_bond,
104             }, $class;
105             }
106              
107             =item $obj->parse($string, $mol)
108              
109             Parse a Smiles $string. $mol is a "molecule state object". It can be anything;
110             the parser doesn't do anything with it except sending it as the first parameter
111             to the callback functions. If callback functions were not provided when
112             constructing the parser object, $mol must be a Chemistry::Mol object, because
113             that's what the default callback functions require.
114              
115             =cut
116              
117             sub parse {
118 7     7 0 1248 my $self = shift;
119 7         11 my ($s, $mol) = @_;
120 7         25 $self->{stack} = [ undef ];
121 7         24 $self->{digits} = {};
122              
123 7         82 while ($s =~ /$Chain/g) {
124             #my @a = ($1, $2, $3, $4, $5, $6, $7, $8);
125             #print Dumper(\@a);
126 54         301 my ($all, $bnd, $sym, $iso, $sym2, $chir, $hcnt, $chg, $dig)
127             = ($1, $2, $3, $4, $5, $6, $7, $8, $9);
128 54 100       169 if ($all eq '(') {
    100          
    100          
    50          
129 5         14 $self->start_branch();
130             } elsif ($all eq ')') {
131 5         12 $self->end_branch();
132             } elsif ($sym) { # Simple atom
133 1     1   8 no warnings;
  1         2  
  1         100  
134 40         83 my @digs = parse_digits($dig);
135 40         119 $self->atom($mol, $bnd, '', $sym, '', '', '', \@digs);
136             } elsif ($sym2) { # Complex atom
137 1     1   6 no warnings;
  1         2  
  1         822  
138 4         9 my @digs = parse_digits($dig);
139 4   50     31 $self->atom($mol, $bnd, $iso, $sym2, $chir, $hcnt || 0, $chg, \@digs);
140             } else {
141 0         0 die "SMILES ERROR: '$all'\n";
142             }
143             }
144 7         27 $mol;
145             }
146              
147             sub parse_digits {
148 44     44 0 60 my ($dig) = @_;
149 44         48 my @digs;
150 44   100     221 while ($dig && $dig =~ /$digits_re/g) {
151 16         379 push @digs, {bnd=>$1, dig=>$2};
152             }
153 44         112 @digs;
154             }
155             sub atom {
156 44     44 0 70 my $self = shift;
157 44         96 my ($mol,$bnd,$iso,$sym,$chir,$hcount,$chg,$digs) = @_;
158             #{no warnings; local $" = ','; print "atom(@_)\n"}
159 44         159 my $a = $self->{add_atom}($mol,$iso,$sym,$chir,$hcount,$chg);
160 44 100       535 if($self->{stack}[-1]) {
161 37         147 $self->{add_bond}($mol, $bnd, $self->{stack}[-1], $a);
162             }
163 44         267 for my $dig (@$digs) {
164 16 100       46 if ($self->{digits}{$dig->{dig}}) {
165 8 0 33     26 if ($dig->{bnd} && $self->{digits}{$dig->{dig}}{bnd}
      33        
166             && $dig->{bnd} ne $self->{digits}{$dig->{dig}}{bnd}){
167 0         0 die "SMILES: Inconsistent ring closure\n";
168             }
169 8   33     54 $self->{add_bond}($mol,
170             $dig->{bnd} || $self->{digits}{$dig->{dig}}{bnd},
171             $self->{digits}{$dig->{dig}}{atom}, $a);
172 8         417 delete $self->{digits}{$dig->{dig}};
173             } else {
174 8         53 $self->{digits}{$dig->{dig}} = {atom=>$a, bnd=>$dig->{bnd}};
175             }
176             }
177 44         546 $self->{stack}[-1] = $a;
178             }
179              
180             =back
181              
182             =head1 CALLBACK FUNCTIONS
183              
184             =over 4
185              
186             =item $atom = add_atom($mol, $iso, $sym, $chir, $hcount, $chg)
187              
188             Called by the parser whenever an atom is found. The first parameter is the
189             state object given to $obj->parse(). The other parameters are the isotope,
190             symbol, chirality, hydrogen count, and charge of the atom. Only the symbol is
191             guaranteed to be defined. Mnemonic: the parameters are given in the same order
192             that is used in a SMILES string (such as [18OH-]). This callback is expected to
193             return something that uniquely identifies the atom that was created (it might
194             be a number, a string, or an object).
195              
196             =cut
197              
198             # Default add_atom callback
199             sub add_atom {
200 0     0 0 0 my ($mol, $iso, $sym, $chir, $hcount, $chg) = @_;
201 0         0 $mol->new_atom(symbol=>$sym);
202             }
203              
204             =item add_bond($mol, $type, $a1, $a2)
205              
206             Called by the parser whenever an bond needs to be created. The first parameter
207             is the state object given to $obj->parse(). The other parameters are the bond
208             type and the two atoms that need to be bonded. The atoms are identified using
209             the return values from the add_atom() callback.
210              
211             =back
212              
213             =cut
214              
215             # Default add_bond callback
216             sub add_bond {
217 0     0 0 0 my ($mol, $type, $a1, $a2) = @_;
218 0         0 $mol->new_bond(type=>$type, atoms=>[$a1, $a2]);
219             }
220              
221             sub start_branch {
222 5     5 0 7 my $self = shift;
223             #print "start_branch\n";
224 5         8 push @{$self->{stack}}, $self->{stack}[-1];
  5         47  
225             }
226              
227             sub end_branch {
228 5     5 0 8 my $self = shift;
229             #print "end_branch\n";
230 5         7 pop @{$self->{stack}};
  5         119  
231             }
232              
233             =head1 SEE ALSO
234              
235             The SMILES Home Page at http://www.daylight.com/dayhtml/smiles/
236             The Daylight Theory Manual at
237             http://www.daylight.com/dayhtml/doc/theory/theory.smiles.html
238              
239             =head1 BUGS
240              
241             The SMILES specification is not fully implemented yet. For example, branches
242             that start before an atom (such as (OC)C, which should be equivalent to C(CO)
243             and COC).
244              
245             =head1 AUTHOR
246              
247             Ivan Tubert Eitub@cpan.orgE
248              
249             =head1 COPYRIGHT
250              
251             Copyright (c) 2004 Ivan Tubert. All rights reserved. This program is free
252             software; you can redistribute it and/or modify it under the same terms as
253             Perl itself.
254              
255             =cut
256