| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Chemistry::File::SMILES; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
$VERSION = "0.47"; |
|
4
|
|
|
|
|
|
|
# $Id: SMILES.pm,v 1.16 2009/05/10 20:31:08 itubert Exp $ |
|
5
|
|
|
|
|
|
|
|
|
6
|
8
|
|
|
8
|
|
136676
|
use 5.006; |
|
|
8
|
|
|
|
|
30
|
|
|
|
8
|
|
|
|
|
604
|
|
|
7
|
8
|
|
|
8
|
|
50
|
use strict; |
|
|
8
|
|
|
|
|
19
|
|
|
|
8
|
|
|
|
|
775
|
|
|
8
|
8
|
|
|
8
|
|
46
|
use warnings; |
|
|
8
|
|
|
|
|
20
|
|
|
|
8
|
|
|
|
|
373
|
|
|
9
|
8
|
|
|
8
|
|
43
|
no warnings 'recursion'; |
|
|
8
|
|
|
|
|
21
|
|
|
|
8
|
|
|
|
|
584
|
|
|
10
|
8
|
|
|
8
|
|
47
|
use base "Chemistry::File"; |
|
|
8
|
|
|
|
|
15
|
|
|
|
8
|
|
|
|
|
41531
|
|
|
11
|
8
|
|
|
8
|
|
429306
|
use Chemistry::Mol; |
|
|
8
|
|
|
|
|
725461
|
|
|
|
8
|
|
|
|
|
1251
|
|
|
12
|
8
|
|
|
8
|
|
15014
|
use Chemistry::Bond::Find 'assign_bond_orders'; |
|
|
8
|
|
|
|
|
248080
|
|
|
|
8
|
|
|
|
|
1029
|
|
|
13
|
8
|
|
|
8
|
|
500
|
use List::Util 'first'; |
|
|
8
|
|
|
|
|
17
|
|
|
|
8
|
|
|
|
|
6518
|
|
|
14
|
8
|
|
|
8
|
|
54
|
use Carp; |
|
|
8
|
|
|
|
|
15
|
|
|
|
8
|
|
|
|
|
21312
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 NAME |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Chemistry::File::SMILES - SMILES linear notation parser/writer |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
#!/usr/bin/perl |
|
24
|
|
|
|
|
|
|
use Chemistry::File::SMILES; |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# parse a SMILES string |
|
27
|
|
|
|
|
|
|
my $s = 'C1CC1(=O)[O-]'; |
|
28
|
|
|
|
|
|
|
my $mol = Chemistry::Mol->parse($s, format => 'smiles'); |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# print a SMILES string |
|
31
|
|
|
|
|
|
|
print $mol->print(format => 'smiles'); |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# print a unique (canonical) SMILES string |
|
34
|
|
|
|
|
|
|
print $mol->print(format => 'smiles', unique => 1); |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# parse a SMILES file |
|
37
|
|
|
|
|
|
|
my @mols = Chemistry::Mol->read("file.smi", format => 'smiles'); |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# write a multiline SMILES file |
|
40
|
|
|
|
|
|
|
Chemistry::Mol->write("file.smi", mols => \@mols); |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This module parses a SMILES (Simplified Molecular Input Line Entry |
|
46
|
|
|
|
|
|
|
Specification) string. This is a File I/O driver for the PerlMol project. |
|
47
|
|
|
|
|
|
|
L<http://www.perlmol.org/>. It registers the 'smiles' format with |
|
48
|
|
|
|
|
|
|
Chemistry::Mol. |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
This parser interprets anything after whitespace as the molecule's name; |
|
51
|
|
|
|
|
|
|
for example, when the following SMILES string is parsed, $mol->name will be |
|
52
|
|
|
|
|
|
|
set to "Methyl chloride": |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
CCl Methyl chloride |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
The name is not included by default on output. However, if the C<name> option |
|
57
|
|
|
|
|
|
|
is defined, the name will be included after the SMILES string, separated by a |
|
58
|
|
|
|
|
|
|
tab. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
print $mol->print(format => 'smiles', name => 1); |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=head2 Multiline SMILES and SMILES files |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
A file or string can contain multiple molecules, one per line. |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
CCl Methyl chloride |
|
67
|
|
|
|
|
|
|
CO Methanol |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
Files with the extension '.smi' are assumed to have this format. |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 Atom Mapping Numbers |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
As an extension for reaction processing, SMILES strings may have atom mapping |
|
74
|
|
|
|
|
|
|
numbers, which are introduced after a colon in a bracketed atom. For example, |
|
75
|
|
|
|
|
|
|
[C:1]. The mapping number need not be unique. This module reads the mapping |
|
76
|
|
|
|
|
|
|
numbers and stores them as the name of the atom ($atom->name). |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
On output, atom names are not included by default. See the C<number> and |
|
79
|
|
|
|
|
|
|
C<auto_number> options below for ways of including them. |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head1 OPTIONS |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
The following options are supported in addition to the options mentioned for |
|
84
|
|
|
|
|
|
|
L<Chemistry::File>, such as C<mol_class>, C<format>, and C<fatal>. |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=over |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item aromatic |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
On output, detect aromatic atoms and bonds by means of the Chemistry::Ring |
|
91
|
|
|
|
|
|
|
module, and represent the organic aromatic atoms with lowercase symbols. |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=item unique |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
When used on output, canonicalize the structure if it hasn't been canonicalized |
|
96
|
|
|
|
|
|
|
already and generate a unique SMILES string. This option implies "aromatic". |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=item number |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
For atoms that have a defined name, print the name as the "atom number". For |
|
101
|
|
|
|
|
|
|
example, if an ethanol molecule has the name "42" for the oxygen atom and the |
|
102
|
|
|
|
|
|
|
other atoms have undefined names, the output would be: |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
CC[OH:42] |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item auto_number |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
When used on output, number all the atoms explicitly and sequentially. The |
|
109
|
|
|
|
|
|
|
output for ethanol would look something like this: |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
[CH3:1][CH2:2][OH:3] |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=item name |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Include the molecule name on output, as described in the previous section. |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=item kekulize |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
When used on input, assign single or double bond orders to "aromatic" or |
|
120
|
|
|
|
|
|
|
otherwise unspecified bonds (i.e., generate the Kekule structure). If false, |
|
121
|
|
|
|
|
|
|
the bond orders will remain single. This option is true by default. This uses |
|
122
|
|
|
|
|
|
|
C<assign_bond_orders> from the L<Chemistry::Bond::Find> module. |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=back |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=cut |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# INITIALIZATION |
|
129
|
|
|
|
|
|
|
Chemistry::Mol->register_format('smiles'); |
|
130
|
|
|
|
|
|
|
my $Smiles_parser = __PACKAGE__->new_parser; |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
#=begin comment |
|
133
|
|
|
|
|
|
|
# |
|
134
|
|
|
|
|
|
|
#=over |
|
135
|
|
|
|
|
|
|
# |
|
136
|
|
|
|
|
|
|
#=cut |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
sub file_is { |
|
139
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
140
|
0
|
|
|
|
|
0
|
$self->name_is(@_); |
|
141
|
|
|
|
|
|
|
} |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub name_is { |
|
144
|
0
|
|
|
0
|
1
|
0
|
my ($self, $name) = @_; |
|
145
|
0
|
|
|
|
|
0
|
$name =~ /\.smi/; |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub slurp_mol { |
|
149
|
0
|
|
|
0
|
1
|
0
|
my ($self, $fh) = @_; |
|
150
|
0
|
|
|
|
|
0
|
scalar <$fh>; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
sub read_mol { |
|
154
|
111
|
|
|
111
|
1
|
89396
|
my ($self, $fh, %opts) = @_; |
|
155
|
111
|
|
|
|
|
852
|
%opts = (kekulize => 1, %opts); |
|
156
|
111
|
|
50
|
|
|
519
|
my $mol_class = $opts{mol_class} || "Chemistry::Mol"; |
|
157
|
|
|
|
|
|
|
|
|
158
|
111
|
|
|
|
|
639
|
my $line = <$fh>; |
|
159
|
111
|
100
|
|
|
|
568
|
return unless defined $line; |
|
160
|
57
|
|
|
|
|
197
|
$line =~ tr/\r\n//d; |
|
161
|
57
|
|
|
|
|
223
|
my ($smiles, $name) = split " ", $line, 2; |
|
162
|
|
|
|
|
|
|
|
|
163
|
57
|
|
|
|
|
401
|
my $mol = $mol_class->new; |
|
164
|
57
|
50
|
|
|
|
1620
|
unless ($Smiles_parser->parse($smiles, $mol, \%opts)) { |
|
165
|
0
|
|
|
|
|
0
|
warn "error parsing SMILES line '$line'\n"; |
|
166
|
0
|
|
|
|
|
0
|
$mol = $mol_class->new; |
|
167
|
|
|
|
|
|
|
} |
|
168
|
57
|
|
|
|
|
565
|
$mol->name($name); |
|
169
|
57
|
|
|
|
|
438
|
$self->add_implicit_hydrogens($mol); |
|
170
|
57
|
100
|
|
|
|
627
|
if ($opts{kekulize}) { |
|
171
|
56
|
|
|
|
|
355
|
assign_bond_orders($mol, method => "itub", use_coords => 0, |
|
172
|
|
|
|
|
|
|
scratch => 0, charges => 0); |
|
173
|
|
|
|
|
|
|
} |
|
174
|
57
|
|
|
|
|
55748
|
$mol; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
### The contents of the original Chemistry::Smiles module start below |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
my $Symbol = qr/ |
|
181
|
|
|
|
|
|
|
s|p|o|n|c|b|Zr|Zn|Yb|Y|Xe|W|V|U|Tm|Tl|Ti|Th| |
|
182
|
|
|
|
|
|
|
Te|Tc|Tb|Ta|Sr|Sn|Sm|Si|Sg|Se|Sc|Sb|S|Ru|Rn|Rh|Rf|Re|Rb|Ra| |
|
183
|
|
|
|
|
|
|
Pu|Pt|Pr|Po|Pm|Pd|Pb|Pa|P|Os|O|Np|No|Ni|Ne|Nd|Nb|Na|N|Mt|Mt| |
|
184
|
|
|
|
|
|
|
Mo|Mn|Mg|Md|Lu|Lr|Li|La|Kr|K|Ir|In|I|Hs|Hs|Ho|Hg|Hf|He|H|Ge| |
|
185
|
|
|
|
|
|
|
Gd|Ga|Fr|Fm|Fe|F|Eu|Es|Er|Dy|Ds|Db|Cu|Cs|Cr|Co|Cm|Cl|Cf|Ce| |
|
186
|
|
|
|
|
|
|
Cd|Ca|C|Br|Bk|Bi|Bh|Be|Ba|B|Au|At|As|Ar|Am|Al|Ag|Ac|\*|R|X |
|
187
|
|
|
|
|
|
|
/x; # Order is reverse alphabetical to ensure longest match |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
my $Simple_symbol = qr/Br|Cl|B|C|N|O|P|S|F|I|H|s|p|o|n|c|b/; |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
my $Bond = qr/(?:[-=#:.\/\\])?/; |
|
192
|
|
|
|
|
|
|
my $Simple_atom = qr/($Simple_symbol)/; #3 |
|
193
|
|
|
|
|
|
|
my $Complex_atom = qr/ |
|
194
|
|
|
|
|
|
|
(?: |
|
195
|
|
|
|
|
|
|
\[ #begin atom |
|
196
|
|
|
|
|
|
|
(\d*) #4 isotope |
|
197
|
|
|
|
|
|
|
($Symbol) #5 symbol |
|
198
|
|
|
|
|
|
|
(\@{0,2}) #6 chirality |
|
199
|
|
|
|
|
|
|
(?:(H\d*))? #7 H-count |
|
200
|
|
|
|
|
|
|
(\+{2,}|-{2,}|\+\d*|-\d*)? #8 charge |
|
201
|
|
|
|
|
|
|
(?::(\d+))? #9 name |
|
202
|
|
|
|
|
|
|
\] #end atom |
|
203
|
|
|
|
|
|
|
) |
|
204
|
|
|
|
|
|
|
/x; |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
my $Digits = qr/(?:($Bond)(?:\d|%\d\d))*/; |
|
207
|
|
|
|
|
|
|
my $Chain = qr/ |
|
208
|
|
|
|
|
|
|
\G( #1 |
|
209
|
|
|
|
|
|
|
(?: |
|
210
|
|
|
|
|
|
|
($Bond) #2 |
|
211
|
|
|
|
|
|
|
(?:$Simple_atom|$Complex_atom) #3-9 |
|
212
|
|
|
|
|
|
|
($Digits) #10 |
|
213
|
|
|
|
|
|
|
) |
|
214
|
|
|
|
|
|
|
|\( |
|
215
|
|
|
|
|
|
|
|\) |
|
216
|
|
|
|
|
|
|
|.+ |
|
217
|
|
|
|
|
|
|
) |
|
218
|
|
|
|
|
|
|
/x; |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
my $digits_re = qr/($Bond)(\%\d\d|\d)/; |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
my %type_to_order = ( |
|
223
|
|
|
|
|
|
|
'-' => 1, |
|
224
|
|
|
|
|
|
|
'=' => 2, |
|
225
|
|
|
|
|
|
|
'#' => 3, |
|
226
|
|
|
|
|
|
|
'/' => 1, |
|
227
|
|
|
|
|
|
|
'\\' => 1, |
|
228
|
|
|
|
|
|
|
'' => 1, # not strictly true |
|
229
|
|
|
|
|
|
|
'.' => 0, |
|
230
|
|
|
|
|
|
|
); |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
my %ORGANIC_ELEMS = ( |
|
233
|
|
|
|
|
|
|
Br => 1, Cl => 1, B => 3, C => 4, N => 3, O => 2, P => 3, S => 2, |
|
234
|
|
|
|
|
|
|
F => 1, I => 1, s => 1, p => 1, o => 1, n => 1, c => 1, b => 1, |
|
235
|
|
|
|
|
|
|
); |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
#=item Chemistry::Smiles->new([add_atom => \&sub1, add_bond => \&sub2]) |
|
238
|
|
|
|
|
|
|
# |
|
239
|
|
|
|
|
|
|
#Create a SMILES parser. If the add_atom and add_bond subroutine references |
|
240
|
|
|
|
|
|
|
#are given, they will be called whenever an atom or a bond needs to be added |
|
241
|
|
|
|
|
|
|
#to the molecule. If they are not specified, default methods, which |
|
242
|
|
|
|
|
|
|
#create a Chemistry::Mol object, will be used. |
|
243
|
|
|
|
|
|
|
# |
|
244
|
|
|
|
|
|
|
#=cut |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
sub new_parser { |
|
247
|
9
|
|
|
9
|
0
|
160
|
my $class = shift; |
|
248
|
9
|
|
|
|
|
31
|
my %opts = @_; |
|
249
|
9
|
|
100
|
|
|
165
|
my $self = bless { |
|
|
|
|
100
|
|
|
|
|
|
250
|
|
|
|
|
|
|
add_atom => $opts{add_atom} || \&add_atom, |
|
251
|
|
|
|
|
|
|
add_bond => $opts{add_bond} || \&add_bond, |
|
252
|
|
|
|
|
|
|
}, $class; |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
#=item $obj->parse($string, $mol) |
|
256
|
|
|
|
|
|
|
# |
|
257
|
|
|
|
|
|
|
#Parse a Smiles $string. $mol is a "molecule state object". It can be anything; |
|
258
|
|
|
|
|
|
|
#the parser doesn't do anything with it except sending it as the first parameter |
|
259
|
|
|
|
|
|
|
#to the callback functions. If callback functions were not provided when |
|
260
|
|
|
|
|
|
|
#constructing the parser object, $mol must be a Chemistry::Mol object, because |
|
261
|
|
|
|
|
|
|
#that's what the default callback functions require. |
|
262
|
|
|
|
|
|
|
# |
|
263
|
|
|
|
|
|
|
#=cut |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
sub parse { |
|
266
|
64
|
|
|
64
|
0
|
1106
|
my ($self, $s, $mol, $opts) = @_; |
|
267
|
64
|
|
|
|
|
362
|
$self->{stack} = [ undef ]; |
|
268
|
64
|
|
|
|
|
187
|
$self->{digits} = {}; |
|
269
|
|
|
|
|
|
|
|
|
270
|
64
|
|
|
|
|
131
|
eval { |
|
271
|
64
|
|
|
|
|
2247
|
while ($s =~ /$Chain/g) { |
|
272
|
|
|
|
|
|
|
#my @a = ($1, $2, $3, $4, $5, $6, $7, $8); |
|
273
|
|
|
|
|
|
|
#print Dumper(\@a); |
|
274
|
538
|
|
|
|
|
3429
|
my ($all, $bnd, $sym, $iso, $sym2, $chir, $hcnt, $chg, $name, $dig) |
|
275
|
|
|
|
|
|
|
= ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10); |
|
276
|
538
|
100
|
|
|
|
1816
|
if ($all eq '(') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
277
|
58
|
|
|
|
|
167
|
$self->start_branch(); |
|
278
|
|
|
|
|
|
|
} elsif ($all eq ')') { |
|
279
|
58
|
|
|
|
|
169
|
$self->end_branch(); |
|
280
|
|
|
|
|
|
|
} elsif ($sym) { # Simple atom |
|
281
|
8
|
|
|
8
|
|
415
|
no warnings; |
|
|
8
|
|
|
|
|
17
|
|
|
|
8
|
|
|
|
|
810
|
|
|
282
|
409
|
|
|
|
|
1078
|
my @digs = parse_digits($dig); |
|
283
|
409
|
|
|
|
|
1263
|
$self->atom($mol, $bnd, '', $sym, '', undef, '', \@digs); |
|
284
|
|
|
|
|
|
|
} elsif ($sym2) { # Complex atom |
|
285
|
8
|
|
|
8
|
|
43
|
no warnings; |
|
|
8
|
|
|
|
|
17
|
|
|
|
8
|
|
|
|
|
12432
|
|
|
286
|
13
|
|
|
|
|
38
|
my @digs = parse_digits($dig); |
|
287
|
13
|
100
|
|
|
|
38
|
if ($hcnt eq 'H') { |
|
288
|
3
|
|
|
|
|
8
|
$hcnt = 1; |
|
289
|
|
|
|
|
|
|
} else { |
|
290
|
10
|
|
|
|
|
24
|
$hcnt =~ s/H//; |
|
291
|
|
|
|
|
|
|
} |
|
292
|
13
|
50
|
|
|
|
43
|
unless ($chg =~ /\d/) { |
|
293
|
13
|
100
|
|
|
|
44
|
$chg = ($chg =~ /-/) ? -length($chg) : length($chg); |
|
294
|
|
|
|
|
|
|
} |
|
295
|
13
|
|
100
|
|
|
125
|
$self->atom($mol, $bnd, $iso, $sym2, $chir, $hcnt || 0, |
|
|
|
|
100
|
|
|
|
|
|
296
|
|
|
|
|
|
|
$chg || 0, \@digs, $name); |
|
297
|
|
|
|
|
|
|
} else { |
|
298
|
0
|
|
|
|
|
0
|
die "SMILES ERROR: '$all in $s'\n"; |
|
299
|
|
|
|
|
|
|
} |
|
300
|
|
|
|
|
|
|
} |
|
301
|
|
|
|
|
|
|
}; |
|
302
|
|
|
|
|
|
|
# clean up to avoid memory leak |
|
303
|
64
|
|
|
|
|
194
|
$self->{stack} = undef; |
|
304
|
64
|
50
|
|
|
|
191
|
if ($@) { |
|
305
|
0
|
0
|
|
|
|
0
|
croak $@ if $opts->{fatal}; |
|
306
|
0
|
|
|
|
|
0
|
return; |
|
307
|
|
|
|
|
|
|
} |
|
308
|
64
|
|
|
|
|
288
|
$mol; |
|
309
|
|
|
|
|
|
|
} |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
sub parse_digits { |
|
312
|
422
|
|
|
422
|
0
|
592
|
my ($dig) = @_; |
|
313
|
422
|
|
|
|
|
497
|
my @digs; |
|
314
|
422
|
|
100
|
|
|
1705
|
while ($dig && $dig =~ /$digits_re/g) { |
|
315
|
56
|
|
|
|
|
597
|
push @digs, {bnd=>$1, dig=>$2}; |
|
316
|
|
|
|
|
|
|
} |
|
317
|
422
|
|
|
|
|
923
|
@digs; |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
sub atom { |
|
321
|
422
|
|
|
422
|
0
|
523
|
my $self = shift; |
|
322
|
422
|
|
|
|
|
1076
|
my ($mol,$bnd,$iso,$sym,$chir,$hcount,$chg,$digs,$name) = @_; |
|
323
|
|
|
|
|
|
|
#{no warnings; local $" = ','; print "atom(@_)\n"} |
|
324
|
422
|
|
|
|
|
5680
|
my $a = $self->{add_atom}($mol,$iso,$sym,$chir,$hcount,$chg,$name); |
|
325
|
422
|
100
|
|
|
|
1912
|
if($self->{stack}[-1]) { |
|
326
|
358
|
|
|
|
|
2598
|
$self->{add_bond}($mol, $bnd, $self->{stack}[-1], $a); |
|
327
|
|
|
|
|
|
|
} |
|
328
|
422
|
|
|
|
|
1201
|
for my $dig (@$digs) { |
|
329
|
56
|
100
|
|
|
|
192
|
if ($self->{digits}{$dig->{dig}}) { |
|
330
|
28
|
0
|
33
|
|
|
116
|
if ($dig->{bnd} && $self->{digits}{$dig->{dig}}{bnd} |
|
|
|
|
33
|
|
|
|
|
|
331
|
|
|
|
|
|
|
&& $dig->{bnd} ne $self->{digits}{$dig->{dig}}{bnd}){ |
|
332
|
0
|
|
|
|
|
0
|
die "SMILES: Inconsistent ring closure\n"; |
|
333
|
|
|
|
|
|
|
} |
|
334
|
28
|
|
33
|
|
|
240
|
$self->{add_bond}($mol, |
|
335
|
|
|
|
|
|
|
$dig->{bnd} || $self->{digits}{$dig->{dig}}{bnd}, |
|
336
|
|
|
|
|
|
|
$self->{digits}{$dig->{dig}}{atom}, $a); |
|
337
|
28
|
|
|
|
|
167
|
delete $self->{digits}{$dig->{dig}}; |
|
338
|
|
|
|
|
|
|
} else { |
|
339
|
28
|
|
|
|
|
201
|
$self->{digits}{$dig->{dig}} = {atom=>$a, bnd=>$dig->{bnd}}; |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
} |
|
342
|
422
|
|
|
|
|
4856
|
$self->{stack}[-1] = $a; |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
#=back |
|
346
|
|
|
|
|
|
|
# |
|
347
|
|
|
|
|
|
|
#=head1 CALLBACK FUNCTIONS |
|
348
|
|
|
|
|
|
|
# |
|
349
|
|
|
|
|
|
|
#=over |
|
350
|
|
|
|
|
|
|
# |
|
351
|
|
|
|
|
|
|
#=item $atom = add_atom($mol, $iso, $sym, $chir, $hcount, $chg) |
|
352
|
|
|
|
|
|
|
# |
|
353
|
|
|
|
|
|
|
#Called by the parser whenever an atom is found. The first parameter is the |
|
354
|
|
|
|
|
|
|
#state object given to $obj->parse(). The other parameters are the isotope, |
|
355
|
|
|
|
|
|
|
#symbol, chirality, hydrogen count, and charge of the atom. Only the symbol is |
|
356
|
|
|
|
|
|
|
#guaranteed to be defined. Mnemonic: the parameters are given in the same order |
|
357
|
|
|
|
|
|
|
#that is used in a SMILES string (such as [18OH-]). This callback is expected to |
|
358
|
|
|
|
|
|
|
#return something that uniquely identifies the atom that was created (it might |
|
359
|
|
|
|
|
|
|
#be a number, a string, or an object). |
|
360
|
|
|
|
|
|
|
# |
|
361
|
|
|
|
|
|
|
#=cut |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# Default add_atom callback |
|
364
|
|
|
|
|
|
|
sub add_atom { |
|
365
|
378
|
|
|
378
|
0
|
704
|
my ($mol, $iso, $sym, $chir, $hcount, $chg, $name) = @_; |
|
366
|
378
|
|
|
|
|
1518
|
my $atom = $mol->new_atom(symbol => ucfirst $sym, name => $name); |
|
367
|
378
|
50
|
|
|
|
31821
|
$iso && $atom->attr('smiles/isotope' => $iso); |
|
368
|
378
|
50
|
|
|
|
720
|
$iso && $atom->mass($iso); |
|
369
|
378
|
50
|
|
|
|
704
|
$chir && $atom->attr('smiles/chirality' => $chir); |
|
370
|
378
|
100
|
|
|
|
813
|
defined $hcount && $atom->hydrogens($hcount); |
|
371
|
378
|
100
|
|
|
|
792
|
$chg && $atom->formal_charge($chg); |
|
372
|
378
|
100
|
|
|
|
1168
|
if ($sym =~ /^[a-z]/) { |
|
373
|
71
|
|
|
|
|
380
|
$atom->attr("smiles/aromatic", 1); |
|
374
|
|
|
|
|
|
|
} |
|
375
|
378
|
|
|
|
|
1594
|
$atom; |
|
376
|
|
|
|
|
|
|
} |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
#=item add_bond($mol, $type, $a1, $a2) |
|
379
|
|
|
|
|
|
|
# |
|
380
|
|
|
|
|
|
|
#Called by the parser whenever a bond needs to be created. The first parameter |
|
381
|
|
|
|
|
|
|
#is the state object given to $obj->parse(). The other parameters are the bond |
|
382
|
|
|
|
|
|
|
#type and the two atoms that need to be bonded. The atoms are identified using |
|
383
|
|
|
|
|
|
|
#the return values from the add_atom() callback. |
|
384
|
|
|
|
|
|
|
# |
|
385
|
|
|
|
|
|
|
#=back |
|
386
|
|
|
|
|
|
|
# |
|
387
|
|
|
|
|
|
|
#=end comment |
|
388
|
|
|
|
|
|
|
# |
|
389
|
|
|
|
|
|
|
#=cut |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# Default add_bond callback |
|
392
|
|
|
|
|
|
|
sub add_bond { |
|
393
|
341
|
|
|
341
|
0
|
635
|
my ($mol, $type, $a1, $a2) = @_; |
|
394
|
341
|
100
|
|
|
|
1006
|
my $order = $type_to_order{$type} or return; # don't add bonds of order 0 |
|
395
|
333
|
|
|
|
|
3488
|
my $bond = $mol->new_bond(type=>$type, atoms=>[$a1, $a2], order=>$order); |
|
396
|
333
|
|
|
|
|
53981
|
$bond->attr("smiles/type" => $type); |
|
397
|
333
|
|
|
|
|
4308
|
$bond; |
|
398
|
|
|
|
|
|
|
} |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
sub start_branch { |
|
401
|
58
|
|
|
58
|
0
|
98
|
my $self = shift; |
|
402
|
|
|
|
|
|
|
#print "start_branch\n"; |
|
403
|
58
|
|
|
|
|
78
|
push @{$self->{stack}}, $self->{stack}[-1]; |
|
|
58
|
|
|
|
|
599
|
|
|
404
|
|
|
|
|
|
|
} |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
sub end_branch { |
|
407
|
58
|
|
|
58
|
0
|
88
|
my $self = shift; |
|
408
|
|
|
|
|
|
|
#print "end_branch\n"; |
|
409
|
58
|
|
|
|
|
86
|
pop @{$self->{stack}}; |
|
|
58
|
|
|
|
|
561
|
|
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# returns the number of hydrogens for an atom, assuming it has |
|
413
|
|
|
|
|
|
|
# no charge or radical (because those require an explicit H-count anyway) |
|
414
|
|
|
|
|
|
|
sub calc_implicit_hydrogens { |
|
415
|
369
|
|
|
369
|
0
|
508
|
my ($self, $atom) = @_; |
|
416
|
8
|
|
|
8
|
|
59
|
no warnings 'uninitialized'; |
|
|
8
|
|
|
|
|
13
|
|
|
|
8
|
|
|
|
|
39603
|
|
|
417
|
369
|
|
|
|
|
906
|
my $h_count = $ORGANIC_ELEMS{$atom->symbol} - $atom->valence; |
|
418
|
369
|
100
|
100
|
|
|
14714
|
if ($atom->attr("smiles/aromatic") and $atom->symbol =~ /^[CN]$/) { |
|
419
|
70
|
|
|
|
|
1125
|
$h_count--; |
|
420
|
|
|
|
|
|
|
} |
|
421
|
369
|
100
|
|
|
|
4005
|
$h_count = 0 if $h_count < 0; |
|
422
|
369
|
|
|
|
|
636
|
$h_count; |
|
423
|
|
|
|
|
|
|
} |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# returns the number of hydrogens that an atom should have, |
|
426
|
|
|
|
|
|
|
# taking into account that it may or may not have a few hydrogens |
|
427
|
|
|
|
|
|
|
# defined already. This assumes that the atom is neutral and not radical |
|
428
|
|
|
|
|
|
|
sub calc_implicit_hydrogens_2 { |
|
429
|
407
|
|
|
407
|
0
|
18441
|
my ($self, $atom) = @_; |
|
430
|
407
|
|
|
|
|
1198
|
my $h_count = $ORGANIC_ELEMS{$atom->symbol} - $atom->valence |
|
431
|
|
|
|
|
|
|
+ $atom->total_hydrogens; |
|
432
|
407
|
100
|
|
|
|
27062
|
$h_count = 0 if $h_count < 0; |
|
433
|
407
|
|
|
|
|
2562
|
$h_count; |
|
434
|
|
|
|
|
|
|
} |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
sub add_implicit_hydrogens { |
|
437
|
57
|
|
|
57
|
0
|
108
|
my ($self, $mol) = @_; |
|
438
|
57
|
|
|
|
|
1504
|
for my $atom ($mol->atoms) { |
|
439
|
|
|
|
|
|
|
#print "H=".$atom->hydrogens."\n"; |
|
440
|
378
|
100
|
|
|
|
4129
|
unless (defined $atom->hydrogens) { |
|
441
|
369
|
|
|
|
|
2437
|
my $h_count = $self->calc_implicit_hydrogens($atom); |
|
442
|
369
|
|
|
|
|
1017
|
$atom->hydrogens($h_count); |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
} |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
##### SMILES WRITER ######## |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
sub write_string { |
|
450
|
59
|
|
|
59
|
1
|
12910
|
my ($self, $mol_ref, %opts) = @_; |
|
451
|
|
|
|
|
|
|
|
|
452
|
59
|
|
|
|
|
117
|
my $eol; |
|
453
|
|
|
|
|
|
|
my @mols; |
|
454
|
59
|
100
|
|
|
|
210
|
if ($opts{mols}) { |
|
455
|
1
|
|
|
|
|
2
|
@mols = @{$opts{mols}}; |
|
|
1
|
|
|
|
|
4
|
|
|
456
|
1
|
|
|
|
|
3
|
$eol = "\n"; |
|
457
|
|
|
|
|
|
|
} else { |
|
458
|
58
|
|
|
|
|
194
|
@mols = $mol_ref; |
|
459
|
58
|
|
|
|
|
129
|
$eol = ""; |
|
460
|
|
|
|
|
|
|
} |
|
461
|
|
|
|
|
|
|
|
|
462
|
59
|
|
|
|
|
96
|
my $smiles; |
|
463
|
59
|
|
|
|
|
159
|
for my $mol (@mols) { |
|
464
|
62
|
|
|
|
|
3507
|
$mol = $mol->clone; |
|
465
|
62
|
|
|
|
|
31012
|
$mol->collapse_hydrogens; |
|
466
|
62
|
|
|
|
|
18876
|
my @atoms = $mol->atoms; |
|
467
|
|
|
|
|
|
|
|
|
468
|
62
|
50
|
|
|
|
613
|
if (@atoms) { |
|
469
|
62
|
|
|
|
|
100
|
my $i; |
|
470
|
62
|
100
|
|
|
|
204
|
if ($opts{auto_number}) { |
|
471
|
1
|
|
|
|
|
7
|
$_->name(++$i) for @atoms; |
|
472
|
1
|
|
|
|
|
53
|
$opts{number} = 1; |
|
473
|
|
|
|
|
|
|
} |
|
474
|
62
|
100
|
|
|
|
267
|
if ($opts{unique}) { |
|
475
|
15
|
50
|
|
|
|
56
|
unless ($atoms[0]->attr("canon/class")) { |
|
476
|
15
|
|
|
|
|
2903
|
require Chemistry::Canonicalize; |
|
477
|
15
|
|
|
|
|
128938
|
Chemistry::Canonicalize::canonicalize($mol); |
|
478
|
|
|
|
|
|
|
} |
|
479
|
15
|
|
|
|
|
190286
|
$opts{aromatic} = 1; # all unique smiles have to be aromatic |
|
480
|
171
|
|
|
|
|
1839
|
@atoms = sort { |
|
481
|
15
|
|
|
|
|
86
|
$a->attr("canon/class") <=> $b->attr("canon/class") |
|
482
|
|
|
|
|
|
|
} @atoms; |
|
483
|
|
|
|
|
|
|
} |
|
484
|
|
|
|
|
|
|
|
|
485
|
62
|
100
|
|
|
|
329
|
if ($opts{aromatic}) { |
|
486
|
61
|
|
|
|
|
11510
|
require Chemistry::Ring; |
|
487
|
61
|
|
|
|
|
39729
|
Chemistry::Ring::aromatize_mol($mol); |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
|
|
490
|
62
|
|
|
|
|
112296
|
my $visited = {}; |
|
491
|
62
|
|
|
|
|
119
|
my @s; |
|
492
|
62
|
|
|
|
|
147
|
for my $atom (@atoms) { |
|
493
|
414
|
100
|
|
|
|
5342
|
next if $visited->{$atom}; |
|
494
|
70
|
|
|
|
|
719
|
my $ring_atoms = {}; |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
# first pass to find and number the ring bonds |
|
497
|
70
|
|
|
|
|
413
|
$self->find_ring_bonds($mol, \%opts, $atom, undef, {}, $ring_atoms); |
|
498
|
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
# second pass to actually generate the SMILES string |
|
500
|
70
|
|
|
|
|
1767
|
push @s, $self->branch($mol, \%opts, $atom, undef, $visited, $ring_atoms); |
|
501
|
|
|
|
|
|
|
} |
|
502
|
62
|
|
|
|
|
744
|
$smiles .= join '.', @s; |
|
503
|
|
|
|
|
|
|
} |
|
504
|
|
|
|
|
|
|
|
|
505
|
62
|
100
|
|
|
|
200
|
if ($opts{name}) { |
|
506
|
8
|
|
|
|
|
36
|
$smiles .= "\t" . $mol->name; |
|
507
|
|
|
|
|
|
|
} |
|
508
|
62
|
|
|
|
|
299
|
$smiles .= $eol; |
|
509
|
|
|
|
|
|
|
} |
|
510
|
59
|
|
|
|
|
3764
|
return $smiles; |
|
511
|
|
|
|
|
|
|
} |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
sub find_ring_bonds { |
|
514
|
414
|
|
|
414
|
0
|
989
|
my ($self, $mol, $opts, $atom, $from_bond, $visited, $ring_atoms) = @_; |
|
515
|
|
|
|
|
|
|
|
|
516
|
414
|
|
|
|
|
1022
|
$visited->{$atom} = 1; |
|
517
|
414
|
|
|
|
|
4051
|
for my $bn ($self->sorted_bonds_neighbors($atom, $opts)) { |
|
518
|
736
|
|
|
|
|
3993
|
my $nei = $bn->{to}; |
|
519
|
736
|
|
|
|
|
932
|
my $bond = $bn->{bond}; |
|
520
|
736
|
100
|
|
|
|
2123
|
next if $visited->{$bond}; |
|
521
|
368
|
|
|
|
|
3336
|
$visited->{$bond} = 1; |
|
522
|
368
|
100
|
|
|
|
3870
|
if ($visited->{$nei}) { # closed ring |
|
523
|
|
|
|
|
|
|
#print "closing ring\n"; |
|
524
|
24
|
|
|
|
|
236
|
$ring_atoms->{$nei}++; |
|
525
|
|
|
|
|
|
|
} else { |
|
526
|
344
|
|
|
|
|
3097
|
$self->find_ring_bonds($mol, $opts, $nei, $bond, $visited, $ring_atoms); |
|
527
|
|
|
|
|
|
|
} |
|
528
|
|
|
|
|
|
|
} |
|
529
|
|
|
|
|
|
|
} |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
sub branch { |
|
532
|
414
|
|
|
414
|
0
|
816
|
my ($self, $mol, $opts, $atom, $from_bond, $visited, $digits) = @_; |
|
533
|
|
|
|
|
|
|
|
|
534
|
414
|
|
|
|
|
597
|
my $prev_branch = ""; |
|
535
|
414
|
|
|
|
|
428
|
my $smiles; |
|
536
|
414
|
|
|
|
|
1092
|
$smiles .= $self->bond_symbol($from_bond, $opts); |
|
537
|
|
|
|
|
|
|
#$digits->{count}++; |
|
538
|
414
|
|
|
|
|
3425
|
$smiles .= $self->format_atom($atom, $opts); |
|
539
|
414
|
100
|
|
|
|
1833
|
if ($digits->{$atom}) { # opening a ring |
|
540
|
24
|
|
|
|
|
202
|
my @d; |
|
541
|
24
|
|
|
|
|
74
|
for (1 .. $digits->{$atom}) { |
|
542
|
24
|
|
|
|
|
5860
|
push @d, $self->next_digit($digits); |
|
543
|
|
|
|
|
|
|
} |
|
544
|
24
|
|
|
|
|
98
|
$digits->{$atom} = \@d; |
|
545
|
24
|
50
|
|
|
|
255
|
$smiles .= join "", map { $_ < 10 ? $_ : "%$_"} @d; |
|
|
24
|
|
|
|
|
148
|
|
|
546
|
|
|
|
|
|
|
} |
|
547
|
|
|
|
|
|
|
|
|
548
|
414
|
|
|
|
|
3718
|
$visited->{$atom} = 1; |
|
549
|
414
|
|
|
|
|
3704
|
my @bns = $self->sorted_bonds_neighbors($atom, $opts); |
|
550
|
|
|
|
|
|
|
|
|
551
|
414
|
|
|
|
|
840
|
for my $bn (@bns) { |
|
552
|
736
|
|
|
|
|
3357
|
my $nei = $bn->{to}; |
|
553
|
736
|
|
|
|
|
949
|
my $bond = $bn->{bond}; |
|
554
|
736
|
100
|
|
|
|
1728
|
next if $visited->{$bond}; |
|
555
|
392
|
100
|
|
|
|
3601
|
if ($visited->{$nei}) { # closed a ring |
|
556
|
24
|
|
|
|
|
181
|
my $digit = shift @{$digits->{$nei}}; |
|
|
24
|
|
|
|
|
61
|
|
|
557
|
24
|
|
|
|
|
310
|
$smiles .= $self->bond_symbol($bond, $opts); |
|
558
|
24
|
50
|
|
|
|
244
|
$smiles .= $digit < 10 ? $digit : "%$digit"; |
|
559
|
24
|
|
|
|
|
64
|
$digits->{used_digits}[$digit] = 0; # free for future use |
|
560
|
24
|
|
|
|
|
68
|
$visited->{$bond} = 1; |
|
561
|
|
|
|
|
|
|
} |
|
562
|
|
|
|
|
|
|
} |
|
563
|
|
|
|
|
|
|
|
|
564
|
414
|
|
|
|
|
4477
|
for my $bn (@bns) { |
|
565
|
736
|
|
|
|
|
2702
|
my $nei = $bn->{to}; |
|
566
|
736
|
|
|
|
|
1573
|
my $bond = $bn->{bond}; |
|
567
|
736
|
100
|
|
|
|
1650
|
next if $visited->{$bond}; |
|
568
|
344
|
|
|
|
|
2968
|
$visited->{$bond} = 1; |
|
569
|
344
|
50
|
|
|
|
4079
|
unless ($visited->{$nei}) { |
|
570
|
344
|
|
|
|
|
3931
|
my $branch = $self->branch($mol, $opts, $nei, $bond, $visited, $digits); |
|
571
|
344
|
100
|
|
|
|
848
|
if ($prev_branch) { |
|
572
|
54
|
|
|
|
|
106
|
$smiles .= "($prev_branch)"; |
|
573
|
|
|
|
|
|
|
} |
|
574
|
344
|
|
|
|
|
871
|
$prev_branch = $branch; |
|
575
|
|
|
|
|
|
|
} |
|
576
|
|
|
|
|
|
|
} |
|
577
|
414
|
|
|
|
|
3454
|
$smiles .= "$prev_branch"; |
|
578
|
414
|
|
|
|
|
1506
|
$smiles; |
|
579
|
|
|
|
|
|
|
} |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sub next_digit { |
|
582
|
24
|
|
|
24
|
0
|
83
|
my ($self, $digits) = @_; |
|
583
|
24
|
|
|
|
|
122
|
for (my $i = 1; $i < 100; $i++) { |
|
584
|
26
|
100
|
|
|
|
123
|
unless ($digits->{used_digits}[$i]) { |
|
585
|
24
|
|
|
|
|
59
|
$digits->{used_digits}[$i] = 1; # mark as used |
|
586
|
24
|
|
|
|
|
97
|
return $i; |
|
587
|
|
|
|
|
|
|
} |
|
588
|
|
|
|
|
|
|
} |
|
589
|
0
|
|
|
|
|
0
|
die "no more available smiles digits!"; # shouldn't happen |
|
590
|
|
|
|
|
|
|
} |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
sub sorted_bonds_neighbors { |
|
593
|
828
|
|
|
828
|
0
|
1693
|
my ($self, $atom, $opts) = @_; |
|
594
|
828
|
|
|
|
|
2089
|
my @bn = $atom->bonds_neighbors; |
|
595
|
828
|
100
|
|
|
|
13583
|
if ($opts->{unique}) { |
|
596
|
160
|
|
|
|
|
1425
|
@bn = sort { |
|
597
|
190
|
|
|
|
|
454
|
$a->{to}->attr("canon/class") <=> $b->{to}->attr("canon/class") |
|
598
|
|
|
|
|
|
|
} @bn; |
|
599
|
|
|
|
|
|
|
} |
|
600
|
828
|
|
|
|
|
3598
|
@bn; |
|
601
|
|
|
|
|
|
|
} |
|
602
|
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
my %ORDER_TO_TYPE = ( |
|
604
|
|
|
|
|
|
|
2 => '=', 1 => '', 3 => '#', |
|
605
|
|
|
|
|
|
|
); |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
sub bond_symbol { |
|
608
|
438
|
|
|
438
|
0
|
644
|
my ($self, $bond, $opts) = @_; |
|
609
|
438
|
100
|
|
|
|
1224
|
return '' unless $bond; |
|
610
|
368
|
100
|
100
|
|
|
3055
|
return '' if $opts->{aromatic} && $bond->aromatic; |
|
611
|
273
|
|
|
|
|
2442
|
return $ORDER_TO_TYPE{$bond->order}; |
|
612
|
|
|
|
|
|
|
} |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
sub format_atom { |
|
615
|
414
|
|
|
414
|
0
|
639
|
my ($self, $atom, $opts) = @_; |
|
616
|
|
|
|
|
|
|
|
|
617
|
414
|
|
|
|
|
1071
|
my $symbol = $atom->symbol; |
|
618
|
414
|
100
|
100
|
|
|
3751
|
$symbol = lc $symbol if $opts->{aromatic} && $atom->aromatic; |
|
619
|
414
|
|
|
|
|
2873
|
my $s = $symbol; |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
# unless atom is "simple"... |
|
622
|
414
|
100
|
100
|
|
|
1125
|
if (!$ORGANIC_ELEMS{$atom->symbol} || $atom->formal_charge |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
623
|
|
|
|
|
|
|
|| $atom->total_hydrogens != $self->calc_implicit_hydrogens_2($atom) |
|
624
|
|
|
|
|
|
|
|| ($opts->{number} && defined $atom->name) |
|
625
|
|
|
|
|
|
|
) { |
|
626
|
|
|
|
|
|
|
# "complex atom"; bracketed |
|
627
|
15
|
|
|
|
|
131
|
my $h_count = $atom->hydrogens; |
|
628
|
15
|
|
100
|
|
|
88
|
my $charge = $atom->formal_charge || ''; |
|
629
|
15
|
|
50
|
|
|
130
|
my $iso = $atom->attr("smiles/isotope") || ''; |
|
630
|
15
|
|
|
|
|
147
|
my $number = ''; |
|
631
|
|
|
|
|
|
|
|
|
632
|
15
|
50
|
66
|
|
|
73
|
if ($charge and abs($charge) > 1) { |
|
|
|
100
|
|
|
|
|
|
|
633
|
0
|
|
|
|
|
0
|
$charge = sprintf("%+d", $charge); |
|
634
|
|
|
|
|
|
|
} elsif ($charge) { |
|
635
|
1
|
50
|
|
|
|
8
|
$charge = $charge > 0 ? '+' : '-'; |
|
636
|
|
|
|
|
|
|
} |
|
637
|
|
|
|
|
|
|
|
|
638
|
15
|
100
|
|
|
|
42
|
$h_count = $h_count ? ($h_count > 1 ? "H$h_count" : 'H') : ''; |
|
|
|
100
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
|
|
640
|
15
|
100
|
66
|
|
|
64
|
$number = ':' . $atom->name if $opts->{number} and defined $atom->name; |
|
641
|
|
|
|
|
|
|
|
|
642
|
15
|
|
|
|
|
157
|
$s = "[$iso$symbol$h_count$charge$number]"; |
|
643
|
|
|
|
|
|
|
} |
|
644
|
414
|
|
|
|
|
1216
|
$s; |
|
645
|
|
|
|
|
|
|
} |
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
1; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=head1 CAVEATS |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
Stereochemistry is not supported! Stereochemical descriptors such as @, @@, /, |
|
653
|
|
|
|
|
|
|
and \ will be silently ignored on input, and will certainly not be produced on |
|
654
|
|
|
|
|
|
|
output. |
|
655
|
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
Reading branches that start before an atom, such as (OC)C, which should be |
|
657
|
|
|
|
|
|
|
equivalent to C(OC) and COC, according to some variants of the SMILES |
|
658
|
|
|
|
|
|
|
specification, is not supported. Many other tools don't implement this rule either. |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
The kekulize option works by increasing the bond orders of atoms that don't |
|
661
|
|
|
|
|
|
|
have their usual valences satisfied. This may cause problems if you have atoms |
|
662
|
|
|
|
|
|
|
with explicitly low hydrogen counts. |
|
663
|
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=head1 VERSION |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
0.47 |
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
L<Chemistry::Mol>, L<Chemistry::File> |
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
The SMILES Home Page at http://www.daylight.com/dayhtml/smiles/ |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
The Daylight Theory Manual at |
|
675
|
|
|
|
|
|
|
http://www.daylight.com/dayhtml/doc/theory/theory.smiles.html |
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
The PerlMol website L<http://www.perlmol.org/> |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
=head1 AUTHOR |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
Ivan Tubert-Brohman E<lt>itub@cpan.orgE<gt> |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is |
|
686
|
|
|
|
|
|
|
free software; you can redistribute it and/or modify it under the same terms as |
|
687
|
|
|
|
|
|
|
Perl itself. |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=cut |
|
690
|
|
|
|
|
|
|
|