File Coverage

blib/lib/Chemistry/File/PDB.pm
Criterion Covered Total %
statement 81 97 83.5
branch 18 36 50.0
condition 6 13 46.1
subroutine 12 12 100.0
pod 4 4 100.0
total 121 162 74.6


line stmt bran cond sub pod time code
1             package Chemistry::File::PDB;
2              
3             our $VERSION = '0.23';
4             # $Id: PDB.pm,v 1.13 2009/05/10 21:57:32 itubert Exp $
5              
6 2     2   85539 use base qw(Chemistry::File);
  2         4  
  2         3194  
7 2     2   75433 use Chemistry::MacroMol;
  2         178360  
  2         291  
8 2     2   3699 use Chemistry::Domain;
  2         1870  
  2         99  
9 2     2   13 use Scalar::Util qw(weaken);
  2         4  
  2         291  
10 2     2   12 use Carp;
  2         6  
  2         221  
11 2     2   10 use strict;
  2         5  
  2         55  
12 2     2   13 use warnings;
  2         85  
  2         3103  
13              
14             =head1 NAME
15              
16             Chemistry::File::PDB - Protein Data Bank file format reader/writer
17              
18             =head1 SYNOPSIS
19              
20             use Chemistry::File::PDB;
21              
22             # read a PDB file
23             my $macro_mol = Chemistry::MacroMol->read("myfile.pdb");
24              
25             # write a PDB file
26             $macro_mol->write("out.pdb");
27              
28             # read all models in a multi-model file
29             my @mols = Chemistry::MacroMol->read("models.pdb");
30              
31             # read one model at a time
32             my $file = Chemistry::MacroMol->file("models.pdb");
33             $file->open;
34             while (my $mol = $file->read_mol($file->fh)) {
35             # do something with $mol
36             }
37              
38             =cut
39              
40             =head1 DESCRIPTION
41              
42             This module reads and writes PDB files. The PDB file format is commonly used to
43             describe proteins, particularly those stored in the Protein Data Bank
44             (L). The current version of this module only reads the
45             following record types, ignoring everything else:
46              
47             ATOM
48             HETATM
49             ENDMDL
50             END
51              
52             This module automatically registers the 'pdb' format with Chemistry::Mol,
53             so that PDB files may be identified and read by Chemistry::Mol->read(). For
54             autodetection purpuses, it assumes that files ending in .pdb or having
55             a line matching /^(ATOM |HETATM)/ are PDB files.
56              
57             The PDB reader and writer is designed for dealing with Chemistry::MacroMol
58             objects, but it can also create and use Chemistry::Mol objects by throwing some
59             information away.
60              
61             =head2 Properties
62              
63             When reading and writing files, this module stores or gets some of the
64             information in the following places:
65              
66             =over
67              
68             =item $domain->type
69              
70             The residue type, such as "ARG".
71              
72             =item $domain->name
73              
74             The type and sequence number, such as "ARG114".
75              
76             =item $domain->attr("pdb/sequence_number")
77              
78             The residue sequence number as given in the PDB file.
79              
80             =item $domain->attr("pdb/chain_id")
81              
82             The chain to which this residue belongs (one character).
83              
84             =item $domain->attr("pdb/insertion_code")
85              
86             The residue insertion code (see the PDB specification for details).
87              
88             =item $atom->name
89              
90             The PDB atom name, such as "CA".
91              
92             =item $atom->attr("pdb/residue_name")
93              
94             The name of the residue, as discussed above.
95              
96             =item $atom->attr("pdb/serial_number")
97              
98             The serial number for the atom, as given in the PDB file.
99              
100             =back
101              
102             If some of this information is not available when writing a PDB file, this
103             module tries to make it up (by counting the atoms or residues, for example).
104             The default residue name for writing is UNK (unknown). Atom names are just the
105             atomic symbols.
106              
107             =head2 Multi-model files
108              
109             If a PDB file has multiple models (separated by END or ENDMDL records), each
110             call to read_mol will return one model.
111              
112             =head2 Output features
113              
114             On writing Chemistry::Mol objects, which don't have macromolecule information
115             and usually don't have atom names, the atom names are made up by concatenating
116             the atomic symbol with a unique ID (up to 1296 atoms are possible). The ID can
117             be disabled by setting the option 'noid':
118              
119             $mol->write("out.pdb", noid => 1);
120              
121             The molecule's name is written as a HEADER record; A REMARK record is added
122             listing the version of Chemistry::File::PDB that was used.
123              
124             =cut
125              
126             Chemistry::Mol->register_format(pdb => __PACKAGE__);
127              
128             sub read_mol {
129 6     6 1 1232 my ($self, $fh, %options) = @_;
130 6 100       65 return if $fh->eof;
131              
132 3         109 my $domain;
133 3   50     15 my $mol_class = $options{mol_class} || "Chemistry::MacroMol";
134 3         31 my $mol = $mol_class->new;
135 3         425 my $is_macro = $mol->isa('Chemistry::MacroMol');
136 3 100       13 $domain = $mol unless $is_macro;
137              
138 3         7 local $_;
139 3         9 my $curr_residue = 0;
140 3         24 while (<$fh>) {
141 423 100       13408 if (/^TER/) {
    100          
    50          
142             # TODO read separated molecules?
143             } elsif (/^(?:HETATM|ATOM)/) {
144 417         4415 my ($atom_n, $symbol, $suff, $res_name, $chain_id,
145             $seq_n, $ins_code, $x, $y, $z
146             ) = unpack "x6A5x1A2A2x1A3x1A1A4A1x3A8A8A8", $_;
147             #print "S:$symbol; N:$name; x:$x; y:$y; z:$z\n";
148 417         866 $seq_n *= 1;
149 417 100 100     1751 if (!$domain || $seq_n != $curr_residue) {
150             # new residue
151 30 100       322 if ($is_macro) {
152 20         156 $domain = Chemistry::Domain->new(
153             parent => $mol, name => "$res_name$seq_n",
154             type => $res_name);
155 20         2063 $mol->add_domain($domain);
156 20         261 weaken $domain;
157 20         70 $domain->attr('pdb/sequence_number', $seq_n);
158 20         418 $domain->attr('pdb/chain_id', $chain_id);
159 20         453 $domain->attr('pdb/insertion_code', $ins_code);
160             }
161 30         230 $curr_residue = $seq_n;
162             }
163 417         5492 $symbol =~ s/ //g;
164 417         879 my $atom_name = $symbol.$suff;
165 417         537 $atom_name =~ s/ //g;
166 417         861 $symbol = ucfirst(lc($symbol));
167 417         3259 my $a = $domain->new_atom(
168             symbol => $symbol,
169             coords => [$x, $y, $z],
170             name => $atom_name,
171             );
172 417         74307 $a->attr('pdb/residue_name', "$res_name$seq_n");
173 417         7682 $a->attr('pdb/serial_number', $atom_n*1);
174             #$a->attr('pdb/residue', $domain); # XXX causes memory leak
175             } elsif (/^END(MDL)?/) {
176 3         64 return $mol;
177             }
178             }
179 0         0 return $mol;
180             }
181              
182              
183             sub name_is {
184 1     1 1 180 my ($class, $fname) = @_;
185 1         8 $fname =~ /\.pdb$/i;
186             }
187              
188             sub file_is {
189 3     3 1 1007 my ($class, $fname) = @_;
190            
191 3 50       30 return 1 if $fname =~ /\.pdb$/i;
192              
193 0 0       0 open F, $fname or croak "Could not open file $fname";
194            
195 0         0 while (){
196 0 0 0     0 if (/^ATOM / or /^HETATM/) {
197 0         0 close F;
198 0         0 return 1;
199             }
200             }
201              
202 0         0 return 0;
203             }
204              
205             sub write_string {
206 1     1 1 350 my ($class, $mol, %opts) = @_;
207 1         3 my $ret = '';
208 1 50       11 $ret .= 'TITLE ' . $mol->name . "\n" if defined $mol->name;
209 1         20 $ret .= 'REMARK Generated by ' . __PACKAGE__ . " version $VERSION\n";
210 1 50       8 if ($mol->isa("Chemistry::MacroMol")) {
211 1         2 my $i = 1;
212 1         2 my $j = 1;
213 1         7 for my $res ($mol->domains) {
214 10         47 my $seq_n = $res->attr("pdb/sequence_number");
215 10 50       101 $seq_n = $j++ unless defined $seq_n;
216 10   50     29 my $res_name = substr($res->name, 0, 3) || "UNK";
217              
218 10         95 for my $atom ($res->atoms) {
219 139         1880 my $serial_n = $res->attr("pdb/serial_number");
220 139         1366 my $ins_code = $res->attr("pdb/insertion_code");
221 139 50       1773 $serial_n = $i++ unless defined $serial_n;
222 139         430 my @coords = $atom->coords->array;
223              
224             # make atom name
225 2     2   16 no warnings 'uninitialized';
  2         5  
  2         1221  
226 139         2008 my $symbol = substr $atom->symbol, 0, 2;
227 139   33     1082 my $atom_name = $atom->name || $symbol;
228 139         2495 $atom_name =~ /^(\dH|$symbol)(.{0,2})/;
229             #print "NAME: '$atom_name' ($1,$2); SYMBOL: '$symbol'\n";
230 139         521 $atom_name = sprintf "%2s%-2s", $1, $2;
231 139         1574 $ret .= sprintf "ATOM %5d %4s %-3s %4d%1s %8.3f%8.3f%8.3f\n",
232             $serial_n, $atom_name, $res_name, $seq_n, $ins_code, @coords;
233             }
234             }
235             } else {
236 0         0 my $i = 1;
237 0         0 for my $atom ($mol->atoms) {
238 0         0 my @coords = $atom->coords->array;
239 0 0       0 if ($atom->name) {
240 0         0 $ret .= sprintf "ATOM %5d %4s %-3s %4d %8.3f%8.3f%8.3f\n",
241             $i++, $atom->name, "UNK", 1, @coords;
242             } else {
243 0         0 my ($i0, $i1) = ($i % 36, int($i/36));
244 0 0       0 my $id = ($i1 < 10 ? $i1 : chr(55+$i1))
    0          
245             . ($i0 < 10 ? $i0 : chr(55+$i0));
246 0 0       0 $id = ' ' if $opts{noid};
247 0         0 $ret .= sprintf "HETATM%5d %2s%2s UNK 1 %8.3f%8.3f%8.3f\n",
248             $i++, $atom->symbol, $id, @coords;
249             }
250             }
251             }
252 1         5 $ret .= "TER\nEND\n";
253 1         49 $ret;
254             }
255              
256             1;
257              
258             =head1 VERSION
259              
260             0.23
261              
262             =head1 SEE ALSO
263              
264             L, L, L,
265             L.
266              
267             The PDB format description at
268             L
269              
270             There is another PDB reader in Perl, as part of the BioPerl project:
271             L.
272              
273             =head1 AUTHOR
274              
275             Ivan Tubert-Brohman
276              
277             =head1 COPYRIGHT
278              
279             Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is
280             free software; you can redistribute it and/or modify it under the same terms as
281             Perl itself.
282              
283             =cut
284              
285              
286