File Coverage

blib/lib/Chemistry/File.pm
Criterion Covered Total %
statement 111 147 75.5
branch 33 60 55.0
condition 16 27 59.2
subroutine 23 30 76.6
pod 20 21 95.2
total 203 285 71.2


line stmt bran cond sub pod time code
1             package Chemistry::File;
2             $VERSION = '0.37';
3              
4             =head1 NAME
5              
6             Chemistry::File - Molecule file I/O base class
7              
8             =head1 SYNOPSIS
9              
10             # As a convenient interface for several mol readers:
11             use Chemistry::File qw(PDB MDLMol); # load PDB and MDL modules
12            
13             # or try to use every file I/O module installed in the system:
14             use Chemistry::File ':auto';
15              
16             my $mol1 = Chemistry::Mol->read("file.pdb");
17             my $mol2 = Chemistry::Mol->read("file.mol");
18              
19              
20             # as a base for a mol reader:
21              
22             package Chemistry::File::Myfile;
23             use base qw(Chemistry::File);
24             Chemistry::Mol->register_type("myfile", __PACKAGE__);
25              
26             # override the read_mol method
27             sub read_mol {
28             my ($self, $fh, %opts) = @_;
29             my $mol_class = $opts{mol_class} || "Chemistry::Mol";
30             my $mol = $mol_class->new;
31             # ... do some stuff with $fh and $mol ...
32             return $mol;
33             }
34              
35             # override the write_mol method
36             sub write_mol {
37             my ($self, $fh, $mol, %opts) = @_;
38             print $fh $mol->name, "\n";
39             # ... do some stuff with $fh and $mol ...
40             }
41              
42             =head1 DESCRIPTION
43              
44             The main use of this module is as a base class for other molecule file I/O
45             modules (for example, Chemistry::File::PDB). Such modules should override and
46             extend the Chemistry::File methods as needed. You only need to care about the
47             methods here if you are writing a file I/O module or if you want a finer
48             degree of control than what is offered by the simple read and write methods
49             in the Chemistry::Mol class.
50              
51             From the user's point of view, this module can also be used as shorthand
52             for using several Chemistry::File modules at the same time.
53              
54             use Chemistry::File qw(PDB MDLMol);
55              
56             is exactly equivalent to
57              
58             use Chemistry::File::PDB;
59             use Chemistry::File::MDLMol;
60              
61             If you use the :auto keyword, Chemistry::File will autodetect and load
62             all the Chemistry::File::* modules installed in your system.
63              
64             use Chemistry::File ':auto';
65              
66             =head1 FILE I/O MODEL
67              
68             Before version 0.30, file I/O modules typically used only parse_string,
69             write_string, parse_file, and write_file, and they were generally used as class
70             methods. A file could contain one or more molecules and only be read or written
71             whole; reading it would return every molecule on the file. This was problematic
72             when dealing with large multi-molecule files (such as SDF files), because all
73             the molecules would have to be loaded into memory at the same time.
74              
75             While version 0.30 retains backward compatibility with that simple model, it
76             also offers a more flexible interface that allows reading one molecule at a
77             time, skipping molecules, and reading and writing file-level information that
78             is not associated with specific molecules. The following diagram shows the
79             global structure of a file according to the new model:
80              
81             +-----------+
82             | header |
83             +-----------+
84             | molecule |
85             +-----------+
86             | molecule |
87             +-----------+
88             | ... |
89             +-----------+
90             | footer |
91             +-----------+
92              
93             In cases where the header and the footer are empty, the model reduces to the
94             pre-0.30 version. The low-level steps to read a file are the following:
95              
96             $file = Chemistry::File::MyFormat->new(file => 'xyz.mol');
97             $file->open('<');
98             $file->read_header;
99             while (my $mol = $file->read_mol($file->fh, %opts)) {
100             # do something with $mol...
101             }
102             $file->read_footer;
103              
104             The C<read> method does all the above automatically, and it stores all the
105             molecules read in the mols property.
106              
107             =head1 STANDARD OPTIONS
108              
109             All the methods below include a list of options %opts at the end of the
110             parameter list. Each class implementing this interface may have its own
111             particular options. However, the following options should be recognized by all
112             classes:
113              
114             =over
115              
116             =item mol_class
117              
118             A class or object with a C<new> method that constructs a molecule. This is
119             needed when the user wants to specify a molecule subclass different from the
120             default. When this option is not defined, the module may use Chemistry::Mol
121             or whichever class is appropriate for that file format.
122              
123             =item format
124              
125             The name of the file format being used, as registered by
126             Chemistry::Mol->register_format.
127              
128             =item fatal
129              
130             If true, parsing errors should throw an exception; if false, they should just
131             try to recover if possible. True by default.
132              
133             =back
134              
135             =head1 CLASS METHODS
136              
137             The class methods in this class (or rather, its derived classes) are usually
138             not called directly. Instead, use Chemistry::Mol->read, write, print, parse,
139             and file. These methods also work if called as instance methods.
140              
141             =over
142              
143              
144             =cut
145              
146 14     14   35030 use strict;
  14         30  
  14         636  
147 14     14   76 use warnings;
  14         29  
  14         455  
148 14     14   69 no warnings qw(uninitialized);
  14         33  
  14         665  
149 14     14   75 use Carp;
  14         25  
  14         2253  
150 14     14   28324 use FileHandle;
  14         312207  
  14         111  
151 14     14   6847 use base qw(Chemistry::Obj);
  14         36  
  14         17817  
152             # don't blame our problems in the Chemistry::Mol module ;-)
153             our @CARP_NOT = qw(Chemistry::Mol);
154              
155             # This subroutine implements the :auto functionality
156             sub import {
157 13     13   235 my $pack = shift;
158 13         41095 for my $param (@_){
159 0 0       0 if ($param eq ':auto') {
160 0         0 for my $pmfile (map {glob "$_/Chemistry/File/*.pm"} @INC) {
  0         0  
161 0         0 my ($pm) = $pmfile =~ m|(Chemistry/File/.*\.pm)$|;
162             #warn "requiring $pm\n";
163 0         0 eval { require $pm };
  0         0  
164 0 0       0 die "Error in Chemistry::File: '$@'; pmfile='$pmfile'; pm='$pm'\n" if $@;
165             }
166             } else {
167 0         0 eval "use ${pack}::$param";
168 0 0       0 die "$@" if $@;
169             }
170             }
171             }
172              
173             =item $class->parse_string($s, %options)
174              
175             Parse a string $s and return one or more molecule objects. This is an abstract
176             method, so it should be provided by all derived classes.
177              
178             =cut
179              
180             sub parse_string {
181 0     0 1 0 my ($self, $s, %opts) = @_;
182 0 0       0 if ($opts{_must_override}) {
183 0   0     0 my $class = ref $self || $self;
184 0         0 croak "parse_string() is not implemented for $class";
185             }
186 0         0 $self->new(file => \$s, opts => \%opts)->read;
187             }
188              
189              
190             =item $class->write_string($mol, %options)
191              
192             Convert a molecule to a string. This is an abstract method, so it should be
193             provided by all derived classes.
194              
195             =cut
196              
197             sub write_string {
198 3     3 1 10 my ($self, $mol, %opts) = @_;
199 3 50       11 if ($opts{_must_override}) {
200 0   0     0 my $class = ref $self || $self;
201 0         0 croak "write_string() is not implemented for $class";
202             }
203 3         6 my $s;
204 3         17 $self->new(file => \$s, mols => [$mol], opts => \%opts)->write;
205 3         13 $s;
206             }
207              
208             =item $class->parse_file($file, %options)
209              
210             Reads the file $file and returns one or more molecules. The default method
211             slurps the whole file and then calls parse_string, but derived classes may
212             choose to override it. $file can be a filehandle, a filename, or a scalar
213             reference. See C<new> for details.
214              
215             =cut
216              
217             sub parse_file {
218 11     11 1 42 my ($self, $file, %opts) = @_;
219 11         89 $self->new(file => $file, opts => \%opts)->read;
220             }
221              
222             =item $class->write_file($mol, $file, %options)
223              
224             Writes a file $file containing the molecule $mol. The default method calls
225             write_string first and then saves the string to a file, but derived classes
226             may choose to override it. $file can be either a filehandle or a filename.
227              
228             =cut
229              
230             sub write_file {
231 3     3 1 12 my ($self, $mol, $file, %opts) = @_;
232              
233 3         20 $self->new(file => $file, mols => [$mol], opts => \%opts)->write;
234             }
235              
236             =item $class->name_is($fname, %options)
237              
238             Returns true if a filename is of the format corresponding to the class.
239             It should look at the filename only, because it may be called with
240             non-existent files. It is used to determine with which format to save a file.
241             For example, the Chemistry::File::PDB returns true if the file ends in .pdb.
242              
243             =cut
244              
245             sub name_is {
246 0     0 1 0 0;
247             }
248              
249             =item $class->string_is($s, %options)
250              
251             Examines the string $s and returns true if it has the format of the class.
252              
253             =cut
254              
255             sub string_is {
256 0     0 1 0 0;
257             }
258              
259             =item $class->file_is($file, %options)
260              
261             Examines the file $file and returns true if it has the format of the class.
262             The default method slurps the whole file and then calls string_is, but derived
263             classes may choose to override it.
264              
265             =cut
266              
267             sub file_is {
268 9     9 1 28 my ($self, $file, %opts) = @_;
269            
270 9         18 my $s = eval {
271 9         62 $self->open('<');
272 0         0 $self->slurp;
273             };
274 9 50       80 if ($s) {
    50          
275 0         0 $self->string_is($s, %opts);
276             } elsif (! ref $file) {
277 9         66 $self->name_is($file, %opts);
278             }
279             }
280              
281             =item $class->slurp
282              
283             Reads a file into a scalar. Automatic decompression of gzipped files is
284             supported if the Compress::Zlib module is installed. Files ending in .gz are
285             assumed to be compressed; otherwise it is possible to force decompression by
286             passing the gzip => 1 option (or no decompression with gzip => 0).
287              
288             =cut
289              
290             # slurp a file into a scalar, with transparent decompression
291             sub slurp {
292 0     0 1 0 my ($self) = @_;
293              
294 0         0 my $fh = $self->fh;
295 0         0 local $/;
296 0         0 <$fh>;
297             }
298              
299             =item $class->new(file => $file, opts => \%opts)
300              
301             Create a new file object. This method is usually called indirectly via
302             the Chemistry::Mol->file method. $file may be a scalar with a filename, an
303             open filehandle, or a reference to a scalar. If a reference to a scalar is
304             used, the string contained in the scalar is used as an in-memory file.
305              
306             =cut
307              
308             sub new {
309 22     22 1 229 my $self = shift->SUPER::new(@_);
310 22 50       229 $self->{opts}{fatal} = 1 unless exists $self->{opts}{fatal};
311 22         169 $self;
312             }
313              
314             Chemistry::Obj::accessor(qw(file fh opts mols mode));
315              
316             =back
317              
318             =head1 INSTANCE METHODS
319              
320             =head2 Accessors
321              
322             Chemistry::File objects are derived from Chemistry::Obj and have the same
323             properties (name, id, and type), as well as the following ones:
324              
325             =over
326              
327             =item file
328              
329             The "file" as described above under C<new>.
330              
331             =item fh
332              
333             The filehandle used for reading and writing molecules. It is opened by C<open>.
334              
335             =item opts
336              
337             A hashref containing the options that are passed through to the old-style class
338             methods. They are also passed to the instance method to keep a similar
339             interface, but they could access them via $self->opts anyway.
340              
341             =item mode
342              
343             '>' if the file is open for writing, '<' for reading, and false if not open.
344              
345             =item mols
346              
347             C<read> stores all the molecules that were read in this property as an array
348             reference. C<write> gets the molecules to write from here.
349              
350             =back
351              
352             =head2 Abstract methods
353              
354             These methods should be overridden, because they don't really do much by
355             default.
356              
357             =over
358              
359             =item $file->read_header
360              
361             Read whatever information is available in the file before the first molecule.
362             Does nothing by default.
363              
364             =cut
365              
366 14     14 1 55 sub read_header { }
367              
368             =item $file->read_footer
369              
370             Read whatever information is available in the file after the last molecule.
371             Does nothing by default.
372              
373             =cut
374              
375 14     14 1 26 sub read_footer { }
376              
377             =item $self->slurp_mol($fh)
378              
379             Reads from the input string until the end of the current molecule and returns
380             the "slurped" string. It does not parse the string. It returns undefined if
381             there are no more molecules in the file. This method should be overridden if
382             needed; by default, it slurps until the end of the file.
383              
384             =cut
385              
386             sub slurp_mol {
387 0     0 1 0 my ($self, $fh) = @_;
388 0         0 local $/; <$fh>;
  0         0  
389             }
390              
391             =item $self->skip_mol($fh)
392              
393             Similar to slurp_mol, but it doesn't need to return anything except true or
394             false. It should also be overridden if needed; by default, it just calls
395             slurp_mol.
396              
397             =cut
398              
399 0     0 1 0 sub skip_mol { shift->slurp_mol(@_) }
400              
401             =item $file->read_mol($fh, %opts)
402              
403             Read the next molecule in the input stream. It returns false if there are no
404             more molecules in the file. This method should be overridden by derived
405             classes; otherwise it will call slurp_mol and parse_string (for backwards
406             compatibility; it is recommended to override read_mol directly in new modules).
407              
408             Note: some old file I/O modules (written before the 0.30 interface) may return
409             more than one molecule anyway, so it is recommended to call read_mol in list
410             context to be safe:
411              
412             ($mol) = $file->read_mol($fh, %opts);
413              
414             =cut
415              
416             sub read_mol {
417 4     4 1 11 my ($self, $fh, %opts) = @_;
418 4         12 my $s = $self->slurp_mol($fh);
419 4 100 66     43 return unless defined $s and length $s;
420 3         12 $self->parse_string($s, %opts, _must_override => 1);
421             }
422             =item $file->write_header
423              
424             Write whatever information is needed before the first molecule.
425             Does nothing by default.
426              
427             =cut
428              
429 6     6 0 13 sub write_header { }
430              
431             =item $file->write_footer
432              
433             Write whatever information is needed after the last molecule.
434             Does nothing by default.
435              
436             =cut
437              
438 6     6 1 10 sub write_footer { }
439              
440             =item $self->write_mol($fh, $mol, %opts)
441              
442             Write one molecule to $fh. By default and for backward compatibility, it just
443             calls C<write_string> and prints its return value to $self->fh. New classes
444             should override it.
445              
446             =cut
447              
448             sub write_mol {
449 0     0 1 0 my ($self, $fh, $mol, %opts) = @_;
450 0         0 print $fh $self->write_string($mol, %opts, _must_override => 1);
451             }
452              
453             ########################## OTHER ##################################
454              
455             =back
456              
457             =head2 Other methods
458              
459             =over
460              
461             =item $self->open($mode)
462              
463             Opens the file (held in $self->file) for reading by default, or for writing if
464             $mode eq '>'. This method sets $self->fh transparently regardless of whether
465             $self->file is a filename (compressed or not), a scalar reference, or a
466             filehandle.
467              
468             =cut
469              
470             sub open {
471 30     30 1 55 my ($self, $mode) = @_;
472 30         45 my $fh;
473             my $s;
474 30   50     96 $mode ||= '<';
475 30         161 $self->mode($mode);
476 30         153 my $file = $self->file;
477 30 100       2910 croak "Chemistry::File::open: no file supplied" unless defined $file;
478 21 100 66     301 if (ref $file eq 'SCALAR') {
    50 66        
    100          
479 4 50       19 croak "decompression only supported for files" if $self->{opts}{gzip};
480 4 50       20 if ($] >= 5.008) {
481 4     2   114 open $fh, $mode, $file;
  2         23  
  2         5  
  2         16  
482             } else {
483 0         0 require IO::String;
484 0         0 $fh = IO::String->new($$file);
485             }
486             } elsif (ref $file) {
487 0 0       0 croak "decompression only supported for files" if $self->{opts}{gzip};
488 0         0 $fh = $file;
489             } elsif ($self->{opts}{gzip}
490             or !defined $self->{opts}{gzip} and $file =~ /.gz$/)
491             {
492 4 50       9 eval { require Compress::Zlib } # Carp
  4         57  
493             or croak "Compress::Zlib not installed!";
494 4         1606 require File::Temp;
495              
496 4         15388 $fh = File::Temp::tempfile();
497 4   100     3541 $self->{opts}{gzip} ||= 1;
498 4 100       18 unless ($mode eq '>') {
499 2 50       12 my $gz = Compress::Zlib::gzopen($file, "rb")
500             or croak "Cannot open compressed $file: "
501             . "$Compress::Zlib::gzerrno\n";
502              
503 2         4883 my $buffer;
504 2         12 print $fh $buffer while $gz->gzread($buffer) > 0;
505            
506 2 50       2315 if ($Compress::Zlib::gzerrno != Compress::Zlib::Z_STREAM_END()) {
507 0         0 croak "Error reading from $file: $Compress::Zlib::gzerrno"
508             . ($Compress::Zlib::gzerrno+0) . "\n";
509             }
510 2         20 $gz->gzclose();
511 2         348 seek $fh, 0, 0;
512             }
513             } else {
514 13 50       163 $fh = FileHandle->new("$mode$file")
515             or croak "Could not open file $file: $!";
516             }
517 21         5161 $self->fh($fh);
518 21         43 $self;
519             }
520              
521             =item $self->close
522              
523             Close the file. For regular files this just closes the filehandle, but for
524             gzipped files it does some additional postprocessing. This method is called
525             automatically on object destruction, so it is not mandatory to call it
526             explicitly.
527              
528             =cut
529              
530             sub close {
531 43     43 1 75 my ($self) = @_;
532 43         156 my $fh = $self->fh;
533 43 100 100     261 if ($fh and $self->mode eq '>' and $self->{opts}{gzip}) {
      100        
534 2   50     8 my $level = $self->{opts}{gzip} || 6;
535 2 50       7 $level = 6 if $level == 1;
536 2         9 my $file = $self->file;
537 2 50       8 if (ref $file) {
538 0         0 croak "compression only supported for files";
539             } else {
540 2         109 seek $fh, 0, 0;
541 2 50       16 my $gz = Compress::Zlib::gzopen($file, "wb$level")
542             or croak "Cannot open $file $Compress::Zlib::gzerrno\n";
543 2         4682 local $_;
544 2         57 while (<$fh>) {
545 508 50       45583 $gz->gzwrite($_)
546             or croak "error writing: $Compress::Zlib::gzerrno\n";
547             }
548 2         209 $gz->gzclose;
549             }
550             }
551 43 100       1456 if ($self->mode) {
552 21 50       75 if ($fh) { $fh->close or croak "$!" };
  21 50       126  
553 21         903 $self->mode('');
554             }
555             }
556              
557 22     22   1231 sub DESTROY { shift->close }
558              
559             =item $file->read
560              
561             Read the whole file. This calls open, read_header, read_mol until there are no
562             more molecules left, read_footer, and close. Returns a list of molecules if
563             called in list context, or the first molecule in scalar context.
564              
565             =cut
566              
567             sub read {
568 15     15 1 889 my ($self) = @_;
569 15         67 $self->open('<');
570 15         86 $self->read_header;
571 15         23 my @all_mols;
572 15         110 $self->mols(\@all_mols);
573 15         58 while (my @mols = $self->read_mol($self->fh, %{$self->{opts}})) {
  32         215  
574 17         137 push @all_mols, @mols;
575             }
576 15         230 $self->read_footer;
577 15         92 $self->close;
578 15 100       139 wantarray ? @all_mols : $all_mols[0];
579             }
580              
581             =item $self->write
582              
583             Write all the molecules in $self->mols. It just calls open, write_header,
584             write_mol (per each molecule), write_footer, and close.
585              
586             =cut
587              
588             sub write {
589 6     6 1 14 my ($self) = @_;
590 6         25 $self->open('>');
591 6         31 $self->write_header;
592 6         7 for my $mol (@{$self->mols}) {
  6         21  
593 6         21 $self->write_mol($self->fh, $mol, %{$self->{opts}});
  6         46  
594             }
595 6         7367 $self->write_footer;
596 6         21 $self->close;
597             }
598              
599             1;
600              
601             =back
602              
603             =head1 CAVEATS
604              
605             The :auto feature may not be entirely portable, but it is known to work under
606             Unix and Windows (either Cygwin or ActiveState).
607              
608             =head1 VERSION
609              
610             0.37
611              
612             =head1 SEE ALSO
613              
614             L<Chemistry::Mol>
615              
616             The PerlMol website L<http://www.perlmol.org/>
617              
618             =head1 AUTHOR
619              
620             Ivan Tubert-Brohman
621              
622             =head1 COPYRIGHT
623              
624             Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
625             free software; you can redistribute it and/or modify it under the same terms as
626             Perl itself.
627              
628             =cut
629