File Coverage

blib/lib/Chemistry/File.pm
Criterion Covered Total %
statement 107 143 74.8
branch 33 60 55.0
condition 16 27 59.2
subroutine 23 30 76.6
pod 20 21 95.2
total 199 281 70.8


line stmt bran cond sub pod time code
1             package Chemistry::File;
2              
3             our $VERSION = '0.38'; # VERSION
4              
5             =head1 NAME
6              
7             Chemistry::File - Molecule file I/O base class
8              
9             =head1 SYNOPSIS
10              
11             # As a convenient interface for several mol readers:
12             use Chemistry::File qw(PDB MDLMol); # load PDB and MDL modules
13            
14             # or try to use every file I/O module installed in the system:
15             use Chemistry::File ':auto';
16              
17             my $mol1 = Chemistry::Mol->read("file.pdb");
18             my $mol2 = Chemistry::Mol->read("file.mol");
19              
20              
21             # as a base for a mol reader:
22              
23             package Chemistry::File::Myfile;
24             use base qw(Chemistry::File);
25             use Chemistry::Mol;
26             Chemistry::Mol->register_format("myfile", __PACKAGE__);
27              
28             # override the read_mol method
29             sub read_mol {
30             my ($self, $fh, %opts) = @_;
31             my $mol_class = $opts{mol_class} || "Chemistry::Mol";
32             my $mol = $mol_class->new;
33             # ... do some stuff with $fh and $mol ...
34             return $mol;
35             }
36              
37             # override the write_mol method
38             sub write_mol {
39             my ($self, $fh, $mol, %opts) = @_;
40             print $fh $mol->name, "\n";
41             # ... do some stuff with $fh and $mol ...
42             }
43              
44             =head1 DESCRIPTION
45              
46             The main use of this module is as a base class for other molecule file I/O
47             modules (for example, Chemistry::File::PDB). Such modules should override and
48             extend the Chemistry::File methods as needed. You only need to care about the
49             methods here if you are writing a file I/O module or if you want a finer
50             degree of control than what is offered by the simple read and write methods
51             in the Chemistry::Mol class.
52              
53             From the user's point of view, this module can also be used as shorthand
54             for using several Chemistry::File modules at the same time.
55              
56             use Chemistry::File qw(PDB MDLMol);
57              
58             is exactly equivalent to
59              
60             use Chemistry::File::PDB;
61             use Chemistry::File::MDLMol;
62              
63             If you use the :auto keyword, Chemistry::File will autodetect and load
64             all the Chemistry::File::* modules installed in your system.
65              
66             use Chemistry::File ':auto';
67              
68             =head1 FILE I/O MODEL
69              
70             Before version 0.30, file I/O modules typically used only parse_string,
71             write_string, parse_file, and write_file, and they were generally used as class
72             methods. A file could contain one or more molecules and only be read or written
73             whole; reading it would return every molecule on the file. This was problematic
74             when dealing with large multi-molecule files (such as SDF files), because all
75             the molecules would have to be loaded into memory at the same time.
76              
77             While version 0.30 retains backward compatibility with that simple model, it
78             also allows a more flexible interface that allows reading one molecule at a
79             time, skipping molecules, and reading and writing file-level information that
80             is not associated with specific molecules. The following diagram shows the
81             global structure of a file according to the new model:
82              
83             +-----------+
84             | header |
85             +-----------+
86             | molecule |
87             +-----------+
88             | molecule |
89             +-----------+
90             | ... |
91             +-----------+
92             | footer |
93             +-----------+
94              
95             In cases where the header and the footer are empty, the model reduces to the
96             pre-0.30 version. The low-level steps to read a file are the following:
97              
98             $file = Chemistry::File::MyFormat->new(file => 'xyz.mol');
99             $file->open('<');
100             $file->read_header;
101             while (my $mol = $file->read_mol($file->fh, %opts)) {
102             # do something with $mol...
103             }
104             $file->read_footer;
105              
106             The C<read> method does all the above automatically, and it stores all the
107             molecules read in the mols property.
108              
109             =head1 STANDARD OPTIONS
110              
111             All the methods below include a list of options %opts at the end of the
112             parameter list. Each class implementing this interface may have its own
113             particular options. However, the following options should be recognized by all
114             classes:
115              
116             =over
117              
118             =item mol_class
119              
120             A class or object with a C<new> method that constructs a molecule. This is
121             needed when the user wants to specify a molecule subclass different from the
122             default. When this option is not defined, the module may use Chemistry::Mol
123             or whichever class is appropriate for that file format.
124              
125             =item format
126              
127             The name of the file format being used, as registered by
128             Chemistry::Mol->register_format.
129              
130             =item fatal
131              
132             If true, parsing errors should throw an exception; if false, they should just
133             try to recover if possible. True by default.
134              
135             =back
136              
137             =head1 CLASS METHODS
138              
139             The class methods in this class (or rather, its derived classes) are usually
140             not called directly. Instead, use Chemistry::Mol->read, write, print, parse,
141             and file. These methods also work if called as instance methods.
142              
143             =over
144              
145              
146             =cut
147              
148 13     13   57821 use strict;
  13         31  
  13         339  
149 13     13   59 use warnings;
  13         23  
  13         323  
150 13     13   55 no warnings qw(uninitialized);
  13         21  
  13         424  
151 13     13   76 use Carp;
  13         24  
  13         894  
152 13     13   6131 use FileHandle;
  13         122226  
  13         79  
153 13     13   3944 use base qw(Chemistry::Obj);
  13         28  
  13         12489  
154             # don't blame our problems in the Chemistry::Mol module ;-)
155             our @CARP_NOT = qw(Chemistry::Mol);
156              
# Implements "use Chemistry::File LIST". Every argument other than ':auto'
# names a module below the invoking package and is loaded with "use".
# ':auto' instead requires every Chemistry/File/*.pm found under the
# directories in @INC. Any load failure is fatal.
sub import {
    my ($pack, @params) = @_;
    for my $param (@params) {
        if ($param ne ':auto') {
            eval "use ${pack}::$param";
            die "$@" if $@;
            next;
        }

        my @pmfiles = map { glob "$_/Chemistry/File/*.pm" } @INC;
        for my $pmfile (@pmfiles) {
            # Reduce the full path to the @INC-relative form require() wants.
            my ($pm) = $pmfile =~ m|(Chemistry/File/.*\.pm)$|;
            eval { require $pm };
            die "Error in Chemistry::File: '$@'; pmfile='$pmfile'; pm='$pm'\n" if $@;
        }
    }
}
174              
175             =item $class->parse_string($s, %options)
176              
177             Parse a string $s and return one or more molecule objects. This is an abstract
178             method, so it should be provided by all derived classes.
179              
180             =cut
181              
# Parse the string $s by wrapping it in a new file object (as an in-memory
# file) and calling read() on it; returns whatever read() returns.
# When the _must_override option is set, croaks instead: read_mol() passes
# that flag to detect classes that override neither read_mol nor
# parse_string.
sub parse_string {
    my $self = shift;
    my $s    = shift;
    my %opts = @_;

    if ($opts{_must_override}) {
        my $class = ref($self) || $self;
        croak "parse_string() is not implemented for $class";
    }

    return $self->new(file => \$s, opts => \%opts)->read;
}
190              
191              
192             =item $class->write_string($mol, %options)
193              
194             Convert a molecule to a string. This is an abstract method, so it should be
195             provided by all derived classes.
196              
197             =cut
198              
# Serialize $mol to a string: a new file object is pointed at an in-memory
# buffer, write() is called on it, and the buffer is returned.
# When the _must_override option is set, croaks instead: write_mol() passes
# that flag to detect classes that override neither write_mol nor
# write_string.
sub write_string {
    my $self = shift;
    my $mol  = shift;
    my %opts = @_;

    if ($opts{_must_override}) {
        my $class = ref($self) || $self;
        croak "write_string() is not implemented for $class";
    }

    my $buffer;
    $self->new(file => \$buffer, mols => [$mol], opts => \%opts)->write;
    return $buffer;
}
209              
210             =item $class->parse_file($file, %options)
211              
212             Reads the file $file and returns one or more molecules. The default method
213             slurps the whole file and then calls parse_string, but derived classes may
214             choose to override it. $file can be a filehandle, a filename, or a scalar
215             reference. See C<new> for details.
216              
217             =cut
218              
# Read $file by constructing a file object for it and calling read() on
# that object; returns whatever read() returns.
sub parse_file {
    my $self = shift;
    my $file = shift;
    my %opts = @_;
    return $self->new(file => $file, opts => \%opts)->read;
}
223              
224             =item $class->write_file($mol, $file, %options)
225              
226             Writes a file $file containing the molecule $mol. The default method calls
227             write_string first and then saves the string to a file, but derived classes
228             may choose to override it. $file can be either a filehandle or a filename.
229              
230             =cut
231              
# Write the single molecule $mol to $file by constructing a file object
# holding it and calling write() on that object.
sub write_file {
    my $self = shift;
    my $mol  = shift;
    my $file = shift;
    my %opts = @_;
    return $self->new(file => $file, mols => [$mol], opts => \%opts)->write;
}
237              
238             =item $class->name_is($fname, %options)
239              
240             Returns true if a filename is of the format corresponding to the class.
241             It should look at the filename only, because it may be called with
242             non-existent files. It is used to determine with which format to save a file.
243             For example, the Chemistry::File::PDB returns true if the file ends in .pdb.
244              
245             =cut
246              
# Base-class default: no filename is recognized, so always return 0.
sub name_is {
    return 0;
}
250              
251             =item $class->string_is($s, %options)
252              
253             Examines the string $s and returns true if it has the format of the class.
254              
255             =cut
256              
# Base-class default: no string is recognized, so always return 0.
sub string_is {
    return 0;
}
260              
261             =item $class->file_is($file, %options)
262              
263             Examines the file $file and returns true if it has the format of the class.
264             The default method slurps the whole file and then calls string_is, but derived
265             classes may choose to override it.
266              
267             =cut
268              
# Decide whether $file is in this class's format.
# First tries to open and slurp through the invocant; if that yields a true
# string the verdict is string_is() on it. Otherwise, when $file is not a
# reference, the verdict is name_is() on the filename. For a reference that
# could not be read, a false value is returned.
sub file_is {
    my ($self, $file, %opts) = @_;

    # Any exception raised while opening or reading is swallowed here and
    # just leaves $content undefined.
    # NOTE(review): open() and slurp() are called on the invocant and never
    # see $file; the coverage counts in this listing show the slurp statement
    # was never executed, so in the tested paths only name_is() decides.
    # Confirm whether content sniffing is meant to work for class-method calls.
    my $content = eval {
        $self->open('<');
        $self->slurp;
    };

    return $self->string_is($content, %opts) if $content;
    return !ref($file) && $self->name_is($file, %opts);
}
282              
283             =item $class->slurp
284              
285             Reads a file into a scalar. Automatic decompression of gzipped files is
286             supported if the Compress::Zlib module is installed. Files ending in .gz are
287             assumed to be compressed; otherwise it is possible to force decompression by
288             passing the gzip => 1 option (or no decompression with gzip => 0).
289              
290             =cut
291              
# Read everything remaining on the object's filehandle ($self->fh) in one go.
# The input record separator is undefined locally so a single read returns
# the rest of the stream.
sub slurp {
    my $self   = shift;
    my $handle = $self->fh;
    local $/ = undef;
    return readline $handle;
}
300              
301             =item $class->new(file => $file, opts => \%opts)
302              
303             Create a new file object. This method is usually called indirectly via
304             the Chemistry::Mol->file method. $file may be a scalar with a filename, an
305             open filehandle, or a reference to a scalar. If a reference to a scalar is
306             used, the string contained in the scalar is used as an in-memory file.
307              
308             =cut
309              
# Construct the object through the parent class, then make sure the
# 'fatal' option is present, defaulting it to 1 when the caller did not
# supply it (an explicit false value is left untouched).
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(@_);
    if (!exists $self->{opts}{fatal}) {
        $self->{opts}{fatal} = 1;
    }
    return $self;
}
315              
# The methods file, fh, opts, mols and mode used throughout this class are
# not defined in this file; presumably this call asks Chemistry::Obj to
# install same-named getter/setter methods -- TODO confirm in Chemistry::Obj.
316             Chemistry::Obj::accessor(qw(file fh opts mols mode));
317              
318             =back
319              
320             =head1 INSTANCE METHODS
321              
322             =head2 Accessors
323              
324             Chemistry::File objects are derived from Chemistry::Obj and have the same
325             properties (name, id, and type), as well as the following ones:
326              
327             =over
328              
329             =item file
330              
331             The "file" as described above under C<new>.
332              
333             =item fh
334              
335             The filehandle used for reading and writing molecules. It is opened by C<open>.
336              
337             =item opts
338              
339             A hashref containing the options that are passed through to the old-style class
340             methods. They are also passed to the instance method to keep a similar
341             interface, but they could access them via $self->opts anyway.
342              
343             =item mode
344              
345             '>' if the file is open for writing, '<' for reading, and false if not open.
346              
347             =item mols
348              
349             C<read> stores all the molecules that were read in this property as an array
350             reference. C<write> gets the molecules to write from here.
351              
352             =back
353              
354             =head2 Abstract methods
355              
356             These methods should be overridden, because they don't really do much by
357             default.
358              
359             =over
360              
361             =item $file->read_header
362              
363             Read whatever information is available in the file before the first molecule.
364             Does nothing by default.
365              
366             =cut
367              
# Hook called by read() before the first molecule; does nothing here.
sub read_header {
    return;
}
369              
370             =item $file->read_footer
371              
372             Read whatever information is available in the file after the last molecule.
373             Does nothing by default.
374              
375             =cut
376              
# Hook called by read() after the last molecule; does nothing here.
sub read_footer {
    return;
}
378              
379             =item $self->slurp_mol($fh)
380              
381             Reads from the input string until the end of the current molecule and returns
382             the "slurped" string. It does not parse the string. It returns undefined if
383             there are no more molecules in the file. This method should be overridden if
384             needed; by default, it slurps until the end of the file.
385              
386             =cut
387              
# Default "one molecule" reader: returns everything left on $fh, because the
# base class cannot know where a molecule ends. The invocant is not used.
sub slurp_mol {
    my (undef, $fh) = @_;
    local $/ = undef;
    return readline $fh;
}
392              
393             =item $self->skip_mol($fh)
394              
395             Similar to slurp_mol, but it doesn't need to return anything except true or
396             false. It should also be overridden if needed; by default, it just calls
397             slurp_mol.
398              
399             =cut
400              
# Default skip: delegate to slurp_mol with the same arguments and pass its
# result through.
sub skip_mol {
    my $self = shift;
    return $self->slurp_mol(@_);
}
402              
403             =item $file->read_mol($fh, %opts)
404              
405             Read the next molecule in the input stream. It returns false if there are no
406             more molecules in the file. This method should be overridden by derived
407             classes; otherwise it will call slurp_mol and parse_string (for backwards
408             compatibility; it is recommended to override read_mol directly in new modules).
409              
410             Note: some old file I/O modules (written before the 0.30 interface) may return
411             more than one molecule anyway, so it is recommended to call read_mol in list
412             context to be safe:
413              
414             ($mol) = $file->read_mol($fh, %opts);
415              
416             =cut
417              
# Default implementation for classes written against the pre-0.30 interface:
# obtain the next chunk of text via slurp_mol($fh) and hand it to
# parse_string().
#
# Returns nothing (empty list / undef) when slurp_mol yields undef or an
# empty string; otherwise returns whatever parse_string returns. The
# _must_override flag makes the base-class parse_string croak, so a class
# that overrides neither method fails loudly.
sub read_mol {
    my ($self, $fh, %opts) = @_;
    my $text = $self->slurp_mol($fh);
    return if !defined $text or !length $text;
    return $self->parse_string($text, %opts, _must_override => 1);
}

=item $file->write_header
425              
426             Write whatever information is needed before the first molecule.
427             Does nothing by default.
428              
429             =cut
430              
# Hook called by write() before the first molecule; does nothing here.
sub write_header {
    return;
}
432              
433             =item $file->write_footer
434              
435             Write whatever information is needed after the last molecule.
436             Does nothing by default.
437              
438             =cut
439              
# Hook called by write() after the last molecule; does nothing here.
sub write_footer {
    return;
}
441              
442             =item $self->write_mol($fh, $mol, %opts)
443              
444             Write one molecule to $fh. By default and for backward compatibility, it just
445             calls C<write_string> and prints its return value to $self->fh. New classes
446             should override it.
447              
448             =cut
449              
# Default implementation for classes written against the pre-0.30 interface:
# print whatever write_string() returns for $mol to $fh. The _must_override
# flag makes the base-class write_string croak, so a class that overrides
# neither method fails loudly. Returns the result of print.
sub write_mol {
    my ($self, $fh, $mol, %opts) = @_;
    my @output = $self->write_string($mol, %opts, _must_override => 1);
    return print {$fh} @output;
}
454              
455             ########################## OTHER ##################################
456              
457             =back
458              
459             =head2 Other methods
460              
461             =over
462              
463             =item $self->open($mode)
464              
465             Opens the file (held in $self->file) for reading by default, or for writing if
466             $mode eq '>'. This method sets $self->fh transparently regardless of whether
467             $self->file is a filename (compressed or not), a scalar reference, or a
468             filehandle.
469              
470             =cut
471              
# Open $self->file in mode $mode ('<' when omitted or false), store the
# resulting handle through $self->fh and return $self. The mode is recorded
# through $self->mode before anything else is attempted.
#
# The kind of "file" decides how the handle is obtained:
#   * reference to a scalar - opened as an in-memory file (IO::String on
#     perls older than 5.8);
#   * any other reference   - used as the filehandle as-is;
#   * gzipped filename      - a temporary file stands in for the real one;
#     for reading it is filled with the decompressed data and rewound, for
#     writing it stays empty and close() compresses it into place;
#   * plain filename        - opened through FileHandle.
#
# A filename counts as gzipped when the gzip option is true, or when the
# option is undefined and the name ends in ".gz".
#
# Croaks when no file is set, when gzip is requested for a reference, when
# Compress::Zlib is missing, or when any open/decompression step fails.
sub open {
    my ($self, $mode) = @_;
    my $fh;
    $mode ||= '<';
    $self->mode($mode);
    my $file = $self->file;
    croak "Chemistry::File::open: no file supplied" unless defined $file;
    if (ref $file eq 'SCALAR') {
        croak "decompression only supported for files" if $self->{opts}{gzip};
        if ($] >= 5.008) {
            # CORE:: makes it explicit that this is the builtin, not this
            # method; a failed in-memory open is reported immediately.
            CORE::open($fh, $mode, $file)
                or croak "Could not open in-memory file: $!";
        } else {
            require IO::String;
            $fh = IO::String->new($$file);
        }
    } elsif (ref $file) {
        croak "decompression only supported for files" if $self->{opts}{gzip};
        $fh = $file;
    } elsif ($self->{opts}{gzip}
        or !defined $self->{opts}{gzip} and $file =~ /\.gz$/)
    {
        eval { require Compress::Zlib } # Carp
            or croak "Compress::Zlib not installed!";
        require File::Temp;

        $fh = File::Temp::tempfile();
        # Remember that compression is in effect so close() recompresses
        # the temporary file when writing.
        $self->{opts}{gzip} ||= 1;
        unless ($mode eq '>') {
            my $gz = Compress::Zlib::gzopen($file, "rb")
                or croak "Cannot open compressed $file: "
                    . "$Compress::Zlib::gzerrno\n";

            my $buffer;
            print $fh $buffer while $gz->gzread($buffer) > 0;

            if ($Compress::Zlib::gzerrno != Compress::Zlib::Z_STREAM_END()) {
                croak "Error reading from $file: $Compress::Zlib::gzerrno ("
                    . ($Compress::Zlib::gzerrno+0) . ")\n";
            }
            $gz->gzclose();
            seek $fh, 0, 0;
        }
    } else {
        # NOTE(review): mode and name are joined into a two-argument style
        # open string; kept as-is because callers may rely on its special
        # filename handling.
        $fh = FileHandle->new("$mode$file")
            or croak "Could not open file $file: $!";
    }
    $self->fh($fh);
    $self;
}
522              
523             =item $self->close
524              
525             Close the file. For regular files this just closes the filehandle, but for
526             gzipped files it does some additional postprocessing. This method is called
527             automatically on object destruction, so it is not mandatory to call it
528             explicitly.
529              
530             =cut
531              
# Finish with the file. When the object was opened for writing with gzip in
# effect, the handle (a temporary file set up by open) is rewound and its
# contents are compressed line by line into the real file first. Then, if a
# mode is still recorded, the handle is closed and the mode is cleared.
# Calling this on an object whose mode is already false does nothing.
sub close {
    my ($self) = @_;
    my $handle = $self->fh;

    my $must_compress = $handle && $self->mode eq '>' && $self->{opts}{gzip};
    if ($must_compress) {
        # A gzip option of exactly 1 means "on"; use level 6 for it. Any
        # other true value is taken as the compression level itself.
        my $level = $self->{opts}{gzip} || 6;
        $level = 6 if $level == 1;

        my $target = $self->file;
        croak "compression only supported for files" if ref $target;

        seek $handle, 0, 0;
        my $gz = Compress::Zlib::gzopen($target, "wb$level")
            or croak "Cannot open $target $Compress::Zlib::gzerrno\n";
        while (defined(my $line = <$handle>)) {
            $gz->gzwrite($line)
                or croak "error writing: $Compress::Zlib::gzerrno\n";
        }
        $gz->gzclose;
    }

    if ($self->mode) {
        if ($handle) {
            $handle->close or croak "$!";
        }
        $self->mode('');
    }
}
558              
# Destructor: make sure the file is closed when the object goes away.
sub DESTROY {
    my $self = shift;
    return $self->close;
}
560              
561             =item $file->read
562              
563             Read the whole file. This calls open, read_header, read_mol until there are no
564             more molecules left, read_footer, and close. Returns a list of molecules if
565             called in list context, or the first molecule in scalar context.
566              
567             =cut
568              
# Read the whole file: open it for reading, call read_header, call read_mol
# repeatedly until it returns an empty list, then call read_footer and
# close. Every molecule returned is appended to the array registered through
# $self->mols. Returns all molecules in list context, otherwise the first.
sub read {
    my $self = shift;
    $self->open('<');
    $self->read_header;

    my @collected;
    $self->mols(\@collected);
    while (1) {
        # read_mol may return several molecules at once, so always call it
        # in list context.
        my @batch = $self->read_mol($self->fh, %{ $self->{opts} });
        last unless @batch;
        push @collected, @batch;
    }

    $self->read_footer;
    $self->close;
    return @collected if wantarray;
    return $collected[0];
}
582              
583             =item $self->write
584              
585             Write all the molecules in $self->mols. It just calls open, write_header,
586             write_mol (per each molecule), write_footer, and close.
587              
588             =cut
589              
# Write every molecule held in $self->mols: open the file for writing, call
# write_header, call write_mol once per molecule, call write_footer, then
# close. The return value is that of close.
sub write {
    my $self = shift;
    $self->open('>');
    $self->write_header;
    foreach my $mol (@{ $self->mols }) {
        my $fh = $self->fh;
        $self->write_mol($fh, $mol, %{ $self->{opts} });
    }
    $self->write_footer;
    return $self->close;
}
600              
601             1;
602              
603             =back
604              
605             =head1 CAVEATS
606              
607             The :auto feature may not be entirely portable, but it is known to work under
608             Unix and Windows (either Cygwin or ActiveState).
609              
610             =head1 SOURCE CODE REPOSITORY
611              
612             L
613              
614             =head1 SEE ALSO
615              
616             L
617              
618             =head1 AUTHOR
619              
620             Ivan Tubert-Brohman
621              
622             =head1 COPYRIGHT
623              
624             Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
625             free software; you can redistribute it and/or modify it under the same terms as
626             Perl itself.
627              
628             =cut
629