File Coverage

blib/lib/Treex/Block/Write/BaseWriter.pm
Criterion Covered Total %
statement 54 99 54.5
branch 13 42 30.9
condition 8 27 29.6
subroutine 13 18 72.2
pod 0 1 0.0
total 88 187 47.0


line stmt bran cond sub pod time code
1             package Treex::Block::Write::BaseWriter;
2             $Treex::Block::Write::BaseWriter::VERSION = '2.20210102';
3 1     1   764 use Moose;
  1         5  
  1         7  
4 1     1   5092 use Treex::Core::Common;
  1         3  
  1         8  
5 1     1   5893 use autodie; # die if the output file cannot be opened
  1         3  
  1         7  
6 1     1   5450 use File::Path;
  1         5  
  1         81  
7 1     1   10 use File::Basename;
  1         3  
  1         65  
8 1     1   7 use IO::Handle;
  1         13  
  1         2089  
9              
10              
11             extends 'Treex::Core::Block';
12              
13             has extension => (
14             isa => 'Str',
15             is => 'ro',
16             documentation => 'Default extension for the output file type.',
17             default => ''
18             );
19              
20             has compress => (
21             is => 'rw',
22             isa => 'Bool',
23             documentation => 'Compression to .gz. Defaults to document->compress, or 0.'
24             );
25              
26             has [qw(file_stem path)] => (
27             isa => 'Str',
28             is => 'rw',
29             documentation => 'These provide a possibility of creating output file names from input file names.',
30             );
31              
32             has stem_suffix => (
33             isa => 'Str',
34             is => 'ro',
35             documentation => 'A suffix to append after file_stem.',
36             );
37              
38             has to => (
39             isa => 'Str',
40             is => 'rw',
41             documentation => 'The destination filename ("-" means the standard output; '
42             . 'use "." for the filename inherited from upstream blocks).',
43             );
44              
45             # Experimental feature. TODO: reconsider the design and add tests
46             has substitute => (
47             isa => 'Str',
48             is => 'ro',
49             documentation => 'A file loaded from dir1 can be saved to dir2 by substitute={dir1}{dir2}. '
50             . 'You can use regex substituions, e.g. substitute={dir(\d+)/file(\d+).treex}{f\1-\2.streex}i',
51             );
52              
53              
54             has _filenames => (
55             isa => 'ArrayRef[Str]',
56             is => 'rw',
57             builder => '_build_filenames',
58             writer => '_set_filenames',
59             lazy_build => 1,
60             documentation => 'Array of filenames where to save the documents if using multiple files;'
61             . ' automatically initialized from the attribute "to".',
62             );
63              
64             has _file_handle => (
65             isa => 'Maybe[FileHandle]',
66             is => 'rw',
67             writer => '_set_file_handle',
68             documentation => 'The open output file handle.',
69             );
70              
71             has _last_filename => (
72             isa => 'Str',
73             is => 'rw',
74             writer => '_set_last_filename',
75             documentation => 'Last output filename, to keep stream open if unchanged.',
76             );
77              
78             sub _build_filenames {
79 0     0   0 my $self = shift;
80 0 0       0 log_fatal "Parameter 'to' must be defined!" if !defined $self->to;
81 0         0 return [ split /[ ,]+/, $self->to ];
82             }
83              
84             sub BUILD {
85 1     1 0 14 my ($self) = @_;
86             # fail soon if running under treex -p
87 1 50 33     32 if ($self->scenario && $self->scenario->runner && $self->scenario->runner->jobindex) {
      33        
88             # and if the output file is specified
89 0 0 0     0 if (defined $self->to and $self->to ne '-' and $self->to ne '.') {
      0        
90 0         0 log_fatal "Merging writer output with treex -p into file(s) (except for stdout) "
91             . "is not supported. to=" . $self->to;
92             }
93             }
94 1         4 return;
95             }
96              
97             # Return 1 if the document should be compressed (and '.gz' added to its extension).
98             sub _compress_document {
99              
100 1     1   4 my ( $self, $document ) = @_;
101 1         3 my $compress = 0;
102              
103 1 50       55 if ( defined $self->compress ) {
    0          
104 1         46 $compress = $self->compress;
105             }
106             elsif ( defined $document->compress ) {
107 0         0 $compress = $document->compress;
108             }
109              
110 1         11 return $compress;
111             }
112              
113             # Return the correct extension for the given document, according to the default file extension
114             # for the format and the compression settings.
115             sub _document_extension {
116 1     1   5 my ( $self, $document ) = @_;
117 1 50       33 return $self->extension . ( $self->_compress_document($document) ? '.gz' : '' );
118             }
119              
120             # This just returns the next filename from the list given in the 'to' parameter.
121             sub _get_next_filename {
122              
123 0     0   0 my ($self) = @_;
124              
125 0         0 my ( $next_filename, @rest_filenames ) = @{ $self->_filenames };
  0         0  
126 0         0 $self->_set_filenames( \@rest_filenames );
127 0         0 return $next_filename;
128             }
129              
130              
131             # This returns the correct file name for the next document, taking the path, file_stem,
132             # stem_suffix and to parameters into account.
133             sub _get_filename {
134              
135 1     1   4 my ( $self, $document ) = @_;
136 1         4 my $filename = $document->full_filename . $self->_document_extension($document);
137              
138 1 50       40 if ( defined $self->path ) {
139 0         0 $document->set_path( $self->path );
140 0         0 $filename = $document->full_filename . $self->_document_extension($document);
141             }
142 1 50       56 if ( defined $self->file_stem ) {
143 0         0 $document->set_file_stem( $self->file_stem );
144 0         0 $filename = $document->full_filename . $self->_document_extension($document);
145             }
146 1 50       36 if ( defined $self->stem_suffix ) {
147 0 0       0 my $origstem = defined $self->file_stem
148             ? $self->file_stem : $document->file_stem;
149 0         0 $document->set_file_stem( $origstem . $self->stem_suffix );
150 0         0 $filename = $document->full_filename . $self->_document_extension($document);
151             }
152              
153 1 50 33     36 if ( defined( $self->to ) && ( $self->to ne '.' ) ) {
154              
155 1         11 my $next_filename = $self->_get_next_filename();
156              
157 1 50       5 if ( !defined $next_filename ) {
158 0         0 log_warn "There are more documents to save than filenames given ("
159             . $self->to . "). Falling back to the filename filled in by a DocumentReader ($filename).";
160             }
161             else {
162 1 50       30 $filename = ( defined $self->path ? $self->path : '' ) . $next_filename;
163             }
164             }
165              
166 1 50       31 if (defined $self->substitute){
167 0         0 my $eval_string = '$filename =~ s' . $self->substitute . ';1;';
168 0 0       0 eval $eval_string or log_fatal "Failed to eval $eval_string"; ## no critic qw(BuiltinFunctions::ProhibitStringyEval)
169             # An alternative implementation without stringy eval is commented below (see r14228),
170             # but it cannot handle $1 in rexex, e.g.
171             # Write::Treex substitute='{(.+)-(.+)}{$2-$1}
172              
173 0         0 my ($fn, $directories) = fileparse($filename, $self->_document_extension($document));
174 0         0 $directories =~ s{/$}{};
175 0         0 $document->set_path($directories);
176 0         0 $document->set_file_stem($fn);
177 0         0 $document->set_file_number('');
178             }
179              
180 1         4 return $filename;
181             }
182              
183             # my $regex = $self->substitute;
184             # my $delimiter = ",";
185             # my $lc = ($regex =~ /i$/);
186             #
187             # # regex can have format {<old>}{<new>}
188             # if ( $regex =~ /^{(.*)}{(.*)}i?$/ ) {
189             # $regex =~ s/^{(.*)}{(.*)}i?$/$1,$2/;
190             # }
191             # # otherwise regex should have format /<original>/<replacement>/, where "/" is delimiter of a choice
192             # else {
193             # $delimiter = substr($regex, 0, 1);
194             # $regex =~ s/^\Q$delimiter\E(.*)\Q$delimiter\E$/$1/;
195             # }
196             #
197             # my @old_new_regex = split /\Q$delimiter\E/, $regex;
198             # log_fatal "Incorrect form of regex $self->substitute" if (scalar @old_new_regex != 2);
199             # $filename =~ s/$old_new_regex[0]/$old_new_regex[1]/i if $lc;
200             # $filename =~ s/$old_new_regex[0]/$old_new_regex[1]/ if !$lc;
201              
202              
203              
204             # Default process_document method for all Writer blocks.
205             override 'process_document' => sub {
206             my ( $self, $document ) = @_;
207              
208             # set _file_handle properly (this MUST be called if process_document is overridden)
209             $self->_prepare_file_handle($document);
210              
211             $self->_do_before_process($document);
212              
213             # call the original process_document with _file_handle set
214             $self->_do_process_document($document);
215              
216             $self->_do_after_process($document);
217              
218             # This is not needed as the current file handle will be closed when opening the next file
219             # (in _prepare_file_handle) or at the end of the process.
220             # However, commenting the following line leads to undeterministic errors (e.g. in en2cs)
221             # UNFINISHED JOB e2c-news-dev2009-job001 PRODUCED EPILOG.
222             # On the other hand, always closing the handle prevents
223             # treex Read::Treex from=@my.list Write::Sentences to=out.txt
224             # As a workaround I decided to close the handle only in "treex -p".
225             # Martin Popel 2014
226             #$self->_close_file_handle() if $self->scenario->runner->jobindex;
227             ###
228             # For some reason the scenario->runner was not defined in some cases, so
229             # we test it too
230             # Dusan Varis 2014
231             # Also $self->scenario may not be defined if the writer is instantiated directly from Perl.
232             $self->_close_file_handle() if $self->scenario && $self->scenario->runner && $self->scenario->runner->jobindex;
233             # or if $self->scenario->runner->isa("Treex::Core::Parallel::Node")?
234              
235             return;
236             };
237              
238             sub _do_process_document
239             {
240 0     0   0 my ($self, $document) = @_;
241              
242 0         0 $self->Treex::Core::Block::process_document($document);
243              
244 0         0 return;
245             }
246              
247             sub _do_before_process {
248 0     0   0 my ($self, $document) = @_;
249              
250 0         0 return;
251             }
252              
253             sub _do_after_process {
254 0     0   0 my ($self, $document) = @_;
255              
256 0         0 return;
257             }
258              
259             override 'process_end' => sub {
260             my $self = shift;
261              
262             $self->_close_file_handle();
263              
264             return;
265             };
266              
267             sub _close_file_handle
268             {
269 2     2   5 my $self = shift;
270              
271             #log_warn("CLOSE FH: " . $self->_file_handle . "; LAST: " . $self->_last_filename);
272              
273             # close the previous one (except if it's stdout)
274 2 50 33     65 if ( defined $self->_file_handle
      66        
275             && ( !defined $self->_last_filename || $self->_last_filename ne "-" ) ) {
276             #log_warn("CLOSE - file handle - REAL");
277 0         0 close $self->_file_handle;
278 0         0 $self->_set_file_handle(undef);
279             }
280              
281 2         5 return;
282             }
283              
284             # Prepare the file handle for the next file to be processed.
285             # This MUST be called in all process_document overrides.
286             sub _prepare_file_handle {
287 1     1   4 my ( $self, $document ) = @_;
288              
289 1         8 my $filename = $self->_get_filename($document);
290              
291             #log_warn("PREPARE FILENAME: $filename; LAST: " . $self->_last_filename);
292             #log_warn(int(defined $self->_last_filename) . " + " . int($filename eq $self->_last_filename) . " + " . $filename ne "__FAKE_OUTPUT__");
293              
294 1 50 33     37 if ( defined $self->_last_filename && $filename eq $self->_last_filename && $filename !~ "__FAKE_OUTPUT__") {
      33        
295              
296             # nothing to do, keep writing to the old filename
297             }
298             else {
299             # need to switch output stream
300              
301 1         9 $self->_close_file_handle();
302              
303             # open the new output stream
304 1         16 log_info "Saving to $filename";
305 1         24 $self->_set_file_handle( $self->_open_file_handle($filename) );
306             }
307              
308             # remember last used filename
309 1         44 $self->_set_last_filename($filename);
310 1         3 return;
311             }
312              
313             # Open the given file handle (including compressed variants and standard output).
314             sub _open_file_handle {
315 1     1   5 my ( $self, $filename ) = @_;
316              
317 1 50       5 if ( $filename eq "-" ) {
318 1         17 STDOUT->autoflush(1);
319 1         86 return \*STDOUT;
320             }
321              
322 0           my $opn;
323             my $hdl;
324              
325             # file might not recognize some files!
326 0 0         if ( $filename =~ /\.gz$/ ) {
    0          
327 0           $opn = "gzip -c > '$filename'";
328             }
329             elsif ( $filename =~ /\.bz2$/ ) {
330 0           $opn = "bzip2 > '$filename'";
331             }
332              
333 0           mkpath( dirname($filename) );
334 0 0         if ($opn) {
335 0           open ( $hdl, '|-', $opn );
336             }
337             else {
338 0           open ( $hdl, '>', $filename );
339             }
340              
341 0           $hdl->autoflush(1);
342 0           return $hdl;
343             }
344              
345             1;
346              
347             __END__
348              
349             =encoding utf-8
350              
351             =head1 NAME
352              
353             Treex::Block::Write::BaseWriter
354              
355             =head1 VERSION
356              
357             version 2.20210102
358              
359             =head1 DESCRIPTION
360              
361             This is the base class for document writer blocks in Treex.
362              
363             It handles selecting and opening the output files, allowing for output of one-file per document.
364             The output file name(s) may be set in several ways (standard output may also be used as a file
365             with the name '-'); GZip file compression is supported.
366              
367             Other features, such as writing all documents to one file or setting character encoding,
368             are enabled in L<Treex::Block::Write::BaseTextWriter>.
369              
370             =head1 PARAMETERS
371              
372             =over
373              
374             =item C<to>
375              
376             Space-or-comma-separated list of output file names.
377              
378             =item C<file_stem>, C<path>
379              
380             These override the respective attributes in documents
381             (filled in by a L<DocumentReader|Treex::Core::DocumentReader>),
382             which are used for generating output file names.
383              
384             =item C<stem_suffix>
385              
386             A string to append after C<file_stem>.
387              
388             =item C<compress>
389              
390             If set to 1, the output files are compressed using GZip (if C<to> is used to set
391             file names, the names must also contain the ".gz" suffix).
392              
393             =back
394              
395             =head1 DERIVED CLASSES
396              
397             The derived classes should just use C<print { $self->_file_handle } "output text">, the
398             base class will take care of opening the proper file.
399              
400             All derived classes that override the C<process_document> method directly must call
401             the C<_prepare_file_handle> method to gain access to the correct file handle.
402              
403             The C<extension> parameter should be overriden with the default file extension
404             for the given file type.
405              
406             =head1 TODO
407              
408             =over
409              
410             =item *
411              
412             Set C<compress> if file name contains .gz or .bz2? Add .gz to extension to even for file names set with
413             the C<to> parameter if C<compress> is set to true?
414              
415             =item *
416              
417             Possibly rearrange somehow so that the C<_prepare_file_handle> method is not needed. The problem is that
418             if this was a Moose role, it would have to be applied only after an override to C<process_document>. The
419             Moose C<inner> and C<augment> operators are a possibility, but would not remove a need for a somewhat
420             non-standard behavior in derived classes (one could not just override C<process_document>, but would
421             have to C<augment> it).
422              
423             =back
424              
425             =head1 AUTHORS
426              
427             Ondřej Dušek <odusek@ufal.mff.cuni.cz>
428              
429             Martin Popel <popel@ufal.mff.cuni.cz>
430              
431             Ondřej Bojar <bojar@ufal.mff.cuni.cz>
432              
433             =head1 COPYRIGHT AND LICENSE
434              
435             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
436              
437             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.