File Coverage

blib/lib/Treex/Block/Read/BaseReader.pm

Criterion	Covered	Total	%
statement	45	67	67.1
branch	10	24	41.6
condition	3	9	33.3
subroutine	10	10	100.0
pod	4	5	80.0
total	72	115	62.6

line	stmt	bran	cond	sub	pod	time	code
1							package Treex::Block::Read::BaseReader;
2							$Treex::Block::Read::BaseReader::VERSION = '2.20210102';
3	3			3		141067	use Moose;
	3					484380
	3					23
4	3			3		21594	use Treex::Core::Common;
	3					10
	3					22
5	3			3		18468	use File::Slurp;
	3					11
	3					310
6							with 'Treex::Core::DocumentReader';
7	3			3		1941	use Treex::Core::Document;
	3					17
	3					1776
8
9							sub next_document {
10	1			1	1	2387	my ($self) = @_;
11	1					9	return log_fatal "method next_document must be overridden in " . ref($self);
12							}
13
14							# Default language is und (undetermined/unknown) and selector empty.
15							has selector => ( isa => 'Treex::Type::Selector', is => 'ro', default => q{} );
16							has language => ( isa => 'Treex::Type::LangCode', is => 'ro', default => 'und' );
17
18
19							has from => (
20							isa => 'Treex::Core::Files',
21							is => 'rw',
22							coerce => 1,
23							required => 1,
24							handles => [qw(current_filename file_number _set_file_number)],
25							documentation => 'arrayref of filenames to be loaded, '
26							. 'coerced from a space or comma separated list of filenames, '
27							. 'see POD for details',
28							);
29
30							has file_stem => (
31							isa => 'Str',
32							is => 'ro',
33							documentation => 'how to name the loaded documents',
34							);
35
36							has is_one_doc_per_file => (
37							is => 'rw',
38							isa => 'Bool',
39							default => 1,
40							);
41
42							has _file_numbers => ( is => 'rw', default => sub { {} } );
43
44							has _file_number_width => (
45							is => 'rw',
46							isa => 'Int',
47							default => 3,
48							documentation => 'The number of digits for numbered filenames. '
49							. 'The default (3) will create filenames with three digits as "001.treex.gz".'
50							);
51
52							has skip_finished => (
53							isa => 'Str',
54							is => 'ro',
55							documentation => 'Skip input files for which a matching non-empty output file exists '
56							. '(presumably created by a previous unfinished Treex run). '
57							. 'This parameter specifies a regex substitution how to derive the output filename from the input filename. '
58							. 'It is parallel to the parameter substitute={indir}{outdir} in writers. '
59							. 'However, you need to take care of filename extensions too, '
60							. 'e.g. if converting conll to treex, you should use skip_finished={indir/(.+).conll$}{outdir/$1.treex.gz}',
61							);
62
63
64							sub BUILD {
65	9			9	0	13182	my ( $self, $args ) = @_;
66	9	50				358	if (my $regex = $self->skip_finished){
67	0					0	my $filenames_ref = $self->from->filenames;
68	0					0	my @filtered_filenames;
69	0					0	my $eval_string = '$filename =~ s' . $regex . '; 1;';
70
71	0					0	for my $input_filename (@$filenames_ref){
72	0					0	my $filename = $input_filename;
73
74							# see r14228 for an alternative implementation (without stringy eval) which cannot handle $1 in regex
75	0	0				0	eval $eval_string or log_fatal "Failed to eval $eval_string"; ## no critic qw(BuiltinFunctions::ProhibitStringyEval)
76
77	0	0				0	if (! -s $filename){
78	0					0	push @filtered_filenames, $input_filename;
79							#say "not finished: $input_filename -> $filename";
80							} #else {say "finished: $input_filename -> $filename";}
81							}
82	0					0	$self->from->_set_filenames(\@filtered_filenames);
83	0					0	my $input_number = @$filenames_ref;
84	0					0	my $filtered_number = @filtered_filenames;
85	0					0	my $finished_number = $input_number - $filtered_number;
86	0					0	log_info "$finished_number files out of $input_number were finished, reading only the remaining $filtered_number.";
87							}
88	9					34	return;
89							}
90
91							sub next_filename {
92	1			1	1	7	my ($self) = @_;
93
94							# return undef, but do not move further if we are at the end of document list (we might need the current file name)
95	1	50				7	return if ( $self->file_number >= $self->from->number_of_files );
96
97	1					6	$self->_set_file_number( $self->file_number + 1 );
98	1					6	return $self->current_filename();
99							}
100
101	3			3		35	use File::Spec;
	3					10
	3					1648
102
103							sub new_document {
104	1			1	1	6	my ( $self, $load_from ) = @_;
105	1					10	my $path = $self->current_filename();
106	1	50				4	log_fatal "next_filename() must be called before new_document()" if !defined $path;
107	1					29	my ( $volume, $dirs, $file ) = File::Spec->splitpath($path);
108
109							# Delete file extension, e.g.
110							# file.01.conll -> file.01
111							# cs42.treex.gz -> cs42
112	1					8	$file =~ s/\.[^.]+(\.gz)?$//;
113
114							# Substitute standard input for noname.
115	1					4	$file =~ s/^-$/noname/;
116
117	1					7	my %args = ( file_stem => $file, loaded_from => $path );
118	1	50				5	if ( defined $dirs ) {
119	1					6	$args{path} = $volume . $dirs;
120							}
121
122							# Override the naming heuristics above, if file_stem was specified.
123	1	50				42	if ( $self->file_stem ) {
124	0					0	$args{file_stem} = $self->file_stem;
125							}
126
127	1	50	33			101	if ( $self->is_one_doc_per_file && !$self->file_stem ) {
128	1					5	$args{file_number} = q{};
129							}
130							else {
131	0					0	my $num = $self->_file_numbers->{$file};
132	0					0	$self->_file_numbers->{$file} = ++$num;
133	0					0	my $fmt = "%0".$self->_file_number_width."d";
134	0					0	$args{file_number} = sprintf $fmt, $num;
135							}
136
137	1	50				5	if ( defined $load_from ) {
138	0					0	$args{filename} = $load_from;
139							}
140
141	1					43	$self->_set_doc_number( $self->doc_number + 1 );
142
143	1					3	my $document;
144	1	50	33			20	if ( defined $load_from and $load_from =~ /\.streex$/ ) {
145	0					0	$document = Treex::Core::Document->retrieve_storable($load_from);
146	0					0	$document->set_storable(1);
147							}
148							else {
149	1					46	$document = Treex::Core::Document->new( \%args );
150							}
151
152	1	50	33			10	if ( defined $load_from && $load_from =~ /\.gz$/ ) {
153	0					0	$document->set_compress(1);
154							}
155
156	1					6	return $document;
157							}
158
159							sub number_of_documents {
160	2			2	1	1152	my $self = shift;
161	2	50				97	return $self->is_one_doc_per_file ? $self->from->number_of_files : undef;
162							}
163
164							after 'restart' => sub {
165							my $self = shift;
166							$self->_set_file_number(0);
167							};
168
169							1;
170
171							__END__
172
173							=pod
174
175							=encoding utf-8
176
177							=head1 NAME
178
179							Treex::Block::Read::BaseReader - abstract ancestor for document readers
180
181							=head1 VERSION
182
183							version 2.20210102
184
185							=head1 DESCRIPTION
186
187							This class serves as a common ancestor for document readers
188							that have the parameter C<from> with a space or comma separated list of filenames
189							to be loaded.
190							It is designed to implement the L<Treex::Core::DocumentReader> interface.
191
192							In derived classes you need to define the C<next_document> method,
193							and you can use C<next_filename> and C<new_document> methods.
194
195							=head1 ATTRIBUTES
196
197							=over
198
199							=item from (required)
200
201							space or comma separated list of filenames, or C<-> for STDIN
202
203							An '@' directly in front of a file name causes this file to be interpreted as a file
204							list, with one file name per line, e.g. '@filelist.txt' causes the reader to open
205							'filelist.txt' and read a list of files from it. File lists may be arbitrarily
206							mixed with regular files in the parameter.
207
208							Similarly, you can use I<!> for wildcard expansion, e.g.
209							C<treex -Len Read::Treex from='!dir??/file*.txt'>.
210							The single quotes are needed for two reasons.
211							First, to prevent bash from interpreting the wildcard characters.
212							Second, to prevent bash from interpreting the exclamation mark as history expansion.
213
214							The I<@filelist> and I<!wildcard> conventions are used in several tools, e.g. 7z or javac.
215
216							(If you use this method via API you can specify a string array reference or a
217							L<Treex::Core::Files> object.)
218
219							=item file_stem (optional)
220
221							How to name the loaded documents.
222							This attribute will be saved to the same-named
223							attribute in documents and it will be used in document writers
224							to decide where to save the files.
225
226							=back
227
228							=head1 METHODS
229
230							=over
231
232							=item next_document
233
234							This method must be overridden in derived classes.
235							(The implementation in this class just issues a fatal error.)
236
237							=item next_filename
238
239							returns the next filename (full path) to be loaded
240							(from the list specified in the attribute C<from>)
241
242							=item new_document($load_from?)
243
244							Returns a new empty document with pre-filled attributes
245							C<loaded_from>, C<file_stem>, C<file_number> and C<path>
246							which are guessed based on C<current_filename>.
247
248							=item current_filename
249
250							returns the last filename returned by C<next_filename>
251
252							=item is_next_document_for_this_job
253
254							Is the document that will be returned by C<next_document>
255							supposed to be processed by this job?
256							This is relevant only in parallel processing,
257							where each job has a different C<$jobnumber> assigned.
258
259							=item number_of_documents
260
261							Returns the number of documents that will be read by this reader.
262							If C<is_one_doc_per_file> returns C<true>, then the number of documents
263							equals the number of files given in C<from>.
264							Otherwise, this method returns C<undef>.
265
266							=back
267
268							=head1 SEE ALSO
269
270							L<Treex::Block::Read::BaseTextReader>
271							L<Treex::Block::Read::Text>
272
273							=head1 AUTHOR
274
275							Martin Popel <popel@ufal.mff.cuni.cz>
276
277							=head1 COPYRIGHT AND LICENSE
278
279							Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
280
281							This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.