File Coverage

blib/lib/Treex/Core/DocumentReader.pm
Criterion Covered Total %
statement 14 33 42.4
branch 4 14 28.5
condition 2 3 66.6
subroutine 4 6 66.6
pod 3 4 75.0
total 27 60 45.0


line stmt bran cond sub pod time code
1             package Treex::Core::DocumentReader;
2             $Treex::Core::DocumentReader::VERSION = '2.20210102';
3 3     3   2459 use Moose::Role;
  3         11  
  3         33  
4              
5             # with Moose >= 2.00, this must be present also in roles
6 3     3   13790 use MooseX::SemiAffordanceAccessor 0.09;
  3         98  
  3         32  
7              
8             requires 'next_document';
9              
10             requires 'number_of_documents';
11              
12             # attrs for distributed processing
13             # TODO: check jobs >= jobindex > 0
14             has jobs => (
15             is => 'rw',
16             isa => 'Int',
17             documentation => 'number of jobs for parallel processing',
18             );
19              
20             has jobindex => (
21             is => 'rw',
22             isa => 'Int',
23             documentation => 'ordinal number of the current job in parallel processing',
24             );
25              
26             # TODO: this should not be needed in future
27             has outdir => (
28             is => 'rw',
29             isa => 'Str',
30             );
31              
32             has doc_number => (
33             isa => 'Int',
34             is => 'ro',
35             writer => '_set_doc_number',
36             default => 0,
37             init_arg => undef,
38             documentation => 'Number of documents loaded so far, i.e.'
39             . ' the ordinal number of the current (most recently loaded) document.',
40             );
41              
42             has consumer => (
43             isa => 'Treex::Block::Read::ConsumerReader',
44             is => 'rw'
45             );
46              
47             sub next_document_for_this_job {
48 2     2 1 260 my ($self) = @_;
49              
50             # In parallel execution, the file name is sent from the head to the workers via TCP
51             # and only one doc per file is allowed, so we can override the file list to contain just
52             # the file to be processed and set the $self->file_number counter to 0 – just before the file
53             # to be processed (we will get another file name and reset it again next time).
54             #
55             # $self->doc_number is set to the number of processed files minus 1 since it will be increased
56             # in next_document().
57             #
58             # This is an ugly hack (next_filename and _set_file_number are defined only in BaseReader and BaseAlignedReader),
59             # but this code must be specified here in next_document_for_this_job because the method next_filename
60             # may be overridden or may not be used at all (e.g., BaseTextReader delegates its functionality
61             # to Treex::Core::Files).
62              
63 2 50       90 if ( $self->consumer ) {
64 0         0 my $res = $self->consumer->call("next_filename");
65 0 0       0 if ($res) {
66 0         0 $self->_set_file_number(0);
67 0         0 $self->_set_doc_number( $res->{file_number} - 1 );
68            
69             # $res->{result} contains the next file name for plain readers,
70             # a hashref: zone -> file name for aligned readers
71 0 0       0 if (ref($res->{result}) eq 'HASH'){
72             # here we assume that all zones exist in _filenames
73             # (they should since all arguments are passed on to jobs)
74 0         0 while (my ($zone, $filename) = each %{$res->{result}}){
  0         0  
75 0         0 $self->_filenames->{$zone}->_set_filenames( [ $filename ] );
76             }
77             }
78             else {
79 0         0 $self->from->_set_filenames( [ $res->{result} ] );
80             }
81             }
82              
83             # Martin Majliš had the following for BaseAlignedReader but I see no reason for it.
84             # elsif ($self->_files_per_zone){
85             # $self->_set_file_number($self->_files_per_zone + 2);
86             #}
87             else {
88 0         0 return;
89             }
90             }
91              
92 2         12 my $doc = $self->next_document();
93              
94             # TODO this is not very elegant
95             # and it is also wrong, because if next_document issues some warnings,
96             # these are printed into a wrong file.
97             # However, I don't know how to get the correct doc_number before executing next_document.
98             # Regarding perlcritic ProtectPrivateSubs:
99             # I consider _redirect_output as internal for Treex::Core modules.
100             # print STDERR "DOC: " . $doc . " : " . $self->doc_number . ", JOB: " . $self->jobindex . "\n";
101              
102 2 50 66     43 if ( $doc && $self->jobindex ) {
103 0         0 Treex::Core::Parallel::Node::_redirect_output( $self->outdir, $self->doc_number, $self->jobindex ); ## no critic (ProtectPrivateSubs)
104             }
105              
106 2         10 return $doc;
107             }
108              
109             sub number_of_documents_per_this_job {
110 1     1 1 4 my ($self) = @_;
111 1 50       11 my $total = $self->number_of_documents() or return;
112 1 50       39 return $total if !$self->jobs;
113 0           my $rest = $total % $self->jobs;
114 0           my $div = ( $total - $rest ) / $self->jobs;
115 0 0         return $div + ( $rest >= $self->jobindex ? 1 : 0 );
116             }
117              
118             sub restart {
119 0     0 1   my ($self) = @_;
120 0           $self->_set_doc_number(0);
121 0           return;
122             }
123              
124             # Readers usually do not need any share files,
125             # but all blocks should implement this method
126             # and readers do not extend Treex::Core::Block.
127             sub get_required_share_files {
128 0     0 0   my ($self) = @_;
129 0           return ();
130             }
131              
132             1;
133              
134             __END__
135              
136             =encoding utf-8
137              
138             =head1 NAME
139              
140             Treex::Core::DocumentReader - interface for all document readers
141              
142             =head1 VERSION
143              
144             version 2.20210102
145              
146             =head1 DESCRIPTION
147              
148             Document readers are the Treex mechanism for loading documents to be processed by Treex.
149             The documents can be stored in files (in various formats) or read from C<STDIN>
150             or retrieved from a socket etc.
151              
152             =head1 METHODS
153              
154             =head2 To be implemented
155              
156             These methods must be implemented in classes that consume this role.
157              
158             =over
159              
160             =item next_document
161              
162             Return next document (L<Treex::Core::Document>).
163              
164             =item number_of_documents
165              
166             Total number of documents that will be produced by this reader.
167             If the number is unknown in advance, C<undef> should be returned.
168              
169             =back
170              
171             =head2 Already implemented
172              
173             =over
174              
175             =item is_current_document_for_this_job
176              
177             Is the document that was most recently returned by C<< $self->next_document() >>
178             supposed to be processed by this job?
179             Job indices and document numbers are 1-based, so e.g. for
180             C<jobs = 5, jobindex = 3> we want to load documents with numbers 3,8,13,18,...
181             C<jobs = 5, jobindex = 5> we want to load documents with numbers 5,10,15,20,...
182             i.e. those documents where C<(doc_number-1) % jobs == (jobindex-1)>.
183              
184             =item next_document_for_this_job
185              
186             Returns the next document that should be processed by this job.
187             If C<jobindex> is set, returns "modulo number of jobs".
188             See C<is_current_document_for_this_job>.
189              
190             =item number_of_documents_per_this_job
191              
192             Total number of documents that will be produced by this reader for this job.
193             It's computed based on C<number_of_documents>, C<jobindex> and C<jobs>.
194              
195             =item restart
196              
197             Start reading again from the first document.
198             This implementation just sets the attribute C<doc_number> to zero.
199             You can add additional behavior using the Moose C<after 'restart'> construct.
200              
201             =back
202              
203             =head1 SEE ALSO
204              
205             L<Treex::Block::Read::Sentences>
206             L<Treex::Block::Read::Text>
207             L<Treex::Block::Read::Treex>
208              
209              
210             =head1 AUTHOR
211              
212             Martin Popel <popel@ufal.mff.cuni.cz>
213              
214             =head1 COPYRIGHT AND LICENSE
215              
216             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
217              
218             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.