File Coverage

blib/lib/Treex/Block/Read/BaseAlignedReader.pm
Criterion Covered Total %
statement 2 4 50.0
branch n/a
condition n/a
subroutine 2 2 100.0
pod n/a
total 4 6 66.6


line stmt bran cond sub pod time code
1             package Treex::Block::Read::BaseAlignedReader;
2             BEGIN {
3 1     1   26601 $Treex::Block::Read::BaseAlignedReader::VERSION = '0.08170';
4             }
5 1     1   1723 use Moose;
  0            
  0            
6             use Treex::Core::Common;
7             with 'Treex::Core::DocumentReader';
8             use Treex::Core::Document;
9              
# Placeholder for the document-producing method. This base class has no
# implementation of its own: it only reports, through log_fatal, the class
# of the object that failed to supply one.
sub next_document {
    my $self  = shift;
    my $class = ref $self;
    return log_fatal( "method next_document must be overriden in " . $class );
}
14              
# Read-only selector attribute; the empty string is used when none is given.
has selector => (
    is      => 'ro',
    isa     => 'Treex::Type::Selector',
    default => '',
);

# Optional, read-only name stem for the loaded documents. When it is not
# set, new_document() derives a stem from the input filenames instead.
has file_stem => (
    is            => 'ro',
    isa           => 'Str',
    documentation => 'how to name the loaded documents',
);
22              
# private attributes

# Maps each zone label (e.g. "en" or "en_src") to a reference to the array
# of filenames given for that zone. It cannot be passed to the constructor
# (init_arg => undef); BUILD fills it in from the zone-label arguments.
# The values are array references, so the constraint is
# HashRef[ArrayRef[Str]] rather than HashRef[Str].
has _filenames => (
    isa           => 'HashRef[ArrayRef[Str]]',
    is            => 'ro',
    init_arg      => undef,
    default       => sub { {} },
    documentation => 'mapping zone_label->filenames to be loaded;'
        . ' automatically initialized from constructor arguments',
);
32              
# Number of files listed for every zone. Starts at 0 and is set by BUILD
# from the first zone-label argument it processes.
has _files_per_zone => (
    is      => 'rw',
    default => 0,
);

# One-based position of the current n-tuple of input files; 0 means that
# nothing has been requested yet. Not settable through the constructor.
has _file_number => (
    is            => 'rw',
    isa           => 'Int',
    init_arg      => undef,
    default       => 0,
    documentation => 'Number of n-tuples of input files loaded so far.',
);
42              
# BUILD processes the generic constructor arguments -- currently only the
# zone-label shortcuts of the form "langcode" or "langcode_selector".
#
# For every argument whose part before the first underscore is accepted by
# is_lang_code, the value is trimmed and split on spaces/commas into a list
# of filenames, which is stored in _filenames under the full argument name.
# The first such zone fixes _files_per_zone; any later zone with a different
# number of files triggers log_fatal.
#
# Arguments that are not zone labels are ignored silently when they are known
# reader parameters (selector, language, scenario, file_stem) and reported
# with log_warn otherwise. "file_stem" is a documented parameter of this
# class, so it must not produce the "not a zone label" warning.
sub BUILD {
    my ( $self, $args ) = @_;
    foreach my $arg ( keys %{$args} ) {

        # Only the language part is needed here; the selector stays part of
        # the zone label used as the hash key below.
        my $lang = $arg;
        if ( $arg =~ /_/ ) {
            ($lang) = split /_/, $arg;
        }

        if ( is_lang_code($lang) ) {
            my $files_string = $args->{$arg};
            $files_string =~ s/^\s+|\s+$//g;
            my @files = split( /[ ,]+/, $files_string );
            if ( !$self->_files_per_zone ) {
                $self->_set_files_per_zone( scalar @files );
            }
            elsif ( @files != $self->_files_per_zone ) {
                log_fatal("All zones must have the same number of files");
            }
            $self->_filenames->{$arg} = \@files;
        }
        elsif ( $arg =~ /selector|language|scenario|file_stem/ ) {

            # Known non-zone parameters: nothing to do here.
        }
        else {
            log_warn("$arg is not a zone label (e.g. en_src)");
        }
    }
    return;
}
68              
# Returns the current n-tuple of input files as a flat list of
# (zone_label => filename) pairs, suitable for assigning to a hash.
# Returns nothing when no file has been requested yet (_file_number is 0)
# or when the position is past the last file of each zone.
sub current_filenames {
    my ($self) = @_;

    my $position = $self->_file_number;
    my $in_range = $position != 0 && $position <= $self->_files_per_zone;
    return if !$in_range;

    my $files_of = $self->_filenames;
    my @pairs;
    for my $zone_label ( keys %{$files_of} ) {
        push @pairs, $zone_label, $files_of->{$zone_label}[ $position - 1 ];
    }
    return @pairs;
}
75              
# Moves on to the next n-tuple of input files (increments _file_number) and
# returns whatever current_filenames() yields for the new position.
sub next_filenames {
    my $self = shift;

    my $advanced = $self->_file_number + 1;
    $self->_set_file_number($advanced);

    return $self->current_filenames;
}
81              
# Creates a new Treex::Core::Document whose naming attributes are derived
# from the current n-tuple of input files.
#
# Parameters:
#   $load_from - optional; when defined, it is passed to the document
#                constructor as "filename".
#
# Returns the newly constructed Treex::Core::Document. Calls log_fatal when
# current_filenames() returns nothing, i.e. before the first call of
# next_filenames() or after the input has been exhausted.
#
# The document's file_stem is the file_stem attribute if it is set. Otherwise
# it is guessed from the input filenames: the extension and the zone markers
# are removed from each file name and the distinct remainders are joined with
# underscores ("noname" if nothing is left).
sub new_document {
    my ( $self, $load_from ) = @_;
    my %filenames = $self->current_filenames();
    log_fatal("next_filenames() must be called before new_document()") if !%filenames;

    # File::Spec is not loaded explicitly at the top of this file.
    require File::Spec;

    # Hash order differs between runs; iterate in sorted order so that the
    # guessed stem, the path and loaded_from are reproducible.
    my @zone_labels = sort keys %filenames;

    my ( $stem, $file_number ) = ( '', '' );
    my ( $volume, $dirs, $file );
    if ( $self->file_stem ) {
        ( $stem, $file_number ) = ( $self->file_stem, undef );
    }
    else {    # Magical heuristics how to choose default name for a document loaded from several files
        foreach my $zone_label (@zone_labels) {
            my $filename = $filenames{$zone_label};
            ( $volume, $dirs, $file ) = File::Spec->splitpath($filename);

            # The extension is thrown away because it is not used.
            my ($name) = $file =~ /([^.]+)(?:\..+)?/;
            $name = '' if !defined $name;    # e.g. an empty file part

            my ( $lang, $sele ) = ( $zone_label, '' );
            if ( $zone_label =~ /_/ ) {
                ( $lang, $sele ) = split /_/, $zone_label;
                $sele = '' if !defined $sele;    # label with a trailing underscore
            }

            # Strip the zone markers (with an optional separator on each side).
            # NOTE(review): when $sele is empty the alternation contains an
            # empty branch, so every "_" and "-" is removed from the name as
            # well. Kept as is because existing document names depend on it.
            $name =~ s/[_-]?(?:\Q$lang\E|\Q$sele\E|\Q$zone_label\E)[_-]?//gi;

            if ( !$name && !$stem ) {
                $name        = 'noname';
                $file_number = undef;
            }

            # Append the name unless the stem already contains it. A plain
            # substring test is used: the name is data, not a pattern, and an
            # empty pattern would silently reuse the last successful regex.
            if ( index( $stem, $name ) < 0 ) {
                if ( $stem ne '' ) {
                    $stem .= '_';
                }
                $stem .= $name;
            }
        }
    }

    $self->_set_doc_number( $self->doc_number + 1 );
    return Treex::Core::Document->new(
        {
            file_stem   => $stem,
            loaded_from => join( ',', map { $filenames{$_} } @zone_labels ),
            defined $file_number ? ( file_number => $file_number )    : (),
            defined $dirs        ? ( path        => $volume . $dirs ) : (),
            defined $load_from   ? ( filename    => $load_from )      : (),
        }
    );
}
126              
# Reports how many documents this reader yields: the value of
# _files_per_zone, i.e. the number of files listed for each zone.
sub number_of_documents {
    my ($self) = @_;
    my $document_count = $self->_files_per_zone;
    return $document_count;
}
131              
# Runs after restart(): rewinds the file position to 0 so that the next
# call of next_filenames() starts again with the first n-tuple of files.
after restart => sub {
    my ($self) = @_;
    $self->_set_file_number(0);
    return;
};
136              
137             1;
138              
139             __END__
140              
141             =for Pod::Coverage BUILD
142              
143             =head1 NAME
144              
145             Treex::Block::Read::BaseAlignedReader - abstract ancestor for parallel-corpora document readers
146              
147             =head1 VERSION
148              
149             version 0.08170
150              
151             =head1 SYNOPSIS
152              
153             # in scenarios
154             Read::MyAlignedFormat en=english.txt de=german.txt
155              
156             # Zones can differ also in selectors, any number of zones can be read
157             Read::MyAlignedFormat en_ref=ref1,ref2 en_moses=mos1,mos2 en_tectomt=tmt1,tmt2
158              
159             =head1 DESCRIPTION
160              
161             This class serves as a common ancestor for document readers
162             that read more zones at once -- usually parallel sentences in two (or more) languages.
163             The readers take parameters named after the zones; the value of each parameter
164             is a space- or comma-separated list of filenames to be loaded into the given zone.
165             The class is designed to implement the L<Treex::Core::DocumentReader> interface.
166              
167             In derived classes you need to define the C<next_document> method,
168             and you can use C<next_filenames> and C<new_document> methods.
169              
170             =head1 ATTRIBUTES
171              
172             =over
173              
174             =item any parameter in a form of a valid I<zone_label>
175              
176             space or comma separated list of filenames, or C<-> for STDIN.
177              
178             =item file_stem (optional)
179              
180             How to name the loaded documents.
181             This attribute will be saved to the same-named
182             attribute in documents and it will be used in document writers
183             to decide where to save the files.
184              
185             =back
186              
187             =head1 METHODS
188              
189             =over
190              
191             =item next_document
192              
193             This method must be overridden in derived classes.
194             (The implementation in this class just issues a fatal error.)
195              
196             =item next_filenames
197              
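198             Advances to the next n-tuple of input files and returns a hash (a list of key-value pairs, not a hashref) of filenames (as given in the zone parameters) to be loaded.
199             The keys of the hash are zone labels, the values are the filenames. An empty list is returned when no files remain.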
200              
201             =item new_document($load_from?)
202              
203             Returns a new empty document with pre-filled attributes
204             C<loaded_from>, C<file_stem>, C<file_number> and C<path>
205             which are guessed based on C<current_filenames>.
206              
207             =item current_filenames
208              
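209             Returns the filenames most recently returned by C<next_filenames>, in the same hash form (an empty list before the first call of C<next_filenames> or after the last file).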
210              
211             =item number_of_documents
212              
213             Returns the number of documents that will be read by this reader.
214              
215             =back
216              
217             =head1 SEE ALSO
218              
219             L<Treex::Block::Read::BaseReader>
220             L<Treex::Block::Read::BaseAlignedTextReader>
221              
222             =head1 AUTHOR
223              
224             Martin Popel
225              
226             =head1 COPYRIGHT AND LICENSE
227              
228             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
229              
230             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.