File Coverage

blib/lib/Treex/Core/Files.pm
Criterion Covered Total %
statement 68 95 71.5
branch 17 40 42.5
condition 2 6 33.3
subroutine 17 20 85.0
pod 8 10 80.0
total 112 171 65.5


line stmt bran cond sub pod time code
1             package Treex::Core::Files;
2             $Treex::Core::Files::VERSION = '2.20210102';
3 28     28   2034 use Moose;
  28         472685  
  28         268  
4 28     28   226100 use MooseX::SemiAffordanceAccessor 0.09;
  28         38931  
  28         165  
5 28     28   245812 use Treex::Core::Log;
  28         73  
  28         2498  
6 28     28   15416 use autodie;
  28         381026  
  28         179  
7 28     28   192724 use File::Slurp 9999.19;
  28         1045  
  28         2632  
8 28     28   233 use Digest::MD5 qw(md5_hex);
  28         125  
  28         1565  
9 28     28   16259 use PerlIO::via::gzip;
  28         1483798  
  28         1180  
10 28     28   265 use File::Basename;
  28         71  
  28         40481  
11              
12             has filenames => (
13             is => 'ro',
14             isa => 'ArrayRef[Str]',
15             writer => '_set_filenames',
16             );
17              
18             has file_number => (
19             isa => 'Int',
20             is => 'ro',
21             writer => '_set_file_number',
22             default => 0,
23             init_arg => undef,
24             documentation => 'Number of the current file',
25             );
26              
27             has current_filehandle => (
28             is => 'ro',
29             writer => '_set_current_filehandle',
30             );
31              
32             has encoding => (
33             isa => 'Str',
34             is => 'rw',
35             default => 'utf8',
36             );
37              
38             has join_files_for_next_line => (
39             isa => 'Bool',
40             is => 'ro',
41             default => 1,
42             documentation => 'Should method next_line automatically go to the next file when finished reading the current file?',
43             );
44              
45             sub BUILD {
46 10     10 0 21799 my ( $self, $args ) = @_;
47 10 50       72 if ($args->{filenames}){
    50          
48             ## Nothing to do, $args->{filenames} are ArrayRef[Str] checked by Moose
49             } elsif(defined $args->{string}){
50 10         53 $self->_set_filenames( $self->string_to_filenames( $args->{string} ) );
51             } else {
52 0         0 log_fatal 'One of the parameters (filenames, string) is required';
53             }
54 10         36 return;
55             }
56              
57             sub string_to_filenames {
58 10     10 1 37 my ( $self, $string ) = @_;
59            
60             # "!" means glob pattern which can contain {dir1,dir2}
61             # so it cannot be combined with separating tokens with comma.
62 10 50       54 if ($string =~ /^!(.+)/) {
63 0         0 my @filenames = glob $1;
64 0 0       0 log_warn "No filenames matched '$1' pattern" if !@filenames;
65 0         0 return \@filenames;
66             }
67            
68 10         62 return [ map { $self->_token_to_filenames($_) } grep {/./} split /[ ,]+/, $string ];
  10         50  
  10         67  
69             }
70              
71             sub _token_to_filenames {
72 10     10   33 my ( $self, $token ) = @_;
73 10 50       51 if ($token =~ /^!(.+)/) {
74 0         0 my @filenames = glob $1;
75 0 0       0 log_warn "No filenames matched '$1' pattern" if !@filenames;
76 0         0 return @filenames;
77             }
78 10 100       393 return $token if $token !~ s/^@(.*)/$1/;
79 1 50       4 my $filelist = $token eq '-' ? \*STDIN : $token;
80 1         7 my @filenames = grep { $_ ne '' } read_file( $filelist, chomp => 1 );
  2         235  
81              
82             # Filnames in a filelist can be relative to the filelist directory.
83 1         72 my $dir = dirname($token);
84 1 50       49 return @filenames if $dir eq '.';
85 0 0       0 return map {!m{^/} ? "$dir/$_" : $_} @filenames;
  0         0  
86             }
87              
88             sub number_of_files {
89 3     3 1 9 my ($self) = @_;
90 3         7 return scalar @{ $self->filenames };
  3         90  
91             }
92              
93             sub current_filename {
94 5     5 1 50 my ($self) = @_;
95 5 100 66     137 return if $self->file_number == 0 || $self->file_number > @{ $self->filenames };
  5         130  
96 4         109 return $self->filenames->[ $self->file_number - 1 ];
97             }
98              
99             sub next_filename {
100 2     2 1 6 my ($self) = @_;
101 2         70 $self->_set_file_number( $self->file_number + 1 );
102 2         8 return $self->current_filename();
103             }
104              
105             sub has_next_file {
106 0     0 0 0 my ($self) = @_;
107 0         0 return $self->file_number < $self->number_of_files;
108             }
109              
110             sub get_hash {
111 0     0 1 0 my $self = shift;
112              
113 0         0 my $md5 = Digest::MD5->new();
114 0         0 for my $filename (@{$self->filenames}) {
  0         0  
115 0 0       0 if ( -f $filename ) {
116 0         0 $md5->add($filename);
117 0         0 $md5->add((stat($filename))[9]);
118             }
119             }
120 0         0 return $md5->hexdigest;
121             }
122              
123             sub next_filehandle {
124 2     2 1 4 my ($self) = @_;
125 2         16 my $filename = $self->next_filename();
126 2         56 my $FH = $self->current_filehandle;
127            
128 2 100       8 if (!defined $filename){
    50          
129 1         3 $FH = undef;
130             }
131             elsif ( $filename eq '-' ) {
132 0         0 binmode STDIN, $self->encoding;
133 0         0 $FH = \*STDIN;
134             }
135             else {
136 1 50       6 my $mode = $filename =~ /[.]gz$/ ? '<:via(gzip):' : '<:';
137 1         42 $mode .= $self->encoding;
138 1 50       9 open $FH, $mode, $filename or log_fatal "Can't open $filename: $!";
139             }
140 2         2298 $self->_set_current_filehandle($FH);
141 2         13 return $FH;
142             }
143              
144             sub next_file_text {
145 2     2 1 5 my ($self) = @_;
146 2 100       8 my $FH = $self->next_filehandle() or return;
147              
148             # Slurp that is compatible with Perl::IO::via::gzip.
149 1         7 local $/ = undef;
150 1         43 return <$FH>;
151             }
152              
153             sub next_line {
154 0     0 1   my ($self) = @_;
155 0           my $FH = $self->current_filehandle;
156 0 0 0       return if !$FH && !$self->join_files_for_next_line;
157 0 0         if ( !$FH ) {
158 0 0         $FH = $self->next_filehandle() or return;
159             }
160 0           return <$FH>;
161             }
162              
163             #<<<
164 28     28   273 use Moose::Util::TypeConstraints;
  28         77  
  28         445  
165             coerce 'Treex::Core::Files'
166             => from 'Str'
167             => via { Treex::Core::Files->new( string => $_ ) }
168             => from 'ArrayRef[Str]'
169             => via { Treex::Core::Files->new( filenames => $_ ) };
170             #>>>
171             # TODO: POD, next_filehandle, gz support
172              
173             1;
174              
175             __END__
176              
177             =pod
178              
179             =encoding utf-8
180              
181             =head1 NAME
182              
183             Treex::Core::Files - helper class for iterating over filenames
184              
185             =head1 VERSION
186              
187             version 2.20210102
188              
189             =head1 SYNOPSIS
190              
191             package My::Class;
192             use Moose;
193              
194             has from => (
195             is => 'ro',
196             isa => 'Treex::Core::Files',
197             coerce => 1,
198             handles => [qw(next_filename current_filename)],
199             );
200              
201             # and then
202             my $c = My::Class(from=>'f1.txt f2.txt.gz @my.filelist');
203              
204             while (defined (my $filename = $c->next_filename)){ ... }
205             #or
206             while (my $filehandle = $c->next_filehandle){ ... }
207              
208             # You can use also wildcard expansion
209             my $c = My::Class(from=>'!dir??/file*.txt');
210              
211              
212             =head1 DESCRIPTION
213              
214             The I<@filelist> and I<!wildcard> conventions are used in several tools, e.g. 7z or javac.
215             For a large number of files, list the file names in a file - one per line.
216             Then use the list file name preceded by an @ character.
217              
218             Methods <next_*> serve as iterators and return undef if the called after the last file is reached.
219              
220             =head1 METHODS
221              
222             =head2 number_of_files
223              
224             Returns the total number of files contained by this instance.
225              
226             =head2 file_number
227              
228             Returns ordinal number (1..number_of_files) of the current file.
229              
230             =head2 current_filename
231              
232             Returns the current filename or undef if the iterator is before the first file
233             (i.e. C<next_filename> has not been called so far) or after the last file.
234              
235             =head2 next_filename
236              
237             Returns the next filename (and increments the file_number).
238              
239             =head2 current_filehandle
240              
241             Opens the current file for reading and returns the filehandle.
242             Filename "-" is interpreted as STDIN.
243             Filenames with extension ".gz" are opened via L<PerlIO::via::gzip> (ie. unzipped on the fly).
244              
245             =head2 next_filehandle
246              
247             Returns the next filehandle (and increments the file_number).
248              
249             =head2 next_file_text
250              
251             Returns the content of the next file (slurp) and increments the file_number.
252              
253             =head2 next_line
254              
255             Returns the next line of the current file.
256             If the end of file is reached and attribute C<join_files_for_next_line> is set to true (which is by default),
257             the first line of next file is returned (and file_number incremented).
258              
259             =head2 get_hash
260              
261             Returns MD5 hash computed from the filenames and last modify times.
262              
263             =head2 $filenames_ref = string_to_filenames($string)
264              
265             Helper method that expands comma-or-space-separated list of filenames
266             and returns an array reference containing the filenames.
267             If the string starts with "!", it is interpreted as wildcards (see Perl L<glob>).
268             If a filename starts with "@" it is interpreted as a file list with one filename per line.
269              
270             =head1 AUTHOR
271              
272             Martin Popel <popel@ufal.mff.cuni.cz>
273              
274             =head1 COPYRIGHT AND LICENSE
275              
276             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
277              
278             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.