File Coverage

blib/lib/Treex/Core/Files.pm
Criterion Covered Total %
statement 44 95 46.3
branch 7 40 17.5
condition 0 6 0.0
subroutine 12 20 60.0
pod 8 10 80.0
total 71 171 41.5


line stmt bran cond sub pod time code
1             package Treex::Core::Files;
2             $Treex::Core::Files::VERSION = '2.20160630';
3 28     28   1930 use Moose;
  28         395975  
  28         291  
4 28     28   213337 use MooseX::SemiAffordanceAccessor 0.09;
  28         36202  
  28         177  
5 28     28   234033 use Treex::Core::Log;
  28         78  
  28         2155  
6 28     28   15255 use autodie;
  28         303872  
  28         153  
7 28     28   186371 use File::Slurp 9999.19;
  28         938  
  28         2194  
8 28     28   186 use Digest::MD5 qw(md5_hex);
  28         66  
  28         1308  
9 28     28   14797 use PerlIO::via::gzip;
  28         1199246  
  28         1047  
10 28     28   281 use File::Basename;
  28         87  
  28         29892  
11              
12             has filenames => (
13             is => 'ro',
14             isa => 'ArrayRef[Str]',
15             writer => '_set_filenames',
16             );
17              
18             has file_number => (
19             isa => 'Int',
20             is => 'ro',
21             writer => '_set_file_number',
22             default => 0,
23             init_arg => undef,
24             documentation => 'Number of the current file',
25             );
26              
27             has current_filehandle => (
28             is => 'ro',
29             writer => '_set_current_filehandle',
30             );
31              
32             has encoding => (
33             isa => 'Str',
34             is => 'rw',
35             default => 'utf8',
36             );
37              
38             has join_files_for_next_line => (
39             isa => 'Bool',
40             is => 'ro',
41             default => 1,
42             documentation => 'Should method next_line automatically go to the next file when finished reading the current file?',
43             );
44              
45             sub BUILD {
46 1     1 0 2694 my ( $self, $args ) = @_;
47 1 50       5 if ($args->{filenames}){
    50          
48             ## Nothing to do, $args->{filenames} are ArrayRef[Str] checked by Moose
49             } elsif(defined $args->{string}){
50 1         5 $self->_set_filenames( $self->string_to_filenames( $args->{string} ) );
51             } else {
52 0         0 log_fatal 'One of the parameters (filenames, string) is required';
53             }
54 1         3 return;
55             }
56              
57             sub string_to_filenames {
58 1     1 1 3 my ( $self, $string ) = @_;
59            
60             # "!" means glob pattern which can contain {dir1,dir2}
61             # so it cannot be combined with separating tokens with comma.
62 1 50       4 if ($string =~ /^!(.+)/) {
63 0         0 my @filenames = glob $1;
64 0 0       0 log_warn "No filenames matched '$1' pattern" if !@filenames;
65 0         0 return \@filenames;
66             }
67            
68 1         5 return [ map { $self->_token_to_filenames($_) } grep {/./} split /[ ,]+/, $string ];
  1         3  
  1         7  
69             }
70              
71             sub _token_to_filenames {
72 1     1   3 my ( $self, $token ) = @_;
73 1 50       4 if ($token =~ /^!(.+)/) {
74 0         0 my @filenames = glob $1;
75 0 0       0 log_warn "No filenames matched '$1' pattern" if !@filenames;
76 0         0 return @filenames;
77             }
78 1 50       10 return $token if $token !~ s/^@(.*)/$1/;
79 1 50       4 my $filelist = $token eq '-' ? \*STDIN : $token;
80 1         6 my @filenames = grep { $_ ne '' } read_file( $filelist, chomp => 1 );
  2         249  
81              
82             # Filnames in a filelist can be relative to the filelist directory.
83 1         46 my $dir = dirname($token);
84 1 50       38 return @filenames if $dir eq '.';
85 0 0         return map {!m{^/} ? "$dir/$_" : $_} @filenames;
  0            
86             }
87              
88             sub number_of_files {
89 0     0 1   my ($self) = @_;
90 0           return scalar @{ $self->filenames };
  0            
91             }
92              
93             sub current_filename {
94 0     0 1   my ($self) = @_;
95 0 0 0       return if $self->file_number == 0 || $self->file_number > @{ $self->filenames };
  0            
96 0           return $self->filenames->[ $self->file_number - 1 ];
97             }
98              
99             sub next_filename {
100 0     0 1   my ($self) = @_;
101 0           $self->_set_file_number( $self->file_number + 1 );
102 0           return $self->current_filename();
103             }
104              
105             sub has_next_file {
106 0     0 0   my ($self) = @_;
107 0           return $self->file_number < $self->number_of_files;
108             }
109              
110             sub get_hash {
111 0     0 1   my $self = shift;
112              
113 0           my $md5 = Digest::MD5->new();
114 0           for my $filename (@{$self->filenames}) {
  0            
115 0 0         if ( -f $filename ) {
116 0           $md5->add($filename);
117 0           $md5->add((stat($filename))[9]);
118             }
119             }
120 0           return $md5->hexdigest;
121             }
122              
123             sub next_filehandle {
124 0     0 1   my ($self) = @_;
125 0           my $filename = $self->next_filename();
126 0           my $FH = $self->current_filehandle;
127            
128 0 0         if (!defined $filename){
    0          
129 0           $FH = undef;
130             }
131             elsif ( $filename eq '-' ) {
132 0           binmode STDIN, $self->encoding;
133 0           $FH = \*STDIN;
134             }
135             else {
136 0 0         my $mode = $filename =~ /[.]gz$/ ? '<:via(gzip):' : '<:';
137 0           $mode .= $self->encoding;
138 0 0         open $FH, $mode, $filename or log_fatal "Can't open $filename: $!";
139             }
140 0           $self->_set_current_filehandle($FH);
141 0           return $FH;
142             }
143              
144             sub next_file_text {
145 0     0 1   my ($self) = @_;
146 0 0         my $FH = $self->next_filehandle() or return;
147              
148             # Slurp that is compatible with Perl::IO::via::gzip.
149 0           local $/ = undef;
150 0           return <$FH>;
151             }
152              
153             sub next_line {
154 0     0 1   my ($self) = @_;
155 0           my $FH = $self->current_filehandle;
156 0 0 0       return if !$FH && !$self->join_files_for_next_line;
157 0 0         if ( !$FH ) {
158 0 0         $FH = $self->next_filehandle() or return;
159             }
160 0           return <$FH>;
161             }
162              
163             #<<<
164 28     28   251 use Moose::Util::TypeConstraints;
  28         76  
  28         390  
165             coerce 'Treex::Core::Files'
166             => from 'Str'
167             => via { Treex::Core::Files->new( string => $_ ) }
168             => from 'ArrayRef[Str]'
169             => via { Treex::Core::Files->new( filenames => $_ ) };
170             #>>>
171             # TODO: POD, next_filehandle, gz support
172              
173             1;
174              
175             __END__
176              
177             =pod
178              
179             =encoding utf-8
180              
181             =head1 NAME
182              
183             Treex::Core::Files - helper class for iterating over filenames
184              
185             =head1 VERSION
186              
187             version 2.20160630
188              
189             =head1 SYNOPSIS
190              
191             package My::Class;
192             use Moose;
193              
194             has from => (
195             is => 'ro',
196             isa => 'Treex::Core::Files',
197             coerce => 1,
198             handles => [qw(next_filename current_filename)],
199             );
200              
201             # and then
202             my $c = My::Class(from=>'f1.txt f2.txt.gz @my.filelist');
203              
204             while (defined (my $filename = $c->next_filename)){ ... }
205             #or
206             while (my $filehandle = $c->next_filehandle){ ... }
207              
208             # You can use also wildcard expansion
209             my $c = My::Class(from=>'!dir??/file*.txt');
210              
211              
212             =head1 DESCRIPTION
213              
214             The I<@filelist> and I<!wildcard> conventions are used in several tools, e.g. 7z or javac.
215             For a large number of files, list the file names in a file - one per line.
216             Then use the list file name preceded by an @ character.
217              
218             Methods <next_*> serve as iterators and return undef if the called after the last file is reached.
219              
220             =head1 METHODS
221              
222             =head2 number_of_files
223              
224             Returns the total number of files contained by this instance.
225              
226             =head2 file_number
227              
228             Returns ordinal number (1..number_of_files) of the current file.
229              
230             =head2 current_filename
231              
232             Returns the current filename or undef if the iterator is before the first file
233             (i.e. C<next_filename> has not been called so far) or after the last file.
234              
235             =head2 next_filename
236              
237             Returns the next filename (and increments the file_number).
238              
239             =head2 current_filehandle
240              
241             Opens the current file for reading and returns the filehandle.
242             Filename "-" is interpreted as STDIN.
243             Filenames with extension ".gz" are opened via L<PerlIO::via::gzip> (ie. unzipped on the fly).
244              
245             =head2 next_filehandle
246              
247             Returns the next filehandle (and increments the file_number).
248              
249             =head2 next_file_text
250              
251             Returns the content of the next file (slurp) and increments the file_number.
252              
253             =head2 next_line
254              
255             Returns the next line of the current file.
256             If the end of file is reached and attribute C<join_files_for_next_line> is set to true (which is by default),
257             the first line of next file is returned (and file_number incremented).
258              
259             =head2 get_hash
260              
261             Returns MD5 hash computed from the filenames and last modify times.
262              
263             =head2 $filenames_ref = string_to_filenames($string)
264              
265             Helper method that expands comma-or-space-separated list of filenames
266             and returns an array reference containing the filenames.
267             If the string starts with "!", it is interpreted as wildcards (see Perl L<glob>).
268             If a filename starts with "@" it is interpreted as a file list with one filename per line.
269              
270             =head1 AUTHOR
271              
272             Martin Popel <popel@ufal.mff.cuni.cz>
273              
274             =head1 COPYRIGHT AND LICENSE
275              
276             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
277              
278             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.