File Coverage

blib/lib/Dezi/Aggregator/FS.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				package Dezi::Aggregator::FS;
2	2	2	4233	use Moose;
	2		5
	2		15
3				extends 'Dezi::Aggregator';
4
5	2	2	12868	use Carp;
	2		4
	2		143
6	2	2	10	use File::Find;
	2		4
	2		201
7	2	2	12	use File::Rules;
	2		4
	2		53
8	2	2	10	use Data::Dump qw( dump );
	2		4
	2		136
9	2	2	875	use SWISH::3;
	0
	0
10				use Try::Tiny;
11
12				our $VERSION = '0.014';
13
14				# we rely on file extensions to determine content type
15				# and thus parser type. If a file has no extension,
16				# assume this one.
17				our $DEFAULT_EXTENSION = 'txt';
18
19				=pod
20
21				=head1 NAME
22
23				Dezi::Aggregator::FS - crawl a filesystem
24
25				=head1 SYNOPSIS
26
27				use Dezi::Aggregator::FS;
28				my $fs = Dezi::Aggregator::FS->new(
29				indexer => Dezi::Indexer->new
30				);
31
32				$fs->indexer->start;
33				$fs->crawl( $path );
34				$fs->indexer->finish;
35
36				=head1 DESCRIPTION
37
38				Dezi::Aggregator::FS is a filesystem aggregator implementation
39				of the Dezi::Aggregator API. It is similar to the DirTree.pl
40				script in the Swish-e 2.4 distribution.
41
42				=cut
43
44				=head1 METHODS
45
46				See Dezi::Aggregator.
47
48				=head2 BUILD
49
50				Internal constructor method.
51
52				=cut
53
54				sub BUILD {
55				my $self = shift;
56
57				# create .ext regex to match in file_ok()
58				if ( $self->config->IndexOnly ) {
59				my $re = join( '\|',
60				grep {s/^\.//} split( m/\s+/, $self->config->IndexOnly ) );
61				$self->{_ext_re} = qr{\.($re)}io;
62				}
63				else {
64				$self->{_ext_re} = $Dezi::Utils::ExtRE;
65				}
66
67				}
68
69				=head2 file_ok( I<full_path> )
70
71				Check I<full_path> before fetch()ing it.
72
73				Returns 0 if I<full_path> should be skipped.
74
75				Returns file extension of I<full_path> if I<full_path> should be processed.
76
77				=cut
78
79				sub file_ok {
80				my $self = shift;
81				my $full_path = shift;
82				my $stat = shift;
83
84				$self->debug and warn "checking file $full_path\n";
85
86				my ( $path, $file, $ext )
87				= Dezi::Utils->path_parts( $full_path, $self->{_ext_re} );
88
89				$self->debug and warn "path=$path file=$file ext=$ext\n";
90
91				# treat no extension like plain text
92				$ext = $DEFAULT_EXTENSION unless length $ext;
93
94				return 0 if $file =~ m/^\./;
95
96				#carp "parsed file: $file\npath: $path\next: $ext";
97
98				$stat \|\|= [ stat($full_path) ];
99				return 0 unless -r _;
100				return 0 if -d _;
101				if ( $self->ok_if_newer_than
102				and $self->ok_if_newer_than >= $stat->[9] )
103				{
104				return 0;
105				}
106				return 0
107				if ( $self->_apply_file_rules($full_path)
108				&& !$self->_apply_file_match($full_path) );
109
110				$self->debug and warn " $full_path -> ok\n";
111				if ( $self->verbose & 4 ) {
112				local $\| = 1; # don't buffer
113				print "crawling file $full_path\n";
114				}
115
116				return $ext;
117				}
118
119				=head2 dir_ok( I<directory> )
120
121				Called by find() for all directories. You can control
122				the recursion into I<directory> via the config() params
123
124				=cut
125
126				sub dir_ok {
127				my $self = shift;
128				my $dir = shift;
129				my $stat = shift \|\| [ stat($dir) ];
130
131				$self->debug and warn "checking dir $dir\n";
132
133				return 0 unless -d _;
134				return 0 if $dir =~ m!/\.!;
135				return 0 if $dir =~ m/^\.[^\.]/; # could be ../foo
136				return 0 if $dir =~ m!/(\.svn\|RCS)/!;
137				return 0
138				if ( $self->_apply_file_rules($dir)
139				&& !$self->_apply_file_match($dir) );
140
141				$self->debug and warn " $dir -> ok\n";
142				if ( $self->verbose & 2 ) {
143				local $\| = 1; # don't buffer
144				print "crawling dir $dir\n";
145				}
146
147				1;
148				}
149
150				=head2 get_doc( I<file_path> [, I<stat>, I<ext> ] )
151
152				Returns a doc_class() instance representing I<file_path>.
153
154				=cut
155
156				sub get_doc {
157				my $self = shift;
158				my $url = shift or croak "file path required";
159				my ( $stat, $ext ) = @_;
160				my $buf;
161
162				# NOTE we always read in binary (raw) mode in case
163				# the file is compressed, binary, etc.
164				my $read_ok = try {
165
166				# the 2nd param runs in raw mode (no NULL substitution)
167				$buf = SWISH::3->slurp( $url, 1 );
168				$url =~ s/\.gz$//; # post-slurp, in case it failed.
169				return 1;
170				};
171
172				if ( !$read_ok ) {
173				carp "unable to read $url - skipping";
174				return;
175				}
176
177				$stat \|\|= [ stat($url) ];
178
179				my $type = Dezi::Utils->get_mime( $url, $self->indexer->swish3 );
180
181				if ( $self->ok_if_newer_than
182				and $self->ok_if_newer_than >= $stat->[9] )
183				{
184				warn "skipping $url ... too old\n";
185				return;
186				}
187
188				return $self->doc_class->new(
189				url => $url,
190				modtime => $stat->[9],
191				content => $buf,
192				type => $type,
193				size => $stat->[7],
194				debug => $self->debug
195				);
196
197				}
198
199				sub _do_file {
200				my $self = shift;
201				my $file = shift;
202				if ( my $ext = $self->file_ok($file) ) {
203				my $doc = $self->get_doc( $file, [ stat(_) ], $ext );
204				$self->swish_filter($doc);
205				if ( $self->test_mode ) {
206				warn join( ' ', $doc->url, $doc->type ) . "\n";
207				}
208				else {
209				$self->{indexer}->process($doc);
210				}
211				$self->_increment_count;
212				}
213				else {
214				$self->debug and warn "skipping file $file\n";
215				if ( $self->verbose & 4 ) {
216				local $\| = 1;
217				print "skipping $file\n";
218				}
219				}
220				}
221
222				#
223				# the basic wanted() code here based on Bill Moseley's DirTree.pl,
224				# part of the Swish-e 2.4 distrib.
225
226				=head2 crawl( I<paths_or_files> )
227
228				Crawl the filesystem recursively within I<paths_or_files>, processing
229				each document specified by the config().
230
231				=cut
232
233				sub crawl {
234				my $self = shift;
235
236				my @paths = @_;
237
238				my @files = grep { !-d } @paths;
239				my @dirs = grep {-d} @paths;
240
241				for my $f (@files) {
242				$self->_do_file($f);
243				}
244
245				# TODO set some flags here for filtering out files/dirs
246				# based on $self->indexer->config.
247
248				if (@dirs) {
249
250				find(
251				{ wanted => sub {
252
253				# canonpath cleans up any leading .
254				my $path = File::Spec->canonpath($File::Find::name);
255
256				if (-d) {
257				unless ( $self->dir_ok( $path, [ stat(_) ] ) ) {
258				if ( $self->verbose & 2 ) {
259				local $\| = 1;
260				print "skipping $path\n";
261				}
262				$File::Find::prune = 1;
263				return;
264				}
265
266				#warn "-d $path\n";
267				return;
268				}
269				else {
270
271				#warn "!-d $path\n";
272				}
273
274				$self->_do_file($path);
275
276				},
277				no_chdir => 1,
278				follow => $self->config->FollowSymLinks,
279
280				},
281				@dirs
282				);
283				}
284
285				return $self->{count};
286				}
287
288				__PACKAGE__->meta->make_immutable;
289
290				1;
291
292				__END__
293
294				=head1 AUTHOR
295
296				Peter Karman, E<lt>perl@peknet.comE<gt>
297
298				=head1 BUGS
299
300				Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through
301				the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
302				I will be notified, and then you'll
303				automatically be notified of progress on your bug as I make changes.
304
305				=head1 SUPPORT
306
307				You can find documentation for this module with the perldoc command.
308
309				perldoc Dezi
310
311
312				You can also look for information at:
313
314				=over 4
315
316				=item * Mailing list
317
318				L<http://lists.swish-e.org/listinfo/users>
319
320				=item * RT: CPAN's request tracker
321
322				L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>
323
324				=item * AnnoCPAN: Annotated CPAN documentation
325
326				L<http://annocpan.org/dist/Dezi-App>
327
328				=item * CPAN Ratings
329
330				L<http://cpanratings.perl.org/d/Dezi-App>
331
332				=item * Search CPAN
333
334				L<http://search.cpan.org/dist/Dezi-App/>
335
336				=back
337
338				=head1 COPYRIGHT AND LICENSE
339
340				Copyright 2008-2009 by Peter Karman
341
342				This library is free software; you can redistribute it and/or modify
343				it under the same terms as Perl itself.
344
345				=head1 SEE ALSO
346
347				L<http://swish-e.org/>