File Coverage

lib/WWW/Crawler/Mojo.pm

Criterion	Covered	Total	%
statement	103	122	84.4
branch	35	50	70.0
condition	14	17	82.3
subroutine	20	25	80.0
pod	8	8	100.0
total	180	222	81.0

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::Crawler::Mojo;
2	10			10		2430535	use strict;
	10					100
	10					296
3	10			10		50	use warnings;
	10					15
	10					327
4	10			10		2063	use Mojo::Base 'Mojo::EventEmitter';
	10					783555
	10					67
5	10			10		18308	use WWW::Crawler::Mojo::Job;
	10					28
	10					74
6	10			10		5175	use WWW::Crawler::Mojo::Queue::Memory;
	10					29
	10					72
7	10			10		4402	use WWW::Crawler::Mojo::UserAgent;
	10					30
	10					93
8	10					1066	use WWW::Crawler::Mojo::ScraperUtil qw{
9	10			10		5093	collect_urls_css html_handler_presets reduce_html_handlers resolve_href decoded_body};
	10					29
10	10			10		72	use Mojo::Message::Request;
	10					17
	10					78
11							our $VERSION = '0.26';
12
13							has clock_speed => 0.25;
14							has html_handlers => sub { html_handler_presets() };
15							has max_conn => 1;
16							has max_conn_per_host => 1;
17							has queue => sub { WWW::Crawler::Mojo::Queue::Memory->new };
18							has 'shuffle';
19							has ua => sub { WWW::Crawler::Mojo::UserAgent->new };
20							has ua_name =>
21							"www-crawler-mojo/$VERSION (+https://github.com/jamadam/www-crawler-mojo)";
22
23							sub crawl {
24	0			0	1	0	my ($self) = @_;
25
26	0					0	$self->init;
27
28	0	0				0	die 'No job is given' unless ($self->queue->length);
29
30	0					0	$self->emit('start');
31
32	0	0				0	Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
33							}
34
35							sub init {
36	12			12	1	7286	my ($self) = @_;
37
38	2			2		315	$self->on('empty', sub { say "Queue is drained out."; $self->stop })
	2					24
39	12	50				55	unless $self->has_subscribers('empty');
40							$self->on(
41							'error',
42							sub {
43	0			0		0	say
44							sprintf("An error occured during crawling %s: %s", $_[2]->url, $_[1]);
45							}
46	12	50				178	) unless $self->has_subscribers('error');
47	12	100		0		121	$self->on('res', sub { $_[1]->() }) unless $self->has_subscribers('res');
	0					0
48
49	12					117	$self->ua->transactor->name($self->ua_name);
50	12					333	$self->ua->max_redirects(5);
51
52	12					111	$self->{_loop_ids} = [];
53
54							my $id = Mojo::IOLoop->recurring(
55							$self->clock_speed => sub {
56	15			15		3422892	my $queue = $self->queue;
57
58	15	100				261	if (!$queue->length) {
59	2	50				47	$self->emit('empty') unless ($self->ua->active_conn);
60	2					49	return;
61							}
62	13	50				186	return if $self->ua->active_conn >= $self->max_conn;
63							return
64	13	50				290	if $self->ua->active_host($queue->next->url)
65							>= $self->max_conn_per_host;
66	13					190	$self->process_job($queue->dequeue);
67							}
68	12					37	);
69
70	12					1042	push(@{$self->{_loop_ids}}, $id);
	12					29
71
72	12	50				36	if ($self->shuffle) {
73							my $id = Mojo::IOLoop->recurring(
74							$self->shuffle => sub {
75	0			0		0	$self->queue->shuffle;
76							}
77	0					0	);
78
79	0					0	push(@{$self->{_loop_ids}}, $id);
	0					0
80							}
81							}
82
83							sub process_job {
84	13			13	1	114	my ($self, $job) = @_;
85
86	13		100			55	my @args = ($job->method \|\| 'get', $job->url);
87	13	100				1197	push(@args, form => $job->tx_params->to_hash) if $job->tx_params;
88	13					216	my $tx = $self->ua->build_tx(@args);
89
90	13					3635	$self->emit('req', $job, $tx->req);
91
92							$self->ua->start(
93							$tx => sub {
94	13			13		5428	my ($ua, $tx) = @_;
95
96	13					67	$job->redirect(_urls_redirect($tx));
97
98	13					123	my $res = $tx->res;
99
100	13	50				83	unless ($res->code) {
101	0	0				0	my $msg = $res->error ? $res->error->{message} : 'Unknown error';
102	0					0	$self->emit('error', $msg, $job);
103	0					0	return;
104							}
105
106	13					186	$self->emit('res', sub { $self->scrape($res, $job, $_[0]) }, $job, $res);
	10					1237
107
108	13					3049	$job->close;
109							}
110	13					277	);
111							}
112
113							sub say_start {
114	0			0	1	0	my $self = shift;
115
116	0					0	print <<"EOF";
117							----------------------------------------
118	0					0	Crawling is starting with @{[ $self->queue->next->url ]}
119	0					0	Max Connection : @{[ $self->max_conn ]}
120	0					0	User Agent : @{[ $self->ua_name ]}
121							----------------------------------------
122							EOF
123							}
124
125							sub scrape {
126	26			26	1	781	my ($self, $res, $job, $contexts) = @_;
127	26					42	my @ret;
128
129	26	100	66			91	return unless $res->headers->content_length && $res->body;
130
131	25					1046	my $base = $job->url;
132	25					2577	my $type = $res->headers->content_type;
133
134	25	100	66			744	if ($type && $type =~ qr{^(text\|application)/(html\|xml\|xhtml)}) {
135	23	100				114	if ((my $base_tag = $res->dom->at('base[href]'))) {
136	4					2243	$base = resolve_href($base, $base_tag->attr('href'));
137							}
138	23					25321	my $dom = Mojo::DOM->new(decoded_body($res));
139	23					25540	my $handlers = reduce_html_handlers($self->html_handlers, $contexts);
140	23					44	for my $selector (sort keys %{$handlers}) {
	23					209
141							$dom->find($selector)->each(
142							sub {
143	83			83		29734	my $dom = shift;
144	83					226	for ($handlers->{$selector}->($dom)) {
145	86	100				1876	push(@ret, $self->_make_child($_, $dom, $job, $base)) if $_;
146							}
147							}
148	360					122480	);
149							}
150							}
151
152	25	100	66			10092	if ($type && $type =~ qr{text/css}) {
153	1					8	for (collect_urls_css(decoded_body($res))) {
154	1					6	push(@ret, $self->_make_child($_, $job->url, $job, $base));
155							}
156							}
157
158	25					191	return @ret;
159							}
160
161							sub stop {
162	2			2	1	10	my $self = shift;
163	2					9	while (my $id = shift @{$self->{_loop_ids}}) {
	4					217
164	2					33	Mojo::IOLoop->remove($id);
165							}
166	2					15	Mojo::IOLoop->stop;
167							}
168
169							sub _make_child {
170	84			84		257	my ($self, $url, $context, $job, $base) = @_;
171
172	84	50				165	return unless $url;
173	84	100				177	($url, my $method, my $params) = @$url if (ref $url);
174
175	84					200	my $resolved = resolve_href($base, $url);
176
177	84	100				200	return unless ($resolved->scheme =~ qr{^(http\|https\|ftp\|ws\|wss)$});
178
179	73	100	100			932	$resolved->query->append($params) if ($params && $method eq 'GET');
180
181	73					496	my $child = $job->child(_url => $resolved, literal_uri => $url,
182							_context => $context);
183
184	73	100				741	$child->method($method) if $method;
185	73	100	100			188	$child->tx_params($params) if ($params && $method eq 'POST');
186
187	73					528	return $child;
188							}
189
190							sub enqueue {
191	59			59	1	3055	my ($self, @jobs) = @_;
192							return
193	59					94	map { $self->queue->enqueue(WWW::Crawler::Mojo::Job->upgrade($_)) } @jobs;
	59					124
194							}
195
196							sub requeue {
197	1			1	1	504	my ($self, @jobs) = @_;
198							return
199	1					5	map { $self->queue->requeue(WWW::Crawler::Mojo::Job->upgrade($_)) } @jobs;
	1					3
200							}
201
202							sub _urls_redirect {
203	13			13		37	my $tx = shift;
204	13					34	my @urls;
205	13	50				67	@urls = _urls_redirect($tx->previous) if ($tx->previous);
206	13					96	unshift(@urls, $tx->req->url->userinfo(undef));
207	13					238	return @urls;
208							}
209
210							1;
211
212							=head1 NAME
213
214							WWW::Crawler::Mojo - A web crawling framework for Perl
215
216							=head1 SYNOPSIS
217
218							use strict;
219							use warnings;
220							use WWW::Crawler::Mojo;
221
222							my $bot = WWW::Crawler::Mojo->new;
223
224							$bot->on(res => sub {
225							my ($bot, $scrape, $job, $res) = @_;
226
227							$bot->enqueue($_) for $scrape->('#context');
228							});
229
230							$bot->enqueue('http://example.com/');
231							$bot->crawl;
232
233							=head1 DESCRIPTION
234
235							L is a web crawling framework for those who are familiar with
236							L::* APIs.
237
238							Althogh the module is only well tested for "focused crawl" at this point,
239							you can also use it for endless crawling by taking special care of memory usage.
240
241							=head1 ATTRIBUTES
242
243							L inherits all attributes from L and
244							implements the following new ones.
245
246							=head2 clock_speed
247
248							A number of main event loop interval in milliseconds. Defaults to 0.25.
249
250							$bot->clock_speed(2);
251							my $clock = $bot->clock_speed; # 2
252
253							=head2 html_handlers
254
255							Sets HTML handlers of scrapper. Defaults to
256							WWW::Crawler::Mojo::ScraperUtil::html_handler_presets.
257
258							$bot->html_handlers( {
259							'a[href]' => sub { return $_[0]->{href} },
260							'img[src]' => sub { return $_[0]->{src} },
261							} );
262
263							=head2 max_conn
264
265							An amount of max connections.
266
267							$bot->max_conn(5);
268							say $bot->max_conn; # 5
269
270							=head2 max_conn_per_host
271
272							An amount of max connections per host.
273
274							$bot->max_conn_per_host(5);
275							say $bot->max_conn_per_host; # 5
276
277							=head2 queue
278
279							L object for default.
280
281							$bot->queue(WWW::Crawler::Mojo::Queue::Memory->new);
282							$bot->queue->enqueue($job);
283
284							=head2 shuffle
285
286							An interval in seconds to shuffle the job queue. It also evalutated as boolean
287							for disabling/enabling the feature. Defaults to undef, meaning disable.
288
289							$bot->shuffle(5);
290							say $bot->shuffle; # 5
291
292							=head2 ua
293
294							A L instance.
295
296							my $ua = $bot->ua;
297							$bot->ua(WWW::Crawler::Mojo::UserAgent->new);
298
299							=head2 ua_name
300
301							Name of crawler for User-Agent header.
302
303							$bot->ua_name('my-bot/0.01 (+https://example.com/)');
304							say $bot->ua_name; # 'my-bot/0.01 (+https://example.com/)'
305
306							=head1 EVENTS
307
308							L inherits all events from L and
309							implements the following new ones.
310
311							=head2 req
312
313							Emitted right before crawler perform request to servers. The callback takes 3
314							arguments.
315
316							$bot->on(req => sub {
317							my ($bot, $job, $req) = @_;
318
319							# DO NOTHING
320							});
321
322							=head2 res
323
324							Emitted when crawler got response from server. The callback takes 4 arguments.
325
326							$bot->on(res => sub {
327							my ($bot, $scrape, $job, $res) = @_;
328							if (...) {
329							$bot->enqueue($_) for $scrape->();
330							} else {
331							# DO NOTHING
332							}
333							});
334
335							=head3 $bot
336
337							L instance.
338
339							=head3 $scrape
340
341							Scraper code reference for current document. The code takes optional argument
342							CSS selector for context and returns new jobs.
343
344							for my $job ($scrape->($context)) {
345							$bot->enqueue($job)
346							}
347
348							Optionally you can specify a scraping target container in CSS selector.
349
350							@jobs = $scrape->('#container');
351							@jobs = $scrape->(['#container1', '#container2']);
352
353							=head3 $job
354
355							L instance.
356
357							=head3 $res
358
359							L instance.
360
361							=head2 empty
362
363							Emitted when queue length gets zero.
364
365							$bot->on(empty => sub {
366							my ($bot) = @_;
367							say "Queue is drained out.";
368							});
369
370							=head2 error
371
372							Emitted when user agent returns no status code for request. Possibly caused by
373							network errors or un-responsible servers.
374
375							$bot->on(error => sub {
376							my ($bot, $error, $job) = @_;
377							say "error: $error";
378							if (...) { # until failur occures 3 times
379							$bot->requeue($job);
380							}
381							});
382
383							Note that server errors such as 404 or 500 cannot be catched with the event.
384							Consider res event for the use case instead of this.
385
386							=head2 start
387
388							Emitted right before crawl is started.
389
390							$bot->on(start => sub {
391							my $self = shift;
392							...
393							});
394
395							=head1 METHODS
396
397							L inherits all methods from L and
398							implements the following new ones.
399
400							=head2 crawl
401
402							Starts crawling loop.
403
404							$bot->crawl;
405
406							=head2 init
407
408							Initializes crawler settings.
409
410							$bot->init;
411
412							=head2 process_job
413
414							Processes a job.
415
416							$bot->process_job;
417
418							=head2 say_start
419
420							Displays starting messages to STDOUT
421
422							$bot->say_start;
423
424							=head2 scrape
425
426							Parses and discovers links in a web page and CSS. This performs scraping.
427							With the optional 4th argument, you can specify a CSS selector to container
428							you would collect URLs within.
429
430							$bot->scrape($res, $job, );
431							$bot->scrape($res, $job, $selector);
432							$bot->scrape($res, $job, [$selector1, $selector2]);
433
434							=head2 stop
435
436							Stop crawling.
437
438							$bot->stop;
439
440							=head2 enqueue
441
442							Appends one or more URLs or L objects. Returns the jobs
443							actually added.
444
445							my @jobs = $bot->enqueue('http://example.com/index1.html');
446
447							OR
448
449							my @jobs = $bot->enqueue($job1, $job2);
450
451							OR
452
453							my @jobs = $bot->enqueue(
454							'http://example.com/index1.html',
455							'http://example.com/index2.html',
456							'http://example.com/index3.html',
457							);
458
459							=head2 requeue
460
461							Appends one or more URLs or jobs for re-try. This accepts same arguments as
462							enqueue method. Returns the jobs actually added.
463
464							$self->on(error => sub {
465							my ($self, $msg, $job) = @_;
466							if (...) { # until failur occures 3 times
467							$bot->requeue($job);
468							}
469							});
470
471							=head1 EXAMPLE
472
473							L
474
475							=head1 AUTHOR
476
477							Keita Sugama, Esugama@jamadam.comE
478
479							=head1 COPYRIGHT AND LICENSE
480
481							Copyright (C) jamadam
482
483							This program is free software; you can redistribute it and/or
484							modify it under the same terms as Perl itself.
485
486							=cut