File Coverage

blib/lib/Plack/Middleware/DetectRobots.pm
Criterion Covered Total %
statement 63 64 98.4
branch 25 28 89.2
condition 14 15 93.3
subroutine 10 10 100.0
pod 2 2 100.0
total 114 119 95.8


line stmt bran cond sub pod time code
1              
2             package Plack::Middleware::DetectRobots;
3             $Plack::Middleware::DetectRobots::VERSION = '0.03';
4             # ABSTRACT: Automatically set a flag in the environment if a robot client is detected
5              
6 5     5   19167 use strict;
  5         8  
  5         179  
7 5     5   22 use warnings;
  5         8  
  5         173  
8              
9 5     5   460 use parent qw(Plack::Middleware);
  5         273  
  5         38  
10 5     5   12459 use Plack::Util::Accessor qw( env_key basic_check extended_check generic_check local_regexp );
  5         9  
  5         40  
11 5     5   4575 use Regexp::Assemble qw();
  5         75320  
  5         168  
12 5     5   47 use feature 'state';
  5         7  
  5         3358  
13              
14             sub prepare_app {
15 4     4 1 396 my $self = shift;
16 4 100       18 $self->basic_check(1) unless defined $self->basic_check;
17 4         392 return;
18             }
19              
20             sub call {
21 66     66 1 173939 my ( $self, $env ) = @_;
22              
23 66         104 state $reList = _read_list();
24 66         79 state $basic = _assemble( $reList, 'basic' );
25 66         26265 state $extended = _assemble( $reList, 'extended' );
26 66         408367 state $generic = _assemble( $reList, 'generic' );
27 66         10182 $reList = undef;
28              
29 66 50       379 my $key = defined( $self->env_key ) ? $self->env_key : 'robot_client';
30              
31 66         411 my $ua = $env->{'HTTP_USER_AGENT'};
32              
33 66         112 $env->{$key} = 0;
34              
35 66         129 my $local = $self->local_regexp;
36 66 100 66     396 if ( defined($local) and ( ref $local eq ref qr// ) and ( $ua =~ $local ) ) {
      100        
37 1         3 $env->{$key} = 'LOCAL';
38             }
39              
40 66 100 100     241 if ( !$env->{$key} and $self->basic_check ) {
41 63 100       1807 if ( $ua =~ $basic ) {
42 21         40 $env->{$key} = 'BASIC';
43             }
44             }
45              
46 66 100 100     255 if ( !$env->{$key} and $self->extended_check ) {
47 14 100       681 if ( $ua =~ $extended ) {
48 4         8 $env->{$key} = 'EXTENDED';
49             }
50             }
51              
52 66 100 100     377 if ( !$env->{$key} and $self->generic_check ) {
53 14 100       458 if ( $ua =~ $generic ) {
54 4         6 $env->{$key} = 'GENERIC';
55             }
56             }
57              
58 66         334 return $self->app->($env);
59             } ## end sub call
60              
61             sub _assemble {
62 12     12   45 my ( $bots, $type ) = @_;
63              
64 12         98 my $ra = Regexp::Assemble->new( flags => 'i' );
65 12         904 foreach my $r ( @{ $bots->{$type} } ) {
  12         46  
66 3176         360506 $ra->add($r);
67             }
68              
69 12         1256 return $ra->re;
70             }
71              
72             sub _read_list {
73 4     4   22 my $bots = { basic => [], extended => [], generic => [], };
74 4         9 my $currentType = 'basic';
75              
76 4         12 state $pos = tell(Plack::Middleware::DetectRobots::DATA);
77 4 50       21 if ( $ENV{'HARNESS_ACTIVE'} ) {
78 4         18 seek( Plack::Middleware::DetectRobots::DATA, $pos, 0 );
79             }
80              
81 4         97 while (<Plack::Middleware::DetectRobots::DATA>) {
82 3180         2373 chomp;
83 3180 100       3975 next unless $_;
84 3176 100       4474 $currentType = 'extended' if /\A##\s+EXTENDED/;
85 3176 100       4126 $currentType = 'generic' if /\A##\s+GENERIC/;
86              
87 3176         2172 push @{ $bots->{$currentType} }, $_;
  3176         7536  
88             }
89              
90 4 50       26 if ( !$ENV{'HARNESS_ACTIVE'} ) {
91 0         0 close Plack::Middleware::DetectRobots::DATA;
92             }
93              
94 4         16 return $bots;
95             }
96              
97             1;
98              
99             =pod
100              
101             =encoding utf-8
102              
103             =head1 NAME
104              
105             Plack::Middleware::DetectRobots - Automatically set a flag in the environment if a robot client is detected
106              
107             =head1 VERSION
108              
109             version 0.03
110              
111             =head1 SYNOPSIS
112              
113             use Plack::Builder;
114              
115             my $app = sub { ... } # as usual
116              
117             builder {
118             enable 'DetectRobots';
119             # or: enable 'DetectRobots', env_key => 'psgix.robot_client';
120             # or: enable 'DetectRobots', extended_check => 1, generic_check => 1;
121             $app;
122             };
123              
124             # ... and later ...
125            
126             if ( $env->{'robot_client'} ) {
127             # ... do something ...
128             }
129              
130             =head1 DESCRIPTION
131              
132             This Plack middleware uses the list of robots that is part of the
133             L<AWStats|http://www.awstats.org/> software package to
134             analyse the C<User-Agent> HTTP header and to set an environment
135             flag to either a true or false value depending on the detection
136             of a robot client.
137              
138             Once activated it checks the User-Agent HTTP header against a
139             basic list of patterns for common bots.
140              
141             If you activate the appropriate options, it can also use an extended
142             list for the detection of less common bots (cf. C<extended_check>)
143             and / or a list of quite generic patterns to detect unknown bots
144             (cf. C<generic_check>).
145              
146             You may also pass in your own regular expression as a Regexp object for
147             further checks (cf. C<local_regexp>).
148              
149             The checks are executed in this order:
150              
151             B<1.> Local regular expression
152              
153             B<2.> Basic check
154              
155             B<3.> Extended check
156              
157             B<4.> Generic check
158              
159             If a check yields a positive result (i.e.: detects a bot) the
160             remaining checks are skipped.
161              
162             Depending on the check which detected a bot, the environment flag
163             is set to one of these values: C<LOCAL>, C<BASIC>, C<EXTENDED>, or
164             C<GENERIC>.
165              
166             If no bot is detected, the flag is set to C<0>.
167              
168             The default name of the flag in the environment is C<robot_client>,
169             but this can be customized by setting the C<env_key> option when
170             enabling this middleware.
171              
172             It might make sense to use C<psgix.robot_client> by default instead,
173             but the PSGI spec states that the "'psgix.' prefix is reserved for
174             officially blessed extensions" - which does not apply to this module.
175             You may, however, set the key to C<psgix.robot_client> yourself
176             by using the C<env_key> option mentioned before.
177              
178             =head1 WARNING
179              
180             This software is currently considered BETA and still needs to
181             be seriously tested!
182              
183             =head1 ROBOTS LIST
184              
185             Based on B of
186             L.
187              
188             B<Note:> that list might be somewhat dated, as I did not find bingbot
189             in the list of common bots (only in the extended list) while its
190             predecessor msnbot was considered common.
191              
192             =head1 CONFIGURATION
193              
194             You may specify the following options when enabling the middleware:
195              
196             =over 4
197              
198             =item C<env_key>
199              
200             Set the name of the entry in the environment hash.
201              
202             =item C<basic_check>
203              
204             You may deactivate the standard checks by setting this option to
205             a false value. E.g. if you are only interested in obscure bots
206             or in your local pattern checks.
207              
208             By setting this option to a false value while simultaneously
209             passing a regular expression to C<local_regexp> one can imitate
210             the behaviour of L<Plack::Middleware::BotDetector>.
211              
212             =item C<extended_check>
213              
214             Determines if an extended list of less often seen robots is also
215             checked for.
216             By default, only common robots are checked for, because the extended
217             check requires a rather large and complex regular expression.
218             Set this param to a true value to change the default behaviour.
219              
220             =item C<generic_check>
221              
222             Determines if the User-Agent string is also analysed to determine
223             if it contains certain strings that generically identify the
224             client as a bot, e.g. "spider" or "crawler".
225             By default, this check is not performed, even though it uses only
226             a relatively short and simple regex.
227             Set this param to a true value to change the default behaviour.
228              
229             =item C<local_regexp>
230              
231             You may optionally pass in your own regular expression (as a Regexp
232             object using C<qr//>) to check for additional patterns in the
233             User-Agent string.
234              
235             =back
236              
237             =head1 SEE ALSO
238              
239             L, L, L,
240             L
241              
242             The functionality provided by C<Plack::Middleware::BotDetector> is
243             basically the same as that of this module, but it requires you to
244             pass in your own regular expression and does not include a default
245             list of known bots.
246              
247             =head1 AUTHOR
248              
249             Heiko Jansen
250              
251             =head1 COPYRIGHT AND LICENSE
252              
253             This software is copyright (c) 2015 by Heiko Jansen.
254              
255             This is free software; you can redistribute it and/or modify it under
256             the same terms as the Perl 5 programming language system itself.
257              
258             =cut
259              
260             __DATA__