File Coverage

blib/lib/Plack/Middleware/DetectRobots.pm
Criterion Covered Total %
statement 63 64 98.4
branch 25 28 89.2
condition 14 15 93.3
subroutine 10 10 100.0
pod 2 2 100.0
total 114 119 95.8


line stmt bran cond sub pod time code
1              
2             package Plack::Middleware::DetectRobots;
3             {
4             $Plack::Middleware::DetectRobots::VERSION = '0.02';
5             }
6              
7             # ABSTRACT: Automatically set a flag in the environment if a robot client is detected
8              
9 5     5   38403 use strict;
  5         14  
  5         1097  
10 5     5   32 use warnings;
  5         10  
  5         423  
11              
12 5     5   1176 use parent qw(Plack::Middleware);
  5         295  
  5         51  
13 5     5   21368 use Plack::Util::Accessor qw( env_key basic_check extended_check generic_check local_regexp );
  5         11  
  5         48  
14 5     5   11778 use Regexp::Assemble qw();
  5         155528  
  5         190  
15 5     5   65 use feature 'state';
  5         14  
  5         7522  
16              
17             sub prepare_app {
18 4     4 1 898 my $self = shift;
19 4 100       24 $self->basic_check(1) unless defined $self->basic_check;
20 4         488 return;
21             }
22              
23             sub call {
24 66     66 1 1020154 my ( $self, $env ) = @_;
25              
26 66         127 state $reList = _read_list();
27 66         110 state $basic = _assemble( $reList, 'basic' );
28 66         50704 state $extended = _assemble( $reList, 'extended' );
29 66         606220 state $generic = _assemble( $reList, 'generic' );
30 66         13340 $reList = undef;
31              
32 66 50       470 my $key = defined( $self->env_key ) ? $self->env_key : 'robot_client';
33              
34 66         546 my $ua = $env->{'HTTP_USER_AGENT'};
35              
36 66         163 $env->{$key} = 0;
37              
38 66         219 my $local = $self->local_regexp;
39 66 100 66     498 if ( defined($local) and ( ref $local eq ref qr// ) and ( $ua =~ $local ) ) {
      100        
40 1         3 $env->{$key} = 'LOCAL';
41             }
42              
43 66 100 100     353 if ( !$env->{$key} and $self->basic_check ) {
44 63 100       2289 if ( $ua =~ $basic ) {
45 21         58 $env->{$key} = 'BASIC';
46             }
47             }
48              
49 66 100 100     331 if ( !$env->{$key} and $self->extended_check ) {
50 14 100       1095 if ( $ua =~ $extended ) {
51 4         10 $env->{$key} = 'EXTENDED';
52             }
53             }
54              
55 66 100 100     521 if ( !$env->{$key} and $self->generic_check ) {
56 14 100       528 if ( $ua =~ $generic ) {
57 4         14 $env->{$key} = 'GENERIC';
58             }
59             }
60              
61 66         667 return $self->app->($env);
62             } ## end sub call
63              
64             sub _assemble {
65 12     12   53 my ( $bots, $type ) = @_;
66              
67 12         114 my $ra = Regexp::Assemble->new( flags => 'i' );
68 12         1307 foreach my $r ( @{ $bots->{$type} } ) {
  12         55  
69 3176         630603 $ra->add($r);
70             }
71              
72 12         2047 return $ra->re;
73             }
74              
75             sub _read_list {
76 4     4   28 my $bots = { basic => [], extended => [], generic => [], };
77 4         12 my $currentType = 'basic';
78              
79 4         15 state $pos = tell(Plack::Middleware::DetectRobots::DATA);
80 4 50       32 if ( $ENV{'HARNESS_ACTIVE'} ) {
81 4         46 seek( Plack::Middleware::DetectRobots::DATA, $pos, 0 );
82             }
83              
84 4         145 while (<Plack::Middleware::DetectRobots::DATA>) {
85 3180         4523 chomp;
86 3180 100       6370 next unless $_;
87 3176 100       6408 $currentType = 'extended' if /\A##\s+EXTENDED/;
88 3176 100       7822 $currentType = 'generic' if /\A##\s+GENERIC/;
89              
90 3176         3141 push @{ $bots->{$currentType} }, $_;
  3176         13593  
91             }
92              
93 4 50       31 if ( !$ENV{'HARNESS_ACTIVE'} ) {
94 0         0 close Plack::Middleware::DetectRobots::DATA;
95             }
96              
97 4         22 return $bots;
98             }
99              
100             1;
101              
102             =pod
103              
104             =encoding utf-8
105              
106             =head1 NAME
107              
108             Plack::Middleware::DetectRobots - Automatically set a flag in the environment if a robot client is detected
109              
110             =head1 VERSION
111              
112             version 0.02
113              
114             =head1 SYNOPSIS
115              
116             use Plack::Builder;
117              
118             my $app = sub { ... }; # as usual
119              
120             builder {
121             enable 'DetectRobots';
122             # or: enable 'DetectRobots', env_key => 'psgix.robot_client';
123             # or: enable 'DetectRobots', extended_check => 1, generic_check => 1;
124             $app;
125             };
126              
127             # ... and later ...
128            
129             if ( $env->{'robot_client'} ) {
130             # ... do something ...
131             }
132              
133             =head1 DESCRIPTION
134              
135             This Plack middleware uses the list of robots that is part of the
136             L<AWStats|http://awstats.sourceforge.net/> software package to
137             analyse the C<User-Agent> HTTP header and to set an environment
138             flag to either a true or false value depending on the detection
139             of a robot client.
140              
141             Once activated it checks the User-Agent HTTP header against a
142             basic list of patterns for common bots.
143              
144             If you activate the appropriate options, it can also use an extended
145             list for the detection of less common bots (cf. C<extended_check>)
146             and / or a list of quite generic patterns to detect unknown bots
147             (cf. C<generic_check>).
148              
149             You may also pass in your own regular expression (as a Regexp object,
150             not as a string) for further checks (cf. C<local_regexp>).
151              
152             The checks are executed in this order:
153              
154             B<1.> Local regular expression
155              
156             B<2.> Basic check
157              
158             B<3.> Extended check
159              
160             B<4.> Generic check
161              
162             If a check yields a positive result (i.e.: detects a bot) the
163             remaining checks are skipped.
164              
165             Depending on the check which detected a bot, the environment flag
166             is set to one of these values: C<LOCAL>, C<BASIC>, C<EXTENDED>, or
167             C<GENERIC>.
168              
169             If no bot is detected, the flag is set to C<0>.
170              
171             The default name of the flag in the environment is C<robot_client>,
172             but this can be customized by setting the C<env_key> option when
173             enabling this middleware.
174              
175             It might make sense to use C<psgix.robot_client> by default instead,
176             but the PSGI spec states that the "'psgix.' prefix is reserved for
177             officially blessed extensions" - which does not apply to this module.
178             You may, however, set the key to C<psgix.robot_client> yourself
179             by using the C<env_key> option mentioned before.
180              
181             =head1 WARNING
182              
183             This software is currently considered BETA and still needs to
184             be seriously tested!
185              
186             =head1 ROBOTS LIST
187              
188             Based on B of
189             L.
190              
191             B<Note:> that list might be somewhat dated, as I did not find bingbot
192             in the list of common bots (only in the extended list) while its
193             predecessor msnbot was considered common.
194              
195             =head1 CONFIGURATION
196              
197             You may specify the following options when enabling the middleware:
198              
199             =over 4
200              
201             =item C<env_key>
202              
203             Set the name of the entry in the environment hash.
204              
205             =item C<basic_check>
206              
207             You may deactivate the standard checks by setting this option to
208             a false value. E.g. if you are only interested in obscure bots
209             or in your local pattern checks.
210              
211             By setting this option to a false value while simultaneously
212             passing a regular expression to C<local_regexp> one can imitate
213             the behaviour of L<Plack::Middleware::BotDetector>.
214              
215             =item C<extended_check>
216              
217             Determines if an extended list of less often seen robots is also
218             checked for.
219             By default, only common robots are checked for, because the extended
220             check requires a rather large and complex regular expression.
221             Set this param to a true value to change the default behaviour.
222              
223             =item C<generic_check>
224              
225             Determines if the User-Agent string is also analysed to determine
226             if it contains certain strings that generically identify the
227             client as a bot, e.g. "spider" or "crawler".
228             By default, this check is not performed, even though it uses only
229             a relatively short and simple regex.
230             Set this param to a true value to change the default behaviour.
231              
232             =item C<local_regexp>
233              
234             You may optionally pass in your own regular expression (as a Regexp
235             object using C<qr//>) to check for additional patterns in the
236             User-Agent string.
237              
238             =back
239              
240             =head1 SEE ALSO
241              
242             L<Plack>, L<Plack::Middleware>, L<PSGI>,
243             L<Plack::Middleware::BotDetector>
244              
245             The functionality provided by C<Plack::Middleware::BotDetector> is
246             basically the same as that of this module, but it requires you to
247             pass in your own regular expression and does not include a default
248             list of known bots.
249              
250             =head1 AUTHOR
251              
252             Heiko Jansen
253              
254             =head1 COPYRIGHT AND LICENSE
255              
256             This software is copyright (c) 2014 by Heiko Jansen.
257              
258             This is free software; you can redistribute it and/or modify it under
259             the same terms as the Perl 5 programming language system itself.
260              
261             =cut
262              
263             __DATA__