line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package Plack::Middleware::DetectRobots; |
3
|
|
|
|
|
|
|
$Plack::Middleware::DetectRobots::VERSION = '0.03'; |
4
|
|
|
|
|
|
|
# ABSTRACT: Automatically set a flag in the environment if a robot client is detected |
5
|
|
|
|
|
|
|
|
6
|
5
|
|
|
5
|
|
19167
|
use strict; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
179
|
|
7
|
5
|
|
|
5
|
|
22
|
use warnings; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
173
|
|
8
|
|
|
|
|
|
|
|
9
|
5
|
|
|
5
|
|
460
|
use parent qw(Plack::Middleware); |
|
5
|
|
|
|
|
273
|
|
|
5
|
|
|
|
|
38
|
|
10
|
5
|
|
|
5
|
|
12459
|
use Plack::Util::Accessor qw( env_key basic_check extended_check generic_check local_regexp ); |
|
5
|
|
|
|
|
9
|
|
|
5
|
|
|
|
|
40
|
|
11
|
5
|
|
|
5
|
|
4575
|
use Regexp::Assemble qw(); |
|
5
|
|
|
|
|
75320
|
|
|
5
|
|
|
|
|
168
|
|
12
|
5
|
|
|
5
|
|
47
|
use feature 'state'; |
|
5
|
|
|
|
|
7
|
|
|
5
|
|
|
|
|
3358
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Plack::Middleware's prepare_app hook.
#
# Turns the basic check on unless the caller configured basic_check
# explicitly; an explicit false value (e.g. 0) is left untouched.
#
# Returns nothing.
sub prepare_app {
    my $self = shift;

    # Only an undefined setting is replaced by the default.
    $self->basic_check(1) unless defined $self->basic_check;

    return;
}
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# PSGI entry point: classifies the requesting client by its User-Agent
# header, records the verdict in the environment and then hands the request
# on to the wrapped application.
#
# Parameters:
#   $self - the middleware instance
#   $env  - the PSGI environment hash reference (modified in place)
#
# The verdict is stored under the configured env_key ('robot_client' when
# env_key is undefined). It is 0 when no check matched, otherwise the name of
# the first check that matched: 'LOCAL', 'BASIC', 'EXTENDED' or 'GENERIC'.
#
# Returns whatever the wrapped application returns.
sub call {
    my ( $self, $env ) = @_;

    # State variables: the pattern list is read and the three regular
    # expressions are assembled on the first call only, then reused.
    state $reList   = _read_list();
    state $basic    = _assemble( $reList, 'basic' );
    state $extended = _assemble( $reList, 'extended' );
    state $generic  = _assemble( $reList, 'generic' );
    $reList = undef;    # the raw list is not needed once assembled

    my $key = $self->env_key // 'robot_client';

    # A request without a User-Agent header is matched as the empty string,
    # which avoids "uninitialized value" warnings in the matches below.
    my $ua = $env->{'HTTP_USER_AGENT'} // '';

    $env->{$key} = 0;

    # 1. Local pattern; only honoured when it is a compiled regexp (qr//).
    my $local = $self->local_regexp;
    if ( defined($local) and ( ref($local) eq ref(qr//) ) and ( $ua =~ $local ) ) {
        $env->{$key} = 'LOCAL';
    }

    # 2.-4. Built-in lists. Each check runs only while the flag is still
    # false, so the first positive result wins.
    if ( !$env->{$key} and $self->basic_check and $ua =~ $basic ) {
        $env->{$key} = 'BASIC';
    }

    if ( !$env->{$key} and $self->extended_check and $ua =~ $extended ) {
        $env->{$key} = 'EXTENDED';
    }

    if ( !$env->{$key} and $self->generic_check and $ua =~ $generic ) {
        $env->{$key} = 'GENERIC';
    }

    return $self->app->($env);
} ## end sub call
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# Combines all patterns of one category into a single regular expression.
#
# Parameters:
#   $bots - hash reference mapping a category name to an array reference
#           of pattern strings (the structure built by _read_list)
#   $type - the category whose patterns are assembled
#
# Returns the regular expression produced by Regexp::Assemble; the
# assembler is created with the 'i' flag.
sub _assemble {
    my ( $bots, $type ) = @_;

    my $assembler = Regexp::Assemble->new( flags => 'i' );
    for my $pattern ( @{ $bots->{$type} } ) {
        $assembler->add($pattern);
    }

    return $assembler->re;
}
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Reads the robot patterns from this package's DATA handle and sorts them
# into three categories.
#
# Lines are collected under 'basic' until a line starting with
# "## EXTENDED" switches the category to 'extended'; a line starting with
# "## GENERIC" switches it to 'generic'. Lines that are false after chomp
# (in practice: empty lines) are skipped.
#
# Returns a hash reference:
#   { basic => [...], extended => [...], generic => [...] }
sub _read_list {
    my $bots = { basic => [], extended => [], generic => [], };
    my $currentType = 'basic';

    # The handle position seen on the very first call is remembered. While
    # HARNESS_ACTIVE is set the handle is rewound to it before reading and
    # is left open afterwards, so the list can be read more than once.
    # NOTE(review): presumably needed because a test run reads the list
    # several times within one process - confirm.
    state $pos = tell(Plack::Middleware::DetectRobots::DATA);
    if ( $ENV{'HARNESS_ACTIVE'} ) {
        seek( Plack::Middleware::DetectRobots::DATA, $pos, 0 );
    }

    # A lexical line variable keeps the caller's $_ intact.
    while ( my $line = <Plack::Middleware::DetectRobots::DATA> ) {
        chomp $line;
        next unless $line;

        $currentType = 'extended' if $line =~ /\A##\s+EXTENDED/;
        $currentType = 'generic'  if $line =~ /\A##\s+GENERIC/;

        # NOTE(review): the "## ..." marker line itself is stored as a
        # pattern of the category it introduces - verify this is intended.
        push @{ $bots->{$currentType} }, $line;
    }

    # Outside of a test harness the data is read once, so the handle can go.
    if ( !$ENV{'HARNESS_ACTIVE'} ) {
        close Plack::Middleware::DetectRobots::DATA;
    }

    return $bots;
}
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
1; |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=pod |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=encoding utf-8 |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head1 NAME |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
Plack::Middleware::DetectRobots - Automatically set a flag in the environment if a robot client is detected |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head1 VERSION |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
version 0.03 |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head1 SYNOPSIS |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
use Plack::Builder; |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
my $app = sub { ... }; # as usual |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
builder { |
118
|
|
|
|
|
|
|
enable 'DetectRobots'; |
119
|
|
|
|
|
|
|
# or: enable 'DetectRobots', env_key => 'psgix.robot_client'; |
120
|
|
|
|
|
|
|
# or: enable 'DetectRobots', extended_check => 1, generic_check => 1; |
121
|
|
|
|
|
|
|
$app; |
122
|
|
|
|
|
|
|
}; |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# ... and later ... |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
if ( $env->{'robot_client'} ) { |
127
|
|
|
|
|
|
|
# ... do something ... |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 DESCRIPTION |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
This Plack middleware uses the list of robots that is part of the |
133
|
|
|
|
|
|
|
AWStats software package to |
134
|
|
|
|
|
|
|
analyse the C<User-Agent> HTTP header and to set an environment |
135
|
|
|
|
|
|
|
flag to either a true or false value depending on the detection |
136
|
|
|
|
|
|
|
of a robot client. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Once activated it checks the User-Agent HTTP header against a |
139
|
|
|
|
|
|
|
basic list of patterns for common bots. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
If you activate the appropriate options, it can also use an extended |
142
|
|
|
|
|
|
|
list for the detection of less common bots (cf. C<extended_check>) |
143
|
|
|
|
|
|
|
and / or a list of quite generic patterns to detect unknown bots |
144
|
|
|
|
|
|
|
(cf. C<generic_check>). |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
You may also pass in your own regular expression (a C<qr//> object) for |
147
|
|
|
|
|
|
|
further checks (cf. C<local_regexp>). |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
The checks are executed in this order: |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
B<1.> Local regular expression |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
B<2.> Basic check |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
B<3.> Extended check |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
B<4.> Generic check |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
If a check yields a positive result (i.e.: detects a bot) the |
160
|
|
|
|
|
|
|
remaining checks are skipped. |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Depending on the check which detected a bot, the environment flag |
163
|
|
|
|
|
|
|
is set to one of these values: C<LOCAL>, C<BASIC>, C<EXTENDED>, or |
164
|
|
|
|
|
|
|
C<GENERIC>. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
If no bot is detected, the flag is set to C<0>. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
The default name of the flag in the environment is C<robot_client>, |
169
|
|
|
|
|
|
|
but this can be customized by setting the C<env_key> option when |
170
|
|
|
|
|
|
|
enabling this middleware. |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
It might make sense to use C<psgix.robot_client> by default instead, |
173
|
|
|
|
|
|
|
but the PSGI spec states that the "'psgix.' prefix is reserved for |
174
|
|
|
|
|
|
|
officially blessed extensions" - which does not apply to this module. |
175
|
|
|
|
|
|
|
You may, however, set the key to C<psgix.robot_client> yourself |
176
|
|
|
|
|
|
|
by using the C<env_key> option mentioned before. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head1 WARNING |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
This software is currently considered BETA and still needs to |
181
|
|
|
|
|
|
|
be seriously tested! |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head1 ROBOTS LIST |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Based on B of |
186
|
|
|
|
|
|
|
L. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
B<Note:> that list might be somewhat dated, as I did not find bingbot |
189
|
|
|
|
|
|
|
in the list of common bots (only in the extended list) while its |
190
|
|
|
|
|
|
|
predecessor msnbot was considered common. |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=head1 CONFIGURATION |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
You may specify the following option when enabling the middleware: |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=over 4 |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=item C<env_key> |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Set the name of the entry in the environment hash. |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=item C<basic_check> |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
You may deactivate the standard checks by setting this option to |
205
|
|
|
|
|
|
|
a false value. E.g. if you are only interested in obscure bots |
206
|
|
|
|
|
|
|
or in your local pattern checks. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
By setting this option to a false value while simultaneously |
209
|
|
|
|
|
|
|
passing a regular expression to C<local_regexp> one can imitate |
210
|
|
|
|
|
|
|
the behaviour of L<Plack::Middleware::BotDetector>. |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=item C<extended_check> |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
Determines if an extended list of less often seen robots is also |
215
|
|
|
|
|
|
|
checked for. |
216
|
|
|
|
|
|
|
By default, only common robots are checked for, because the extended |
217
|
|
|
|
|
|
|
check requires a rather large and complex regular expression. |
218
|
|
|
|
|
|
|
Set this param to a true value to change the default behaviour. |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item C<generic_check> |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
Determines if the User-Agent string is also analysed to determine |
223
|
|
|
|
|
|
|
if it contains certain strings that generically identify the |
224
|
|
|
|
|
|
|
client as a bot, e.g. "spider" or "crawler". |
225
|
|
|
|
|
|
|
By default, this check is not performed, even though it uses only |
226
|
|
|
|
|
|
|
a relatively short and simple regex. |
227
|
|
|
|
|
|
|
Set this param to a true value to change the default behaviour. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=item C<local_regexp> |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
You may optionally pass in your own regular expression (as a Regexp |
232
|
|
|
|
|
|
|
object using C<qr//>) to check for additional patterns in the |
233
|
|
|
|
|
|
|
User-Agent string. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=back |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=head1 SEE ALSO |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
L, L, L, |
240
|
|
|
|
|
|
|
L |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
The functionality provided by C<Plack::Middleware::BotDetector> is |
243
|
|
|
|
|
|
|
basically the same as that of this module, but it requires you to |
244
|
|
|
|
|
|
|
pass in your own regular expression and does not include a default |
245
|
|
|
|
|
|
|
list of known bots. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head1 AUTHOR |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
Heiko Jansen |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
This software is copyright (c) 2015 by Heiko Jansen. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
256
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=cut |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
__DATA__ |