File Coverage

blib/lib/HTTP/StreamParser.pm
Criterion Covered Total %
statement 104 106 98.1
branch 24 30 80.0
condition 2 5 40.0
subroutine 21 21 100.0
pod 16 16 100.0
total 167 178 93.8


line stmt bran cond sub pod time code
1             package HTTP::StreamParser;
2             # ABSTRACT: streaming HTTP parser
3 2     2   1093 use strict;
  2         4  
  2         68  
4 2     2   10 use warnings;
  2         14  
  2         59  
5 2     2   9 use parent qw(Mixin::Event::Dispatch);
  2         4  
  2         12  
6              
7             our $VERSION = '0.101';
8              
9             =head1 NAME
10              
11             HTTP::StreamParser - support for streaming HTTP request/response parsing
12              
13             =head1 VERSION
14              
15             version 0.101
16              
17             =head1 SYNOPSIS
18              
19             # For requests...
20             my $req_parser = HTTP::StreamParser::Request->new;
21             $req_parser->subscribe_to_event(
22             http_method => sub { print "Method: $_[1]\n" },
23             http_uri => sub { print "URI: $_[1]\n" },
24             http_header => sub { print "Header: $_[1]: $_[2]\n" },
25             );
26             $req_parser->parse(<<'EOF');
27             ...
28             EOF
29              
30             # ... and responses:
31             my $resp_parser = HTTP::StreamParser::Request->new;
32             $resp_parser->subscribe_to_event(
33             http_code => sub { print "Code: $_[1]\n" },
34             http_status => sub { print "Status: $_[1]\n" },
35             http_header => sub { print "Header: $_[1]: $_[2]\n" },
36             );
37             $resp_parser->parse(<<'EOF');
38             ...
39             EOF
40              
41             =head1 DESCRIPTION
42              
43             Parses HTTP requests or responses. Generates events. Should be suitable for streaming.
44             You may be looking for L<HTTP::Parser::XS> instead - it's at least 20x faster than
45             this module. If you wanted something without XS, there's L<HTTP::Parser>.
46              
47             Actual implementation is in L<HTTP::StreamParser::Request> or L<HTTP::StreamParser::Response>.
48              
49             Typically you'd instantiate one of these for each request you want to parse. You'd then
50             subscribe to the events you're interested in - for example, header information, request method,
51             etc. - and then start parsing via L</parse>.
52              
53             =cut
54              
55 2     2   13716 use List::Util qw(min);
  2         2  
  2         179  
56              
57 2     2   9 use constant BODY_CHUNK_SIZE => 4096;
  2         4  
  2         2640  
58              
59             my $CRLF = "\x0d\x0a";
60              
61             =head2 new
62              
63             Instantiates a new parser object.
64              
65             =cut
66              
67             sub new {
68 2     2 1 51 my $class = shift;
69 2         10 my $self = bless +{
70             text => '',
71             }, $class;
72 2         12 $self->{state_pending} = [ $self->state_sequence ];
73 2         6 $self->{state} = shift @{$self->{state_pending}};
  2         7  
74 2         7 $self
75             }
76              
77             =head2 parse
78              
79             Adds the given data to the pending buffer, and calls the state handler to check
80             whether we have enough data to do some useful parsing.
81              
82             =cut
83              
84             sub parse {
85 149     149 1 646 my $self = shift;
86 149         229 my $text = shift;
87 149         185 $self->{text} .= $text;
88 149         242 $self->handle_state;
89             }
90              
91             =head2 parse_state
92              
93             Sets the current parse state, then calls the state handler.
94              
95             =cut
96              
97             sub parse_state {
98 14     14 1 13 my $self = shift;
99 14         14 my $state = shift;
100 14         17 $self->{state} = $state;
101 14         31 $self->handle_state;
102             }
103              
104             =head2 next_state
105              
106             Moves to the next parser state.
107              
108             =cut
109              
110             sub next_state {
111 14     14 1 19 my $self = shift;
112 14         14 my $next_state = shift @{$self->{state_pending}};
  14         25  
113             # say "Parse state was " . $self->{state} . " now " . $next_state;
114 14         37 $self->parse_state($next_state);
115             }
116              
117             =head2 handle_state
118              
119             Call the handler for our current parser state.
120              
121             =cut
122              
123             sub handle_state {
124 163     163 1 142 my $self = shift;
125 163 50       571 die "Unknown state [" . $self->{state} . "]" unless my $handler = $self->can($self->{state});
126 163         286 $handler->($self, \$self->{text});
127             }
128              
129             { # Common subset of methods, subclass if you need any others
130             my %methods = map { $_ => 1 } qw(
131             CONNECT COPY DELETE DELTA FILEPATCH GET HEAD LOCK MKCOL
132             MOVE OPTIONS PATCH POST PROPFIND PROPPATCH PUT SIGNATURE
133             TRACE TRACK UNLOCK
134             );
135              
136             =head2 validate_method
137              
138             Validate the HTTP request method. Currently accepts any of these:
139              
140             =over 4
141              
142             =item * CONNECT
143              
144             =item * COPY
145              
146             =item * DELETE
147              
148             =item * DELTA
149              
150             =item * FILEPATCH
151              
152             =item * GET
153              
154             =item * HEAD
155              
156             =item * LOCK
157              
158             =item * MKCOL
159              
160             =item * MOVE
161              
162             =item * OPTIONS
163              
164             =item * PATCH
165              
166             =item * POST
167              
168             =item * PROPFIND
169              
170             =item * PROPPATCH
171              
172             =item * PUT
173              
174             =item * SIGNATURE
175              
176             =item * TRACE
177              
178             =item * TRACK
179              
180             =item * UNLOCK
181              
182             =back
183              
184             =cut
185              
186 1     1 1 7 sub validate_method { exists $methods{$_[1]} }
187             }
188              
189             =head2 http_method
190              
191             Parses the HTTP method information.
192              
193             =cut
194              
195             sub http_method {
196 2     2 1 3 my $self = shift;
197 2         3 my $buf = shift;
198 2 100       16 if($$buf =~ s/^([A-Z]+)(?=\s)//) {
199 1         7 $self->{method} = $1;
200 1 50       35 die "invalid method ". $self->{method} unless $self->validate_method($self->{method});
201 1         10 $self->invoke_event(http_method => $self->{method});
202 1         552 $self->next_state;
203             }
204 2         6 return $self
205             }
206              
207             =head2 validate_code
208              
209             Validate whether we have a sensible HTTP status code - currently, any code >= 100 is accepted.
210              
211             =cut
212              
213 1     1 1 11 sub validate_code { $_[1] >= 100 }
214              
215             =head2 http_code
216              
217             Parse an HTTP status code.
218              
219             =cut
220              
221             sub http_code {
222 3     3 1 4 my $self = shift;
223 3         4 my $buf = shift;
224 3 100       11 if($$buf =~ s/^(\d{3})(?=\s)//) {
225 1         3 $self->{code} = $1;
226 1 50       6 die "invalid response code ". $self->{code} unless $self->validate_code($self->{code});
227 1         4 $self->invoke_event(http_code => $self->{code});
228 1         16 $self->next_state;
229             }
230 3         8 return $self
231             }
232              
233             =head2 http_status
234              
235             Parse the HTTP status information - this is everything after the code to the end of the line.
236              
237             =cut
238              
239             sub http_status {
240 2     2 1 3 my $self = shift;
241 2         1 my $buf = shift;
242 2 100       38 if($$buf =~ s/^(.*?)(?=$CRLF)//) {
243 1         3 $self->{status} = $1;
244 1         4 $self->invoke_event(http_status => $self->{status});
245 1         15 $self->next_state;
246             }
247 2         6 return $self
248             }
249              
250             =head2 http_uri
251              
252             Parse URI information. Anything up to whitespace.
253              
254             =cut
255              
256             sub http_uri {
257 2     2 1 3 my $self = shift;
258 2         3 my $buf = shift;
259 2 100       37 if($$buf =~ s{^(.*)(\s+http/\d+\.\d+$CRLF)}{$2}i) {
260 1         26 $self->{uri} = $1;
261 1         4 $self->invoke_event(http_uri => $self->{uri});
262 1         570 $self->next_state;
263             }
264 2         8 return $self
265             }
266              
267             =head2 http_version
268              
269             Parse HTTP version information. Typically expects HTTP/1.1.
270              
271             =cut
272              
273             sub http_version {
274 5     5 1 6 my $self = shift;
275 5         6 my $buf = shift;
276 5 100       46 if($$buf =~ s{^(HTTP)/(\d+.\d+)(?=\s)}{}i) {
277 2         7 $self->{proto} = $1;
278 2         8 $self->{version} = $2;
279 2         13 $self->invoke_event(http_version => $self->{proto}, $self->{version});
280 2         54 $self->next_state;
281             }
282 5         15 return $self
283             }
284              
285             { # Some headers can have multiple values, these can be a mix of comma-separated or split as K: x, K: y
286             my %multi_valued = map { $_ => 1 } qw(Accept Accept-Encoding Accept-Charset Accept-Language Connection Via TE);
287              
288             =head2 http_headers
289              
290             Parse HTTP header lines.
291              
292             =cut
293              
294             sub http_headers {
295 134     134 1 138 my $self = shift;
296 134         123 my $buf = shift;
297 134         573 while($$buf =~ s{^([^:]+):(?: )*([^$CRLF]+)$CRLF}{}) {
298 13         27 my $k = $1;
299 13         17 my $v = $2;
300 13 100       35 $self->{remaining} = 0+$v if lc($k) eq 'content-length';
301 13 100       27 if(exists $multi_valued{$k}) {
302 2         11 for (split /\s*,\s*/, $v) {
303 2         7 $self->invoke_event(http_header => $k => $_);
304             }
305             } else {
306 11         27 $self->invoke_event(http_header => $k => $v);
307             }
308             }
309 134 100       8543 if($$buf =~ s{^$CRLF}{}) {
310 2         8 $self->invoke_event(http_body_start =>);
311 2         39 $self->next_state;
312             }
313 134         361 return $self
314             }
315             }
316              
317             =head2 single_space
318              
319             Parse a single space character.
320              
321             Returns $self.
322              
323             =cut
324              
325             sub single_space {
326 4     4 1 8 my $self = shift;
327 4         5 my $buf = shift;
328 4 50       38 return $self->next_state if $$buf =~ s{^ }{};
329 0         0 return $self
330             }
331              
332             =head2 newline
333              
334             Parse the "newline" (CRLF) characters.
335              
336             Returns $self.
337              
338             =cut
339              
340             sub newline {
341 2     2 1 4 my $self = shift;
342 2         3 my $buf = shift;
343 2 50       29 return $self->next_state if $$buf =~ s{^$CRLF}{};
344 0         0 return $self
345             }
346              
347             =head2 http_body
348              
349             Parse body chunks.
350              
351             Returns $self.
352              
353             =cut
354              
355             sub http_body {
356 9     9 1 11 my $self = shift;
357 9         11 my $buf = shift;
358 9         20 while(length $$buf) {
359 5   33     28 my $chunk = substr $$buf, 0, min(BODY_CHUNK_SIZE, length($$buf), $self->{remaining} // ()), '';
360 5 50       19 $self->{remaining} -= length $chunk if defined $self->{remaining};
361 5         13 $self->invoke_event(http_body_chunk => $chunk, $self->{remaining});
362             }
363 9 100 50     185 $self->invoke_event(http_body_end =>) if 0 == ($self->{remaining} // 1);
364 9         61 return $self
365             }
366              
367             1;
368              
369             __END__
370              
371             =head1 SEE ALSO
372              
373             =over 4
374              
375             =item * L<HTTP::Parser::XS> - used by several other modules, fast implementation, pure-Perl fallback,
376             but doesn't give access to the data until the headers have been parsed and aside from header count and
377             per-header size limitation, seems not to have any way to deal with oversized requests
378              
379             =item * L<HTTP::Parser> - parses into L<HTTP::Request>/L<HTTP::Response> objects. Doesn't seem to guard
380             against large buffers but does have at least some support for streaming.
381              
382             =item * L<HTTP::MessageParser> - also parses HTTP content
383              
384             =item * L<Mojo::Message::Request> - part of L<Mojolicious>
385              
386             =item * L<Mojo::Message::Response> - part of L<Mojolicious>
387              
388             =item * L<HTTP::Response::Parser> - parses responses...
389              
390             =item * L<POE::Filter::HTTP::Parser> - seems to be backed by L<HTTP::Parser::XS> / L<HTTP::Parser>
391              
392             =item * L<HTTP::HeaderParser::XS> - only parses the headers, albeit with some speed
393              
394             =back
395              
396             =head1 AUTHOR
397              
398             Tom Molesworth <cpan@entitymodel.com>
399              
400             =head1 LICENSE
401              
402             Copyright Tom Molesworth 2013. Licensed under the same terms as Perl itself.