File Coverage

blib/lib/HTTP/Parser.pm
Criterion Covered Total %
statement 81 119 68.0
branch 33 76 43.4
condition 7 24 29.1
subroutine 11 14 78.5
pod 6 6 100.0
total 138 239 57.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             HTTP::Parser - parse HTTP/1.1 request into HTTP::Request/Response object
4              
5             =head1 SYNOPSIS
6              
7             my $parser = HTTP::Parser->new();
8              
9             ...
10              
11             my $status = $parser->add($text);
12              
13             if(0 == $status) {
14             print "request: ".$parser->request()->as_string(); # HTTP::Request
15             } elsif(-3 == $status) {
16             print "no content length header!\n";
17             } elsif(-2 == $status) {
18             print "need a line of data\n";
19             } elsif(-1 == $status) {
20             print "need more data\n";
21             } else { # $status > 0
22             print "need $status byte(s)\n";
23             }
24              
25             =head1 DESCRIPTION
26              
27             This is an HTTP request parser. It takes chunks of text as received and
28             returns a 'hint' as to what is required, or returns the HTTP::Request when
29             a complete request has been read. HTTP/1.1 chunking is supported. It dies
30             if it finds an error.
31              
32             =cut
33 1     1   26298 use 5.006_001;
  1         4  
  1         37  
34 1     1   5 use strict;
  1         1  
  1         234  
35              
36             package HTTP::Parser;
37              
38             our $VERSION = '0.06';
39              
40 1     1   942 use HTTP::Request;
  1         86078  
  1         35  
41 1     1   13019 use HTTP::Response;
  1         15152  
  1         43  
42 1     1   12 use URI;
  1         2  
  1         1786  
43              
44             # token is (RFC 2616, ASCII)
45             my $Token =
46             qr/[\x21\x23-\x27\x2a\x2b\x2d\x2e\x30-\x39\x41-\x5a\x5e-\x7a\x7c\x7e]+/;
47              
48              
49             =head2 new ( named params... )
50              
51             Create a new HTTP::Parser object. Takes named parameters, e.g.:
52              
53             my $parser = HTTP::Parser->new(request => 1);
54              
55             =over 4
56              
57             =item request
58              
59             Allows or denies parsing an HTTP request and returning an C
60             object.
61              
62             =item response
63              
64             Allows or denies parsing an HTTP response and returning an C
65             object.
66              
67             =back
68              
69             If you pass neither C nor C, only requests are parsed (for
70             backwards compatibility); if you pass either, the other defaults to false
71             (disallowing both requests and responses is a fatal error).
72              
73             =cut
74             sub new {
75 4     4 1 6276 my ($class, %p) = @_;
76 4 100 100     36 $p{request} = 1 unless exists $p{response} or exists $p{request};
77 4 50 66     21 die 'must allow request or response to be parsed'
78             unless $p{request} or $p{response};
79 4         15 @p{qw(state data)} = ('blank', '');
80 4   33     23 my $self = bless \%p, ref $class || $class;
81 4         11 return $self;
82             }
83              
84              
85             =head2 add ( string )
86              
87             Parse request. Returns:
88              
89             =over 8
90              
91             =item 0
92              
93             if finished (call C to get an HTTP::Request or Response object)
94              
95             =item -1
96              
97             if not finished but not sure how many bytes remain
98              
99             =item -2
100              
101             if waiting for a line (like 0 with a hint)
102              
103             =item -3
104              
105             if there was no content-length header, so we can't tell whether we are
106             waiting for more data or not.
107              
108             If you are reading from a TCP stream, you can keep adding data until
109             the connection closes gracefully (the HTTP RFC allows this).
110              
111             If you are reading from a file, you should keep adding until you have
112             all the data.
113              
114             Once you have added all data, you may call C. if you are not
115             sure whether you have all the data, the HTTP::Response object might be
116             incomplete.
117              
118             =item count
119              
120             if waiting for that many bytes
121              
122             =back
123              
124             Dies on error.
125              
126             This method of parsing makes it easier to parse a request from an event-based
127             system, on the other hand, it's quite alright to pass in the whole request.
128             Ideally, the first chunk passed in is the header (up to the double newline),
129             then whatever byte counts are requested.
130              
131             When a request object is returned, the X-HTTP-Version header has the HTTP
132             version, the uri() method will always return a URI object, not a string.
133              
134             Note that a nonzero return is just a hint, and any amount of data can be
135             passed in to a subsequent add() call.
136              
137             =cut
138             sub add {
139 13     13 1 3942 my ($self,$s) = @_;
140 13 50       40 $s = '' if not defined $s;
141              
142 13         32 $self->{data} .= $s;
143              
144             # pre-header blank lines are allowed (RFC 2616 4.1)
145 13 100       34 if($self->{state} eq 'blank') {
146 5         25 $self->{data} =~ s/^(\x0d?\x0a)+//;
147 5 100       18 return -2 unless length $self->{data};
148 4         8 $self->{state} = 'header'; # done with blank lines; fall through
149             }
150              
151             # still waiting for the header
152 12 100       33 if($self->{state} eq 'header') {
    50          
    0          
    0          
153             # double line break indicates end of header; parse it
154 11 100       102 if($self->{data} =~ /^(.*?)\x0d?\x0a\x0d?\x0a/s) {
155 4         20 return $self->_parse_header(length $1);
156             }
157 7         20 return -2; # still waiting for unknown amount of header lines
158              
159             # waiting for main body of request
160             } elsif($self->{state} eq 'body') {
161 1         4 return $self->_parse_body();
162              
163             # chunked data
164             } elsif($self->{state} eq 'chunked') {
165 0         0 return $self->_parse_chunk();
166              
167             # trailers
168             } elsif($self->{state} eq 'trailer') {
169             # double line break indicates end of trailer; parse it
170 0 0       0 return $self->_parse_header(length $1,1)
171             if $self->{data} =~ /^(.*?)\x0d?\x0a\x0d?\x0a/s;
172 0         0 return -1; # still waiting for unknown amount of trailer data
173             }
174              
175 0         0 die "unknown state '$self->{state}'";
176             }
177              
178              
179             =head2 data
180              
181             Returns current data not parsed. Mainly useful after a request has been
182             parsed. The data is not removed from the object's buffer, and will be
183             seen before the data next passed to add().
184              
185             =cut
186             sub data {
187 0     0 1 0 shift->{data}
188             }
189              
190              
191             =head2 extra
192              
193             Returns the count of extra bytes (length of data()) after a request.
194              
195             =cut
196             sub extra {
197 0     0 1 0 length shift->{data}
198             }
199              
200              
201             =head2 object
202              
203             Returns the object request. Only useful after the parse has completed.
204              
205             =cut
206             sub object {
207 1     1 1 574 shift->{obj}
208             }
209              
210             # keep this for compatibility with 0.02
211             sub request {
212 2     2 1 796 shift->{obj}
213             }
214              
215              
216             # _parse_header ( position of double newline in data [, trailer flag] )
217             #
218             # helper for parse that parses an HTTP header
219             # prerequisite: we have data up to a double newline in $self->{data}
220             # if the trailer flag is set, we're parsing trailers
221             #
222             sub _parse_header {
223 4     4   7 my ($self,$eoh,$trailer) = @_;
224 4         16 my $header = substr($self->{data},0,$eoh,'');
225 4         17 $self->{data} =~ s/^\x0d?\x0a\x0d?\x0a//;
226              
227             # parse into lines
228 4         21 my @header = split /\x0d?\x0a/,$header;
229 4 50       13 my $request = shift @header unless $trailer;
230              
231             # join folded lines
232 4         7 my @out;
233 4         9 for(@header) {
234 5 50       14 if(s/^[ \t]+//) {
235 0 0       0 die 'LWS on first header line' unless @out;
236 0         0 $out[-1] .= $_;
237             } else {
238 5         9 push @out, $_;
239             }
240             }
241              
242             # parse request or response line
243 4         7 my $obj;
244 4 50       8 unless($trailer) {
245 4         4 my ($major, $minor);
246              
247             # is it an HTTP response?
248 4 100       20 if ($request =~ /^HTTP\/(\d+)\.(\d+)/i) {
249 2 100       16 die 'HTTP responses not allowed' unless $self->{response};
250 1         5 ($major,$minor) = ($1,$2);
251 1         5 $request =~ /^HTTP\/\d+\.\d+ (\d+) (.+)$/;
252 1         3 my $state = $1;
253 1         2 my $msg = $2;
254 1         12 $obj = $self->{obj} = HTTP::Response->new($state, $msg);
255              
256             # perhaps a request?
257             } else {
258 2         10 my ($method,$uri,$http) = split / /,$request;
259 2 50 33     19 die "'$request' is not the start of a valid HTTP request or response"
260             unless $http and $http =~ /^HTTP\/(\d+)\.(\d+)$/i;
261 2         7 ($major,$minor) = ($1,$2);
262 2 50       5 die 'HTTP requests not allowed' unless $self->{request};
263              
264             # If the Request-URI is an abs_path, we need to tell URI that we don't
265             # know the scheme, otherwise it will misinterpret paths that start with
266             # // as being scheme-relative uris, and will interpret the first
267             # component after // as the host (see rfc 2616)
268 2 50       11 $uri = "//$uri" if $uri =~ m(^/);
269 2         21 $obj = $self->{obj} = HTTP::Request->new($method, URI->new($uri));
270             }
271              
272 3         7317 $obj->header(X_HTTP_Version => "$major.$minor"); # pseudo-header
273              
274             # we've already seen the initial line and created the object
275             } else {
276 0         0 $obj = $self->{obj};
277             }
278              
279             # import headers
280 3         277 my $token = qr/[^][\x00-\x1f\x7f()<>@,;:\\"\/?={} \t]+/;
281 3         7 for $header(@header) {
282 5 50       197 die "bad header name in '$header'" unless $header =~ s/^($token):[\t ]*//;
283 5         30 $obj->push_header($1 => $header);
284             }
285              
286             # if we're parsing trailers we don't need to look at content
287 3 50       60 return 0 if $trailer;
288              
289             # see what sort of content we have, if any
290 3 100       10 if(my $length = $obj->header('content_length')) {
291 1         62 s/^\s+//, s/\s+$// for $length;
292 1 50       8 die "bad content-length '$length'" unless $length =~ /^(\d+)$/;
293 1         3 $self->{state} = 'body';
294 1         4 return $self->_parse_body();
295             }
296              
297             # check for transfer-encoding, and handle chunking
298 2 50       76 if(my @te = $obj->header('transfer_encoding')) {
299 0 0       0 if(grep { lc $_ eq 'chunked' } @te) {
  0         0  
300 0         0 $self->{state} = 'chunked';
301 0         0 return $self->_parse_chunk();
302             }
303             }
304              
305             # section 14.13 of the spec says an HTTP response "SHOULD" return a
306             # content-length header unless there are reasons not to
307             # however, the same RFC does allow "end of connection" as a valid marker
308             # of the end of data and means the server does not need to set a content
309             # length header. the only status codes that "MAY NOT" return data are
310             # 1xx, 204 and 304.
311             # therefore if there is no content length header, return -3 to the caller
312             # so they can decide whether to keep feeding data. if using HTTP::Parser
313             # with data from tcp, you could assume that the end of a connection is
314             # the end of the response data
315 2 50       72 if($self->{response}) {
316 0 0 0     0 if (!defined $obj->header('content_length') &&
      0        
      0        
317             $self->object->code ne '204' &&
318             $self->object->code ne '304' &&
319             $self->object->code !~ /1\d\d/) {
320              
321             # Assume headers are finished and we are moving into body mode
322 0         0 $self->{state} = 'body';
323 0         0 $self->{no_content_length} = 1;
324              
325             # Parse any data that might be left
326 0 0       0 return $self->_parse_body() if length $self->data;
327 0         0 return -3;
328             }
329             }
330              
331             # else we have no content so return success
332 2         14 return 0;
333             }
334              
335              
336             # _parse_body
337             #
338             # helper for parse, returns request object with content if done, else
339             # count of bytes remaining
340             #
341             sub _parse_body {
342 2     2   3 my $self = shift;
343 2         7 my $length = $self->{obj}->header('content_length');
344              
345             # if the server didn't include a content length header, inform the
346             # caller. they may choose to ignore this response or wait for
347             # the end of connection (which is a valid reason to assume that
348             # the response is finished)
349 2 50       78 if($self->{no_content_length}) {
350 0         0 $self->{obj}->content($self->{data});
351 0         0 return -3;
352             }
353              
354 2 100       9 if(length $self->{data} >= $length) {
355 1         12 $self->{obj}->content(substr($self->{data},0,$length,''));
356 1         27 return 0;
357             }
358 1         7 return $length-length $self->{data};
359             }
360              
361              
362             # _parse_chunk
363             #
364             # helper for parse, parse chunked transfer-encoded message; returns like parse
365             #
366             sub _parse_chunk {
367 0     0     my $self = shift;
368              
369             CHUNK:
370              
371             # need beginning of chunk with size
372 0 0         if(not $self->{chunk}) {
373 0 0         if($self->{data} =~ s/^([0-9a-fA-F]+)[^\x0d\x0a]*?\x0d?\x0a//) {
374              
375             # a zero-size chunk marks the end
376 0 0         unless($self->{chunk} = hex $1) {
377 0           $self->{state} = 'trailer';
378              
379             # double line break indicates end of trailer; parse it
380 0           $self->{data} = "\x0d\x0a".$self->{data}; # count previous line break
381 0 0         return $self->_parse_header(length $1,1)
382             if $self->{data} =~ /^(.*?)\x0d?\x0a\x0d?\x0a/s;
383 0           return -1; # still waiting for unknown amount of trailer data
384             }
385              
386             } else {
387 0 0         die "expected chunked encoding, got '".substr($self->{data},0,40)."...'"
388             if $self->{data} =~ /\x0d?\x0a/;
389 0           return -2; # waiting for a line with chunk information
390             }
391             }
392              
393             # do we have a current chunk size?
394 0 0         if($self->{chunk}) {
395              
396             # do we have enough data to fill it, plus a CR LF?
397 0 0 0       if(length $self->{data} > $self->{chunk} and
398             substr($self->{data},$self->{chunk},2) =~ /^(\x0d?\x0a)/) {
399 0           my $crlf = $1;
400 0           $self->{obj}->add_content(substr($self->{data},0,$self->{chunk}));
401 0           substr($self->{data},0,length $crlf) = '';
402              
403             # remove data from the buffer that we've already parsed
404 0           $self->{data} = substr($self->{data},delete $self->{chunk});
405              
406             # got chunks?
407 0           goto CHUNK;
408             }
409              
410 0           return $self->{chunk}-length($self->{data})+2; # extra CR LF
411             }
412             }
413              
414              
415             =head1 AUTHOR
416              
417             David Robins Edbrobins@davidrobins.netE
418             Fixes for 0.05 by David Cannings Edavid@edeca.netE
419              
420             =head1 SEE ALSO
421              
422             L, L.
423              
424             =cut
425              
426              
427             1;