File Coverage

blib/lib/HTML/TableParser/Table.pm
Criterion Covered Total %
statement 206 209 98.5
branch 87 98 88.7
condition 19 26 73.0
subroutine 22 23 95.6
pod 0 19 0.0
total 334 375 89.0


at line $line\n" );
line stmt bran cond sub pod time code
1             package HTML::TableParser::Table;
2              
3 8     8   33 use strict;
  8         10  
  8         209  
4 8     8   31 use warnings;
  8         9  
  8         209  
5              
6 8     8   31 use HTML::Entities;
  8         9  
  8         17648  
7              
8             our $VERSION = '0.38';
9              
10             ## no critic ( ProhibitAccessOfPrivateData )
11              
12             sub new
13             {
14 215     215 0 256 my $this = shift;
15              
16 215   33     696 my $class = ref($this) || $this;
17              
18 215         1344 my $self = {
19             data => [[]], # row data (for overlapping rows)
20             row => undef, # row info
21             col => undef, # column info
22             hdr => undef, # accumulated header info
23             hdr_row => 0, # index of header row
24             hdr_line => undef, # line in file of header row
25             in_hdr => 0, # are we in a header row?
26             prev_hdr => 0, # was the previous row a header row?
27             line => undef, # line in file of current row
28             start_line => undef, # line in file of table start
29             req => undef, # the matching table request
30             exclreqs => {}, # the requests which exlude this table
31             };
32              
33 215         293 bless $self, $class;
34              
35 215         229 my ( $parser, $ids, $reqs, $line ) = @_;
36              
37 215         262 $self->{parser} = $parser;
38 215         176 $self->{start_line} = $line;
39              
40             # if called with no args, create an empty, placeholder object
41 215 100       315 unless ( defined $ids )
42             {
43 93         156 $self->{ids} = [ 0 ];
44 93         137 $self->{process} = 0;
45 93         196 $self->{id} = 'sentinel';
46             }
47              
48             else
49             {
50 122         142 $ids->[-1]++;
51 122         227 $self->{oids} = [ @$ids ];
52 122         208 $self->{ids} = [ @$ids, 0 ];
53 122         155 $self->{id} = join( '.', grep { $_ != 0 } @{$ids} );
  152         553  
  122         157  
54              
55 122         147 $self->{reqs} = $reqs;
56              
57             # are we interested in this table?
58 122         218 $self->match_id();
59              
60             # inform user of table start. note that if we're looking for
61             # for column name matches, we don't want to do the callback;
62             # in that case $self->{req} isn't set and callback() won't
63             # actually make the call.
64             $self->callback( 'start', $self->{start_line} )
65 122 100       357 if $self->{process};
66             }
67              
68 215         3719 $self;
69             }
70              
71              
72             sub match_id
73             {
74 122     122 0 109 my $self = shift;
75              
76 122         142 $self->{process} = 0;
77 122         105 $self->{req} = undef;
78              
79             # 1. look for explicit id matches
80             # 2. if no explicit id match, use header matches
81             # 3. if no header matches, use DEFAULT
82             # 4. if no DEFAULT, no match
83              
84             # 1. explicit id.
85              
86 122         120 my ( $skip, $req );
87              
88             ( $skip, $req ) =
89             req_match_id( $self->{reqs}, $self->{id}, $self->{oids},
90 122         326 $self->{exclreqs} );
91              
92             # did we match a skip table request?
93 122 100       222 return if $skip;
94              
95 121 100       236 if ( $req )
96             {
97 28         61 $self->match_req( $req );
98 28         25 return;
99             }
100              
101              
102             # 2. header match.
103             # don't set {req}, as that'll trigger callbacks and we're not sure
104             # this is a match yet
105              
106 93 100       109 if ( grep { @{$_->{cols}} } @{$self->{reqs}})
  95         70  
  95         263  
  93         113  
107             {
108 62         84 $self->{process} = 1;
109 62         53 $self->{req} = undef;
110 62         89 return;
111             }
112              
113             # 3. DEFAULT match
114             ( $skip, $req ) =
115 31         59 req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );
116              
117             # did we match a skip table request? Does this make sense for DEFAULT?
118 31 50       54 return if $skip;
119              
120 31 100       58 if ( $req )
121             {
122 14         34 $self->match_req( $req );
123 14         15 return;
124             }
125              
126             # 4. out of luck. no match.
127             }
128              
129             # determine if a request matches an id. requests should
130             # be real objects, but until then...
131             sub req_match_id
132             {
133 182     182 0 218 my ( $reqs, $id, $oids, $excluded ) = @_;
134              
135 182         258 for my $req ( @$reqs )
136             {
137             # if we've already excluded this request, don't bother again.
138             # this is needed for id = DEFAULT passes where we've previously
139             # excluded based on actual table id and should again.
140 186 100       422 next if exists $excluded->{$req};
141              
142             # bail if this request has already matched and we're not
143             # multi-matching
144 185 100 66     480 next if $req->{match} && ! $req->{MultiMatch};
145              
146 155         137 for my $cmp ( @{$req->{id}} )
  155         256  
147             {
148             # is this a subroutine to call?
149 83 100       200 if ( 'CODE' eq ref $cmp->{match} )
    100          
150             {
151 6 100       11 next unless $cmp->{match}->($id, $oids );
152             }
153              
154             # regular expression
155             elsif( 'Regexp' eq ref $cmp->{match} )
156             {
157 3 100       16 next unless $id =~ /$cmp->{match}/;
158             }
159              
160             # a direct match?
161             else
162             {
163 74 100       243 next unless $id eq $cmp->{match};
164             }
165              
166             # we get here only if there was a match.
167              
168             # move on to next request if this was an explicit exclude
169             # request.
170 45 100       99 if ( $cmp->{exclude} )
171             {
172 1         3 $excluded->{$req}++;
173 1         2 next;
174             }
175              
176             # return match, plus whether this is a global skip request
177 44         111 return ( $cmp->{skip}, $req );
178             }
179             }
180              
181 138         236 ( 0, undef );
182             }
183              
184             # determine if a request matches a column. requests should
185             # be real objects, but until then...
186             sub req_match_cols
187             {
188 62     62 0 85 my ( $reqs, $cols, $id, $oids ) = @_;
189              
190 62         102 for my $req ( @$reqs )
191             {
192             # bail if this request has already matched and we're not
193             # multi-matching
194 63 100 66     161 next if $req->{match} && ! $req->{MultiMatch};
195              
196 55         163 my @fix_cols = @$cols;
197 55         98 fix_texts($req, \@fix_cols);
198              
199 55         79 for my $cmp ( @{$req->{cols}} )
  55         85  
200             {
201             # is this a subroutine to call?
202 54 100       160 if ( 'CODE' eq ref $cmp->{match} )
    100          
203             {
204 1 50       4 next unless $cmp->{match}->( $id, $oids, \@fix_cols );
205             }
206              
207             # regular expression
208             elsif( 'Regexp' eq ref $cmp->{match} )
209             {
210 17 100       22 next unless grep { /$cmp->{match}/ } @fix_cols;
  235         394  
211             }
212              
213             # a direct match?
214             else
215             {
216 36 100       39 next unless grep { $_ eq $cmp->{match} } @fix_cols;
  504         492  
217             }
218              
219             # we get here only if there was a match
220              
221             # move on to next request if this was an explicit exclude
222             # request.
223 33 50       68 next if $cmp->{exclude};
224              
225             # return match, plus whether this is a global skip request
226 33         104 return ( $cmp->{skip}, $req );
227             }
228              
229             }
230              
231 29         56 (0, undef);
232             }
233              
234             # we've pulled in a header; does it match against one of the requests?
235             sub match_hdr
236             {
237 62     62 0 179 my ( $self, @cols ) = @_;
238              
239              
240             # 1. check header matches
241             # 2. if no header matches, use DEFAULT id
242             # 3. if no DEFAULT, no match
243              
244             # 1. check header matches
245             my ( $skip, $req ) = req_match_cols( $self->{reqs}, \@cols, $self->{id},
246 62         207 $self->{oids} );
247             # did we match a skip table request?
248 62 50       121 return 0 if $skip;
249              
250 62 100       116 if ( $req )
251             {
252 33         76 $self->match_req( $req );
253 33         88 return 1;
254             }
255              
256              
257             # 2. DEFAULT match
258             ( $skip, $req ) =
259 29         82 req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );
260              
261             # did we match a skip table request? Does this make sense for DEFAULT?
262 29 50       66 return 0 if $skip;
263              
264 29 100       57 if ( $req )
265             {
266 1         3 $self->match_req( $req );
267 1         3 return 1;
268             }
269              
270             # 3. if no DEFAULT, no match
271              
272 28         81 0;
273             }
274              
275             sub match_req
276             {
277 76     76 0 126 my ( $self, $req ) = @_;
278              
279 76 100       211 if ( $req->{class} )
    100          
280             {
281             # no strict 'refs';
282 7         11 my $new = $req->{new};
283 7         30 $self->{obj} = $req->{class}->$new( $req->{id}, $req->{udata} );
284             }
285             elsif ( $req->{obj} )
286             {
287 6         6 $self->{obj} = $req->{obj};
288             }
289              
290 76         130 $self->{process} = 1;
291 76         68 $self->{req} = $req;
292 76         116 $self->{req}{match}++;
293             }
294              
295              
296             # generic call back interface. handle method calls as well as
297             # subroutine calls.
298             sub callback
299             {
300 3533     3533 0 2516 my $self = shift;
301 3533         2812 my $method = shift;
302              
303             return unless
304 3533 100 66     8494 defined $self->{req} && exists $self->{req}->{$method};
305              
306 2799         2108 my $req = $self->{req};
307 2799         2166 my $call = $req->{$method};
308              
309 2799 100       4105 if ( 'CODE' eq ref $call )
310             {
311 2128         3768 $call->( $self->{id}, @_, $req->{udata} );
312             }
313             else
314             {
315             # if the object was destroyed before we get here (if it
316             # was created by us and thus was destroyed before us if
317             # there was an error), we can't call a method
318             $self->{obj}->$call( $self->{id}, @_, $req->{udata} )
319 671 50       2385 if defined $self->{obj};
320             }
321             }
322              
323              
324             # handle
325             sub start_header
326             {
327 1630     1630 0 1159 my $self = shift;
328 1630         1278 my ( undef, $line ) = @_;
329              
330 1630         1170 $self->{in_hdr}++;
331 1630         1018 $self->{prev_hdr}++;
332 1630         1122 $self->{hdr_line} = $line;
333 1630         1672 $self->start_column( @_ );
334             }
335              
336              
337             # handle
338             sub end_header
339             {
340 1390     1390 0 962 my $self = shift;
341 1390         2191 $self->end_column();
342             }
343              
344             # handle
345             sub start_column
346             {
347 49757     49757 0 32839 my $self = shift;
348 49757         34499 my ( $attr, $line ) = @_;
349              
350             # end last column if not explicitly ended. perform check here
351             # to avoid extra method call
352 49757 100       75676 $self->end_column() if defined $self->{col};
353              
354             # we really shouldn't be here if a row hasn't been started
355 49757 50       59893 unless ( defined $self->{row} )
356             {
357 0         0 $self->callback( 'warn', $self->{id}, $line,
358             " or without
359 0         0 $self->start_row( {}, $line );
360             }
361              
362             # even weirder. if the last row was a header we have to process it now,
363             # rather than waiting until the end of this row, as there might be
364             # a table in one of the cells in this row and if the enclosing table
365             # was using a column match/re, we won't match it's header until after
366             # the enclosed table is completely parsed. this is bad, as it may
367             # grab a match (if there's no multimatch) meant for the enclosing table.
368              
369             # if we're one row past the header, we're done with the header
370             $self->finish_header()
371 49757 100 66     92553 if ! $self->{in_hdr} && $self->{prev_hdr};
372              
373 49757         92115 $self->{col} = { attr => { %$attr} };
374 49757   100     116767 $self->{col}{attr}{colspan} ||= 1;
375 49757   100     257521 $self->{col}{attr}{rowspan} ||= 1;
376             }
377              
378             # handle
379             sub end_column
380             {
381 50371     50371 0 30994 my $self = shift;
382              
383 50371 100       64016 return unless defined $self->{col};
384              
385 49757 50       66682 $self->{col}{text} = defined $self->{text} ? $self->{text} : '' ;
386              
387 49757         32531 push @{$self->{row}}, $self->{col};
  49757         60748  
388              
389 49757         37384 $self->{col} = undef;
390 49757         61324 $self->{text} = undef;
391             }
392              
393             sub start_row
394             {
395 3382     3382 0 2358 my $self = shift;
396 3382         3044 my ( $attr, $line ) = @_;
397              
398             # end last row if not explicitly ended
399 3382         3712 $self->end_row();
400              
401 3382         18264 $self->{row} = [];
402 3382         15711 $self->{line} = $line;
403             }
404              
405              
406             sub end_row
407             {
408 4100     4100 0 2680 my $self = shift;
409              
410 4100 100       6120 return unless defined $self->{row};
411              
412             # perhaps an unfinished row. first finish column
413 3382         3668 $self->end_column();
414              
415             # if we're in a header, deal with overlapping cells differently
416             # then if we're in the data section
417 3382 100       3784 if ( $self->{in_hdr} )
418             {
419              
420 167         139 my $cn = 0;
421 167         149 my $rn = 0;
422 167         127 foreach my $col ( @{$self->{row}} )
  167         247  
423             {
424             # do this just in case there are newlines and we're concatenating
425             # column names later on. causes strange problems. besides,
426             # column names should be regular
427 1630         1978 $col->{text} =~ s/^\s+//;
428 1630         1728 $col->{text} =~ s/\s+$//;
429              
430             # need to find the first undefined column
431 1630         2917 $cn++ while defined $self->{hdr}[$cn][$self->{hdr_row}];
432              
433             # note that header is stored as one array per column, not row!
434 1630         2256 for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
435             {
436 1911   100     2564 $self->{hdr}[$cn] ||= [];
437 1911         2198 $self->{hdr}[$cn][$self->{hdr_row}] = $col->{text};
438            
439             # put empty placeholders in the rest of the rows
440 1911         4192 for ( my $rnn = 1 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
441             {
442 235         677 $self->{hdr}[$cn][$rnn + $self->{hdr_row}] = '';
443             }
444             }
445             }
446              
447 167         192 $self->{hdr_row}++;
448             }
449             else
450             {
451 3215         2336 my $cn = 0;
452 3215         2009 my $rn = 0;
453 3215         2003 foreach my $col ( @{$self->{row}} )
  3215         3981  
454             {
455             # need to find the first undefined column
456 48127         58370 $cn++ while defined $self->{data}[0][$cn];
457              
458 48127         62637 for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
459             {
460 48147         57795 for ( my $rnn = 0 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
461             {
462 48167   100     54179 $self->{data}[$rnn] ||= [];
463 48167         117648 $self->{data}[$rnn][$cn] = $col->{text};
464             }
465             }
466             }
467             }
468              
469             # if we're one row past the header, we're done with the header
470             $self->finish_header()
471 3382 50 66     7540 if ! $self->{in_hdr} && $self->{prev_hdr};
472              
473             # output the data if we're not in a header
474             $self->callback( 'row', $self->{line},
475 3215         4806 fix_texts( $self->{req}, shift @{$self->{data}} ) )
476 3382 100       5074 unless $self->{in_hdr};
477              
478 3382         21684 $self->{in_hdr} = 0;
479 3382         9004 $self->{row} = undef;
480             }
481              
482             # collect the possible multiple header rows into one array and
483             # send it off
484             sub finish_header
485             {
486 208     208 0 203 my $self = shift;
487              
488 208 100       368 return unless $self->{hdr};
489              
490 1481 50       881 my @header = map { join( ' ', grep { defined $_ && $_ ne '' } @{$_}) }
  2146         5920  
  1481         1041  
491 104         96 @{ $self->{hdr} };
  104         190  
492              
493             # if we're trying to match header columns, check that here.
494 104 100       253 if ( defined $self->{req} )
495             {
496 42         106 fix_texts( $self->{req}, \@header );
497 42         92 $self->callback( 'hdr', $self->{hdr_line}, \@header );
498             }
499              
500             else
501             {
502 62 100       169 if ( $self->match_hdr( @header ) )
503             {
504             # haven't done this callback yet...
505 34         134 $self->callback( 'start', $self->{start_line} );
506              
507 34         16782028 fix_texts( $self->{req}, \@header );
508 34         108 $self->callback( 'hdr', $self->{hdr_line}, \@header );
509             }
510              
511             # no match. reach up to the controlling parser and turn off
512             # processing of this table. this is kind of kludgy!
513             else
514             {
515 28         100 $self->{parser}->process(0);
516             }
517             }
518              
519              
520 104         15519313 $self->{hdr} = undef;
521 104         444 $self->{prev_hdr} = undef;
522 104         196 $self->{hdr_row} = 0;
523             }
524              
525             DESTROY
526             {
527 215     215   93404 my $self = shift;
528              
529             # if we're actually parsing this table, do something.
530 215 100       1896 if ( $self->{process} )
531             {
532             # just in case
533 104         170 $self->end_row();
534              
535             # just in case there's no table body
536 104         463 $self->finish_header();
537              
538 104         162 $self->callback( 'end', $self->{line} );
539             }
540             }
541              
542             sub fix_texts
543             {
544 3346     3346 0 3025 my ( $req, $texts ) = @_;
545              
546 3346         3276 for ( @$texts )
547             {
548             local $HTML::Entities::entity2char{nbsp} =
549 49998         47072 $HTML::Entities::entity2char{nbsp};
550              
551             $HTML::Entities::entity2char{nbsp} = ' '
552 49998 50       56038 if $req->{DecodeNBSP};
553              
554             chomp $_
555 49998 100       53134 if $req->{Chomp};
556              
557             decode_entities( $_ )
558 49998 100       84533 if $req->{Decode};
559              
560              
561 49998 100       68030 if ( $req->{Trim} )
562             {
563 6529         6765 s/^\s+//;
564 6529         10370 s/\s+$//;
565             }
566             }
567              
568 3346         5272 $texts;
569             }
570              
571             sub text
572             {
573 62098     62098 0 40263 my $self = shift;
574              
575 62098         195131 $self->{text} = shift;
576             }
577              
578 0     0 0 0 sub id { $_[0]->{id} }
579 122     122 0 461 sub ids { $_[0]->{ids} }
580 244     244 0 649 sub process { $_[0]->{process} }
581              
582             1;
583              
584             __END__