File Coverage

blib/lib/HTML/TableParser/Table.pm
Criterion Covered Total %
statement 204 207 98.5
branch 87 98 88.7
condition 21 26 80.7
subroutine 22 23 95.6
pod 19 19 100.0
total 353 373 94.6


at line $line\n" ); , C, and C tags. As such
line stmt bran cond sub pod time code
1             package HTML::TableParser::Table;
2              
3             # ABSTRACT: support class for HTML::TableParser
4              
5 7     7   41 use strict;
  7         12  
  7         168  
6 7     7   31 use warnings;
  7         11  
  7         152  
7              
8 7     7   31 use HTML::Entities qw();
  7         9  
  7         14187  
9              
10             our $VERSION = '0.43';
11              
12             ## no critic ( ProhibitAccessOfPrivateData )
13              
14             #pod =method new
15             #pod
16             #pod =cut
17              
18             sub new
19             {
20 215     215 1 335 my $this = shift;
21              
22 215   33     552 my $class = ref($this) || $this;
23              
24 215         1143 my $self = {
25             data => [[]], # row data (for overlapping rows)
26             row => undef, # row info
27             col => undef, # column info
28             hdr => undef, # accumulated header info
29             hdr_row => 0, # index of header row
30             hdr_line => undef, # line in file of header row
31             in_hdr => 0, # are we in a header row?
32             prev_hdr => 0, # was the previous row a header row?
33             line => undef, # line in file of current row
34             start_line => undef, # line in file of table start
35             req => undef, # the matching table request
36             exclreqs => {}, # the requests which exlude this table
37             };
38              
39 215         345 bless $self, $class;
40              
41 215         395 my ( $parser, $ids, $reqs, $line ) = @_;
42              
43 215         312 $self->{parser} = $parser;
44 215         261 $self->{start_line} = $line;
45              
46             # if called with no args, create an empty, placeholder object
47 215 100       341 unless ( defined $ids )
48             {
49 93         153 $self->{ids} = [ 0 ];
50 93         167 $self->{process} = 0;
51 93         205 $self->{id} = 'sentinel';
52             }
53              
54             else
55             {
56 122         155 $ids->[-1]++;
57 122         215 $self->{oids} = [ @$ids ];
58 122         204 $self->{ids} = [ @$ids, 0 ];
59 122         159 $self->{id} = join( '.', grep { $_ != 0 } @{$ids} );
  152         530  
  122         190  
60              
61 122         183 $self->{reqs} = $reqs;
62              
63             # are we interested in this table?
64 122         254 $self->match_id();
65              
66             # inform user of table start. note that if we're looking for
67             # for column name matches, we don't want to do the callback;
68             # in that case $self->{req} isn't set and callback() won't
69             # actually make the call.
70             $self->callback( 'start', $self->{start_line} )
71 122 100       344 if $self->{process};
72             }
73              
74 215         3062 $self;
75             }
76              
77             #pod =method match_id
78             #pod
79             #pod =cut
80              
81             sub match_id
82             {
83 122     122 1 152 my $self = shift;
84              
85 122         183 $self->{process} = 0;
86 122         154 $self->{req} = undef;
87              
88             # 1. look for explicit id matches
89             # 2. if no explicit id match, use header matches
90             # 3. if no header matches, use DEFAULT
91             # 4. if no DEFAULT, no match
92              
93             # 1. explicit id.
94              
95 122         156 my ( $skip, $req );
96              
97             ( $skip, $req ) =
98             req_match_id( $self->{reqs}, $self->{id}, $self->{oids},
99 122         259 $self->{exclreqs} );
100              
101             # did we match a skip table request?
102 122 100       217 return if $skip;
103              
104 121 100       234 if ( $req )
105             {
106 28         66 $self->match_req( $req );
107 28         36 return;
108             }
109              
110              
111             # 2. header match.
112             # don't set {req}, as that'll trigger callbacks and we're not sure
113             # this is a match yet
114              
115 93 100       110 if ( grep { @{$_->{cols}} } @{$self->{reqs}})
  95         113  
  95         226  
  93         164  
116             {
117 62         82 $self->{process} = 1;
118 62         76 $self->{req} = undef;
119 62         97 return;
120             }
121              
122             # 3. DEFAULT match
123             ( $skip, $req ) =
124 31         70 req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );
125              
126             # did we match a skip table request? Does this make sense for DEFAULT?
127 31 50       72 return if $skip;
128              
129 31 100       63 if ( $req )
130             {
131 14         40 $self->match_req( $req );
132 14         66 return;
133             }
134              
135             # 4. out of luck. no match.
136             }
137              
138             #pod =method req_match_id
139             #pod
140             #pod =cut
141              
142             # determine if a request matches an id. requests should
143             # be real objects, but until then...
144             sub req_match_id
145             {
146 182     182 1 294 my ( $reqs, $id, $oids, $excluded ) = @_;
147              
148 182         309 for my $req ( @$reqs )
149             {
150             # if we've already excluded this request, don't bother again.
151             # this is needed for id = DEFAULT passes where we've previously
152             # excluded based on actual table id and should again.
153 186 100       381 next if exists $excluded->{$req};
154              
155             # bail if this request has already matched and we're not
156             # multi-matching
157 185 100 66     700 next if $req->{match} && ! $req->{MultiMatch};
158              
159 155         196 for my $cmp ( @{$req->{id}} )
  155         290  
160             {
161             # is this a subroutine to call?
162 83 100       210 if ( 'CODE' eq ref $cmp->{match} )
    100          
163             {
164 6 100       15 next unless $cmp->{match}->($id, $oids );
165             }
166              
167             # regular expression
168             elsif( 'Regexp' eq ref $cmp->{match} )
169             {
170 3 100       14 next unless $id =~ /$cmp->{match}/;
171             }
172              
173             # a direct match?
174             else
175             {
176 74 100       208 next unless $id eq $cmp->{match};
177             }
178              
179             # we get here only if there was a match.
180              
181             # move on to next request if this was an explicit exclude
182             # request.
183 45 100       84 if ( $cmp->{exclude} )
184             {
185 1         3 $excluded->{$req}++;
186 1         2 next;
187             }
188              
189             # return match, plus whether this is a global skip request
190 44         124 return ( $cmp->{skip}, $req );
191             }
192             }
193              
194 138         299 ( 0, undef );
195             }
196              
197             #pod =method req_match_cols
198             #pod
199             #pod =cut
200              
201             # determine if a request matches a column. requests should
202             # be real objects, but until then...
203             sub req_match_cols
204             {
205 62     62 1 108 my ( $reqs, $cols, $id, $oids ) = @_;
206              
207 62         109 for my $req ( @$reqs )
208             {
209             # bail if this request has already matched and we're not
210             # multi-matching
211 63 100 66     144 next if $req->{match} && ! $req->{MultiMatch};
212              
213 55         152 my @fix_cols = @$cols;
214 55         113 fix_texts($req, \@fix_cols);
215              
216 55         69 for my $cmp ( @{$req->{cols}} )
  55         95  
217             {
218             # is this a subroutine to call?
219 54 100       148 if ( 'CODE' eq ref $cmp->{match} )
    100          
220             {
221 1 50       3 next unless $cmp->{match}->( $id, $oids, \@fix_cols );
222             }
223              
224             # regular expression
225             elsif( 'Regexp' eq ref $cmp->{match} )
226             {
227 17 100       26 next unless grep { /$cmp->{match}/ } @fix_cols;
  235         854  
228             }
229              
230             # a direct match?
231             else
232             {
233 36 100       48 next unless grep { $_ eq $cmp->{match} } @fix_cols;
  504         711  
234             }
235              
236             # we get here only if there was a match
237              
238             # move on to next request if this was an explicit exclude
239             # request.
240 33 50       73 next if $cmp->{exclude};
241              
242             # return match, plus whether this is a global skip request
243 33         112 return ( $cmp->{skip}, $req );
244             }
245              
246             }
247              
248 29         62 (0, undef);
249             }
250              
251             #pod =method match_hdr
252             #pod
253             #pod =cut
254              
255             # we've pulled in a header; does it match against one of the requests?
256             sub match_hdr
257             {
258 62     62 1 206 my ( $self, @cols ) = @_;
259              
260              
261             # 1. check header matches
262             # 2. if no header matches, use DEFAULT id
263             # 3. if no DEFAULT, no match
264              
265             # 1. check header matches
266             my ( $skip, $req ) = req_match_cols( $self->{reqs}, \@cols, $self->{id},
267 62         163 $self->{oids} );
268             # did we match a skip table request?
269 62 50       106 return 0 if $skip;
270              
271 62 100       103 if ( $req )
272             {
273 33         70 $self->match_req( $req );
274 33         79 return 1;
275             }
276              
277              
278             # 2. DEFAULT match
279             ( $skip, $req ) =
280 29         65 req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );
281              
282             # did we match a skip table request? Does this make sense for DEFAULT?
283 29 50       53 return 0 if $skip;
284              
285 29 100       51 if ( $req )
286             {
287 1         5 $self->match_req( $req );
288 1         3 return 1;
289             }
290              
291             # 3. if no DEFAULT, no match
292              
293 28         77 0;
294             }
295              
296             #pod =method match_req
297             #pod
298             #pod =cut
299              
300             sub match_req
301             {
302 76     76 1 124 my ( $self, $req ) = @_;
303              
304 76 100       176 if ( $req->{class} )
    100          
305             {
306             # no strict 'refs';
307 7         11 my $new = $req->{new};
308 7         41 $self->{obj} = $req->{class}->$new( $req->{id}, $req->{udata} );
309             }
310             elsif ( $req->{obj} )
311             {
312 6         10 $self->{obj} = $req->{obj};
313             }
314              
315 76         155 $self->{process} = 1;
316 76         104 $self->{req} = $req;
317 76         114 $self->{req}{match}++;
318             }
319              
320              
321             #pod =method callback
322             #pod
323             #pod =cut
324              
325             # generic call back interface. handle method calls as well as
326             # subroutine calls.
327             sub callback
328             {
329 3533     3533 1 4242 my $self = shift;
330 3533         4311 my $method = shift;
331              
332             return unless
333 3533 100 100     9837 defined $self->{req} && exists $self->{req}->{$method};
334              
335 2799         3734 my $req = $self->{req};
336 2799         3554 my $call = $req->{$method};
337              
338 2799 100       5000 if ( 'CODE' eq ref $call )
339             {
340 2128         4446 $call->( $self->{id}, @_, $req->{udata} );
341             }
342             else
343             {
344             # if the object was destroyed before we get here (if it
345             # was created by us and thus was destroyed before us if
346             # there was an error), we can't call a method
347             $self->{obj}->$call( $self->{id}, @_, $req->{udata} )
348 671 50       1944 if defined $self->{obj};
349             }
350             }
351              
352              
353             # handle
354              
355             #pod =method start_header
356             #pod
357             #pod =cut
358              
359             sub start_header
360             {
361 1630     1630 1 1866 my $self = shift;
362 1630         2002 my ( undef, $line ) = @_;
363              
364 1630         1784 $self->{in_hdr}++;
365 1630         1752 $self->{prev_hdr}++;
366 1630         1752 $self->{hdr_line} = $line;
367 1630         2251 $self->start_column( @_ );
368             }
369              
370              
371             # handle
372              
373             #pod =method end_header
374             #pod
375             #pod =cut
376              
377             sub end_header
378             {
379 1390     1390 1 1575 my $self = shift;
380 1390         1810 $self->end_column();
381             }
382              
383             # handle
384             #pod =method start_column
385             #pod
386             #pod =cut
387              
388             sub start_column
389             {
390 49757     49757 1 56354 my $self = shift;
391 49757         64924 my ( $attr, $line ) = @_;
392              
393             # end last column if not explicitly ended. perform check here
394             # to avoid extra method call
395 49757 100       102135 $self->end_column() if defined $self->{col};
396              
397             # we really shouldn't be here if a row hasn't been started
398 49757 50       71991 unless ( defined $self->{row} )
399             {
400 0         0 $self->callback( 'warn', $self->{id}, $line,
401             " or without
402 0         0 $self->start_row( {}, $line );
403             }
404              
405             # even weirder. if the last row was a header we have to process it now,
406             # rather than waiting until the end of this row, as there might be
407             # a table in one of the cells in this row and if the enclosing table
408             # was using a column match/re, we won't match it's header until after
409             # the enclosed table is completely parsed. this is bad, as it may
410             # grab a match (if there's no multimatch) meant for the enclosing table.
411              
412             # if we're one row past the header, we're done with the header
413             $self->finish_header()
414 49757 100 100     118244 if ! $self->{in_hdr} && $self->{prev_hdr};
415              
416 49757         121699 $self->{col} = { attr => { %$attr} };
417 49757   100     149371 $self->{col}{attr}{colspan} ||= 1;
418 49757   100     257655 $self->{col}{attr}{rowspan} ||= 1;
419             }
420              
421             # handle
422              
423             #pod =method end_column
424             #pod
425             #pod =cut
426              
427             sub end_column
428             {
429 50371     50371 1 53721 my $self = shift;
430              
431 50371 100       74515 return unless defined $self->{col};
432              
433 49757 50       86077 $self->{col}{text} = defined $self->{text} ? $self->{text} : '' ;
434              
435 49757         52604 push @{$self->{row}}, $self->{col};
  49757         83295  
436              
437 49757         62324 $self->{col} = undef;
438 49757         83434 $self->{text} = undef;
439             }
440              
441             #pod =method start_row
442             #pod
443             #pod =cut
444              
445             sub start_row
446             {
447 3382     3382 1 4298 my $self = shift;
448 3382         4474 my ( $attr, $line ) = @_;
449              
450             # end last row if not explicitly ended
451 3382         6108 $self->end_row();
452              
453 3382         5339 $self->{row} = [];
454 3382         17888 $self->{line} = $line;
455             }
456              
457              
458             #pod =method end_row
459             #pod
460             #pod =cut
461              
462             sub end_row
463             {
464 4100     4100 1 4609 my $self = shift;
465              
466 4100 100       6612 return unless defined $self->{row};
467              
468             # perhaps an unfinished row. first finish column
469 3382         5775 $self->end_column();
470              
471             # if we're in a header, deal with overlapping cells differently
472             # then if we're in the data section
473 3382 100       4740 if ( $self->{in_hdr} )
474             {
475              
476 167         181 my $cn = 0;
477 167         175 foreach my $col ( @{$self->{row}} )
  167         312  
478             {
479             # do this just in case there are newlines and we're concatenating
480             # column names later on. causes strange problems. besides,
481             # column names should be regular
482 1630         2660 $col->{text} =~ s/^\s+//;
483 1630         2766 $col->{text} =~ s/\s+$//;
484              
485             # need to find the first undefined column
486 1630         3165 $cn++ while defined $self->{hdr}[$cn][$self->{hdr_row}];
487              
488             # note that header is stored as one array per column, not row!
489 1630         2601 for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
490             {
491 1911   100     3095 $self->{hdr}[$cn] ||= [];
492 1911         2831 $self->{hdr}[$cn][$self->{hdr_row}] = $col->{text};
493              
494             # put empty placeholders in the rest of the rows
495 1911         4268 for ( my $rnn = 1 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
496             {
497 235         635 $self->{hdr}[$cn][$rnn + $self->{hdr_row}] = '';
498             }
499             }
500             }
501              
502 167         228 $self->{hdr_row}++;
503             }
504             else
505             {
506 3215         3581 my $cn = 0;
507 3215         3595 foreach my $col ( @{$self->{row}} )
  3215         5386  
508             {
509             # need to find the first undefined column
510 48127         73484 $cn++ while defined $self->{data}[0][$cn];
511              
512 48127         75637 for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
513             {
514 48147         68368 for ( my $rnn = 0 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
515             {
516 48167   100     65574 $self->{data}[$rnn] ||= [];
517 48167         119788 $self->{data}[$rnn][$cn] = $col->{text};
518             }
519             }
520             }
521             }
522              
523             # if we're one row past the header, we're done with the header
524             $self->finish_header()
525 3382 50 66     9654 if ! $self->{in_hdr} && $self->{prev_hdr};
526              
527             # output the data if we're not in a header
528             $self->callback( 'row', $self->{line},
529 3215         6123 fix_texts( $self->{req}, shift @{$self->{data}} ) )
530 3382 100       6372 unless $self->{in_hdr};
531              
532 3382         25796 $self->{in_hdr} = 0;
533 3382         24314 $self->{row} = undef;
534             }
535              
536             # collect the possible multiple header rows into one array and
537             # send it off
538              
539             #pod =method finish_header
540             #pod
541             #pod =cut
542              
543             sub finish_header
544             {
545 208     208 1 245 my $self = shift;
546              
547 208 100       394 return unless $self->{hdr};
548              
549 1481 50       1652 my @header = map { join( ' ', grep { defined $_ && $_ ne '' } @{$_}) }
  2146         5667  
  1481         1719  
550 104         118 @{ $self->{hdr} };
  104         211  
551              
552             # if we're trying to match header columns, check that here.
553 104 100       228 if ( defined $self->{req} )
554             {
555 42         110 fix_texts( $self->{req}, \@header );
556 42         90 $self->callback( 'hdr', $self->{hdr_line}, \@header );
557             }
558              
559             else
560             {
561 62 100       144 if ( $self->match_hdr( @header ) )
562             {
563             # haven't done this callback yet...
564 34         85 $self->callback( 'start', $self->{start_line} );
565              
566 34         10155 fix_texts( $self->{req}, \@header );
567 34         88 $self->callback( 'hdr', $self->{hdr_line}, \@header );
568             }
569              
570             # no match. reach up to the controlling parser and turn off
571             # processing of this table. this is kind of kludgy!
572             else
573             {
574 28         76 $self->{parser}->process(0);
575             }
576             }
577              
578              
579 104         7172 $self->{hdr} = undef;
580 104         167 $self->{prev_hdr} = undef;
581 104         211 $self->{hdr_row} = 0;
582             }
583              
584             DESTROY
585             {
586 215     215   78058 my $self = shift;
587              
588             # if we're actually parsing this table, do something.
589 215 100       1761 if ( $self->{process} )
590             {
591             # just in case
592 104         235 $self->end_row();
593              
594             # just in case there's no table body
595 104         215 $self->finish_header();
596              
597 104         195 $self->callback( 'end', $self->{line} );
598             }
599             }
600              
601             #pod =method fix_texts
602             #pod
603             #pod =cut
604              
605             sub fix_texts
606             {
607 3346     3346 1 4929 my ( $req, $texts ) = @_;
608              
609 3346         4701 for ( @$texts )
610             {
611             local $HTML::Entities::entity2char{nbsp} =
612 49998         72156 $HTML::Entities::entity2char{nbsp};
613              
614             $HTML::Entities::entity2char{nbsp} = ' '
615 49998 50       68636 if $req->{DecodeNBSP};
616              
617             chomp $_
618 49998 100       66286 if $req->{Chomp};
619              
620             HTML::Entities::decode_entities( $_ )
621 49998 100       109550 if $req->{Decode};
622              
623              
624 49998 100       80874 if ( $req->{Trim} )
625             {
626 6529         10235 s/^\s+//;
627 6529         15254 s/\s+$//;
628             }
629             }
630              
631 3346         7606 $texts;
632             }
633              
634             #pod =method text
635             #pod
636             #pod =cut
637              
638             sub text
639             {
640 62098     62098 1 73203 my $self = shift;
641              
642 62098         216713 $self->{text} = shift;
643             }
644              
645             #pod =method id
646             #pod
647             #pod =cut
648              
649 0     0 1 0 sub id { $_[0]->{id} }
650              
651             #pod =method ids
652             #pod
653             #pod =cut
654              
655 122     122 1 463 sub ids { $_[0]->{ids} }
656              
657              
658             #pod =method process
659             #pod
660             #pod =cut
661              
662 244     244 1 672 sub process { $_[0]->{process} }
663              
664             1;
665              
666             #
667             # This file is part of HTML-TableParser
668             #
669             # This software is Copyright (c) 2018 by Smithsonian Astrophysical Observatory.
670             #
671             # This is free software, licensed under:
672             #
673             # The GNU General Public License, Version 3, June 2007
674             #
675              
676             =pod
677              
678             =head1 NAME
679              
680             HTML::TableParser::Table - support class for HTML::TableParser
681              
682             =head1 VERSION
683              
684             version 0.43
685              
686             =head1 DESCRIPTION
687              
688             This class is used to keep track of information related to a table and
689             to create the information passed back to the user callbacks. It is in
690             charge of marshalling the massaged header and row data to the user
691             callbacks.
692              
693             An instance is created when the controlling TableParser class finds a
694             C< tag. The object is given an id based upon which table it is , C
695             to work on. Its methods are invoked from the TableParser callbacks
696             when they run across an appropriate tag (C
, C). The
697             object is destroyed when the matching C
tag is found.
698              
699             Since tables may be nested, multiple B
700             objects may exist simultaneously. B uses two
701             pieces of information held by this class -- ids and process. The
702             first is an array of table ids, one element per level of table
703             nesting. The second is a flag indicating whether this table is being
704             processed (i.e. it matches a requested table) or being ignored. Since
705             B uses the ids information from an existing table
706             to initialize a new table, it first creates an empty sentinel (place
707             holder) table (by calling the B constructor
708             with no arguments).
709              
710             The class handles missing C
711             (especially when handling multi-row headers) user callbacks may
712             be slightly delayed (and data cached). It also handles rows
713             with overlapping columns
714              
715             =head1 METHODS
716              
717             =head2 new
718              
719             =head2 match_id
720              
721             =head2 req_match_id
722              
723             =head2 req_match_cols
724              
725             =head2 match_hdr
726              
727             =head2 match_req
728              
729             =head2 callback
730              
731             =head2 start_header
732              
733             =head2 end_header
734              
735             =head2 start_column
736              
737             =head2 end_column
738              
739             =head2 start_row
740              
741             =head2 end_row
742              
743             =head2 finish_header
744              
745             =head2 fix_texts
746              
747             =head2 text
748              
749             =head2 id
750              
751             =head2 ids
752              
753             =head2 process
754              
755             =head1 BUGS
756              
757             Please report any bugs or feature requests on the bugtracker website
758             L or by
759             email to
760             L.
761              
762             When submitting a bug or request, please include a test-file or a
763             patch to an existing test-file that illustrates the bug or desired
764             feature.
765              
766             =head1 SOURCE
767              
768             The development version is on github at L
769             and may be cloned from L
770              
771             =head1 SEE ALSO
772              
773             Please see those modules/websites for more information related to this module.
774              
775             =over 4
776              
777             =item *
778              
779             L
780              
781             =back
782              
783             =head1 AUTHOR
784              
785             Diab Jerius
786              
787             =head1 COPYRIGHT AND LICENSE
788              
789             This software is Copyright (c) 2018 by Smithsonian Astrophysical Observatory.
790              
791             This is free software, licensed under:
792              
793             The GNU General Public License, Version 3, June 2007
794              
795             =cut
796              
797             __END__