File Coverage

blib/lib/SystemC/Vregs/Input/TableExtract.pm
Criterion Covered Total %
statement 39 177 22.0
branch 2 64 3.1
condition 0 27 0.0
subroutine 7 13 53.8
pod 5 7 71.4
total 53 288 18.4


line stmt bran cond sub pod time code
1             # See copyright, etc in below POD section.
2             ######################################################################
3              
4             package SystemC::Vregs::Input::TableExtract;
5              
6 4     4   28643 use base qw(HTML::TableExtract);
  4         6  
  4         10219  
7             $VERSION = '1.470';
8              
9 4     4   66241 use strict;
  4         12  
  4         132  
10 4     4   27 use vars qw($Debug %Find_Start_Headers %Find_Headers);
  4         21  
  4         226  
11 4     4   23 use IO::File;
  4         8  
  4         709  
12 4     4   23 use HTML::TableExtract;
  4         6  
  4         32  
13 4     4   150 use HTML::Entities ();
  4         8  
  4         10641  
14              
15             %Find_Start_Headers = (Class=>1,
16             Package=>1,
17             Defines=>1,
18             Enum=>1,
19             Register=>1,
20             Vregs_End_Of_Decl=>1,
21             );
22             %Find_Headers = (%Find_Start_Headers,
23             Address=>1,
24             Attributes=>1,
25             );
26              
27             ######################################################################
28             #### Parsing
29              
30             sub parse_file {
31 0     0 1 0 my $self = shift;
32 0         0 my $filename = shift;
33              
34 0 0       0 print "parse_file $filename\n" if $Debug;
35 0 0       0 my $fh = IO::File->new ($filename) or die "%Error: $! $filename\n";
36 0         0 $self->{_fh} = $fh;
37 0         0 $self->{_vregs_filename} = $filename;
38 0         0 $self->{_vregs_num_tables} = -1;
39 0         0 $self->{_vregs_line} = 1;
40 0         0 $self->{_format} = ''; #latex2html
41              
42             # Track line numbers
43 0         0 $self->handler(start => "start", "self,tagname,attr,attrseq,text,line");
44              
45             # I've had lots of problems with parse_file missing stuff and not handling
46             # chunks breaking in the middle of HTML objects. So, we read it ourself.
47             # $self->SUPER::parse_file ($fh);
48 0         0 $self->parse(join('',$fh->getlines()));
49              
50 0         0 $self->eof();
51 0         0 $self->start("p");
52 0         0 $self->text("Vregs_End_Of_Decl");
53 0         0 $fh->close();
54 0 0       0 print "parse_file DONE\n" if $Debug;
55             }
56              
57             sub clean_html_text {
58 0     0 0 0 $_ = shift;
59             # HTML escapes to normal Latin1 ASCII
60 0         0 $_ = HTML::Entities::decode($_);
61             # Hack: Decode two-byte UTF-8 chars without checking that the HTML body
62             # uses UTF-8 encoding. wvHtml writes HTML in UTF-8 encoding when the
63             # .doc file comes from StarOffice7.
64 0         0 s{[\302\303].}{
65 0         0 my ($a, $b) = unpack("CC", $&);
66 0 0       0 (($b & 0xC0) != 0x80 ? $& # Not a 2-byte UTF-8 char.
67             : pack("C", ($a & 0x3)<<6 | ($b & 0x3f) ))
68             }eg;
69             # Latin1 decoding
70             # Substituting 2 or 3 periods for 'elipses' causes Vspecs to truncate
71             # the field description at the periods, so use dashes.
72 0         0 s/\205/-/g; # ISO-Latin1 horizontal elipses (0x85)
73 0         0 s/\221/\'/g; # ISO-Latin1 left single-quote
74 0         0 s/\222/\'/g; # ISO-Latin1 right single-quote
75 0         0 s/\223/\"/g; # ISO-Latin1 left double-quote
76 0         0 s/\224/\"/g; # ISO-Latin1 right double-quote
77 0         0 s/\225/\*/g; # ISO-Latin1 bullet
78 0         0 s/\226/-/g; # ISO-Latin1 en-dash (0x96)
79 0         0 s/\227/-/g; # ISO-Latin1 em-dash (0x97)
80 0         0 s/\230/~/g; # ISO-Latin1 small tilde
81 0         0 s/\233/>/g; # ISO-Latin1 single right angle quote
82 0         0 s/\240/ /g; # ISO-Latin1 nonbreaking space
83 0         0 s/\246/\|/g; # ISO-Latin1 broken vertical bar
84 0         0 s/\255//g; # ISO-Latin1 soft hyphen (0xAD)
85 0         0 s/\264/\'/g; # ISO-Latin1 spacing acute
86 0         0 s/\267/\*/g; # ISO-Latin1 middle dot (0xB7)
87 0         0 s/\327/\*/g;# x multiplication
88              
89             # These are automatically removed by HTML::Entities in Perl 5.7 and greater
90             # Also decode unicode sequences decoded by perl 5.8.*
91 0         0 s/\&\#8209;/-/g; # ISOpub nonbreaking hyphen (U+2011, ‑)
92 0         0 s/\&\#8211;/-/g; s/\–/-/g; s/\342\200\041/\-/g; # ISOpub en-dash (U+2013, –)
  0         0  
  0         0  
93 0         0 s/\&\#8212;/-/g; s/\—/-/g; s/\342\200\042/\-/g; # ISOpub em-dash (U+2014, —)
  0         0  
  0         0  
94 0         0 s/\&\#8216;/\'/g; s/\‘/\'/g; s/\342\200\176/\`/g; # ISOnum left single-quote (U+2018, ‘)
  0         0  
  0         0  
95 0         0 s/\&\#8217;/\'/g; s/\’/\'/g; s/\342\200\177/\`/g; # ISOnum right single-quote (U+2019, ’)
  0         0  
  0         0  
96 0         0 s/\&\#8230;/-/g; s/\…/-/g; s/\342\200\174/-/g; # ISOpub horizontal elipses (U+2026, …)
  0         0  
  0         0  
97 0         0 ; s/(\342\200\230|\x{2018})/\`/g; # ISOnum left single-quote (U+2018, ‘)
98 0         0 ; s/(\342\200\231|\x{2019})/\`/g; # ISOnum right single-quote (U+2019, ’)
99 0         0 s/\&\#8220;/\"/g; s/\“/\"/g; s/(\342\200\234|\x{201c})/\"/g; # ISOnum left double-quote (U+201C, “)
  0         0  
  0         0  
100 0         0 s/\&\#8221;/\"/g; s/\”/\"/g; s/(\342\200\235|\x{201d})/\"/g; # ISOnum right double-quote (U+201D, ”)
  0         0  
  0         0  
101 0         0 ; s/(\342\210\222|\x{2212})/-/g; # ISOpub horizontal elipses (U+2026, …)
102              
103             # Compress spacing
104 0         0 s/\`/\'/g;
105 0         0 s/[\t\n\r ]+/ /g;
106 0         0 s/^ *//g;
107 0         0 return $_;
108             }
109              
110             sub start {
111 0     0 1 0 my $self = shift;
112 0 0 0     0 $self->{_vregs_line} = $_[4] if $_[4] && !$self->{_vregs_forced_line}; # As we have a handler requesting it
113 0 0 0     0 if ($_[0] eq 'p' || $_[0] =~ /^h[0-9]/) {
114             #print "

\n" if $Debug;

115 0         0 $self->{_vregs_first_word_in_p} = 1;
116 0 0 0     0 if ($self->{_format} eq 'latex2html' # Latex2html doesn't do any

's
      0        
117             && $self->{_vregs_next_tag}
118             && $self->{_vregs_next}{$self->{_vregs_next_tag}} ne "") {
119 0         0 $self->{_vregs_next_tag} = undef;
120             }
121             }
122 0 0 0     0 if ($_[0] eq 'meta' && $_[1]->{content} && $_[1]->{content} =~ /latex2html/i) {
      0        
123 0         0 $self->{_format} = 'latex2html';
124 0 0       0 print "Format = latex2html\n" if $Debug;
125             }
126 0         0 $self->SUPER::start (@_);
127             }
128              
129             sub comment {
130 0     0 1 0 my $self = shift;
131 0         0 my $text = shift;
132 0 0       0 if ($text =~ /^\s*line\s+(\d+)\s+"(.*)"/) {
133 0         0 $self->{_vregs_forced_line} = $1;
134 0         0 $self->{_vregs_line} = $1;
135 0         0 $self->{_vregs_filename} = $2;
136             }
137             }
138              
139             sub end {
140 0     0 1 0 my $self = shift;
141 0 0       0 if ($_[0] eq 'p') {
142 0 0       0 print "<\/$_[0]>\n" if $Debug;
143 0 0       0 if (!$self->{_vregs_first_endp}) {
144 0         0 $self->{_vregs_first_endp} = 1;
145             } else {
146 0         0 $self->{_vregs_next_tag} = undef;
147             }
148             }
149 0         0 $self->SUPER::end (@_);
150             }
151              
152             sub text {
153 0     0 1 0 my $self = shift;
154             # Don't bother if we are in a table
155 0 0       0 if (! $self->{_in_a_table}) {
156 0         0 my $text = $_[0];
157 0 0       0 if ($Debug) {
158 0         0 (my $printtext = $text) =~ s/[\n \t]+/ /g;
159 0 0       0 printf +("Text %s: %s:|%s|\n", $self->{_vregs_line},
160             ($self->{_vregs_first_word_in_p}?"FW":" "),$printtext);
161             }
162 0         0 $text = clean_html_text($text);
163 0 0 0     0 if ($text !~ /^\s*$/
164             && $text !~ /^
165 0         0 my $nosp_text = $text;
166 0 0       0 if ($self->{_vregs_first_word_in_p}) {
167 0         0 $nosp_text =~ s/^\s+//; $nosp_text =~ s/\s+$//;
  0         0  
168             # Per W3C HTML spec, "SGML specifies that a line break immediately
169             # following a start tag must be ignored, as must a line break
170             # immediately before an end tag. This applies to all HTML elements
171             # without exception."
172             # And, "For all HTML elements except PRE, sequences of white space
173             # separate words."
174             # Sadly, the HTML parser did not take care of this for us.
175 0         0 $text = $nosp_text;
176             }
177 0 0 0     0 if ($self->{_vregs_first_word_in_p}
    0          
178             && $Find_Headers{$nosp_text}) {
179 0 0       0 print "Keyword '$text'\n" if $Debug;
180 0 0 0     0 if ($Find_Start_Headers{$text}
181             && $self->{_vregs_next}) {
182             # Process previous entry, and prepare for next discovery
183 0         0 my $inp = $self->{_vregs_inp};
184 0         0 $inp->new_item([], $self->{_vregs_next}); # note no table
185 0         0 $self->{_vregs_next} = undef;
186             }
187 0         0 $self->{_vregs_next}{at} = ($self->{_vregs_filename}.":".$self->{_vregs_line});
188 0         0 $self->{_vregs_next_tag} = $text;
189 0         0 $self->{_vregs_next}{$self->{_vregs_next_tag}} = "";
190 0         0 $self->{_vregs_first_endp} = 0;
191 0 0       0 $self->{_vregs_next} = undef if $text eq 'Vregs_End_Of_Decl';
192 0 0       0 $self->{_vregs_next_tag} = undef if $text eq 'Vregs_End_Of_Decl';
193             }
194             elsif ($self->{_vregs_next_tag}) {
195 0 0       0 print "TAG '$self->{_vregs_next_tag}' '$text'\n" if $Debug;
196 0         0 $self->{_vregs_next}{$self->{_vregs_next_tag}} .= $text;
197             }
198             }
199 0 0       0 $self->{_vregs_first_word_in_p} = 0 if ($text !~ /^\s*$/);
200 0         0 my @tables = $self->table_states();
201 0         0 my $numtables = ();
202 0 0       0 if ($#tables != $self->{_vregs_num_tables}) {
203 0         0 $self->{_vregs_num_tables} = $#tables;
204 0 0       0 if ($self->{_vregs_next}) { # else a table outside of a register decl
205 0         0 my $table = $tables[$#tables];
206             # Clean up the table to remove extra spaces in the tables
207 0         0 my @bittable = ($table->rows);
208 0         0 foreach my $row (@bittable) {
209 0 0       0 @$row = map {
210 0         0 $_ = '' if !defined $_; # TableExtract may return '' if empty column
211 0         0 $_ = clean_html_text($_);
212             # The below isn't quite right, as a < will become a < by the extractor
213 0         0 $_ =~ s/<[^>]+>//ig; #$_="" if /SupportEmptyParas/i; # Microsloth Word junk
214 0         0 $_ =~ s/^[ \t\n\r]+//sig;
215 0         0 $_ =~ s/[ \t\n\r]+$//sig;
216 0         0 $_;
217             } @$row;
218             }
219             # Process this, and prepare for next discovery
220 0         0 my $inp = $self->{_vregs_inp};
221 0 0       0 print ::Dumper(\@bittable, $self->{_vregs_next}) if $Debug;
222 0         0 $inp->new_item(\@bittable, $self->{_vregs_next});
223 0         0 $self->{_vregs_next} = undef;
224 0         0 $self->{_vregs_next_tag} = undef;
225             }
226             }
227             }
228 0         0 $self->SUPER::text (@_);
229             }
230              
231             ######################################################################
232             #### HTML cleaning
233              
234             sub clean_html_file {
235 1     1 0 103 my $filename = shift;
236             # Word puts so much XML &*(#@$ in the file... Strip it so it's more obvious.
237              
238 1         2 my $wholefile;
239             {
240 1 50       2 my $fh = IO::File->new($filename) or die "%Error: $! $filename\n";
  1         11  
241 1         112 local $/;
242 1         2 undef $/;
243 1         116 $wholefile = <$fh>;
244 1         10 $fh->close;
245             }
246              
247 1         61 $wholefile =~ s%\r%%smg;
248 1         40 $wholefile =~ s%%%smg;
249 1         28 $wholefile =~ s%[ \n]style='.*?'%%smg;
250 1         94 $wholefile =~ s%[ \n]style=".*?"%%smg;
251 1         28 $wholefile =~ s% border=0 % border=1 %smg;
252 1         44 $wholefile =~ s%<\/?span\s*>%%smg;
253 1         47 $wholefile =~ s%<\/?o:p>%%smg;
254 1         33 $wholefile =~ s% width=\d+ % %smg;
255 1         12 $wholefile =~ s%%%smg;
256              
257             # Microsoft Word changebars
258 1         16 $wholefile =~ s%\[Author\s+\S+:\s+at\s+\S+\s+\S+\s+\d+\s+\S+\s+\d+\s*\]\s*%%smg;
259              
260 1 50       6 my $fh = IO::File->new($filename,"w") or die "%Error: $! writing $filename\n";
261 1         348 print $fh $wholefile;
262 1         5 $fh->close;
263             }
264              
265             ######################################################################
266             #### Package return
267             1;
268             __END__