File Coverage

blib/lib/SystemC/Vregs/Input/TableExtract.pm

Criterion	Covered	Total	%
statement	39	177	22.0
branch	2	64	3.1
condition	0	27	0.0
subroutine	7	13	53.8
pod	5	7	71.4
total	53	288	18.4

line	stmt	bran	cond	sub	pod	time	code
1							# See copyright, etc in below POD section.
2							######################################################################
3
4							package SystemC::Vregs::Input::TableExtract;
5
6	4			4		28643	use base qw(HTML::TableExtract);
	4					6
	4					10219
7							$VERSION = '1.470';
8
9	4			4		66241	use strict;
	4					12
	4					132
10	4			4		27	use vars qw($Debug %Find_Start_Headers %Find_Headers);
	4					21
	4					226
11	4			4		23	use IO::File;
	4					8
	4					709
12	4			4		23	use HTML::TableExtract;
	4					6
	4					32
13	4			4		150	use HTML::Entities ();
	4					8
	4					10641
14
15							%Find_Start_Headers = (Class=>1,
16							Package=>1,
17							Defines=>1,
18							Enum=>1,
19							Register=>1,
20							Vregs_End_Of_Decl=>1,
21							);
22							%Find_Headers = (%Find_Start_Headers,
23							Address=>1,
24							Attributes=>1,
25							);
26
27							######################################################################
28							#### Parsing
29
30							sub parse_file {
31	0			0	1	0	my $self = shift;
32	0					0	my $filename = shift;
33
34	0	0				0	print "parse_file $filename\n" if $Debug;
35	0	0				0	my $fh = IO::File->new ($filename) or die "%Error: $! $filename\n";
36	0					0	$self->{_fh} = $fh;
37	0					0	$self->{_vregs_filename} = $filename;
38	0					0	$self->{_vregs_num_tables} = -1;
39	0					0	$self->{_vregs_line} = 1;
40	0					0	$self->{_format} = ''; #latex2html
41
42							# Track line numbers
43	0					0	$self->handler(start => "start", "self,tagname,attr,attrseq,text,line");
44
45							# I've had lots of problems with parse_file missing stuff and not handling
46							# chunks breaking in the middle of HTML objects. So, we read it ourself.
47							# $self->SUPER::parse_file ($fh);
48	0					0	$self->parse(join('',$fh->getlines()));
49
50	0					0	$self->eof();
51	0					0	$self->start("p");
52	0					0	$self->text("Vregs_End_Of_Decl");
53	0					0	$fh->close();
54	0	0				0	print "parse_file DONE\n" if $Debug;
55							}
56
57							sub clean_html_text {
58	0			0	0	0	$_ = shift;
59							# HTML escapes to normal Latin1 ASCII
60	0					0	$_ = HTML::Entities::decode($_);
61							# Hack: Decode two-byte UTF-8 chars without checking that the HTML body
62							# uses UTF-8 encoding. wvHtml writes HTML in UTF-8 encoding when the
63							# .doc file comes from StarOffice7.
64	0					0	s{[\302\303].}{
65	0					0	my ($a, $b) = unpack("CC", $&);
66	0	0				0	(($b & 0xC0) != 0x80 ? $& # Not a 2-byte UTF-8 char.
67							: pack("C", ($a & 0x3)<<6 \| ($b & 0x3f) ))
68							}eg;
69							# Latin1 decoding
70							# Substituting 2 or 3 periods for 'elipses' causes Vspecs to truncate
71							# the field description at the periods, so use dashes.
72	0					0	s/\205/-/g; # ISO-Latin1 horizontal elipses (0x85)
73	0					0	s/\221/\'/g; # ISO-Latin1 left single-quote
74	0					0	s/\222/\'/g; # ISO-Latin1 right single-quote
75	0					0	s/\223/\"/g; # ISO-Latin1 left double-quote
76	0					0	s/\224/\"/g; # ISO-Latin1 right double-quote
77	0					0	s/\225/\*/g; # ISO-Latin1 bullet
78	0					0	s/\226/-/g; # ISO-Latin1 en-dash (0x96)
79	0					0	s/\227/-/g; # ISO-Latin1 em-dash (0x97)
80	0					0	s/\230/~/g; # ISO-Latin1 small tilde
81	0					0	s/\233/>/g; # ISO-Latin1 single right angle quote
82	0					0	s/\240/ /g; # ISO-Latin1 nonbreaking space
83	0					0	s/\246/\\|/g; # ISO-Latin1 broken vertical bar
84	0					0	s/\255//g; # ISO-Latin1 soft hyphen (0xAD)
85	0					0	s/\264/\'/g; # ISO-Latin1 spacing acute
86	0					0	s/\267/\*/g; # ISO-Latin1 middle dot (0xB7)
87	0					0	s/\327/\*/g;# x multiplication
88
89							# These are automatically removed by HTML::Entities in Perl 5.7 and greater
90							# Also decode unicode sequences decoded by perl 5.8.*
91	0					0	s/\&\#8209;/-/g; # ISOpub nonbreaking hyphen (U+2011, ‑)
92	0					0	s/\&\#8211;/-/g; s/\–/-/g; s/\342\200\041/\-/g; # ISOpub en-dash (U+2013, –)
	0					0
	0					0
93	0					0	s/\&\#8212;/-/g; s/\—/-/g; s/\342\200\042/\-/g; # ISOpub em-dash (U+2014, —)
	0					0
	0					0
94	0					0	s/\&\#8216;/\'/g; s/\‘/\'/g; s/\342\200\176/\`/g; # ISOnum left single-quote (U+2018, ‘)
	0					0
	0					0
95	0					0	s/\&\#8217;/\'/g; s/\’/\'/g; s/\342\200\177/\`/g; # ISOnum right single-quote (U+2019, ’)
	0					0
	0					0
96	0					0	s/\&\#8230;/-/g; s/\…/-/g; s/\342\200\174/-/g; # ISOpub horizontal elipses (U+2026, …)
	0					0
	0					0
97	0					0	; s/(\342\200\230\|\x{2018})/\`/g; # ISOnum left single-quote (U+2018, ‘)
98	0					0	; s/(\342\200\231\|\x{2019})/\`/g; # ISOnum right single-quote (U+2019, ’)
99	0					0	s/\&\#8220;/\"/g; s/\“/\"/g; s/(\342\200\234\|\x{201c})/\"/g; # ISOnum left double-quote (U+201C, “)
	0					0
	0					0
100	0					0	s/\&\#8221;/\"/g; s/\”/\"/g; s/(\342\200\235\|\x{201d})/\"/g; # ISOnum right double-quote (U+201D, ”)
	0					0
	0					0
101	0					0	; s/(\342\210\222\|\x{2212})/-/g; # ISOpub horizontal elipses (U+2026, …)
102
103							# Compress spacing
104	0					0	s/\`/\'/g;
105	0					0	s/[\t\n\r ]+/ /g;
106	0					0	s/^ *//g;
107	0					0	return $_;
108							}
109
110							sub start {
111	0			0	1	0	my $self = shift;
112	0	0	0			0	$self->{_vregs_line} = $_[4] if $_[4] && !$self->{_vregs_forced_line}; # As we have a handler requesting it
113	0	0	0			0	if ($_[0] eq 'p' \|\| $_[0] =~ /^h[0-9]/) {
114							#print " \n" if $Debug;
115	0					0	$self->{_vregs_first_word_in_p} = 1;
116	0	0	0			0	if ($self->{_format} eq 'latex2html' # Latex2html doesn't do any 's
			0
117							&& $self->{_vregs_next_tag}
118							&& $self->{_vregs_next}{$self->{_vregs_next_tag}} ne "") {
119	0					0	$self->{_vregs_next_tag} = undef;
120							}
121							}
122	0	0	0			0	if ($_[0] eq 'meta' && $_[1]->{content} && $_[1]->{content} =~ /latex2html/i) {
			0
123	0					0	$self->{_format} = 'latex2html';
124	0	0				0	print "Format = latex2html\n" if $Debug;
125							}
126	0					0	$self->SUPER::start (@_);
127							}
128
129							sub comment {
130	0			0	1	0	my $self = shift;
131	0					0	my $text = shift;
132	0	0				0	if ($text =~ /^\sline\s+(\d+)\s+"(.)"/) {
133	0					0	$self->{_vregs_forced_line} = $1;
134	0					0	$self->{_vregs_line} = $1;
135	0					0	$self->{_vregs_filename} = $2;
136							}
137							}
138
139							sub end {
140	0			0	1	0	my $self = shift;
141	0	0				0	if ($_[0] eq 'p') {
142	0	0				0	print "<\/$_[0]>\n" if $Debug;
143	0	0				0	if (!$self->{_vregs_first_endp}) {
144	0					0	$self->{_vregs_first_endp} = 1;
145							} else {
146	0					0	$self->{_vregs_next_tag} = undef;
147							}
148							}
149	0					0	$self->SUPER::end (@_);
150							}
151
152							sub text {
153	0			0	1	0	my $self = shift;
154							# Don't bother if we are in a table
155	0	0				0	if (! $self->{_in_a_table}) {
156	0					0	my $text = $_[0];
157	0	0				0	if ($Debug) {
158	0					0	(my $printtext = $text) =~ s/[\n \t]+/ /g;
159	0	0				0	printf +("Text %s: %s:\|%s\|\n", $self->{_vregs_line},
160							($self->{_vregs_first_word_in_p}?"FW":" "),$printtext);
161							}
162	0					0	$text = clean_html_text($text);
163	0	0	0			0	if ($text !~ /^\s*$/
164							&& $text !~ /^
165	0					0	my $nosp_text = $text;
166	0	0				0	if ($self->{_vregs_first_word_in_p}) {
167	0					0	$nosp_text =~ s/^\s+//; $nosp_text =~ s/\s+$//;
	0					0
168							# Per W3C HTML spec, "SGML specifies that a line break immediately
169							# following a start tag must be ignored, as must a line break
170							# immediately before an end tag. This applies to all HTML elements
171							# without exception."
172							# And, "For all HTML elements except PRE, sequences of white space
173							# separate words."
174							# Sadly, the HTML parser did not take care of this for us.
175	0					0	$text = $nosp_text;
176							}
177	0	0	0			0	if ($self->{_vregs_first_word_in_p}
		0
178							&& $Find_Headers{$nosp_text}) {
179	0	0				0	print "Keyword '$text'\n" if $Debug;
180	0	0	0			0	if ($Find_Start_Headers{$text}
181							&& $self->{_vregs_next}) {
182							# Process previous entry, and prepare for next discovery
183	0					0	my $inp = $self->{_vregs_inp};
184	0					0	$inp->new_item([], $self->{_vregs_next}); # note no table
185	0					0	$self->{_vregs_next} = undef;
186							}
187	0					0	$self->{_vregs_next}{at} = ($self->{_vregs_filename}.":".$self->{_vregs_line});
188	0					0	$self->{_vregs_next_tag} = $text;
189	0					0	$self->{_vregs_next}{$self->{_vregs_next_tag}} = "";
190	0					0	$self->{_vregs_first_endp} = 0;
191	0	0				0	$self->{_vregs_next} = undef if $text eq 'Vregs_End_Of_Decl';
192	0	0				0	$self->{_vregs_next_tag} = undef if $text eq 'Vregs_End_Of_Decl';
193							}
194							elsif ($self->{_vregs_next_tag}) {
195	0	0				0	print "TAG '$self->{_vregs_next_tag}' '$text'\n" if $Debug;
196	0					0	$self->{_vregs_next}{$self->{_vregs_next_tag}} .= $text;
197							}
198							}
199	0	0				0	$self->{_vregs_first_word_in_p} = 0 if ($text !~ /^\s*$/);
200	0					0	my @tables = $self->table_states();
201	0					0	my $numtables = ();
202	0	0				0	if ($#tables != $self->{_vregs_num_tables}) {
203	0					0	$self->{_vregs_num_tables} = $#tables;
204	0	0				0	if ($self->{_vregs_next}) { # else a table outside of a register decl
205	0					0	my $table = $tables[$#tables];
206							# Clean up the table to remove extra spaces in the tables
207	0					0	my @bittable = ($table->rows);
208	0					0	foreach my $row (@bittable) {
209	0	0				0	@$row = map {
210	0					0	$_ = '' if !defined $_; # TableExtract may return '' if empty column
211	0					0	$_ = clean_html_text($_);
212							# The below isn't quite right, as a < will become a < by the extractor
213	0					0	$_ =~ s/<[^>]+>//ig; #$_="" if /SupportEmptyParas/i; # Microsloth Word junk
214	0					0	$_ =~ s/^[ \t\n\r]+//sig;
215	0					0	$_ =~ s/[ \t\n\r]+$//sig;
216	0					0	$_;
217							} @$row;
218							}
219							# Process this, and prepare for next discovery
220	0					0	my $inp = $self->{_vregs_inp};
221	0	0				0	print ::Dumper(\@bittable, $self->{_vregs_next}) if $Debug;
222	0					0	$inp->new_item(\@bittable, $self->{_vregs_next});
223	0					0	$self->{_vregs_next} = undef;
224	0					0	$self->{_vregs_next_tag} = undef;
225							}
226							}
227							}
228	0					0	$self->SUPER::text (@_);
229							}
230
231							######################################################################
232							#### HTML cleaning
233
234							sub clean_html_file {
235	1			1	0	103	my $filename = shift;
236							# Word puts so much XML &*(#@$ in the file... Strip it so it's more obvious.
237
238	1					2	my $wholefile;
239							{
240	1	50				2	my $fh = IO::File->new($filename) or die "%Error: $! $filename\n";
	1					11
241	1					112	local $/;
242	1					2	undef $/;
243	1					116	$wholefile = <$fh>;
244	1					10	$fh->close;
245							}
246
247	1					61	$wholefile =~ s%\r%%smg;
248	1					40	$wholefile =~ s%%%smg;
249	1					28	$wholefile =~ s%[ \n]style='.*?'%%smg;
250	1					94	$wholefile =~ s%[ \n]style=".*?"%%smg;
251	1					28	$wholefile =~ s% border=0 % border=1 %smg;
252	1					44	$wholefile =~ s%<\/?span\s*>%%smg;
253	1					47	$wholefile =~ s%<\/?o:p>%%smg;
254	1					33	$wholefile =~ s% width=\d+ % %smg;
255	1					12	$wholefile =~ s%\ %%smg;
256
257							# Microsoft Word changebars
258	1					16	$wholefile =~ s%\[Author\s+\S+:\s+at\s+\S+\s+\S+\s+\d+\s+\S+\s+\d+\s\]\s%%smg;
259
260	1	50				6	my $fh = IO::File->new($filename,"w") or die "%Error: $! writing $filename\n";
261	1					348	print $fh $wholefile;
262	1					5	$fh->close;
263							}
264
265							######################################################################
266							#### Package return
267							1;
268							__END__