File Coverage

blib/lib/Regexp/IgnoreTextCharacteristicsHTML.pm

Criterion	Covered	Total	%
statement	70	80	87.5
branch	18	26	69.2
condition	4	9	44.4
subroutine	7	8	87.5
pod	6	7	85.7
total	105	130	80.7

line	stmt	bran	cond	sub	pod	time	code
1							package Regexp::IgnoreTextCharacteristicsHTML;
2	1			1		1470	use Regexp::Ignore;
	1					3
	1					1065
3							our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class
4
5							########################
6							# new
7							########################
8							sub new {
9	12			12	1	7051	my $proto = shift;
10	12		33			85	my $class = ref($proto) \|\| $proto;
11	12					75	my $self = $class->SUPER::new(@_);
12	12					29	$self->{IGNORE_HTML_REMARKS} = 1; # by default it ignores html remarks
13	12					23	$self->{IGNORE_WORD_REMARKS} = 1; # by default it ignores word remarks
14							# the tags to be ignored
15	12					215	$self->{IGNORE_TAGS} = { B => 1,
16							BASEFONT => 1,
17							BIG => 1,
18							BLINK => 1,
19							CITE => 1,
20							CODE => 1,
21							EM => 1,
22							FONT => 1,
23							I => 1,
24							KBD => 1,
25							PLAINTEXT => 1,
26							S => 1,
27							SMALL => 1,
28							STRIKE => 1,
29							STRONG => 1,
30							SUB => 1,
31							SUP => 1,
32							TT => 1,
33							U => 1,
34							VAR => 1,
35							A => 1,
36							SPAN => 1,
37							WBR => 1 };
38	12					40	$self->build_regular_expressions();
39	12					126	return $self;
40							} # of new
41
42							############################
43							# build_regular_expressions
44							############################
45							sub build_regular_expressions {
46	36			36	0	50	my $self = shift;
47
48							# the regular first expression will try to match:
49							# - HTML remarks - all the remark will be matched. this will
50							# clean out all the special tags of MSWord (that comes inside
51							# remarks)
52							# - MSWord remarks - starting with
53							# - HTML tags
54	36					52	my $re1 = '(<\/?[^\>]*?>)';
55	36	100				83	if ($self->{IGNORE_WORD_REMARKS}) {
56	24					42	$re1 = '(<\!\[[^\]]*?\]>)\|'.$re1;
57							}
58	36	100				85	if ($self->{IGNORE_HTML_REMARKS}) {
59	30					60	$re1 = '(<\!\-\-.+?\-\->)\|'.$re1;
60							}
61	36					899	$self->{RE1} = qr/$re1/is;
62
63							# if the tag that we found is one of the following, it is unwanted
64							# token.
65	36					97	my $re2 = "";
66	36	100				79	if ($self->{IGNORE_HTML_REMARKS}) {
67	30					41	$re2 = '(<\!\-\-.+?\-\->)\|';
68							}
69	36	100				71	if ($self->{IGNORE_WORD_REMARKS}) {
70	24					37	$re2 .= '(<\!\[[^\]]?\]>)\|<\/?\s[OVWXP]\:[^>]*?>\|';
71							}
72	36					71	foreach my $tag ($self->tags_to_ignore()) {
73	828					1114	$re2 .= '<\/?\s'.$tag.'(\s[^>]?>\|\s*>)\|';
74							}
75
76	36					101	chop($re2);
77	36					6130	$self->{RE2} = qr/$re2/is;
78							} # of build_regular_expressions
79
80							#####################
81							# do_not_ignore
82							#####################
83							sub do_not_ignore {
84	0			0	1	0	my $self = shift;
85	0					0	while (@_) {
86	0					0	my $tag = shift;
87	0	0				0	if (exists($self->{IGNORE_TAGS}{uc($tag)})) {
88	0					0	$self->{IGNORE_TAGS}{uc($tag)} = 0;
89							}
90							}
91	0					0	$self->build_regular_expressions();
92							} # of do_not_ignore
93
94							#####################
95							# tags_to_ignore
96							#####################
97							sub tags_to_ignore {
98	36			36	1	53	my $self = shift;
99	36					60	my $changed = 0;
100	36					91	while (@_) {
101	0					0	my $tag = shift;
102	0					0	$changed = 1;
103	0					0	$self->{IGNORE_TAGS}{uc($tag)} = 1;
104							}
105	36	50				69	if ($changed) {
106	0					0	$self->build_regular_expressions();
107							}
108	36	50				85	return unless defined (wantarray); # void context, do nothing
109	36					55	my @tags_to_ignore = ();
110	36					9869	foreach my $tag (keys(% { $self->{IGNORE_TAGS} })) {
	36					219
111	828	50				1604	if ($self->{IGNORE_TAGS}{$tag}) {
112	828					1102	push(@tags_to_ignore, $tag);
113							}
114							}
115	36					251	return @tags_to_ignore;
116							} # of tags_to_ignore
117
118							######################
119							# ignore_html_remarks
120							######################
121							sub ignore_html_remarks {
122	12			12	1	57	my $self = shift;
123	12	50				41	if (@_) {
124	12					19	$self->{IGNORE_HTML_REMARKS} = shift;
125	12					27	$self->build_regular_expressions();
126							}
127	12					89	return $self->{IGNORE_HTML_REMARKS};
128							} # of ignore_html_remarks
129
130							######################
131							# ignore_word_remarks
132							######################
133							sub ignore_word_remarks {
134	12			12	1	98	my $self = shift;
135	12	50				34	if (@_) {
136	12					17	$self->{IGNORE_WORD_REMARKS} = shift;
137	12					24	$self->build_regular_expressions();
138							}
139	12					87	return $self->{IGNORE_WORD_REMARKS};
140							} # of ignore_word_remarks
141
142							##########################################################################
143							# Our get_tokens will treat any html tag that change the style of the
144							# text as unwanted. It will also treat HTML remarks as unwanted. This
145							# will let us parse HTML documents that were saved by MSWord - where
146							# sometimes varibale_one becomes something like:
147							# varibale_one.
148							########################
149							# get_tokens
150							########################
151							sub get_tokens {
152	12			12	1	24	my $self = shift;
153
154	12					25	my $tokens = [];
155	12					17	my $flags = [];
156	12					20	my $index = 0;
157							# we should create tokens from the TEXT.
158	12					39	my $text = $self->text();
159
160							# the regular expressions
161	12					21	my $re1 = $self->{RE1};
162	12					17	my $re2 = $self->{RE2};
163
164	12		66			186	while (defined($text) && $text =~ /$re1/) {
165	6686	100				14487	if (length($`)) { # if there is a text before, take it as clean
166	1994					4137	$tokens->[$index] = $`;
167	1994					2798	$flags->[$index] = 1; # the text before the match is clean.
168	1994					2388	$index++; # increment the index
169							}
170
171	6686					13785	$tokens->[$index] = $&; # this is the match. it might be unwanted
172							# or wanted, as you can see below.
173	6686					49611	$text = $'; # update the original text to after the match.
174
175	6686	100				87082	if ($tokens->[$index] =~ /$re2/) {
176	4508					7038	$flags->[$index] = 0; # the match itself is unwanted.
177							}
178							else {
179	2178					3584	$flags->[$index] = 1; # the match itself is ok.
180							}
181	6686					52370	$index++; # increment the index again
182							}
183
184							# if we had no match, check if there is still something in the
185							# $text. this will be also a clean text.
186	12	50	33			71	if (defined($text) && $text) {
187	12					30	$tokens->[$index] = $text;
188	12					20	$flags->[$index] = 1;
189							}
190							# return the two lists
191	12					116	return ($tokens, $flags);
192							} # of get_tokens
193
194							1; # make perl happy
195
196							__END__