line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Regexp::IgnoreTextCharacteristicsHTML; |
2
|
1
|
|
|
1
|
|
1470
|
use Regexp::Ignore; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
1065
|
|
3
|
|
|
|
|
|
|
our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
######################## |
6
|
|
|
|
|
|
|
# new |
7
|
|
|
|
|
|
|
######################## |
8
|
|
|
|
|
|
|
sub new { |
9
|
12
|
|
|
12
|
1
|
7051
|
my $proto = shift; |
10
|
12
|
|
33
|
|
|
85
|
my $class = ref($proto) || $proto; |
11
|
12
|
|
|
|
|
75
|
my $self = $class->SUPER::new(@_); |
12
|
12
|
|
|
|
|
29
|
$self->{IGNORE_HTML_REMARKS} = 1; # by default it ignores html remarks |
13
|
12
|
|
|
|
|
23
|
$self->{IGNORE_WORD_REMARKS} = 1; # by default it ignores word remarks |
14
|
|
|
|
|
|
|
# the tags to be ignored |
15
|
12
|
|
|
|
|
215
|
$self->{IGNORE_TAGS} = { B => 1, |
16
|
|
|
|
|
|
|
BASEFONT => 1, |
17
|
|
|
|
|
|
|
BIG => 1, |
18
|
|
|
|
|
|
|
BLINK => 1, |
19
|
|
|
|
|
|
|
CITE => 1, |
20
|
|
|
|
|
|
|
CODE => 1, |
21
|
|
|
|
|
|
|
EM => 1, |
22
|
|
|
|
|
|
|
FONT => 1, |
23
|
|
|
|
|
|
|
I => 1, |
24
|
|
|
|
|
|
|
KBD => 1, |
25
|
|
|
|
|
|
|
PLAINTEXT => 1, |
26
|
|
|
|
|
|
|
S => 1, |
27
|
|
|
|
|
|
|
SMALL => 1, |
28
|
|
|
|
|
|
|
STRIKE => 1, |
29
|
|
|
|
|
|
|
STRONG => 1, |
30
|
|
|
|
|
|
|
SUB => 1, |
31
|
|
|
|
|
|
|
SUP => 1, |
32
|
|
|
|
|
|
|
TT => 1, |
33
|
|
|
|
|
|
|
U => 1, |
34
|
|
|
|
|
|
|
VAR => 1, |
35
|
|
|
|
|
|
|
A => 1, |
36
|
|
|
|
|
|
|
SPAN => 1, |
37
|
|
|
|
|
|
|
WBR => 1 }; |
38
|
12
|
|
|
|
|
40
|
$self->build_regular_expressions(); |
39
|
12
|
|
|
|
|
126
|
return $self; |
40
|
|
|
|
|
|
|
} # of new |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
############################ |
43
|
|
|
|
|
|
|
# build_regular_expressions |
44
|
|
|
|
|
|
|
############################ |
45
|
|
|
|
|
|
|
sub build_regular_expressions { |
46
|
36
|
|
|
36
|
0
|
50
|
my $self = shift; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# the regular first expression will try to match: |
49
|
|
|
|
|
|
|
# - HTML remarks - all the remark will be matched. this will |
50
|
|
|
|
|
|
|
# clean out all the special tags of MSWord (that comes inside |
51
|
|
|
|
|
|
|
# remarks) |
52
|
|
|
|
|
|
|
# - MSWord remarks - starting with |
53
|
|
|
|
|
|
|
# - HTML tags |
54
|
36
|
|
|
|
|
52
|
my $re1 = '(<\/?[^\>]*?>)'; |
55
|
36
|
100
|
|
|
|
83
|
if ($self->{IGNORE_WORD_REMARKS}) { |
56
|
24
|
|
|
|
|
42
|
$re1 = '(<\!\[[^\]]*?\]>)|'.$re1; |
57
|
|
|
|
|
|
|
} |
58
|
36
|
100
|
|
|
|
85
|
if ($self->{IGNORE_HTML_REMARKS}) { |
59
|
30
|
|
|
|
|
60
|
$re1 = '(<\!\-\-.+?\-\->)|'.$re1; |
60
|
|
|
|
|
|
|
} |
61
|
36
|
|
|
|
|
899
|
$self->{RE1} = qr/$re1/is; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# if the tag that we found is one of the following, it is unwanted |
64
|
|
|
|
|
|
|
# token. |
65
|
36
|
|
|
|
|
97
|
my $re2 = ""; |
66
|
36
|
100
|
|
|
|
79
|
if ($self->{IGNORE_HTML_REMARKS}) { |
67
|
30
|
|
|
|
|
41
|
$re2 = '(<\!\-\-.+?\-\->)|'; |
68
|
|
|
|
|
|
|
} |
69
|
36
|
100
|
|
|
|
71
|
if ($self->{IGNORE_WORD_REMARKS}) { |
70
|
24
|
|
|
|
|
37
|
$re2 .= '(<\!\[[^\]]*?\]>)|<\/?\s*[OVWXP]\:[^>]*?>|'; |
71
|
|
|
|
|
|
|
} |
72
|
36
|
|
|
|
|
71
|
foreach my $tag ($self->tags_to_ignore()) { |
73
|
828
|
|
|
|
|
1114
|
$re2 .= '<\/?\s*'.$tag.'(\s[^>]*?>|\s*>)|'; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
36
|
|
|
|
|
101
|
chop($re2); |
77
|
36
|
|
|
|
|
6130
|
$self->{RE2} = qr/$re2/is; |
78
|
|
|
|
|
|
|
} # of build_regular_expressions |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
##################### |
81
|
|
|
|
|
|
|
# do_not_ignore |
82
|
|
|
|
|
|
|
##################### |
83
|
|
|
|
|
|
|
sub do_not_ignore { |
84
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
85
|
0
|
|
|
|
|
0
|
while (@_) { |
86
|
0
|
|
|
|
|
0
|
my $tag = shift; |
87
|
0
|
0
|
|
|
|
0
|
if (exists($self->{IGNORE_TAGS}{uc($tag)})) { |
88
|
0
|
|
|
|
|
0
|
$self->{IGNORE_TAGS}{uc($tag)} = 0; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
} |
91
|
0
|
|
|
|
|
0
|
$self->build_regular_expressions(); |
92
|
|
|
|
|
|
|
} # of do_not_ignore |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
##################### |
95
|
|
|
|
|
|
|
# tags_to_ignore |
96
|
|
|
|
|
|
|
##################### |
97
|
|
|
|
|
|
|
sub tags_to_ignore { |
98
|
36
|
|
|
36
|
1
|
53
|
my $self = shift; |
99
|
36
|
|
|
|
|
60
|
my $changed = 0; |
100
|
36
|
|
|
|
|
91
|
while (@_) { |
101
|
0
|
|
|
|
|
0
|
my $tag = shift; |
102
|
0
|
|
|
|
|
0
|
$changed = 1; |
103
|
0
|
|
|
|
|
0
|
$self->{IGNORE_TAGS}{uc($tag)} = 1; |
104
|
|
|
|
|
|
|
} |
105
|
36
|
50
|
|
|
|
69
|
if ($changed) { |
106
|
0
|
|
|
|
|
0
|
$self->build_regular_expressions(); |
107
|
|
|
|
|
|
|
} |
108
|
36
|
50
|
|
|
|
85
|
return unless defined (wantarray); # void context, do nothing |
109
|
36
|
|
|
|
|
55
|
my @tags_to_ignore = (); |
110
|
36
|
|
|
|
|
9869
|
foreach my $tag (keys(% { $self->{IGNORE_TAGS} })) { |
|
36
|
|
|
|
|
219
|
|
111
|
828
|
50
|
|
|
|
1604
|
if ($self->{IGNORE_TAGS}{$tag}) { |
112
|
828
|
|
|
|
|
1102
|
push(@tags_to_ignore, $tag); |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
} |
115
|
36
|
|
|
|
|
251
|
return @tags_to_ignore; |
116
|
|
|
|
|
|
|
} # of tags_to_ignore |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
###################### |
119
|
|
|
|
|
|
|
# ignore_html_remarks |
120
|
|
|
|
|
|
|
###################### |
121
|
|
|
|
|
|
|
sub ignore_html_remarks { |
122
|
12
|
|
|
12
|
1
|
57
|
my $self = shift; |
123
|
12
|
50
|
|
|
|
41
|
if (@_) { |
124
|
12
|
|
|
|
|
19
|
$self->{IGNORE_HTML_REMARKS} = shift; |
125
|
12
|
|
|
|
|
27
|
$self->build_regular_expressions(); |
126
|
|
|
|
|
|
|
} |
127
|
12
|
|
|
|
|
89
|
return $self->{IGNORE_HTML_REMARKS}; |
128
|
|
|
|
|
|
|
} # of ignore_html_remarks |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
###################### |
131
|
|
|
|
|
|
|
# ignore_word_remarks |
132
|
|
|
|
|
|
|
###################### |
133
|
|
|
|
|
|
|
sub ignore_word_remarks { |
134
|
12
|
|
|
12
|
1
|
98
|
my $self = shift; |
135
|
12
|
50
|
|
|
|
34
|
if (@_) { |
136
|
12
|
|
|
|
|
17
|
$self->{IGNORE_WORD_REMARKS} = shift; |
137
|
12
|
|
|
|
|
24
|
$self->build_regular_expressions(); |
138
|
|
|
|
|
|
|
} |
139
|
12
|
|
|
|
|
87
|
return $self->{IGNORE_WORD_REMARKS}; |
140
|
|
|
|
|
|
|
} # of ignore_word_remarks |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
########################################################################## |
143
|
|
|
|
|
|
|
# Our get_tokens will treat any html tag that change the style of the |
144
|
|
|
|
|
|
|
# text as unwanted. It will also treat HTML remarks as unwanted. This |
145
|
|
|
|
|
|
|
# will let us parse HTML documents that were saved by MSWord - where |
146
|
|
|
|
|
|
|
# sometimes varibale_one becomes something like: |
147
|
|
|
|
|
|
|
# varibale_one. |
148
|
|
|
|
|
|
|
######################## |
149
|
|
|
|
|
|
|
# get_tokens |
150
|
|
|
|
|
|
|
######################## |
151
|
|
|
|
|
|
|
sub get_tokens { |
152
|
12
|
|
|
12
|
1
|
24
|
my $self = shift; |
153
|
|
|
|
|
|
|
|
154
|
12
|
|
|
|
|
25
|
my $tokens = []; |
155
|
12
|
|
|
|
|
17
|
my $flags = []; |
156
|
12
|
|
|
|
|
20
|
my $index = 0; |
157
|
|
|
|
|
|
|
# we should create tokens from the TEXT. |
158
|
12
|
|
|
|
|
39
|
my $text = $self->text(); |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# the regular expressions |
161
|
12
|
|
|
|
|
21
|
my $re1 = $self->{RE1}; |
162
|
12
|
|
|
|
|
17
|
my $re2 = $self->{RE2}; |
163
|
|
|
|
|
|
|
|
164
|
12
|
|
66
|
|
|
186
|
while (defined($text) && $text =~ /$re1/) { |
165
|
6686
|
100
|
|
|
|
14487
|
if (length($`)) { # if there is a text before, take it as clean |
166
|
1994
|
|
|
|
|
4137
|
$tokens->[$index] = $`; |
167
|
1994
|
|
|
|
|
2798
|
$flags->[$index] = 1; # the text before the match is clean. |
168
|
1994
|
|
|
|
|
2388
|
$index++; # increment the index |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
6686
|
|
|
|
|
13785
|
$tokens->[$index] = $&; # this is the match. it might be unwanted |
172
|
|
|
|
|
|
|
# or wanted, as you can see below. |
173
|
6686
|
|
|
|
|
49611
|
$text = $'; # update the original text to after the match. |
174
|
|
|
|
|
|
|
|
175
|
6686
|
100
|
|
|
|
87082
|
if ($tokens->[$index] =~ /$re2/) { |
176
|
4508
|
|
|
|
|
7038
|
$flags->[$index] = 0; # the match itself is unwanted. |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
else { |
179
|
2178
|
|
|
|
|
3584
|
$flags->[$index] = 1; # the match itself is ok. |
180
|
|
|
|
|
|
|
} |
181
|
6686
|
|
|
|
|
52370
|
$index++; # increment the index again |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# if we had no match, check if there is still something in the |
185
|
|
|
|
|
|
|
# $text. this will be also a clean text. |
186
|
12
|
50
|
33
|
|
|
71
|
if (defined($text) && $text) { |
187
|
12
|
|
|
|
|
30
|
$tokens->[$index] = $text; |
188
|
12
|
|
|
|
|
20
|
$flags->[$index] = 1; |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
# return the two lists |
191
|
12
|
|
|
|
|
116
|
return ($tokens, $flags); |
192
|
|
|
|
|
|
|
} # of get_tokens |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
1; # make perl happy |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
__END__ |