| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Regexp::IgnoreTextCharacteristicsHTML; |
|
2
|
1
|
|
|
1
|
|
1470
|
use Regexp::Ignore; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
1065
|
|
|
3
|
|
|
|
|
|
|
our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
######################## |
|
6
|
|
|
|
|
|
|
# new |
|
7
|
|
|
|
|
|
|
######################## |
|
8
|
|
|
|
|
|
|
sub new { |
|
9
|
12
|
|
|
12
|
1
|
7051
|
my $proto = shift; |
|
10
|
12
|
|
33
|
|
|
85
|
my $class = ref($proto) || $proto; |
|
11
|
12
|
|
|
|
|
75
|
my $self = $class->SUPER::new(@_); |
|
12
|
12
|
|
|
|
|
29
|
$self->{IGNORE_HTML_REMARKS} = 1; # by default it ignores html remarks |
|
13
|
12
|
|
|
|
|
23
|
$self->{IGNORE_WORD_REMARKS} = 1; # by default it ignores word remarks |
|
14
|
|
|
|
|
|
|
# the tags to be ignored |
|
15
|
12
|
|
|
|
|
215
|
$self->{IGNORE_TAGS} = { B => 1, |
|
16
|
|
|
|
|
|
|
BASEFONT => 1, |
|
17
|
|
|
|
|
|
|
BIG => 1, |
|
18
|
|
|
|
|
|
|
BLINK => 1, |
|
19
|
|
|
|
|
|
|
CITE => 1, |
|
20
|
|
|
|
|
|
|
CODE => 1, |
|
21
|
|
|
|
|
|
|
EM => 1, |
|
22
|
|
|
|
|
|
|
FONT => 1, |
|
23
|
|
|
|
|
|
|
I => 1, |
|
24
|
|
|
|
|
|
|
KBD => 1, |
|
25
|
|
|
|
|
|
|
PLAINTEXT => 1, |
|
26
|
|
|
|
|
|
|
S => 1, |
|
27
|
|
|
|
|
|
|
SMALL => 1, |
|
28
|
|
|
|
|
|
|
STRIKE => 1, |
|
29
|
|
|
|
|
|
|
STRONG => 1, |
|
30
|
|
|
|
|
|
|
SUB => 1, |
|
31
|
|
|
|
|
|
|
SUP => 1, |
|
32
|
|
|
|
|
|
|
TT => 1, |
|
33
|
|
|
|
|
|
|
U => 1, |
|
34
|
|
|
|
|
|
|
VAR => 1, |
|
35
|
|
|
|
|
|
|
A => 1, |
|
36
|
|
|
|
|
|
|
SPAN => 1, |
|
37
|
|
|
|
|
|
|
WBR => 1 }; |
|
38
|
12
|
|
|
|
|
40
|
$self->build_regular_expressions(); |
|
39
|
12
|
|
|
|
|
126
|
return $self; |
|
40
|
|
|
|
|
|
|
} # of new |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
############################ |
|
43
|
|
|
|
|
|
|
# build_regular_expressions |
|
44
|
|
|
|
|
|
|
############################ |
|
45
|
|
|
|
|
|
|
sub build_regular_expressions { |
|
46
|
36
|
|
|
36
|
0
|
50
|
my $self = shift; |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# the regular first expression will try to match: |
|
49
|
|
|
|
|
|
|
# - HTML remarks - all the remark will be matched. this will |
|
50
|
|
|
|
|
|
|
# clean out all the special tags of MSWord (that comes inside |
|
51
|
|
|
|
|
|
|
# remarks) |
|
52
|
|
|
|
|
|
|
# - MSWord remarks - starting with |
|
53
|
|
|
|
|
|
|
# - HTML tags |
|
54
|
36
|
|
|
|
|
52
|
my $re1 = '(<\/?[^\>]*?>)'; |
|
55
|
36
|
100
|
|
|
|
83
|
if ($self->{IGNORE_WORD_REMARKS}) { |
|
56
|
24
|
|
|
|
|
42
|
$re1 = '(<\!\[[^\]]*?\]>)|'.$re1; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
36
|
100
|
|
|
|
85
|
if ($self->{IGNORE_HTML_REMARKS}) { |
|
59
|
30
|
|
|
|
|
60
|
$re1 = '(<\!\-\-.+?\-\->)|'.$re1; |
|
60
|
|
|
|
|
|
|
} |
|
61
|
36
|
|
|
|
|
899
|
$self->{RE1} = qr/$re1/is; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# if the tag that we found is one of the following, it is unwanted |
|
64
|
|
|
|
|
|
|
# token. |
|
65
|
36
|
|
|
|
|
97
|
my $re2 = ""; |
|
66
|
36
|
100
|
|
|
|
79
|
if ($self->{IGNORE_HTML_REMARKS}) { |
|
67
|
30
|
|
|
|
|
41
|
$re2 = '(<\!\-\-.+?\-\->)|'; |
|
68
|
|
|
|
|
|
|
} |
|
69
|
36
|
100
|
|
|
|
71
|
if ($self->{IGNORE_WORD_REMARKS}) { |
|
70
|
24
|
|
|
|
|
37
|
$re2 .= '(<\!\[[^\]]*?\]>)|<\/?\s*[OVWXP]\:[^>]*?>|'; |
|
71
|
|
|
|
|
|
|
} |
|
72
|
36
|
|
|
|
|
71
|
foreach my $tag ($self->tags_to_ignore()) { |
|
73
|
828
|
|
|
|
|
1114
|
$re2 .= '<\/?\s*'.$tag.'(\s[^>]*?>|\s*>)|'; |
|
74
|
|
|
|
|
|
|
} |
|
75
|
|
|
|
|
|
|
|
|
76
|
36
|
|
|
|
|
101
|
chop($re2); |
|
77
|
36
|
|
|
|
|
6130
|
$self->{RE2} = qr/$re2/is; |
|
78
|
|
|
|
|
|
|
} # of build_regular_expressions |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
##################### |
|
81
|
|
|
|
|
|
|
# do_not_ignore |
|
82
|
|
|
|
|
|
|
##################### |
|
83
|
|
|
|
|
|
|
sub do_not_ignore { |
|
84
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
85
|
0
|
|
|
|
|
0
|
while (@_) { |
|
86
|
0
|
|
|
|
|
0
|
my $tag = shift; |
|
87
|
0
|
0
|
|
|
|
0
|
if (exists($self->{IGNORE_TAGS}{uc($tag)})) { |
|
88
|
0
|
|
|
|
|
0
|
$self->{IGNORE_TAGS}{uc($tag)} = 0; |
|
89
|
|
|
|
|
|
|
} |
|
90
|
|
|
|
|
|
|
} |
|
91
|
0
|
|
|
|
|
0
|
$self->build_regular_expressions(); |
|
92
|
|
|
|
|
|
|
} # of do_not_ignore |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
##################### |
|
95
|
|
|
|
|
|
|
# tags_to_ignore |
|
96
|
|
|
|
|
|
|
##################### |
|
97
|
|
|
|
|
|
|
sub tags_to_ignore { |
|
98
|
36
|
|
|
36
|
1
|
53
|
my $self = shift; |
|
99
|
36
|
|
|
|
|
60
|
my $changed = 0; |
|
100
|
36
|
|
|
|
|
91
|
while (@_) { |
|
101
|
0
|
|
|
|
|
0
|
my $tag = shift; |
|
102
|
0
|
|
|
|
|
0
|
$changed = 1; |
|
103
|
0
|
|
|
|
|
0
|
$self->{IGNORE_TAGS}{uc($tag)} = 1; |
|
104
|
|
|
|
|
|
|
} |
|
105
|
36
|
50
|
|
|
|
69
|
if ($changed) { |
|
106
|
0
|
|
|
|
|
0
|
$self->build_regular_expressions(); |
|
107
|
|
|
|
|
|
|
} |
|
108
|
36
|
50
|
|
|
|
85
|
return unless defined (wantarray); # void context, do nothing |
|
109
|
36
|
|
|
|
|
55
|
my @tags_to_ignore = (); |
|
110
|
36
|
|
|
|
|
9869
|
foreach my $tag (keys(% { $self->{IGNORE_TAGS} })) { |
|
|
36
|
|
|
|
|
219
|
|
|
111
|
828
|
50
|
|
|
|
1604
|
if ($self->{IGNORE_TAGS}{$tag}) { |
|
112
|
828
|
|
|
|
|
1102
|
push(@tags_to_ignore, $tag); |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
} |
|
115
|
36
|
|
|
|
|
251
|
return @tags_to_ignore; |
|
116
|
|
|
|
|
|
|
} # of tags_to_ignore |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
###################### |
|
119
|
|
|
|
|
|
|
# ignore_html_remarks |
|
120
|
|
|
|
|
|
|
###################### |
|
121
|
|
|
|
|
|
|
sub ignore_html_remarks { |
|
122
|
12
|
|
|
12
|
1
|
57
|
my $self = shift; |
|
123
|
12
|
50
|
|
|
|
41
|
if (@_) { |
|
124
|
12
|
|
|
|
|
19
|
$self->{IGNORE_HTML_REMARKS} = shift; |
|
125
|
12
|
|
|
|
|
27
|
$self->build_regular_expressions(); |
|
126
|
|
|
|
|
|
|
} |
|
127
|
12
|
|
|
|
|
89
|
return $self->{IGNORE_HTML_REMARKS}; |
|
128
|
|
|
|
|
|
|
} # of ignore_html_remarks |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
###################### |
|
131
|
|
|
|
|
|
|
# ignore_word_remarks |
|
132
|
|
|
|
|
|
|
###################### |
|
133
|
|
|
|
|
|
|
sub ignore_word_remarks { |
|
134
|
12
|
|
|
12
|
1
|
98
|
my $self = shift; |
|
135
|
12
|
50
|
|
|
|
34
|
if (@_) { |
|
136
|
12
|
|
|
|
|
17
|
$self->{IGNORE_WORD_REMARKS} = shift; |
|
137
|
12
|
|
|
|
|
24
|
$self->build_regular_expressions(); |
|
138
|
|
|
|
|
|
|
} |
|
139
|
12
|
|
|
|
|
87
|
return $self->{IGNORE_WORD_REMARKS}; |
|
140
|
|
|
|
|
|
|
} # of ignore_word_remarks |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
########################################################################## |
|
143
|
|
|
|
|
|
|
# Our get_tokens will treat any html tag that change the style of the |
|
144
|
|
|
|
|
|
|
# text as unwanted. It will also treat HTML remarks as unwanted. This |
|
145
|
|
|
|
|
|
|
# will let us parse HTML documents that were saved by MSWord - where |
|
146
|
|
|
|
|
|
|
# sometimes varibale_one becomes something like: |
|
147
|
|
|
|
|
|
|
# varibale_one. |
|
148
|
|
|
|
|
|
|
######################## |
|
149
|
|
|
|
|
|
|
# get_tokens |
|
150
|
|
|
|
|
|
|
######################## |
|
151
|
|
|
|
|
|
|
sub get_tokens { |
|
152
|
12
|
|
|
12
|
1
|
24
|
my $self = shift; |
|
153
|
|
|
|
|
|
|
|
|
154
|
12
|
|
|
|
|
25
|
my $tokens = []; |
|
155
|
12
|
|
|
|
|
17
|
my $flags = []; |
|
156
|
12
|
|
|
|
|
20
|
my $index = 0; |
|
157
|
|
|
|
|
|
|
# we should create tokens from the TEXT. |
|
158
|
12
|
|
|
|
|
39
|
my $text = $self->text(); |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# the regular expressions |
|
161
|
12
|
|
|
|
|
21
|
my $re1 = $self->{RE1}; |
|
162
|
12
|
|
|
|
|
17
|
my $re2 = $self->{RE2}; |
|
163
|
|
|
|
|
|
|
|
|
164
|
12
|
|
66
|
|
|
186
|
while (defined($text) && $text =~ /$re1/) { |
|
165
|
6686
|
100
|
|
|
|
14487
|
if (length($`)) { # if there is a text before, take it as clean |
|
166
|
1994
|
|
|
|
|
4137
|
$tokens->[$index] = $`; |
|
167
|
1994
|
|
|
|
|
2798
|
$flags->[$index] = 1; # the text before the match is clean. |
|
168
|
1994
|
|
|
|
|
2388
|
$index++; # increment the index |
|
169
|
|
|
|
|
|
|
} |
|
170
|
|
|
|
|
|
|
|
|
171
|
6686
|
|
|
|
|
13785
|
$tokens->[$index] = $&; # this is the match. it might be unwanted |
|
172
|
|
|
|
|
|
|
# or wanted, as you can see below. |
|
173
|
6686
|
|
|
|
|
49611
|
$text = $'; # update the original text to after the match. |
|
174
|
|
|
|
|
|
|
|
|
175
|
6686
|
100
|
|
|
|
87082
|
if ($tokens->[$index] =~ /$re2/) { |
|
176
|
4508
|
|
|
|
|
7038
|
$flags->[$index] = 0; # the match itself is unwanted. |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
else { |
|
179
|
2178
|
|
|
|
|
3584
|
$flags->[$index] = 1; # the match itself is ok. |
|
180
|
|
|
|
|
|
|
} |
|
181
|
6686
|
|
|
|
|
52370
|
$index++; # increment the index again |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# if we had no match, check if there is still something in the |
|
185
|
|
|
|
|
|
|
# $text. this will be also a clean text. |
|
186
|
12
|
50
|
33
|
|
|
71
|
if (defined($text) && $text) { |
|
187
|
12
|
|
|
|
|
30
|
$tokens->[$index] = $text; |
|
188
|
12
|
|
|
|
|
20
|
$flags->[$index] = 1; |
|
189
|
|
|
|
|
|
|
} |
|
190
|
|
|
|
|
|
|
# return the two lists |
|
191
|
12
|
|
|
|
|
116
|
return ($tokens, $flags); |
|
192
|
|
|
|
|
|
|
} # of get_tokens |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
1; # make perl happy |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
__END__ |