| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Regexp::IgnoreHTML; |
|
2
|
1
|
|
|
1
|
|
1343
|
use Regexp::Ignore; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
537
|
|
|
3
|
|
|
|
|
|
|
our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
######################## |
|
6
|
|
|
|
|
|
|
# new |
|
7
|
|
|
|
|
|
|
######################## |
|
8
|
|
|
|
|
|
|
sub new { |
|
9
|
6
|
|
|
6
|
1
|
2573
|
my $proto = shift; |
|
10
|
6
|
|
33
|
|
|
43
|
my $class = ref($proto) || $proto; |
|
11
|
6
|
|
|
|
|
43
|
my $self = $class->SUPER::new(@_); |
|
12
|
|
|
|
|
|
|
# by default it does not add spaces |
|
13
|
6
|
|
|
|
|
18
|
$self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = 0; |
|
14
|
6
|
|
|
|
|
14
|
return $self; |
|
15
|
|
|
|
|
|
|
} # of new |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
############################################# |
|
18
|
|
|
|
|
|
|
# space_after_non_text_characteristics_html |
|
19
|
|
|
|
|
|
|
############################################# |
|
20
|
|
|
|
|
|
|
sub space_after_non_text_characteristics_html { |
|
21
|
3314
|
|
|
3314
|
1
|
4675
|
my $self = shift; |
|
22
|
3314
|
100
|
|
|
|
6455
|
if (@_) { $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = shift } |
|
|
6
|
|
|
|
|
12
|
|
|
23
|
3314
|
|
|
|
|
47492
|
return $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML}; |
|
24
|
|
|
|
|
|
|
} # of space_after_non_text_characteristics_html |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
########################################### |
|
27
|
|
|
|
|
|
|
# |
|
28
|
|
|
|
|
|
|
# |
|
29
|
|
|
|
|
|
|
# |
|
30
|
|
|
|
|
|
|
######################## |
|
31
|
|
|
|
|
|
|
# get_tokens |
|
32
|
|
|
|
|
|
|
######################## |
|
33
|
|
|
|
|
|
|
sub get_tokens { |
|
34
|
6
|
|
|
6
|
1
|
10
|
my $self = shift; |
|
35
|
|
|
|
|
|
|
|
|
36
|
6
|
|
|
|
|
12
|
my $tokens = []; |
|
37
|
6
|
|
|
|
|
9
|
my $flags = []; |
|
38
|
6
|
|
|
|
|
10
|
my $index = 0; |
|
39
|
|
|
|
|
|
|
# we should create tokens from the TEXT. |
|
40
|
6
|
|
|
|
|
23
|
my $text = $self->text(); |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# the regular expression will try to match: |
|
43
|
|
|
|
|
|
|
# - HTML remarks - all the remark will be matched. |
|
44
|
|
|
|
|
|
|
# - HTML tags |
|
45
|
6
|
|
|
|
|
24
|
my $re1 = qr/(<\!\-\-[\s\S]+?\-\->)|(<\/?[^\>]*?>)/is; |
|
46
|
|
|
|
|
|
|
|
|
47
|
6
|
|
|
|
|
8
|
my $re2; |
|
48
|
6
|
100
|
|
|
|
14
|
if ($self->space_after_non_text_characteristics_html()) { |
|
49
|
|
|
|
|
|
|
# if the tag that we found is one of the following, we do not |
|
50
|
|
|
|
|
|
|
# put space after it: B, BASEFONT, BIG, BLINK, CITE, CODE, EM, |
|
51
|
|
|
|
|
|
|
# FONT, I, KBD, PLAINTEXT, S, SMALL, STRIKE, STRONG, SUB, SUP, |
|
52
|
|
|
|
|
|
|
# TT, U, VAR, A, SPAN, WBR |
|
53
|
3
|
|
|
|
|
8
|
$re2 = '<\!\-\-.+?\-\->|'. |
|
54
|
|
|
|
|
|
|
'\<\!\[[^\]]*?\]\>|'. |
|
55
|
|
|
|
|
|
|
'<\/?\s*B(\s[^>]*?>|\s*>)|'. |
|
56
|
|
|
|
|
|
|
'<\/?\s*BASEFONT(\s[^>]*?>|\s*>)|'. |
|
57
|
|
|
|
|
|
|
'<\/?\s*BIG(\s[^>]*?>|\s*>)|'. |
|
58
|
|
|
|
|
|
|
'<\/?\s*BLINK(\s[^>]*?>|\s*>)|'. |
|
59
|
|
|
|
|
|
|
'<\/?\s*CITE(\s[^>]*?>|\s*>)|'. |
|
60
|
|
|
|
|
|
|
'<\/?\s*CODE(\s[^>]*?>|\s*>)|'. |
|
61
|
|
|
|
|
|
|
'<\/?\s*EM(\s[^>]*?>|\s*>)|'. |
|
62
|
|
|
|
|
|
|
'<\/?\s*FONT(\s[^>]*?>|\s*>)|'. |
|
63
|
|
|
|
|
|
|
'<\/?\s*I(\s[^>]*?>|\s*>)|'. |
|
64
|
|
|
|
|
|
|
'<\/?\s*KBD(\s[^>]*?>|\s*>)|'. |
|
65
|
|
|
|
|
|
|
'<\/?\s*PLAINTEXT(\s[^>]*?>|\s*>)|'. |
|
66
|
|
|
|
|
|
|
'<\/?\s*S(\s[^>]*?>|\s*>)|'. |
|
67
|
|
|
|
|
|
|
'<\/?\s*SMALL(\s[^>]*?>|\s*>)|'. |
|
68
|
|
|
|
|
|
|
'<\/?\s*STRIKE(\s[^>]*?>|\s*>)|'. |
|
69
|
|
|
|
|
|
|
'<\/?\s*STRONG(\s[^>]*?>|\s*>)|'. |
|
70
|
|
|
|
|
|
|
'<\/?\s*SUB(\s[^>]*?>|\s*>)|'. |
|
71
|
|
|
|
|
|
|
'<\/?\s*SUP(\s[^>]*?>|\s*>)|'. |
|
72
|
|
|
|
|
|
|
'<\/?\s*TT(\s[^>]*?>|\s*>)|'. |
|
73
|
|
|
|
|
|
|
'<\/?\s*U(\s[^>]*?>|\s*>)|'. |
|
74
|
|
|
|
|
|
|
'<\/?\s*VAR(\s[^>]*?>|\s*>)|'. |
|
75
|
|
|
|
|
|
|
'<\/?\s*A(\s[^>]*?>|\s*>)|'. |
|
76
|
|
|
|
|
|
|
'<\/?\s*SPAN(\s[^>]*?>|\s*>)|'. |
|
77
|
|
|
|
|
|
|
'<\/?\s*WBR(\s[^>]*?>|\s*>)|'. |
|
78
|
|
|
|
|
|
|
'<\/?\s*[OVWXP]\:[^>]*?>'; |
|
79
|
3
|
|
|
|
|
338
|
$re2 = qr/$re2/is; |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
6
|
|
66
|
|
|
114
|
while (defined($text) && $text =~ /$re1/) { |
|
83
|
3302
|
100
|
|
|
|
7949
|
if ($`) { # if there is a text before, take it as clean |
|
84
|
960
|
|
|
|
|
5684
|
$tokens->[$index] = $`; |
|
85
|
960
|
|
|
|
|
1204
|
$flags->[$index] = 1; # the text before the match is clean. |
|
86
|
960
|
|
|
|
|
1160
|
$index++; # increment the index |
|
87
|
|
|
|
|
|
|
} |
|
88
|
3302
|
|
|
|
|
6915
|
$tokens->[$index] = $&; |
|
89
|
3302
|
|
|
|
|
4917
|
$flags->[$index] = 0; # the match itself is unwanted. |
|
90
|
3302
|
|
|
|
|
24253
|
$text = $'; # update the original text to after the match. |
|
91
|
3302
|
|
|
|
|
3799
|
$index++; # increment the index again |
|
92
|
|
|
|
|
|
|
# check if we should add space after the text |
|
93
|
3302
|
100
|
100
|
|
|
8084
|
if ($self->space_after_non_text_characteristics_html() && |
|
94
|
|
|
|
|
|
|
$tokens->[$index - 1] !~ /$re2/) { # this tag is not text |
|
95
|
|
|
|
|
|
|
# characteristic tag |
|
96
|
|
|
|
|
|
|
# we add a space token after this tag |
|
97
|
423
|
|
|
|
|
687
|
$tokens->[$index] = " "; |
|
98
|
423
|
|
|
|
|
521
|
$flags->[$index] = 1; |
|
99
|
423
|
|
|
|
|
3651
|
$index++; |
|
100
|
|
|
|
|
|
|
} |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# if we had no match, check if there is still something in the |
|
104
|
|
|
|
|
|
|
# $text. this will be also a clean text. |
|
105
|
6
|
50
|
33
|
|
|
32
|
if (defined($text) && $text) { |
|
106
|
6
|
|
|
|
|
15
|
$tokens->[$index] = $text; |
|
107
|
6
|
|
|
|
|
9
|
$flags->[$index] = 1; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
# return the two lists |
|
110
|
6
|
|
|
|
|
59
|
return ($tokens, $flags); |
|
111
|
|
|
|
|
|
|
} # of get_tokens |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
1; # make perl happy |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
__END__ |