File Coverage

blib/lib/Regexp/IgnoreTextCharacteristicsHTML.pm
Criterion Covered Total %
statement 70 80 87.5
branch 18 26 69.2
condition 4 9 44.4
subroutine 7 8 87.5
pod 6 7 85.7
total 105 130 80.7


line stmt bran cond sub pod time code
1             package Regexp::IgnoreTextCharacteristicsHTML;
2 1     1   1470 use Regexp::Ignore;
  1         3  
  1         1065  
3             our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class
4              
5             ########################
6             # new
7             ########################
8             sub new {
9 12     12 1 7051 my $proto = shift;
10 12   33     85 my $class = ref($proto) || $proto;
11 12         75 my $self = $class->SUPER::new(@_);
12 12         29 $self->{IGNORE_HTML_REMARKS} = 1; # by default it ignores html remarks
13 12         23 $self->{IGNORE_WORD_REMARKS} = 1; # by default it ignores word remarks
14             # the tags to be ignored
15 12         215 $self->{IGNORE_TAGS} = { B => 1,
16             BASEFONT => 1,
17             BIG => 1,
18             BLINK => 1,
19             CITE => 1,
20             CODE => 1,
21             EM => 1,
22             FONT => 1,
23             I => 1,
24             KBD => 1,
25             PLAINTEXT => 1,
26             S => 1,
27             SMALL => 1,
28             STRIKE => 1,
29             STRONG => 1,
30             SUB => 1,
31             SUP => 1,
32             TT => 1,
33             U => 1,
34             VAR => 1,
35             A => 1,
36             SPAN => 1,
37             WBR => 1 };
38 12         40 $self->build_regular_expressions();
39 12         126 return $self;
40             } # of new
41              
42             ############################
43             # build_regular_expressions
44             ############################
45             sub build_regular_expressions {
46 36     36 0 50 my $self = shift;
47              
48             # the regular first expression will try to match:
49             # - HTML remarks - all the remark will be matched. this will
50             # clean out all the special tags of MSWord (that comes inside
51             # remarks)
52             # - MSWord remarks - starting with
53             # - HTML tags
54 36         52 my $re1 = '(<\/?[^\>]*?>)';
55 36 100       83 if ($self->{IGNORE_WORD_REMARKS}) {
56 24         42 $re1 = '(<\!\[[^\]]*?\]>)|'.$re1;
57             }
58 36 100       85 if ($self->{IGNORE_HTML_REMARKS}) {
59 30         60 $re1 = '(<\!\-\-.+?\-\->)|'.$re1;
60             }
61 36         899 $self->{RE1} = qr/$re1/is;
62              
63             # if the tag that we found is one of the following, it is unwanted
64             # token.
65 36         97 my $re2 = "";
66 36 100       79 if ($self->{IGNORE_HTML_REMARKS}) {
67 30         41 $re2 = '(<\!\-\-.+?\-\->)|';
68             }
69 36 100       71 if ($self->{IGNORE_WORD_REMARKS}) {
70 24         37 $re2 .= '(<\!\[[^\]]*?\]>)|<\/?\s*[OVWXP]\:[^>]*?>|';
71             }
72 36         71 foreach my $tag ($self->tags_to_ignore()) {
73 828         1114 $re2 .= '<\/?\s*'.$tag.'(\s[^>]*?>|\s*>)|';
74             }
75              
76 36         101 chop($re2);
77 36         6130 $self->{RE2} = qr/$re2/is;
78             } # of build_regular_expressions
79              
80             #####################
81             # do_not_ignore
82             #####################
83             sub do_not_ignore {
84 0     0 1 0 my $self = shift;
85 0         0 while (@_) {
86 0         0 my $tag = shift;
87 0 0       0 if (exists($self->{IGNORE_TAGS}{uc($tag)})) {
88 0         0 $self->{IGNORE_TAGS}{uc($tag)} = 0;
89             }
90             }
91 0         0 $self->build_regular_expressions();
92             } # of do_not_ignore
93              
94             #####################
95             # tags_to_ignore
96             #####################
97             sub tags_to_ignore {
98 36     36 1 53 my $self = shift;
99 36         60 my $changed = 0;
100 36         91 while (@_) {
101 0         0 my $tag = shift;
102 0         0 $changed = 1;
103 0         0 $self->{IGNORE_TAGS}{uc($tag)} = 1;
104             }
105 36 50       69 if ($changed) {
106 0         0 $self->build_regular_expressions();
107             }
108 36 50       85 return unless defined (wantarray); # void context, do nothing
109 36         55 my @tags_to_ignore = ();
110 36         9869 foreach my $tag (keys(% { $self->{IGNORE_TAGS} })) {
  36         219  
111 828 50       1604 if ($self->{IGNORE_TAGS}{$tag}) {
112 828         1102 push(@tags_to_ignore, $tag);
113             }
114             }
115 36         251 return @tags_to_ignore;
116             } # of tags_to_ignore
117              
118             ######################
119             # ignore_html_remarks
120             ######################
121             sub ignore_html_remarks {
122 12     12 1 57 my $self = shift;
123 12 50       41 if (@_) {
124 12         19 $self->{IGNORE_HTML_REMARKS} = shift;
125 12         27 $self->build_regular_expressions();
126             }
127 12         89 return $self->{IGNORE_HTML_REMARKS};
128             } # of ignore_html_remarks
129              
130             ######################
131             # ignore_word_remarks
132             ######################
133             sub ignore_word_remarks {
134 12     12 1 98 my $self = shift;
135 12 50       34 if (@_) {
136 12         17 $self->{IGNORE_WORD_REMARKS} = shift;
137 12         24 $self->build_regular_expressions();
138             }
139 12         87 return $self->{IGNORE_WORD_REMARKS};
140             } # of ignore_word_remarks
141              
142             ##########################################################################
143             # Our get_tokens will treat any html tag that change the style of the
144             # text as unwanted. It will also treat HTML remarks as unwanted. This
145             # will let us parse HTML documents that were saved by MSWord - where
146             # sometimes varibale_one becomes something like:
147             # varibale_one.
148             ########################
149             # get_tokens
150             ########################
151             sub get_tokens {
152 12     12 1 24 my $self = shift;
153              
154 12         25 my $tokens = [];
155 12         17 my $flags = [];
156 12         20 my $index = 0;
157             # we should create tokens from the TEXT.
158 12         39 my $text = $self->text();
159              
160             # the regular expressions
161 12         21 my $re1 = $self->{RE1};
162 12         17 my $re2 = $self->{RE2};
163              
164 12   66     186 while (defined($text) && $text =~ /$re1/) {
165 6686 100       14487 if (length($`)) { # if there is a text before, take it as clean
166 1994         4137 $tokens->[$index] = $`;
167 1994         2798 $flags->[$index] = 1; # the text before the match is clean.
168 1994         2388 $index++; # increment the index
169             }
170            
171 6686         13785 $tokens->[$index] = $&; # this is the match. it might be unwanted
172             # or wanted, as you can see below.
173 6686         49611 $text = $'; # update the original text to after the match.
174              
175 6686 100       87082 if ($tokens->[$index] =~ /$re2/) {
176 4508         7038 $flags->[$index] = 0; # the match itself is unwanted.
177             }
178             else {
179 2178         3584 $flags->[$index] = 1; # the match itself is ok.
180             }
181 6686         52370 $index++; # increment the index again
182             }
183              
184             # if we had no match, check if there is still something in the
185             # $text. this will be also a clean text.
186 12 50 33     71 if (defined($text) && $text) {
187 12         30 $tokens->[$index] = $text;
188 12         20 $flags->[$index] = 1;
189             }
190             # return the two lists
191 12         116 return ($tokens, $flags);
192             } # of get_tokens
193              
194             1; # make perl happy
195              
196             __END__