File Coverage

blib/lib/HTML/ExtractContent.pm
Criterion Covered Total %
statement 115 131 87.7
branch 31 50 62.0
condition n/a
subroutine 19 19 100.0
pod 4 4 100.0
total 169 204 82.8


line stmt bran cond sub pod time code
1             package HTML::ExtractContent;
2 2     2   59836 use strict;
  2         5  
  2         74  
3 2     2   8 use warnings;
  2         4  
  2         64  
4 2     2   1691 use utf8;
  2         22  
  2         9  
5              
6             # core
7 2     2   88 use List::Util qw(reduce);
  2         3  
  2         271  
8              
9             # cpan
10             use Class::Accessor::Lite (
11 2         16 rw => [qw(opt content)],
12 2     2   973 );
  2         2642  
13              
14             # lib
15 2     2   903 use HTML::ExtractContent::Util;
  2         12  
  2         25  
16              
17             our $VERSION = '0.11';
18              
19             sub new {
20 1     1 1 349 my ($class, $opt) = @_;
21 1         4 my $self = bless {}, $class;
22 1         28 $self->{opt} = {
23             threshold => 60, # threhold for score of clusters
24             min_length => 30, # minimum length of blocks
25             decay_factor => 0.75, # decay factor for block scores
26             no_body_factor => 0.72,
27             continuous_factor => 1.62, # continuous factor for block scores
28             punctuation_weight => 10, # score weight for punctuations
29 2     2   384 punctuations => qr/(?:[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/is,
  2         2  
  2         28  
30             waste_expressions => qr/Copyright|All\s*Rights?\s*Reserved?/is,
31             # characteristic keywords including footer
32             affiliate_expressions =>
33             qr/amazon[a-z0-9\.\/\-\?&]+-22/is,
34             block_separator => qr/<\/?(?:div|center|td)[^>]*>|]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/is,
35             # nocontent => qr/<\/frameset>|]*url/is,
36             nocontent => qr/<\/frameset>/is,
37             min_nolink => 8,
38             nolist_ratio => 0.2,
39             debug => 0
40             };
41 1         27 $self->{pattern} = {
42             a => qr/]*>.*?<\/a\s*>/is,
43             href => qr/
44             list => qr/<(ul|dl|ol)(.+)<\/\1>/is,
45             li => qr/(?:]*>|]*>)/is,
46             title => qr/]*>(.*?)<\/title\s*>/is,
47             headline => qr/(\s*(.*?)\s*<\/h\d\s*>)/is,
48             head => qr/]*>.*?<\/head\s*>/is,
49             comment => qr{ (?:
50             |
51             # remove invisible elements
52             < ( [\w:.-]+ ) \s [^>]*? style \s* = [^>]*? \b
53             (?: display \s* : \s* none | visibility \s* : \s* hidden )
54             \b [^>]* > .*?
55             ) }xis,
56             special => qr//is,
57             useless => [
58             qr/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/is,
59             qr/]*(?:id|class)\s*=\s*['"]?\S*(?:more|menu|side|navi)\S*["']?[^>]*>/is,
60             ],
61             };
62 1         3 return bless $self, $class;
63             }
64              
65             sub as_text {
66 2     2 1 1049 my $self = shift;
67 2         7 return to_text $self->content;
68             }
69              
70             sub as_html {
71 2     2 1 8 my $self = shift;
72 2         5 return $self->content;
73             }
74              
75             sub extract {
76 2     2 1 709 my $self = shift;;
77 2         71 $self->content(shift);
78 2 50       16 if ($self->content =~ $self->opt->{nocontent}) {
79             # frameset or redirect
80 0         0 $self->content('');
81 0         0 return $self;
82             }
83 2         46 $self->_extract_title;
84 2         6 $self->_eliminate_head;
85              
86 2         5 $self->_eliminate_useless_symbols;
87 2         5 $self->_eliminate_useless_tags;
88              
89 2         4 my ($factor, $continuous);
90 2         2 $factor = $continuous = 1.0;
91 2         3 my $body = '';
92 2         2 my $score = 0;
93 2         7 my $best = {
94             content => "",
95             score => 0,
96             };
97 2         6 my @list = split $self->opt->{block_separator}, $self->content;
98 2         257 my $flag = 0;
99 2         4 for my $block (@list) {
100 44         124 $block = strip $block;
101 44 100       70 next unless decode $block;
102 13 100       41 $continuous /= $self->opt->{continuous_factor} if length $body;
103              
104             # ignore link list block
105 13         48 my $nolink = $self->_eliminate_links($block);
106 13         22 my $nolinklen = length $nolink;
107 13 100       24 next if $nolinklen < $self->opt->{min_length};
108              
109             # score
110 9         41 my $c = $self->_score($nolink, $factor);
111 9         50 $factor *= $self->opt->{decay_factor};
112              
113             # anti-scoring factors
114 9         34 my $no_body_rate = $self->_no_body_rate($block);
115              
116 9         18 $c *= ($self->opt->{no_body_factor} ** $no_body_rate);
117 9         45 my $c1 = $c * $continuous;
118              
119             # cluster scoring
120 9 100       17 if ($c1 > $self->opt->{threshold}) {
    100          
121 2         7 $flag = 1;
122 2 50       37 print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n"
123             if $self->opt->{debug};
124 2         13 $body .= $block . "\n";
125 2         3 $score += $c1;
126 2         5 $continuous = $self->opt->{continuous_factor};
127             } elsif ($c > $self->opt->{threshold}) {
128 1         8 $flag = 1;
129 1 50       3 print "\n---- end of cluster: $score\n" if $self->opt->{debug};
130 1 50       11 if ($score > $best->{score}) {
131 1 50       4 print "!!!! best: score=$score\n" if $self->opt->{debug};
132 1         7 $best = {
133             content => $body,
134             score => $score,
135             };
136             }
137 1 50       5 print "\n" if $self->opt->{debug};
138 1         7 $body = $block . "\n";
139 1         1 $score = $c;
140 1         3 $continuous = $self->opt->{continuous_factor};
141 1 50       4 print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n"
142             if $self->opt->{debug};
143             } else {
144 6 100       211 $factor /= $self->opt->{decay_factor} if !$flag;
145 6 50       18 print "\n>> reject $c*$continuous=$c1 $nolinklen\n$block\n",
146             "<< reject\n" if $self->opt->{debug};
147             }
148             }
149 2 50       6 print "\n---- end of cluster: $score\n" if $self->opt->{debug};
150 2 100       16 if ($best->{score} < $score) {
151 1 50       2 print "!!!! best: score=$score\n" if $self->opt->{debug};
152 1         7 $best = {
153             content =>$body,
154             score => $score,
155             };
156             }
157 2         7 $self->content($best->{content});
158              
159 2         17 return $self;
160             }
161              
162             sub _score {
163 9     9   13 my ($self, $nolink, $factor) = @_;
164 9         20 return ((length $nolink)
165             + (match_count $nolink, $self->opt->{punctuations})
166             * $self->opt->{punctuation_weight})
167             * $factor;
168             }
169              
170             sub _no_body_rate {
171 9     9   10 my ($self, $block) = @_;
172 9         14 return (match_count $block,$self->opt->{waste_expressions})
173             + (match_count $block,$self->opt->{affiliate_expressions})/2.0;
174             }
175              
176             sub _extract_title {
177 2     2   4 my $self = shift;
178 2         2 my $title;
179 2 50       5 if ($self->content =~ $self->{pattern}->{title}) {
180 2         73 $title = strip (strip_tags $1);
181 2 50       10 if (length $title) {
182 2         5 my $pat = $self->{pattern}->{headline};
183 2         30 $self->{content} =~ s/$pat/
184 4 100       9 (index $title, strip(strip_tags($2))) >= 0 ? "
$2<\/div>" : "$1"/igse;
185             }
186             }
187             }
188              
189             sub _eliminate_head {
190 2     2   3 my $self = shift;
191 2         3 my $pat = $self->{pattern}->{head};
192 2         24 $self->{content} =~ s/$pat//is;
193             }
194              
195             sub _eliminate_useless_symbols {
196 2     2   3 my $self = shift;
197 2         3 my $comment = $self->{pattern}->{comment};
198 2         3 my $special = $self->{pattern}->{special};
199 2         69 $self->{content} =~ s/$comment//igs;
200 2         18 $self->{content} =~ s/$special//igs;
201             }
202              
203             sub _eliminate_useless_tags {
204 2     2   3 my $self = shift;
205 2         2 my @useless = @{$self->{pattern}->{useless}};
  2         5  
206 2         5 for my $pat (@useless) {
207 4         206 $self->{content} =~ s/$pat//igs;
208             }
209             }
210              
211             sub _eliminate_links {
212 13     13   16 my ($self, $block) = @_;
213 13         33 my $count = match_count $block, $self->{pattern}->{a};
214 13         32 my $nolink = to_text (eliminate_forms (eliminate_links $block));
215 13 50       46 return '' if length $nolink < $self->opt->{min_nolink} * $count;
216 13 50       75 return '' if $self->_is_linklist($block);
217 13         23 return $nolink;
218             }
219              
220             sub _is_linklist {
221 13     13   15 my ($self, $block) = @_;
222 13         18 my $listpat = $self->{pattern}->{list};
223 13 50       70 if ($block =~ $listpat) {
224 0         0 my $list = $2;
225 0         0 my @fragments = split($listpat, $block, 2);
226 0         0 my $nolist = $list;
227 0         0 $nolist =~ s/$listpat//igs;
228 0         0 $nolist = to_text(join($nolist, @fragments));
229 0         0 my @listitems = split $self->{pattern}->{li}, $list;
230 0         0 shift @listitems;
231 0         0 my $rate = 0;
232 0         0 for my $li (@listitems) {
233 0 0       0 $rate++ if $li =~ $self->{pattern}->{href};
234             }
235 0 0       0 $rate = 1.0 * $rate / ($#listitems+1) if $#listitems+1;
236 0         0 $list = to_text $list;
237 0         0 my $limit = ($self->opt->{nolist_ratio}*$rate)
238             * ($rate * (length $list));
239 0         0 return length $nolist < $limit;
240             }
241 13         24 return 0;
242             }
243              
244             1;
245             __END__