File Coverage

blib/lib/HTML/ExtractContent.pm
Criterion Covered Total %
statement 115 131 87.7
branch 31 50 62.0
condition n/a
subroutine 19 19 100.0
pod 4 4 100.0
total 169 204 82.8


line stmt bran cond sub pod time code
1             package HTML::ExtractContent;
2 2     2   879 use strict;
  2         4  
  2         79  
3 2     2   10 use warnings;
  2         4  
  2         68  
4 2     2   1065 use HTML::ExtractContent::Util;
  2         31  
  2         18  
5 2     2   433 use List::Util qw(reduce);
  2         5  
  2         302  
6 2     2   11 use utf8;
  2         4  
  2         16  
7 2     2   61 use base qw(Class::Accessor::Lvalue::Fast);
  2         4  
  2         2092  
8             our $VERSION = '0.10';
9             __PACKAGE__->mk_accessors(qw(opt content));
10              
11             sub new {
12 1     1 1 636 my ($class, $opt) = @_;
13 1         18 my $self = $class->SUPER::new($opt);
14 1         56 $self->opt = {
15             threshold => 60, # threhold for score of clusters
16             min_length => 30, # minimum length of blocks
17             decay_factor => 0.75, # decay factor for block scores
18             no_body_factor => 0.72,
19             continuous_factor => 1.62, # continuous factor for block scores
20             punctuation_weight => 10, # score weight for punctuations
21 2     2   23003 punctuations => qr/(?:[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/is,
  2         6  
  2         51  
22             waste_expressions => qr/Copyright|All\s*Rights?\s*Reserved?/is,
23             # characteristic keywords including footer
24             affiliate_expressions =>
25             qr/amazon[a-z0-9\.\/\-\?&]+-22/is,
26             block_separator => qr/<\/?(?:div|center|td)[^>]*>|]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/is,
27             # nocontent => qr/<\/frameset>|]*url/is,
28             nocontent => qr/<\/frameset>/is,
29             min_nolink => 8,
30             nolist_ratio => 0.2,
31             debug => 0
32             };
33 1         46 $self->{pattern} = {
34             a => qr/]*>.*?<\/a\s*>/is,
35             href => qr/
36             list => qr/<(ul|dl|ol)(.+)<\/\1>/is,
37             li => qr/(?:]*>|]*>)/is,
38             title => qr/]*>(.*?)<\/title\s*>/is,
39             headline => qr/(\s*(.*?)\s*<\/h\d\s*>)/is,
40             head => qr/]*>.*?<\/head\s*>/is,
41             comment => qr/(?:|<([^>\s]+)[^>]*\s+style=['"]?[^>'"]*(?:display:\s*none|visibility:\s*hidden)[^>'"]*['"]?[^>]*>.*?<\/\1\s*>)/is,
42             special => qr//is,
43             useless => [
44             qr/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/is,
45             qr/]*(?:id|class)\s*=\s*['"]?\S*(?:more|menu|side|navi)\S*["']?[^>]*>/is,
46             ],
47             };
48 1         4 return bless $self, $class;
49             }
50              
51             sub as_text {
52 2     2 1 1883 my $self = shift;
53 2         10 return to_text $self->content;
54             }
55              
56             sub as_html {
57 2     2 1 10 my $self = shift;
58 2         6 return $self->content;
59             }
60              
61             sub extract {
62 2     2 1 15 my $self = shift;;
63 2         13 $self->content = shift;
64 2 50       26 if ($self->content =~ $self->opt->{nocontent}) {
65             # frameset or redirect
66 0         0 $self->content = '';
67 0         0 return $self;
68             }
69 2         46 $self->_extract_title;
70 2         10 $self->_eliminate_head;
71              
72 2         42 $self->_eliminate_useless_symbols;
73 2         48 $self->_eliminate_useless_tags;
74              
75 2         126 my ($factor, $continuous);
76 2         4 $factor = $continuous = 1.0;
77 2         3 my $body = '';
78 2         5 my $score = 0;
79 2         8 my $best = {
80             content => "",
81             score => 0,
82             };
83 2         9 my @list = split $self->opt->{block_separator}, $self->content;
84 2         329 my $flag = 0;
85 2         53 for my $block (@list) {
86 44         171 $block = strip $block;
87 44 100       110 next unless decode $block;
88 13 100       57 $continuous /= $self->opt->{continuous_factor} if length $body;
89              
90             # ignore link list block
91 13         62 my $nolink = $self->_eliminate_links($block);
92 13         21 my $nolinklen = length $nolink;
93 13 100       36 next if $nolinklen < $self->opt->{min_length};
94              
95             # score
96 9         65 my $c = $self->_score($nolink, $factor);
97 9         66 $factor *= $self->opt->{decay_factor};
98              
99             # anti-scoring factors
100 9         47 my $no_body_rate = $self->_no_body_rate($block);
101              
102 9         23 $c *= ($self->opt->{no_body_factor} ** $no_body_rate);
103 9         62 my $c1 = $c * $continuous;
104              
105             # cluster scoring
106 9 100       24 if ($c1 > $self->opt->{threshold}) {
    100          
107 2         11 $flag = 1;
108 2 50       7 print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n"
109             if $self->opt->{debug};
110 2         16 $body .= $block . "\n";
111 2         3 $score += $c1;
112 2         6 $continuous = $self->opt->{continuous_factor};
113             } elsif ($c > $self->opt->{threshold}) {
114 1         11 $flag = 1;
115 1 50       5 print "\n---- end of cluster: $score\n" if $self->opt->{debug};
116 1 50       11 if ($score > $best->{score}) {
117 1 50       3 print "!!!! best: score=$score\n" if $self->opt->{debug};
118 1         10 $best = {
119             content => $body,
120             score => $score,
121             };
122             }
123 1 50       6 print "\n" if $self->opt->{debug};
124 1         9 $body = $block . "\n";
125 1         3 $score = $c;
126 1         5 $continuous = $self->opt->{continuous_factor};
127 1 50       8 print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n"
128             if $self->opt->{debug};
129             } else {
130 6 100       73 $factor /= $self->opt->{decay_factor} if !$flag;
131 6 50       31 print "\n>> reject $c*$continuous=$c1 $nolinklen\n$block\n",
132             "<< reject\n" if $self->opt->{debug};
133             }
134             }
135 2 50       8 print "\n---- end of cluster: $score\n" if $self->opt->{debug};
136 2 100       19 if ($best->{score} < $score) {
137 1 50       4 print "!!!! best: score=$score\n" if $self->opt->{debug};
138 1         8 $best = {
139             content =>$body,
140             score => $score,
141             };
142             }
143 2         12 $self->content = $best->{content};
144              
145 2         21 return $self;
146             }
147              
148             sub _score {
149 9     9   17 my ($self, $nolink, $factor) = @_;
150 9         24 return ((length $nolink)
151             + (match_count $nolink, $self->opt->{punctuations})
152             * $self->opt->{punctuation_weight})
153             * $factor;
154             }
155              
156             sub _no_body_rate {
157 9     9   17 my ($self, $block) = @_;
158 9         3768 return (match_count $block,$self->opt->{waste_expressions})
159             + (match_count $block,$self->opt->{affiliate_expressions})/2.0;
160             }
161              
162             sub _extract_title {
163 2     2   5 my $self = shift;
164 2         2 my $title;
165 2 50       7 if ($self->content =~ $self->{pattern}->{title}) {
166 2         41 $title = strip (strip_tags $1);
167 2 50       8 if (length $title) {
168 2         5 my $pat = $self->{pattern}->{headline};
169 2         10 $self->content =~ s/$pat/
170 4 100       48 (index $title, strip(strip_tags($2))) >= 0 ? "
$2<\/div>" : "$1"/igse;
171             }
172             }
173             }
174              
175             sub _eliminate_head {
176 2     2   4 my $self = shift;
177 2         6 my $pat = $self->{pattern}->{head};
178 2         7 $self->content =~ s/$pat//is;
179             }
180              
181             sub _eliminate_useless_symbols {
182 2     2   4 my $self = shift;
183 2         7 my $comment = $self->{pattern}->{comment};
184 2         4 my $special = $self->{pattern}->{special};
185 2         8 $self->content =~ s/$comment//igs;
186 2         477 $self->content =~ s/$special//igs;
187             }
188              
189             sub _eliminate_useless_tags {
190 2     2   4 my $self = shift;
191 2         3 my @useless = @{$self->{pattern}->{useless}};
  2         9  
192 2         5 for my $pat (@useless) {
193 4         46 $self->content =~ s/$pat//igs;
194             }
195             }
196              
197             sub _eliminate_links {
198 13     13   19 my ($self, $block) = @_;
199 13         43 my $count = match_count $block, $self->{pattern}->{a};
200 13         37 my $nolink = to_text (eliminate_forms (eliminate_links $block));
201 13 50       54 return '' if length $nolink < $self->opt->{min_nolink} * $count;
202 13 50       104 return '' if $self->_is_linklist($block);
203 13         31 return $nolink;
204             }
205              
206             sub _is_linklist {
207 13     13   18 my ($self, $block) = @_;
208 13         29 my $listpat = $self->{pattern}->{list};
209 13 50       83 if ($block =~ $listpat) {
210 0         0 my $list = $2;
211 0         0 my @fragments = split($listpat, $block, 2);
212 0         0 my $nolist = $list;
213 0         0 $nolist =~ s/$listpat//igs;
214 0         0 $nolist = to_text(join($nolist, @fragments));
215 0         0 my @listitems = split $self->{pattern}->{li}, $list;
216 0         0 shift @listitems;
217 0         0 my $rate = 0;
218 0         0 for my $li (@listitems) {
219 0 0       0 $rate++ if $li =~ $self->{pattern}->{href};
220             }
221 0 0       0 $rate = 1.0 * $rate / ($#listitems+1) if $#listitems+1;
222 0         0 $list = to_text $list;
223 0         0 my $limit = ($self->opt->{nolist_ratio}*$rate)
224             * ($rate * (length $list));
225 0         0 return length $nolist < $limit;
226             }
227 13         42 return 0;
228             }
229              
230             1;
231             __END__