| blib/lib/HTML/ExtractContent.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 115 | 131 | 87.7 |
| branch | 31 | 50 | 62.0 |
| condition | n/a | ||
| subroutine | 19 | 19 | 100.0 |
| pod | 4 | 4 | 100.0 |
| total | 169 | 204 | 82.8 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package HTML::ExtractContent; | ||||||
| 2 | 2 | 2 | 879 | use strict; | |||
| 2 | 4 | ||||||
| 2 | 79 | ||||||
| 3 | 2 | 2 | 10 | use warnings; | |||
| 2 | 4 | ||||||
| 2 | 68 | ||||||
| 4 | 2 | 2 | 1065 | use HTML::ExtractContent::Util; | |||
| 2 | 31 | ||||||
| 2 | 18 | ||||||
| 5 | 2 | 2 | 433 | use List::Util qw(reduce); | |||
| 2 | 5 | ||||||
| 2 | 302 | ||||||
| 6 | 2 | 2 | 11 | use utf8; | |||
| 2 | 4 | ||||||
| 2 | 16 | ||||||
| 7 | 2 | 2 | 61 | use base qw(Class::Accessor::Lvalue::Fast); | |||
| 2 | 4 | ||||||
| 2 | 2092 | ||||||
| 8 | our $VERSION = '0.10'; | ||||||
| 9 | __PACKAGE__->mk_accessors(qw(opt content)); | ||||||
| 10 | |||||||
| 11 | sub new { | ||||||
| 12 | 1 | 1 | 1 | 636 | my ($class, $opt) = @_; | ||
| 13 | 1 | 18 | my $self = $class->SUPER::new($opt); | ||||
| 14 | 1 | 56 | $self->opt = { | ||||
| 15 | threshold => 60, # threhold for score of clusters | ||||||
| 16 | min_length => 30, # minimum length of blocks | ||||||
| 17 | decay_factor => 0.75, # decay factor for block scores | ||||||
| 18 | no_body_factor => 0.72, | ||||||
| 19 | continuous_factor => 1.62, # continuous factor for block scores | ||||||
| 20 | punctuation_weight => 10, # score weight for punctuations | ||||||
| 21 | 2 | 2 | 23003 | punctuations => qr/(?:[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/is, | |||
| 2 | 6 | ||||||
| 2 | 51 | ||||||
| 22 | waste_expressions => qr/Copyright|All\s*Rights?\s*Reserved?/is, | ||||||
| 23 | # characteristic keywords including footer | ||||||
| 24 | affiliate_expressions => | ||||||
| 25 | qr/amazon[a-z0-9\.\/\-\?&]+-22/is, | ||||||
| 26 | block_separator => qr/<\/?(?:div|center|td)[^>]*>| ]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/is, |
||||||
| 27 | # nocontent => qr/<\/frameset>|]*url/is, | ||||||
| 28 | nocontent => qr/<\/frameset>/is, | ||||||
| 29 | min_nolink => 8, | ||||||
| 30 | nolist_ratio => 0.2, | ||||||
| 31 | debug => 0 | ||||||
| 32 | }; | ||||||
| 33 | 1 | 46 | $self->{pattern} = { | ||||
| 34 | a => qr/]*>.*?<\/a\s*>/is, | ||||||
| 35 | href => qr/ | ||||||
| 36 | list => qr/<(ul|dl|ol)(.+)<\/\1>/is, | ||||||
| 37 | li => qr/(?: |
||||||
| 38 | title => qr/ |
||||||
| 39 | headline => qr/( |
||||||
| 40 | head => qr/]*>.*?<\/head\s*>/is, | ||||||
| 41 | comment => qr/(?:|<([^>\s]+)[^>]*\s+style=['"]?[^>'"]*(?:display:\s*none|visibility:\s*hidden)[^>'"]*['"]?[^>]*>.*?<\/\1\s*>)/is, | ||||||
| 42 | special => qr//is, | ||||||
| 43 | useless => [ | ||||||
| 44 | qr/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/is, | ||||||
| 45 | qr/ ]*(?:id|class)\s*=\s*['"]?\S*(?:more|menu|side|navi)\S*["']?[^>]*>/is, |
||||||
| 46 | ], | ||||||
| 47 | }; | ||||||
| 48 | 1 | 4 | return bless $self, $class; | ||||
| 49 | } | ||||||
| 50 | |||||||
| 51 | sub as_text { | ||||||
| 52 | 2 | 2 | 1 | 1883 | my $self = shift; | ||
| 53 | 2 | 10 | return to_text $self->content; | ||||
| 54 | } | ||||||
| 55 | |||||||
| 56 | sub as_html { | ||||||
| 57 | 2 | 2 | 1 | 10 | my $self = shift; | ||
| 58 | 2 | 6 | return $self->content; | ||||
| 59 | } | ||||||
| 60 | |||||||
| 61 | sub extract { | ||||||
| 62 | 2 | 2 | 1 | 15 | my $self = shift;; | ||
| 63 | 2 | 13 | $self->content = shift; | ||||
| 64 | 2 | 50 | 26 | if ($self->content =~ $self->opt->{nocontent}) { | |||
| 65 | # frameset or redirect | ||||||
| 66 | 0 | 0 | $self->content = ''; | ||||
| 67 | 0 | 0 | return $self; | ||||
| 68 | } | ||||||
| 69 | 2 | 46 | $self->_extract_title; | ||||
| 70 | 2 | 10 | $self->_eliminate_head; | ||||
| 71 | |||||||
| 72 | 2 | 42 | $self->_eliminate_useless_symbols; | ||||
| 73 | 2 | 48 | $self->_eliminate_useless_tags; | ||||
| 74 | |||||||
| 75 | 2 | 126 | my ($factor, $continuous); | ||||
| 76 | 2 | 4 | $factor = $continuous = 1.0; | ||||
| 77 | 2 | 3 | my $body = ''; | ||||
| 78 | 2 | 5 | my $score = 0; | ||||
| 79 | 2 | 8 | my $best = { | ||||
| 80 | content => "", | ||||||
| 81 | score => 0, | ||||||
| 82 | }; | ||||||
| 83 | 2 | 9 | my @list = split $self->opt->{block_separator}, $self->content; | ||||
| 84 | 2 | 329 | my $flag = 0; | ||||
| 85 | 2 | 53 | for my $block (@list) { | ||||
| 86 | 44 | 171 | $block = strip $block; | ||||
| 87 | 44 | 100 | 110 | next unless decode $block; | |||
| 88 | 13 | 100 | 57 | $continuous /= $self->opt->{continuous_factor} if length $body; | |||
| 89 | |||||||
| 90 | # ignore link list block | ||||||
| 91 | 13 | 62 | my $nolink = $self->_eliminate_links($block); | ||||
| 92 | 13 | 21 | my $nolinklen = length $nolink; | ||||
| 93 | 13 | 100 | 36 | next if $nolinklen < $self->opt->{min_length}; | |||
| 94 | |||||||
| 95 | # score | ||||||
| 96 | 9 | 65 | my $c = $self->_score($nolink, $factor); | ||||
| 97 | 9 | 66 | $factor *= $self->opt->{decay_factor}; | ||||
| 98 | |||||||
| 99 | # anti-scoring factors | ||||||
| 100 | 9 | 47 | my $no_body_rate = $self->_no_body_rate($block); | ||||
| 101 | |||||||
| 102 | 9 | 23 | $c *= ($self->opt->{no_body_factor} ** $no_body_rate); | ||||
| 103 | 9 | 62 | my $c1 = $c * $continuous; | ||||
| 104 | |||||||
| 105 | # cluster scoring | ||||||
| 106 | 9 | 100 | 24 | if ($c1 > $self->opt->{threshold}) { | |||
| 100 | |||||||
| 107 | 2 | 11 | $flag = 1; | ||||
| 108 | 2 | 50 | 7 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | |||
| 109 | if $self->opt->{debug}; | ||||||
| 110 | 2 | 16 | $body .= $block . "\n"; | ||||
| 111 | 2 | 3 | $score += $c1; | ||||
| 112 | 2 | 6 | $continuous = $self->opt->{continuous_factor}; | ||||
| 113 | } elsif ($c > $self->opt->{threshold}) { | ||||||
| 114 | 1 | 11 | $flag = 1; | ||||
| 115 | 1 | 50 | 5 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
| 116 | 1 | 50 | 11 | if ($score > $best->{score}) { | |||
| 117 | 1 | 50 | 3 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
| 118 | 1 | 10 | $best = { | ||||
| 119 | content => $body, | ||||||
| 120 | score => $score, | ||||||
| 121 | }; | ||||||
| 122 | } | ||||||
| 123 | 1 | 50 | 6 | print "\n" if $self->opt->{debug}; | |||
| 124 | 1 | 9 | $body = $block . "\n"; | ||||
| 125 | 1 | 3 | $score = $c; | ||||
| 126 | 1 | 5 | $continuous = $self->opt->{continuous_factor}; | ||||
| 127 | 1 | 50 | 8 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | |||
| 128 | if $self->opt->{debug}; | ||||||
| 129 | } else { | ||||||
| 130 | 6 | 100 | 73 | $factor /= $self->opt->{decay_factor} if !$flag; | |||
| 131 | 6 | 50 | 31 | print "\n>> reject $c*$continuous=$c1 $nolinklen\n$block\n", | |||
| 132 | "<< reject\n" if $self->opt->{debug}; | ||||||
| 133 | } | ||||||
| 134 | } | ||||||
| 135 | 2 | 50 | 8 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
| 136 | 2 | 100 | 19 | if ($best->{score} < $score) { | |||
| 137 | 1 | 50 | 4 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
| 138 | 1 | 8 | $best = { | ||||
| 139 | content =>$body, | ||||||
| 140 | score => $score, | ||||||
| 141 | }; | ||||||
| 142 | } | ||||||
| 143 | 2 | 12 | $self->content = $best->{content}; | ||||
| 144 | |||||||
| 145 | 2 | 21 | return $self; | ||||
| 146 | } | ||||||
| 147 | |||||||
| 148 | sub _score { | ||||||
| 149 | 9 | 9 | 17 | my ($self, $nolink, $factor) = @_; | |||
| 150 | 9 | 24 | return ((length $nolink) | ||||
| 151 | + (match_count $nolink, $self->opt->{punctuations}) | ||||||
| 152 | * $self->opt->{punctuation_weight}) | ||||||
| 153 | * $factor; | ||||||
| 154 | } | ||||||
| 155 | |||||||
| 156 | sub _no_body_rate { | ||||||
| 157 | 9 | 9 | 17 | my ($self, $block) = @_; | |||
| 158 | 9 | 3768 | return (match_count $block,$self->opt->{waste_expressions}) | ||||
| 159 | + (match_count $block,$self->opt->{affiliate_expressions})/2.0; | ||||||
| 160 | } | ||||||
| 161 | |||||||
| 162 | sub _extract_title { | ||||||
| 163 | 2 | 2 | 5 | my $self = shift; | |||
| 164 | 2 | 2 | my $title; | ||||
| 165 | 2 | 50 | 7 | if ($self->content =~ $self->{pattern}->{title}) { | |||
| 166 | 2 | 41 | $title = strip (strip_tags $1); | ||||
| 167 | 2 | 50 | 8 | if (length $title) { | |||
| 168 | 2 | 5 | my $pat = $self->{pattern}->{headline}; | ||||
| 169 | 2 | 10 | $self->content =~ s/$pat/ | ||||
| 170 | 4 | 100 | 48 | (index $title, strip(strip_tags($2))) >= 0 ? " $2<\/div>" : "$1"/igse; |
|||
| 171 | } | ||||||
| 172 | } | ||||||
| 173 | } | ||||||
| 174 | |||||||
| 175 | sub _eliminate_head { | ||||||
| 176 | 2 | 2 | 4 | my $self = shift; | |||
| 177 | 2 | 6 | my $pat = $self->{pattern}->{head}; | ||||
| 178 | 2 | 7 | $self->content =~ s/$pat//is; | ||||
| 179 | } | ||||||
| 180 | |||||||
| 181 | sub _eliminate_useless_symbols { | ||||||
| 182 | 2 | 2 | 4 | my $self = shift; | |||
| 183 | 2 | 7 | my $comment = $self->{pattern}->{comment}; | ||||
| 184 | 2 | 4 | my $special = $self->{pattern}->{special}; | ||||
| 185 | 2 | 8 | $self->content =~ s/$comment//igs; | ||||
| 186 | 2 | 477 | $self->content =~ s/$special//igs; | ||||
| 187 | } | ||||||
| 188 | |||||||
| 189 | sub _eliminate_useless_tags { | ||||||
| 190 | 2 | 2 | 4 | my $self = shift; | |||
| 191 | 2 | 3 | my @useless = @{$self->{pattern}->{useless}}; | ||||
| 2 | 9 | ||||||
| 192 | 2 | 5 | for my $pat (@useless) { | ||||
| 193 | 4 | 46 | $self->content =~ s/$pat//igs; | ||||
| 194 | } | ||||||
| 195 | } | ||||||
| 196 | |||||||
| 197 | sub _eliminate_links { | ||||||
| 198 | 13 | 13 | 19 | my ($self, $block) = @_; | |||
| 199 | 13 | 43 | my $count = match_count $block, $self->{pattern}->{a}; | ||||
| 200 | 13 | 37 | my $nolink = to_text (eliminate_forms (eliminate_links $block)); | ||||
| 201 | 13 | 50 | 54 | return '' if length $nolink < $self->opt->{min_nolink} * $count; | |||
| 202 | 13 | 50 | 104 | return '' if $self->_is_linklist($block); | |||
| 203 | 13 | 31 | return $nolink; | ||||
| 204 | } | ||||||
| 205 | |||||||
| 206 | sub _is_linklist { | ||||||
| 207 | 13 | 13 | 18 | my ($self, $block) = @_; | |||
| 208 | 13 | 29 | my $listpat = $self->{pattern}->{list}; | ||||
| 209 | 13 | 50 | 83 | if ($block =~ $listpat) { | |||
| 210 | 0 | 0 | my $list = $2; | ||||
| 211 | 0 | 0 | my @fragments = split($listpat, $block, 2); | ||||
| 212 | 0 | 0 | my $nolist = $list; | ||||
| 213 | 0 | 0 | $nolist =~ s/$listpat//igs; | ||||
| 214 | 0 | 0 | $nolist = to_text(join($nolist, @fragments)); | ||||
| 215 | 0 | 0 | my @listitems = split $self->{pattern}->{li}, $list; | ||||
| 216 | 0 | 0 | shift @listitems; | ||||
| 217 | 0 | 0 | my $rate = 0; | ||||
| 218 | 0 | 0 | for my $li (@listitems) { | ||||
| 219 | 0 | 0 | 0 | $rate++ if $li =~ $self->{pattern}->{href}; | |||
| 220 | } | ||||||
| 221 | 0 | 0 | 0 | $rate = 1.0 * $rate / ($#listitems+1) if $#listitems+1; | |||
| 222 | 0 | 0 | $list = to_text $list; | ||||
| 223 | 0 | 0 | my $limit = ($self->opt->{nolist_ratio}*$rate) | ||||
| 224 | * ($rate * (length $list)); | ||||||
| 225 | 0 | 0 | return length $nolist < $limit; | ||||
| 226 | } | ||||||
| 227 | 13 | 42 | return 0; | ||||
| 228 | } | ||||||
| 229 | |||||||
| 230 | 1; | ||||||
| 231 | __END__ |