File Coverage

blib/lib/NewsExtractor/Role/ContentTextExtractor.pm
Criterion Covered Total %
statement 21 43 48.8
branch 0 10 0.0
condition n/a
subroutine 7 9 77.7
pod n/a
total 28 62 45.1


line stmt bran cond sub pod time code
1             package NewsExtractor::Role::ContentTextExtractor;
2 1     1   5985 use utf8;
  1         4  
  1         10  
3 1     1   39 use Moo::Role;
  1         3  
  1         8  
4              
5 1     1   452 use Types::Standard qw(Str Maybe);
  1         2  
  1         16  
6 1     1   1132 use List::Util qw(max);
  1         2  
  1         92  
7 1     1   7 use HTML::ExtractContent;
  1         2  
  1         30  
8              
9 1     1   6 use Importer 'NewsExtractor::TextUtil' => qw( html2text );
  1         3  
  1         10  
10 1     1   35 use Importer 'NewsExtractor::Constants' => qw( %RE );
  1         3  
  1         4  
11              
# Site name for the page. Lazy: computed on first access by
# _build_site_name. May be undef.
has 'site_name' => ( is => 'lazy', isa => Maybe[Str] );

# Extracted body text for the page. Lazy: computed on first access by
# _build_content_text. May be undef.
has 'content_text' => ( is => 'lazy', isa => Maybe[Str] );
21              
# Builder for the lazy `site_name` attribute.
#
# Looks up the element matching meta[property='og:site_name'] in
# $self->dom and returns its `content` attribute. Returns undef when no
# such element is found.
sub _build_site_name {
    my $self = shift;

    my $meta = $self->dom->at("meta[property='og:site_name']");
    return undef unless $meta;

    return $meta->attr('content');
}
32              
# Builder for the lazy `content_text` attribute.
#
# Steps:
#   1. Remove a fixed list of noisy elements from $self->dom. This mutates
#      the DOM in place.
#   2. Run HTML::ExtractContent over the <article> element when one exists,
#      otherwise over the whole document, and convert the resulting HTML to
#      text with html2text().
#   3. Split the text into paragraphs on blank lines, then strip a trailing
#      site name / newspaper name from the last paragraph.
#
# Returns the paragraphs joined by "\n\n". Returns undef when the split
# yields no paragraphs or when the longest paragraph has a length below 30.
sub _build_content_text {
    my ($self) = @_;
    my ($el, $html);

    # Cleanup some noisy elements that are known to interfere.
    $self->dom->find('script, style, p.appE1121, div.sexmask, div.cat-list, div#marquee, #setting_weather')->map('remove');

    my $extractor = HTML::ExtractContent->new;
    if ($el = $self->dom->at('article')) {
        # "$el" stringifies the element so the extractor receives markup.
        $html = $extractor->extract("$el")->as_html;
    } else {
        $html = $extractor->extract( $self->dom->to_string )->as_html;
    }

    my $text = html2text( $html );

    # `or` binds looser than `=`, so it tests the list assignment itself,
    # whose scalar value is the number of paragraphs produced by split.
    my @paragraphs = split(/\n\n/, $text) or return undef;

    if (my $site_name = $self->site_name) {
        # The site name is literal text taken from the page, not a pattern.
        # \Q...\E is required: interpolated raw, its regex metacharacters
        # would be interpreted (and could make the pattern fail to compile),
        # and under /x any whitespace inside the name would be ignored.
        $paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* \Q${site_name}\E \s* \p{Punct}? \s* \z//x;
        $paragraphs[-1] =~ s/\Q${site_name}\E//;
    }

    $paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* $RE{newspaper_names} \s* \p{Punct}? \s* \z//x;

    if (max( map { length($_) } @paragraphs ) < 30) {
        # err "[$$] Not enough contents";
        return undef;
    }

    return join "\n\n", @paragraphs;
}
65              
66             1;