File Coverage

blib/lib/NewsExtractor/Role/ContentTextExtractor.pm
Criterion Covered Total %
statement 21 43 48.8
branch 0 10 0.0
condition n/a
subroutine 7 9 77.7
pod n/a
total 28 62 45.1


line stmt bran cond sub pod time code
1             use utf8;
2 1     1   3903 use Moo::Role;
  1         3  
  1         7  
3 1     1   28  
  1         1  
  1         5  
4             use Types::Standard qw(Str Maybe);
5 1     1   321 use List::Util qw(max);
  1         2  
  1         34  
6 1     1   808 use HTML::ExtractContent;
  1         2  
  1         50  
7 1     1   4  
  1         2  
  1         23  
8             use Importer 'NewsExtractor::TextUtil' => qw( html2text );
9 1     1   4 use Importer 'NewsExtractor::Constants' => qw( %RE );
  1         2  
  1         7  
10 1     1   28  
  1         2  
  1         3  
11             has site_name => (
12             is => "lazy",
13             isa => Maybe[Str],
14             );
15              
16             has content_text => (
17             is => "lazy",
18             isa => Maybe[Str],
19             );
20              
21             my ($self) = @_;
22              
23 0     0     my $el = $self->dom->at("meta[property='og:site_name']");
24             if ($el) {
25 0           return $el->attr('content');
26 0 0         }
27 0            
28             return undef;
29             }
30 0            
31             my ($self) = @_;
32             my ($el, $html);
33              
34 0     0     # Cleanup some noisy elements that are known to interfere.
35 0           $self->dom->find('script, style, p.appE1121, div.sexmask, div.cat-list, div#marquee, #setting_weather')->map('remove');
36              
37             my $extractor = HTML::ExtractContent->new;
38 0           if ($el = $self->dom->at('article')) {
39             $html = $extractor->extract("$el")->as_html;
40 0           } else {
41 0 0         $html = $extractor->extract( $self->dom->to_string )->as_html;
42 0           }
43              
44 0           my $text = html2text( $html );
45              
46             my @paragraphs = split(/\n\n/, $text) or return undef;
47 0            
48             if (my $site_name = $self->site_name) {
49 0 0         $paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* ${site_name} \s* \p{Punct}? \s* \z//x;
50             $paragraphs[-1] =~ s/${site_name}//x;
51 0 0         }
52 0            
53 0           $paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* $RE{newspaper_names} \s* \p{Punct}? \s* \z//x;
54              
55             if (max( map { length($_) } @paragraphs ) < 30) {
56 0           # err "[$$] Not enough contents";
57             return undef;
58 0 0         }
  0            
59              
60 0           return join "\n\n", @paragraphs;
61             }
62              
63 0           1;