File Coverage

blib/lib/NewsExtractor/Extractor.pm
Criterion Covered Total %
statement 171 180 95.0
branch 0 4 0.0
condition n/a
subroutine 57 58 98.2
pod n/a
total 228 242 94.2


line stmt bran cond sub pod time code
1             use Moo;
2 1     1   6 extends 'NewsExtractor::TXExtractor';
  1         2  
  1         5  
3              
4             use Mojo::Transaction::HTTP;
5 1     1   282 use Mojo::URL;
  1         2  
  1         9  
6 1     1   23 use Types::Standard qw(InstanceOf);
  1         2  
  1         5  
7 1     1   22 use NewsExtractor::CSSRuleSet;
  1         2  
  1         5  
8 1     1   837 use NewsExtractor::CSSExtractor;
  1         3  
  1         33  
9 1     1   332 use NewsExtractor::JSONLDExtractor;
  1         2  
  1         29  
10 1     1   348 use NewsExtractor::GenericExtractor;
  1         4  
  1         26  
11 1     1   354 use NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw;
  1         3  
  1         33  
12 1     1   371 use NewsExtractor::SiteSpecificExtractor::www_allnews_tw;
  1         2  
  1         29  
13 1     1   343 use NewsExtractor::SiteSpecificExtractor::www_peopo_org;
  1         3  
  1         26  
14 1     1   325 use NewsExtractor::SiteSpecificExtractor::www_ntdtv_com;
  1         3  
  1         25  
15 1     1   323 use NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw;
  1         3  
  1         28  
16 1     1   309 use NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw;
  1         2  
  1         27  
17 1     1   348 use NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com;
  1         3  
  1         25  
18 1     1   319 use NewsExtractor::SiteSpecificExtractor::www_rti_org_tw;
  1         3  
  1         24  
19 1     1   322 use NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw;
  1         2  
  1         25  
20 1     1   321 use NewsExtractor::SiteSpecificExtractor::www_setn_com;
  1         3  
  1         25  
21 1     1   315 use NewsExtractor::SiteSpecificExtractor::news_tnn_tw;
  1         7  
  1         31  
22 1     1   332 use NewsExtractor::SiteSpecificExtractor::turnnewsapp_com;
  1         3  
  1         26  
23 1     1   316 use NewsExtractor::SiteSpecificExtractor::news_cts_com_tw;
  1         2  
  1         26  
24 1     1   321 use NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw;
  1         3  
  1         29  
25 1     1   307 use NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw;
  1         3  
  1         28  
26 1     1   331 use NewsExtractor::SiteSpecificExtractor::www_upmedia_mg;
  1         3  
  1         64  
27 1     1   340 use NewsExtractor::SiteSpecificExtractor::ctee_com_tw;
  1         3  
  1         37  
28 1     1   323 use NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw;
  1         3  
  1         25  
29 1     1   316 use NewsExtractor::SiteSpecificExtractor::newnet_tw;
  1         3  
  1         25  
30 1     1   327 use NewsExtractor::SiteSpecificExtractor::www_thestandnews_com;
  1         3  
  1         26  
31 1     1   318 use NewsExtractor::SiteSpecificExtractor::www_epochtimes_com;
  1         3  
  1         25  
32 1     1   315 use NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw;
  1         2  
  1         25  
33 1     1   305 use NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw;
  1         2  
  1         33  
34 1     1   325 use NewsExtractor::SiteSpecificExtractor::www_idn_com_tw;
  1         3  
  1         26  
35 1     1   308 use NewsExtractor::SiteSpecificExtractor::www_fountmedia_io;
  1         2  
  1         25  
36 1     1   318 use NewsExtractor::SiteSpecificExtractor::news_pts_org_tw;
  1         2  
  1         26  
37 1     1   366 use NewsExtractor::SiteSpecificExtractor::www_twreporter_org;
  1         3  
  1         24  
38 1     1   344 use NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw;
  1         4  
  1         26  
39 1     1   337 use NewsExtractor::SiteSpecificExtractor::hk_crntt_com;
  1         3  
  1         25  
40 1     1   320 use NewsExtractor::SiteSpecificExtractor::hk_on_cc;
  1         3  
  1         25  
41 1     1   365 use NewsExtractor::SiteSpecificExtractor::www_hkcna_hk;
  1         2  
  1         25  
42 1     1   317 use NewsExtractor::SiteSpecificExtractor::www_hkcnews_com;
  1         3  
  1         26  
43 1     1   320 use NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com;
  1         2  
  1         23  
44 1     1   330 use NewsExtractor::SiteSpecificExtractor::news_cctv_com;
  1         3  
  1         24  
45 1     1   317 use NewsExtractor::SiteSpecificExtractor::m_news_cctv_com;
  1         2  
  1         25  
46 1     1   339 use NewsExtractor::SiteSpecificExtractor::focustaiwan_tw;
  1         2  
  1         28  
47 1     1   324 use NewsExtractor::SiteSpecificExtractor::newtalk_tw;
  1         3  
  1         28  
48 1     1   324 use NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw;
  1         3  
  1         25  
49 1     1   325 use NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw;
  1         3  
  1         25  
50 1     1   316 use NewsExtractor::SiteSpecificExtractor::www_mdnkids_com;
  1         2  
  1         25  
51 1     1   376 use NewsExtractor::SiteSpecificExtractor::www_nownews_com;
  1         3  
  1         26  
52 1     1   319 use NewsExtractor::SiteSpecificExtractor::www_penghutimes_com;
  1         3  
  1         27  
53 1     1   323 use NewsExtractor::SiteSpecificExtractor::www_aljazeera_com;
  1         3  
  1         25  
54 1     1   388 use NewsExtractor::SiteSpecificExtractor::www_bbc_com;
  1         2  
  1         320  
55 1     1   337 use NewsExtractor::SiteSpecificExtractor::yimedia_com_tw;
  1         2  
  1         26  
56 1     1   327 use NewsExtractor::SiteSpecificExtractor::UDN;
  1         2  
  1         25  
57 1     1   317 use NewsExtractor::SiteSpecificExtractor::ETtoday;
  1         2  
  1         26  
58 1     1   317 use NewsExtractor::SiteSpecificExtractor::ChinaTimes;
  1         3  
  1         26  
59 1     1   344  
  1         2  
  1         227  
60             has extractor => (
61             required => 0,
62             is => 'lazy',
63             isa => InstanceOf["NewsExtractor::CSSExtractor",
64             "NewsExtractor::JSONLDExtractor",
65             "NewsExtractor::SiteSpecificExtractor",
66             "NewsExtractor::GenericExtractor"],
67             builder => 1,
68             handles => [qw( headline dateline journalist content_text )],
69             );
70              
71             use constant {
72             SiteSpecificExtractorByHost => {
73 1         258 'www.bbc.com' => 'NewsExtractor::SiteSpecificExtractor::www_bbc_com',
74             'www.aljazeera.com' => 'NewsExtractor::SiteSpecificExtractor::www_aljazeera_com',
75             'www.penghutimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_penghutimes_com',
76             'www.ustv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw',
77             'www.epochtimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_epochtimes_com',
78             'www.hkcnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_hkcnews_com',
79             'www.thestandnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_thestandnews_com',
80             'www.allnews.tw' => 'NewsExtractor::SiteSpecificExtractor::www_allnews_tw',
81             'www.rvn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw',
82             'www.chinatimes.com' => 'NewsExtractor::SiteSpecificExtractor::ChinaTimes',
83             'video.udn.com' => 'NewsExtractor::JSONLDExtractor',
84             'www.ctwant.com' => 'NewsExtractor::JSONLDExtractor',
85             'www.peopo.org' => 'NewsExtractor::SiteSpecificExtractor::www_peopo_org',
86             'www.ntdtv.com' => 'NewsExtractor::SiteSpecificExtractor::www_ntdtv_com',
87             'www.ksnews.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw',
88             'news.tvbs.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw',
89             'udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
90             'stars.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
91             'money.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
92             'house.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
93             'opinion.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
94             'www.taipeitimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com',
95             'www.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
96             'star.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
97             'house.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
98             'health.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
99             'www.rti.org.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rti_org_tw',
100             'www.bcc.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw',
101             'www.setn.com' => 'NewsExtractor::SiteSpecificExtractor::www_setn_com',
102             'news.tnn.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tnn_tw',
103             'turnnewsapp.com' => 'NewsExtractor::SiteSpecificExtractor::turnnewsapp_com',
104             'news.cts.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_cts_com_tw',
105             'talk.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw',
106             'estate.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw',
107             'www.upmedia.mg' => 'NewsExtractor::SiteSpecificExtractor::www_upmedia_mg',
108             'ctee.com.tw' => 'NewsExtractor::SiteSpecificExtractor::ctee_com_tw',
109             'news.ebc.net.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw',
110             'newnet.tw' => 'NewsExtractor::SiteSpecificExtractor::newnet_tw',
111             'www.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw',
112             'news.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw',
113             'www.idn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_idn_com_tw',
114             'www.fountmedia.io' => 'NewsExtractor::SiteSpecificExtractor::www_fountmedia_io',
115             'news.pts.org.tw' => 'NewsExtractor::SiteSpecificExtractor::news_pts_org_tw',
116             'www.twreporter.org' => 'NewsExtractor::SiteSpecificExtractor::www_twreporter_org',
117             'new.ctv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw',
118             'hk.crntt.com' => 'NewsExtractor::SiteSpecificExtractor::hk_crntt_com',
119             'hk.on.cc' => 'NewsExtractor::SiteSpecificExtractor::hk_on_cc',
120             'www.hkcna.hk' => 'NewsExtractor::SiteSpecificExtractor::www_hkcna_hk',
121             'www.xinhuanet.com' => 'NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com',
122             'news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::news_cctv_com',
123             'm.news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::m_news_cctv_com',
124             'focustaiwan.tw' => 'NewsExtractor::SiteSpecificExtractor::focustaiwan_tw',
125             'newtalk.tw' => 'NewsExtractor::SiteSpecificExtractor::newtalk_tw',
126             'www.digitimes.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw',
127             'www.mdnkids.com' => 'NewsExtractor::SiteSpecificExtractor::www_mdnkids_com',
128             'www.nownews.com' => 'NewsExtractor::SiteSpecificExtractor::www_nownews_com',
129             'yimedia.com.tw' => 'NewsExtractor::SiteSpecificExtractor::yimedia_com_tw',
130             },
131             CSSRuleSetByHost => {
132             'www.eventsinfocus.org' => {
133             headline => 'h1.title > span',
134             dateline => 'div.content time.datetime',
135             journalist => 'div.content div.node__content div.clearfix.text-formatted > p:nth-child(1)',
136             content_text => 'div.content article div.clearfix.text-formatted',
137             },
138             'www.5ch.com.tw' => {
139             headline => 'h3.m-ti',
140             dateline => 'div.more-about div.date',
141             journalist => 'div.more-about div.reporter',
142             content_text => 'div.text-edit',
143             },
144             'www.cw.com.tw' => {
145             headline => 'div.article__head h1',
146             dateline => 'div.article__detail > time',
147             journalist => 'div.author--item > a',
148             content_text => 'div.article__content',
149             },
150             'www.taiwannews.com.tw' => {
151             headline => 'h1.article-title',
152             dateline => 'div.article-date',
153             journalist => 'div.article-author',
154             content_text => 'article.article',
155             },
156             'www.enewstw.com' => {
157             headline => 'td.blog_title > strong',
158             dateline => 'td.blog_title tr:nth-child(2) > td.blog',
159             journalist => 'td.blog_title tr:nth-child(1) > td.blog',
160             content_text => 'td.new_t p',
161             },
162             'www.storm.mg' => {
163             headline => 'h1#article_title',
164             dateline => 'span#info_time',
165             journalist => '#article_info_wrapper #author_block a.link_author > span.info_author',
166             content_text => 'div#article_inner_wrapper > article:nth-child(1)',
167             }
168             }
169             };
170 1     1   7  
  1         1  
171             my ($self) = @_;
172             my $url = $self->tx->req->url;
173 0     0     my $host = $url->host;
174 0           my $extractor;
175 0           if (my $sel = CSSRuleSetByHost->{$host}) {
176 0           $extractor = NewsExtractor::CSSExtractor->new(
177 0 0         css_selector => NewsExtractor::CSSRuleSet->new(%$sel),
    0          
178 0           tx => $self->tx
179             );
180             } elsif (my $extractor_class = SiteSpecificExtractorByHost->{$host}) {
181             $extractor = $extractor_class->new( tx => $self->tx );
182             } else {
183 0           $extractor = NewsExtractor::GenericExtractor->new( tx => $self->tx );
184             }
185 0           return $extractor;
186             }
187 0            
188             1;