File Coverage

blib/lib/BusyBird/Input/Feed.pm
Criterion Covered Total %
statement 106 131 80.9
branch 30 56 53.5
condition 10 21 47.6
subroutine 22 25 88.0
pod 4 4 100.0
total 172 237 72.5


line stmt bran cond sub pod time code
1             package BusyBird::Input::Feed;
2 4     4   216906 use strict;
  4         9  
  4         91  
3 4     4   19 use warnings;
  4         8  
  4         75  
4 4     4   2305 use XML::FeedPP;
  4         81152  
  4         116  
5 4     4   2032 use DateTime::Format::ISO8601;
  4         2046615  
  4         186  
6 4     4   2058 use BusyBird::DateTime::Format;
  4         15558  
  4         160  
7 4     4   35 use DateTime;
  4         11  
  4         93  
8 4     4   24 use Try::Tiny;
  4         12  
  4         219  
9 4     4   27 use Carp;
  4         9  
  4         202  
10 4     4   1384 use WWW::Favicon ();
  4         233459  
  4         101  
11 4     4   54 use LWP::UserAgent;
  4         11  
  4         67  
12 4     4   17 use URI;
  4         8  
  4         4234  
13              
14             our $VERSION = "0.06";
15              
16             our @CARP_NOT = qw(Try::Tiny XML::FeedPP);
17              
18             sub new {
19 4     4 1 1804 my ($class, %args) = @_;
20             my $self = bless {
21             use_favicon => defined($args{use_favicon}) ? $args{use_favicon} : 1,
22             favicon_detector => WWW::Favicon->new,
23             user_agent => defined($args{user_agent}) ? $args{user_agent} : do {
24 4         39621 my $ua = LWP::UserAgent->new;
25 4         941 $ua->env_proxy;
26 4         457 $ua->timeout(30);
27 4         56 $ua->agent("BusyBird::Inpu::Feed-$VERSION"); ## some Web sites ban LWP::UserAgent's default UserAgent...
28 4         259 $ua;
29             },
30 4 50       48 image_max_num => defined($args{image_max_num}) ? $args{image_max_num} : 3,
    50          
    100          
31             }, $class;
32              
33             ## Note that WWW::Favicon#ua accessor method is not documented (as of version 0.03001)
34 4         34 $self->{favicon_detector}->ua($self->{user_agent});
35            
36 4         104 return $self;
37             }
38              
39             sub _get_url_head_and_dir {
40 520     520   9113 my ($url_raw) = @_;
41 520 50       1222 return (undef, undef) if not defined $url_raw;
42 520         1913 my $url = URI->new($url_raw);
43 520         50941 my $scheme = $url->scheme;
44 520         7929 my $authority = $url->authority;
45 520 50 33     6710 return (undef, undef) if !$scheme || !$authority;
46 520         1171 my $url_head = "$scheme://$authority";
47 520         760 my $url_dir;
48 520         1423 my $path = $url->path;
49 520 50       6091 if($path =~ m{^(.*/)}i) {
50 520         1123 $url_dir = $1;
51             }else {
52 0         0 $url_dir = "/";
53             }
54 520         1620 return ($url_head, $url_dir);
55             }
56              
57             sub _extract_image_urls {
58 545     545   1132 my ($self, $feed_item) = @_;
59 545 100       1494 return () if $self->{image_max_num} == 0;
60 520         1519 my $content = $feed_item->description;
61 520 50       33254 return () if !defined($content);
62 520         1247 my ($url_head, $url_dir) = _get_url_head_and_dir($feed_item->link);
63 520         1122 my @urls = ();
64 520   100     8460 while(($self->{image_max_num} < 0 || @urls < $self->{image_max_num})
      100        
65             && $content =~ m{<\s*img\s+[^>]*src\s*=\s*(['"])([^>]+?)\1[^>]*>}ig) {
66 558         9908 my $url = URI->new($2);
67 558 100       35932 if(!$url->scheme) {
68             ## Only "path" segment is in the src attribute.
69 27 50 33     449 next if !defined($url_head) || !defined($url_dir);
70 27 100       91 if(substr("$url", 0, 1) eq "/") {
71 21         170 $url = "$url_head$url";
72             }else {
73 6         68 $url = "$url_head$url_dir$url";
74             }
75             }
76 558         8367 push @urls, "$url";
77             }
78 520         3029 return @urls;
79             }
80              
81             sub _get_home_url {
82 0     0   0 my ($self, $feed, $statuses) = @_;
83 0         0 my $home_url = $feed->link;
84 0 0 0     0 if(defined($home_url) && $home_url =~ m{^https?://}i) {
85 0         0 return $home_url;
86             }
87            
88 0         0 foreach my $status (@$statuses) {
89 0 0       0 $home_url = $status->{busybird}{status_permalink} if defined($status->{busybird});
90 0 0       0 return $home_url if defined $home_url;
91             }
92 0         0 return undef;
93             }
94              
95             sub _get_favicon_url {
96 0     0   0 my ($self, $feed, $statuses) = @_;
97             return try {
98 0     0   0 my $home_url = $self->_get_home_url($feed, $statuses);
99 0 0       0 return undef if not defined $home_url;
100 0         0 my $favicon_url = $self->{favicon_detector}->detect($home_url);
101 0 0       0 return undef if not defined $favicon_url;
102 0         0 my $res = $self->{user_agent}->get($favicon_url);
103 0 0       0 return undef if !$res->is_success;
104 0         0 my $type = $res->header('Content-Type');
105 0 0 0     0 return undef if defined($type) && $type !~ /^image/i;
106 0         0 return $favicon_url;
107 0         0 };
108             }
109              
110             sub _make_timestamp_datetime {
111 545     545   20092 my ($self, $timestamp_str) = @_;
112 545 100       1410 return undef if not defined $timestamp_str;
113 500 50       1912 if($timestamp_str =~ /^\d+$/) {
114 0         0 return DateTime->from_epoch(epoch => $timestamp_str, time_zone => '+0000');
115             }
116 500     500   2793 my $datetime = try { DateTime::Format::ISO8601->parse_datetime($timestamp_str) };
  500         13620  
117 500 100       339427 return $datetime if defined $datetime;
118 45         229 return BusyBird::DateTime::Format->parse_datetime($timestamp_str);
119             }
120              
121             sub _make_status_from_item {
122 545     545   1207 my ($self, $feed_title, $feed_item) = @_;
123 545         1617 my $created_at_dt = $self->_make_timestamp_datetime($feed_item->pubDate);
124 545 100       45292 my $status = {
125             text => $feed_item->title,
126             busybird => { status_permalink => $feed_item->link },
127             created_at => ($created_at_dt ? BusyBird::DateTime::Format->format_datetime($created_at_dt) : undef ),
128             user => { screen_name => $feed_title },
129             };
130 545         192763 my $guid = $feed_item->guid;
131 545         10648 my $item_id;
132 545 100       1260 if(defined $guid) {
133 416         782 $item_id = $guid;
134 416         1242 $status->{busybird}{original}{id} = $guid;
135             }else {
136 129         318 $item_id = $feed_item->link;
137             }
138 545 100 66     3887 if(defined($created_at_dt) && defined($item_id)) {
    50          
139 500         1486 $status->{id} = $created_at_dt->epoch . '|' . $item_id;
140             }elsif(defined($item_id)) {
141 45         82 $status->{id} = $item_id;
142             }
143 545         5669 my @image_urls = $self->_extract_image_urls($feed_item);
144 545 100       1364 if(@image_urls) {
145 210         409 $status->{extended_entities}{media} = [map { +{ media_url => $_, indices => [0,0] } } @image_urls];
  558         2032  
146             }
147 545         3211 return $status;
148             }
149              
150             sub _make_statuses_from_feed {
151 30     30   1198224 my ($self, $feed) = @_;
152 30         183 my $feed_title = $feed->title;
153 30         1155 my $statuses = [ map { $self->_make_status_from_item($feed_title, $_) } $feed->get_item ];
  545         1790  
154 30 50       5247 return $statuses if !$self->{use_favicon};
155 0         0 my $favicon_url = $self->_get_favicon_url($feed, $statuses);
156 0 0       0 return $statuses if not defined $favicon_url;
157 0         0 $_->{user}{profile_image_url} = $favicon_url foreach @$statuses;
158 0         0 return $statuses;
159             }
160              
161             sub _parse_with_feedpp {
162 45     45   120 my ($self, $feed_source, $feed_type) = @_;
163             return $self->_make_statuses_from_feed(XML::FeedPP->new(
164             $feed_source, -type => $feed_type,
165             utf8_flag => 1, xml_deref => 1, lwp_useragent => $self->{user_agent},
166              
167             ## FeedPP and TreePP mess up with User-Agent. It's pretty annoying.
168 45         361 user_agent => scalar($self->{user_agent}->agent),
169             ));
170             }
171              
172             sub parse_string {
173 24     24 1 8261 my ($self, $string) = @_;
174 24         104 return $self->_parse_with_feedpp($string, "string");
175             }
176              
177             *parse = *parse_string;
178              
179             sub parse_file {
180 15     15 1 9429 my ($self, $filename) = @_;
181 15         83 return $self->_parse_with_feedpp($filename, "file");
182             }
183              
184             sub parse_url {
185 6     6 1 12568 my ($self, $url) = @_;
186 6         17 return $self->_parse_with_feedpp($url, "url");
187             }
188              
189             *parse_uri = *parse_url;
190              
191             1;
192             __END__
193              
194             =pod
195              
196             =head1 NAME
197              
198             BusyBird::Input::Feed - input BusyBird statuses from RSS/Atom feed
199              
200             =head1 SYNOPSIS
201              
202             use BusyBird;
203             use BusyBird::Input::Feed;
204            
205             my $input = BusyBird::Input::Feed->new;
206            
207             my $statuses = $input->parse($feed_xml);
208             timeline("feed")->add($statuses);
209            
210             $statuses = $input->parse_file("feed.atom");
211             timeline("feed")->add($statuses);
212            
213             $statuses = $input->parse_url('https://metacpan.org/feed/recent?f=');
214             timeline("feed")->add($statuses);
215              
216             =head1 DESCRIPTION
217              
218             L<BusyBird::Input::Feed> converts RSS and Atom feeds into L<BusyBird> status objects.
219              
220             For convenience, an executable script L<busybird_input_feed> is bundled in this distribution.
221              
222             =head1 CLASS METHODS
223              
224             =head2 $input = BusyBird::Input::Feed->new(%args)
225              
226             The constructor.
227              
228             Fields in C<%args> are:
229              
230             =over
231              
232             =item C<use_favicon> => BOOL (optional, default: true)
233              
234             If true (or omitted or C<undef>), it tries to use the favicon of the Web site providing the feed
235             as the statuses' icons.
236              
237             If it's defined and false, it won't use favicon.
238              
239             =item C<user_agent> => L<LWP::UserAgent> object (optional)
240              
241             L<LWP::UserAgent> object for fetching documents.
242              
243             =item C<image_max_num> => INT (optional, default: 3)
244              
245             The maximum number of image URLs extracted from the feed item.
246              
247             If set to 0, it extracts no images. If set to a negative value, it extracts all image URLs from the feed item.
248              
249             The extracted image URLs are stored as Twitter Entities in the status's C<extended_entities> field,
250             so that L<BusyBird> will render them.
251             See L<BusyBird::Manual::Status/extended_entities.media> for detail.
252              
253             =back
254              
255             =head1 OBJECT METHODS
256              
257             =head2 $statuses = $input->parse($feed_xml_string)
258              
259             =head2 $statuses = $input->parse_string($feed_xml_string)
260              
261             Convert the given C<$feed_xml_string> into L<BusyBird> C<$statuses>.
262             C<parse()> method is an alias for C<parse_string()>.
263              
264             C<$feed_xml_string> is the XML data to be parsed.
265             It must be a string encoded in UTF-8.
266              
267             Return value C<$statuses> is an array-ref of L<BusyBird> status objects.
268              
269             If C<$feed_xml_string> is invalid, it croaks.
270              
271             =head2 $statuses = $input->parse_file($feed_xml_filename)
272              
273             Same as C<parse_string()> except C<parse_file()> reads the file named C<$feed_xml_filename> and converts its content.
274              
275             =head2 $statuses = $input->parse_url($feed_xml_url)
276              
277             =head2 $statuses = $input->parse_uri($feed_xml_url)
278              
279             Same as C<parse_string()> except C<parse_url()> downloads the feed XML from C<$feed_xml_url> and converts its content.
280              
281             C<parse_uri()> method is an alias for C<parse_url()>.
282              
283             =head1 EXAMPLE
284              
285             The example below uses L<Parallel::ForkManager> to parallelize C<parse_url()> method of L<BusyBird::Input::Feed>.
286             It greatly reduces the total time to download a lot of RSS/Atom feeds.
287              
288             use strict;
289             use warnings;
290             use Parallel::ForkManager;
291             use BusyBird::Input::Feed;
292             use open qw(:std :encoding(utf8));
293            
294             my @feeds = (
295             'https://metacpan.org/feed/recent?f=',
296             'http://www.perl.com/pub/atom.xml',
297             'https://github.com/perl-users-jp/perl-users.jp-htdocs/commits/master.atom',
298             );
299             my $MAX_PROCESSES = 10;
300             my $pm = Parallel::ForkManager->new($MAX_PROCESSES);
301             my $input = BusyBird::Input::Feed->new;
302            
303             my @statuses = ();
304            
305             $pm->run_on_finish(sub {
306             my ($pid, $exitcode, $id, $signal, $coredump, $statuses) = @_;
307             push @statuses, @$statuses;
308             });
309            
310             foreach my $feed (@feeds) {
311             $pm->start and next;
312             warn "Start loading $feed\n";
313             my $statuses = $input->parse_url($feed);
314             warn "End loading $feed\n";
315             $pm->finish(0, $statuses);
316             }
317             $pm->wait_all_children;
318            
319             foreach my $status (@statuses) {
320             print "$status->{user}{screen_name}: $status->{text}\n";
321             }
322              
323              
324             =head1 SEE ALSO
325              
326             =over
327              
328             =item *
329              
330             L<BusyBird>
331              
332             =item *
333              
334             L<BusyBird::Manual::Status>
335              
336             =back
337              
338             =head1 REPOSITORY
339              
340             L<https://github.com/debug-ito/BusyBird-Input-Feed>
341              
342             =head1 BUGS AND FEATURE REQUESTS
343              
344             Please report bugs and feature requests to my Github issues
345             L<https://github.com/debug-ito/BusyBird-Input-Feed/issues>.
346              
347             Although I prefer Github, non-Github users can use CPAN RT
348             L<https://rt.cpan.org/Public/Dist/Display.html?Name=BusyBird-Input-Feed>.
349             Please send email to C<bug-BusyBird-Input-Feed at rt.cpan.org> to report bugs
350             if you do not have CPAN RT account.
351              
352              
353             =head1 AUTHOR
354            
355             Toshio Ito, C<< <toshioito at cpan.org> >>
356              
357              
358             =head1 LICENSE AND COPYRIGHT
359              
360             Copyright 2014 Toshio Ito.
361              
362             This program is free software; you can redistribute it and/or modify it
363             under the terms of either: the GNU General Public License as published
364             by the Free Software Foundation; or the Artistic License.
365              
366             See L<http://dev.perl.org/licenses/> for more information.
367              
368              
369             =cut
370