File Coverage

blib/lib/Mojolicious/Plugin/FeedReader.pm
Criterion Covered Total %
statement 184 187 98.4
branch 77 92 83.7
condition 60 77 77.9
subroutine 25 25 100.0
pod 3 8 37.5
total 349 389 89.7


line stmt bran cond sub pod time code
1             package Mojolicious::Plugin::FeedReader;
2 4     4   2128 use Mojo::Base 'Mojolicious::Plugin';
  4         7  
  4         25  
3              
4             our $VERSION = '0.09';
5 4     4   727 use Mojo::Util qw(decode trim);
  4         8  
  4         191  
6 4     4   20 use Mojo::File;
  4         6  
  4         118  
7 4     4   19 use Mojo::DOM;
  4         10  
  4         83  
8 4     4   16 use Mojo::IOLoop;
  4         6  
  4         32  
9 4     4   628 use HTTP::Date;
  4         2430  
  4         214  
10 4     4   25 use Carp qw(carp croak);
  4         7  
  4         209  
11 4     4   21 use Scalar::Util qw(blessed);
  4         7  
  4         9245  
12              
13             our @time_fields
14             = (qw(pubDate published created issued updated modified dc\:date));
15             our %is_time_field = map { $_ => 1 } @time_fields;
16              
17             # feed mime-types:
18             our @feed_types = (
19             'application/x.atom+xml', 'application/atom+xml',
20             'application/xml', 'text/xml',
21             'application/rss+xml', 'application/rdf+xml'
22             );
23             our %is_feed = map { $_ => 1 } @feed_types;
24              
25             sub register {
26 4     4 1 154 my ($self, $app) = @_;
27 4         10 foreach my $method (
28             qw( find_feeds parse_rss parse_opml ))
29             {
30 12         175 $app->helper($method => \&{$method});
  12         57  
31             }
32 4         61 $app->helper(parse_feed => \&parse_rss);
33             }
34              
35             sub make_dom {
36 25     25 0 64 my ($xml) = @_;
37 25         51 my $rss;
38 25 100 66     174 if (!ref $xml) { # assume file
    100 33        
    100          
    50          
39 9         85 $rss = Mojo::File->new($xml)->slurp;
40 9 50       1346 die "Unable to read file $xml: $!" unless ($rss);
41             }
42             elsif (ref $xml eq 'SCALAR') { # assume string
43 11         28 $rss = $$xml;
44             }
45             elsif (blessed $xml && $xml->isa('Mojo::DOM')) { # Mojo::DOM (old style)
46 1         4 return $xml;
47             }
48             elsif (blessed $xml && $xml->can('slurp')) { # Mojo::Asset/File or similar
49 4         18 $rss = $xml->slurp;
50             }
51             else {
52 0         0 die "don't know how to make a Mojo::DOM from object $xml";
53             }
54             #my $rss_str = decode 'UTF-8', $rss;
55 24         331 my $rss_str = $rss;
56 24 50       72 die "Failed to read asset $xml (as UTF-8): $!" unless ($rss_str);
57 24         213 return Mojo::DOM->new->parse($rss_str);
58             }
59              
60             sub parse_rss {
61 25     25 0 85868 my ($c, $xml, $cb) = @_;
62 25 100 100     212 if (blessed $xml && $xml->isa('Mojo::URL')) {
63             # this is the only case where we might go non-blocking:
64 4 100 66     29 if ($cb && ref $cb eq 'CODE') {
65             return
66             $c->ua->get(
67             $xml,
68             sub {
69 1     1   8685 my ($ua, $tx) = @_;
70 1         2 my $feed;
71 1 50       8 if ($tx->success) {
72 1         22 my $body = $tx->res->body;
73 1         25 my $dom = make_dom(\$body);
74 1         3752 eval { $feed = parse_rss_dom($dom); };
  1         8  
75             }
76 1         8 $c->$cb($feed);
77             }
78 1         9 );
79             }
80             else {
81 3         29 my $tx = $c->ua->get($xml);
82 3 50       19709 if ($tx->success) {
83 3         71 my $body = $tx->res->body;
84 3         60 $xml = \$body;
85             }
86             else {
87 0 0       0 croak "Error getting feed from url $xml: ", (($tx->error) ? $tx->error->{message} : '');
88             }
89             }
90             }
91 24         98 my $dom = make_dom($xml);
92 24 50       465792 return ($dom) ? parse_rss_dom($dom) : 1;
93             }
94              
95             sub parse_rss_dom {
96 25     25 0 228 my ($dom) = @_;
97 25 50       196 die "Argument $dom is not a Mojo::DOM" unless ($dom->isa('Mojo::DOM'));
98 25         86 my $feed = parse_rss_channel($dom); # Feed properties
99 25         144 my $items = $dom->find('item');
100 25         106905 my $entries = $dom->find('entry'); # Atom
101 25         108604 my $res = [];
102 25         154 foreach my $item ($items->each, $entries->each) {
103 130         588 push @$res, parse_rss_item($item);
104             }
105 25 100       139 if (@$res) {
106 20         86 $feed->{'items'} = $res;
107             }
108 25         4489 return $feed;
109             }
110              
111             sub parse_rss_channel {
112 25     25 0 65 my ($dom) = @_;
113 25         65 my %info;
114 25         96 foreach my $k (
115             qw{title subtitle description tagline link:not([rel]) link[rel=alternate] dc\:creator author webMaster},
116             @time_fields
117             )
118             {
119 400   100     3908 my $p = $dom->at("channel > $k") || $dom->at("feed > $k"); # direct child
120 400 100       3123288 if ($p) {
121 94   100     686 $info{$k} = $p->text || $p->content || $p->attr('href');
122 94 100 100     5519 if ($k eq 'author' && $p->at('name')) {
123 14   33     2223 $info{$k} = $p->at('name')->text || $p->at('name')->content;
124             }
125 94 100       2773 if ($is_time_field{$k}) {
126 17         134 $info{$k} = str2time($info{$k});
127             }
128             }
129             }
130             my ($htmlUrl)
131 50         152 = grep { defined $_ }
132 25         266 map { delete $info{$_} } ('link:not([rel])', 'link[rel=alternate]');
  50         205  
133             my ($description)
134 75         151 = grep { defined $_ }
135 25 100       74 map { exists $info{$_} ? $info{$_} : undef }
  75         219  
136             (qw(description tagline subtitle));
137 25 100       107 $info{htmlUrl} = $htmlUrl if ($htmlUrl);
138 25 100       94 $info{description} = $description if ($description);
139              
140             # normalize fields:
141 25         154 my @replace = (
142             'pubDate' => 'published',
143             'dc\:date' => 'published',
144             'created' => 'published',
145             'issued' => 'published',
146             'updated' => 'published',
147             'modified' => 'published',
148             'dc\:creator' => 'author',
149             'webMaster' => 'author'
150             );
151 25         139 while (my ($old, $new) = splice(@replace, 0, 2)) {
152 200 100 100     616 if ($info{$old} && !$info{$new}) {
153 19         83 $info{$new} = delete $info{$old};
154             }
155             }
156 25 100       197 return (keys %info) ? \%info : undef;
157             }
158              
159             sub parse_rss_item {
160 130     130 0 324 my ($item) = @_;
161 130         195 my %h;
162 130         353 foreach my $k (
163             qw(title id summary guid content description content\:encoded xhtml\:body dc\:creator author),
164             @time_fields
165             )
166             {
167 2210         29892 my $p = $item->at($k);
168 2210 100       969918 if ($p) {
169              
170             # skip namespaced items - like itunes:summary - unless explicitly
171             # searched:
172             next
173 779 100 100     4228 if ($p->tag =~ /\:/
      100        
      100        
      100        
174             && $k ne 'content\:encoded'
175             && $k ne 'xhtml\:body'
176             && $k ne 'dc\:date'
177             && $k ne 'dc\:creator');
178 774   66     10840 $h{$k} = $p->text || $p->content;
179 774 100 100     101330 if ($k eq 'author' && $p->at('name')) {
180 22         3300 $h{$k} = $p->at('name')->text;
181             }
182 774 100       5848 if ($is_time_field{$k}) {
183 269         996 $h{$k} = str2time($h{$k});
184             }
185             }
186             }
187              
188             # let's handle links seperately, because ATOM loves these buggers:
189             $item->find('link')->each(
190             sub {
191 137     137   80454 my $l = shift;
192 137 100       395 if ($l->attr('href')) {
193 130 100 100     2389 if (!$l->attr('rel') || $l->attr('rel') eq 'alternate') {
194 122         2292 $h{'link'} = $l->attr('href');
195             }
196             }
197             else {
198 7 50       172 if ($l->text =~ /\w+/) {
199 7         218 $h{'link'} = $l->text; # simple link
200             }
201              
202             # else { # we have an empty link element with no 'href'. :-(
203             # $h{'link'} = $1 if ($l->next->text =~ m/^(http\S+)/);
204             # }
205             }
206             }
207 130         803 );
208              
209             # find tags:
210 130         3223 my @tags;
211             $item->find('category, dc\:subject')
212 130   66 93   366 ->each(sub { push @tags, $_[0]->text || $_[0]->attr('term') });
  93         55933  
213 130 100       63270 if (@tags) {
214 76         302 $h{'tags'} = \@tags;
215             }
216             #
217             # normalize fields:
218 130         667 my @replace = (
219             'content\:encoded' => 'content',
220             'xhtml\:body' => 'content',
221             'summary' => 'description',
222             'pubDate' => 'published',
223             'dc\:date' => 'published',
224             'created' => 'published',
225             'issued' => 'published',
226             'updated' => 'published',
227             'modified' => 'published',
228             'dc\:creator' => 'author'
229              
230             # 'guid' => 'link'
231             );
232 130         546 while (my ($old, $new) = splice(@replace, 0, 2)) {
233 1300 100 100     4183 if ($h{$old} && !$h{$new}) {
234 126         474 $h{$new} = delete $h{$old};
235             }
236             }
237 130         490 my %copy = ('description' => 'content', link => 'id', guid => 'id');
238 130         483 while (my ($fill, $required) = each %copy) {
239 390 100 100     1387 if ($h{$fill} && !$h{$required}) {
240 8         38 $h{$required} = $h{$fill};
241             }
242             }
243 130         412 $h{"_raw"} = $item->to_string;
244 130         115697 return \%h;
245             }
246              
247             # find_feeds - get RSS/Atom feed URL from argument.
248             # Code adapted to use Mojolicious from Feed::Find by Benjamin Trott
249             # Any stupid mistakes are my own
250             sub find_feeds {
251 13     13 1 69650 my $self = shift;
252 13         26 my $url = shift;
253 13 100       38 my $cb = (ref $_[-1] eq 'CODE') ? pop @_ : undef;
254             # $self->ua->max_redirects(5)->connect_timeout(30);
255             my $main = sub {
256 13     13   26 my ($tx) = @_;
257 13         20 my @feeds;
258             # if ($tx->success) { say $tx->res->code } else { say $tx->error };
259 13 100 66     44 return unless ($tx->success && $tx->res->code == 200);
260 12         306 eval { @feeds = _find_feed_links($self, $tx->req->url, $tx->res); };
  12         25  
261 12 50       29 if ($@) {
262 0         0 croak "Exception in find_feeds - ", $@;
263             }
264 12         122 return (@feeds);
265 13         56 };
266 13 100       33 if ($cb) { # non-blocking:
267             $self->ua->get(
268             $url,
269             sub {
270 3     3   18562 my ($ua, $tx) = @_;
271 3         9 my (@feeds) = $main->($tx);
272 3         13 $cb->(@feeds);
273             }
274 3         17 );
275             }
276             else {
277 10         50 my $tx = $self->ua->get($url);
278 10         58759 return $main->($tx);
279             }
280             }
281              
282             sub _find_feed_links {
283 12     12   94 my ($self, $url, $res) = @_;
284              
285 12         20 state $feed_ext = qr/\.(?:rss|xml|rdf)$/;
286 12         19 my @feeds;
287              
288             # use split to remove charset attribute from content_type
289 12         30 my ($content_type) = split(/[; ]+/, $res->headers->content_type);
290 12 100       215 if ($is_feed{$content_type}) {
291 2         8 push @feeds, Mojo::URL->new($url)->to_abs;
292             }
293             else {
294             # we are in a web page. PHEAR.
295 10   33     44 my $base = Mojo::URL->new(
296             $res->dom->find('head base')->map('attr', 'href')->join('') || $url)->to_abs($url);
297 10   33     261914 my $title
298             = $res->dom->find('head > title')->map('text')->join('') || $url;
299             $res->dom->find('head link')->each(
300             sub {
301 131     131   48494 my $attrs = $_->attr();
302 131 100       1601 return unless ($attrs->{'rel'});
303 129         342 my %rel = map { $_ => 1 } split /\s+/, lc($attrs->{'rel'});
  129         329  
304 129 100       268 my $type = ($attrs->{'type'}) ? lc trim $attrs->{'type'} : '';
305 129 50 33     480 if ($is_feed{$type} && ($rel{'alternate'} || $rel{'service.feed'})) {
      66        
306 8         25 push @feeds, Mojo::URL->new($attrs->{'href'})->to_abs($base);
307             }
308             }
309 10         41904 );
310             $res->dom->find('a')->grep(
311             sub {
312 374 50   374   122360 $_->attr('href')
313             && Mojo::URL->new($_->attr('href'))->path =~ /$feed_ext/io;
314             }
315             )->each(
316             sub {
317 2     2   784 push @feeds, Mojo::URL->new($_->attr('href'))->to_abs($base);
318             }
319 10         2316 );
320 10 100       5698 unless (@feeds)
321             { # call me crazy, but maybe this is just a feed served as HTML?
322 5         24 my $body = $res->body;
323 5 100       137 if ($self->parse_feed(\$body)) {
324 1         5 push @feeds, Mojo::URL->new($url)->to_abs;
325             }
326             }
327             }
328 12         1029 return @feeds;
329             }
330              
331             sub parse_opml {
332 3     3 1 17933 my ($self, $opml_file) = @_;
333 3 50       28 my $opml_str = decode 'UTF-8',
334             (ref $opml_file) ? $opml_file->slurp : Mojo::File->new($opml_file)->slurp;
335 3         1312 my $d = Mojo::DOM->new->parse($opml_str);
336 3         156604 my (%subscriptions, %categories);
337 3         18 for my $item ($d->find(q{outline})->each) {
338 962         186207 my $node = $item->attr;
339 962 100       10846 if (!defined $node->{xmlUrl}) {
340 40   66     132 my $cat = $node->{title} || $node->{text};
341 40         93 $categories{$cat} = $item->children('[xmlUrl]')->map('attr', 'xmlUrl');
342             }
343             else { # file by RSS URL:
344 922         2205 $subscriptions{$node->{xmlUrl}} = $node;
345             }
346             }
347              
348              
349             # assign categories
350 3         239 for my $cat (keys %categories) {
351 40         107 for my $rss ($categories{$cat}->each) {
352 920 50       2003 next unless ($subscriptions{$rss}); # don't auto-vivify for empty "categories"
353 920   100     2941 $subscriptions{$rss}{'categories'} ||= [];
354 920         1112 push @{$subscriptions{$rss}{'categories'}}, $cat;
  920         1955  
355             }
356             }
357 3         934 return (values %subscriptions);
358             }
359              
360              
361             1;
362              
363             =encoding utf-8
364              
365             =head1 NAME
366              
367             Mojolicious::Plugin::FeedReader - Mojolicious plugin to find and parse RSS & Atom feeds
368              
369             =head1 SYNOPSIS
370              
371             # Mojolicious
372             $self->plugin('FeedReader');
373              
374             # Mojolicious::Lite
375             plugin 'FeedReader';
376              
377             # Blocking:
378             get '/b' => sub {
379             my $self = shift;
380             my ($feed) = $self->find_feeds(q{search.cpan.org});
381             my $out = $self->parse_feed($feed);
382             $self->render(template => 'uploads', items => $out->{items});
383             };
384              
385             # Non-blocking:
386             get '/nb' => sub {
387             my $self = shift;
388             $self->render_later;
389             my $delay = Mojo::IOLoop->delay(
390             sub {
391             $self->find_feeds("search.cpan.org", shift->begin(0));
392             },
393             sub {
394             my $feed = pop;
395             $self->parse_feed($feed, shift->begin);
396             },
397             sub {
398             my $data = pop;
399             $self->render(template => 'uploads', items => $data->{items});
400             });
401             $delay->wait unless Mojo::IOLoop->is_running;
402             };
403              
404             app->start;
405              
406             __DATA__
407              
408             @@ uploads.html.ep
409            
410             % for my $item (@$items) {
411            
  • <%= link_to $item->{title} => $item->{link} %> - <%= $item->{description} %>
  • 412             % }
    413            
    414              
    415             =head1 DESCRIPTION
    416              
    417             B implements minimalistic helpers for identifying,
    418             fetching and parsing RSS and Atom Feeds. It has minimal dependencies, relying as
    419             much as possible on Mojolicious components - Mojo::UserAgent for fetching feeds and
    420             checking URLs, Mojo::DOM for XML/HTML parsing.
    421             It is therefore rather fragile and naive, and should be considered Experimental/Toy
    422             code - B.
    423              
    424              
    425             =head1 METHODS
    426              
    427             L inherits all methods from
    428             L and implements the following new ones.
    429              
    430             =head2 register
    431              
    432             $plugin->register(Mojolicious->new);
    433              
    434             Register plugin in L application. This method will install the helpers
    435             listed below in your Mojolicious application.
    436              
    437             =head1 HELPERS
    438              
    439             B implements the following helpers.
    440              
    441             =head2 find_feeds
    442              
    443             # Call blocking
    444             my (@feeds) = app->find_feeds('search.cpan.org');
    445             # @feeds is a list of Mojo::URL objects
    446              
    447             # Call non-blocking
    448             $self->find_feeds('http://example.com', sub {
    449             my (@feeds) = @_;
    450             unless (@feeds) {
    451             $self->render_exception("no feeds found, " . $info->{error});
    452             }
    453             else {
    454             ....
    455             }
    456             });
    457              
    458             A Mojolicious port of L by Benjamin Trott. This helper implements feed auto-discovery for finding syndication feeds, given a URI.
    459             If given a callback function as an additional argument, execution will be non-blocking.
    460              
    461             =head2 parse_feed
    462              
    463             # parse an RSS/Atom feed
    464             # blocking
    465             my $url = Mojo::URL->new('http://rss.slashdot.org/Slashdot/slashdot');
    466             my $feed = $self->parse_feed($url);
    467             for my $item (@{$feed->{items}}) {
    468             say $_ for ($item->{title}, $item->{description}, 'Tags: ' . join q{,}, @{$item->{tags}});
    469             }
    470              
    471             # non-blocking
    472             $self->parse_feed($url, sub {
    473             my ($c, $feed) = @_;
    474             $c->render(text => "Feed tagline: " . $feed->{tagline});
    475             });
    476              
    477             # parse a file
    478             $feed2 = $self->parse_feed('/downloads/foo.rss');
    479              
    480             # parse response
    481             $self->ua->get($feed_url, sub {
    482             my ($ua, $tx) = @_;
    483             my $feed = $self->parse_feed($tx->res);
    484             });
    485              
    486             A minimalist liberal RSS/Atom parser, using Mojo::DOM queries.
    487              
    488             Dates are parsed using L.
    489              
    490             If parsing fails (for example, the parser was given an HTML page), the helper will return undef.
    491              
    492             On success, the result returned is a hashref with the following keys:
    493              
    494             =over 4
    495              
    496             =item * title
    497              
    498             =item * description (may be filled from subtitle or tagline if absent)
    499              
    500             =item * htmlUrl - web page URL associated with the feed
    501              
    502             =item * items - array ref of feed news items
    503              
    504             =item * subtitle (optional)
    505              
    506             =item * tagline (optional)
    507              
    508             =item * author (name of author field, or dc:creator or webMaster)
    509              
    510             =item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
    511              
    512             =back
    513              
    514             Each item in the items array is a hashref with the following keys:
    515              
    516             =over 4
    517              
    518             =item * title
    519              
    520             =item * link
    521              
    522             =item * content (may be filled with content:encoded, xhtml:body or description fields)
    523              
    524             =item * id (will be equal to link or guid if it is undefined and either of those fields exists)
    525              
    526             =item * description (optional) - usually a shorter form of the content (may be filled with summary if description is missing)
    527              
    528             =item * guid (optional)
    529              
    530             =item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
    531              
    532             =item * author (may be filled from author or dc:creator)
    533              
    534             =item * tags (optional) - array ref of tags, categories or dc:subjects.
    535              
    536             =item * _raw - XML serialized text of the item's Mojo::DOM node. Note that this can be different from the original XML text in the feed.
    537              
    538             =back
    539              
    540             =head2 parse_opml
    541              
    542             my @subscriptions = app->parse_opml( 'mysubs.opml' );
    543             foreach my $sub (@subscriptions) {
    544             say 'RSS URL is: ', $sub->{xmlUrl};
    545             say 'Website URL is: ', $sub->{htmlUrl};
    546             say 'categories: ', join ',', @{$sub->{categories}};
    547             }
    548              
    549             Parse an OPML subscriptions file and return the list of feeds as an array of hashrefs.
    550              
    551             Each hashref will contain an array ref in the key 'categories' listing the folders (parent nodes) in the OPML tree the subscription item appears in.
    552              
    553             =head1 CREDITS
    554              
    555             Some tests adapted from L and L Feed autodiscovery adapted from L.
    556              
    557             Test data (web pages, feeds and excerpts) included in this package is intended for testing purposes only, and is not meant in any way
    558             to infringe on the rights of the respective authors.
    559              
    560             =head1 COPYRIGHT AND LICENSE
    561              
    562             Copyright (C) 2014, Dotan Dimet.
    563              
    564             This program is free software, you can redistribute it and/or modify it
    565             under the terms of the Artistic License version 2.0.
    566              
    567             =head1 SEE ALSO
    568              
    569             L, L, L
    570              
    571             L, L, L
    572              
    573             =cut