File Coverage

blib/lib/Mojolicious/Plugin/FeedReader.pm

Criterion	Covered	Total	%
statement	184	187	98.4
branch	77	92	83.7
condition	60	77	77.9
subroutine	25	25	100.0
pod	3	8	37.5
total	349	389	89.7

line	stmt	bran	cond	sub	pod	time	code
1							package Mojolicious::Plugin::FeedReader;
2	4			4		2128	use Mojo::Base 'Mojolicious::Plugin';
	4					7
	4					25
3
4							our $VERSION = '0.09';
5	4			4		727	use Mojo::Util qw(decode trim);
	4					8
	4					191
6	4			4		20	use Mojo::File;
	4					6
	4					118
7	4			4		19	use Mojo::DOM;
	4					10
	4					83
8	4			4		16	use Mojo::IOLoop;
	4					6
	4					32
9	4			4		628	use HTTP::Date;
	4					2430
	4					214
10	4			4		25	use Carp qw(carp croak);
	4					7
	4					209
11	4			4		21	use Scalar::Util qw(blessed);
	4					7
	4					9245
12
13							our @time_fields
14							= (qw(pubDate published created issued updated modified dc\:date));
15							our %is_time_field = map { $_ => 1 } @time_fields;
16
17							# feed mime-types:
18							our @feed_types = (
19							'application/x.atom+xml', 'application/atom+xml',
20							'application/xml', 'text/xml',
21							'application/rss+xml', 'application/rdf+xml'
22							);
23							our %is_feed = map { $_ => 1 } @feed_types;
24
25							sub register {
26	4			4	1	154	my ($self, $app) = @_;
27	4					10	foreach my $method (
28							qw( find_feeds parse_rss parse_opml ))
29							{
30	12					175	$app->helper($method => \&{$method});
	12					57
31							}
32	4					61	$app->helper(parse_feed => \&parse_rss);
33							}
34
35							sub make_dom {
36	25			25	0	64	my ($xml) = @_;
37	25					51	my $rss;
38	25	100	66			174	if (!ref $xml) { # assume file
		100	33
		100
		50
39	9					85	$rss = Mojo::File->new($xml)->slurp;
40	9	50				1346	die "Unable to read file $xml: $!" unless ($rss);
41							}
42							elsif (ref $xml eq 'SCALAR') { # assume string
43	11					28	$rss = $$xml;
44							}
45							elsif (blessed $xml && $xml->isa('Mojo::DOM')) { # Mojo::DOM (old style)
46	1					4	return $xml;
47							}
48							elsif (blessed $xml && $xml->can('slurp')) { # Mojo::Asset/File or similar
49	4					18	$rss = $xml->slurp;
50							}
51							else {
52	0					0	die "don't know how to make a Mojo::DOM from object $xml";
53							}
54							#my $rss_str = decode 'UTF-8', $rss;
55	24					331	my $rss_str = $rss;
56	24	50				72	die "Failed to read asset $xml (as UTF-8): $!" unless ($rss_str);
57	24					213	return Mojo::DOM->new->parse($rss_str);
58							}
59
60							sub parse_rss {
61	25			25	0	85868	my ($c, $xml, $cb) = @_;
62	25	100	100			212	if (blessed $xml && $xml->isa('Mojo::URL')) {
63							# this is the only case where we might go non-blocking:
64	4	100	66			29	if ($cb && ref $cb eq 'CODE') {
65							return
66							$c->ua->get(
67							$xml,
68							sub {
69	1			1		8685	my ($ua, $tx) = @_;
70	1					2	my $feed;
71	1	50				8	if ($tx->success) {
72	1					22	my $body = $tx->res->body;
73	1					25	my $dom = make_dom(\$body);
74	1					3752	eval { $feed = parse_rss_dom($dom); };
	1					8
75							}
76	1					8	$c->$cb($feed);
77							}
78	1					9	);
79							}
80							else {
81	3					29	my $tx = $c->ua->get($xml);
82	3	50				19709	if ($tx->success) {
83	3					71	my $body = $tx->res->body;
84	3					60	$xml = \$body;
85							}
86							else {
87	0	0				0	croak "Error getting feed from url $xml: ", (($tx->error) ? $tx->error->{message} : '');
88							}
89							}
90							}
91	24					98	my $dom = make_dom($xml);
92	24	50				465792	return ($dom) ? parse_rss_dom($dom) : 1;
93							}
94
95							sub parse_rss_dom {
96	25			25	0	228	my ($dom) = @_;
97	25	50				196	die "Argument $dom is not a Mojo::DOM" unless ($dom->isa('Mojo::DOM'));
98	25					86	my $feed = parse_rss_channel($dom); # Feed properties
99	25					144	my $items = $dom->find('item');
100	25					106905	my $entries = $dom->find('entry'); # Atom
101	25					108604	my $res = [];
102	25					154	foreach my $item ($items->each, $entries->each) {
103	130					588	push @$res, parse_rss_item($item);
104							}
105	25	100				139	if (@$res) {
106	20					86	$feed->{'items'} = $res;
107							}
108	25					4489	return $feed;
109							}
110
111							sub parse_rss_channel {
112	25			25	0	65	my ($dom) = @_;
113	25					65	my %info;
114	25					96	foreach my $k (
115							qw{title subtitle description tagline link:not([rel]) link[rel=alternate] dc\:creator author webMaster},
116							@time_fields
117							)
118							{
119	400		100			3908	my $p = $dom->at("channel > $k") \|\| $dom->at("feed > $k"); # direct child
120	400	100				3123288	if ($p) {
121	94		100			686	$info{$k} = $p->text \|\| $p->content \|\| $p->attr('href');
122	94	100	100			5519	if ($k eq 'author' && $p->at('name')) {
123	14		33			2223	$info{$k} = $p->at('name')->text \|\| $p->at('name')->content;
124							}
125	94	100				2773	if ($is_time_field{$k}) {
126	17					134	$info{$k} = str2time($info{$k});
127							}
128							}
129							}
130							my ($htmlUrl)
131	50					152	= grep { defined $_ }
132	25					266	map { delete $info{$_} } ('link:not([rel])', 'link[rel=alternate]');
	50					205
133							my ($description)
134	75					151	= grep { defined $_ }
135	25	100				74	map { exists $info{$_} ? $info{$_} : undef }
	75					219
136							(qw(description tagline subtitle));
137	25	100				107	$info{htmlUrl} = $htmlUrl if ($htmlUrl);
138	25	100				94	$info{description} = $description if ($description);
139
140							# normalize fields:
141	25					154	my @replace = (
142							'pubDate' => 'published',
143							'dc\:date' => 'published',
144							'created' => 'published',
145							'issued' => 'published',
146							'updated' => 'published',
147							'modified' => 'published',
148							'dc\:creator' => 'author',
149							'webMaster' => 'author'
150							);
151	25					139	while (my ($old, $new) = splice(@replace, 0, 2)) {
152	200	100	100			616	if ($info{$old} && !$info{$new}) {
153	19					83	$info{$new} = delete $info{$old};
154							}
155							}
156	25	100				197	return (keys %info) ? \%info : undef;
157							}
158
159							sub parse_rss_item {
160	130			130	0	324	my ($item) = @_;
161	130					195	my %h;
162	130					353	foreach my $k (
163							qw(title id summary guid content description content\:encoded xhtml\:body dc\:creator author),
164							@time_fields
165							)
166							{
167	2210					29892	my $p = $item->at($k);
168	2210	100				969918	if ($p) {
169
170							# skip namespaced items - like itunes:summary - unless explicitly
171							# searched:
172							next
173	779	100	100			4228	if ($p->tag =~ /\:/
			100
			100
			100
174							&& $k ne 'content\:encoded'
175							&& $k ne 'xhtml\:body'
176							&& $k ne 'dc\:date'
177							&& $k ne 'dc\:creator');
178	774		66			10840	$h{$k} = $p->text \|\| $p->content;
179	774	100	100			101330	if ($k eq 'author' && $p->at('name')) {
180	22					3300	$h{$k} = $p->at('name')->text;
181							}
182	774	100				5848	if ($is_time_field{$k}) {
183	269					996	$h{$k} = str2time($h{$k});
184							}
185							}
186							}
187
188							# let's handle links seperately, because ATOM loves these buggers:
189							$item->find('link')->each(
190							sub {
191	137			137		80454	my $l = shift;
192	137	100				395	if ($l->attr('href')) {
193	130	100	100			2389	if (!$l->attr('rel') \|\| $l->attr('rel') eq 'alternate') {
194	122					2292	$h{'link'} = $l->attr('href');
195							}
196							}
197							else {
198	7	50				172	if ($l->text =~ /\w+/) {
199	7					218	$h{'link'} = $l->text; # simple link
200							}
201
202							# else { # we have an empty link element with no 'href'. :-(
203							# $h{'link'} = $1 if ($l->next->text =~ m/^(http\S+)/);
204							# }
205							}
206							}
207	130					803	);
208
209							# find tags:
210	130					3223	my @tags;
211							$item->find('category, dc\:subject')
212	130		66	93		366	->each(sub { push @tags, $_[0]->text \|\| $_[0]->attr('term') });
	93					55933
213	130	100				63270	if (@tags) {
214	76					302	$h{'tags'} = \@tags;
215							}
216							#
217							# normalize fields:
218	130					667	my @replace = (
219							'content\:encoded' => 'content',
220							'xhtml\:body' => 'content',
221							'summary' => 'description',
222							'pubDate' => 'published',
223							'dc\:date' => 'published',
224							'created' => 'published',
225							'issued' => 'published',
226							'updated' => 'published',
227							'modified' => 'published',
228							'dc\:creator' => 'author'
229
230							# 'guid' => 'link'
231							);
232	130					546	while (my ($old, $new) = splice(@replace, 0, 2)) {
233	1300	100	100			4183	if ($h{$old} && !$h{$new}) {
234	126					474	$h{$new} = delete $h{$old};
235							}
236							}
237	130					490	my %copy = ('description' => 'content', link => 'id', guid => 'id');
238	130					483	while (my ($fill, $required) = each %copy) {
239	390	100	100			1387	if ($h{$fill} && !$h{$required}) {
240	8					38	$h{$required} = $h{$fill};
241							}
242							}
243	130					412	$h{"_raw"} = $item->to_string;
244	130					115697	return \%h;
245							}
246
247							# find_feeds - get RSS/Atom feed URL from argument.
248							# Code adapted to use Mojolicious from Feed::Find by Benjamin Trott
249							# Any stupid mistakes are my own
250							sub find_feeds {
251	13			13	1	69650	my $self = shift;
252	13					26	my $url = shift;
253	13	100				38	my $cb = (ref $_[-1] eq 'CODE') ? pop @_ : undef;
254							# $self->ua->max_redirects(5)->connect_timeout(30);
255							my $main = sub {
256	13			13		26	my ($tx) = @_;
257	13					20	my @feeds;
258							# if ($tx->success) { say $tx->res->code } else { say $tx->error };
259	13	100	66			44	return unless ($tx->success && $tx->res->code == 200);
260	12					306	eval { @feeds = _find_feed_links($self, $tx->req->url, $tx->res); };
	12					25
261	12	50				29	if ($@) {
262	0					0	croak "Exception in find_feeds - ", $@;
263							}
264	12					122	return (@feeds);
265	13					56	};
266	13	100				33	if ($cb) { # non-blocking:
267							$self->ua->get(
268							$url,
269							sub {
270	3			3		18562	my ($ua, $tx) = @_;
271	3					9	my (@feeds) = $main->($tx);
272	3					13	$cb->(@feeds);
273							}
274	3					17	);
275							}
276							else {
277	10					50	my $tx = $self->ua->get($url);
278	10					58759	return $main->($tx);
279							}
280							}
281
282							sub _find_feed_links {
283	12			12		94	my ($self, $url, $res) = @_;
284
285	12					20	state $feed_ext = qr/\.(?:rss\|xml\|rdf)$/;
286	12					19	my @feeds;
287
288							# use split to remove charset attribute from content_type
289	12					30	my ($content_type) = split(/[; ]+/, $res->headers->content_type);
290	12	100				215	if ($is_feed{$content_type}) {
291	2					8	push @feeds, Mojo::URL->new($url)->to_abs;
292							}
293							else {
294							# we are in a web page. PHEAR.
295	10		33			44	my $base = Mojo::URL->new(
296							$res->dom->find('head base')->map('attr', 'href')->join('') \|\| $url)->to_abs($url);
297	10		33			261914	my $title
298							= $res->dom->find('head > title')->map('text')->join('') \|\| $url;
299							$res->dom->find('head link')->each(
300							sub {
301	131			131		48494	my $attrs = $_->attr();
302	131	100				1601	return unless ($attrs->{'rel'});
303	129					342	my %rel = map { $_ => 1 } split /\s+/, lc($attrs->{'rel'});
	129					329
304	129	100				268	my $type = ($attrs->{'type'}) ? lc trim $attrs->{'type'} : '';
305	129	50	33			480	if ($is_feed{$type} && ($rel{'alternate'} \|\| $rel{'service.feed'})) {
			66
306	8					25	push @feeds, Mojo::URL->new($attrs->{'href'})->to_abs($base);
307							}
308							}
309	10					41904	);
310							$res->dom->find('a')->grep(
311							sub {
312	374	50		374		122360	$_->attr('href')
313							&& Mojo::URL->new($_->attr('href'))->path =~ /$feed_ext/io;
314							}
315							)->each(
316							sub {
317	2			2		784	push @feeds, Mojo::URL->new($_->attr('href'))->to_abs($base);
318							}
319	10					2316	);
320	10	100				5698	unless (@feeds)
321							{ # call me crazy, but maybe this is just a feed served as HTML?
322	5					24	my $body = $res->body;
323	5	100				137	if ($self->parse_feed(\$body)) {
324	1					5	push @feeds, Mojo::URL->new($url)->to_abs;
325							}
326							}
327							}
328	12					1029	return @feeds;
329							}
330
331							sub parse_opml {
332	3			3	1	17933	my ($self, $opml_file) = @_;
333	3	50				28	my $opml_str = decode 'UTF-8',
334							(ref $opml_file) ? $opml_file->slurp : Mojo::File->new($opml_file)->slurp;
335	3					1312	my $d = Mojo::DOM->new->parse($opml_str);
336	3					156604	my (%subscriptions, %categories);
337	3					18	for my $item ($d->find(q{outline})->each) {
338	962					186207	my $node = $item->attr;
339	962	100				10846	if (!defined $node->{xmlUrl}) {
340	40		66			132	my $cat = $node->{title} \|\| $node->{text};
341	40					93	$categories{$cat} = $item->children('[xmlUrl]')->map('attr', 'xmlUrl');
342							}
343							else { # file by RSS URL:
344	922					2205	$subscriptions{$node->{xmlUrl}} = $node;
345							}
346							}
347
348
349							# assign categories
350	3					239	for my $cat (keys %categories) {
351	40					107	for my $rss ($categories{$cat}->each) {
352	920	50				2003	next unless ($subscriptions{$rss}); # don't auto-vivify for empty "categories"
353	920		100			2941	$subscriptions{$rss}{'categories'} \|\|= [];
354	920					1112	push @{$subscriptions{$rss}{'categories'}}, $cat;
	920					1955
355							}
356							}
357	3					934	return (values %subscriptions);
358							}
359
360
361							1;
362
363							=encoding utf-8
364
365							=head1 NAME
366
367							Mojolicious::Plugin::FeedReader - Mojolicious plugin to find and parse RSS & Atom feeds
368
369							=head1 SYNOPSIS
370
371							# Mojolicious
372							$self->plugin('FeedReader');
373
374							# Mojolicious::Lite
375							plugin 'FeedReader';
376
377							# Blocking:
378							get '/b' => sub {
379							my $self = shift;
380							my ($feed) = $self->find_feeds(q{search.cpan.org});
381							my $out = $self->parse_feed($feed);
382							$self->render(template => 'uploads', items => $out->{items});
383							};
384
385							# Non-blocking:
386							get '/nb' => sub {
387							my $self = shift;
388							$self->render_later;
389							my $delay = Mojo::IOLoop->delay(
390							sub {
391							$self->find_feeds("search.cpan.org", shift->begin(0));
392							},
393							sub {
394							my $feed = pop;
395							$self->parse_feed($feed, shift->begin);
396							},
397							sub {
398							my $data = pop;
399							$self->render(template => 'uploads', items => $data->{items});
400							});
401							$delay->wait unless Mojo::IOLoop->is_running;
402							};
403
404							app->start;
405
406							__DATA__
407
408							@@ uploads.html.ep
409
410							% for my $item (@$items) {
411							<%= link_to $item->{title} => $item->{link} %> - <%= $item->{description} %>
412							% }
413
414
415							=head1 DESCRIPTION
416
417							B implements minimalistic helpers for identifying,
418							fetching and parsing RSS and Atom Feeds. It has minimal dependencies, relying as
419							much as possible on Mojolicious components - Mojo::UserAgent for fetching feeds and
420							checking URLs, Mojo::DOM for XML/HTML parsing.
421							It is therefore rather fragile and naive, and should be considered Experimental/Toy
422							code - B.
423
424
425							=head1 METHODS
426
427							L inherits all methods from
428							L and implements the following new ones.
429
430							=head2 register
431
432							$plugin->register(Mojolicious->new);
433
434							Register plugin in L application. This method will install the helpers
435							listed below in your Mojolicious application.
436
437							=head1 HELPERS
438
439							B implements the following helpers.
440
441							=head2 find_feeds
442
443							# Call blocking
444							my (@feeds) = app->find_feeds('search.cpan.org');
445							# @feeds is a list of Mojo::URL objects
446
447							# Call non-blocking
448							$self->find_feeds('http://example.com', sub {
449							my (@feeds) = @_;
450							unless (@feeds) {
451							$self->render_exception("no feeds found, " . $info->{error});
452							}
453							else {
454							....
455							}
456							});
457
458							A Mojolicious port of L by Benjamin Trott. This helper implements feed auto-discovery for finding syndication feeds, given a URI.
459							If given a callback function as an additional argument, execution will be non-blocking.
460
461							=head2 parse_feed
462
463							# parse an RSS/Atom feed
464							# blocking
465							my $url = Mojo::URL->new('http://rss.slashdot.org/Slashdot/slashdot');
466							my $feed = $self->parse_feed($url);
467							for my $item (@{$feed->{items}}) {
468							say $_ for ($item->{title}, $item->{description}, 'Tags: ' . join q{,}, @{$item->{tags}});
469							}
470
471							# non-blocking
472							$self->parse_feed($url, sub {
473							my ($c, $feed) = @_;
474							$c->render(text => "Feed tagline: " . $feed->{tagline});
475							});
476
477							# parse a file
478							$feed2 = $self->parse_feed('/downloads/foo.rss');
479
480							# parse response
481							$self->ua->get($feed_url, sub {
482							my ($ua, $tx) = @_;
483							my $feed = $self->parse_feed($tx->res);
484							});
485
486							A minimalist liberal RSS/Atom parser, using Mojo::DOM queries.
487
488							Dates are parsed using L.
489
490							If parsing fails (for example, the parser was given an HTML page), the helper will return undef.
491
492							On success, the result returned is a hashref with the following keys:
493
494							=over 4
495
496							=item * title
497
498							=item * description (may be filled from subtitle or tagline if absent)
499
500							=item * htmlUrl - web page URL associated with the feed
501
502							=item * items - array ref of feed news items
503
504							=item * subtitle (optional)
505
506							=item * tagline (optional)
507
508							=item * author (name of author field, or dc:creator or webMaster)
509
510							=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
511
512							=back
513
514							Each item in the items array is a hashref with the following keys:
515
516							=over 4
517
518							=item * title
519
520							=item * link
521
522							=item * content (may be filled with content:encoded, xhtml:body or description fields)
523
524							=item * id (will be equal to link or guid if it is undefined and either of those fields exists)
525
526							=item * description (optional) - usually a shorter form of the content (may be filled with summary if description is missing)
527
528							=item * guid (optional)
529
530							=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
531
532							=item * author (may be filled from author or dc:creator)
533
534							=item * tags (optional) - array ref of tags, categories or dc:subjects.
535
536							=item * _raw - XML serialized text of the item's Mojo::DOM node. Note that this can be different from the original XML text in the feed.
537
538							=back
539
540							=head2 parse_opml
541
542							my @subscriptions = app->parse_opml( 'mysubs.opml' );
543							foreach my $sub (@subscriptions) {
544							say 'RSS URL is: ', $sub->{xmlUrl};
545							say 'Website URL is: ', $sub->{htmlUrl};
546							say 'categories: ', join ',', @{$sub->{categories}};
547							}
548
549							Parse an OPML subscriptions file and return the list of feeds as an array of hashrefs.
550
551							Each hashref will contain an array ref in the key 'categories' listing the folders (parent nodes) in the OPML tree the subscription item appears in.
552
553							=head1 CREDITS
554
555							Some tests adapted from L and L Feed autodiscovery adapted from L.
556
557							Test data (web pages, feeds and excerpts) included in this package is intended for testing purposes only, and is not meant in any way
558							to infringe on the rights of the respective authors.
559
560							=head1 COPYRIGHT AND LICENSE
561
562							Copyright (C) 2014, Dotan Dimet.
563
564							This program is free software, you can redistribute it and/or modify it
565							under the terms of the Artistic License version 2.0.
566
567							=head1 SEE ALSO
568
569							L, L, L
570
571							L, L, L
572
573							=cut