File Coverage

blib/lib/Mojolicious/Plugin/FeedReader.pm

Criterion	Covered	Total	%
statement	188	191	98.4
branch	78	94	82.9
condition	60	77	77.9
subroutine	25	25	100.0
pod	3	8	37.5
total	354	395	89.6

line	stmt	bran	cond	sub	pod	time	code
1							package Mojolicious::Plugin::FeedReader;
2	4			4		2272	use Mojo::Base 'Mojolicious::Plugin';
	4					9
	4					27
3
4							our $VERSION = '0.10';
5	4			4		883	use Mojo::Util qw(decode trim);
	4					19
	4					199
6	4			4		21	use Mojo::File;
	4					8
	4					124
7	4			4		20	use Mojo::DOM;
	4					8
	4					87
8	4			4		18	use Mojo::IOLoop;
	4					8
	4					34
9	4			4		673	use HTTP::Date;
	4					2680
	4					215
10	4			4		23	use Carp qw(carp croak);
	4					7
	4					186
11	4			4		21	use Scalar::Util qw(blessed);
	4					8
	4					10184
12
13							our @time_fields
14							= (qw(pubDate published created issued updated modified dc\:date));
15							our %is_time_field = map { $_ => 1 } @time_fields;
16
17							# feed mime-types:
18							our @feed_types = (
19							'application/x.atom+xml', 'application/atom+xml',
20							'application/xml', 'text/xml',
21							'application/rss+xml', 'application/rdf+xml'
22							);
23							our %is_feed = map { $_ => 1 } @feed_types;
24
25							our $default_charset = 'UTF-8';
26
27							sub register {
28	4			4	1	176	my ($self, $app) = @_;
29	4					9	foreach my $method (
30							qw( find_feeds parse_rss parse_opml ))
31							{
32	12					180	$app->helper($method => \&{$method});
	12					57
33							}
34	4					81	$app->helper(parse_feed => \&parse_rss);
35							}
36
37							sub make_dom {
38	25			25	0	73	my ($xml, $charset) = @_;
39	25					54	my $rss;
40	25	100	66			186	if (!ref $xml) { # assume file
		100	33
		100
		50
41	9					94	$rss = Mojo::File->new($xml)->slurp;
42	9	50				1438	die "Unable to read file $xml: $!" unless ($rss);
43							}
44							elsif (ref $xml eq 'SCALAR') { # assume string
45	11					45	$rss = $$xml;
46							}
47							elsif (blessed $xml && $xml->isa('Mojo::DOM')) { # Mojo::DOM (old style)
48	1					4	return $xml;
49							}
50							elsif (blessed $xml && $xml->can('slurp')) { # Mojo::Asset/File or similar
51	4					18	$rss = $xml->slurp;
52							}
53							else {
54	0					0	die "don't know how to make a Mojo::DOM from object $xml";
55							}
56	24	50				392	$charset = ($charset) ? $charset : $default_charset ;
57	24					153	my $rss_str = decode($charset, $rss);
58	24	50				1904	die "Failed to read asset $xml (as $charset): $!" unless ($rss_str);
59	24					183	return Mojo::DOM->new->parse($rss_str);
60							}
61
62							sub parse_rss {
63	25			25	0	93253	my ($c, $xml, $cb) = @_;
64	25					75	my $charset = undef;
65	25	100	100			219	if (blessed $xml && $xml->isa('Mojo::URL')) {
66							# this is the only case where we might go non-blocking:
67	4	100	66			25	if ($cb && ref $cb eq 'CODE') {
68							return
69							$c->ua->get(
70							$xml,
71							sub {
72	1			1		8147	my ($ua, $tx) = @_;
73	1					2	my $feed;
74	1	50				4	if ($tx->success) {
75	1					21	my $body = $tx->res->body;
76	1					46	$charset = $tx->res->content->charset;
77	1					24	my $dom = make_dom(\$body, $charset);
78	1					4492	eval { $feed = parse_rss_dom($dom); };
	1					6
79							}
80	1					8	$c->$cb($feed);
81							}
82	1					9	);
83							}
84							else {
85	3					32	my $tx = $c->ua->get($xml);
86	3	50				23881	if ($tx->success) {
87	3					91	my $body = $tx->res->body;
88	3					88	$charset = $tx->res->content->charset;
89	3					93	$xml = \$body;
90							}
91							else {
92	0	0				0	croak "Error getting feed from url $xml: ", (($tx->error) ? $tx->error->{message} : '');
93							}
94							}
95							}
96	24					93	my $dom = make_dom($xml, $charset);
97	24	50				529034	return ($dom) ? parse_rss_dom($dom) : 1;
98							}
99
100							sub parse_rss_dom {
101	25			25	0	255	my ($dom) = @_;
102	25	50				226	die "Argument $dom is not a Mojo::DOM" unless ($dom->isa('Mojo::DOM'));
103	25					100	my $feed = parse_rss_channel($dom); # Feed properties
104	25					161	my $items = $dom->find('item');
105	25					119319	my $entries = $dom->find('entry'); # Atom
106	25					119782	my $res = [];
107	25					175	foreach my $item ($items->each, $entries->each) {
108	130					628	push @$res, parse_rss_item($item);
109							}
110	25	100				167	if (@$res) {
111	20					124	$feed->{'items'} = $res;
112							}
113	25					6008	return $feed;
114							}
115
116							sub parse_rss_channel {
117	25			25	0	82	my ($dom) = @_;
118	25					60	my %info;
119	25					118	foreach my $k (
120							qw{title subtitle description tagline link:not([rel]) link[rel=alternate] dc\:creator author webMaster},
121							@time_fields
122							)
123							{
124	400		100			4858	my $p = $dom->at("channel > $k") \|\| $dom->at("feed > $k"); # direct child
125	400	100				3623425	if ($p) {
126	94		100			721	$info{$k} = $p->text \|\| $p->content \|\| $p->attr('href');
127	94	100	100			6345	if ($k eq 'author' && $p->at('name')) {
128	14		33			2440	$info{$k} = $p->at('name')->text \|\| $p->at('name')->content;
129							}
130	94	100				3133	if ($is_time_field{$k}) {
131	17					143	$info{$k} = str2time($info{$k});
132							}
133							}
134							}
135							my ($htmlUrl)
136	50					169	= grep { defined $_ }
137	25					261	map { delete $info{$_} } ('link:not([rel])', 'link[rel=alternate]');
	50					231
138							my ($description)
139	75					186	= grep { defined $_ }
140	25	100				92	map { exists $info{$_} ? $info{$_} : undef }
	75					253
141							(qw(description tagline subtitle));
142	25	100				114	$info{htmlUrl} = $htmlUrl if ($htmlUrl);
143	25	100				105	$info{description} = $description if ($description);
144
145							# normalize fields:
146	25					184	my @replace = (
147							'pubDate' => 'published',
148							'dc\:date' => 'published',
149							'created' => 'published',
150							'issued' => 'published',
151							'updated' => 'published',
152							'modified' => 'published',
153							'dc\:creator' => 'author',
154							'webMaster' => 'author'
155							);
156	25					144	while (my ($old, $new) = splice(@replace, 0, 2)) {
157	200	100	100			674	if ($info{$old} && !$info{$new}) {
158	19					87	$info{$new} = delete $info{$old};
159							}
160							}
161	25	100				192	return (keys %info) ? \%info : undef;
162							}
163
164							sub parse_rss_item {
165	130			130	0	371	my ($item) = @_;
166	130					207	my %h;
167	130					521	foreach my $k (
168							qw(title id summary guid content description content\:encoded xhtml\:body dc\:creator author),
169							@time_fields
170							)
171							{
172	2210					35887	my $p = $item->at($k);
173	2210	100				1050578	if ($p) {
174
175							# skip namespaced items - like itunes:summary - unless explicitly
176							# searched:
177							next
178	779	100	100			4503	if ($p->tag =~ /\:/
			100
			100
			100
179							&& $k ne 'content\:encoded'
180							&& $k ne 'xhtml\:body'
181							&& $k ne 'dc\:date'
182							&& $k ne 'dc\:creator');
183	774		66			12189	$h{$k} = $p->text \|\| $p->content;
184	774	100	100			115183	if ($k eq 'author' && $p->at('name')) {
185	22					3793	$h{$k} = $p->at('name')->text;
186							}
187	774	100				7033	if ($is_time_field{$k}) {
188	269					1121	$h{$k} = str2time($h{$k});
189							}
190							}
191							}
192
193							# let's handle links seperately, because ATOM loves these buggers:
194							$item->find('link')->each(
195							sub {
196	137			137		86841	my $l = shift;
197	137	100				532	if ($l->attr('href')) {
198	130	100	100			2668	if (!$l->attr('rel') \|\| $l->attr('rel') eq 'alternate') {
199	122					2395	$h{'link'} = $l->attr('href');
200							}
201							}
202							else {
203	7	50				478	if ($l->text =~ /\w+/) {
204	7					290	$h{'link'} = $l->text; # simple link
205							}
206
207							# else { # we have an empty link element with no 'href'. :-(
208							# $h{'link'} = $1 if ($l->next->text =~ m/^(http\S+)/);
209							# }
210							}
211							}
212	130					876	);
213
214							# find tags:
215	130					3656	my @tags;
216							$item->find('category, dc\:subject')
217	130		66	93		417	->each(sub { push @tags, $_[0]->text \|\| $_[0]->attr('term') });
	93					59609
218	130	100				67839	if (@tags) {
219	76					323	$h{'tags'} = \@tags;
220							}
221							#
222							# normalize fields:
223	130					949	my @replace = (
224							'content\:encoded' => 'content',
225							'xhtml\:body' => 'content',
226							'summary' => 'description',
227							'pubDate' => 'published',
228							'dc\:date' => 'published',
229							'created' => 'published',
230							'issued' => 'published',
231							'updated' => 'published',
232							'modified' => 'published',
233							'dc\:creator' => 'author'
234
235							# 'guid' => 'link'
236							);
237	130					631	while (my ($old, $new) = splice(@replace, 0, 2)) {
238	1300	100	100			4251	if ($h{$old} && !$h{$new}) {
239	126					531	$h{$new} = delete $h{$old};
240							}
241							}
242	130					683	my %copy = ('description' => 'content', link => 'id', guid => 'id');
243	130					576	while (my ($fill, $required) = each %copy) {
244	390	100	100			1638	if ($h{$fill} && !$h{$required}) {
245	8					39	$h{$required} = $h{$fill};
246							}
247							}
248	130					533	$h{"_raw"} = $item->to_string;
249	130					127387	return \%h;
250							}
251
252							# find_feeds - get RSS/Atom feed URL from argument.
253							# Code adapted to use Mojolicious from Feed::Find by Benjamin Trott
254							# Any stupid mistakes are my own
255							sub find_feeds {
256	13			13	1	105752	my $self = shift;
257	13					44	my $url = shift;
258	13	100				63	my $cb = (ref $_[-1] eq 'CODE') ? pop @_ : undef;
259							# $self->ua->max_redirects(5)->connect_timeout(30);
260							my $main = sub {
261	13			13		47	my ($tx) = @_;
262	13					38	my @feeds;
263							# if ($tx->success) { say $tx->res->code } else { say $tx->error };
264	13	100	66			74	return unless ($tx->success && $tx->res->code == 200);
265	12					496	eval { @feeds = _find_feed_links($self, $tx->req->url, $tx->res); };
	12					61
266	12	50				53	if ($@) {
267	0					0	croak "Exception in find_feeds - ", $@;
268							}
269	12					219	return (@feeds);
270	13					100	};
271	13	100				42	if ($cb) { # non-blocking:
272							$self->ua->get(
273							$url,
274							sub {
275	3			3		25677	my ($ua, $tx) = @_;
276	3					19	my (@feeds) = $main->($tx);
277	3					24	$cb->(@feeds);
278							}
279	3					26	);
280							}
281							else {
282	10					88	my $tx = $self->ua->get($url);
283	10					95624	return $main->($tx);
284							}
285							}
286
287							sub _find_feed_links {
288	12			12		151	my ($self, $url, $res) = @_;
289
290	12					36	state $feed_ext = qr/\.(?:rss\|xml\|rdf)$/;
291	12					27	my @feeds;
292
293							# use split to remove charset attribute from content_type
294	12					47	my ($content_type) = split(/[; ]+/, $res->headers->content_type);
295	12	100				329	if ($is_feed{$content_type}) {
296	2					14	push @feeds, Mojo::URL->new($url)->to_abs;
297							}
298							else {
299							# we are in a web page. PHEAR.
300	10		33			69	my $base = Mojo::URL->new(
301							$res->dom->find('head base')->map('attr', 'href')->join('') \|\| $url)->to_abs($url);
302	10		33			297830	my $title
303							= $res->dom->find('head > title')->map('text')->join('') \|\| $url;
304							$res->dom->find('head link')->each(
305							sub {
306	131			131		54162	my $attrs = $_->attr();
307	131	100				2177	return unless ($attrs->{'rel'});
308	129					520	my %rel = map { $_ => 1 } split /\s+/, lc($attrs->{'rel'});
	129					479
309	129	100				382	my $type = ($attrs->{'type'}) ? lc trim $attrs->{'type'} : '';
310	129	50	33			722	if ($is_feed{$type} && ($rel{'alternate'} \|\| $rel{'service.feed'})) {
			66
311	8					43	push @feeds, Mojo::URL->new($attrs->{'href'})->to_abs($base);
312							}
313							}
314	10					46043	);
315							$res->dom->find('a')->grep(
316							sub {
317	374	50		374		138735	$_->attr('href')
318							&& Mojo::URL->new($_->attr('href'))->path =~ /$feed_ext/io;
319							}
320							)->each(
321							sub {
322	2			2		1133	push @feeds, Mojo::URL->new($_->attr('href'))->to_abs($base);
323							}
324	10					3414	);
325	10	100				8043	unless (@feeds)
326							{ # call me crazy, but maybe this is just a feed served as HTML?
327	5					40	my $body = $res->body;
328	5	100				225	if ($self->parse_feed(\$body)) {
329	1					12	push @feeds, Mojo::URL->new($url)->to_abs;
330							}
331							}
332							}
333	12					1612	return @feeds;
334							}
335
336							sub parse_opml {
337	3			3	1	16568	my ($self, $opml_file) = @_;
338	3	50				45	my $opml_str = decode $default_charset,
339							(ref $opml_file) ? $opml_file->slurp : Mojo::File->new($opml_file)->slurp;
340	3					1401	my $d = Mojo::DOM->new->parse($opml_str);
341	3					141784	my (%subscriptions, %categories);
342	3					20	for my $item ($d->find(q{outline})->each) {
343	962					222109	my $node = $item->attr;
344	962	100				12296	if (!defined $node->{xmlUrl}) {
345	40		66			194	my $cat = $node->{title} \|\| $node->{text};
346	40					132	$categories{$cat} = $item->children('[xmlUrl]')->map('attr', 'xmlUrl');
347							}
348							else { # file by RSS URL:
349	922					2576	$subscriptions{$node->{xmlUrl}} = $node;
350							}
351							}
352
353
354							# assign categories
355	3					335	for my $cat (keys %categories) {
356	40					116	for my $rss ($categories{$cat}->each) {
357	920	50				2118	next unless ($subscriptions{$rss}); # don't auto-vivify for empty "categories"
358	920		100			3001	$subscriptions{$rss}{'categories'} \|\|= [];
359	920					1143	push @{$subscriptions{$rss}{'categories'}}, $cat;
	920					1942
360							}
361							}
362	3					1346	return (values %subscriptions);
363							}
364
365
366							1;
367
368							=encoding utf-8
369
370							=head1 NAME
371
372							Mojolicious::Plugin::FeedReader - Mojolicious plugin to find and parse RSS & Atom feeds
373
374							=head1 SYNOPSIS
375
376							# Mojolicious
377							$self->plugin('FeedReader');
378
379							# Mojolicious::Lite
380							plugin 'FeedReader';
381
382							# Blocking:
383							get '/b' => sub {
384							my $self = shift;
385							my ($feed) = $self->find_feeds(q{search.cpan.org});
386							my $out = $self->parse_feed($feed);
387							$self->render(template => 'uploads', items => $out->{items});
388							};
389
390							# Non-blocking:
391							get '/nb' => sub {
392							my $self = shift;
393							$self->render_later;
394							my $delay = Mojo::IOLoop->delay(
395							sub {
396							$self->find_feeds("search.cpan.org", shift->begin(0));
397							},
398							sub {
399							my $feed = pop;
400							$self->parse_feed($feed, shift->begin);
401							},
402							sub {
403							my $data = pop;
404							$self->render(template => 'uploads', items => $data->{items});
405							});
406							$delay->wait unless Mojo::IOLoop->is_running;
407							};
408
409							app->start;
410
411							__DATA__
412
413							@@ uploads.html.ep
414
415							% for my $item (@$items) {
416							<%= link_to $item->{title} => $item->{link} %> - <%= $item->{description} %>
417							% }
418
419
420							=head1 DESCRIPTION
421
422							B implements minimalistic helpers for identifying,
423							fetching and parsing RSS and Atom Feeds. It has minimal dependencies, relying as
424							much as possible on Mojolicious components - Mojo::UserAgent for fetching feeds and
425							checking URLs, Mojo::DOM for XML/HTML parsing.
426							It is therefore rather fragile and naive, and should be considered Experimental/Toy
427							code - B.
428
429
430							=head1 METHODS
431
432							L inherits all methods from
433							L and implements the following new ones.
434
435							=head2 register
436
437							$plugin->register(Mojolicious->new);
438
439							Register plugin in L application. This method will install the helpers
440							listed below in your Mojolicious application.
441
442							=head1 HELPERS
443
444							B implements the following helpers.
445
446							=head2 find_feeds
447
448							# Call blocking
449							my (@feeds) = app->find_feeds('search.cpan.org');
450							# @feeds is a list of Mojo::URL objects
451
452							# Call non-blocking
453							$self->find_feeds('http://example.com', sub {
454							my (@feeds) = @_;
455							unless (@feeds) {
456							$self->render_exception("no feeds found, " . $info->{error});
457							}
458							else {
459							....
460							}
461							});
462
463							A Mojolicious port of L by Benjamin Trott. This helper implements feed auto-discovery for finding syndication feeds, given a URI.
464							If given a callback function as an additional argument, execution will be non-blocking.
465
466							=head2 parse_feed
467
468							# parse an RSS/Atom feed
469							# blocking
470							my $url = Mojo::URL->new('http://rss.slashdot.org/Slashdot/slashdot');
471							my $feed = $self->parse_feed($url);
472							for my $item (@{$feed->{items}}) {
473							say $_ for ($item->{title}, $item->{description}, 'Tags: ' . join q{,}, @{$item->{tags}});
474							}
475
476							# non-blocking
477							$self->parse_feed($url, sub {
478							my ($c, $feed) = @_;
479							$c->render(text => "Feed tagline: " . $feed->{tagline});
480							});
481
482							# parse a file
483							$feed2 = $self->parse_feed('/downloads/foo.rss');
484
485							# parse response
486							$self->ua->get($feed_url, sub {
487							my ($ua, $tx) = @_;
488							my $feed = $self->parse_feed($tx->res);
489							});
490
491							A minimalist liberal RSS/Atom parser, using Mojo::DOM queries.
492
493							Dates are parsed using L.
494
495							If parsing fails (for example, the parser was given an HTML page), the helper will return undef.
496
497							On success, the result returned is a hashref with the following keys:
498
499							=over 4
500
501							=item * title
502
503							=item * description (may be filled from subtitle or tagline if absent)
504
505							=item * htmlUrl - web page URL associated with the feed
506
507							=item * items - array ref of feed news items
508
509							=item * subtitle (optional)
510
511							=item * tagline (optional)
512
513							=item * author (name of author field, or dc:creator or webMaster)
514
515							=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
516
517							=back
518
519							Each item in the items array is a hashref with the following keys:
520
521							=over 4
522
523							=item * title
524
525							=item * link
526
527							=item * content (may be filled with content:encoded, xhtml:body or description fields)
528
529							=item * id (will be equal to link or guid if it is undefined and either of those fields exists)
530
531							=item * description (optional) - usually a shorter form of the content (may be filled with summary if description is missing)
532
533							=item * guid (optional)
534
535							=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
536
537							=item * author (may be filled from author or dc:creator)
538
539							=item * tags (optional) - array ref of tags, categories or dc:subjects.
540
541							=item * _raw - XML serialized text of the item's Mojo::DOM node. Note that this can be different from the original XML text in the feed.
542
543							=back
544
545							=head2 parse_opml
546
547							my @subscriptions = app->parse_opml( 'mysubs.opml' );
548							foreach my $sub (@subscriptions) {
549							say 'RSS URL is: ', $sub->{xmlUrl};
550							say 'Website URL is: ', $sub->{htmlUrl};
551							say 'categories: ', join ',', @{$sub->{categories}};
552							}
553
554							Parse an OPML subscriptions file and return the list of feeds as an array of hashrefs.
555
556							Each hashref will contain an array ref in the key 'categories' listing the folders (parent nodes) in the OPML tree the subscription item appears in.
557
558							=head1 CREDITS
559
560							Some tests adapted from L and L Feed autodiscovery adapted from L.
561
562							Test data (web pages, feeds and excerpts) included in this package is intended for testing purposes only, and is not meant in any way
563							to infringe on the rights of the respective authors.
564
565							=head1 COPYRIGHT AND LICENSE
566
567							Copyright (C) 2014, Dotan Dimet.
568
569							This program is free software, you can redistribute it and/or modify it
570							under the terms of the Artistic License version 2.0.
571
572							=head1 SEE ALSO
573
574							L, L, L
575
576							L, L, L
577
578							=cut