File Coverage

blib/lib/XML/RSSLite.pm

Criterion	Covered	Total	%
statement	94	164	57.3
branch	38	94	40.4
condition	7	23	30.4
subroutine	11	13	84.6
pod	2	4	50.0
total	152	298	51.0

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl
2							package XML::RSSLite;
3	2			2		787	use strict;
	2					9
	2					84
4	2			2		9	use vars qw($VERSION);
	2					4
	2					123
5
6							$VERSION = 0.16;
7
8							sub import{
9	2			2		10	no strict 'refs';
	2					2
	2					4040
10	2			2		16	shift;
11	2					4	my $pkg = scalar caller();
12	2					7	*{"${pkg}::parseRSS"} = \&parseRSS;
	2					11
13	2	50				2673	*{"${pkg}::parseXML"} = \&parseXML if grep($_ eq 'parseXML', @_);
	0					0
14							}
15
16
17							sub parseRSS {
18	2			2	1	221	my($rr, $cref, $strip) = @_;
19
20	2	50				8	die "$rr is not a hash reference" unless ref($rr) eq 'HASH';
21	2	50				4	die "$cref is not a scalar reference" unless ref($cref) eq 'SCALAR';
22
23							# Gotta have some content to parse
24	2	50				4	return unless $$cref;
25
26	2					7	preprocess($cref, $strip);
27							{
28	2	100				2	_parseRSS($rr, $cref), last if index(${$cref}, '
	2					3
	2					11
29	1	50				2	_parseRDF($rr, $cref), last if index(${$cref}, '
	1					5
30	0	0				0	_parseSN( $rr, $cref), last if index(${$cref}, '
	0					0
31	0	0				0	_parseWL( $rr, $cref), last if index(${$cref}, '
	0					0
32	0					0	die "Content must be RSS\|RDF\|ScriptingNews\|Weblog\|reasonably close";
33							}
34	2					5	postprocess($rr);
35							}
36
37							sub preprocess {
38	2			2	0	3	my($cref, $strip) = @_;
39	2					21	$$cref =~ y/\r\n/\n/s;
40
41	2	100				8	if( ! defined($strip) ){
		50
42	1					10	$$cref =~ y{\n\t ~0-9\-+!@#$%^&*()_=a-zA-Z[]\\;':",./<>?}{ }cs;
43							}
44							elsif($strip eq '1' ){
45	0					0	$$cref =~ s/[^:print:]]/ /g;
46							}
47
48							#XXX $$cref =~ s/&(?!0[a-zA-Z0-9]+\|#\d+);/amp/gs;
49							#XXX Do we wish to (re)allow escaped HTML?!
50	2					681	$$cref =~ s{(?:<\|<)/?(?:b\|i\|h[1-6]\|p\|center\|quote\|strong)(?:>\|>)}{}gsi;
51							}
52
53							sub _parseRSS {
54	1			1		3	parseXML($_[0], $_[1], 'channel', 0);
55	1					3	$_[0]->{'items'} = $_[0]->{'item'};
56							}
57
58							sub _parseRDF {
59	1			1		2	my ($rr, $cref) = @_;
60
61	1					3	$rr->{'items'} = [];
62	1					1	my $item;
63
64	1					3	parseXML($_[0], $_[1], 'rdf:RDF', 0);
65
66							# Alias RDF to RSS
67	1	50				3	if( exists($rr->{'item'}) ){
68	1					3	$rr->{'items'} = $rr->{'item'};
69							}
70							else{
71	0		0			0	my $li = $_[0]->{'rdf:li'} \|\| $_[0]->{'rdf:Seq'}->{'rdf:li'};
72	0					0	foreach $item ( @{$li} ){
	0					0
73	0					0	my %ia;
74	0	0				0	if (exists $item->{'dc:description'}) {
75	0					0	$ia{'description'} = $item->{'dc:description'};
76							}
77	0	0				0	if (exists $item->{'dc:title'}) {
78	0					0	$ia{'title'} = $item->{'dc:title'};
79							}
80	0	0				0	if (exists $item->{'dc:identifier'}) {
81	0					0	$ia{'link'} = delete($item->{'dc:identifier'});
82							}
83
84	0					0	push(@{$rr->{'items'}}, \%ia);
	0					0
85							}
86							}
87							}
88
89							sub _parseSN {
90	0			0		0	my ($rr, $cref) = @_;
91
92	0					0	$rr->{'items'} = ();
93	0					0	my $item;
94
95	0					0	parseXML($rr, $cref, 'channel', 0);
96
97							# Alias SN to RSS terms
98	0					0	foreach $item ( @{$_[0]->{'rdf:li'}} ){
	0					0
99	0					0	my %ia;
100	0	0				0	if (exists $item->{'text'}) {
101	0					0	$ia{'description'} = $item->{'text'};
102							}
103	0	0				0	if (exists $item->{'linetext'}) {
104	0					0	$ia{'title'} = $item->{'linetext'};
105							}
106	0	0				0	if (exists $item->{'url'}) {
107	0					0	$ia{'link'} = $item->{'url'};
108							}
109
110	0					0	push(@{$rr->{'items'}}, \%ia);
	0					0
111							}
112							}
113
114
115							sub _parseWL {
116	0			0		0	my ($rr, $cref) = @_;
117
118	0					0	$rr->{'items'} = ();
119	0					0	my $item;
120
121							#XXX is this the right tag to parse for?
122	0					0	parseXML($rr, $cref, 'channel', 0);
123
124							# Alias WL to RSS
125	0					0	foreach $item ( @{$_[0]->{'rdf:li'}} ){
	0					0
126	0					0	my %ia;
127	0	0				0	if (exists $item->{'url'}) {
128	0					0	$ia{'link'} = delete($item->{'url'});
129							}
130
131	0					0	push(@{$rr->{'items'}}, \%ia);
	0					0
132							}
133							}
134
135
136							sub postprocess {
137	2			2	0	2	my $rr = shift;
138
139							#XXX Not much to do, what about un-munging URL's in source, etc.?!
140	2	50				5	return unless defined($rr->{'items'});
141	2	50				5	$rr->{'items'} = [$rr->{'items'}] unless ref($rr->{'items'}) eq 'ARRAY';
142
143	2					2	foreach my $i (@{$rr->{'items'}}) {
	2					6
144	26	50				37	$i->{description} = $i->{description}->{'<>'} if ref($i->{description});
145
146							# Put stuff into the right name if necessary
147	26	50				37	if( not $i->{'link'} ){
148	0	0				0	if( defined($i->{'url'}) ){
		0
149	0					0	$i->{'link'} = delete($i->{'url'}); }
150							# See if you can use misplaced url in title for empty links
151							elsif( exists($i->{'title'}) ){
152							# The next case would trap this, but try to short-circuit the gathering
153	0	0				0	if ($i->{'title'} =~ /^(?:https?\|ftp):/) {
		0
154	0					0	$i->{'link'} = $i->{'title'};
155							}
156							elsif ($i->{'title'} =~ /"((?:https?\|ftp).*?)"/) {
157	0					0	$i->{'link'} = $1;
158	0					0	$i->{'title'} =~ s/<.*?>//;
159							}
160							else {
161	0					0	next;
162							}
163							}
164							}
165
166							# Trim whitespace
167	26					42	$i->{'link'} =~ s/\s+$//;
168	26					61	$i->{'link'} =~ s/^\s+//;
169
170							# Make sure you've got an http/ftp link
171	26	50	33			96	if( exists( $i->{'link'}) && $i->{'link'} !~ m{^(https?\|ftp)://}i) {
172							## Rip link out of anchor tag
173	0	0	0			0	if( ref($i->{'link'}) && $i->{'link'}->{a}->{href} ){
		0	0
174	0					0	$i->{'link'} = $i->{'link'}->{a}->{href} }
175							## Smells like a relative url
176							elsif( $i->{'link'} =~ m{^[#/]} and $rr->{'link'} =~ m{^https?://} ){
177	0	0				0	if (substr($i->{'link'}, 0, 1) ne '/') {
178	0					0	$i->{'link'} = '/' . $i->{'link'};
179							}
180	0					0	$i->{'link'} = $rr->{'link'} . $i->{'link'};
181							}
182							else {
183	0					0	next;
184							}
185							}
186
187							#If we don't have a title, use the link
188	26	50				38	unless( defined($i->{'title'}) ){
189	0					0	$i->{'title'} = $i->{'link'};
190							}
191
192	26	50				37	if( exists($i->{'link'}) ){
193							#XXX # Fix pre-process munging
194							# $i->{'link'} =~ s/&/&/gi;
195	26					40	$i->{'link'} =~ s/ /%20/g;
196							}
197							}
198							}
199
200							sub parseXML{
201	2			2	1	4	my($hash, $xml, $tag, $comments) = @_;
202	2					2	my($begin, $end, @comments);
203	2					4	local $_;
204
205							#Kill comments
206	2		33			2	while( ($begin = index(${$xml}, '') for @comments;
	0					0
223
224							#Expose comments if requested
225	0	0				0	do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY';
	0					0
226							}
227							}
228
229							sub _parseXML{
230	34			34		58	my($hash, $xml, $tag, $index) = @_;
231	34					37	my($begin, $end);
232
233							#Find topTag and set pos to start matching from there
234	34					40	${$xml} =~ /<$tag(?:>\|\s)/g;
	34					201
235	34		50			117	($begin, $end) = (0, pos(${$xml})\|\|0);
236
237							#Match either or , optional attributes, stash tag name
238	34					39	while( ${$xml} =~ m%<([^\s>]+)(?:\s+[^>]?)?(?:/\|>.?%sg ){
	179					851
239
240							#Save the tag name, we'll need it
241	145		33			334	$tag = $1 \|\| $2;
242
243							#Save the new beginning and end
244	145					147	($begin, $end) = ($end, pos(${$xml}));
	145					206
245
246							#Get the bit we just matched.
247	145					140	my $str = substr(${$xml}, $begin, $end-$begin);
	145					259
248
249							#Extract the actual attributes and contents of the tag
250	145	100				2319	$str =~ m%<\Q$tag\E\s([^>]?)?>(.*?)%s \|\|
251							#XXX pointed out by hv
252							# $str =~ s%^.?<$tag\s([^>]?)?>(.?)%<$tag>$2%s \|\|
253							$str =~ m%<\Q$tag\E\s([^>]?)?\s*/>%;
254	145					388	my($attr, $content) = ($1, $2);
255
256							#Did we get attributes? clean them up and chuck them in a hash.
257	145	100				240	if( $attr ){
258	43					72	($_, $attr) = ($attr, {});
259	43					335	$attr->{$1} = $3 while m/([^\s=]+)\s=\s(['"]?)([^\2>]*?)(?:\2\|$)/g;
260							}
261
262	145					152	my $inhash;
263							#Recurse if contents has more tags, replace contents with reference we get
264	145	100	100			350	if( $content && index($content, '<') > -1 ){
265	32					85	_parseXML($inhash={}, \$str, $tag);
266							#Was there any data in the contents? We should extract that...
267	32	50				89	if( $str =~ />[^><]+
268							#The odd RE above shortcircuits unnecessary entry
269
270							#Clean whitespace between tags
271							#$str =~ s%(?<=>)?\s*(?=<)%%g; #XXX ~same speed, wacko warning
272							#$str =~ s%(>?)\s*<%$1<%g;
273							#XXX #$str =~ s%(?:^\|(?<=>))\s*(?:(?=<)\|\z)%%g
274
275	32					36	my $qr = qr{@{[join('\|', map { quotemeta } keys %{$inhash})]}};
	32					34
	89					600
	32					77
276	32					789	$content =~ s%<($qr)\s(?:[^>]?)?(?:/\|>.*?%%sg;
277
278	32	50				113	$inhash->{'<>'} = $content if $content =~ /\S/;
279							}
280							}
281
282	145	100				246	if( ref($inhash) ){
		100
283							#We have attributes? Then we should merge them.
284	32	100				48	if( ref($attr) ){
285	22					23	for( keys %{$attr} ){
	22					50
286							$inhash->{$_} = exists($inhash->{$_}) ?
287							(ref($inhash->{$_}) eq 'ARRAY' ?
288	0					0	[@{$inhash->{$_}}, $attr->{$_}] :
289	22	0				52	[ $inhash->{$_}, $attr->{$_}] ) : $attr->{$_};
		50
290							}
291							}
292							}
293							elsif( ref($attr) ){
294	21					23	$inhash = $attr;
295							}
296							else{
297							#Otherwise save our content
298	92					102	$inhash = $content;
299							}
300
301							$hash->{$tag} = exists($hash->{$tag}) ?
302							(ref($hash->{$tag}) eq 'ARRAY' ?
303	40					151	[@{$hash->{$tag}}, $inhash] :
304	145	100				340	[ $hash->{$tag}, $inhash] ) : $inhash;
		100
305							}
306							}
307
308							1;
309							__END__