File Coverage

blib/lib/dirtyRSS.pm

Criterion	Covered	Total	%
statement	73	120	60.8
branch	28	62	45.1
condition	7	27	25.9
subroutine	4	7	57.1
pod	0	5	0.0
total	112	221	50.6

line	stmt	bran	cond	sub	pod	time	code
1							package dirtyRSS;
2
3	1			1		6948	use strict;
	1					3
	1					46
4	1			1		6	use warnings;
	1					3
	1					8547
5
6							require Exporter;
7
8							@dirtyRSS::ISA = qw[Exporter];
9							@dirtyRSS::EXPORT = qw[&parse &disptree];
10							$dirtyRSS::VERSION = '0.3';
11
12							our %htmlescapes = (
13							'quot' => 34,
14							'amp' => 38,
15							'apos' => 39,
16							'lt' => 60,
17							'gt' => 62,
18							'nbsp' => 32, # Was 160, but we make it a normal space
19							'iexcl' => 161,
20							'cent' => 162,
21							'pound' => 163,
22							'curren' => 164,
23							'yen' => 165,
24							'brvbar' => 166,
25							'sect' => 167,
26							'uml' => 168,
27							'copy' => 169,
28							'ordf' => 170,
29							'laquo' => 171,
30							'not' => 172,
31							'shy' => 173,
32							'reg' => 174,
33							'macr' => 175,
34							'deg' => 176,
35							'plusmn' => 177,
36							'sup2' => 178,
37							'sup3' => 179,
38							'acute' => 180,
39							'micro' => 181,
40							'para' => 182,
41							'middot' => 183,
42							'cedil' => 184,
43							'sup1' => 185,
44							'ordm' => 186,
45							'raquo' => 187,
46							'frac14' => 188,
47							'frac12' => 189,
48							'frac34' => 190,
49							'iquest' => 191,
50							'agrave' => 192,
51							'aacute' => 193,
52							'acirc' => 194,
53							'atilde' => 195,
54							'auml' => 196,
55							'aring' => 197,
56							'aelig' => 198,
57							'ccedil' => 199,
58							'egrave' => 200,
59							'eacute' => 201,
60							'ecirc' => 202,
61							'euml' => 203,
62							'igrave' => 204,
63							'iacute' => 205,
64							'icirc' => 206,
65							'iuml' => 207,
66							'eth' => 208,
67							'ntilde' => 209,
68							'ograve' => 210,
69							'oacute' => 211,
70							'ocirc' => 212,
71							'otilde' => 213,
72							'ouml' => 214,
73							'times' => 215,
74							'oslash' => 216,
75							'ugrave' => 217,
76							'uacute' => 218,
77							'ucirc' => 219,
78							'uuml' => 220,
79							'yacute' => 221,
80							'thorn' => 222,
81							'szlig' => 223,
82							'agrave' => 224,
83							'aacute' => 225,
84							'acirc' => 226,
85							'atilde' => 227,
86							'auml' => 228,
87							'aring' => 229,
88							'aelig' => 230,
89							'ccedil' => 231,
90							'egrave' => 232,
91							'eacute' => 233,
92							'ecirc' => 234,
93							'euml' => 235,
94							'igrave' => 236,
95							'iacute' => 237,
96							'icirc' => 238,
97							'iuml' => 239,
98							'eth' => 240,
99							'ntilde' => 241,
100							'ograve' => 242,
101							'oacute' => 243,
102							'ocirc' => 244,
103							'otilde' => 245,
104							'ouml' => 246,
105							'divide' => 247,
106							'oslash' => 248,
107							'ugrave' => 249,
108							'uacute' => 250,
109							'ucirc' => 251,
110							'uuml' => 252,
111							'yacute' => 253,
112							'thorn' => 254,
113							'yuml' => 255
114							);
115
116							# These are typical HTML tags, which should be omitted.
117
118							our %ignore_tags = (
119							'img' => 1,
120							'a' => 1,
121							'p' => 1,
122							'br' => 1,
123							'div' => 1,
124							'span' => 1,
125							'b' => 1,
126							'i' => 1,
127							'u' => 1,
128							'body' => 1,
129							'center' => 1,
130							'code' => 1,
131							'font' => 1,
132							'form' => 1,
133							'h1' => 1,
134							'h2' => 1,
135							'h3' => 1,
136							'h4' => 1,
137							'head' => 1,
138							'hr' => 1,
139							'html' => 1,
140							'li' => 1,
141							'ul' => 1,
142							'ol' => 1,
143							'pre' => 1,
144							'style' => 1,
145							'sub' => 1,
146							'sup' => 1,
147							'script' => 1,
148							'small' => 1,
149							'big' => 1,
150							'table' => 1,
151							'td' => 1,
152							'tr' => 1,
153							'th' => 1,
154							'textarea'=> 1,
155							'strong' => 1,
156							'strike' => 1,
157							'blockquote' => 1,
158							);
159
160							our %ns = (
161							# RSS 2.0 tags
162							'xml' => 'xml',
163							'rss' => 'rss',
164							'rdf' => 'rdf',
165							'item' => 'item',
166							'channel' => 'channel',
167							'image' => 'image',
168							'title' => 'title',
169							'link' => 'link',
170							'description' => 'description',
171							'language' => 'language',
172							'copyright' => 'copyright',
173							'pubdate' => 'pubdate',
174							'lastbuilddate'=> 'lastbuilddate',
175							'category' => 'category',
176							'generator' => 'generator',
177							'ttl' => 'ttl',
178							'url' => 'url',
179							'width' => 'width',
180							'height' => 'height',
181							'version' => 'version',
182							'encoding' => 'encoding',
183							'guid' => 'guid',
184							'enclosure' => 'enclosure',
185
186							# RSS 1.0 tags translated to RSS 2.0
187							'subject' => 'category',
188							'rights' => 'copyright',
189							'modified' => 'lastbuilddate',
190							'date' => 'pubdate',
191							'resource' => 'resource', # 1.0 specific!
192
193							# Atom 1.0 tags translated to RSS 2.0
194							'feed' => 'channel',
195							'summary' => 'description',
196							'content' => 'description',
197							'subtitle' => 'description',
198							'lang' => 'language',
199							'published' => 'pubdate',
200							'updated' => 'lastbuilddate',
201							'logo' => 'image',
202							'entry' => 'item',
203							'href' => 'link',
204							);
205
206							# Note that %specials refer to the right side of %ns, so only one
207							# entry is needed for each functional tag or its alias
208
209							# TRUE means array type
210							our %specials = (
211							'item' => 1,
212							'channel' => 1,
213							'image' => 1,
214							'xml' => 0,
215							'rss' => 0,
216							'rdf' => 0,
217							);
218
219							sub parse {
220	1			1	0	102	my ($in, $debug) = @_;
221
222	1					5	$in =~ s///gs; # Remove comments
223
224	1					90	my @segs = map { /^[ \n\r\t](.?)[ \n\r\t]*$/s } ($in =~ /(\|<[^>]+?>\|[^<]+)/gs);
	90					309
225
226							# Strip off CDATAs. Added a prefix space to avoid accidental tag hits
227	1	50				13	@segs = map { /^$/s ? " $1" : $_ } @segs;
	90					172
228
229	1					11	@segs = grep { length > 0 } @segs;
	90					136
230
231	1					5	my @stack = ();
232	1					2	my @valstack = ();
233	1					3	my %tree = ();
234	1					2	my $here = \%tree;
235	1					2	my @parent = ();
236	1					2	my $lastval = "";
237
238	1					3	foreach my $elem (@segs) {
239	63					328	my ($modifier, $tag, $attr, $empty) = ($elem =~ /^<([!?\#]{0,1})[ \n\r\t]([^ \n\r\t][^ \/\n\r\t])[ \n\r\t](.?)[ \n\r\t]*(\/{0,1})>$/s);
240
241	63	100				126	$empty = 1 if ($modifier);
242
243	63	100				99	if (defined $tag) {
244	45					61	$tag = lc $tag; # We're case-insensitive
245
246							# Note that the regex below removes "dc:"-like namespace prefices
247	45					43	my $closing;
248	45					185	($closing, $tag) = ($tag =~ /^(\/{0,1}).?:{0,1}([^:])$/);
249
250	45	50				125	if ($ignore_tags{$tag}) {
251	0					0	htmltags($here, unescape($elem));
252	0					0	next;
253							}
254
255	45	100				76	unless ($closing) { # Opening tags...
256	23					35	push @stack, $tag;
257
258	23					34	my $alias = $ns{$tag};
259
260	23	50				35	if (defined $alias) {
261	23					37	push @valstack, $lastval;
262	23					57	$lastval = "";
263
264	23	100				55	if (defined $specials{$alias}) {
265	5					7	push @parent, $here;
266	5					7	$here = {};
267							}
268
269							# Note that attributes may pollute the parent hash. This is
270							# necessary to support Atom 1.0
271
272	23					56	my @pairs = ($attr =~ /([^ \n\r\t]+?=\'[^\']?\'\|[^ \n\r\t]+?=\"[^\"]?\"\|[^ \n\r\t]+?=[^ \n\r\t]*)/g);
273
274	23					117	foreach my $p (@pairs) {
275	4					19	my ($k, $v) = ($p =~ /(.+?)=(.*)/);
276
277	4					8	$k = lc $k;
278
279	4	50	33			35	$v = $1
280							if (($v =~ /^\'(.)\'$/s) \|\| ($v =~ /^\"(.)\"$/s));
281
282	4					16	($k) = ($k =~ /([^:]*)$/); # Remove namespace prefix if present
283
284	4					9	my $alias = $ns{$k};
285
286	4	100				8	if (defined $alias) {
287	3					8	$here->{$alias} = unescape($v);
288							} else {
289	1	50				7	warn "Ignored attribute $k=$v\n"
290							if $debug;
291							}
292							}
293							} else {
294	0	0				0	warn "Ignored tag $tag\n"
295							if $debug;
296							}
297							}
298
299	45	100	100			154	if ($closing \|\| $empty) { # Closing tags, or close an empty opening tag
300	23					33	my $p = pop @stack;
301
302	23	50				50	return "Bad XML tag nesting. Expected end tag for '$p', got '/$tag'"
303							unless ($p eq $tag);
304
305	23					40	my $alias = $ns{$tag};
306
307	23	50				42	if (defined $alias) {
308	23					27	my $thislastval = $lastval;
309	23					29	$lastval = pop @valstack;
310
311	23	100				62	if (defined $specials{$alias}) {
312	5					6	my $parent = pop @parent;
313
314	5	100				34	if ($specials{$alias}) { # Array type
315	3	50	33			27	$parent->{$alias} = []
316							unless ((ref $parent->{$alias}) &&
317							(ref $parent->{$alias}) eq 'ARRAY');
318	3					7	push @{$parent->{$alias}}, $here;
	3					8
319							} else {
320	2					6	$parent->{$alias} = $here;
321							}
322
323							# UGLY HACK ALERT:
324							# Just before leaving a tree node, we clean the 'description'
325							# from possible HTML tags, and harvest the relevant values,
326							# if applicable. This is because some feeds think that the
327							# description should be rendered on a browser as is (cross
328							# scripting, anybody?)
329
330	5	100				20	$here->{'description'} =~ s/(<.*?>)/htmltags($here, $1)/ges
	0					0
331							if (defined $here->{'description'});
332
333	5					14	$here = $parent;
334							} else {
335	18	0	33			61	$here->{$alias} = unescape($thislastval)
			33
336							unless ((length($thislastval) == 0) &&
337							(defined $here->{$alias}) &&
338							(length $here->{$alias}));
339							}
340							}
341							}
342							} else {
343	18	50				42	$lastval = (length $lastval) ? "$lastval $elem" : $elem;
344							}
345
346							}
347	1	50				5	return("Bad XML nesting: There were unclosed tags at EOF")
348							if (@stack);
349
350	1					13	return \%tree;
351
352							}
353
354							sub htmltags {
355	0			0	0	0	my ($here, $seg) = @_;
356
357	0					0	my ($tag, $attr) = ($seg =~ /^<[ \n\r\t]([^ \n\r\t]+)[ \n\r\t](.?)[ \n\r\t]>$/s);
358
359	0	0				0	return "" unless (defined $tag);
360
361	0					0	$tag = lc $tag;
362
363							# Respect HTML line breaks, even though the renderer won't
364	0	0	0			0	return "\n"
365							if (($tag eq 'p') \|\| ($tag eq 'br'));
366
367	0	0	0			0	if (($tag eq 'img') && !(defined $here->{'altimage'})) {
		0	0
368	0					0	my $new = {};
369	0					0	$here->{'altimage'} = $new;
370	0					0	$here = $new;
371							} elsif (($tag eq 'a') && !(defined $here->{'altlink'})) {
372	0					0	my $new = {};
373	0					0	$here->{'altlink'} = $new;
374	0					0	$here = $new;
375	0					0	} else { return ""; }
376
377	0					0	my @pairs = ($attr =~ /([^ \n\r\t]+?=\'[^\']?\'\|[^ \n\r\t]+?=\"[^\"]?\"\|[^ \n\r\t]+?=[^ \n\r\t]*)/g);
378
379	0					0	foreach my $p (@pairs) {
380	0					0	my ($k, $v) = ($p =~ /(.+?)=(.*)/);
381
382	0					0	$k = lc $k;
383
384	0	0	0			0	$v = $1
385							if (($v =~ /^\'(.)\'$/s) \|\| ($v =~ /^\"(.)\"$/s));
386
387	0					0	$here->{$k} = $v;
388							}
389
390	0					0	return ""; # This makes the function useful in substitutions
391							}
392
393							sub single_unescape {
394	0			0	0	0	my ($ent) = @_;
395
396	0					0	my $ord = $htmlescapes{lc($ent)};
397	0	0				0	return chr($ord) if defined $ord;
398	0					0	return ""; # Conversion failed, return nothing
399							}
400
401							sub unescape {
402							# Note! Unicode characters are escaped to space!
403	21			21	0	24	my ($x) = @_;
404							# For now, we go wild, and convert all escape markers
405
406							# Run twice, because of double-nested markups :-O
407	21					50	for (my $i=0; $i<2; $i++) {
408	42					48	$x =~ s/&(\w+);/single_unescape($1)/ge;
	0					0
409	42	0				48	$x =~ s/&\#(\d+);/chr($1 < 256 ? $1 : 32)/ge;
	0					0
410	42	0				87	$x =~ s/&\#x([0-9a-fA-F]+);/chr(hex($1) < 256 ? hex($1) : 32)/ige;
	0					0
411							}
412	21					105	return $x;
413							}
414
415							sub disptree {
416	0			0	0		my ($what, $s) = @_;
417
418	0						foreach my $k (sort keys %{$what}) {
	0
419	0						my $v = $what->{$k};
420
421	0	0					if ((ref $v) eq 'HASH') {
422	0						print " "x$s."$k\n";
423	0						disptree($v, $s+2);
424	0						next;
425							}
426
427	0	0					if ((ref $v) eq 'ARRAY') {
428	0						my $count;
429	0						for ($count=0; $count<=$#{$v}; $count++) {
	0
430	0						print " "x$s.$k."[$count]\n";
431	0						disptree($v->[$count], $s+2);
432							}
433	0						next;
434							}
435
436	0						print " "x$s."$k => $v\n";
437							}
438							}
439
440							1;
441
442							__END__