File Coverage

blib/lib/WWW/Blog/Identify.pm
Criterion Covered Total %
statement 6 98 6.1
branch 0 172 0.0
condition n/a
subroutine 2 3 66.6
pod 1 1 100.0
total 9 274 3.2


line stmt bran cond sub pod time code
1             package WWW::Blog::Identify;
2              
3 1     1   561716 use strict;
  1         2  
  1         36  
4 1     1   7 use warnings;
  1         3  
  1         2133  
5              
6             require Exporter;
7            
8             our @ISA = qw(Exporter);
9             our @EXPORT_OK = qw/identify/;
10              
11              
12             our $VERSION = '0.06';
13              
14              
15             sub identify {
16 0     0 1   my ($url, $text) = @_;
17            
18 0           $url = lc( $url );
19 0           local $_ = $url;
20            
21             # patterns ordered roughly in terms of frequency
22              
23              
24             #
25             # URL CHECKING
26             #
27            
28 0 0         return "blogspot" if /\.blogspot\.com/o;
29 0 0         return "blogger" if m|\.blogger\.com/|o;
30            
31 0 0         return "blogger (br)" if m|\.blogger\.com\.br|o; # Brazilian Blogger
32 0 0         return 'terra' if m|weblogger\.(terra\.)?com\.br/|o;
33 0 0         return "diaryland" if /\.diaryland\./o;
34 0 0         return "livejournal" if /\.livejournal\.com/o;
35 0 0         return "journalspace" if /\.journalspace\.com/o;
36 0 0         return "blogalia" if /\.blogalia\.com/o;
37 0 0         return "pitas" if /\.pitas\.com/o;
38 0 0         return "persianblog" if /\.persianblog\.com/o; # Farsi
39 0 0         return "persianlog" if /\bpersianlog\.com/o; # Farsi
40 0 0         return "diaryhub" if /\.diaryhub\.(?:com|net)\/?$/io; # Thai
41            
42 0 0         return "radio" if /radio.weblogs\.com/o;
43 0 0         return "radio" if /blogs.law.harvard.edu/o;
44 0 0         return "radio" if /\.blogs.it\b/o;
45            
46 0 0         return "manila" if /\.manilasites\.com/o;
47 0 0         return "manila" if /\.editthispage\.com/o;
48 0 0         return "manila" if m|\.weblogger\.com/|o;
49 0 0         return "manila" if m|\.weblogs\.com/|o;
50            
51 0 0         return "20six" if m|\.20six\.|o;
52 0 0         return "typepad" if m|\.typepad\.|o;
53            
54 0 0         return "twoday" if /\.twoday\.net/o;
55 0 0         return "salon" if /blogs\.salon\.com/o;
56 0 0         return "splinder" if /\.splinder\.it/o; # Italy
57 0 0         return "diarist" if /\.diarist\.com/o;
58 0 0         return "antville" if /\.antville\.org/o;
59 0 0         return 'bloggingnetwork' if m|\.bloggingnetwork\.com/blogs|o;
60 0 0         return "crimsonblog" if /\.crimsonblog\./o;
61 0 0         return "skyblog" if /\.skyblog\.com/o; # French
62            
63 0 0         return "blog.pl (polish)" if /\.blog\.pl/o;
64 0 0         return "e-blog.pl (polish)" if /\.e-blog\.pl/o;
65 0 0         return "weblog.pl (polish)" if /\.weblog\.pl/o;
66            
67 0 0         return "twoday" if /\.twoday\.net/o;
68 0 0         return "monblogue" if /\.monblogue\.com/o;
69 0 0         return 'joueb' if m|joueb\.com/|o; # France
70 0 0         return 'blogstudio' if m|\.blogstudio\.com/|o;
71 0 0         return 'blog-city' if m|blog-city\.com/|o;
72 0 0         return 'blogsky' if m|\.blogsky\.com/|o; # English and Persian
73 0 0         return 'u-blog' if m|u-blog\.net/|o; # France
74 0 0         return 'barrapunto' if m|\bbarrapunto\.com/index\.pl|o; # Spain
75 0 0         return 'blig' if m|\.blig\.(?:ig.)?com\.br|o; # Brazil
76 0 0         return 'g-blog' if m|g-blog\.net/|o;
77 0 0         return 'babelogue' if m|babelogue\.citypages\.com|io;
78 0 0         return 'jevon' if m|\.jevon\.org/|io;
79 0 0         return 'tripod' if m|\.tripod\.com/|io;
80            
81 0 0         return 'xanga' if m|\.xanga\.com|o;
82             #
83             # CONTENT CHECKING
84             #
85              
86 0           local $_ = $text;
87            
88             # First, check META tags
89            
90 0 0         return "postnuke" if m|CONTENT="Post-?Nuke|io; # Nuke is nice enough to use META tags
91 0 0         return "php-nuke" if m|CONTENT="PHP-?Nuke|io;
92 0 0         return "microsoft" if m|]+Content=['"]Microsoft Visual|io;
93 0 0         return "nucleus" if m|]+content=['"]Nucleus|io;
94 0 0         return "greymatter" if m|]+content=['"]Greymatter|io;
95 0 0         return "land down under" if m|]+content=['"]Land Down Under|io;
96            
97             # Next, check actual content
98            
99 0 0         return "movable type" if m|cgi-bin/mt|o;
100 0 0         return "movable type" if m|Powered by.*Move?able ?Type|io; # common typo is 'Moveable'
101 0 0         return "movable type" if m|mtblog.gif|io;
102 0 0         return "movable type" if m|move?abletype.gif|o;
103 0 0         return "movable type" if m!function Open(Trackback|Comments)\s+\(c\)!o; # default MT JavaScript
104              
105 0 0         return "blogger pro" if m|powered_by_blogger_pro[0-9]*\.gif|io;
106 0 0         return "blogger pro" if m|powered by:? 107            
108 0 0         return "blogger" if m|bloggerbutton[0-9]+.gif|io;
109 0 0         return "blogger" if m|bloggertemplate[^.]+.gif|io;
110 0 0         return "blogger" if m|blogger_bluelong.gif|o;
111 0 0         return "blogger" if m|powered by ( 112            
113 0 0         return "radio" if m|img src="http://radio.weblogs.com|io;
114 0 0         return "radio" if m|http://radio.xmlstoragesystem.com/weblogStats|oi;
115 0 0         return "radio" if m|images/radioUserLand|oi;
116 0 0         return "radio" if m|xmlCoffeeCup|oi;
117            
118 0 0         return "manila" if m|thisIsAManilaSite|oi;
119            
120 0 0         return "cafelog" if m!function b2(?:open|comment)!o; # default cafelog JavaScript
121 0 0         return "cafelog" if m|powered by ( 122            
123 0 0         return "pivot" if m||io;
124 0 0         return "pivot" if m|pivot-?banner[^.]*.gif|io;
125            
126 0 0         return "textpattern" if m|txp_slug|o;
127 0 0         return "blosxom" if /blosxom\.gif/o;
128            
129 0 0         return "slogger" if /Created by Slogger/io;
130            
131 0 0         return "greymatter" if /gm-icon.gif/o;
132 0 0         return "greymatter" if /Powered by Greymatter/io;
133            
134 0 0         return "pMachine" if m|alt="[^"]+ pMachine|io; # This can be "Powered by" or "Gemaakt mit", for example
135 0 0         return "pMachine" if m|powered by (?: 136 0 0         return "pMachine" if m|pmachine.gif|io;
137            
138 0 0         return "psychoblogger" if m|Powered by (?:]+>)?Psychoblogger|io;
139 0 0         return "WebCrimson" if m|Powered by (?:]+>)?WebCrimson|io;
140            
141             # Tests of last resort
142 0           my @blog_count = $text =~ /\bblog\b/gi;
143            
144 0 0         return "suspected by URL" if $url =~ /[\W\-_](?:we)?blog/o;
145 0 0         return "suspected by URL" if $url =~ /\bbitacoras\b/i;
146 0 0         return "suspected by rss" if $text =~ /\brss\b/i;
147 0 0         return "suspected by content" if scalar @blog_count > 5;
148            
149 0           return;
150            
151             }
152              
153             1;
154              
155             __END__