File Coverage

blib/lib/WWW/Yandex/Catalog/LookupSite.pm
Criterion Covered Total %
statement 14 78 17.9
branch 0 20 0.0
condition 0 2 0.0
subroutine 6 17 35.2
pod 10 10 100.0
total 30 127 23.6


\s*.*(.*)(
(.*)
.*?)?(\d+)<}s;
line stmt bran cond sub pod time code
1             package WWW::Yandex::Catalog::LookupSite;
2              
3             # Last updated July 27, 2014
4             #
5             # Author: Irakliy Sunguryan ( www.sochi-travel.info )
6             # Date Created: January 30, 2010
7              
8 1     1   69256 use strict;
  1         2  
  1         40  
9 1     1   4 use warnings;
  1         2  
  1         35  
10              
11 1     1   5 use vars qw($VERSION);
  1         313  
  1         59  
12             $VERSION = '0.11';
13              
14 1     1   1413 use LWP::UserAgent;
  1         195720  
  1         52  
15              
16             my $HAS_PUNYMOD;
17 1     1   105 BEGIN { $HAS_PUNYMOD = eval 'use URI::UTF8::Punycode; 1;'; }
  1     1   1591  
  0            
  0            
18              
19              
20             sub new {
21 0     0 1   my $class = shift;
22 0           my %options = @_;
23              
24 0           my $self = {
25             _tic => undef,
26             # undef - if there was an error getting or parsing data
27             # 0 - (a) when site is not present in catalog and tIC is < 10
28             # - (b) when site is present in catalog, but the catalog
29             # reports it as zero (payed submission)
30             _shortDescr => undef,
31             # defined only when site is present in catalog; undef otherwise
32             _longDescr => undef,
33             # can be undef when site is present in catalog!
34             # not all sites in the catalog have long description
35             _categories => [],
36             # empty when site is not present in catalog
37             # at least one entry when present in catalog
38             _orderNum => undef,
39             # order number in the sub-category of catalog; "main" subcategory,
40             # when there are more than one.
41             # defined only when site is present in the catalog; undef otherwise
42             _uri => undef,
43             # URI as it is recorded in catalog. for example with/without 'www' prefix
44             # or it can be recorded with totally different address (narod.ru -> narod.yandex.ru)
45             # defined only when site is present in catalog; undef otherwise
46             };
47            
48 0           $self->{ua} = LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION );
49              
50             # Pass options on to LWP::UserAgent
51 0           foreach my $option ( keys %options ) {
52 0           $self->{ua}->$option( $options{$option} );
53             }
54            
55 0           bless $self, $class;
56 0           return $self;
57             }
58              
59              
60             # Returns [ tIC, short description, long description, [list of catalogs], URI as returned by Yaca, order number in the main category ]
61             # "yaca" - Yandex Catalog
62             sub yaca_lookup {
63 0     0 1   my $self = shift;
64              
65 0   0       my $address = shift || return;
66              
67             # an $address is nomally a domain name (whatever level), but can include path too.
68             # scheme, authentication, port, and query strings are stripped --
69             # assuming Yandex won't accept URIs that contain all this
70              
71 0           $self->{_tic} = $self->{_shortDescr} = $self->{_longDescr} = $self->{_orderNum} = $self->{_uri} = undef;
72 0           $self->{_categories} = [];
73              
74 0           $address =~ s|.*?://||; # loose scheme
75 0           $address =~ s|.*?(:.*?)?@||; # loose authentication
76 0           $address =~ s|(\w):\d+|$1|; # loose port
77 0           $address =~ s|\?.*||; # loose query
78 0           $address =~ s|/$||; # loose trailing slash
79              
80 0           my $resp = $self->{ua}->get( 'http://yaca.yandex.ru/yca/cy/ch/' . $address . '/' );
81 0 0         return unless $resp->is_success;
82            
83 0           my $contents = $resp->decoded_content;
84            
85 0 0         if( $contents =~ /

/ ) {

86             # "ресурс не описан в Яндекс.Каталоге"
87             # It's not in the catalog, but tIC is always displayed.
88             # Ex.: Индекс цитирования (тИЦ) ресурса — 10
89 0           ( $self->{_tic} ) = $contents =~ /

.*?\s(\d+)/s;

90 0 0         $self->{_tic} = 0 unless defined $self->{_tic};
91             }
92             else {
93 0           my( $entry ) = $contents =~ qr{(
)}s;
94            
95 0           ( $self->{_orderNum}, $self->{_uri}, $self->{_shortDescr}, undef, $self->{_longDescr}, $self->{_tic} ) =
96             # $1 $2 $3 $4 $5
97             $entry =~ qr{(\d+)\.\s*
98              
99             # main catalog
100 0           my( $path, $rubric ) = $contents =~ qr{
(.*?)
\s*(.*?)}s;
101 0 0         if( $path ) {
102 0           $path =~ s{||\n}{}gs; # remove A, H1 tags and newline
103 0           $path =~ s|\x{0420}\x{0443}\x{0431}\x{0440}\x{0438}\x{043A}\x{0438} / ||;
104             # removed "Рубрики" - it always starts with this root word
105             # http://www.rishida.net/tools/conversion/
106 0 0         push( @{$self->{_categories}}, $path.' / '.$rubric ) if $entry;
  0            
107             }
108              
109             # additional catalogs
110 0           ( $entry ) = $contents =~ qr{}s;
111 0 0         if( $entry ) {
112 0           while( $entry =~ s{(.*?)

.*?(
113 0           my $catPath = $1;
114 0           $catPath =~ s|\x{041A}\x{0430}\x{0442}\x{0430}\x{043B}\x{043E}\x{0433} / ||;
115             # removed "Каталог" - we know it's in the catalog
116 0 0         push( @{$self->{_categories}}, $catPath ) if $catPath;
  0            
117             }
118             }
119             }
120              
121 0           return [ $self->{_tic}, $self->{_shortDescr}, $self->{_longDescr}, $self->{_categories}, $self->{_uri}, $self->{_orderNum} ];
122             }
123              
124             # Converts punycode in a IDN URL to utf8.
125             # Returns converted URL.
126             sub _punycode_utf8 {
127 0     0     my $uri = shift;
128            
129 0           s/^\s+//, s/\s+$// for $uri; # trim $uri just in case
130 0           my( $schema, $domain, $path ) = $uri =~ m{(http.*?//)(.*?)(($|/|:).*)};
131             # Ex.: http://www.domain.com:80/path?query#anchor -> 'http://' , 'www.domain.com' , ':80/path?query#anchor'
132             # I hope there are no urls with username/password links in YaCa
133             # I hope there are no non-http(s) links in YaCa
134             # I hope all links include schema part
135             # Anyway, from what I've seen in YaCa so far we should be Ok
136            
137 0 0         $domain = join( '.', map { /^xn--/ ? puny_dec($_) : $_ } split(/\./, $domain) );
  0            
138             # split by dot -> convert only punycode parts -> glue 'em back together
139              
140 0           return $schema.$domain.$path;
141             }
142              
143             # == Convenience functions =================================
144              
145             sub is_in_catalog {
146 0     0 1   my $self = shift;
147 0 0         return scalar @{$self->{_categories}} > 0 ? 1 : 0;
  0            
148             }
149              
150             sub tic {
151 0     0 1   my $self = shift;
152 0           return $self->{_tic};
153             }
154              
155             sub short_description {
156 0     0 1   my $self = shift;
157 0           return $self->{_shortDescr};
158             }
159              
160             sub long_description {
161 0     0 1   my $self = shift;
162 0           return $self->{_longDescr};
163             }
164              
165             sub categories {
166 0     0 1   my $self = shift;
167 0           return $self->{_categories};
168             }
169              
170             sub order_number {
171 0     0 1   my $self = shift;
172 0           return $self->{_orderNum};
173             }
174              
175             sub uri {
176 0     0 1   my $self = shift;
177 0           return $self->{_uri};
178             }
179              
180             sub uri_utf8 {
181 0     0 1   my $self = shift;
182 0 0         return $HAS_PUNYMOD ? _punycode_utf8( $self->{_uri} ) : $self->{_uri};
183             }
184              
185             1;
186              
187             __END__