File Coverage

blib/lib/WWW/Yandex/Catalog/LookupSite.pm
Criterion Covered Total %
statement 14 78 17.9
branch 0 20 0.0
condition 0 2 0.0
subroutine 6 17 35.2
pod 10 10 100.0
total 30 127 23.6


\s*.*(.*)(
(.*)
.*?)?(\d+)<}s;
line stmt bran cond sub pod time code
1             package WWW::Yandex::Catalog::LookupSite;
2              
3             # ABSTRACT: DEPRECATED
4              
5             # Last updated July 27, 2014
6             #
7             # Author: Irakliy Sunguryan ( www.sochi-travel.info )
8             # Date Created: January 30, 2010
9              
10 1     1   76093 use strict;
  1         2  
  1         29  
11 1     1   10 use warnings;
  1         2  
  1         27  
12              
13 1     1   5 use vars qw($VERSION);
  1         3  
  1         65  
14             $VERSION = '0.12';
15              
16 1     1   787 use LWP::UserAgent;
  1         57776  
  1         39  
17              
18             my $HAS_PUNYMOD;
19 1     1   65 BEGIN { $HAS_PUNYMOD = eval 'use URI::UTF8::Punycode; 1;'; }
  1     1   1380  
  0            
  0            
20              
21              
22             sub new {
23 0     0 1   my $class = shift;
24 0           my %options = @_;
25              
26 0           my $self = {
27             _tic => undef,
28             # undef - if there was an error getting or parsing data
29             # 0 - (a) when site is not present in catalog and tIC is < 10
30             # - (b) when site is present in catalog, but the catalog
31             # reports it as zero (payed submission)
32             _shortDescr => undef,
33             # defined only when site is present in catalog; undef otherwise
34             _longDescr => undef,
35             # can be undef when site is present in catalog!
36             # not all sites in the catalog have long description
37             _categories => [],
38             # empty when site is not present in catalog
39             # at least one entry when present in catalog
40             _orderNum => undef,
41             # order number in the sub-category of catalog; "main" subcategory,
42             # when there are more than one.
43             # defined only when site is present in the catalog; undef otherwise
44             _uri => undef,
45             # URI as it is recorded in catalog. for example with/without 'www' prefix
46             # or it can be recorded with totally different address (narod.ru -> narod.yandex.ru)
47             # defined only when site is present in catalog; undef otherwise
48             };
49            
50 0           $self->{ua} = LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION );
51              
52             # Pass options on to LWP::UserAgent
53 0           foreach my $option ( keys %options ) {
54 0           $self->{ua}->$option( $options{$option} );
55             }
56            
57 0           bless $self, $class;
58 0           return $self;
59             }
60              
61              
62             # Returns [ tIC, short description, long description, [list of catalogs], URI as returned by Yaca, order number in the main category ]
63             # "yaca" - Yandex Catalog
64             sub yaca_lookup {
65 0     0 1   my $self = shift;
66              
67 0   0       my $address = shift || return;
68              
69             # an $address is nomally a domain name (whatever level), but can include path too.
70             # scheme, authentication, port, and query strings are stripped --
71             # assuming Yandex won't accept URIs that contain all this
72              
73 0           $self->{_tic} = $self->{_shortDescr} = $self->{_longDescr} = $self->{_orderNum} = $self->{_uri} = undef;
74 0           $self->{_categories} = [];
75              
76 0           $address =~ s|.*?://||; # loose scheme
77 0           $address =~ s|.*?(:.*?)?@||; # loose authentication
78 0           $address =~ s|(\w):\d+|$1|; # loose port
79 0           $address =~ s|\?.*||; # loose query
80 0           $address =~ s|/$||; # loose trailing slash
81              
82 0           my $resp = $self->{ua}->get( 'http://yaca.yandex.ru/yca/cy/ch/' . $address . '/' );
83 0 0         return unless $resp->is_success;
84            
85 0           my $contents = $resp->decoded_content;
86            
87 0 0         if( $contents =~ /

/ ) {

88             # "ресурс не описан в Яндекс.Каталоге"
89             # It's not in the catalog, but tIC is always displayed.
90             # Ex.: Индекс цитирования (тИЦ) ресурса — 10
91 0           ( $self->{_tic} ) = $contents =~ /

.*?\s(\d+)/s;

92 0 0         $self->{_tic} = 0 unless defined $self->{_tic};
93             }
94             else {
95 0           my( $entry ) = $contents =~ qr{(
)}s;
96            
97 0           ( $self->{_orderNum}, $self->{_uri}, $self->{_shortDescr}, undef, $self->{_longDescr}, $self->{_tic} ) =
98             # $1 $2 $3 $4 $5
99             $entry =~ qr{(\d+)\.\s*
100              
101             # main catalog
102 0           my( $path, $rubric ) = $contents =~ qr{
(.*?)
\s*(.*?)}s;
103 0 0         if( $path ) {
104 0           $path =~ s{||\n}{}gs; # remove A, H1 tags and newline
105 0           $path =~ s|\x{0420}\x{0443}\x{0431}\x{0440}\x{0438}\x{043A}\x{0438} / ||;
106             # removed "Рубрики" - it always starts with this root word
107             # http://www.rishida.net/tools/conversion/
108 0 0         push( @{$self->{_categories}}, $path.' / '.$rubric ) if $entry;
  0            
109             }
110              
111             # additional catalogs
112 0           ( $entry ) = $contents =~ qr{}s;
113 0 0         if( $entry ) {
114 0           while( $entry =~ s{(.*?)

.*?(
115 0           my $catPath = $1;
116 0           $catPath =~ s|\x{041A}\x{0430}\x{0442}\x{0430}\x{043B}\x{043E}\x{0433} / ||;
117             # removed "Каталог" - we know it's in the catalog
118 0 0         push( @{$self->{_categories}}, $catPath ) if $catPath;
  0            
119             }
120             }
121             }
122              
123 0           return [ $self->{_tic}, $self->{_shortDescr}, $self->{_longDescr}, $self->{_categories}, $self->{_uri}, $self->{_orderNum} ];
124             }
125              
126             # Converts punycode in a IDN URL to utf8.
127             # Returns converted URL.
128             sub _punycode_utf8 {
129 0     0     my $uri = shift;
130            
131 0           s/^\s+//, s/\s+$// for $uri; # trim $uri just in case
132 0           my( $schema, $domain, $path ) = $uri =~ m{(http.*?//)(.*?)(($|/|:).*)};
133             # Ex.: http://www.domain.com:80/path?query#anchor -> 'http://' , 'www.domain.com' , ':80/path?query#anchor'
134             # I hope there are no urls with username/password links in YaCa
135             # I hope there are no non-http(s) links in YaCa
136             # I hope all links include schema part
137             # Anyway, from what I've seen in YaCa so far we should be Ok
138            
139 0 0         $domain = join( '.', map { /^xn--/ ? puny_dec($_) : $_ } split(/\./, $domain) );
  0            
140             # split by dot -> convert only punycode parts -> glue 'em back together
141              
142 0           return $schema.$domain.$path;
143             }
144              
145             # == Convenience functions =================================
146              
147             sub is_in_catalog {
148 0     0 1   my $self = shift;
149 0 0         return scalar @{$self->{_categories}} > 0 ? 1 : 0;
  0            
150             }
151              
152             sub tic {
153 0     0 1   my $self = shift;
154 0           return $self->{_tic};
155             }
156              
157             sub short_description {
158 0     0 1   my $self = shift;
159 0           return $self->{_shortDescr};
160             }
161              
162             sub long_description {
163 0     0 1   my $self = shift;
164 0           return $self->{_longDescr};
165             }
166              
167             sub categories {
168 0     0 1   my $self = shift;
169 0           return $self->{_categories};
170             }
171              
172             sub order_number {
173 0     0 1   my $self = shift;
174 0           return $self->{_orderNum};
175             }
176              
177             sub uri {
178 0     0 1   my $self = shift;
179 0           return $self->{_uri};
180             }
181              
182             sub uri_utf8 {
183 0     0 1   my $self = shift;
184 0 0         return $HAS_PUNYMOD ? _punycode_utf8( $self->{_uri} ) : $self->{_uri};
185             }
186              
187             1;
188              
189             __END__