blib/lib/WWW/Yandex/Catalog/LookupSite.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 14 | 78 | 17.9 |
branch | 0 | 20 | 0.0 |
condition | 0 | 2 | 0.0 |
subroutine | 6 | 17 | 35.2 |
pod | 10 | 10 | 100.0 |
total | 30 | 127 | 23.6 |
line | stmt | bran | cond | sub | pod | time | code | |
---|---|---|---|---|---|---|---|---|
1 | package WWW::Yandex::Catalog::LookupSite; | |||||||
2 | ||||||||
3 | # ABSTRACT: DEPRECATED | |||||||
4 | ||||||||
5 | # Last updated July 27, 2014 | |||||||
6 | # | |||||||
7 | # Author: Irakliy Sunguryan ( www.sochi-travel.info ) | |||||||
8 | # Date Created: January 30, 2010 | |||||||
9 | ||||||||
10 | 1 | 1 | 76093 | use strict; | ||||
1 | 2 | |||||||
1 | 29 | |||||||
11 | 1 | 1 | 10 | use warnings; | ||||
1 | 2 | |||||||
1 | 27 | |||||||
12 | ||||||||
13 | 1 | 1 | 5 | use vars qw($VERSION); | ||||
1 | 3 | |||||||
1 | 65 | |||||||
14 | $VERSION = '0.12'; | |||||||
15 | ||||||||
16 | 1 | 1 | 787 | use LWP::UserAgent; | ||||
1 | 57776 | |||||||
1 | 39 | |||||||
17 | ||||||||
18 | my $HAS_PUNYMOD; | |||||||
19 | 1 | 1 | 65 | BEGIN { $HAS_PUNYMOD = eval 'use URI::UTF8::Punycode; 1;'; } | ||||
1 | 1 | 1380 | ||||||
0 | ||||||||
0 | ||||||||
20 | ||||||||
21 | ||||||||
22 | sub new { | |||||||
23 | 0 | 0 | 1 | my $class = shift; | ||||
24 | 0 | my %options = @_; | ||||||
25 | ||||||||
26 | 0 | my $self = { | ||||||
27 | _tic => undef, | |||||||
28 | # undef - if there was an error getting or parsing data | |||||||
29 | # 0 - (a) when site is not present in catalog and tIC is < 10 | |||||||
30 | # - (b) when site is present in catalog, but the catalog | |||||||
31 | # reports it as zero (payed submission) | |||||||
32 | _shortDescr => undef, | |||||||
33 | # defined only when site is present in catalog; undef otherwise | |||||||
34 | _longDescr => undef, | |||||||
35 | # can be undef when site is present in catalog! | |||||||
36 | # not all sites in the catalog have long description | |||||||
37 | _categories => [], | |||||||
38 | # empty when site is not present in catalog | |||||||
39 | # at least one entry when present in catalog | |||||||
40 | _orderNum => undef, | |||||||
41 | # order number in the sub-category of catalog; "main" subcategory, | |||||||
42 | # when there are more than one. | |||||||
43 | # defined only when site is present in the catalog; undef otherwise | |||||||
44 | _uri => undef, | |||||||
45 | # URI as it is recorded in catalog. for example with/without 'www' prefix | |||||||
46 | # or it can be recorded with totally different address (narod.ru -> narod.yandex.ru) | |||||||
47 | # defined only when site is present in catalog; undef otherwise | |||||||
48 | }; | |||||||
49 | ||||||||
50 | 0 | $self->{ua} = LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION ); | ||||||
51 | ||||||||
52 | # Pass options on to LWP::UserAgent | |||||||
53 | 0 | foreach my $option ( keys %options ) { | ||||||
54 | 0 | $self->{ua}->$option( $options{$option} ); | ||||||
55 | } | |||||||
56 | ||||||||
57 | 0 | bless $self, $class; | ||||||
58 | 0 | return $self; | ||||||
59 | } | |||||||
60 | ||||||||
61 | ||||||||
62 | # Returns [ tIC, short description, long description, [list of catalogs], URI as returned by Yaca, order number in the main category ] | |||||||
63 | # "yaca" - Yandex Catalog | |||||||
64 | sub yaca_lookup { | |||||||
65 | 0 | 0 | 1 | my $self = shift; | ||||
66 | ||||||||
67 | 0 | 0 | my $address = shift || return; | |||||
68 | ||||||||
69 | # an $address is nomally a domain name (whatever level), but can include path too. | |||||||
70 | # scheme, authentication, port, and query strings are stripped -- | |||||||
71 | # assuming Yandex won't accept URIs that contain all this | |||||||
72 | ||||||||
73 | 0 | $self->{_tic} = $self->{_shortDescr} = $self->{_longDescr} = $self->{_orderNum} = $self->{_uri} = undef; | ||||||
74 | 0 | $self->{_categories} = []; | ||||||
75 | ||||||||
76 | 0 | $address =~ s|.*?://||; # loose scheme | ||||||
77 | 0 | $address =~ s|.*?(:.*?)?@||; # loose authentication | ||||||
78 | 0 | $address =~ s|(\w):\d+|$1|; # loose port | ||||||
79 | 0 | $address =~ s|\?.*||; # loose query | ||||||
80 | 0 | $address =~ s|/$||; # loose trailing slash | ||||||
81 | ||||||||
82 | 0 | my $resp = $self->{ua}->get( 'http://yaca.yandex.ru/yca/cy/ch/' . $address . '/' ); | ||||||
83 | 0 | 0 | return unless $resp->is_success; | |||||
84 | ||||||||
85 | 0 | my $contents = $resp->decoded_content; | ||||||
86 | ||||||||
87 | 0 | 0 | if( $contents =~ / / ) { |
|||||
88 | # "ресурс не описан в Яндекс.Каталоге" | |||||||
89 | # It's not in the catalog, but tIC is always displayed. | |||||||
90 | # Ex.: Индекс цитирования (тИЦ) ресурса — 10 | |||||||
91 | 0 | ( $self->{_tic} ) = $contents =~ / .*?\s(\d+)/s; |
||||||
92 | 0 | 0 | $self->{_tic} = 0 unless defined $self->{_tic}; | |||||
93 | } | |||||||
94 | else { | |||||||
95 | 0 | my( $entry ) = $contents =~ qr{( | ||||||
96 | ||||||||
97 | 0 | ( $self->{_orderNum}, $self->{_uri}, $self->{_shortDescr}, undef, $self->{_longDescr}, $self->{_tic} ) = | ||||||
98 | # $1 $2 $3 $4 $5 | |||||||
99 | $entry =~ qr{ | (\d+)\.\s* | .*(.*)(||||||
100 | ||||||||
101 | # main catalog | |||||||
102 | 0 | my( $path, $rubric ) = $contents =~ qr{ (.*?) \s* |
||||||
103 | 0 | 0 | if( $path ) { | |||||
104 | 0 | $path =~ s{?a.*?>|?h1>|\n}{}gs; # remove A, H1 tags and newline | ||||||
105 | 0 | $path =~ s|\x{0420}\x{0443}\x{0431}\x{0440}\x{0438}\x{043A}\x{0438} / ||; | ||||||
106 | # removed "Рубрики" - it always starts with this root word | |||||||
107 | # http://www.rishida.net/tools/conversion/ | |||||||
108 | 0 | 0 | push( @{$self->{_categories}}, $path.' / '.$rubric ) if $entry; | |||||
0 | ||||||||
109 | } | |||||||
110 | ||||||||
111 | # additional catalogs | |||||||
112 | 0 | ( $entry ) = $contents =~ qr{ (.*?) }s; |
||||||
113 | 0 | 0 | if( $entry ) { | |||||
114 | 0 | while( $entry =~ s{ | ||||||
115 | 0 | my $catPath = $1; | ||||||
116 | 0 | $catPath =~ s|\x{041A}\x{0430}\x{0442}\x{0430}\x{043B}\x{043E}\x{0433} / ||; | ||||||
117 | # removed "Каталог" - we know it's in the catalog | |||||||
118 | 0 | 0 | push( @{$self->{_categories}}, $catPath ) if $catPath; | |||||
0 | ||||||||
119 | } | |||||||
120 | } | |||||||
121 | } | |||||||
122 | ||||||||
123 | 0 | return [ $self->{_tic}, $self->{_shortDescr}, $self->{_longDescr}, $self->{_categories}, $self->{_uri}, $self->{_orderNum} ]; | ||||||
124 | } | |||||||
125 | ||||||||
126 | # Converts punycode in a IDN URL to utf8. | |||||||
127 | # Returns converted URL. | |||||||
128 | sub _punycode_utf8 { | |||||||
129 | 0 | 0 | my $uri = shift; | |||||
130 | ||||||||
131 | 0 | s/^\s+//, s/\s+$// for $uri; # trim $uri just in case | ||||||
132 | 0 | my( $schema, $domain, $path ) = $uri =~ m{(http.*?//)(.*?)(($|/|:).*)}; | ||||||
133 | # Ex.: http://www.domain.com:80/path?query#anchor -> 'http://' , 'www.domain.com' , ':80/path?query#anchor' | |||||||
134 | # I hope there are no urls with username/password links in YaCa | |||||||
135 | # I hope there are no non-http(s) links in YaCa | |||||||
136 | # I hope all links include schema part | |||||||
137 | # Anyway, from what I've seen in YaCa so far we should be Ok | |||||||
138 | ||||||||
139 | 0 | 0 | $domain = join( '.', map { /^xn--/ ? puny_dec($_) : $_ } split(/\./, $domain) ); | |||||
0 | ||||||||
140 | # split by dot -> convert only punycode parts -> glue 'em back together | |||||||
141 | ||||||||
142 | 0 | return $schema.$domain.$path; | ||||||
143 | } | |||||||
144 | ||||||||
145 | # == Convenience functions ================================= | |||||||
146 | ||||||||
147 | sub is_in_catalog { | |||||||
148 | 0 | 0 | 1 | my $self = shift; | ||||
149 | 0 | 0 | return scalar @{$self->{_categories}} > 0 ? 1 : 0; | |||||
0 | ||||||||
150 | } | |||||||
151 | ||||||||
152 | sub tic { | |||||||
153 | 0 | 0 | 1 | my $self = shift; | ||||
154 | 0 | return $self->{_tic}; | ||||||
155 | } | |||||||
156 | ||||||||
157 | sub short_description { | |||||||
158 | 0 | 0 | 1 | my $self = shift; | ||||
159 | 0 | return $self->{_shortDescr}; | ||||||
160 | } | |||||||
161 | ||||||||
162 | sub long_description { | |||||||
163 | 0 | 0 | 1 | my $self = shift; | ||||
164 | 0 | return $self->{_longDescr}; | ||||||
165 | } | |||||||
166 | ||||||||
167 | sub categories { | |||||||
168 | 0 | 0 | 1 | my $self = shift; | ||||
169 | 0 | return $self->{_categories}; | ||||||
170 | } | |||||||
171 | ||||||||
172 | sub order_number { | |||||||
173 | 0 | 0 | 1 | my $self = shift; | ||||
174 | 0 | return $self->{_orderNum}; | ||||||
175 | } | |||||||
176 | ||||||||
177 | sub uri { | |||||||
178 | 0 | 0 | 1 | my $self = shift; | ||||
179 | 0 | return $self->{_uri}; | ||||||
180 | } | |||||||
181 | ||||||||
182 | sub uri_utf8 { | |||||||
183 | 0 | 0 | 1 | my $self = shift; | ||||
184 | 0 | 0 | return $HAS_PUNYMOD ? _punycode_utf8( $self->{_uri} ) : $self->{_uri}; | |||||
185 | } | |||||||
186 | ||||||||
187 | 1; | |||||||
188 | ||||||||
189 | __END__ |