File Coverage

blib/lib/WWW/Google/WebmasterTools/Download.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package WWW::Google::WebmasterTools::Download;
2				$WWW::Google::WebmasterTools::Download::VERSION = '0.003';
3	1	1	629	use strict;
	1		2
	1		34
4	1	1	4	use warnings;
	1		3
	1		28
5
6	1	1	688	use namespace::autoclean;
	1		15468
	1		6
7
8	1	1	678	use Moose;
	1		3816917
	1		7
9
10	1	1	366784	use LWP::UserAgent;
	1		74661
	1		33
11	1	1	525	use HTTP::Cookies;
	1		5625
	1		27
12	1	1	4	use URI;
	1		2
	1		18
13
14	1	1	182	use XML::Twig;
	0
	0
15				use Text::CSV;
16				use JSON::MaybeXS;
17
18				use Path::Tiny;
19
# Google endpoints used by this module, keyed by purpose:
#   LOGIN     - posted to by _login to obtain an auth token
#   SITES     - fetched by get_sites to list the account's sites
#   DOWNLOADS - fetched per site by _cache_download_uris
# Each value is a URI object built from the literal address.
my %ROOT_URIs = map { $_->[0] => URI->new($_->[1]) } (
    [ LOGIN     => 'https://www.google.com/accounts/ClientLogin' ],
    [ SITES     => 'https://www.google.com/webmasters/tools/feeds/sites/' ],
    [ DOWNLOADS => 'https://www.google.com/webmasters/tools/downloads-list' ],
);
25
# Google account e-mail address; sent as 'Email' when logging in.
has 'email' => (
    is  => 'ro',
    isa => 'Str',
);

# Google account password; sent as 'Passwd' when logging in.
has 'password' => (
    is  => 'ro',
    isa => 'Str',
);

# Language code passed as the 'hl' query parameter when listing downloads.
has 'language' => (
    is      => 'ro',
    isa     => 'Str',
    default => 'en',
);

# Only sites matching this pattern are returned by get_sites. The default
# matches any site name containing at least one character.
has 'sites_regex' => (
    is      => 'ro',
    isa     => 'Regexp',
    default => sub { qr/./ },
);

# HTTP client used for every request. The default keeps cookies in memory.
# The agent string deliberately ends in a space: LWP::UserAgent documents
# that it then appends its own default "libwww-perl/#.###" identifier.
has 'ua' => (
    is      => 'ro',
    isa     => 'LWP::UserAgent',
    default => sub {
        LWP::UserAgent->new(
            cookie_jar => HTTP::Cookies->new,
            agent      => 'WWW::Google::WebmasterTools::Download ',
        );
    },
);

# Two-level cache filled by _cache_download_uris:
#   $cache->{$site}{$report} = URI object for that report's download.
# The constraint is nested to match that shape; the previous 'HashRef[URI]'
# described a flat site => URI map, which is not what gets stored.
has 'site_report_uri_cache' => (
    is      => 'ro',
    isa     => 'HashRef[HashRef[URI]]',
    default => sub { {} },
);
64
# Moose construction hook: authenticate first, then pre-load the per-site
# report download URIs. Returns nothing.
sub BUILD {
    my ($self) = @_;

    # Order matters: the download lists are fetched with the credentials
    # installed by the login step.
    for my $step (qw(_login _cache_download_uris)) {
        $self->$step;
    }

    return;
}
73
# Authenticate against the LOGIN endpoint and install the resulting token
# as default request headers on the user agent.
#
# Posts the account e-mail and password, extracts the value of the "Auth="
# line from the response body, and sets the 'Authorization' and
# 'GData-Version' default headers so later requests are authenticated.
#
# Dies if the HTTP request fails or if no Auth token is found in the body.
# Returns nothing.
sub _login {
    my $self = shift;

    my %post_data = (
        accountType => 'HOSTED_OR_GOOGLE',
        Email       => $self->email,
        Passwd      => $self->password,
        service     => 'sitemaps',
        source      => 'WWW::Google::WebmasterTools::Download',
    );

    my $response = $self->ua->post($ROOT_URIs{'LOGIN'}, \%post_data);

    if (!$response->is_success) {
        die sprintf(
            "Failed to log in as '%s': %s\n",
            $self->email,
            $response->status_line
        );
    }

    # Capture only the token on the Auth= line. A greedy ".+" under /s would
    # also swallow the line terminator and anything following it in the body,
    # which would then end up inside the Authorization header.
    my ($auth) = $response->content =~ m{ ^ Auth= (\S+) }xm;

    if (!defined $auth) {
        die sprintf(
            "Failed to get auth token as '%s' from response content '%s'\n",
            $self->email,
            $response->content
        );
    }

    $self->ua->default_headers(
        HTTP::Headers->new(
            'Authorization' => sprintf('GoogleLogin auth=%s', $auth),
            'GData-Version' => 2,
        )
    );

    return;
}
116
# Fill site_report_uri_cache with a download URI for every report offered
# for each site returned by get_sites.
#
# For each site the DOWNLOADS endpoint is queried with the configured
# language and the site URL. The response body is decoded as JSON and treated
# as a map of report name to URI path; each path is made absolute against
# https://www.google.com and extended with prop=ALL and more=true.
#
# A site whose list cannot be fetched, or whose response is not a JSON
# object, is reported with a warning and skipped, so one bad site does not
# abort construction. Returns nothing.
sub _cache_download_uris {
    my $self = shift;

    my $site_report_uri_cache = $self->site_report_uri_cache;

    SITE:
    for my $site ($self->get_sites) {
        my $site_download_list_uri = $ROOT_URIs{'DOWNLOADS'}->clone;
        $site_download_list_uri->query_form(
            hl      => $self->language,
            siteUrl => $site
        );

        my $response = $self->ua->get($site_download_list_uri);
        if (!$response->is_success) {
            warn sprintf(
                "Failed to get download uris for site '%s': %s\n",
                $site,
                $response->status_line
            );
            next SITE;
        }

        # decode_json dies on malformed input; trap that so a bad payload is
        # handled like a failed request instead of killing the constructor.
        my $data = eval { decode_json($response->content) };
        if (ref $data ne 'HASH') {
            my $reason = $@ || 'response is not a JSON object';
            chomp $reason;
            warn sprintf(
                "Failed to parse download uris for site '%s': %s\n",
                $site,
                $reason
            );
            next SITE;
        }

        for my $report (keys %$data) {
            my $report_uri = URI->new('https://www.google.com' . $data->{$report});

            # Keep whatever query parameters the path already carried and
            # append the two extra ones.
            $report_uri->query_form(
                $report_uri->query_form,
                prop => 'ALL',
                more => 'true',
            );

            $site_report_uri_cache->{$site}{$report} = $report_uri;
        }
    }

    return;
}
158
# Return the sites available to the logged-in account, filtered by
# sites_regex and sorted as strings.
#
# Fetches the SITES feed and collects the text of every /feed/entry/title
# element. Dies if the HTTP request fails.
#
# In list context the sorted site names are returned; in scalar context the
# number of matching sites is returned.
sub get_sites {
    my $self = shift;

    my $response = $self->ua->get($ROOT_URIs{'SITES'});

    if (!$response->is_success) {
        die sprintf(
            "Failed to get sites as '%s': %s\n",
            $self->email,
            $response->status_line
        );
    }

    my @sites;

    my $twig = XML::Twig->new(
        twig_handlers => {
            '/feed/entry/title' => sub { push @sites, $_->text_only },
        },
    );
    $twig->parse($response->content);

    # Read the pattern once rather than calling the accessor per site.
    my $sites_regex = $self->sites_regex;

    # Sort into an array before returning: "return sort ..." has undefined
    # behaviour when the caller imposes scalar context.
    my @matching_sites = sort grep { $_ =~ $sites_regex } @sites;

    return @matching_sites;
}
183
# Public data accessors. Each one forwards its named arguments to
# _get_json_data and appends the report name it stands for; a caller-supplied
# 'report' is therefore overridden by the appended pair.

sub get_top_pages_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'TOP_PAGES');
}

sub get_top_queries_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'TOP_QUERIES');
}

sub get_crawl_errors_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'CRAWL_ERRORS');
}

sub get_content_errors_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'CONTENT_ERRORS');
}

sub get_content_keywords_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'CONTENT_KEYWORDS');
}

sub get_latest_backlinks_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'LATEST_BACKLINKS');
}

sub get_internal_links_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'INTERNAL_LINKS');
}

sub get_external_links_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'EXTERNAL_LINKS');
}

sub get_social_activity_data {
    my ($self, @args) = @_;
    return $self->_get_json_data(@args, report => 'SOCIAL_ACTIVITY');
}
193
# Public CSV writers. Each one forwards its named arguments to
# _save_csv_data and appends the report name it stands for; a caller-supplied
# 'report' is therefore overridden by the appended pair.

sub save_top_pages_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'TOP_PAGES');
}

sub save_top_queries_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'TOP_QUERIES');
}

sub save_crawl_errors_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'CRAWL_ERRORS');
}

sub save_content_errors_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'CONTENT_ERRORS');
}

sub save_content_keywords_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'CONTENT_KEYWORDS');
}

sub save_latest_backlinks_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'LATEST_BACKLINKS');
}

sub save_internal_links_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'INTERNAL_LINKS');
}

sub save_external_links_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'EXTERNAL_LINKS');
}

sub save_social_activity_as_csv {
    my ($self, @args) = @_;
    return $self->_save_csv_data(@args, report => 'SOCIAL_ACTIVITY');
}
203
# Download one report for one site and return the response body.
#
# Named parameters:
#   website - site whose report is wanted (required)
#   report  - report name as used in site_report_uri_cache (required)
#
# Dies if either parameter is missing, if no URI is cached for the
# site/report pair, or if the HTTP request is unsuccessful.
sub _get_csv_data {
    my ($self, %params) = @_;

    my $website = $params{'website'};
    die "Missing required parameter 'website'" unless defined $website;

    my $report = $params{'report'};
    die "Missing required parameter 'report'" unless defined $report;

    # The URI must have been discovered when the cache was filled.
    my $uri = $self->site_report_uri_cache->{$website}{$report}
        or die sprintf(
            "Don't know the URL for site '%s' report '%s'\n",
            $website,
            $report
        );

    my $response = $self->ua->get($uri);

    return $response->content if $response->is_success;

    die sprintf(
        "Failed to get data for site '%s' report '%s': %s\n",
        $website, $report,
        $response->status_line
    );
}
232
# Download one report and write it to a file.
#
# Named parameters:
#   filename - path of the file to write (required)
# Every other parameter is passed on unchanged to _get_csv_data.
#
# Dies if 'filename' is missing; failures from _get_csv_data propagate.
# Returns nothing.
sub _save_csv_data {
    my ($self, %params) = @_;

    # Remove 'filename' so only the download parameters are forwarded.
    my $filename = delete $params{'filename'};
    die "Missing required parameter 'filename'" if !defined $filename;

    # Fetch before touching the filesystem.
    my $csv_data = $self->_get_csv_data(%params);

    path($filename)->spew($csv_data);

    return;
}
246
# Download one report and return it as a table.
#
# Despite the name, no JSON is involved: the arguments are handed to
# _get_csv_data and the CSV text it returns is parsed with Text::CSV.
#
# Returns a list of array references, one per CSV record, each holding that
# record's fields. Dies if the CSV cannot be parsed; failures from
# _get_csv_data propagate.
sub _get_json_data {
    my $self = shift;

    my $csv_data = $self->_get_csv_data(@_);

    my $csv_parser = Text::CSV->new({ binary => 1 });

    # Let the parser find record boundaries through an in-memory filehandle.
    # Splitting the text on "\n" first would cut quoted fields that contain
    # a newline in half and leave a stray "\r" on CRLF-terminated lines.
    open my $csv_fh, '<', \$csv_data
        or die "Failed to open CSV data for reading: $!\n";

    my @data;
    while (my $row = $csv_parser->getline($csv_fh)) {
        push @data, $row;
    }

    # getline returns undef at end of input and on a parse error alike;
    # eof tells the two apart.
    if (!$csv_parser->eof) {
        my ($code, $message) = $csv_parser->error_diag;
        die sprintf("Failed to parse CSV data: %s (%s)\n", $message, $code);
    }

    close $csv_fh;

    return @data;
}
263
264				__PACKAGE__->meta->make_immutable;
265
266				1;
267
268				__END__
269
270				=head1 NAME
271
272				WWW::Google::WebmasterTools::Download - Extract data from Google Webmaster Tools
273
274				=head1 VERSION
275
276				version 0.003
277
278				=head1 DESCRIPTION
279
280				This distribution is a rip-off of Stephan Schmitz's
281				php-webmaster-tools-downloads library which can be found on Github at
282				L<https://github.com/eyecatchup/php-webmaster-tools-downloads>.
283
284				This project provides an easy way to automate downloading of data tables from
285				Google Webmaster Tools and storing the results in CSV files.
286
287				It performs these actions essentially by scraping Google Webmaster Tools,
288				because the GWT API does not provide full access to all the data desired.
289
290				It is necessary because GWT only shows you data for the last three months, so
291				if you want to track your website for longer than that you have to store the
292				data separately yourself.
293
294				=head1 SYNOPSIS
295
296				use WWW::Google::WebmasterTools::Download;
297
298				my $gdata = WWW::Google::WebmasterTools::Download->new(
299				email => 'example@gmail.com',
300				password => 'correct horse battery staple',
301				);
302
303				my @data = $gdata->get_content_keywords_data(
304				website => 'http://www.example.org',
305				);
306
307				$gdata->save_top_queries_as_csv(
308				website => 'http://www.example.org',
309				filename => 'top_queries_data.csv'
310				);
311
312				=head1 CONSTRUCTOR
313
314				=head2 new
315
316				Takes an email and password and returns an object with methods to access data
317				from Google Webmaster Tools.
318
319				my $gdata = WWW::Google::WebmasterTools::Download->new(
320				email => 'example@gmail.com',
321				password => 'correct horse battery staple',
322				);
323
324				Immediately logs in and pre-caches the report download URLs for every matching
325				site, which can be slow if you have many sites or a slow internet connection.
326
327				Optionally takes a regular expression for filtering on which sites you are
328				interested in; a language (ISO 639-1 2-letter language code); and a user agent.
329
330				my $gdata = WWW::Google::WebmasterTools::Download->new(
331				email => 'example@gmail.com',
332				password => 'correct horse battery staple',
333
334				sites_regex => qr/example/,
335				language => 'de',
336				ua => LWP::UserAgent->new(agent => "My Agent Name"),
337				);
338
339				The default sites regex matches all sites, the default language is 'en', and
340				the default user agent string is WWW::Google::WebmasterTools::Download followed by the libwww-perl version.
341
342
343				=head1 SITE METHODS
344
345				=head2 get_sites
346
347				Returns a list of sites available for the user. Obeys the sites_regex parameter
348				passed to new().
349
350				my @sites = $gdata->get_sites;
351
352				=head1 DATA METHODS
353
354				Each of these takes a website and returns an array of arrayrefs representing a
355				table of data.
356
357				my @data = $gdata->get_top_pages_data(
358				website => 'http://www.example.org'
359				);
360
361				=head2 get_top_pages_data
362
363				=head2 get_top_queries_data
364
365				=head2 get_crawl_errors_data
366
367				=head2 get_content_errors_data
368
369				=head2 get_content_keywords_data
370
371				=head2 get_latest_backlinks_data
372
373				=head2 get_internal_links_data
374
375				=head2 get_external_links_data
376
377				=head2 get_social_activity_data
378
379				=head1 CSV METHODS
380
381				Each of these takes a website and a filename and writes a CSV file with the
382				data for that website.
383
384				$gdata->save_top_queries_as_csv(
385				website => 'http://www.example.org',
386				filename => 'example_org_top_queries.csv',
387				);
388
389				=head2 save_top_pages_as_csv
390
391				=head2 save_top_queries_as_csv
392
393				=head2 save_crawl_errors_as_csv
394
395				=head2 save_content_errors_as_csv
396
397				=head2 save_content_keywords_as_csv
398
399				=head2 save_latest_backlinks_as_csv
400
401				=head2 save_internal_links_as_csv
402
403				=head2 save_external_links_as_csv
404
405				=head2 save_social_activity_as_csv
406
407				=cut