File Coverage

blib/lib/WWW/Amazon/Wishlist.pm

Criterion	Covered	Total	%
statement	195	242	80.5
branch	41	80	51.2
condition	24	51	47.0
subroutine	19	19	100.0
pod	0	1	0.0
total	279	393	70.9

line	stmt	bran	cond	sub	pod	time	code
1
2							package WWW::Amazon::Wishlist;
3
4	1			1		20322	use strict;
	1					1
	1					38
5	1			1		4	use vars qw( @ISA @EXPORT @EXPORT_OK );
	1					0
	1					52
6
7	1			1		3	use Carp;
	1					4
	1					49
8	1			1		3	use Data::Dumper;
	1					1
	1					31
9	1			1		671	use HTML::TreeBuilder;
	1					21906
	1					7
10	1			1		649	use LWP::UserAgent;
	1					28694
	1					27
11
12	1			1		6	use constant COM => 0;
	1					1
	1					52
13	1			1		3	use constant UK => 1;
	1					1
	1					34
14
15	1			1		3	use constant DEBUG => 0;
	1					1
	1					31
16	1			1		3	use constant DEBUG_HTML => 0;
	1					1
	1					1987
17
18							require Exporter;
19
20							@ISA = qw(Exporter);
21							@EXPORT = qw(
22							);
23							@EXPORT_OK = qw(
24							get_list
25							UK
26							COM
27							);
28
29							my
30							$VERSION = 2.015;
31
32							=pod
33
34							=head1 NAME
35
36							WWW::Amazon::Wishlist - grab all the details from your Amazon wishlist
37
38							=head1 SYNOPSIS
39
40							use WWW::Amazon::Wishlist qw(get_list COM UK);
41
42							my @wishlist;
43
44							@wishlist = get_list($my_amazon_com_id); # gets it from amazon.com
45							@wishlist = get_list($my_amazon_com_id, COM); # same, explicitly
46							@wishlist = get_list($my_amazon_couk_id, UK); # gets it from amazon.co.uk
47
48							# Or, if you didn't import the COM and UK constants:
49							@wishlist = get_list ($my_amazon_couk_id, WWW::Amazon::Wishlist::UK);
50
51							# The elements of @wishlist are hashrefs that contain the following elements:
52							foreach my $book (@wishlist)
53							{
54							print $book->{title}, # the, err, title
55							$book->{author}, # and the author(s)
56							$book->{asin}, # the asin number, its unique id on Amazon
57							$book->{price}, # how much it will set you back
58							$book->{quantity}, # how many you said you want
59							$book->{priority}, # how urgently you said you want it (1-5)
60							$book->{type}; # Hardcover/Paperback/CD/DVD etc (not available in the US)
61							} # foreach
62
63							=head1 DESCRIPTION
64
65							Goes to amazon.(com\|co.uk), scrapes your wishlist, and returns it
66							in a array of hashrefs so that you can fiddle with it to your heart's
67							content.
68
69							=head1 GETTING YOUR AMAZON ID
70
71							The best way to do this is to search for your own wishlist in the search
72							tools.
73
74							Searching for mine (simon@twoshortplanks.com) on amazon.com takes me to
75							the URL something like
76
77							http://www.amazon.com/exec/obidos/wishlist/2EAJG83WS7YZM/...
78
79							there's some more cruft after that last string of numbers and letters
80							but it's the
81
82							2EAJG83WS7YZM
83
84							bit that's important.
85
86							Doing the same for amazon.co.uk is just as easy.
87
88							Apparently, some people have had problems getting to their wishlist right
89							after it gets set up. You may have to wait a while for it to become
90							browseable.
91
92							=head1 SHOWING YOUR APPRECIATION
93
94							There was a thread on london.pm mailing list about working in a vacuum -
95							that it was a bit depressing to keep writing modules but never get any
96							feedback. So, if you use and like this module then please send me an
97							email and make my day.
98
99							All it takes is a few little bytes.
100
101
102							=head1 BUGS
103
104							B
105
106							C is a screen scraper and is there for
107							is vulnerable to any changes that Amazon make to their HTML.
108
109							If it starts returning no items then this is very likely the reason
110							and I will get around to fixing it as soon as possible.
111
112							You might want to look at the C module instead.
113
114							It doesn't cope with anything apart from the UK and USA versions of Amazon.
115
116							I don't think it likes unavailable items - trying to work around this
117							breaks UK compatability.
118
119							The code has accumulated lots of cruft.
120
121							Lack of testing. It works for the pages I've tried it for but that's
122							no guarantee.
123
124							=head1 LICENSE
125
126							Copyright (c) 2003 Simon Wistow
127
128							Distributed under the same terms as Perl itself.
129
130							This software is under no warranty and will probably destroy your wish
131							list, kill your friends, burn your house and bring about the apocalypse
132
133							=head1 AUTHOR
134
135							Simon Wistow
136							Currently maintained by Martin Thurn
137
138							=head1 SEE ALSO
139
140							L, L, L
141
142							=cut
143
144							my $USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
145
146							sub get_list
147							{
148							# Required arg = wishlist ID:
149	2		33	2	0	2031	my $id = shift \|\| croak "No ID given to get_list() function\n";
150							# Optional arg = whether we're accessing the UK site. Default is "no":
151	2		50			10	my $uk = shift \|\| 0;
152							# Optional arg = turn on debugging:
153	2		100			7	my $test = shift \|\| DEBUG;
154							# Note to self ... should we UC the id? Nahhhh. Not yet.
155							# fairly self explanatory
156	2	50				4	my $domain = ($uk) ? "co.uk" : "com";
157							# set up some variables
158	2					2	my $iPage = 1;
159	2					1	my @items;
160							# and awaaaaaaaaaaaaay we go ....
161							INFINITE:
162	2					2	while (1)
163							{
164	3	50				13	my $url = $uk ? "https://www.amazon.co.uk/gp/registry/wishlist/ref=cm_wl_search_1?page=$iPage&cid=$id" :
165							"http://www.amazon.com/gp/registry/wishlist/$id/?page=$iPage";
166							# This is a typical complete .com URL as of 2008-12:
167							# http://www.amazon.com/gp/registry/wishlist/2O4B95NPM1W3L
168	3					3	DEBUG_HTML && warn " DDD fetching wishlist for $id, page $iPage...\n";
169							# Don't overwhelm the server:
170	3	100				3000110	sleep(3) if (1 < $iPage);
171	3					13	my $content = _fetch_page($url, $domain);
172	3					8	if (DEBUG_HTML == 88)
173							{
174							warn $content;
175							exit 88;
176							} # if
177							# As of 2009-08, Amazon returns HTML with MISSING BRACKETS:
178	3					775	$content =~ s/(\r\n]+)(\s+<)/$1>\n$2/g;
179							# There seems to be a bug in HTML::TreeBuilder that causes
180							# abutting tags to be skpped!?!
181	3					1834	$content =~ s!>
182	3	50				18	if (9 < $test)
183							{
184	0					0	eval "use File::Slurp";
185	0					0	write_file(qq'PAGES/fetched-$domain.html', $content);
186	0					0	exit 88;
187							} # if
188	3					8	my $iLen = length($content);
189							# warn " DDD fetched $iLen bytes.\n";
190
191							# UPDATED 2014-11. Both USA and UK sites use the same page
192							# format, therefore we always pass COM to the _extract() method:
193	3					12	my $result = _extract(COM, $content, $test);
194							# print Dumper($result);
195							# exit 88;
196	3	50				14	if (! defined $result)
197							{
198	0					0	DEBUG && warn " WWW _extract() returned nothing\n";
199	0					0	last INFINITE;
200							} # if
201	3	100				15	if (! ref $result->{items})
202							{
203							# Probably an empty wish list
204	1					2	DEBUG && warn " WWW _extract() returned no items\n";
205	1					3	last INFINITE;
206							} # if
207							ITEM:
208	2					3	foreach my $item (@{$result->{items}})
	2					7
209							{
210	42					41	$item->{'author'} =~ s!\n!!g;
211	42					27	$item->{'author'} =~ s!^\s*by\s+!!g;
212	42					25	$item->{'author'} =~ s! \n*!!s;
213	42	50				56	$item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s\s(\d+)!i);
214	42	50				45	$item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s\s(\d)!i);
215	42	0	33			46	if (
			0
216							$uk
217							&&
218							$item->{image}
219							&&
220							($item->{image} !~ m!^http:!)
221							)
222							{
223	0					0	$item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image};
224							} # if
225	42					42	push @items, $item;
226							} # foreach ITEM
227	2					4	my $sURLNext = $result->{next};
228	2					4	my $iNext = 0;
229	2	50				5	if (! defined $sURLNext)
230							{
231							# DEBUG && warn " DDD content===$content===\n";
232							# exit 88;
233							# Use brute force to find it:
234	0	0				0	if ($content =~ m!([;&]page=\d+)">\s*(<[^>]+>)?Next!)
235							{
236	0					0	DEBUG && warn " DDD found next URL with brute force\n";
237	0					0	$sURLNext = $1;
238	0					0	$iNext = $2;
239							} # if
240							} # if
241							# Paranoia:
242	2	50				6	if (! defined $sURLNext)
243							{
244	0					0	DEBUG && warn " WWW did not find next url\n";
245	0					0	last INFINITE;
246							} # if
247	2	100				11	if ($sURLNext !~ m/[;&]page=(\d+)/)
248							{
249	1					1	DEBUG && warn " WWW next url =$sURLNext= does not contain page#\n";
250	1					4	last INFINITE;
251							} # if
252	1					5	$iNext = $1;
253							# More paranoia:
254	1	50				5	if ($iNext <= $iPage)
255							{
256	0					0	DEBUG && warn " WWW next url page=$iNext is not greater than current page=$iPage\n";
257	0					0	last INFINITE;
258							} # if
259							# ...and update:
260	1					4	$iPage = $iNext;
261							} # while INFINITE
262	2					363	return @items;
263							} # get_list
264
265
266							sub _fetch_page
267							{
268	3			3		4	my ($url, $domain) = @_;
269	3					4	if (0)
270							{
271							eval "use File::Slurp";
272							# For debugging UK site:
273							return read_file('Pages/uk-2008-12-page1.html');
274							# For debugging USA site:
275							return read_file('Pages/2008-12.html');
276							} # if 0
277							# Setting up the UA here is slower but makes the code easier to read
278							# really, the slow bit will not be setting up the UA each time
279
280							# set up the UA
281	3					33	my $ua = new LWP::UserAgent( keep_alive => 1, timeout => 30, agent => $USER_AGENT, );
282							# setting it in the 'new' seems not to work sometimes
283	3					3597	$ua->agent($USER_AGENT);
284							# for some reason this makes stuff work
285	3					115	$ua->max_redirect( 0 );
286							# make a full set of headers
287	3					46	my $h = new HTTP::Headers(
288							'Host' => "www.amazon.$domain",
289							'Referer' => $url,
290							'User-Agent' => $USER_AGENT,
291							'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,/;q=0.1',
292							'Accept-Language' => 'en-us,en;q=0.5',
293							'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
294							#'Accept-Encoding' => 'gzip,deflate',
295							'Keep-Alive' => '300',
296							'Connection' => 'keep-alive',
297							);
298	3					426	$h->referer("$url");
299	3					71	my $request = HTTP::Request->new ( 'GET', $url, $h );
300	3					6743	my $response;
301	3					4	my $times = 0;
302							# LWP should be able to do this but seemingly fails sometimes
303	3					11	while ($times++<3)
304							{
305	3					14	$response = $ua->request($request);
306	3	50				2061427	last if $response->is_success;
307	0	0				0	if ($response->is_redirect)
308							{
309	0					0	$url = $response->header("Location");
310							#$h->header("Referer", $url);
311	0					0	$h->referer("$url");
312	0					0	$request = HTTP::Request->new ( 'GET', $url, $h );
313							} # if
314							} # while
315	3	50				49	if (!$response->is_success)
316							{
317	0					0	croak "Failed to retrieve $url";
318	0					0	return undef;
319							} # if
320	3					28	my $s = $response->content;
321							# Clean the CRAP off the page:
322	3					2316	$s =~ s!!!gs;
323	3					1075	return $s;
324							} # _fetch_page
325
326							# This is the HTML parsing version written by Martin Thurn:
327
328							sub _extract
329							{
330							# Required arg1 = whether we are parsing the UK site or not (Boolean):
331	3		50	3		32	my $iUK = shift \|\| 0;
332							# Required arg2 = the HTML contents of the webpage:
333	3		50			281	my $s = shift \|\| '';
334							# Optional arg = debugging level:
335	3		100			15	my $iDebug = shift \|\| 0;
336	3					4	DEBUG_HTML && warn " DDD start _extract()\n";
337	3					6	my $rh = {};
338	3					36	my $oTree = new HTML::TreeBuilder;
339	3					670	$oTree->parse($s);
340	3					2693026	$oTree->eof;
341	3	50				81910	my @aoSPAN = $iUK ? $oTree->look_down(_tag => 'div',
342							class => 'a-text-left a-fixed-left-grid-col a-col-right',
343							# class => 'lineItemGroup',
344							)
345							: $oTree->look_down(_tag => 'div',
346							class => 'a-text-left a-fixed-left-grid-col a-col-right',
347							);
348							SPAN_TAG:
349	3					53244	foreach my $oSPAN (@aoSPAN)
350							{
351	48	50				18734	next SPAN_TAG unless ref $oSPAN;
352	48					49	DEBUG_HTML && warn " DDD found toplevel item tagset\n";
353	48					38	if (9 < DEBUG_HTML)
354							{
355							my $s = $oSPAN->as_HTML;
356							warn " DDD ==$s==\n";
357							} # if
358	48					48	my $sASIN = q{};
359	48					60	my $sName = q{};
360	48					45	my $sTitle = q{};
361	48					99	my @aoA = $oSPAN->look_down(_tag => 'a');
362	48					17105	DEBUG_HTML && warn sprintf(" DDD contains %d tags\n", scalar(@aoA));
363							A_TAG:
364	48					70	foreach my $oA (@aoA)
365							{
366	72	50				133	next A_TAG if ! ref $oA;
367	72					148	my $sA = $oA->as_HTML;
368	72					11297	DEBUG_HTML && warn " DDD try A\n";
369	72					61	if (9 < DEBUG_HTML)
370							{
371							warn " DDD ==$sA==\n";
372							} # if
373	72		66			140	$sTitle = $oA->attr('title') \|\| $oA->as_text;
374							# Strip leading whitespace:
375	72					962	$sTitle =~ s!\A\s+!!;
376							# Strip trailing whitespace:
377	72					173	$sTitle =~ s!\s+\Z!!;
378							# Ignore empty (image-only) tags:
379	72	50				185	next A_TAG if ($sTitle !~ m/\S/);
380							# Strip out zero-width spaces scattered about randomly in item titles
381	72					130	$sTitle =~ s/\x{200b}//g;
382	72					154	DEBUG_HTML && warn " DDD found item named '$sTitle'\n";
383	72	50				116	next A_TAG if ($sTitle eq 'Universal Wish List Button');
384	72	100				124	next A_TAG if ($sTitle eq 'Buying this gift elsewhere?');
385	60					115	my $sURL = $oA->attr('href');
386	60					310	DEBUG_HTML && warn " DDD URL ==$sURL==\n";
387	60	100	33			495	if (
			66
388							($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!)
389							\|\|
390							($sURL =~ m!/gp/product/(.+?)/ref!)
391							\|\|
392							($sURL =~ m!/dp/(.+?)/ref!)
393							)
394							{
395							# It's a match!
396	42					75	$sASIN = $1;
397	42					62	last A_TAG;
398							} # if
399							else
400							{
401	18					21	DEBUG_HTML && warn " EEE url does not contain asin\n";
402							}
403							} # foreach A_TAG
404	48					50	DEBUG_HTML && warn " DDD ASIN ==$sASIN==\n";
405	48	100				92	if ($sASIN eq q{})
406							{
407	6					12	next SPAN_TAG;
408							} # if
409							# Grab the smallest-containing ancestor of this item:
410	42	50				72	my $oParent = $iUK
411							? $oSPAN->look_up(_tag => 'tbody',
412							class => 'itemWrapper',
413							)
414							: $oSPAN;
415	42	50				85	if (! ref $oParent)
416							{
417	0					0	DEBUG_HTML && warn " WWW did not find ancestor TBODY\n";
418	0					0	next SPAN_TAG;
419							} # if
420	42					76	my $sParentHTML = $oParent->as_HTML;
421	42					206286	DEBUG_HTML && warn " DDD parent HTML ==$sParentHTML==\n";
422	42					94	my $sParent = $oParent->as_text;
423							# Manual text clean-up:
424	42					13110	$sParent =~ s/(DESIRED\|RECEIVED\|PRIORITY)/; $1: /g;
425	42					42	DEBUG_HTML && warn " DDD parent text ==$sParent==\n";
426	42					92	my $iDesired = _match_desired($sParent);
427	42					34	DEBUG_HTML && warn " DDD desired set to =$iDesired=\n";
428	42					89	my $sPriority = _match_priority($sParent);
429	42					39	DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
430	42					133	my @aoTDtiny = $oParent->look_down(_tag => 'td',
431							class => 'tiny',
432							);
433							QUANT_TAG:
434	42					14838	foreach my $oSPAN (@aoTDtiny)
435							{
436	0	0				0	next QUANT_TAG unless ref $oSPAN;
437	0					0	my $sSpan = $oSPAN->as_text;
438	0					0	DEBUG_HTML && warn " DDD TDtiny=$sSpan=\n";
439	0		0			0	$sPriority \|\|= _match_priority($sSpan);
440	0					0	DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
441	0		0			0	$iDesired \|\|= _match_desired($sSpan);
442	0					0	DEBUG_HTML && warn " DDD desired set to =$iDesired=\n";
443							} # foreach QUANT_TAG
444	42	50	33			122	if (! $iDesired \|\| ! $sPriority)
445							{
446							# See if they are encoded in a FORM:
447							# Find the priority:
448	42	50				169	if ($sParentHTML =~ m!
449							{
450	0					0	$sPriority = $1;
451	0					0	DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
452							} # if
453							else
454							{
455	42					41	DEBUG_HTML && warn " WWW did not find
456							}
457							# Find the quantity desired:
458	42	50				136	if ($sParentHTML =~ m!!)
459							{
460	0					0	$iDesired = $1;
461	0					0	DEBUG_HTML && warn " DDD desired set to =$iDesired=\n";
462							} # if
463							else
464							{
465	42					39	DEBUG_HTML && warn " WWW did not find for desired-quantity\n";
466							}
467							} # if
468							# Put in default values if we never found them:
469	42		100			145	$sPriority \|\|= 'medium';
470	42					27	DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
471	42		50			112	$iDesired \|\|= 1;
472							# Find the date added:
473	42					42	my $sDate = '';
474	42	50				331	if ($sParentHTML =~ m!>added\s+(.+?)
475							{
476	0					0	$sDate = $1;
477	0					0	DEBUG_HTML && warn " DDD date=$sDate=\n";
478							} # if
479							else
480							{
481	42					34	DEBUG_HTML && warn " WWW did not find text for date-added\n";
482							}
483
484							# Find the "author" of this item:
485	42					33	my @aoTDauthor;
486	42	50				60	if ($iUK)
487							{
488	0					0	@aoTDauthor = $oParent->look_down(_tag => 'td',
489							class => 'small',
490							);
491							}
492							else
493							{
494							@aoTDauthor = $oParent->look_down(_tag => 'span',
495							sub
496							{
497	943			943		18528	my $sHtml = $_[0]->as_HTML;
498							# DEBUG_HTML && warn " DDD try oTDauthor span==$sHtml==\n";
499	943		100			146619	my $s = $_[0]->attr('class') \|\| q{};
500	943					6627	$s =~ m'BYLINE'i;
501							},
502	42					220	);
503							} # else
504	42					1637	my $sAuthor = '';
505							AUTHOR_TAG:
506	42					65	foreach my $oTD (@aoTDauthor)
507							{
508	0	0				0	next AUTHOR_TAG unless ref $oTD;
509	0					0	my $s = $oTD->as_HTML;
510	0					0	DEBUG_HTML && warn " DDD try oTDauthor==$s==\n";
511	0					0	$s = $oTD->as_text;
512	0	0				0	if ($s =~ s!\A\s*(by\|~)\s+!!)
513							{
514	0					0	$sAuthor = $s;
515	0					0	last AUTHOR_TAG;
516							} # if
517							} # foreach AUTHOR_TAG
518	42					37	DEBUG_HTML && warn " DDD author=$sAuthor=\n";
519							# Find the price of this item:
520	42					68	my $sPrice = '';
521							my $oTDprice = $oParent->look_down(_tag => 'span',
522							sub
523							{
524	165		50	165		8228	my $s = $_[0]->attr('class') \|\| q{};
525	165					1122	$s =~ m'PRICE'i;
526							},
527	42					205	);
528	42	50				289	if (! ref $oTDprice)
529							{
530	0					0	DEBUG_HTML && warn " WWW did not find TD for price\n";
531							# warn $oParent->as_HTML;
532							# exit 88;
533							# next SPAN_TAG;
534							} # if
535							else
536							{
537	42					89	$sPrice = $oTDprice->as_text;
538	42	50				639	if ($sPrice =~ m!Price:\s+(.+)\Z!)
539							{
540	0					0	$sPrice = $1;
541							} # if
542	42					128	$sPrice =~ s!\A\s+!!;
543	42					83	$sPrice =~ s!\s+\Z!!;
544	42					53	DEBUG_HTML && warn " DDD price=$sPrice=\n";
545							} # else
546							# Add this item to the result set:
547	42					248	my %hsItem = (
548							asin => $sASIN,
549							author => $sAuthor,
550							# image => $sImageURL,
551							price => $sPrice,
552							priority => $sPriority,
553							quantity => $iDesired,
554							title => $sTitle,
555							# type => $sType,
556							);
557	42					42	DEBUG_HTML && warn Dumper(\%hsItem);
558	42					38	push @{$rh->{items}}, \%hsItem;
	42					115
559							# All done with this item:
560	42					124	$oParent->detach;
561	42					545	$oParent->delete;
562							} # foreach SPAN_TAG
563							# Look for the next-page link:
564							my $oA = $oTree->look_down(_tag => 'a',
565							sub
566							{
567	222		50	222		34713	my $s = $_[0]->as_text \|\| q{};
568	222					4650	DEBUG_HTML && warn " DDD try next ==$s==\n";
569	1			1		9	$s =~ m/\A\s*NEXT/i;
	1					2
	1					12
	222					521
570							},
571	3					950	);
572	3	50				40	if (ref $oA)
573							{
574	3					12	$rh->{next} = $oA->attr('href');
575	3					37	DEBUG_HTML && warn " DDD raw next URL is ==$rh->{next}==\n";
576							} # if
577							else
578							{
579	0					0	DEBUG_HTML && warn " DDD did not find next URL\n";
580							}
581	3					3811	return $rh;
582							} # _extract
583
584							sub _match_priority
585							{
586	42		50	42		83	my $s = shift \|\| return;
587	42	100				20579	if ($s =~ m'.+PRIORITY:?\s*(\w+?)(\s\|\z)'i)
588							{
589	9					37	return lc $1;
590							} # if
591	33					39	return;
592							} # _match_priority
593
594							sub _match_desired
595							{
596	42		50	42		115	my $s = shift \|\| return;
597	42	50				648	if ($s =~ m'(?:DESIRED\|WANTS):?\s*(\d+)'i)
598							{
599	0					0	return lc $1;
600							} # if
601	42					61	return;
602							} # _match_desired
603
604							1;
605
606							__END__