File Coverage

blib/lib/WWW/Amazon/Wishlist.pm
Criterion Covered Total %
statement 195 242 80.5
branch 41 80 51.2
condition 24 51 47.0
subroutine 19 19 100.0
pod 0 1 0.0
total 279 393 70.9


line stmt bran cond sub pod time code
1            
2             package WWW::Amazon::Wishlist;
3            
4 1     1   20322 use strict;
  1         1  
  1         38  
5 1     1   4 use vars qw( @ISA @EXPORT @EXPORT_OK );
  1         0  
  1         52  
6            
7 1     1   3 use Carp;
  1         4  
  1         49  
8 1     1   3 use Data::Dumper;
  1         1  
  1         31  
9 1     1   671 use HTML::TreeBuilder;
  1         21906  
  1         7  
10 1     1   649 use LWP::UserAgent;
  1         28694  
  1         27  
11            
12 1     1   6 use constant COM => 0;
  1         1  
  1         52  
13 1     1   3 use constant UK => 1;
  1         1  
  1         34  
14            
15 1     1   3 use constant DEBUG => 0;
  1         1  
  1         31  
16 1     1   3 use constant DEBUG_HTML => 0;
  1         1  
  1         1987  
17            
18             require Exporter;
19            
20             @ISA = qw(Exporter);
21             @EXPORT = qw(
22             );
23             @EXPORT_OK = qw(
24             get_list
25             UK
26             COM
27             );
28            
29             my
30             $VERSION = 2.015;
31            
32             =pod
33            
34             =head1 NAME
35            
36             WWW::Amazon::Wishlist - grab all the details from your Amazon wishlist
37            
38             =head1 SYNOPSIS
39            
40             use WWW::Amazon::Wishlist qw(get_list COM UK);
41            
42             my @wishlist;
43            
44             @wishlist = get_list($my_amazon_com_id); # gets it from amazon.com
45             @wishlist = get_list($my_amazon_com_id, COM); # same, explicitly
46             @wishlist = get_list($my_amazon_couk_id, UK); # gets it from amazon.co.uk
47            
48             # Or, if you didn't import the COM and UK constants:
49             @wishlist = get_list ($my_amazon_couk_id, WWW::Amazon::Wishlist::UK);
50            
51             # The elements of @wishlist are hashrefs that contain the following elements:
52             foreach my $book (@wishlist)
53             {
54             print $book->{title}, # the, err, title
55             $book->{author}, # and the author(s)
56             $book->{asin}, # the asin number, its unique id on Amazon
57             $book->{price}, # how much it will set you back
58             $book->{quantity}, # how many you said you want
59             $book->{priority}, # how urgently you said you want it (1-5)
60             $book->{type}; # Hardcover/Paperback/CD/DVD etc (not available in the US)
61             } # foreach
62            
63             =head1 DESCRIPTION
64            
65             Goes to amazon.(com|co.uk), scrapes your wishlist, and returns it
66             in a array of hashrefs so that you can fiddle with it to your heart's
67             content.
68            
69             =head1 GETTING YOUR AMAZON ID
70            
71             The best way to do this is to search for your own wishlist in the search
72             tools.
73            
74             Searching for mine (simon@twoshortplanks.com) on amazon.com takes me to
75             the URL something like
76            
77             http://www.amazon.com/exec/obidos/wishlist/2EAJG83WS7YZM/...
78            
79             there's some more cruft after that last string of numbers and letters
80             but it's the
81            
82             2EAJG83WS7YZM
83            
84             bit that's important.
85            
86             Doing the same for amazon.co.uk is just as easy.
87            
88             Apparently, some people have had problems getting to their wishlist right
89             after it gets set up. You may have to wait a while for it to become
90             browseable.
91            
92             =head1 SHOWING YOUR APPRECIATION
93            
94             There was a thread on london.pm mailing list about working in a vacuum -
95             that it was a bit depressing to keep writing modules but never get any
96             feedback. So, if you use and like this module then please send me an
97             email and make my day.
98            
99             All it takes is a few little bytes.
100            
101            
102             =head1 BUGS
103            
104             B
105            
106             C is a screen scraper and is there for
107             is vulnerable to any changes that Amazon make to their HTML.
108            
109             If it starts returning no items then this is very likely the reason
110             and I will get around to fixing it as soon as possible.
111            
112             You might want to look at the C module instead.
113            
114             It doesn't cope with anything apart from the UK and USA versions of Amazon.
115            
116             I don't think it likes unavailable items - trying to work around this
117             breaks UK compatability.
118            
119             The code has accumulated lots of cruft.
120            
121             Lack of testing. It works for the pages I've tried it for but that's
122             no guarantee.
123            
124             =head1 LICENSE
125            
126             Copyright (c) 2003 Simon Wistow
127            
128             Distributed under the same terms as Perl itself.
129            
130             This software is under no warranty and will probably destroy your wish
131             list, kill your friends, burn your house and bring about the apocalypse
132            
133             =head1 AUTHOR
134            
135             Simon Wistow
136             Currently maintained by Martin Thurn
137            
138             =head1 SEE ALSO
139            
140             L, L, L
141            
142             =cut
143            
144             my $USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
145            
146             sub get_list
147             {
148             # Required arg = wishlist ID:
149 2   33 2 0 2031 my $id = shift || croak "No ID given to get_list() function\n";
150             # Optional arg = whether we're accessing the UK site. Default is "no":
151 2   50     10 my $uk = shift || 0;
152             # Optional arg = turn on debugging:
153 2   100     7 my $test = shift || DEBUG;
154             # Note to self ... should we UC the id? Nahhhh. Not yet.
155             # fairly self explanatory
156 2 50       4 my $domain = ($uk) ? "co.uk" : "com";
157             # set up some variables
158 2         2 my $iPage = 1;
159 2         1 my @items;
160             # and awaaaaaaaaaaaaay we go ....
161             INFINITE:
162 2         2 while (1)
163             {
164 3 50       13 my $url = $uk ? "https://www.amazon.co.uk/gp/registry/wishlist/ref=cm_wl_search_1?page=$iPage&cid=$id" :
165             "http://www.amazon.com/gp/registry/wishlist/$id/?page=$iPage";
166             # This is a typical complete .com URL as of 2008-12:
167             # http://www.amazon.com/gp/registry/wishlist/2O4B95NPM1W3L
168 3         3 DEBUG_HTML && warn " DDD fetching wishlist for $id, page $iPage...\n";
169             # Don't overwhelm the server:
170 3 100       3000110 sleep(3) if (1 < $iPage);
171 3         13 my $content = _fetch_page($url, $domain);
172 3         8 if (DEBUG_HTML == 88)
173             {
174             warn $content;
175             exit 88;
176             } # if
177             # As of 2009-08, Amazon returns HTML with MISSING BRACKETS:
178 3         775 $content =~ s/(\r\n]+)(\s+<)/$1>\n$2/g;
179             # There seems to be a bug in HTML::TreeBuilder that causes
180             # abutting tags to be skpped!?!
181 3         1834 $content =~ s!>
182 3 50       18 if (9 < $test)
183             {
184 0         0 eval "use File::Slurp";
185 0         0 write_file(qq'PAGES/fetched-$domain.html', $content);
186 0         0 exit 88;
187             } # if
188 3         8 my $iLen = length($content);
189             # warn " DDD fetched $iLen bytes.\n";
190            
191             # UPDATED 2014-11. Both USA and UK sites use the same page
192             # format, therefore we always pass COM to the _extract() method:
193 3         12 my $result = _extract(COM, $content, $test);
194             # print Dumper($result);
195             # exit 88;
196 3 50       14 if (! defined $result)
197             {
198 0         0 DEBUG && warn " WWW _extract() returned nothing\n";
199 0         0 last INFINITE;
200             } # if
201 3 100       15 if (! ref $result->{items})
202             {
203             # Probably an empty wish list
204 1         2 DEBUG && warn " WWW _extract() returned no items\n";
205 1         3 last INFINITE;
206             } # if
207             ITEM:
208 2         3 foreach my $item (@{$result->{items}})
  2         7  
209             {
210 42         41 $item->{'author'} =~ s!\n!!g;
211 42         27 $item->{'author'} =~ s!^\s*by\s+!!g;
212 42         25 $item->{'author'} =~ s!
\n*!!s;
213 42 50       56 $item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s*\s*(\d+)!i);
214 42 50       45 $item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s*\s*(\d)!i);
215 42 0 33     46 if (
      0        
216             $uk
217             &&
218             $item->{image}
219             &&
220             ($item->{image} !~ m!^http:!)
221             )
222             {
223 0         0 $item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image};
224             } # if
225 42         42 push @items, $item;
226             } # foreach ITEM
227 2         4 my $sURLNext = $result->{next};
228 2         4 my $iNext = 0;
229 2 50       5 if (! defined $sURLNext)
230             {
231             # DEBUG && warn " DDD content===$content===\n";
232             # exit 88;
233             # Use brute force to find it:
234 0 0       0 if ($content =~ m!([;&]page=\d+)">\s*(<[^>]+>)?Next!)
235             {
236 0         0 DEBUG && warn " DDD found next URL with brute force\n";
237 0         0 $sURLNext = $1;
238 0         0 $iNext = $2;
239             } # if
240             } # if
241             # Paranoia:
242 2 50       6 if (! defined $sURLNext)
243             {
244 0         0 DEBUG && warn " WWW did not find next url\n";
245 0         0 last INFINITE;
246             } # if
247 2 100       11 if ($sURLNext !~ m/[;&]page=(\d+)/)
248             {
249 1         1 DEBUG && warn " WWW next url =$sURLNext= does not contain page#\n";
250 1         4 last INFINITE;
251             } # if
252 1         5 $iNext = $1;
253             # More paranoia:
254 1 50       5 if ($iNext <= $iPage)
255             {
256 0         0 DEBUG && warn " WWW next url page=$iNext is not greater than current page=$iPage\n";
257 0         0 last INFINITE;
258             } # if
259             # ...and update:
260 1         4 $iPage = $iNext;
261             } # while INFINITE
262 2         363 return @items;
263             } # get_list
264            
265            
266             sub _fetch_page
267             {
268 3     3   4 my ($url, $domain) = @_;
269 3         4 if (0)
270             {
271             eval "use File::Slurp";
272             # For debugging UK site:
273             return read_file('Pages/uk-2008-12-page1.html');
274             # For debugging USA site:
275             return read_file('Pages/2008-12.html');
276             } # if 0
277             # Setting up the UA here is slower but makes the code easier to read
278             # really, the slow bit will not be setting up the UA each time
279            
280             # set up the UA
281 3         33 my $ua = new LWP::UserAgent( keep_alive => 1, timeout => 30, agent => $USER_AGENT, );
282             # setting it in the 'new' seems not to work sometimes
283 3         3597 $ua->agent($USER_AGENT);
284             # for some reason this makes stuff work
285 3         115 $ua->max_redirect( 0 );
286             # make a full set of headers
287 3         46 my $h = new HTTP::Headers(
288             'Host' => "www.amazon.$domain",
289             'Referer' => $url,
290             'User-Agent' => $USER_AGENT,
291             'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1',
292             'Accept-Language' => 'en-us,en;q=0.5',
293             'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
294             #'Accept-Encoding' => 'gzip,deflate',
295             'Keep-Alive' => '300',
296             'Connection' => 'keep-alive',
297             );
298 3         426 $h->referer("$url");
299 3         71 my $request = HTTP::Request->new ( 'GET', $url, $h );
300 3         6743 my $response;
301 3         4 my $times = 0;
302             # LWP should be able to do this but seemingly fails sometimes
303 3         11 while ($times++<3)
304             {
305 3         14 $response = $ua->request($request);
306 3 50       2061427 last if $response->is_success;
307 0 0       0 if ($response->is_redirect)
308             {
309 0         0 $url = $response->header("Location");
310             #$h->header("Referer", $url);
311 0         0 $h->referer("$url");
312 0         0 $request = HTTP::Request->new ( 'GET', $url, $h );
313             } # if
314             } # while
315 3 50       49 if (!$response->is_success)
316             {
317 0         0 croak "Failed to retrieve $url";
318 0         0 return undef;
319             } # if
320 3         28 my $s = $response->content;
321             # Clean the CRAP off the page:
322 3         2316 $s =~ s!!!gs;
323 3         1075 return $s;
324             } # _fetch_page
325            
326             # This is the HTML parsing version written by Martin Thurn:
327            
328             sub _extract
329             {
330             # Required arg1 = whether we are parsing the UK site or not (Boolean):
331 3   50 3   32 my $iUK = shift || 0;
332             # Required arg2 = the HTML contents of the webpage:
333 3   50     281 my $s = shift || '';
334             # Optional arg = debugging level:
335 3   100     15 my $iDebug = shift || 0;
336 3         4 DEBUG_HTML && warn " DDD start _extract()\n";
337 3         6 my $rh = {};
338 3         36 my $oTree = new HTML::TreeBuilder;
339 3         670 $oTree->parse($s);
340 3         2693026 $oTree->eof;
341 3 50       81910 my @aoSPAN = $iUK ? $oTree->look_down(_tag => 'div',
342             class => 'a-text-left a-fixed-left-grid-col a-col-right',
343             # class => 'lineItemGroup',
344             )
345             : $oTree->look_down(_tag => 'div',
346             class => 'a-text-left a-fixed-left-grid-col a-col-right',
347             );
348             SPAN_TAG:
349 3         53244 foreach my $oSPAN (@aoSPAN)
350             {
351 48 50       18734 next SPAN_TAG unless ref $oSPAN;
352 48         49 DEBUG_HTML && warn " DDD found toplevel item tagset\n";
353 48         38 if (9 < DEBUG_HTML)
354             {
355             my $s = $oSPAN->as_HTML;
356             warn " DDD ==$s==\n";
357             } # if
358 48         48 my $sASIN = q{};
359 48         60 my $sName = q{};
360 48         45 my $sTitle = q{};
361 48         99 my @aoA = $oSPAN->look_down(_tag => 'a');
362 48         17105 DEBUG_HTML && warn sprintf(" DDD contains %d tags\n", scalar(@aoA));
363             A_TAG:
364 48         70 foreach my $oA (@aoA)
365             {
366 72 50       133 next A_TAG if ! ref $oA;
367 72         148 my $sA = $oA->as_HTML;
368 72         11297 DEBUG_HTML && warn " DDD try A\n";
369 72         61 if (9 < DEBUG_HTML)
370             {
371             warn " DDD ==$sA==\n";
372             } # if
373 72   66     140 $sTitle = $oA->attr('title') || $oA->as_text;
374             # Strip leading whitespace:
375 72         962 $sTitle =~ s!\A\s+!!;
376             # Strip trailing whitespace:
377 72         173 $sTitle =~ s!\s+\Z!!;
378             # Ignore empty (image-only) tags:
379 72 50       185 next A_TAG if ($sTitle !~ m/\S/);
380             # Strip out zero-width spaces scattered about randomly in item titles
381 72         130 $sTitle =~ s/\x{200b}//g;
382 72         154 DEBUG_HTML && warn " DDD found item named '$sTitle'\n";
383 72 50       116 next A_TAG if ($sTitle eq 'Universal Wish List Button');
384 72 100       124 next A_TAG if ($sTitle eq 'Buying this gift elsewhere?');
385 60         115 my $sURL = $oA->attr('href');
386 60         310 DEBUG_HTML && warn " DDD URL ==$sURL==\n";
387 60 100 33     495 if (
      66        
388             ($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!)
389             ||
390             ($sURL =~ m!/gp/product/(.+?)/ref!)
391             ||
392             ($sURL =~ m!/dp/(.+?)/ref!)
393             )
394             {
395             # It's a match!
396 42         75 $sASIN = $1;
397 42         62 last A_TAG;
398             } # if
399             else
400             {
401 18         21 DEBUG_HTML && warn " EEE url does not contain asin\n";
402             }
403             } # foreach A_TAG
404 48         50 DEBUG_HTML && warn " DDD ASIN ==$sASIN==\n";
405 48 100       92 if ($sASIN eq q{})
406             {
407 6         12 next SPAN_TAG;
408             } # if
409             # Grab the smallest-containing ancestor of this item:
410 42 50       72 my $oParent = $iUK
411             ? $oSPAN->look_up(_tag => 'tbody',
412             class => 'itemWrapper',
413             )
414             : $oSPAN;
415 42 50       85 if (! ref $oParent)
416             {
417 0         0 DEBUG_HTML && warn " WWW did not find ancestor TBODY\n";
418 0         0 next SPAN_TAG;
419             } # if
420 42         76 my $sParentHTML = $oParent->as_HTML;
421 42         206286 DEBUG_HTML && warn " DDD parent HTML ==$sParentHTML==\n";
422 42         94 my $sParent = $oParent->as_text;
423             # Manual text clean-up:
424 42         13110 $sParent =~ s/(DESIRED|RECEIVED|PRIORITY)/; $1: /g;
425 42         42 DEBUG_HTML && warn " DDD parent text ==$sParent==\n";
426 42         92 my $iDesired = _match_desired($sParent);
427 42         34 DEBUG_HTML && warn " DDD desired set to =$iDesired=\n";
428 42         89 my $sPriority = _match_priority($sParent);
429 42         39 DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
430 42         133 my @aoTDtiny = $oParent->look_down(_tag => 'td',
431             class => 'tiny',
432             );
433             QUANT_TAG:
434 42         14838 foreach my $oSPAN (@aoTDtiny)
435             {
436 0 0       0 next QUANT_TAG unless ref $oSPAN;
437 0         0 my $sSpan = $oSPAN->as_text;
438 0         0 DEBUG_HTML && warn " DDD TDtiny=$sSpan=\n";
439 0   0     0 $sPriority ||= _match_priority($sSpan);
440 0         0 DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
441 0   0     0 $iDesired ||= _match_desired($sSpan);
442 0         0 DEBUG_HTML && warn " DDD desired set to =$iDesired=\n";
443             } # foreach QUANT_TAG
444 42 50 33     122 if (! $iDesired || ! $sPriority)
445             {
446             # See if they are encoded in a FORM:
447             # Find the priority:
448 42 50       169 if ($sParentHTML =~ m!
449             {
450 0         0 $sPriority = $1;
451 0         0 DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
452             } # if
453             else
454             {
455 42         41 DEBUG_HTML && warn " WWW did not find
456             }
457             # Find the quantity desired:
458 42 50       136 if ($sParentHTML =~ m!!)
459             {
460 0         0 $iDesired = $1;
461 0         0 DEBUG_HTML && warn " DDD desired set to =$iDesired=\n";
462             } # if
463             else
464             {
465 42         39 DEBUG_HTML && warn " WWW did not find for desired-quantity\n";
466             }
467             } # if
468             # Put in default values if we never found them:
469 42   100     145 $sPriority ||= 'medium';
470 42         27 DEBUG_HTML && warn " DDD priority set to =$sPriority=\n";
471 42   50     112 $iDesired ||= 1;
472             # Find the date added:
473 42         42 my $sDate = '';
474 42 50       331 if ($sParentHTML =~ m!>added\s+(.+?)
475             {
476 0         0 $sDate = $1;
477 0         0 DEBUG_HTML && warn " DDD date=$sDate=\n";
478             } # if
479             else
480             {
481 42         34 DEBUG_HTML && warn " WWW did not find text for date-added\n";
482             }
483            
484             # Find the "author" of this item:
485 42         33 my @aoTDauthor;
486 42 50       60 if ($iUK)
487             {
488 0         0 @aoTDauthor = $oParent->look_down(_tag => 'td',
489             class => 'small',
490             );
491             }
492             else
493             {
494             @aoTDauthor = $oParent->look_down(_tag => 'span',
495             sub
496             {
497 943     943   18528 my $sHtml = $_[0]->as_HTML;
498             # DEBUG_HTML && warn " DDD try oTDauthor span==$sHtml==\n";
499 943   100     146619 my $s = $_[0]->attr('class') || q{};
500 943         6627 $s =~ m'BYLINE'i;
501             },
502 42         220 );
503             } # else
504 42         1637 my $sAuthor = '';
505             AUTHOR_TAG:
506 42         65 foreach my $oTD (@aoTDauthor)
507             {
508 0 0       0 next AUTHOR_TAG unless ref $oTD;
509 0         0 my $s = $oTD->as_HTML;
510 0         0 DEBUG_HTML && warn " DDD try oTDauthor==$s==\n";
511 0         0 $s = $oTD->as_text;
512 0 0       0 if ($s =~ s!\A\s*(by|~)\s+!!)
513             {
514 0         0 $sAuthor = $s;
515 0         0 last AUTHOR_TAG;
516             } # if
517             } # foreach AUTHOR_TAG
518 42         37 DEBUG_HTML && warn " DDD author=$sAuthor=\n";
519             # Find the price of this item:
520 42         68 my $sPrice = '';
521             my $oTDprice = $oParent->look_down(_tag => 'span',
522             sub
523             {
524 165   50 165   8228 my $s = $_[0]->attr('class') || q{};
525 165         1122 $s =~ m'PRICE'i;
526             },
527 42         205 );
528 42 50       289 if (! ref $oTDprice)
529             {
530 0         0 DEBUG_HTML && warn " WWW did not find TD for price\n";
531             # warn $oParent->as_HTML;
532             # exit 88;
533             # next SPAN_TAG;
534             } # if
535             else
536             {
537 42         89 $sPrice = $oTDprice->as_text;
538 42 50       639 if ($sPrice =~ m!Price:\s+(.+)\Z!)
539             {
540 0         0 $sPrice = $1;
541             } # if
542 42         128 $sPrice =~ s!\A\s+!!;
543 42         83 $sPrice =~ s!\s+\Z!!;
544 42         53 DEBUG_HTML && warn " DDD price=$sPrice=\n";
545             } # else
546             # Add this item to the result set:
547 42         248 my %hsItem = (
548             asin => $sASIN,
549             author => $sAuthor,
550             # image => $sImageURL,
551             price => $sPrice,
552             priority => $sPriority,
553             quantity => $iDesired,
554             title => $sTitle,
555             # type => $sType,
556             );
557 42         42 DEBUG_HTML && warn Dumper(\%hsItem);
558 42         38 push @{$rh->{items}}, \%hsItem;
  42         115  
559             # All done with this item:
560 42         124 $oParent->detach;
561 42         545 $oParent->delete;
562             } # foreach SPAN_TAG
563             # Look for the next-page link:
564             my $oA = $oTree->look_down(_tag => 'a',
565             sub
566             {
567 222   50 222   34713 my $s = $_[0]->as_text || q{};
568 222         4650 DEBUG_HTML && warn " DDD try next ==$s==\n";
569 1     1   9 $s =~ m/\A\s*NEXT/i;
  1         2  
  1         12  
  222         521  
570             },
571 3         950 );
572 3 50       40 if (ref $oA)
573             {
574 3         12 $rh->{next} = $oA->attr('href');
575 3         37 DEBUG_HTML && warn " DDD raw next URL is ==$rh->{next}==\n";
576             } # if
577             else
578             {
579 0         0 DEBUG_HTML && warn " DDD did not find next URL\n";
580             }
581 3         3811 return $rh;
582             } # _extract
583            
584             sub _match_priority
585             {
586 42   50 42   83 my $s = shift || return;
587 42 100       20579 if ($s =~ m'.+PRIORITY:?\s*(\w+?)(\s|\z)'i)
588             {
589 9         37 return lc $1;
590             } # if
591 33         39 return;
592             } # _match_priority
593            
594             sub _match_desired
595             {
596 42   50 42   115 my $s = shift || return;
597 42 50       648 if ($s =~ m'(?:DESIRED|WANTS):?\s*(\d+)'i)
598             {
599 0         0 return lc $1;
600             } # if
601 42         61 return;
602             } # _match_desired
603            
604             1;
605            
606             __END__