#!/usr/bin/python -tt # amazon-wishlist v0.3 # Copyright (c) 2008-9, John Morrissey # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. AWS_ACCESS_KEY = '' AWS_SECRET_KEY = '' WISHLIST_ID = '' AWS_XML_NS = 'http://webservices.amazon.com/AWSECommerceService/2011-08-01' # MCLS branches I prefer. BRANCHES = [ 'Evans Branch West - Irondequoit Library', 'Henrietta Public Library', 'McGraw Branch East - Irondequoit Library', 'Rochester Public Library Central', ] from base64 import b64encode from datetime import date import hashlib import hmac import json import re import sys from time import gmtime, strftime from urllib import quote_plus, urlencode from urllib2 import Request from urlparse import parse_qs, urljoin, urlparse from xml.etree.ElementTree import ElementTree import html5lib import mechanize from BeautifulSoup import Comment as BeautifulSoupComment cj = mechanize.LWPCookieJar() opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) class MclsScraper(object): endpoint = 'http://classic.libraryweb.org/carlweb/jsp/' def _parse_result(self, body): parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup")) soup = parser.parse(body) if soup.find(text=re.compile(r'No status information was found for this title\.')): return [] if soup.find(text=re.compile(r'No se sabe el estado de')): return [] if soup.find(text=re.compile(r'No matches were found')): return [] marcid = soup.find('td', {'class': 'MARCtaglabel'}) if not marcid: return [] id = (marcid.findParents('tr')[0]. find(text=re.compile(r'^\s*\d+\s*$')).strip()) branches_req = Request( '%sGetDisplayBranches?databaseID=720&controlNumber=%s' % ( self.endpoint, id, ) ) branches_req.add_header('User-Agent', 'amazon-wishlist 0.3') data = opener.open(branches_req) body = ''.join(data.fp.readlines()).strip() if not body: return [] branches = json.loads(body) locations = [] for branch in branches: branch_req = Request( '%sGetDisplayItems?databaseID=720&controlNumber=%s&branchID=%s' % ( self.endpoint, id, branch['BRANCHID'] ) ) branch_req.add_header('User-Agent', 'amazon-wishlist 0.3') data = opener.open(branch_req) branch_item = json.loads(''.join(data.fp.readlines())) locations.append({ 'Branch': branch['BRANCHNAME'], 'Call Number': ', '.join([ item['CALLNUM'] for item in branch_item ]), 'numAvailable': int(branch['NUMAVAILABLE']), }) return locations def get_info(self, type, title, author): cj.clear() search = Request( '%sDoSearch?%s' % ( self.endpoint, urlencode([ ('databaseID', 720), ('count', 10), ('index', 'w'), ('terms', '%s %s' % (title, author)), ('keywords', type), ]) ) ) search.add_header('User-Agent', 'amazon-wishlist 0.3') data = opener.open(search) results = Request('%stitlelist.jsp' % self.endpoint) results.add_header('User-Agent', 'amazon-wishlist 0.3') data = opener.open(results) body = ''.join(data.fp.readlines()) parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup")) soup = parser.parse(body) result_uris = [] for tr in soup.findAll('tr'): if 'eBook' in tr: continue for rec in tr.findAll(href=re.compile('FullRecord')): if rec['href'] not in result_uris: result_uris.append(rec['href']) if len(result_uris) == 0: return self._parse_result(body) locations = [] for uri in result_uris: auth = Request('%s%s' % (self.endpoint, uri)) auth.add_header('User-Agent', 'amazon-wishlist 0.3') data = opener.open(auth) auth = Request('%sfullrecord.jsp' % (self.endpoint)) auth.add_header('User-Agent', 'amazon-wishlist 0.3') locations.extend(self._parse_result(''.join(opener.open(auth).fp.readlines()))) return locations class RitScraper(object): def _parse_result(self, body): parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup")) soup = parser.parse(body) if soup.find(text=re.compile('NO ENTRIES FOUND')): return [] if soup.find(text=re.compile('Available electronically:')): return [{ 'LOCATION': 'ELECTRONIC', }] try: header = soup.find('tr', attrs={'class': 'bibItemsHeader'}) items = soup.findAll('tr', attrs={'class': 'bibItemsEntry'}) except: return [] fields = {} i = 0 for col in header.findAll('th'): fields[col.find(text=True).strip()] = i i += 1 locations = [] for location in items: locinfo = {} cells = location.findAll('td') for pretty, td_index in fields.iteritems(): value = ''.join(cells[td_index].findAll( text=lambda text: not isinstance(text, BeautifulSoupComment))).strip() if pretty == 'LOCATION': value = value.replace(' FLOOR', '') elif pretty == 'STATUS': if value.split(' ')[0] == 'DUE': month, day, year = \ [int(i) for i in re.sub(r'.*?(\d+-\d+-\d+).*', r'\1', value).split('-')] if year >= 0 and year <= date.today().year: year += 2000 else: year += 1900 locinfo['DUE'] = date(year, month, day) locinfo['AVAILABLE'] = (value == 'AVAILABLE') locinfo[pretty] = value locations.append(locinfo) return locations def get_info(self, type, title, author): auth = Request( 'http://albert.rit.edu/search/X?t:(%s)+and+a:(%s)&searchscope=3&SORT=A' % \ (quote_plus(title), quote_plus(author))) auth.add_header('User-Agent', 'amazon-wishlist 0.3') data = mechanize.urlopen(auth) body = ''.join(data.fp.readlines()) parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup")) soup = parser.parse(body) result_uris = [] for rec in soup.findAll(attrs={'class': 'briefCitRow'}): link = rec.find('a', {'href': re.compile('SUBKEY')}) if link['href'] not in result_uris: result_uris.append(link['href']) if len(result_uris) == 0: return self._parse_result(body) locations = [] for uri in result_uris: item = Request(urljoin(auth.get_full_url(), uri)) item.add_header('User-Agent', 'amazon-wishlist 0.3') locations.extend(self._parse_result(''.join(mechanize.urlopen(item).fp.readlines()))) return locations def get_wishlist_body(url): if not url.startswith('http'): url = 'http://www.amazon.com' + url wishlist = Request(url) wishlist.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1') data = mechanize.urlopen(wishlist) body = ''.join(data.fp.readlines()) parser = html5lib.HTMLParser( tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup")) return parser.parse(body) def get_asins(body): list = body.find(attrs={'class': 'compact-items wlrdZeroTable'}) asins = [] for item in list.findAll('tbody'): asins.append(re.sub(r'.*\.', r'', item['name'])) return asins def ele(ns, item, name): if item is None: return None qname = '{%s}%s' % (ns, ('/{%s}' % (ns)).join(name.split('/'))) return item.find('.//%s' % qname) def elet(item, name): node = ele(AWS_XML_NS, item, name) if node is None: return '' return node.text def loadItems(wishlist_id): global AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_XML_NS mcls = MclsScraper() rit = RitScraper() body = get_wishlist_body( 'http://www.amazon.com/registry/wishlist/%s/ref=cm_wl_sb?reveal=unpurchased&filter=all&sort=date-added&layout=compact' % wishlist_id) asins = [] asins = get_asins(body) page_links = body.find(attrs={'class': 'pagDiv'}).findAll('a') for link in page_links: page_num = link.find(text=re.compile(r'^\d+$')) if not page_num: continue asins.extend(get_asins(get_wishlist_body(link['href']))) items = [] for asin in asins: qs_dict = { 'Service': 'AWSECommerceService', 'AWSAccessKeyId': AWS_ACCESS_KEY, 'Operation': 'ItemLookup', 'IdType': 'ASIN', 'ItemId': asin, 'Condition': 'All', 'ResponseGroup': 'Medium', 'AssociateTag': '0', } url = sign_amazon_req(AWS_SECRET_KEY, 'GET', 'http://webservices.amazon.com/onca/xml', qs_dict) auth = Request(url) auth.add_header('User-Agent', 'amazon-wishlist 0.3') tree = ElementTree(file=mechanize.urlopen(auth)) item = tree.getroot().find('.//{%s}Item' % (AWS_XML_NS)) img = ele(AWS_XML_NS, item, 'MediumImage') attrs = ele(AWS_XML_NS, item, 'ItemAttributes') offers = ele(AWS_XML_NS, item, 'OfferSummary') this_item = { 'title': elet(attrs, 'Title'), 'url': elet(item, 'DetailPageURL'), 'sort': [ elet(attrs, 'ProductGroup'), elet(item, 'BrowseNodes/BrowseNode/Ancestors/BrowseNode/Ancestors/BrowseNode/Ancestors/BrowseNode/Name'), elet(item, 'BrowseNodes/BrowseNode/Ancestors/BrowseNode/Ancestors/BrowseNode/Name'), ], 'img': { 'url': elet(img, 'URL'), 'width': elet(img, 'Width'), 'height': elet(img, 'Height'), 'caption': elet(attrs, 'Title'), }, 'prices': { 'list': elet(ele(AWS_XML_NS, attrs, 'ListPrice'), 'FormattedPrice'), 'new': { 'count': elet(offers, 'TotalNew'), 'lowest': elet(ele(AWS_XML_NS, offers, 'LowestNewPrice'), 'FormattedPrice'), }, 'used': { 'count': elet(offers, 'TotalUsed'), 'lowest': elet(ele(AWS_XML_NS, offers, 'LowestUsedPrice'), 'FormattedPrice'), }, }, } author = ele(AWS_XML_NS, attrs, 'Author') if author is not None: this_item['by'] = author.text artist = ele(AWS_XML_NS, attrs, 'Artist') if artist is not None: this_item['by'] = artist.text group = elet(attrs, 'ProductGroup') if group == 'Book': type = 'Book=qbook' elif group == 'Music': type = 'Music CD=qmusiccompactdisc' else: type = '' search_title = re.sub(r'[:(].*', '', this_item['title']) search_title = ' '.join([ word for word in search_title.split(' ') if word.lower() not in ['the', 'and', 'a', 'an', 'of', 'in', 'to', 'is', 'with'] ]) search_author = re.sub(r'\s\w{1,3}\.(\s|$)', r'\1', this_item.get('by', '')) for attempt in [1, 2, 3, 4, 5]: try: mcls_info = mcls.get_info(type, search_title, search_author) break except Exception, e: print 'Unable to retrieve MCLS info for %s: %s' % ( this_item['title'].encode('utf-8'), str(e)) mcls_info = [] branches = {} for avail in mcls_info: if 'Branch' not in avail: continue if avail['Branch'] not in branches: branches[avail['Branch']] = { 'count': 0, } if 'Call Number' in avail: branches[avail['Branch']]['call'] = avail['Call Number'] branches[avail['Branch']]['count'] += avail['numAvailable'] this_item['mcls'] = branches try: rit_info = rit.get_info(None, search_title, this_item.get('by', '')) except Exception, e: print 'Unable to retrieve RIT info for %s: %s' % ( this_item['title'].encode('utf-8'), str(e)) rit_info = [] if len(rit_info) > 0: this_item['rit'] = {} if rit_info[0]['LOCATION'] == 'ELECTRONIC': this_item['rit']['call'] = rit_info[0]['LOCATION'] else: this_item['rit']['call'] = \ '%s %s' % (rit_info[0]['LOCATION'], rit_info[0]['CALL NO.']) this_item['rit']['status'] = rit_info[0]['STATUS'] items.append(this_item) return items def displayItems(items): global BRANCHES for i in range(len(items)): item = items[i] if i % 2 == 0: print '
  • ' else: print '
  • ' print '' % item['url'] print '%s' % ( item['img']['url'], item['img']['height'], item['img']['width'], item['img']['caption'].encode('utf-8') ) print '' print item['title'].encode('utf-8'), '
    ' if 'by' in item: print item['by'].encode('utf-8'), '
    ' print 'List:', item['prices']['list'], '
    ' print 'New (%s): %s, Used (%s): %s
    ' % \ (item['prices']['new']['count'], item['prices']['new']['lowest'], item['prices']['used']['count'], item['prices']['used']['lowest']) if 'mcls' in item and len(item['mcls']) > 0: print 'MCLS: ' branches = [ '%s (%s, %s)' % (branch, avail['count'], avail['call']) for branch, avail in item['mcls'].iteritems() if branch in BRANCHES ] if len(branches) > 0: print ', '.join(branches) else: print ', '.join([ '%s (%s, %s)' % (branch, avail['count'], avail['call']) for branch, avail in item['mcls'].iteritems() ]) if 'rit' in item and 'call' in item['rit']: if 'status' in item['rit']: print 'RIT: %s, %s' % \ (item['rit']['status'], item['rit']['call']) else: print 'RIT: %s' % item['rit']['call'] print '
  • ' def sign_amazon_req(key, method, url, qs_dict): parsed_url = urlparse(url) qs = urlencode([ (k, qs_dict[k]) for k in sorted(qs_dict.keys()) ] + [('Timestamp', strftime('%Y-%m-%dT%H:%M:%S.000Z', gmtime()))]) to_sign = '\n'.join([method, parsed_url.netloc, parsed_url.path, qs]) sig = quote_plus(b64encode( hmac.new( key=key, msg=to_sign, digestmod=hashlib.sha256).digest() )) return '%s?%s&Signature=%s' % (url, qs, sig) print ''' jwm's Amazon Wishlist '''