# -*- coding: utf-8 -*-
"""Scrapy spider that crawls the MobyGames game list and screenshot pages."""

try:  # Python 3
    from urllib.parse import urljoin, urlparse
except ImportError:  # Python 2
    from urlparse import urljoin, urlparse

import scrapy

from mobygames.items import Game, Screenshot


class GamesSpider(scrapy.Spider):
    """Walk the paginated game list and collect games and their screenshots.

    For every row of the game list table a ``Game`` item is yielded, followed
    by a request for that game's ``screenshots`` page.  Each thumbnail found
    on a screenshots page is yielded as a ``Screenshot`` item that carries the
    same ``key`` as its game.
    """

    name = "games"
    allowed_domains = ["mobygames.com"]
    start_urls = (
        'http://www.mobygames.com/browse/games/list-games/',
    )

    def parse(self, response):
        """Parse one page of the game list.

        Yields, per table row, a ``Game`` item and a request for the game's
        screenshots page (handled by ``parseScreenshots``), and finally a
        request for the next list page when a "Next" link is present.

        Rows without a usable title link are skipped rather than raising, so
        a single malformed row cannot prevent the pagination request from
        being issued.  ``year`` is only set when the row has a second link
        with text.
        """
        rows = response.xpath('//table[@id="mof_object_list"]/tbody/tr')
        for row in rows:
            links = row.xpath('./td/a')
            if not links:
                continue

            titles = links[0].xpath('./text()').extract()
            hrefs = links[0].xpath('./@href').extract()
            if not titles or not hrefs:
                continue

            game = Game()
            game['title'] = titles[0]
            game['url'] = urljoin(response.url, hrefs[0])
            if len(links) > 1:
                years = links[1].xpath('./text()').extract()
                if years:
                    game['year'] = years[0]

            # The key is the last non-empty path segment of the game URL;
            # stripping trailing slashes first covers ".../foo/" as well.
            path = urlparse(game['url']).path
            game['key'] = path.rstrip('/').rsplit('/', 1)[-1]
            yield game

            # Normalise to exactly one trailing slash so that urljoin appends
            # "screenshots" instead of replacing the last path segment or
            # producing a double slash.
            screenshot_url = urljoin(game['url'].rstrip('/') + '/',
                                     "screenshots")
            self.log("fetching %s for %s" % (screenshot_url, game['key']))
            yield scrapy.http.Request(screenshot_url,
                                      callback=self.parseScreenshots,
                                      meta={'key': game['key']})

        nextlink = response.xpath(
            '//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href'
        ).extract()
        if nextlink:
            yield scrapy.http.Request(urljoin(response.url, nextlink[0]),
                                      callback=self.parse)

    def parseScreenshots(self, response):
        """Parse a game's screenshots page.

        Yields one ``Screenshot`` item per thumbnail, tagged with the game key
        passed in ``response.meta['key']``.  Thumbnails that have no image
        link are skipped so they do not abort the rest of the page.
        """
        key = response.meta['key']
        for thumbdiv in response.xpath('//div[@class="thumbnail"]'):
            hrefs = thumbdiv.xpath(
                './/a[@class="thumbnail-image"]/@href').extract()
            if not hrefs:
                continue

            caption_parts = thumbdiv.xpath(
                './/div[@class="thumbnail-caption"]//text()').extract()

            screenshot = Screenshot()
            screenshot['key'] = key
            screenshot['description'] = u' '.join(caption_parts).strip()
            screenshot['url'] = urljoin(response.url, hrefs[0])
            yield screenshot