mobygames-screens/mobygames/spiders/games.py

# -*- coding: utf-8 -*-
import scrapy
import urlparse
from mobygames.items import Game, Screenshot


class GamesSpider(scrapy.Spider):
    """Crawl the MobyGames game list and each game's screenshots page.

    ``parse`` walks the paginated game listing, yielding one ``Game`` item
    per table row plus a request for that game's ``screenshots`` page.
    ``parseScreenshots`` turns every thumbnail on such a page into a
    ``Screenshot`` item tagged with the owning game's key.
    """

    name = "games"
    allowed_domains = ["mobygames.com"]
    start_urls = (
        'http://www.mobygames.com/browse/games/list-games/',
    )

    def parse(self, response):
        """Yield ``Game`` items, screenshot requests and the next-page request.

        Rows that lack the two expected links (title and year) or their
        text/href are skipped, so a single malformed row cannot abort the
        generator and lose the rest of the page or the pagination link.
        """
        # NOTE(review): this XPath requires a literal <tbody> in the served
        # HTML; browsers insert one implicitly, so confirm against the raw
        # response rather than a browser inspector.
        gamerows = response.xpath('//table[@id="mof_object_list"]/tbody/tr')
        for gamerow in gamerows:
            links = gamerow.xpath('./td/a')
            if len(links) < 2:
                continue

            titles = links[0].xpath('./text()').extract()
            hrefs = links[0].xpath('./@href').extract()
            years = links[1].xpath('./text()').extract()
            if not (titles and hrefs and years):
                continue

            game = Game()
            game['title'] = titles[0]
            game['url'] = urlparse.urljoin(response.url, hrefs[0])
            game['year'] = years[0]
            # The key is the last non-empty path segment; stripping trailing
            # slashes first handles both ".../foo" and ".../foo/".
            path = urlparse.urlparse(game['url']).path
            game['key'] = path.rstrip('/').rsplit('/', 1)[-1]
            yield game

            # Strip any trailing slash before appending so the result never
            # contains a doubled "//" in front of "screenshots".
            screenshot_url = game['url'].rstrip('/') + '/screenshots'
            self.log("fetching %s for %s" % (screenshot_url, game['key']))
            yield scrapy.http.Request(
                screenshot_url,
                callback=self.parseScreenshots,
                meta={'key': game['key']},
            )

        nextlink = response.xpath(
            '//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href'
        ).extract()
        if nextlink:
            # No explicit callback: the request is handled by ``parse`` again.
            yield scrapy.http.Request(urlparse.urljoin(response.url, nextlink[0]))

    def parseScreenshots(self, response):
        """Yield one ``Screenshot`` item per thumbnail on a screenshots page.

        The owning game's key is read from ``response.meta['key']``, which
        ``parse`` sets on the request. Thumbnails without an image link are
        skipped instead of raising ``IndexError``.
        """
        for thumbdiv in response.xpath('//div[@class="thumbnail"]'):
            hrefs = thumbdiv.xpath('.//a[@class="thumbnail-image"]/@href').extract()
            if not hrefs:
                continue

            caption_parts = thumbdiv.xpath(
                './/div[@class="thumbnail-caption"]//text()'
            ).extract()

            screenshot = Screenshot()
            screenshot['key'] = response.meta['key']
            screenshot['description'] = u' '.join(caption_parts).strip()
            screenshot['url'] = urlparse.urljoin(response.url, hrefs[0])
            yield screenshot
Initial commit 2015-05-06 13:10:41 +00:00			`# -- coding: utf-8 --`
			`import scrapy`
			`import urlparse`
			`from mobygames.items import Game, Screenshot`


			`class GamesSpider(scrapy.Spider):`
			`name = "games"`
			`allowed_domains = ["mobygames.com"]`
			`start_urls = (`
			`'http://www.mobygames.com/browse/games/list-games/',`
			`)`

			`def parse(self, response):`
			`gamerows = response.xpath('//table[@id="mof_object_list"]/tbody/tr')`
			`for gamerow in gamerows:`
			`links = gamerow.xpath('./td/a')`
			`game = Game()`
			`game['title'] = links[0].xpath('./text()').extract()[0]`
			`game['url'] = urlparse.urljoin(response.url, links[0].xpath('./@href').extract()[0])`
			`game['year'] = links[1].xpath('./text()').extract()[0]`
			`path = urlparse.urlparse(game['url']).path`
			`game['key'] = path.split('/')[-1] or path.split('/')[-2]`
			`yield game`

			`screenshot_url = urlparse.urljoin(game['url'] + '/', "screenshots")`
			`print "fetching", screenshot_url, "for", game['key']`
			`yield scrapy.http.Request(screenshot_url, callback=self.parseScreenshots, meta={'key': game['key']})`

			`nextlink = response.xpath('//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href').extract()`
			`if len(nextlink) > 0:`
			`yield scrapy.http.Request(urlparse.urljoin(response.url, nextlink[0]))`

			`def parseScreenshots(self, response):`
			`for thumbdiv in response.xpath('//div[@class="thumbnail"]'):`
			`screenshot = Screenshot()`
			`screenshot['key'] = response.meta['key']`
			`screenshot['description'] = u' '.join(thumbdiv.xpath('.//div[@class="thumbnail-caption"]//text()').extract()).strip()`
			`screenshot['url'] = urlparse.urljoin(response.url, thumbdiv.xpath('.//a[@class="thumbnail-image"]/@href').extract()[0])`
			`yield screenshot`