41 lines
1.7 KiB
Python
41 lines
1.7 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
import scrapy
|
||
|
import urlparse
|
||
|
from mobygames.items import Game, Screenshot
|
||
|
|
||
|
|
||
|
class GamesSpider(scrapy.Spider):
    """Crawl the MobyGames game list and each game's screenshots page.

    ``parse`` walks the paginated game list, yielding one ``Game`` item per
    usable table row together with a request for that game's screenshots
    page. ``parseScreenshots`` turns each thumbnail on a screenshots page
    into a ``Screenshot`` item carrying the key of the game it belongs to.
    """

    name = "games"
    allowed_domains = ["mobygames.com"]
    start_urls = (
        'http://www.mobygames.com/browse/games/list-games/',
    )

    def parse(self, response):
        """Yield games, screenshot-page requests and the next list page.

        For every row of the ``mof_object_list`` table that carries at least
        two links, the first link supplies the title and URL and the second
        supplies the year. Rows that lack this data are logged and skipped.

        Yields:
            ``Game`` items, one ``scrapy.http.Request`` per game (handled by
            ``parseScreenshots``, with the game key passed in ``meta``), and
            finally a request for the "Next" page when such a link exists.
        """
        for gamerow in response.xpath('//table[@id="mof_object_list"]/tbody/tr'):
            links = gamerow.xpath('./td/a')
            if len(links) < 2:
                # Indexing a short row would raise IndexError and terminate
                # this generator, dropping the remaining rows and the
                # next-page request below.
                self.log("skipping row with fewer than two links on %s" % response.url)
                continue

            titles = links[0].xpath('./text()').extract()
            hrefs = links[0].xpath('./@href').extract()
            years = links[1].xpath('./text()').extract()
            if not (titles and hrefs and years):
                self.log("skipping row with missing title, href or year on %s" % response.url)
                continue

            game = Game()
            game['title'] = titles[0]
            game['url'] = urlparse.urljoin(response.url, hrefs[0])
            game['year'] = years[0]

            # The key is the last path segment, ignoring any trailing slash;
            # this form cannot raise on an empty path.
            path = urlparse.urlparse(game['url']).path
            game['key'] = path.rstrip('/').rsplit('/', 1)[-1]
            yield game

            # Normalise to exactly one trailing slash so the join appends
            # "screenshots" as a child segment without producing "//".
            screenshot_url = urlparse.urljoin(game['url'].rstrip('/') + '/', "screenshots")
            self.log("fetching %s for %s" % (screenshot_url, game['key']))
            yield scrapy.http.Request(
                screenshot_url,
                callback=self.parseScreenshots,
                meta={'key': game['key']},
            )

        nextlink = response.xpath(
            '//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href'
        ).extract()
        if nextlink:
            yield scrapy.http.Request(urlparse.urljoin(response.url, nextlink[0]))

    def parseScreenshots(self, response):
        """Yield one ``Screenshot`` item per thumbnail on a screenshots page.

        The game key is read from ``response.meta['key']``, which ``parse``
        sets on the request. Thumbnails without a ``thumbnail-image`` link
        are logged and skipped so the remaining ones are still emitted.
        """
        key = response.meta['key']
        for thumbdiv in response.xpath('//div[@class="thumbnail"]'):
            hrefs = thumbdiv.xpath('.//a[@class="thumbnail-image"]/@href').extract()
            if not hrefs:
                self.log("skipping thumbnail without image link on %s" % response.url)
                continue

            caption_parts = thumbdiv.xpath('.//div[@class="thumbnail-caption"]//text()').extract()

            screenshot = Screenshot()
            screenshot['key'] = key
            screenshot['description'] = u' '.join(caption_parts).strip()
            screenshot['url'] = urlparse.urljoin(response.url, hrefs[0])
            yield screenshot
|