Initial commit
This commit is contained in:
commit
0710281e82
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
*.pyc
|
0
mobygames/__init__.py
Normal file
0
mobygames/__init__.py
Normal file
19
mobygames/items.py
Normal file
19
mobygames/items.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
class Game(scrapy.Item):
    """One game entry scraped from the MobyGames browse listing."""

    # Slug-style identifier taken from the last path segment of `url`.
    key = scrapy.Field()
    # Game title as shown in the listing.
    title = scrapy.Field()
    # Absolute URL of the game's detail page.
    url = scrapy.Field()
    # Release year text from the listing row.
    year = scrapy.Field()
|
class Screenshot(scrapy.Item):
    """One screenshot from a game's gallery page on MobyGames."""

    # Key of the Game this screenshot belongs to.
    key = scrapy.Field()
    # Absolute URL of the screenshot page/image link.
    url = scrapy.Field()
    # Caption text scraped from the thumbnail, whitespace-collapsed.
    description = scrapy.Field()
|
11
mobygames/pipelines.py
Normal file
11
mobygames/pipelines.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Define your item pipelines here
|
||||||
|
#
|
||||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||||
|
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
|
||||||
|
|
||||||
|
class MobygamesPipeline(object):
    """Default no-op item pipeline generated by `scrapy startproject`.

    Hook point for later cleaning, validation, or persistence; currently
    every item passes through unchanged.
    """

    def process_item(self, item, spider):
        # Nothing to do yet -- hand the item back untouched so any later
        # pipelines (and feed exports) still receive it.
        return item
|
17
mobygames/settings.py
Normal file
17
mobygames/settings.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# -*- coding: utf-8 -*-

# Scrapy settings for the mobygames project.
#
# Only the essential settings are configured here; everything else is
# documented at http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'mobygames'

# Where Scrapy discovers spiders, and where `scrapy genspider` puts new ones.
SPIDER_MODULES = ['mobygames.spiders']
NEWSPIDER_MODULE = 'mobygames.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'scraping for a tumblr, will provide full attribution (+http://consultyourcodewheel.tumblr.com/)'
|
4
mobygames/spiders/__init__.py
Normal file
4
mobygames/spiders/__init__.py
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
40
mobygames/spiders/games.py
Normal file
40
mobygames/spiders/games.py
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import scrapy
|
||||||
|
import urlparse
|
||||||
|
from mobygames.items import Game, Screenshot
|
||||||
|
|
||||||
|
|
||||||
|
class GamesSpider(scrapy.Spider):
    """Crawl MobyGames' full game listing.

    Yields one Game item per listing row, follows each game's
    screenshots page to yield Screenshot items, and paginates through
    the listing via its "Next" link.
    """

    name = "games"
    allowed_domains = ["mobygames.com"]
    start_urls = (
        'http://www.mobygames.com/browse/games/list-games/',
    )

    def parse(self, response):
        """Parse one listing page: yield Game items, screenshot-page
        requests, and a request for the next listing page (if any)."""
        gamerows = response.xpath('//table[@id="mof_object_list"]/tbody/tr')
        for gamerow in gamerows:
            links = gamerow.xpath('./td/a')
            # A well-formed row has at least a title link and a year
            # link; skip malformed rows instead of raising IndexError.
            if len(links) < 2:
                continue
            game = Game()
            game['title'] = links[0].xpath('./text()').extract()[0]
            game['url'] = urlparse.urljoin(response.url, links[0].xpath('./@href').extract()[0])
            game['year'] = links[1].xpath('./text()').extract()[0]
            # The key is the last non-empty path segment of the game URL
            # (the URL may or may not carry a trailing slash).
            path = urlparse.urlparse(game['url']).path
            game['key'] = path.split('/')[-1] or path.split('/')[-2]
            yield game

            screenshot_url = urlparse.urljoin(game['url'] + '/', "screenshots")
            # Use the spider's logging instead of a bare `print` so the
            # message respects Scrapy's configured log level/handlers.
            self.log("fetching %s for %s" % (screenshot_url, game['key']))
            yield scrapy.Request(screenshot_url, callback=self.parseScreenshots,
                                 meta={'key': game['key']})

        nextlink = response.xpath('//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href').extract()
        if nextlink:
            yield scrapy.Request(urlparse.urljoin(response.url, nextlink[0]))

    def parseScreenshots(self, response):
        """Parse a game's screenshot gallery into Screenshot items.

        The owning game's key is carried in via `response.meta['key']`.
        """
        for thumbdiv in response.xpath('//div[@class="thumbnail"]'):
            screenshot = Screenshot()
            screenshot['key'] = response.meta['key']
            # Join all caption text nodes into one normalized string.
            screenshot['description'] = u' '.join(
                thumbdiv.xpath('.//div[@class="thumbnail-caption"]//text()').extract()).strip()
            screenshot['url'] = urlparse.urljoin(
                response.url,
                thumbdiv.xpath('.//a[@class="thumbnail-image"]/@href').extract()[0])
            yield screenshot
|
11
scrapy.cfg
Normal file
11
scrapy.cfg
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = mobygames.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = mobygames
|
Loading…
Reference in a new issue