Initial commit
This commit is contained in:
commit
0710281e82
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
*.pyc
|
0
mobygames/__init__.py
Normal file
0
mobygames/__init__.py
Normal file
19
mobygames/items.py
Normal file
19
mobygames/items.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
class Game(scrapy.Item):
    """One game entry scraped from the MobyGames browse listing."""

    # Slug-style identifier taken from the last path segment of `url`.
    key = scrapy.Field()
    # Game title as shown in the listing.
    title = scrapy.Field()
    # Absolute URL of the game's detail page.
    url = scrapy.Field()
    # Release year text from the listing row.
    year = scrapy.Field()
|
class Screenshot(scrapy.Item):
    """One screenshot from a game's gallery page on MobyGames."""

    # Key of the Game this screenshot belongs to.
    key = scrapy.Field()
    # Absolute URL of the screenshot page/image link.
    url = scrapy.Field()
    # Caption text scraped from the thumbnail, whitespace-collapsed.
    description = scrapy.Field()
|
11
mobygames/pipelines.py
Normal file
11
mobygames/pipelines.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Define your item pipelines here
|
||||||
|
#
|
||||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||||
|
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
|
||||||
|
|
||||||
|
class MobygamesPipeline(object):
    """Default no-op item pipeline generated by `scrapy startproject`.

    Hook point for later cleaning, validation, or persistence; currently
    every item passes through unchanged.
    """

    def process_item(self, item, spider):
        # Nothing to do yet -- hand the item back untouched so any later
        # pipelines (and feed exports) still receive it.
        return item
|
17
mobygames/settings.py
Normal file
17
mobygames/settings.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# -*- coding: utf-8 -*-

# Scrapy settings for the mobygames project.
#
# Only the essential settings are configured here; everything else is
# documented at http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'mobygames'

# Where Scrapy discovers spiders, and where `scrapy genspider` puts new ones.
SPIDER_MODULES = ['mobygames.spiders']
NEWSPIDER_MODULE = 'mobygames.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'scraping for a tumblr, will provide full attribution (+http://consultyourcodewheel.tumblr.com/)'
|
4
mobygames/spiders/__init__.py
Normal file
4
mobygames/spiders/__init__.py
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
40
mobygames/spiders/games.py
Normal file
40
mobygames/spiders/games.py
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import scrapy
|
||||||
|
import urlparse
|
||||||
|
from mobygames.items import Game, Screenshot
|
||||||
|
|
||||||
|
|
||||||
|
class GamesSpider(scrapy.Spider):
    """Crawl MobyGames' full game listing.

    Yields one Game item per listing row, follows each game's
    screenshots page to yield Screenshot items, and paginates through
    the listing via its "Next" link.
    """

    name = "games"
    allowed_domains = ["mobygames.com"]
    start_urls = (
        'http://www.mobygames.com/browse/games/list-games/',
    )

    def parse(self, response):
        """Parse one listing page: yield Game items, screenshot-page
        requests, and a request for the next listing page (if any)."""
        gamerows = response.xpath('//table[@id="mof_object_list"]/tbody/tr')
        for gamerow in gamerows:
            links = gamerow.xpath('./td/a')
            # A well-formed row has at least a title link and a year
            # link; skip malformed rows instead of raising IndexError.
            if len(links) < 2:
                continue
            game = Game()
            game['title'] = links[0].xpath('./text()').extract()[0]
            game['url'] = urlparse.urljoin(response.url, links[0].xpath('./@href').extract()[0])
            game['year'] = links[1].xpath('./text()').extract()[0]
            # The key is the last non-empty path segment of the game URL
            # (the URL may or may not carry a trailing slash).
            path = urlparse.urlparse(game['url']).path
            game['key'] = path.split('/')[-1] or path.split('/')[-2]
            yield game

            screenshot_url = urlparse.urljoin(game['url'] + '/', "screenshots")
            # Use the spider's logging instead of a bare `print` so the
            # message respects Scrapy's configured log level/handlers.
            self.log("fetching %s for %s" % (screenshot_url, game['key']))
            yield scrapy.Request(screenshot_url, callback=self.parseScreenshots,
                                 meta={'key': game['key']})

        nextlink = response.xpath('//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href').extract()
        if nextlink:
            yield scrapy.Request(urlparse.urljoin(response.url, nextlink[0]))

    def parseScreenshots(self, response):
        """Parse a game's screenshot gallery into Screenshot items.

        The owning game's key is carried in via `response.meta['key']`.
        """
        for thumbdiv in response.xpath('//div[@class="thumbnail"]'):
            screenshot = Screenshot()
            screenshot['key'] = response.meta['key']
            # Join all caption text nodes into one normalized string.
            screenshot['description'] = u' '.join(
                thumbdiv.xpath('.//div[@class="thumbnail-caption"]//text()').extract()).strip()
            screenshot['url'] = urlparse.urljoin(
                response.url,
                thumbdiv.xpath('.//a[@class="thumbnail-image"]/@href').extract()[0])
            yield screenshot
|
11
scrapy.cfg
Normal file
11
scrapy.cfg
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = mobygames.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = mobygames
|
Loading…
Reference in a new issue