From 0710281e824d388f944cdb2e8d977a1579c2ba6a Mon Sep 17 00:00:00 2001
From: Jeremy Penner
Date: Wed, 6 May 2015 09:10:41 -0400
Subject: [PATCH] Initial commit

---
Reviewer note: this patch was re-flowed and hand-edited after export. The
commit id above and the blob ids on the "index" lines still name the
original objects; hunk sizes and the diffstat below match the edited content.

 .gitignore                    |  1 +
 mobygames/__init__.py         |  0
 mobygames/items.py            | 25 +++++++++++
 mobygames/pipelines.py        | 14 ++++++
 mobygames/settings.py         | 17 ++++++++
 mobygames/spiders/__init__.py |  4 ++
 mobygames/spiders/games.py    | 81 +++++++++++++++++++++++++++++++++++
 scrapy.cfg                    | 11 +++++
 8 files changed, 153 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 mobygames/__init__.py
 create mode 100644 mobygames/items.py
 create mode 100644 mobygames/pipelines.py
 create mode 100644 mobygames/settings.py
 create mode 100644 mobygames/spiders/__init__.py
 create mode 100644 mobygames/spiders/games.py
 create mode 100644 scrapy.cfg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7e99e36
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
\ No newline at end of file
diff --git a/mobygames/__init__.py b/mobygames/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mobygames/items.py b/mobygames/items.py
new file mode 100644
index 0000000..a65a71d
--- /dev/null
+++ b/mobygames/items.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class Game(scrapy.Item):
+    """A game entry scraped from one row of the MobyGames game list."""
+
+    key = scrapy.Field()    # last path segment of the game's URL
+    title = scrapy.Field()  # text of the first link in the row
+    url = scrapy.Field()    # absolute URL of the game's page
+    year = scrapy.Field()   # text of the second link in the row
+
+
+class Screenshot(scrapy.Item):
+    """A screenshot thumbnail scraped from a game's screenshots page."""
+
+    key = scrapy.Field()          # key of the Game the screenshot belongs to
+    url = scrapy.Field()          # absolute URL the thumbnail links to
+    description = scrapy.Field()  # caption text of the thumbnail
diff --git a/mobygames/pipelines.py b/mobygames/pipelines.py
new file mode 100644
index 0000000..2aec291
--- /dev/null
+++ b/mobygames/pipelines.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class MobygamesPipeline(object):
+    """Placeholder pipeline that passes every item through unchanged."""
+
+    def process_item(self, item, spider):
+        """Return ``item`` as-is; ``spider`` is not used."""
+        return item
diff --git a/mobygames/settings.py b/mobygames/settings.py
new file mode 100644
index 0000000..f61ef69
--- /dev/null
+++ b/mobygames/settings.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for mobygames project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'mobygames'
+
+SPIDER_MODULES = ['mobygames.spiders']
+NEWSPIDER_MODULE = 'mobygames.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'scraping for a tumblr, will provide full attribution (+http://consultyourcodewheel.tumblr.com/)'
diff --git a/mobygames/spiders/__init__.py b/mobygames/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/mobygames/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/mobygames/spiders/games.py b/mobygames/spiders/games.py
new file mode 100644
index 0000000..da28c4f
--- /dev/null
+++ b/mobygames/spiders/games.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+import scrapy
+try:
+    import urlparse  # Python 2
+except ImportError:
+    from urllib import parse as urlparse  # Python 3
+from mobygames.items import Game, Screenshot
+
+
+class GamesSpider(scrapy.Spider):
+    """Crawl the MobyGames game list and each game's screenshots page.
+
+    Yields one ``Game`` per usable row of the list and one ``Screenshot``
+    per linked thumbnail on that game's screenshots page.
+    """
+
+    name = "games"
+    allowed_domains = ["mobygames.com"]
+    start_urls = (
+        'http://www.mobygames.com/browse/games/list-games/',
+    )
+
+    def parse(self, response):
+        """Parse one page of the game list.
+
+        Yields a ``Game`` and a request for its screenshots page for each
+        usable row, then a request for the next list page if one is linked.
+        """
+        # NOTE(review): this only matches if the served HTML contains a
+        # literal tbody element; confirm against the raw page, not a browser DOM.
+        gamerows = response.xpath('//table[@id="mof_object_list"]/tbody/tr')
+        for gamerow in gamerows:
+            links = gamerow.xpath('./td/a')
+            if len(links) < 2:
+                # Not a full game row. Indexing would raise IndexError and
+                # abort the rest of the page, including the next-page link.
+                continue
+            title = links[0].xpath('./text()').extract()
+            href = links[0].xpath('./@href').extract()
+            year = links[1].xpath('./text()').extract()
+            if not (title and href and year):
+                continue
+
+            game = Game()
+            game['title'] = title[0]
+            game['url'] = urlparse.urljoin(response.url, href[0])
+            game['year'] = year[0]
+            path = urlparse.urlparse(game['url']).path
+            # Last path segment, whether or not the URL ends with '/'.
+            game['key'] = path.split('/')[-1] or path.split('/')[-2]
+            yield game
+
+            # Force exactly one trailing '/' so "screenshots" is appended
+            # as a new segment and a URL already ending in '/' does not
+            # become ".../game//screenshots".
+            screenshot_url = urlparse.urljoin(game['url'].rstrip('/') + '/', "screenshots")
+            # Use the spider's logger rather than a Python 2 print statement.
+            self.log("fetching %s for %s" % (screenshot_url, game['key']))
+            yield scrapy.http.Request(screenshot_url, callback=self.parseScreenshots, meta={'key': game['key']})
+
+        nextlink = response.xpath('//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href').extract()
+        if nextlink:
+            # No callback is given, so Scrapy uses parse() for that page too.
+            yield scrapy.http.Request(urlparse.urljoin(response.url, nextlink[0]))
+
+    def parseScreenshots(self, response):
+        """Parse a game's screenshots page.
+
+        Yields a ``Screenshot`` for each thumbnail that has a link, tagged
+        with the game key passed in ``response.meta['key']``.
+        """
+        for thumbdiv in response.xpath('//div[@class="thumbnail"]'):
+            href = thumbdiv.xpath('.//a[@class="thumbnail-image"]/@href').extract()
+            if not href:
+                # No link means no URL to record; skip instead of raising.
+                continue
+            screenshot = Screenshot()
+            screenshot['key'] = response.meta['key']
+            screenshot['description'] = u' '.join(thumbdiv.xpath('.//div[@class="thumbnail-caption"]//text()').extract()).strip()
+            screenshot['url'] = urlparse.urljoin(response.url, href[0])
+            yield screenshot
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..34fed4d
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = mobygames.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = mobygames