From 0710281e824d388f944cdb2e8d977a1579c2ba6a Mon Sep 17 00:00:00 2001
From: Jeremy Penner <jeremy@Lappy386.local>
Date: Wed, 6 May 2015 09:10:41 -0400
Subject: [PATCH] Initial commit

---
 .gitignore                    |  1 +
 mobygames/__init__.py         |  0
 mobygames/items.py            | 19 +++++++++++++++++
 mobygames/pipelines.py        | 11 ++++++++++
 mobygames/settings.py         | 17 +++++++++++++++
 mobygames/spiders/__init__.py |  4 ++++
 mobygames/spiders/games.py    | 40 +++++++++++++++++++++++++++++++++++
 scrapy.cfg                    | 11 ++++++++++
 8 files changed, 103 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 mobygames/__init__.py
 create mode 100644 mobygames/items.py
 create mode 100644 mobygames/pipelines.py
 create mode 100644 mobygames/settings.py
 create mode 100644 mobygames/spiders/__init__.py
 create mode 100644 mobygames/spiders/games.py
 create mode 100644 scrapy.cfg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7e99e36
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
\ No newline at end of file
diff --git a/mobygames/__init__.py b/mobygames/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mobygames/items.py b/mobygames/items.py
new file mode 100644
index 0000000..a65a71d
--- /dev/null
+++ b/mobygames/items.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+class Game(scrapy.Item):  # one entry scraped from the game list page
+    key = scrapy.Field()  # last non-empty path segment of `url`
+    title = scrapy.Field()  # text of the row's first link
+    url = scrapy.Field()  # absolute URL of the game's page
+    year = scrapy.Field()  # text of the row's second link
+
+class Screenshot(scrapy.Item):  # one thumbnail from a game's screenshots page
+    key = scrapy.Field()  # key of the Game this screenshot belongs to
+    url = scrapy.Field()  # absolute href of the thumbnail's image link
+    description = scrapy.Field()  # caption text, space-joined and stripped
diff --git a/mobygames/pipelines.py b/mobygames/pipelines.py
new file mode 100644
index 0000000..2aec291
--- /dev/null
+++ b/mobygames/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class MobygamesPipeline(object):  # pass-through stage; this patch's settings.py sets no ITEM_PIPELINES
+    def process_item(self, item, spider):  # `spider` is not used
+        return item  # hand the item on unchanged
diff --git a/mobygames/settings.py b/mobygames/settings.py
new file mode 100644
index 0000000..f61ef69
--- /dev/null
+++ b/mobygames/settings.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for mobygames project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'mobygames'
+
+SPIDER_MODULES = ['mobygames.spiders']  # the package that holds spiders/games.py
+NEWSPIDER_MODULE = 'mobygames.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'scraping for a tumblr, will provide full attribution (+http://consultyourcodewheel.tumblr.com/)'
diff --git a/mobygames/spiders/__init__.py b/mobygames/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/mobygames/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/mobygames/spiders/games.py b/mobygames/spiders/games.py
new file mode 100644
index 0000000..da28c4f
--- /dev/null
+++ b/mobygames/spiders/games.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import urlparse
+from mobygames.items import Game, Screenshot
+
+
+class GamesSpider(scrapy.Spider):
+    """Crawl the MobyGames game list; emit Game and Screenshot items."""
+    name = "games"
+    allowed_domains = ["mobygames.com"]
+    start_urls = ('http://www.mobygames.com/browse/games/list-games/',)
+
+    def parse(self, response):
+        """Yield a Game and a screenshots Request per row, then the next page."""
+        for gamerow in response.xpath('//table[@id="mof_object_list"]/tbody/tr'):
+            links = gamerow.xpath('./td/a')
+            try:  # first link carries title and href, second link the year
+                title = links[0].xpath('./text()').extract()[0]
+                href = links[0].xpath('./@href').extract()[0]
+                year = links[1].xpath('./text()').extract()[0]
+            except IndexError:
+                continue  # incomplete row: skip it instead of aborting the page
+            url = urlparse.urljoin(response.url, href)
+            key = urlparse.urlparse(url).path.rstrip('/').split('/')[-1]  # last non-empty segment
+            yield Game(key=key, title=title, url=url, year=year)
+            shots_url = url.rstrip('/') + '/screenshots'  # no "//" after a trailing slash
+            self.log("fetching %s for %s" % (shots_url, key))
+            yield scrapy.http.Request(shots_url, callback=self.parseScreenshots, meta={'key': key})
+
+        nextlink = response.xpath('//td[@class="mobHeaderNav"]/a[text()[contains(.,"Next")]]/@href').extract()
+        if nextlink:  # no "Next" link found: nothing further to request
+            yield scrapy.http.Request(urlparse.urljoin(response.url, nextlink[0]))
+
+    def parseScreenshots(self, response):
+        """Yield one Screenshot per thumbnail, tagged with response.meta['key']."""
+        for thumbdiv in response.xpath('//div[@class="thumbnail"]'):
+            href = thumbdiv.xpath('.//a[@class="thumbnail-image"]/@href').extract()
+            caption = u' '.join(thumbdiv.xpath('.//div[@class="thumbnail-caption"]//text()').extract()).strip()
+            if href:  # a thumbnail without an image link is skipped, not fatal
+                yield Screenshot(key=response.meta['key'], description=caption, url=urlparse.urljoin(response.url, href[0]))
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..34fed4d
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = mobygames.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = mobygames