Initial commit
commit 6d9ce8b28a
.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
*.pyc
fmfridays/__init__.py (new empty file)
fmfridays/items.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FileItem(scrapy.Item):
    url = scrapy.Field()
    url2 = scrapy.Field()
    post_url = scrapy.Field()
    title = scrapy.Field()
    path = scrapy.Field()
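As a side note, scrapy.Item subclasses behave like dictionaries, so these fields are written and read with item['field'] syntax throughout the project. A minimal sketch with hypothetical values, assuming Scrapy is installed:

from fmfridays.items import FileItem

item = FileItem()
item['url'] = 'http://example.com/track.mp3'   # hypothetical value
item['title'] = 'Track - Artist'               # hypothetical value
print(dict(item))                              # an Item exposes its fields like a dict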
fmfridays/pipelines.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.exceptions import DropItem
from fmfridays.settings import TUMBLR
import os
import re

class MP3DownloadPipeline(MediaPipeline):
    def get_media_requests(self, item, info):
        if not os.path.exists(self.path_from_item(item)):
            yield scrapy.Request(item['url'])

    def path_from_item(self, item):
        return os.path.join('/Users/jpenner/Music/downloads/', TUMBLR, re.sub(r'[^-_!\(\),\'& a-zA-Z0-9]', '_', item['title']) + '.mp3')

    def item_completed(self, results, item, info):
        for ok, response in results:
            if ok:
                path = self.path_from_item(item)
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                with open(path, 'wb') as f:
                    f.write(response.body)
                item['path'] = path
        return item
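One compatibility note on the base-class import: scrapy.contrib.pipeline.media is the pre-1.0 module path; on Scrapy 1.0 and later the same class is exposed under scrapy.pipelines.media, so the equivalent import there (assuming a newer Scrapy) is:

from scrapy.pipelines.media import MediaPipeline   # Scrapy >= 1.0 module path

Also worth noting: item_completed receives a list of (success, result) tuples, one per media request, and with the plain MediaPipeline the success result is the downloaded Response itself, which is what lets the pipeline above write response.body straight to disk.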
fmfridays/settings.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-

# Scrapy settings for fmfridays project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'fmfridays'

SPIDER_MODULES = ['fmfridays.spiders']
NEWSPIDER_MODULE = 'fmfridays.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fmfridays (+http://www.yourdomain.com)'
ITEM_PIPELINES = ['fmfridays.pipelines.MP3DownloadPipeline']

DOWNLOAD_DELAY = 0.25

TUMBLR = 'bestofmidi'
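A small compatibility note on ITEM_PIPELINES: the list form above is the legacy syntax; newer Scrapy versions expect a dict that maps each pipeline path to an order value, so the equivalent setting (assuming a newer Scrapy) would be:

ITEM_PIPELINES = {
    'fmfridays.pipelines.MP3DownloadPipeline': 300,   # lower numbers run earlier
}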
fmfridays/spiders/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
fmfridays/spiders/audio_spider.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import scrapy
from fmfridays.items import FileItem
from fmfridays.settings import TUMBLR
import urlparse

PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href'
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        'next': '//a[@id="prev"]/@href'
    }
}
class AudioSpider(scrapy.Spider):
    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        # find audio players
        for post in response.xpath(PATHS[TUMBLR]['post']):
            for iframe in post.xpath('.//iframe/@src'):
                urlstring = iframe.extract()
                if '/audio_player_iframe/' in urlstring:
                    yield scrapy.http.Request(urlstring, callback=self.parse_audioplayer)

        # get next page
        for href in response.xpath(PATHS[TUMBLR]['next']):
            url = 'http://' + TUMBLR + '.tumblr.com' + href.extract()
            yield scrapy.http.Request(url, callback=self.parse)

    def parse_audioplayer(self, response):
        url = urlparse.urlparse(response.request.url)
        audio_url = urlparse.parse_qs(url.query).get('audio_file')
        if audio_url:
            item = FileItem()
            # item['url2'] = audio_url[0]
            item['url'] = audio_url[0] + '?play_key=' + response.xpath('//div[contains(@class, "audio_player_container")]/@data-post-key').extract()[0]
            item['post_url'] = response.request.url
            title = response.xpath('//li[@class="track_name"]/text()').extract()
            artist = response.xpath('//li[@class="artist_name"]/text()').extract()
            item['title'] = ' - '.join(title + artist)
            yield item
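For running this, the standard Scrapy workflow applies: from the project root (the directory containing scrapy.cfg), the spider is launched by its name attribute with the command scrapy crawl audio. One caveat: import urlparse is Python 2 only; under Python 3 the same urlparse()/parse_qs() functions live in urllib.parse, so a version-agnostic sketch of that import would be:

try:
    import urlparse                     # Python 2
except ImportError:
    import urllib.parse as urlparse     # Python 3 module providing the same functions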
scrapy.cfg (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = fmfridays.settings

[deploy]
#url = http://localhost:6800/
project = fmfridays