class FileItem(scrapy.Item):
    """A single audio file discovered on a Tumblr audio post."""

    # Direct download URL of the audio file (audio_file + play_key).
    url = scrapy.Field()

    # Spare URL field; the only assignment to it in the spider is
    # commented out.
    url2 = scrapy.Field()

    # URL of the audio-player iframe request the item was scraped from.
    post_url = scrapy.Field()

    # Track name and artist name joined with " - ".
    title = scrapy.Field()

    # Local filesystem path, filled in by the download pipeline once the
    # file has been written.
    path = scrapy.Field()
class MP3DownloadPipeline(MediaPipeline):
    """Download each item's audio file and save it as an ``.mp3`` on disk.

    Files are written to ``<DOWNLOAD_ROOT>/<TUMBLR>/<sanitised title>.mp3``.
    An item whose target file already exists is not requested again.
    """

    # Base directory for downloads.  Override in a subclass to store files
    # elsewhere; the default is the location this pipeline has always used.
    DOWNLOAD_ROOT = '/Users/jpenner/Music/downloads/'

    # Every character NOT in this whitelist is replaced by "_" when a title
    # is turned into a file name.  Compiled once instead of on every call.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[^-_!\(\),\'& a-zA-Z0-9]')

    def get_media_requests(self, item, info):
        """Yield a request for ``item['url']`` unless the file already exists."""
        if not os.path.exists(self.path_from_item(item)):
            yield scrapy.Request(item['url'])

    def path_from_item(self, item):
        """Return the local ``.mp3`` path derived from ``item['title']``."""
        filename = self._UNSAFE_FILENAME_CHARS.sub('_', item['title']) + '.mp3'
        return os.path.join(self.DOWNLOAD_ROOT, TUMBLR, filename)

    def item_completed(self, results, item, info):
        """Write every successfully downloaded body to disk.

        ``results`` is iterated as ``(ok, response)`` pairs.  For each pair
        with a true ``ok`` the response body is written to the item's path
        and that path is recorded in ``item['path']``.  Pairs with a false
        ``ok`` are skipped.  The item is always returned.
        """
        for ok, response in results:
            if not ok:
                # Nothing was downloaded for this request; leave the item
                # without a path rather than aborting the whole item.
                continue
            path = self.path_from_item(item)
            self._ensure_directory(os.path.dirname(path))
            with open(path, 'wb') as f:
                f.write(response.body)
            item['path'] = path
        return item

    @staticmethod
    def _ensure_directory(directory):
        """Create ``directory`` (and parents) if it does not exist yet."""
        try:
            os.makedirs(directory)
        except OSError:
            # Already existing is fine; anything else (permissions, a file
            # in the way, ...) is a real error and must surface.
            if not os.path.isdir(directory):
                raise
# XPath selectors per supported blog, keyed by the TUMBLR setting:
#   "post" selects the container of one post,
#   "next" selects the href of the link to the following page of posts.
PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href',
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        'next': '//a[@id="prev"]/@href',
    },
}


class AudioSpider(scrapy.Spider):
    """Walk a Tumblr blog page by page and emit one FileItem per audio post."""

    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        """Yield requests for every audio-player iframe and for the next page."""
        selectors = PATHS[TUMBLR]

        # Find audio players embedded in the posts of this page.
        for post in response.xpath(selectors['post']):
            for iframe in post.xpath('.//iframe/@src'):
                urlstring = iframe.extract()
                if '/audio_player_iframe/' in urlstring:
                    # Resolve against the page URL so a relative or
                    # protocol-relative src still yields a valid request.
                    player_url = urlparse.urljoin(response.url, urlstring)
                    yield scrapy.http.Request(
                        player_url, callback=self.parse_audioplayer)

        # Follow the link to the next page.  urljoin handles rooted,
        # relative and absolute hrefs alike, unlike plain concatenation
        # with the blog's host name.
        for href in response.xpath(selectors['next']):
            url = urlparse.urljoin(response.url, href.extract())
            yield scrapy.http.Request(url, callback=self.parse)

    def parse_audioplayer(self, response):
        """Build a FileItem from an audio-player iframe response.

        The audio file location is read from the ``audio_file`` query
        parameter of the request URL and the play key from the
        ``data-post-key`` attribute of the player container.  Nothing is
        yielded when either one is missing.
        """
        request_url = response.request.url
        query = urlparse.parse_qs(urlparse.urlparse(request_url).query)
        audio_url = query.get('audio_file')
        if not audio_url:
            return

        post_keys = response.xpath(
            '//div[contains(@class, "audio_player_container")]/@data-post-key'
        ).extract()
        if not post_keys:
            # Without the key no download URL can be built; skip this
            # player instead of failing the callback with an IndexError.
            self.log('No data-post-key found on %s; skipping' % request_url)
            return

        item = FileItem()
        item['url'] = audio_url[0] + '?play_key=' + post_keys[0]
        item['post_url'] = request_url
        title = response.xpath('//li[@class="track_name"]/text()').extract()
        artist = response.xpath('//li[@class="artist_name"]/text()').extract()
        item['title'] = ' - '.join(title + artist)
        yield item
http://doc.scrapy.org/en/latest/topics/scrapyd.html + +[settings] +default = fmfridays.settings + +[deploy] +#url = http://localhost:6800/ +project = fmfridays