Initial commit

This commit is contained in:
Jeremy Penner 2015-05-06 10:16:39 -04:00
commit 6d9ce8b28a
8 changed files with 132 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*.pyc

0
fmfridays/__init__.py Normal file
View file

17
fmfridays/items.py Normal file
View file

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class FileItem(scrapy.Item):
    """Scraped record describing one audio file found on a Tumblr post."""

    # Download URL of the audio file.
    url = scrapy.Field()
    # Secondary URL slot; no visible code assigns it.
    url2 = scrapy.Field()
    # URL of the page the item was scraped from.
    post_url = scrapy.Field()
    # Human-readable title; also used to derive the output file name.
    title = scrapy.Field()
    # Local filesystem path of the saved file, filled in after download.
    path = scrapy.Field()

31
fmfridays/pipelines.py Normal file
View file

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.exceptions import DropItem
from fmfridays.settings import TUMBLR
import os
import re
class MP3DownloadPipeline(MediaPipeline):
    """Download each item's audio file and save it to disk as an .mp3.

    Files are written to ``DOWNLOAD_ROOT/<TUMBLR>/<sanitized title>.mp3``.
    Items whose target file already exists are not requested again.
    """

    # Root directory for downloaded files; override in a subclass to relocate.
    DOWNLOAD_ROOT = '/Users/jpenner/Music/downloads/'

    # Characters outside this whitelist are replaced by '_' in file names.
    # Compiled once here instead of on every path_from_item() call.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[^-_!\(\),\'& a-zA-Z0-9]')

    def get_media_requests(self, item, info):
        """Yield a request for ``item['url']`` unless the file already exists."""
        if not os.path.exists(self.path_from_item(item)):
            yield scrapy.Request(item['url'])

    def path_from_item(self, item):
        """Return the destination path derived from ``item['title']``."""
        filename = self._UNSAFE_FILENAME_CHARS.sub('_', item['title']) + '.mp3'
        return os.path.join(self.DOWNLOAD_ROOT, TUMBLR, filename)

    def item_completed(self, results, item, info):
        """Write every successful download to disk and record where it went.

        ``results`` is iterated as ``(ok, response)`` pairs; entries whose
        ``ok`` flag is false are skipped. Returns ``item``, with
        ``item['path']`` set when a file was written.
        """
        for ok, response in results:
            if not ok:
                continue
            path = self.path_from_item(item)
            directory = os.path.dirname(path)
            # Create the per-blog directory on first use.
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(path, 'wb') as f:
                f.write(response.body)
            item['path'] = path
        return item

22
fmfridays/settings.py Normal file
View file

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Scrapy settings for fmfridays project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'fmfridays'

SPIDER_MODULES = ['fmfridays.spiders']
NEWSPIDER_MODULE = 'fmfridays.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fmfridays (+http://www.yourdomain.com)'

# Scrapy expects a dict mapping pipeline path -> order value; the bare list
# form is deprecated.
ITEM_PIPELINES = {'fmfridays.pipelines.MP3DownloadPipeline': 300}

# Delay between consecutive requests (Scrapy's DOWNLOAD_DELAY, in seconds).
DOWNLOAD_DELAY = 0.25

# Name of the Tumblr blog to crawl. Used as the subdomain of the start URL,
# as the key into the spider's PATHS table, and as the download subdirectory.
TUMBLR = 'bestofmidi'

View file

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View file

@ -0,0 +1,46 @@
import scrapy
from fmfridays.items import FileItem
from fmfridays.settings import TUMBLR
import urlparse
# XPath selectors per Tumblr blog, keyed by blog name:
#   'post' selects the elements that wrap an audio post,
#   'next' selects the href of the link to the following page.
PATHS = dict(
    fmfridays=dict(
        post='//section[@class="post"]',
        next='//a[@class="next"]/@href',
    ),
    bestofmidi=dict(
        post='//div[@class="audio"]',
        next='//a[@id="prev"]/@href',
    ),
)
class AudioSpider(scrapy.Spider):
    """Crawl a Tumblr blog page by page and emit a FileItem per audio player."""

    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        """Request every audio-player iframe on the page, then the next page."""
        # find audio players
        for post in response.xpath(PATHS[TUMBLR]['post']):
            for iframe in post.xpath('.//iframe/@src'):
                urlstring = iframe.extract()
                if '/audio_player_iframe/' in urlstring:
                    # Resolve against the page URL so relative or
                    # protocol-relative src values still form a valid request.
                    yield scrapy.http.Request(
                        urlparse.urljoin(response.url, urlstring),
                        callback=self.parse_audioplayer)
        # get next page
        for href in response.xpath(PATHS[TUMBLR]['next']):
            # urljoin handles absolute, root-relative and relative hrefs alike;
            # plain concatenation only worked for root-relative ones.
            url = urlparse.urljoin(response.url, href.extract())
            yield scrapy.http.Request(url, callback=self.parse)

    def parse_audioplayer(self, response):
        """Build a FileItem from an audio-player iframe page.

        The audio URL comes from the ``audio_file`` query parameter of the
        request URL and is combined with the page's ``data-post-key``
        attribute. Nothing is yielded when either one is missing.
        """
        url = urlparse.urlparse(response.request.url)
        audio_url = urlparse.parse_qs(url.query).get('audio_file')
        if not audio_url:
            return
        post_key = response.xpath(
            '//div[contains(@class, "audio_player_container")]/@data-post-key'
        ).extract()
        if not post_key:
            # Indexing the empty result used to raise IndexError here.
            self.log('No data-post-key found on %s; skipping'
                     % response.request.url)
            return
        item = FileItem()
        # item['url2'] = audio_url[0]
        item['url'] = audio_url[0] + '?play_key=' + post_key[0]
        item['post_url'] = response.request.url
        title = response.xpath('//li[@class="track_name"]/text()').extract()
        artist = response.xpath('//li[@class="artist_name"]/text()').extract()
        item['title'] = ' - '.join(title + artist)
        yield item

11
scrapy.cfg Normal file
View file

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
default = fmfridays.settings
[deploy]
#url = http://localhost:6800/
project = fmfridays