Initial commit

Jeremy Penner 2015-05-06 10:16:39 -04:00
commit 6d9ce8b28a
8 changed files with 132 additions and 0 deletions

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
import scrapy
class FileItem(scrapy.Item):
    """Scraped record describing one downloadable audio file."""

    # Download URL of the audio file (the spider appends a play key to it).
    url = scrapy.Field()
    # Secondary URL slot; only referenced from commented-out spider code.
    url2 = scrapy.Field()
    # URL of the audio-player page the item was extracted from.
    post_url = scrapy.Field()
    # Human-readable title, built by the spider from track and artist names.
    title = scrapy.Field()
    # Local filesystem path, filled in by the download pipeline once saved.
    path = scrapy.Field()

# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See:
import scrapy
from import MediaPipeline
from scrapy.exceptions import DropItem
from fmfridays.settings import TUMBLR
import os
import re
class MP3DownloadPipeline(MediaPipeline):
    """Download each item's audio file and store it on disk as an MP3.

    Files are written to ``/Users/jpenner/Music/downloads/<TUMBLR>/`` using a
    sanitised version of the item title as the file name. Items whose target
    file already exists are not downloaded again.
    """

    def get_media_requests(self, item, info):
        """Yield a request for ``item['url']`` unless the file is already saved."""
        if not os.path.exists(self.path_from_item(item)):
            yield scrapy.Request(item['url'])

    def path_from_item(self, item):
        """Return the destination path of the MP3 for ``item``.

        Every character of ``item['title']`` outside the allowed set is
        replaced with ``_`` so the title is safe to use as a file name.
        """
        safe_title = re.sub(r'[^-_!\(\),\'& a-zA-Z0-9]', '_', item['title'])
        return os.path.join('/Users/jpenner/Music/downloads/', TUMBLR, safe_title + '.mp3')

    def item_completed(self, results, item, info):
        """Write every successfully downloaded response to disk.

        ``results`` is an iterable of ``(ok, response)`` pairs. For each
        successful pair the response body is saved to the item's target path
        and that path is recorded in ``item['path']``. The item is always
        returned so later pipeline stages still receive it.
        """
        for ok, response in results:
            if ok:
                path = self.path_from_item(item)
                directory = os.path.dirname(path)
                # The per-blog download directory may not exist on first run.
                if not os.path.exists(directory):
                    os.makedirs(directory)
                with open(path, 'wb') as f:
                    f.write(response.body)
                item['path'] = path
        return item

# -*- coding: utf-8 -*-
# Scrapy settings for fmfridays project
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
# Name of this Scrapy project/bot.
BOT_NAME = 'fmfridays'
# Package(s) that hold the project's spiders, and where new ones are created.
SPIDER_MODULES = ['fmfridays.spiders']
NEWSPIDER_MODULE = 'fmfridays.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fmfridays (+'
# Item pipelines to run, given by dotted path; only the MP3 downloader is enabled.
ITEM_PIPELINES = ['fmfridays.pipelines.MP3DownloadPipeline']
# Project-specific setting: name of the blog to scrape. It is imported by both
# the pipeline (download sub-directory) and the spider (host name and XPath set).
TUMBLR = 'bestofmidi'

# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.

import scrapy
from fmfridays.items import FileItem
from fmfridays.settings import TUMBLR
import urlparse
# XPath expressions per blog, keyed by the TUMBLR setting value:
#   'post' selects the elements that wrap an audio post,
#   'next' selects the href of the link to the following page.
PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href',
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        'next': '//a[@id="prev"]/@href',
    },
}
class AudioSpider(scrapy.Spider):
    """Crawl the configured blog and emit a FileItem per audio post.

    The listing pages are walked via the blog-specific 'next' XPath; every
    audio-player iframe found inside a post is fetched and turned into an
    item by ``parse_audioplayer``.
    """

    name = 'audio'
    # NOTE(review): the reviewed source concatenated an empty string here,
    # which yields a host without a domain. '.tumblr.com' is assumed from the
    # TUMBLR setting name -- confirm against the original file.
    allowed_domains = [TUMBLR + '.tumblr.com']
    start_urls = ['http://' + TUMBLR + '.tumblr.com']

    def parse(self, response):
        """Yield requests for audio-player iframes and for the next page."""
        # find audio players
        for post in response.xpath(PATHS[TUMBLR]['post']):
            for iframe in post.xpath('.//iframe/@src'):
                urlstring = iframe.extract()
                if '/audio_player_iframe/' in urlstring:
                    yield scrapy.http.Request(urlstring, callback=self.parse_audioplayer)
        # get next page
        for href in response.xpath(PATHS[TUMBLR]['next']):
            # urljoin resolves both relative and absolute hrefs against the
            # page that was actually fetched.
            url = urlparse.urljoin(response.url, href.extract())
            yield scrapy.http.Request(url, callback=self.parse)

    def parse_audioplayer(self, response):
        """Build a FileItem from an audio-player iframe page.

        The audio file URL is read from the ``audio_file`` query parameter of
        the request URL and the play key from the player container element.
        Nothing is yielded when either of them is missing.
        """
        request_url = response.request.url
        query = urlparse.parse_qs(urlparse.urlparse(request_url).query)
        audio_url = query.get('audio_file')
        if not audio_url:
            return

        post_key = response.xpath(
            '//div[contains(@class, "audio_player_container")]/@data-post-key'
        ).extract()
        if not post_key:
            # Without the key no download URL can be built; skip the post
            # instead of failing with an IndexError.
            self.log('No play key found on %s; skipping' % request_url)
            return

        item = FileItem()
        item['url'] = audio_url[0] + '?play_key=' + post_key[0]
        item['post_url'] = request_url
        title = response.xpath('//li[@class="track_name"]/text()').extract()
        artist = response.xpath('//li[@class="artist_name"]/text()').extract()
        item['title'] = ' - '.join(title + artist)
        yield item

# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
[settings]
default = fmfridays.settings

[deploy]
#url = http://localhost:6800/
project = fmfridays