Initial commit
This commit is contained in:
commit
6d9ce8b28a
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
*.pyc
|
0
fmfridays/__init__.py
Normal file
0
fmfridays/__init__.py
Normal file
17
fmfridays/items.py
Normal file
17
fmfridays/items.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class FileItem(scrapy.Item):
    """Scrapy item describing one downloadable audio file found on a Tumblr blog."""
    # Direct URL of the audio file (the spider appends a '?play_key=' query).
    url = scrapy.Field()
    # NOTE(review): only referenced from commented-out spider code — appears unused.
    url2 = scrapy.Field()
    # URL of the audio-player iframe page the item was extracted from.
    post_url = scrapy.Field()
    # Display title ("track - artist"); the pipeline derives the file name from it.
    title = scrapy.Field()
    # Local filesystem path the mp3 was written to (set by the download pipeline).
    path = scrapy.Field()
|
||||||
|
|
31
fmfridays/pipelines.py
Normal file
31
fmfridays/pipelines.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Define your item pipelines here
|
||||||
|
#
|
||||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||||
|
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
import scrapy
|
||||||
|
from scrapy.contrib.pipeline.media import MediaPipeline
|
||||||
|
from scrapy.exceptions import DropItem
|
||||||
|
from fmfridays.settings import TUMBLR
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
class MP3DownloadPipeline(MediaPipeline):
    """Media pipeline that downloads each item's mp3 into a local directory.

    A download request is issued only when the target file does not already
    exist; the item's title is sanitised into a safe file name, and the final
    path is recorded back onto the item.
    """

    # Compiled once instead of on every path_from_item() call.
    # Any character NOT in this whitelist is replaced with '_'.
    _UNSAFE_CHARS = re.compile(r'[^-_!\(\),\'& a-zA-Z0-9]')

    # Destination root. Lifted out of path_from_item() so the user-specific
    # hard-coded path can be overridden on the class or instance.
    BASE_DIR = '/Users/jpenner/Music/downloads/'

    def get_media_requests(self, item, info):
        """Yield a request for item['url'] unless the file is already on disk."""
        if not os.path.exists(self.path_from_item(item)):
            yield scrapy.Request(item['url'])

    def path_from_item(self, item):
        """Return ``<BASE_DIR>/<TUMBLR>/<sanitised title>.mp3`` for *item*."""
        safe_title = self._UNSAFE_CHARS.sub('_', item['title'])
        return os.path.join(self.BASE_DIR, TUMBLR, safe_title + '.mp3')

    def item_completed(self, results, item, info):
        """Write each successful download to disk and record its path on the item.

        Failed downloads (ok == False) are silently skipped, matching the
        original best-effort behaviour.
        """
        for ok, response in results:
            if ok:
                path = self.path_from_item(item)
                directory = os.path.dirname(path)
                if not os.path.exists(directory):
                    os.makedirs(directory)
                with open(path, 'wb') as f:
                    f.write(response.body)
                item['path'] = path
        return item
|
22
fmfridays/settings.py
Normal file
22
fmfridays/settings.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Scrapy settings for fmfridays project
|
||||||
|
#
|
||||||
|
# For simplicity, this file contains only the most important settings by
|
||||||
|
# default. All the other settings are documented here:
|
||||||
|
#
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/settings.html
|
||||||
|
#
|
||||||
|
|
||||||
|
# Scrapy project identity.
BOT_NAME = 'fmfridays'

SPIDER_MODULES = ['fmfridays.spiders']
NEWSPIDER_MODULE = 'fmfridays.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fmfridays (+http://www.yourdomain.com)'

# Single pipeline: downloads each scraped mp3 to disk.
# NOTE(review): the list form of ITEM_PIPELINES is the legacy Scrapy format;
# newer Scrapy versions expect a {path: priority} dict — confirm target version.
ITEM_PIPELINES = ['fmfridays.pipelines.MP3DownloadPipeline']

# Seconds to wait between requests — be polite to tumblr.com.
DOWNLOAD_DELAY = 0.25

# Tumblr subdomain to scrape; read by both the spider and the pipeline,
# and must be a key of PATHS in fmfridays/spiders/audio_spider.py.
TUMBLR = 'bestofmidi'
|
4
fmfridays/spiders/__init__.py
Normal file
4
fmfridays/spiders/__init__.py
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
46
fmfridays/spiders/audio_spider.py
Normal file
46
fmfridays/spiders/audio_spider.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
import scrapy
|
||||||
|
from fmfridays.items import FileItem
|
||||||
|
from fmfridays.settings import TUMBLR
|
||||||
|
import urlparse
|
||||||
|
|
||||||
|
# Per-blog XPath selectors, keyed by the TUMBLR setting:
#   'post' — matches each audio-post container on a listing page
#   'next' — extracts the href of the link to the following page
PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href'
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        'next': '//a[@id="prev"]/@href'
    }
}
|
||||||
|
class AudioSpider(scrapy.Spider):
    """Crawl the Tumblr blog selected by the TUMBLR setting and yield a
    FileItem for every embedded Tumblr audio player found."""

    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        """Scan a listing page for audio-player iframes and follow pagination."""
        # Find audio players embedded in each post.
        for post in response.xpath(PATHS[TUMBLR]['post']):
            for iframe in post.xpath('.//iframe/@src'):
                urlstring = iframe.extract()
                if '/audio_player_iframe/' in urlstring:
                    yield scrapy.Request(urlstring, callback=self.parse_audioplayer)

        # Follow the next-page link, if any.
        for href in response.xpath(PATHS[TUMBLR]['next']):
            # NOTE(review): assumes the extracted href is a root-relative
            # path — urlparse.urljoin(response.url, ...) would be safer.
            url = 'http://' + TUMBLR + '.tumblr.com' + href.extract()
            yield scrapy.Request(url, callback=self.parse)

    def parse_audioplayer(self, response):
        """Extract the mp3 URL and title from an audio-player iframe page."""
        # The raw file URL is carried in the iframe's 'audio_file' query param.
        url = urlparse.urlparse(response.request.url)
        audio_url = urlparse.parse_qs(url.query).get('audio_file')
        if audio_url:
            # Tumblr requires a per-post key appended to the file URL.
            post_keys = response.xpath(
                '//div[contains(@class, "audio_player_container")]'
                '/@data-post-key').extract()
            if not post_keys:
                # BUGFIX: the original indexed .extract()[0] unconditionally,
                # raising IndexError on pages without a player container.
                return
            item = FileItem()
            item['url'] = audio_url[0] + '?play_key=' + post_keys[0]
            item['post_url'] = response.request.url
            title = response.xpath('//li[@class="track_name"]/text()').extract()
            artist = response.xpath('//li[@class="artist_name"]/text()').extract()
            # "Track - Artist"; either selector may legitimately match nothing.
            item['title'] = ' - '.join(title + artist)
            yield item
|
||||||
|
|
11
scrapy.cfg
Normal file
11
scrapy.cfg
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = fmfridays.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = fmfridays
|
Loading…
Reference in a new issue