Initial commit
commit 6d9ce8b28a
.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
*.pyc
fmfridays/__init__.py (new empty file)
fmfridays/items.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FileItem(scrapy.Item):
    url = scrapy.Field()
    url2 = scrapy.Field()
    post_url = scrapy.Field()
    title = scrapy.Field()
    path = scrapy.Field()
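As a side note, scrapy.Item subclasses behave like dictionaries, so these fields are written and read with item['field'] syntax throughout the project. A minimal sketch with hypothetical values, assuming Scrapy is installed:

from fmfridays.items import FileItem

item = FileItem()
item['url'] = 'http://example.com/track.mp3'   # hypothetical value
item['title'] = 'Track - Artist'               # hypothetical value
print(dict(item))                              # an Item exposes its fields like a dict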
fmfridays/pipelines.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.exceptions import DropItem
from fmfridays.settings import TUMBLR
import os
import re

class MP3DownloadPipeline(MediaPipeline):
    def get_media_requests(self, item, info):
        if not os.path.exists(self.path_from_item(item)):
            yield scrapy.Request(item['url'])

    def path_from_item(self, item):
        return os.path.join('/Users/jpenner/Music/downloads/', TUMBLR, re.sub(r'[^-_!\(\),\'& a-zA-Z0-9]', '_', item['title']) + '.mp3')

    def item_completed(self, results, item, info):
        for ok, response in results:
            if ok:
                path = self.path_from_item(item)
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                with open(path, 'wb') as f:
                    f.write(response.body)
                item['path'] = path
        return item
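One compatibility note on the base-class import: scrapy.contrib.pipeline.media is the pre-1.0 module path; on Scrapy 1.0 and later the same class is exposed under scrapy.pipelines.media, so the equivalent import there (assuming a newer Scrapy) is:

from scrapy.pipelines.media import MediaPipeline   # Scrapy >= 1.0 module path

Also worth noting: item_completed receives a list of (success, result) tuples, one per media request, and with the plain MediaPipeline the success result is the downloaded Response itself, which is what lets the pipeline above write response.body straight to disk.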
fmfridays/settings.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-

# Scrapy settings for fmfridays project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'fmfridays'

SPIDER_MODULES = ['fmfridays.spiders']
NEWSPIDER_MODULE = 'fmfridays.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fmfridays (+http://www.yourdomain.com)'
ITEM_PIPELINES = ['fmfridays.pipelines.MP3DownloadPipeline']

DOWNLOAD_DELAY = 0.25

TUMBLR = 'bestofmidi'
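A small compatibility note on ITEM_PIPELINES: the list form above is the legacy syntax; newer Scrapy versions expect a dict that maps each pipeline path to an order value, so the equivalent setting (assuming a newer Scrapy) would be:

ITEM_PIPELINES = {
    'fmfridays.pipelines.MP3DownloadPipeline': 300,   # lower numbers run earlier
}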
fmfridays/spiders/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
fmfridays/spiders/audio_spider.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import scrapy
from fmfridays.items import FileItem
from fmfridays.settings import TUMBLR
import urlparse

PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href'
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        'next': '//a[@id="prev"]/@href'
    }
}
class AudioSpider(scrapy.Spider):
    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        # find audio players
        for post in response.xpath(PATHS[TUMBLR]['post']):
            for iframe in post.xpath('.//iframe/@src'):
                urlstring = iframe.extract()
                if '/audio_player_iframe/' in urlstring:
                    yield scrapy.http.Request(urlstring, callback=self.parse_audioplayer)

        # get next page
        for href in response.xpath(PATHS[TUMBLR]['next']):
            url = 'http://' + TUMBLR + '.tumblr.com' + href.extract()
            yield scrapy.http.Request(url, callback=self.parse)

    def parse_audioplayer(self, response):
        url = urlparse.urlparse(response.request.url)
        audio_url = urlparse.parse_qs(url.query).get('audio_file')
        if audio_url:
            item = FileItem()
            # item['url2'] = audio_url[0]
            item['url'] = audio_url[0] + '?play_key=' + response.xpath('//div[contains(@class, "audio_player_container")]/@data-post-key').extract()[0]
            item['post_url'] = response.request.url
            title = response.xpath('//li[@class="track_name"]/text()').extract()
            artist = response.xpath('//li[@class="artist_name"]/text()').extract()
            item['title'] = ' - '.join(title + artist)
            yield item
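For running this, the standard Scrapy workflow applies: from the project root (the directory containing scrapy.cfg), the spider is launched by its name attribute with the command scrapy crawl audio. One caveat: import urlparse is Python 2 only; under Python 3 the same urlparse()/parse_qs() functions live in urllib.parse, so a version-agnostic sketch of that import would be:

try:
    import urlparse                     # Python 2
except ImportError:
    import urllib.parse as urlparse     # Python 3 module providing the same functions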
scrapy.cfg (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = fmfridays.settings

[deploy]
#url = http://localhost:6800/
project = fmfridays