47 lines
1.6 KiB
Python
47 lines
1.6 KiB
Python
|
import scrapy
|
||
|
from fmfridays.items import FileItem
|
||
|
from fmfridays.settings import TUMBLR
|
||
|
import urlparse
|
||
|
|
||
|
# XPath expressions per supported blog, keyed by the blog name that is
# looked up with the TUMBLR setting.
#   'post': selects the containers that are searched for audio-player iframes
#   'next': selects the href(s) followed to reach the next page of results
PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href',
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        # NOTE(review): this blog's pagination anchor carries id="prev";
        # presumably that is how its theme labels the older-posts link.
        'next': '//a[@id="prev"]/@href',
    },
}
|
||
|
class AudioSpider(scrapy.Spider):
    """Spider that walks the configured Tumblr blog and yields audio items.

    The blog is selected by the ``TUMBLR`` setting, which is used both to
    build the start URL / allowed domain and to pick the XPath set from
    ``PATHS``. Every audio-player iframe found inside a post is requested
    and turned into a ``FileItem`` by ``parse_audioplayer``.
    """

    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        """Yield requests for audio-player iframes and for the next page.

        :param response: response for one listing page of the blog.
        :returns: generator of ``scrapy.http.Request`` objects; iframe
            requests call back into ``parse_audioplayer``, pagination
            requests call back into this method.
        :raises KeyError: if ``TUMBLR`` has no entry in ``PATHS``.
        """
        paths = PATHS[TUMBLR]

        # find audio players
        for post in response.xpath(paths['post']):
            for src in post.xpath('.//iframe/@src').extract():
                if '/audio_player_iframe/' in src:
                    # Resolve against the page URL so relative or
                    # scheme-less iframe sources still form a valid request
                    # URL; absolute sources pass through unchanged.
                    yield scrapy.http.Request(
                        urlparse.urljoin(response.url, src),
                        callback=self.parse_audioplayer)

        # get next page
        for href in response.xpath(paths['next']).extract():
            # urljoin handles absolute links and links without a leading
            # slash, which plain host + href concatenation would corrupt.
            yield scrapy.http.Request(
                urlparse.urljoin(response.url, href),
                callback=self.parse)

    def parse_audioplayer(self, response):
        """Build a ``FileItem`` from an audio-player iframe response.

        The audio file location is read from the ``audio_file`` query
        parameter of the request URL. Nothing is yielded when that
        parameter is absent.

        :param response: response for an audio-player iframe request.
        :returns: generator yielding at most one ``FileItem`` with the
            fields ``url``, ``post_url`` and ``title``.
        """
        request_url = response.request.url
        query = urlparse.parse_qs(urlparse.urlparse(request_url).query)
        audio_files = query.get('audio_file')
        if not audio_files:
            return

        file_url = audio_files[0]
        post_keys = response.xpath(
            '//div[contains(@class, "audio_player_container")]'
            '/@data-post-key').extract()
        if post_keys:
            file_url += '?play_key=' + post_keys[0]
        else:
            # Previously a missing key raised IndexError and dropped the
            # item; keep the item and fall back to the bare file URL.
            self.log('no data-post-key found for %s' % request_url)

        track_names = response.xpath(
            '//li[@class="track_name"]/text()').extract()
        artist_names = response.xpath(
            '//li[@class="artist_name"]/text()').extract()

        item = FileItem()
        item['url'] = file_url
        item['post_url'] = request_url
        item['title'] = ' - '.join(track_names + artist_names)
        yield item
|
||
|
|