47 lines
1.6 KiB
Python
47 lines
1.6 KiB
Python
import scrapy
|
|
from fmfridays.items import FileItem
|
|
from fmfridays.settings import TUMBLR
|
|
import urlparse
|
|
|
|
# XPath expressions for each supported Tumblr blog, keyed by the blog name
# that the TUMBLR setting is expected to hold.
#   'post': selects the post containers that are searched for iframes.
#   'next': selects the href(s) followed to reach the next page of posts.
PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href',
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        # NOTE: for this blog the link followed as "next" carries id="prev".
        'next': '//a[@id="prev"]/@href',
    },
}
|
|
class AudioSpider(scrapy.Spider):
    """Crawl a Tumblr blog and yield one FileItem per audio player found.

    The blog is chosen by the ``TUMBLR`` setting, which must also be a key
    of ``PATHS`` so the post and pagination XPath expressions can be
    looked up.
    """

    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        """Yield requests for audio-player iframes and for the next page.

        Every iframe inside a post whose ``src`` contains
        ``/audio_player_iframe/`` is scheduled for ``parse_audioplayer``;
        every pagination link is scheduled for ``parse`` again.
        """
        paths = PATHS[TUMBLR]

        # find audio players
        for post in response.xpath(paths['post']):
            for src in post.xpath('.//iframe/@src').extract():
                if '/audio_player_iframe/' in src:
                    # Resolve against the page URL so relative or
                    # protocol-relative iframe sources still give a valid
                    # request URL.
                    yield scrapy.http.Request(
                        urlparse.urljoin(response.url, src),
                        callback=self.parse_audioplayer,
                    )

        # get next page
        for href in response.xpath(paths['next']).extract():
            # urljoin handles absolute hrefs as well as relative ones;
            # plain string concatenation would corrupt an absolute href.
            yield scrapy.http.Request(
                urlparse.urljoin(response.url, href),
                callback=self.parse,
            )

    def parse_audioplayer(self, response):
        """Yield a FileItem describing the audio file of a player iframe.

        The audio URL is read from the ``audio_file`` query parameter of
        the request URL and combined with the ``data-post-key`` attribute
        found in the page. Nothing is yielded when either piece is missing.
        """
        request_url = response.request.url
        query = urlparse.urlparse(request_url).query
        audio_url = urlparse.parse_qs(query).get('audio_file')
        if not audio_url:
            return

        post_key = response.xpath(
            '//div[contains(@class, "audio_player_container")]/@data-post-key'
        ).extract()
        if not post_key:
            # Previously this case raised IndexError; skip the item instead.
            self.log('No data-post-key found in %s; skipping' % request_url)
            return

        title = response.xpath('//li[@class="track_name"]/text()').extract()
        artist = response.xpath('//li[@class="artist_name"]/text()').extract()

        item = FileItem()
        item['url'] = audio_url[0] + '?play_key=' + post_key[0]
        item['post_url'] = request_url
        item['title'] = ' - '.join(title + artist)
        yield item
|
|
|