tumblr-music-scraper/fmfridays/spiders/audio_spider.py

47 lines
1.6 KiB
Python
Raw Normal View History

2015-05-06 14:16:39 +00:00
import scrapy
from fmfridays.items import FileItem
from fmfridays.settings import TUMBLR
import urlparse
# XPath selectors for each supported blog, looked up by the blog name.
#   'post': matches the element wrapping a single post
#   'next': matches the href of the link to the following page of posts
PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href',
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        # NOTE: on this blog the link followed as the "next" page is the
        # anchor whose id is "prev".
        'next': '//a[@id="prev"]/@href',
    },
}
class AudioSpider(scrapy.Spider):
    """Spider that walks the configured Tumblr blog and yields audio FileItems.

    The blog is selected by the ``TUMBLR`` setting, which is used both to
    build the start URL and to pick the matching XPath selectors from
    ``PATHS``.  Every audio player iframe found inside a post is fetched and
    turned into one ``FileItem`` by :meth:`parse_audioplayer`.
    """

    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        """Parse one listing page of the blog.

        Yields a request (handled by :meth:`parse_audioplayer`) for every
        iframe inside a post whose ``src`` contains
        ``/audio_player_iframe/``, followed by a request (handled by this
        method again) for every "next page" link on the page.
        """
        selectors = PATHS[TUMBLR]

        # find audio players
        for post in response.xpath(selectors['post']):
            for src in post.xpath('.//iframe/@src').extract():
                if '/audio_player_iframe/' in src:
                    # urljoin leaves absolute URLs untouched and resolves
                    # relative / scheme-less ones, which Request would reject.
                    player_url = urlparse.urljoin(response.url, src)
                    yield scrapy.http.Request(
                        player_url, callback=self.parse_audioplayer)

        # get next page
        for href in response.xpath(selectors['next']).extract():
            # Resolve against the current page instead of prepending a
            # hard-coded host, so absolute hrefs are handled correctly too.
            next_url = urlparse.urljoin(response.url, href)
            yield scrapy.http.Request(next_url, callback=self.parse)

    def parse_audioplayer(self, response):
        """Build a FileItem from an audio player iframe page.

        The audio file location is read from the ``audio_file`` query
        parameter of the request URL, and the value of the player
        container's ``data-post-key`` attribute is appended to it as the
        ``play_key`` query parameter.

        Yields nothing when the request URL carries no ``audio_file``
        parameter or when the page has no ``data-post-key`` attribute;
        otherwise yields a single ``FileItem`` with ``url``, ``post_url``
        and ``title`` set.
        """
        request_url = response.request.url
        query = urlparse.parse_qs(urlparse.urlparse(request_url).query)
        audio_files = query.get('audio_file')
        if not audio_files:
            return

        post_keys = response.xpath(
            '//div[contains(@class, "audio_player_container")]/@data-post-key'
        ).extract()
        if not post_keys:
            # Previously an unguarded [0] raised IndexError here; skip the
            # page explicitly instead.
            self.log('No data-post-key found on %s, skipping' % request_url)
            return

        audio_url = audio_files[0]
        # Do not start a second query string if the audio URL already has one.
        separator = '&' if '?' in audio_url else '?'

        track_names = response.xpath(
            '//li[@class="track_name"]/text()').extract()
        artist_names = response.xpath(
            '//li[@class="artist_name"]/text()').extract()

        item = FileItem()
        item['url'] = audio_url + separator + 'play_key=' + post_keys[0]
        item['post_url'] = request_url
        # Track name(s) first, then artist name(s); either list may be empty.
        item['title'] = ' - '.join(track_names + artist_names)
        yield item