"""Scrapy spider that crawls a Tumblr blog and collects its audio files."""

try:
    import urlparse  # Python 2 module name
except ImportError:  # Python 3 moved it into urllib.parse
    from urllib import parse as urlparse

import scrapy

from fmfridays.items import FileItem
from fmfridays.settings import TUMBLR

# XPath selectors keyed by blog name (the TUMBLR setting).
#   'post': matches the container element of each post on a listing page.
#   'next': yields the href of the link leading to the next page to crawl
#           (for 'bestofmidi' that is the anchor whose id is "prev").
PATHS = {
    'fmfridays': {
        'post': '//section[@class="post"]',
        'next': '//a[@class="next"]/@href',
    },
    'bestofmidi': {
        'post': '//div[@class="audio"]',
        'next': '//a[@id="prev"]/@href',
    },
}


class AudioSpider(scrapy.Spider):
    """Walk the listing pages of the configured blog and emit one
    ``FileItem`` per embedded audio player.

    The blog is selected by the ``TUMBLR`` setting, which must be a key of
    ``PATHS``.
    """

    name = 'audio'
    allowed_domains = [TUMBLR + ".tumblr.com"]
    start_urls = ['http://' + TUMBLR + '.tumblr.com/']

    def parse(self, response):
        """Handle a listing page.

        Yields a request (handled by ``parse_audioplayer``) for every audio
        player iframe found inside a post, followed by a request (handled by
        this method again) for every next-page link.
        """
        selectors = PATHS[TUMBLR]

        # Find audio players.
        for post in response.xpath(selectors['post']):
            for src in post.xpath('.//iframe/@src').extract():
                if '/audio_player_iframe/' not in src:
                    continue
                # Resolve against the page URL so protocol-relative or
                # relative iframe sources still produce a valid request;
                # absolute URLs pass through unchanged.
                yield scrapy.http.Request(
                    urlparse.urljoin(response.url, src),
                    callback=self.parse_audioplayer,
                )

        # Get next page.
        for href in response.xpath(selectors['next']).extract():
            # urljoin copes with absolute hrefs as well as root-relative
            # ones, which plain string concatenation onto the host did not.
            yield scrapy.http.Request(
                urlparse.urljoin(response.url, href),
                callback=self.parse,
            )

    def parse_audioplayer(self, response):
        """Handle an audio player iframe page.

        Yields a single ``FileItem`` with ``url`` (the ``audio_file`` query
        parameter of the request URL plus a ``play_key`` taken from the
        page), ``post_url`` (the request URL) and ``title`` (track name and
        artist name joined with ``' - '``).

        Yields nothing when the request URL carries no ``audio_file``
        parameter or the page exposes no ``data-post-key`` attribute.
        """
        player_url = response.request.url
        query = urlparse.parse_qs(urlparse.urlparse(player_url).query)
        audio_files = query.get('audio_file')
        if not audio_files:
            return

        post_keys = response.xpath(
            '//div[contains(@class, "audio_player_container")]/@data-post-key'
        ).extract()
        if not post_keys:
            # Previously an IndexError; skip the item and leave a trace
            # in the crawl log instead.
            self.log('No data-post-key found on %s; skipping' % player_url)
            return

        title = response.xpath('//li[@class="track_name"]/text()').extract()
        artist = response.xpath('//li[@class="artist_name"]/text()').extract()

        item = FileItem()
        item['url'] = audio_files[0] + '?play_key=' + post_keys[0]
        item['post_url'] = player_url
        # Both lookups return lists (possibly empty), so joining their
        # concatenation tolerates a missing track or artist name.
        item['title'] = ' - '.join(title + artist)
        yield item