# tumblr-music-scraper/fmfridays/pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.exceptions import DropItem
from fmfridays.settings import TUMBLR
import os
import re

class MP3DownloadPipeline(MediaPipeline):
  """Item pipeline that downloads each item's MP3 and saves it to disk.

  The target file name is derived from the item's 'title'. An item whose
  target file already exists is not requested again.
  """

  # Root directory for downloads; override in a subclass to relocate them.
  DOWNLOAD_ROOT = '/Users/jpenner/Music/downloads/'

  # Matches every character that is replaced when building a file name.
  # Compiled once here instead of on every path_from_item() call.
  _UNSAFE_FILENAME_CHARS = re.compile(r'[^-_!\(\),\'& a-zA-Z0-9]')

  def get_media_requests(self, item, info):
    """Yield a request for item['url'] unless the target file already exists."""
    if not os.path.exists(self.path_from_item(item)):
      yield scrapy.Request(item['url'])

  def path_from_item(self, item):
    """Return the on-disk path for this item: DOWNLOAD_ROOT/TUMBLR/<title>.mp3.

    Every character of item['title'] matched by _UNSAFE_FILENAME_CHARS is
    replaced by '_'.
    """
    safe_title = self._UNSAFE_FILENAME_CHARS.sub('_', item['title'])
    return os.path.join(self.DOWNLOAD_ROOT, TUMBLR, safe_title + '.mp3')

  def item_completed(self, results, item, info):
    """Save each successful download to disk and record it in item['path'].

    results is iterated as (ok, value) pairs; only pairs with a true ok are
    written, using value.body as the file content. The item is always
    returned, so when nothing was downloaded 'path' is not set here.

    Raises:
      OSError: if the target directory cannot be created.
    """
    for ok, response in results:
      if not ok:
        continue
      path = self.path_from_item(item)
      directory = os.path.dirname(path)
      try:
        os.makedirs(directory)
      except OSError:
        # An already-existing directory is fine (it may also have been
        # created concurrently); any other failure is a real error.
        if not os.path.isdir(directory):
          raise
      # Write to a sibling file and rename it into place afterwards, so an
      # interrupted write never leaves a truncated file at the final path,
      # which get_media_requests() would treat as already downloaded. A
      # stale '.part' file is simply overwritten on the next attempt.
      partial_path = path + '.part'
      with open(partial_path, 'wb') as f:
        f.write(response.body)
      os.rename(partial_path, path)
      item['path'] = path
    return item