Source code for zounds.datasets.internetarchive

import requests
import urllib.parse
from zounds.soundfile.audio_metadata import AudioMetaData


class InternetArchive(object):
    """
    Produces an iterable of :class:`zounds.soundfile.AudioMetaData` instances
    for every file of a particular format from an internet archive id.

    Args:
        archive_id (str): the Internet Archive identifier
        format_filter (str or callable): Selects which files are returned.
            A string is compared for equality against each file's ``format``
            entry.  A callable receives the file's metadata dict and returns
            a truthy value to keep the file.  When omitted, only
            ``'Ogg Vorbis'`` files are returned.
        attrs (dict): Extra attributes to add to the :class:`AudioMetaData`

    Raises:
        ValueError: when archive_id is not provided

    Examples:
        >>> from zounds import InternetArchive
        >>> ia = InternetArchive('Greatest_Speeches_of_the_20th_Century')
        >>> next(iter(ia))
        {'creator': u'John F. Kennedy', 'height': u'0', 'channels': None, 'genre': u'Folk', 'licensing': None, 'mtime': u'1236666800', 'samplerate': None, 'size': u'7264435', 'album': u'Great Speeches of the 20th Century [Box Set] Disc 2', 'title': u'The Cuban Missile Crisis', 'format': u'128Kbps MP3', 'source': u'original', 'description': None, 'tags': None, 'track': u'15', 'crc32': u'ace17eb5', 'md5': u'e00f4e7bd9df7bdba4db7098d1ccdfe0', 'sha1': u'e42d1f348078a11ed9a6ea9c8934a1236235c7b3', 'artist': u'John F. Kennedy', 'external-identifier': [u'urn:acoustid:ff850a0c-2efa-450f-8034-efdb31a9b696', u'urn:mb_recording_id:912cedd0-5530-4f26-972c-13d131fef06e'], 'uri': <Request [GET]>, 'length': u'454.03', 'width': u'0'}

    See Also:
        :class:`FreeSoundSearch`
        :class:`PhatDrumLoops`
        :class:`zounds.soundfile.AudioMetaData`
    """

    def __init__(self, archive_id, format_filter=None, **attrs):
        super(InternetArchive, self).__init__()

        if not archive_id:
            raise ValueError('You must supply an Internet Archive id')

        if format_filter is None:
            format_filter = self._format_equals('Ogg Vorbis')
        elif isinstance(format_filter, str):
            # The documented usage is a plain format name; wrap it so the
            # attribute is always a predicate over a file's metadata dict.
            format_filter = self._format_equals(format_filter)

        self.archive_id = archive_id
        self.format_filter = format_filter
        self.attrs = attrs

    @staticmethod
    def _format_equals(expected):
        """Return a predicate matching files whose ``format`` is `expected`."""
        def matches(file_info):
            # Entries without a ``format`` key simply do not match.
            return file_info.get('format') == expected
        return matches

    def _get_metadata(self, data, all_files):
        """
        Return a fresh metadata dict describing the file entry `data`.

        A derivative file is described by the entry of the original it was
        derived from, looked up in `all_files` under ``'/' + original``.  Any
        other entry, or a derivative whose original is not listed, is
        described by its own data.

        Args:
            data (dict): the file entry being described
            all_files (dict): every file entry of the item, keyed by path

        Returns:
            dict: a shallow copy, so callers may add keys without altering
            the entries held in `all_files`
        """
        record = data
        if data.get('source') == 'derivative':
            original_name = data.get('original')
            if original_name is not None:
                record = all_files.get('/' + original_name, data)
        return dict(record)

    def __iter__(self):
        """
        Fetch the item's file listing and yield one :class:`AudioMetaData`
        per file accepted by ``format_filter``.

        The listing is requested on every iteration.  A response that is not
        valid JSON, or that has no ``files`` entry, yields nothing.
        """
        base_url = 'https://archive.org/'
        archive_id = self.archive_id
        url = urllib.parse.urljoin(
            base_url,
            '/details/{archive_id}&output=json'.format(archive_id=archive_id))
        resp = requests.get(url)

        try:
            all_files = resp.json()['files']
        except (ValueError, KeyError):
            # Best effort: an unusable listing behaves like an empty item.
            all_files = {}

        # Identical for every file of the item, so build it once.
        web_url = urllib.parse.urljoin(
            base_url, '/details/{archive_id}'.format(archive_id=archive_id))

        for path, file_info in all_files.items():
            if not self.format_filter(file_info):
                continue

            # Keys of the listing already start with '/', so the path is
            # appended directly after the archive id.
            sound_url = urllib.parse.urljoin(
                base_url,
                '/download/{archive_id}{path}'.format(
                    archive_id=archive_id, path=path))
            # Only describes the request; nothing is downloaded here.
            request = requests.Request(method='GET', url=sound_url)

            metadata = self._get_metadata(file_info, all_files)
            metadata.update(self.attrs)
            metadata.update(web_url=web_url)
            yield AudioMetaData(uri=request, **metadata)