Source code for zounds.datasets.internetarchive

import requests
import urllib.parse
from zounds.soundfile.audio_metadata import AudioMetaData


[docs]class InternetArchive(object): """ Produces an iterable of :class:`zounds.soundfile.AudioMetaData` instances for every file of a particular format from an internet archive id. Args: archive_id (str): the Internet Archive identifier format_filter (str): The file format to return attrs (dict): Extra attributes to add to the :class:`AudioMetaData` Raises: ValueError: when archive_id is not provided Examples: >>> from zounds import InternetArchive >>> ia = InternetArchive('Greatest_Speeches_of_the_20th_Century') >>> iter(ia).next() {'creator': u'John F. Kennedy', 'height': u'0', 'channels': None, 'genre': u'Folk', 'licensing': None, 'mtime': u'1236666800', 'samplerate': None, 'size': u'7264435', 'album': u'Great Speeches of the 20th Century [Box Set] Disc 2', 'title': u'The Cuban Missile Crisis', 'format': u'128Kbps MP3', 'source': u'original', 'description': None, 'tags': None, 'track': u'15', 'crc32': u'ace17eb5', 'md5': u'e00f4e7bd9df7bdba4db7098d1ccdfe0', 'sha1': u'e42d1f348078a11ed9a6ea9c8934a1236235c7b3', 'artist': u'John F. Kennedy', 'external-identifier': [u'urn:acoustid:ff850a0c-2efa-450f-8034-efdb31a9b696', u'urn:mb_recording_id:912cedd0-5530-4f26-972c-13d131fef06e'], 'uri': <Request [GET]>, 'length': u'454.03', 'width': u'0'} See Also: :class:`FreeSoundSearch` :class:`PhatDrumLoops` :class:`zounds.soundfile.AudioMetaData` """ def __init__(self, archive_id, format_filter=None, **attrs): super(InternetArchive, self).__init__() self.attrs = attrs if not archive_id: raise ValueError('You must supply an Internet Archive id') self.format_filter = format_filter or \ (lambda x: x['format'] == 'Ogg Vorbis') self.archive_id = archive_id def _get_metadata(self, data, all_files): if data['source'] == 'original': return data elif data['source'] == 'derivative': return all_files['/' + data['original']] def __iter__(self): base_url = 'https://archive.org/' archive_id = self.archive_id url = urllib.parse.urljoin( base_url, '/details/{archive_id}&output=json'.format(**locals())) resp = requests.get(url) try: all_files = resp.json()['files'] except ValueError as e: all_files = dict() for k, v in all_files.items(): if self.format_filter(v): sound_url = urllib.parse.urljoin( base_url, '/download/{archive_id}{k}'.format(**locals())) request = requests.Request(method='GET', url=sound_url) metadata = self._get_metadata(v, all_files) metadata.update(self.attrs) web_url = 'https://archive.org//details/{archive_id}'\ .format(**locals()) metadata.update(web_url=web_url) yield AudioMetaData(uri=request, **metadata)