I am new to programming and web-scraping, but I am trying to scrape audio features and collaborators for all R&B songs released onto Spotify from 2018-2023 using Spotipy. I understand that there are limits as for how many songs one can scrape, but I referenced an answer on how to solve this and tried to adapt the given Python code to my problem.
However, when I run the below script:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
import re
# Spotify API Credentials and Constants
USER_ID = 'USER_ID'
CLIENT_ID = 'CLIENT_ID'
CLIENT_SECRET = 'CLIENT_SECRET'
REDIRECT_URI = 'http://localhost:3000'
# Spotify Scopes
SCOPE = [
'user-library-read',
'user-follow-read',
'user-top-read',
'playlist-read-private',
'playlist-read-collaborative',
'playlist-modify-public',
'playlist-modify-private'
]
# Initialize Spotipy client
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
scope=SCOPE,
username=USER_ID,
redirect_uri=REDIRECT_URI,
client_id=CLIENT_ID,
client_secret=CLIENT_SECRET))
def get_categories():
query_limit = 50
categories = []
new_offset = 0
while True:
results = sp.category_playlists(category_id='0JQ5DAqbMKFEZPnFQSFB1T', limit=query_limit, country='US', offset=new_offset)
for item in results['playlists']['items']:
if item and item.get('name') and item.get('tracks', {}).get('href'):
tokens = re.split(r"[\/]", item['tracks']['href'])
categories.append({
'id': item['id'],
'name': item['name'],
'url': item['external_urls']['spotify'],
'tracks': item['tracks']['href'],
'playlist_id': tokens[5],
'type': item['type']
})
new_offset += query_limit
next_page = results['playlists']['next']
if not next_page:
break
return categories
def get_songs(categories):
songs = []
for category in categories:
playlist_id = category['playlist_id']
results = sp.playlist_tracks(playlist_id=playlist_id)
for item in results['items']:
track = item['track']
if track and 'id' in track and track['id']:
release_date = track['album']['release_date']
# Filter songs based on release date
if '2018' <= release_date.split('-')[0] <= '2023':
features = sp.audio_features(track['id'])[0]
main_artist = track['artists'][0]['name']
featured_artists = ', '.join([artist['name'] for artist in track['artists'][1:]]) if len(track['artists']) > 1 else 'N/A'
is_featured = len(track['artists']) > 1
songs.append({
'track_name': track['name'],
'album_name': track['album']['name'],
'main_artist': main_artist,
'features_artist': is_featured,
'featured_artist': featured_artists,
'release_date': release_date,
'popularity_score': track['popularity'],
'explicit': track['explicit'],
'duration': track['duration_ms'],
'danceability': features['danceability'],
'energy': features['energy'],
'key': features['key'],
'loudness': features['loudness'],
'mode': features['mode'],
'speechiness': features['speechiness'],
'acousticness': features['acousticness'],
'instrumentalness': features['instrumentalness'],
'liveness': features['liveness'],
'valence': features['valence'],
'tempo': features['tempo'],
})
return songs
# Fetch categories and songs
categories = get_categories()
songs = get_songs(categories)
# Create dataframe
df = pd.DataFrame(songs)
print(df.shape)
df.head()
I either get an error that I've done the Maximum Retries or the following error:
---------------------------------------------------------------------------
TimeoutError Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:467, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
463 except BaseException as e:
464 # Remove the TypeError from the exception chain in
465 # Python 3 (including for exceptions like SystemExit).
466 # Otherwise it looks like a bug in the code.
--> 467 six.raise_from(e, None)
468 except (SocketTimeout, BaseSSLError, SocketError) as e:
File <string>:3, in raise_from(value, from_value)
File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:462, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
461 try:
--> 462 httplib_response = conn.getresponse()
463 except BaseException as e:
464 # Remove the TypeError from the exception chain in
465 # Python 3 (including for exceptions like SystemExit).
466 # Otherwise it looks like a bug in the code.
File ~/anaconda3/lib/python3.11/http/client.py:1378, in HTTPConnection.getresponse(self)
1377 try:
-> 1378 response.begin()
1379 except ConnectionError:
File ~/anaconda3/lib/python3.11/http/client.py:318, in HTTPResponse.begin(self)
317 while True:
--> 318 version, status, reason = self._read_status()
319 if status != CONTINUE:
File ~/anaconda3/lib/python3.11/http/client.py:279, in HTTPResponse._read_status(self)
278 def _read_status(self):
--> 279 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
280 if len(line) > _MAXLINE:
File ~/anaconda3/lib/python3.11/socket.py:706, in SocketIO.readinto(self, b)
705 try:
--> 706 return self._sock.recv_into(b)
707 except timeout:
File ~/anaconda3/lib/python3.11/ssl.py:1311, in SSLSocket.recv_into(self, buffer, nbytes, flags)
1308 raise ValueError(
1309 "non-zero flags not allowed in calls to recv_into() on %s" %
1310 self.__class__)
-> 1311 return self.read(nbytes, buffer)
1312 else:
File ~/anaconda3/lib/python3.11/ssl.py:1167, in SSLSocket.read(self, len, buffer)
1166 if buffer is not None:
-> 1167 return self._sslobj.read(len, buffer)
1168 else:
TimeoutError: The read operation timed out
During handling of the above exception, another exception occurred:
ReadTimeoutError Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
485 try:
--> 486 resp = conn.urlopen(
487 method=request.method,
488 url=url,
489 body=request.body,
490 headers=request.headers,
491 redirect=False,
492 assert_same_host=False,
493 preload_content=False,
494 decode_content=False,
495 retries=self.max_retries,
496 timeout=timeout,
497 chunked=chunked,
498 )
500 except (ProtocolError, OSError) as err:
File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:799, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
797 e = ProtocolError("Connection aborted.", e)
--> 799 retries = retries.increment(
800 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
801 )
802 retries.sleep()
File ~/anaconda3/lib/python3.11/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
549 if read is False or not self._is_method_retryable(method):
--> 550 raise six.reraise(type(error), error, _stacktrace)
551 elif read is not None:
File ~/anaconda3/lib/python3.11/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb)
769 raise value.with_traceback(tb)
--> 770 raise value
771 finally:
File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:715, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
714 # Make the request on the httplib connection object.
--> 715 httplib_response = self._make_request(
716 conn,
717 method,
718 url,
719 timeout=timeout_obj,
720 body=body,
721 headers=headers,
722 chunked=chunked,
723 )
725 # If we're going to release the connection in ``finally:``, then
726 # the response doesn't need to know about the connection. Otherwise
727 # it will also try to release it and we'll have a double-release
728 # mess.
File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:469, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
468 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 469 self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
470 raise
File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:358, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
357 if isinstance(err, SocketTimeout):
--> 358 raise ReadTimeoutError(
359 self, url, "Read timed out. (read timeout=%s)" % timeout_value
360 )
362 # See the above comment about EAGAIN in Python 3. In Python 2 we have
363 # to specifically catch it and throw the timeout error
ReadTimeoutError: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)
During handling of the above exception, another exception occurred:
ReadTimeout Traceback (most recent call last)
Cell In[16], line 95
93 # Fetch categories and songs
94 categories = get_categories()
---> 95 songs = get_songs(categories)
97 # Create dataframe
98 df = pd.DataFrame(songs)
Cell In[16], line 65, in get_songs(categories)
63 # Filter songs based on release date
64 if '2018' <= release_date.split('-')[0] <= '2023':
---> 65 features = sp.audio_features(track['id'])[0]
66 main_artist = track['artists'][0]['name']
67 featured_artists = ', '.join([artist['name'] for artist in track['artists'][1:]]) if len(track['artists']) > 1 else 'N/A'
File ~/anaconda3/lib/python3.11/site-packages/spotipy/client.py:1734, in Spotify.audio_features(self, tracks)
1732 if isinstance(tracks, str):
1733 trackid = self._get_id("track", tracks)
-> 1734 results = self._get("audio-features/?ids=" + trackid)
1735 else:
1736 tlist = [self._get_id("track", t) for t in tracks]
File ~/anaconda3/lib/python3.11/site-packages/spotipy/client.py:323, in Spotify._get(self, url, args, payload, **kwargs)
320 if args:
321 kwargs.update(args)
--> 323 return self._internal_call("GET", url, payload, kwargs)
File ~/anaconda3/lib/python3.11/site-packages/spotipy/client.py:266, in Spotify._internal_call(self, method, url, payload, params)
262 logger.debug('Sending %s to %s with Params: %s Headers: %s and Body: %r ',
263 method, url, args.get("params"), headers, args.get('data'))
265 try:
--> 266 response = self._session.request(
267 method, url, headers=headers, proxies=self.proxies,
268 timeout=self.requests_timeout, **args
269 )
271 response.raise_for_status()
272 results = response.json()
File ~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
584 send_kwargs = {
585 "timeout": timeout,
586 "allow_redirects": allow_redirects,
587 }
588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
591 return resp
File ~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
700 start = preferred_clock()
702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
705 # Total elapsed time of the request (approximately)
706 elapsed = preferred_clock() - start
File ~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
530 raise SSLError(e, request=request)
531 elif isinstance(e, ReadTimeoutError):
--> 532 raise ReadTimeout(e, request=request)
533 elif isinstance(e, _InvalidHeader):
534 raise InvalidHeader(e, request=request)
ReadTimeout: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)
Does anyone know how I can modify the below script to achieve my goal or a new approach?
Your audio_features() API call too many call error response from Spotify. This indicates that your app has reached our Web API rate limit
So I update your code by removing audio_features() call.
This code get the all of 3360 R&B songs between 2018 and 2023
Result