Data Collection

Define helper functions
Define search queries (search words)
Query the Spotify API

Here, we document our process to collect data from the Spotify API.

A brief overview of the steps:

Get authentication token for accessing the API
Sample Spotify playlists
- Specify 150 predefined search keywords
- Use random word generator to generate 50 random words
- Combine the 2 lists and use get_playlists_by_search_word() to get playlists that match the search words
- For each keyword, scrape 50 playlists on Spotify that match
Store playlist sample with associated attributes in a json file. Attributes include:
- Playlist ID
- Owner ID
- Whether playlist is collaborative
- Number of tracks
- Track IDs
- Number of followers
Get track info from the sample of playlists. Information includes:
- Track name
- Track ID
- Track ISRC ID
- Artist names
- Artist IDs
- Artist genres
- Artist popularities
- Artist number of followers
- Album name
- Album ID
- Album popularity
- Audio features (‘danceability’, ‘energy’, ‘key’, ‘loudness’, ‘mode’, ‘speechiness’, ‘acousticness’, ‘instrumentalness’, ‘liveness’, ‘valence’, ‘tempo’, ‘duration_ms’, ‘time_signature’)
- Whether the lyrics are explicit
- Number of available markets
- Popularity
Store all track information in a tracks.json file. Note: A track is often associated with multiple artists.

Define helper functions

These libraries and functions are used to scrape the Spotify API.

def get_auth_spotipy() -> Spotify:
    """
    Function that returns authorized Spotify client

    The client uses the client-credentials flow. Credentials already present
    in the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment variables are
    respected; the values embedded below are only used as a fallback when the
    variables are not set.
    """
    # SECURITY NOTE(review): these fallback credentials are committed in plain
    # text. They should be rotated and supplied through the environment
    # instead; they are kept here only so existing runs keep working.
    os.environ.setdefault('SPOTIPY_CLIENT_ID', 'c4cd8ee33b624ca6b224debdef35ba58')
    os.environ.setdefault('SPOTIPY_CLIENT_SECRET', '3d5127677713483d99ded163e45198c6')

    # Constructed without arguments: presumably picks the credentials up from
    # the environment variables set above.
    client_credentials_manager = SpotifyClientCredentials()
    client = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    return client


def dump_data(data_to_json, file):
    """
    Serialize `data_to_json` as JSON into the file at path `file`.

    The file is opened in write mode, so existing content is replaced.
    example: file = '../data/playlists_5088.json'
    """
    with open(file, mode='w') as out_handle:
        json.dump(obj=data_to_json, fp=out_handle)


def load_data(file):
    """
    Read the JSON document stored at path `file` and return the decoded object.
    """
    with open(file, mode='r') as in_handle:
        return json.load(in_handle)


def generate_search_word(predefined, number):
    """
    Function to generate a list of search words, including a set of predefined words and randomly generated words

    predefined: iterable of hand-picked search words
    number: how many random words to request from the random word generator

    Returns a de-duplicated list with the predefined words first (in their
    given order) followed by the random words. The result can be shorter than
    len(predefined) + number when words repeat.
    """
    rw = RandomWords()
    random_words = rw.random_words(letter=None, count=number)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result does not depend on per-process string hash randomisation the way
    # list(set(...)) does.
    return list(dict.fromkeys(list(predefined) + list(random_words)))


def get_playlists_by_search_word(search_words):
    """
    Function to get playlists from the Spotify API using a list of search words

    For every search word, up to 50 matching playlists are requested through
    the module-level client `sp`, and each hit is then fetched individually to
    read its details.

    Returns a list of dicts, one per successfully fetched playlist, with the
    keys 'track_ids', 'num_followers', 'collaborative', 'id', 'owner_id' and
    'num_tracks'.

    Scraping is best-effort: a failed search skips that word and a failed
    playlist skips only that playlist. Failures are printed instead of being
    silently discarded, and KeyboardInterrupt / SystemExit are no longer
    swallowed.
    """
    all_playlists = []

    for word in search_words:
        print(word)
        try:
            search_result = sp.search(word, type='playlist', limit=50)
            hits = search_result['playlists']['items']
        except Exception as err:
            print('search failed for {!r}: {}'.format(word, err))
            continue

        for playlist in hits:
            try:
                # The owner/id lookups live inside the try so that one
                # malformed hit does not abort the remaining hits of this word.
                current_playlist = sp.user_playlist(playlist['owner']['id'],
                                                    playlist_id=playlist['id'])

                # NOTE(review): only the track items embedded in the playlist
                # object are read here, while 'num_tracks' comes from the
                # reported total - the two may differ for long playlists.
                track_ids = set()
                for entry in current_playlist['tracks']['items']:
                    if entry['track']:
                        track_ids.add(entry['track']['id'])

                playlist_info = {
                    'track_ids': list(track_ids),  # unique track ids of the playlist
                    'num_followers': current_playlist['followers']['total'],
                    'collaborative': current_playlist['collaborative'],
                    'id': current_playlist['id'],
                    'owner_id': current_playlist['owner']['id'],
                    'num_tracks': current_playlist['tracks']['total'],
                }
            except Exception as err:
                print('skipping a playlist for {!r}: {}'.format(word, err))
                continue

            all_playlists.append(playlist_info)

    return all_playlists


def get_tracks_in_playlists(playlists):
    """
    Function to extract track features from a list of playlists

    playlists: list of dicts, each with a 'track_ids' list

    Every distinct, non-empty track id is looked up once through the
    module-level client `sp` (track, audio features, album and each artist).

    Returns a dict mapping track id -> dict of track, audio, album and artist
    attributes. Tracks whose lookup fails are reported on stderr and left out
    of the result; KeyboardInterrupt / SystemExit are not swallowed.
    """
    audio_feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                          'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
    all_tracks = {}
    seen_track_ids = set()
    len_playlists = len(playlists)
    for position, playlist in enumerate(playlists):
        sys.stdout.write('\r{0}% completed.'.format((float(position+1)/len_playlists)*100))
        sys.stdout.flush()

        # drop entries without an id (None / empty) before querying the API
        track_ids = [track_id for track_id in playlist['track_ids'] if track_id]

        for track_id in track_ids:
            try:
                if track_id in seen_track_ids:
                    continue
                # marked as seen before fetching, so a failing id is attempted
                # only once per call
                seen_track_ids.add(track_id)
                track_info = {}

                # Features related to the TRACK itself
                track = sp.track(track_id)
                track_info['id'] = track_id
                track_info['name'] = track['name']

                if 'explicit' in track:
                    track_info['explicit'] = track['explicit']
                # external_ids may be present without an ISRC entry; that must
                # not cause the whole track to be dropped
                external_ids = track['external_ids']
                if external_ids and 'isrc' in external_ids:
                    track_info['isrc'] = external_ids['isrc']

                track_info['num_available_markets'] = len(track['available_markets'])

                # Get audio_features of a track (the API answers with a list;
                # its single element can be empty when no analysis exists)
                track_audio_feature = sp.audio_features(track_id)[0]
                if track_audio_feature:
                    for key in audio_feature_keys:
                        track_info[key] = track_audio_feature[key]
                track_info['popularity'] = track['popularity']

                # Features related to the ALBUM
                album = sp.album(track['album']['id'])
                track_info['album_name'] = album['name']
                track_info['album_id'] = album['id']
                track_info['album_genres'] = album['genres']
                track_info['album_popularity'] = album['popularity']

                # Features related to the ARTIST(s); a track can have several
                track_info['artists_names'] = []
                track_info['artists_ids'] = []
                track_info['artists_popularities'] = []
                track_info['artists_num_followers'] = []
                track_info['artists_genres'] = []

                for artist in track['artists']:
                    current_artist = sp.artist(artist['id'])
                    track_info['artists_names'].append(current_artist['name'])
                    track_info['artists_ids'].append(current_artist['id'])
                    track_info['artists_popularities'].append(current_artist['popularity'])
                    track_info['artists_num_followers'].append(current_artist['followers']['total'])
                    track_info['artists_genres'].extend(current_artist['genres'])

                track_info['avg_artist_popularity'] = np.mean(track_info['artists_popularities'])
                track_info['std_artist_popularity'] = np.std(track_info['artists_popularities'])
                track_info['avg_artist_num_followers'] = np.mean(track_info['artists_num_followers'])
                track_info['std_artist_num_followers'] = np.std(track_info['artists_num_followers'])

                if track_info['artists_genres']:
                    # most frequent genre across all artists of the track
                    counter = collections.Counter(track_info['artists_genres'])
                    track_info['mode_artist_genre'] = counter.most_common(1)[0][0]

            except Exception as err:
                # best-effort scrape: report and move on to the next track
                sys.stderr.write('\nskipping track {}: {}\n'.format(track_id, err))
                continue

            all_tracks[track_id] = track_info
    return all_tracks


def missing_tracks(tracks_db, playlists):
    """
    Report which playlist tracks have no entry in the tracks database.

    tracks_db: mapping keyed by track id
    playlists: iterable of dicts, each with a 'track_ids' list

    Prints the number of missing occurrences (a track id that is missing from
    several playlists is counted once per occurrence) and returns the set of
    distinct missing track ids.
    """
    absent = [
        track_id
        for playlist in playlists
        for track_id in playlist['track_ids']
        if track_id not in tracks_db
    ]
    print('tracks that are missing : {}'.format(len(absent)))
    return set(absent)


def get_tracks_by_track_ids(track_ids):
    """
    Function to extract track features from a list of track ids

    track_ids: sized iterable of track ids; empty / None entries are skipped
    without querying the API

    Each remaining id is looked up through the module-level client `sp`
    (track, audio features, album and each artist), pausing one second before
    every lookup.

    Returns a dict mapping track id -> dict of track, audio, album and artist
    attributes. Ids whose lookup fails are reported on stderr and left out of
    the result; KeyboardInterrupt / SystemExit are not swallowed.
    """
    audio_feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                          'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
    all_tracks = {}
    num_track_ids = len(track_ids)
    for position, track_id in enumerate(track_ids):
        sys.stdout.write('\r{0}% completed.'.format((float(position+1)/num_track_ids)*100))
        sys.stdout.flush()

        # Entries without an id cannot be looked up; skip them up front
        # instead of sleeping and issuing a request that can only fail.
        if not track_id:
            continue

        try:
            time.sleep(1)  # throttle: one lookup round per second

            track_info = {}

            # Features related to the TRACK itself
            track = sp.track(track_id)
            track_info['id'] = track_id
            track_info['name'] = track['name']

            if 'explicit' in track:
                track_info['explicit'] = track['explicit']
            # external_ids may be present without an ISRC entry; that must not
            # cause the whole track to be dropped
            external_ids = track['external_ids']
            if external_ids and 'isrc' in external_ids:
                track_info['isrc'] = external_ids['isrc']

            track_info['num_available_markets'] = len(track['available_markets'])

            # Get audio_features of a track (the API answers with a list; its
            # single element can be empty when no analysis exists)
            track_audio_feature = sp.audio_features(track_id)[0]
            if track_audio_feature:
                for key in audio_feature_keys:
                    track_info[key] = track_audio_feature[key]
            track_info['popularity'] = track['popularity']

            # Features related to the ALBUM
            album = sp.album(track['album']['id'])
            track_info['album_name'] = album['name']
            track_info['album_id'] = album['id']
            track_info['album_genres'] = album['genres']
            track_info['album_popularity'] = album['popularity']

            # Features related to the ARTIST(s); a track can have several
            track_info['artists_names'] = []
            track_info['artists_ids'] = []
            track_info['artists_popularities'] = []
            track_info['artists_num_followers'] = []
            track_info['artists_genres'] = []

            for artist in track['artists']:
                current_artist = sp.artist(artist['id'])
                track_info['artists_names'].append(current_artist['name'])
                track_info['artists_ids'].append(current_artist['id'])
                track_info['artists_popularities'].append(current_artist['popularity'])
                track_info['artists_num_followers'].append(current_artist['followers']['total'])
                track_info['artists_genres'].extend(current_artist['genres'])

            track_info['avg_artist_popularity'] = np.mean(track_info['artists_popularities'])
            track_info['std_artist_popularity'] = np.std(track_info['artists_popularities'])
            track_info['avg_artist_num_followers'] = np.mean(track_info['artists_num_followers'])
            track_info['std_artist_num_followers'] = np.std(track_info['artists_num_followers'])

            if track_info['artists_genres']:
                # most frequent genre across all artists of the track
                counter = collections.Counter(track_info['artists_genres'])
                track_info['mode_artist_genre'] = counter.most_common(1)[0][0]
        except Exception as err:
            # best-effort scrape: report and move on to the next id
            sys.stderr.write('\nskipping track {}: {}\n'.format(track_id, err))
            continue

        all_tracks[track_id] = track_info

    return all_tracks

# Module-level Spotify client; the scraping helpers defined above read this
# global `sp` instead of taking a client argument.
sp = get_auth_spotipy()

Define search queries (search words)

To broadly and randomly sample playlists from the Spotify API, we combine 150 predefined words and 50 randomly generated words as our search words to query the API.

Our 150 predefined search words are mostly inspired by Spotify's trending playlist words of 2015, DigitalTrends.com's best Spotify playlists, and gizmodo.com's 25 most popular Spotify playlists.

# Hand-picked search keywords used to sample playlists.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python ('blue' 'dirty' would become 'bluedirty').
predefined = ['your', 'my', 'are', 'the', 'is', 'a', 'can', 'love', 'hate', 'holiday', 'work', 'workday',
              'weekend', 'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'beautiful',
              'fall', 'summer', 'spring', 'winter', 'classics',  'throwback', 'car', 'morning', 'shower', 'current', 
              'jesus', 'party', 'gym','late', 'night', 'old', 'chill', 'country', 'new', 'feel', 'good', 'workout', 
              'slow','hood','tropical', 'EDM', 'wedding', 'sex', 'honeymoon', 'senior', 'cool', 'house', 'jam',
              'today', 'top', 'hits', 'dance', 'mix', 'teen', 'cardio', 'hangouts', 'hot', 'ultimate',
              'hip hop', 'mega', 'upbeat', 'acoustic', 'deep', 'girls', 'baby', 'indie', 'punk', 'rock', 'family',
              'funk', 'jazz', 'instrumental', 'rap', 'beats', 'future', 'happy', 'emotional', 'great', 'magic',
              'finds', 'escape', 'fresh', 'high', 'low', 'buzz', 'kill', 'mood', 'blue', 'dirty', 'soul', 'pop',
              'beach', 'dream', 'shuffle', 'date', 'romantic', 'prom', 'college', 'kids', 'sleep', 'serenade',
              'calm', 'light', 'heavy', 'soft', 'strong', 'drama', 'confession', 'blink', 'sad', 'heart',
              'trend', 'trending', 'max', 'folk', 'blues', 'contemporary', 'electric', 'R&B', 'alternative', 
              'easy', 'metal', 'reggae', 'southern', 'cozy', 'darling', 'like', 'you', 'I', 'club', 'mind', 
              'waltz', 'glow', 'crazy', 'women', 'men', 'vibes', 'wave', 'trip', 'crave', 'him', 'break', 
              'true', 'different', 'her']

# Combine the predefined keywords with 50 randomly generated words.
search_words = generate_search_word(predefined=predefined, number=50)

# Persist the final word list so the playlist scrape below reads a fixed set.
dump_data(search_words, '../data/200_search_words.json')

Query the Spotify API

With these search words, we scraped the Spotify API for playlists that match these search queries. For each of the search words, we scraped up to 50 matching playlists. In total, the 200 search words yielded 9511 unique playlists. We saved the matched playlists in a json file.

# Reload the persisted search words.
keywords = load_data('../data/200_search_words.json')

# Query the API once per keyword and collect the details of every matching playlist.
all_playlists = get_playlists_by_search_word(keywords)

# Save the playlist sample for the track scrape that follows.
dump_data(all_playlists, '../data/playlists_from_200_search_words.json')

Next, we scrape the songs that are associated with these playlists. We only get the unique songs (i.e. if 2 playlists have the same song, we only scrape the song once). Note: because it takes a very long time to scrape, we divided the list of playlists into 5 sections and scraped them in parallel. These json files containing track information are merged into a single track database at the conclusion of our data collection process.

all_playlists = load_data('../data/playlists_from_200_search_words.json')

# Scrape the playlists chunk by chunk and write each chunk to disk as soon as
# it is finished, so that a failure in a later chunk cannot discard the
# results of the chunks that already completed.
tracks_2000 = get_tracks_in_playlists(all_playlists[:2000])
dump_data(tracks_2000, '../data/tracks_2000.json')

tracks_4000 = get_tracks_in_playlists(all_playlists[2000:4000])
dump_data(tracks_4000, '../data/tracks_2000_4000.json')

tracks_6000 = get_tracks_in_playlists(all_playlists[4000:6000])
dump_data(tracks_6000, '../data/tracks_4000_6000.json')

tracks_8000 = get_tracks_in_playlists(all_playlists[6000:8000])
dump_data(tracks_8000, '../data/tracks_6000_8000.json')

tracks_9000 = get_tracks_in_playlists(all_playlists[8000:9000])
dump_data(tracks_9000, '../data/tracks_8000_9000.json')

tracks_last = get_tracks_in_playlists(all_playlists[9000:])
dump_data(tracks_last, '../data/tracks_last.json')

After the first scrape, we checked whether any tracks were missing from our database. For the tracks that were missing, we re-scraped the Spotify API based on their track IDs and stored all track data in our tracks.json database.

# Merge the per-chunk track files into one database, re-scrape whatever is
# missing from each chunk, and save the combined result.
tracks = {}
tracks_2000 = load_data('../data_archive/tracks_2000.json')
tracks_4000 = load_data('../data_archive/tracks_2000_4000.json')
tracks_6000 = load_data('../data_archive/tracks_4000_6000.json')
tracks_8000 = load_data('../data_archive/tracks_6000_8000.json')
tracks_9000 = load_data('../data_archive/tracks_8000_9000.json')
# NOTE(review): the scrape step wrote this chunk as 'tracks_last.json' under
# '../data/'; presumably the file was renamed when it was moved to the
# archive folder - confirm the file name.
tracks_last = load_data('../data_archive/tracks_9000.json')

tracks.update(tracks_2000)
tracks.update(tracks_4000)
tracks.update(tracks_6000)
tracks.update(tracks_8000)
tracks.update(tracks_9000)
tracks.update(tracks_last)

# Each chunk is checked against exactly the playlist slice it was scraped
# from (the first chunk covers playlists [:2000], not [1000:2000]).
missing_2000 = missing_tracks(tracks_2000, all_playlists[:2000])
missing_4000 = missing_tracks(tracks_4000, all_playlists[2000:4000])
missing_6000 = missing_tracks(tracks_6000, all_playlists[4000:6000])
missing_8000 = missing_tracks(tracks_8000, all_playlists[6000:8000])
missing_9000 = missing_tracks(tracks_9000, all_playlists[8000:9000])
missing_last = missing_tracks(tracks_last, all_playlists[9000:])

# Re-scrape the missing tracks directly by id.
missing_tracks_2000 = get_tracks_by_track_ids(missing_2000)
missing_tracks_4000 = get_tracks_by_track_ids(missing_4000)
missing_tracks_6000 = get_tracks_by_track_ids(missing_6000)
missing_tracks_8000 = get_tracks_by_track_ids(missing_8000)
missing_tracks_9000 = get_tracks_by_track_ids(missing_9000)
missing_tracks_last = get_tracks_by_track_ids(missing_last)

tracks.update(missing_tracks_2000)
tracks.update(missing_tracks_4000)
tracks.update(missing_tracks_6000)
tracks.update(missing_tracks_8000)
tracks.update(missing_tracks_9000)
tracks.update(missing_tracks_last)

dump_data(tracks, '../data_archive/tracks.json')

Check one last time to confirm that there are no missing tracks from the playlists

# Reload the archived playlist sample and the merged track database.
all_playlists = load_data('../data_archive/playlists_from_200_search_words.json')
tracks_db = load_data('../data_archive/tracks.json')
# Prints the number of missing occurrences; the returned set of missing ids is
# not assigned (presumably shown as the notebook cell's output).
missing_tracks(tracks_db, all_playlists)

tracks that are missing : 505

{None}

Note: While there are still 505 tracks missing, these are entries without track IDs. We will discard these entries in the subsequent data preprocessing step.

Spotify

Predicting the success of your playlist and providing personalized recommendations

Contents

Define helper functions

Define search queries (search words)

Query the Spotify API