Project 3: Regression¶

Philip Vishnevsky
2025-03-30

1. Introduction¶

1.1 Objectives¶

For Project 3, we will build a regression model to predict song popularity from musical parameters such as tempo, energy, loudness, and time signature. We will use the Spotify 1 Million Tracks Dataset from Kaggle, which contains over 1,000,000 Spotify tracks described by 19 features. The dataset appears to be mostly clean and ready for modeling; however, some preprocessing may still be necessary.

1.2 Questions to Answer¶

From this Project, we aim to answer the following questions:

  1. Can we accurately predict the popularity of a song using solely its musical parameters?
  2. Can we accurately predict the popularity of a song prior to its public release?
  3. Which features are most important for predicting song popularity, regardless of artist?

1.3 Data Introduction¶

The chosen dataset describes over 1 million songs from Spotify, released between 2000 and 2023. It was chosen for its large sample size and wide range of musical parameters, both of which support our overall goal.

The dataset initially contained the following features:

  • Popularity: Track popularity (0 to 100)
  • Year: Year released (2000 to 2023)
  • Danceability: Track suitability for dancing (0.0 to 1.0)
  • Energy: The perceptual measure of intensity and activity (0.0 to 1.0)
  • Key: The key the track is in (-1 to 11)
  • Loudness: Overall loudness of track in decibels (-60 to 0 dB)
  • Mode: Modality of the track (Major ‘1’/ Minor ‘0’)
  • Speechiness: Presence of spoken words in the track
  • Acousticness: Confidence measure from 0 to 1 of whether the track is acoustic
  • Instrumentalness: Likelihood that the track contains no vocals (0.0 to 1.0)
  • Liveness: Presence of audience in the recording (0.0 – 1.0)
  • Valence: Musical positiveness (0.0 to 1.0)
  • Tempo: Tempo of the track in beats per minute (BPM)
  • Time_signature: Estimated time signature (3 to 7)
  • Duration_ms: Duration of track in milliseconds

In the preprocessing steps below, we will refine these features and remove any that are irrelevant.

1.4 What is Linear Regression?¶

Linear regression is a statistical technique used to model the relationship between a dependent variable Y and one or more independent variables X.

The basic formula for simple linear regression (one variable X) is: $$ Y = \beta_0 + \beta_1 X + \epsilon $$

For multiple linear regression, the general formula is: $$ Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \ldots + \beta_n X_n + \epsilon $$

Where:

  • $Y$: the dependent variable we are predicting
  • $\beta_0$: the intercept term (the predicted value of $Y$ when all features are 0)
  • $X_1, X_2, \ldots, X_n$: the features (independent variables)
  • $\beta_1, \beta_2, \ldots, \beta_n$: the coefficients (weights for each feature)
  • $\epsilon$: the error term, accounting for variability not explained by the model

The normal equation for computing the beta coefficients is: $$ \beta = (X^TX)^{-1}X^TY $$
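
To make this concrete, the following is a minimal NumPy sketch of the normal equation on a small synthetic example (illustrative only; in the experiments below we rely on scikit-learn's LinearRegression, which handles the fitting internally):

import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 3))                      # 100 samples, 3 features
true_beta = np.array([1.5, -0.7, 0.3])
y_demo = 2.0 + X_demo @ true_beta + rng.normal(scale=0.1, size=100)

# Prepend a column of ones for the intercept, then solve (X^T X) beta = X^T y
X_b = np.column_stack([np.ones(len(X_demo)), X_demo])
beta_hat = np.linalg.solve(X_b.T @ X_b, X_b.T @ y_demo)
print(beta_hat.round(2))                                # approximately [2.0, 1.5, -0.7, 0.3]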


2. Experiment One: Linear Regression¶

For the first experiment, we will implement, fit, and evaluate a multiple linear regression model on the dataset.

2.0 Library Imports¶

In [15]:
#pip install --upgrade xgboost cupy-cuda12x shap
In [16]:
# Import data libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cupy as cp
import shap

# Import Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Import Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor, plot_importance

# Import Metrics
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_squared_error

# Suppress Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#warnings.filterwarnings('ignore', category=RuntimeWarning)

2.1 Data Preview & Understanding¶

In [18]:
# Load the dataset
path = "./data/spotify_data.csv"
df = pd.read_csv(path, low_memory=False)
In [19]:
# We will define a function to display some detailed information about our dataframe

def show_info(dataframe):
    # Display first few rows
    print("\nFirst Few Rows\n")
    display(dataframe.head())
    
    # Basic stats
    print("\nBasic Numeric Stats\n")
    display(dataframe.describe())
    
    print("\nShape & Cols\n")
    print(dataframe.shape)
    print(dataframe.columns)
    
    # Show column info
    print("\nDetailed Column Info\n")
    dataframe.info(verbose=True, show_counts=True)
In [20]:
show_info(df)
First Few Rows

Unnamed: 0 artist_name track_name track_id popularity year genre danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_ms time_signature
0 0 Jason Mraz I Won't Give Up 53QF56cjZA9RTuuMZDrSA6 68 2012 acoustic 0.483 0.303 4 -10.058 1 0.0429 0.6940 0.000000 0.1150 0.139 133.406 240166 3
1 1 Jason Mraz 93 Million Miles 1s8tP3jP4GZcyHDsjvw218 50 2012 acoustic 0.572 0.454 3 -10.286 1 0.0258 0.4770 0.000014 0.0974 0.515 140.182 216387 4
2 2 Joshua Hyslop Do Not Let Me Go 7BRCa8MPiyuvr2VU3O9W0F 57 2012 acoustic 0.409 0.234 3 -13.711 1 0.0323 0.3380 0.000050 0.0895 0.145 139.832 158960 4
3 3 Boyce Avenue Fast Car 63wsZUhUZLlh1OsyrZq7sz 58 2012 acoustic 0.392 0.251 10 -9.845 1 0.0363 0.8070 0.000000 0.0797 0.508 204.961 304293 4
4 4 Andrew Belle Sky's Still Blue 6nXIYClvJAfi6ujLiKqEq8 54 2012 acoustic 0.430 0.791 6 -5.419 0 0.0302 0.0726 0.019300 0.1100 0.217 171.864 244320 4
Basic Numeric Stats

Unnamed: 0 popularity year danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_ms time_signature
count 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06
mean 6.590613e+05 1.838312e+01 2.011955e+03 5.374382e-01 6.396699e-01 5.287778e+00 -8.981353e+00 6.346533e-01 9.281477e-02 3.215370e-01 2.523489e-01 2.230189e-01 4.555636e-01 1.213771e+02 2.495618e+05 3.885879e+00
std 4.285492e+05 1.588554e+01 6.803901e+00 1.844780e-01 2.705009e-01 3.555197e+00 5.682215e+00 4.815275e-01 1.268409e-01 3.549872e-01 3.650731e-01 2.010707e-01 2.685190e-01 2.977975e+01 1.494262e+05 4.676967e-01
min 0.000000e+00 0.000000e+00 2.000000e+03 0.000000e+00 0.000000e+00 0.000000e+00 -5.810000e+01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.073000e+03 0.000000e+00
25% 2.899408e+05 5.000000e+00 2.006000e+03 4.130000e-01 4.540000e-01 2.000000e+00 -1.082900e+01 0.000000e+00 3.710000e-02 6.400000e-03 1.050000e-06 9.790000e-02 2.260000e-01 9.879700e+01 1.810910e+05 4.000000e+00
50% 5.798815e+05 1.500000e+01 2.012000e+03 5.500000e-01 6.940000e-01 5.000000e+00 -7.450000e+00 1.000000e+00 5.070000e-02 1.470000e-01 1.760000e-03 1.340000e-01 4.380000e-01 1.219310e+02 2.257440e+05 4.000000e+00
75% 1.031689e+06 2.900000e+01 2.018000e+03 6.770000e-01 8.730000e-01 8.000000e+00 -5.276000e+00 1.000000e+00 8.900000e-02 6.400000e-01 6.140000e-01 2.920000e-01 6.740000e-01 1.399030e+02 2.869135e+05 4.000000e+00
max 1.473395e+06 1.000000e+02 2.023000e+03 9.930000e-01 1.000000e+00 1.100000e+01 6.172000e+00 1.000000e+00 9.710000e-01 9.960000e-01 1.000000e+00 1.000000e+00 1.000000e+00 2.499930e+02 6.000495e+06 5.000000e+00
Shape & Cols

(1159764, 20)
Index(['Unnamed: 0', 'artist_name', 'track_name', 'track_id', 'popularity',
       'year', 'genre', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature'],
      dtype='object')

Detailed Column Info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 20 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1159764 non-null  int64  
 1   artist_name       1159749 non-null  object 
 2   track_name        1159763 non-null  object 
 3   track_id          1159764 non-null  object 
 4   popularity        1159764 non-null  int64  
 5   year              1159764 non-null  int64  
 6   genre             1159764 non-null  object 
 7   danceability      1159764 non-null  float64
 8   energy            1159764 non-null  float64
 9   key               1159764 non-null  int64  
 10  loudness          1159764 non-null  float64
 11  mode              1159764 non-null  int64  
 12  speechiness       1159764 non-null  float64
 13  acousticness      1159764 non-null  float64
 14  instrumentalness  1159764 non-null  float64
 15  liveness          1159764 non-null  float64
 16  valence           1159764 non-null  float64
 17  tempo             1159764 non-null  float64
 18  duration_ms       1159764 non-null  int64  
 19  time_signature    1159764 non-null  int64  
dtypes: float64(9), int64(7), object(4)
memory usage: 177.0+ MB

Visualize Correlation Matrix¶

In [22]:
# Select a subset of continuous variables
numeric_cols = ['year', 'danceability', 'energy', 'key',
                'loudness', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 
                'tempo', 'duration_ms', 'popularity']

corr_matrix = df[numeric_cols].corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap of Selected Features")
plt.show()
[Figure: correlation heatmap of the selected features]

We see linear correlation between certain variables, such as a strong negative correlation between acousticness and energy, acousticness and loudness, and instrumentalness and loudness. We also see positive correlation between loudness and energy, and between valence and danceability. It is not surprising that these variables are related, but it will be interesting to see how they affect modeling and prediction.
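
As a quick sanity check, we can also rank the strongest pairwise correlations programmatically. The snippet below is a small optional sketch that assumes the corr_matrix computed in the cell above:

# Keep only the upper triangle (excluding the diagonal) so each pair appears once
upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_pairs = corr_matrix.where(upper).stack()
print(corr_pairs.sort_values(key=np.abs, ascending=False).head(5))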

2.2 Data Preprocessing¶

We will start by dropping unnecessary columns. Columns like Unnamed: 0 and track_id bring no value to our modeling. Columns like artist_name and track_name can lead to data leakage, as the model may learn to predict popularity based on a popular artist's name (e.g., Taylor Swift or Ariana Grande) rather than purely from musical parameters.

In [26]:
cols_to_drop = ['Unnamed: 0', 'artist_name', 'track_name', 'track_id']
df_dropped = df.drop(cols_to_drop, axis=1)

We will also encode the year column as 2023 - year, to get the number of years since song release.

In [28]:
df_dropped['year'] = 2023 - df_dropped['year']

Now, we will create interaction features by multiplying related columns.

In [30]:
df_dropped['dance_energy'] = df_dropped['danceability'] * df_dropped['energy']
df_dropped['valence_tempo'] = df_dropped['valence'] * df_dropped['tempo']

Since we cannot use the artist's name as a column, we will create a column that tallies the number of songs released by each artist. Less popular songs tend to come from artists with fewer releases.

In [32]:
# Count songs per artist
artist_song_counts = df['artist_name'].value_counts()

# Map the counts back to the dataframe
df_dropped['artist_song_count'] = df['artist_name'].map(artist_song_counts)
df_dropped['artist_song_count'] = df_dropped['artist_song_count'].fillna(1)

Now that we have dropped irrelevant columns, we can proceed with checking for any missing values.

In [34]:
# Display only cols with null vals, and print num rows w missing vals
missing_values = df_dropped.isnull().sum()
display(missing_values[missing_values > 0])
missing_value_count = missing_values[missing_values > 0].count()
print(f"Number of columns with missing values: {missing_value_count}")
Series([], dtype: int64)
Number of columns with missing values: 0

We can see there are no missing values.

However, we see the genre feature is a string. It will need to be encoded numerically for modeling.

We will one-hot encode using get_dummies with drop_first=True to avoid multicollinearity.

In [36]:
df_encoded = pd.get_dummies(df_dropped, columns=['genre'], drop_first=True)
In [37]:
show_info(df_encoded)
First Few Rows

popularity year danceability energy key loudness mode speechiness acousticness instrumentalness ... genre_ska genre_sleep genre_songwriter genre_soul genre_spanish genre_swedish genre_tango genre_techno genre_trance genre_trip-hop
0 68 11 0.483 0.303 4 -10.058 1 0.0429 0.6940 0.000000 ... False False False False False False False False False False
1 50 11 0.572 0.454 3 -10.286 1 0.0258 0.4770 0.000014 ... False False False False False False False False False False
2 57 11 0.409 0.234 3 -13.711 1 0.0323 0.3380 0.000050 ... False False False False False False False False False False
3 58 11 0.392 0.251 10 -9.845 1 0.0363 0.8070 0.000000 ... False False False False False False False False False False
4 54 11 0.430 0.791 6 -5.419 0 0.0302 0.0726 0.019300 ... False False False False False False False False False False

5 rows × 99 columns

Basic Numeric Stats

popularity year danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_ms time_signature dance_energy valence_tempo artist_song_count
count 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06 1.159764e+06
mean 1.838312e+01 1.104500e+01 5.374382e-01 6.396699e-01 5.287778e+00 -8.981353e+00 6.346533e-01 9.281477e-02 3.215370e-01 2.523489e-01 2.230189e-01 4.555636e-01 1.213771e+02 2.495618e+05 3.885879e+00 3.507327e-01 5.595898e+01 1.178108e+02
std 1.588554e+01 6.803901e+00 1.844780e-01 2.705009e-01 3.555197e+00 5.682215e+00 4.815275e-01 1.268409e-01 3.549872e-01 3.650731e-01 2.010707e-01 2.685190e-01 2.977975e+01 1.494262e+05 4.676967e-01 1.853440e-01 3.667326e+01 3.016462e+02
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 -5.810000e+01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.073000e+03 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00
25% 5.000000e+00 5.000000e+00 4.130000e-01 4.540000e-01 2.000000e+00 -1.082900e+01 0.000000e+00 3.710000e-02 6.400000e-03 1.050000e-06 9.790000e-02 2.260000e-01 9.879700e+01 1.810910e+05 4.000000e+00 2.095020e-01 2.592107e+01 2.200000e+01
50% 1.500000e+01 1.100000e+01 5.500000e-01 6.940000e-01 5.000000e+00 -7.450000e+00 1.000000e+00 5.070000e-02 1.470000e-01 1.760000e-03 1.340000e-01 4.380000e-01 1.219310e+02 2.257440e+05 4.000000e+00 3.618060e-01 5.132775e+01 5.400000e+01
75% 2.900000e+01 1.700000e+01 6.770000e-01 8.730000e-01 8.000000e+00 -5.276000e+00 1.000000e+00 8.900000e-02 6.400000e-01 6.140000e-01 2.920000e-01 6.740000e-01 1.399030e+02 2.869135e+05 4.000000e+00 4.890000e-01 8.074121e+01 1.140000e+02
max 1.000000e+02 2.300000e+01 9.930000e-01 1.000000e+00 1.100000e+01 6.172000e+00 1.000000e+00 9.710000e-01 9.960000e-01 1.000000e+00 1.000000e+00 1.000000e+00 2.499930e+02 6.000495e+06 5.000000e+00 9.516000e-01 2.273503e+02 4.058000e+03
Shape & Cols

(1159764, 99)
Index(['popularity', 'year', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature', 'dance_energy',
       'valence_tempo', 'artist_song_count', 'genre_afrobeat',
       'genre_alt-rock', 'genre_ambient', 'genre_black-metal', 'genre_blues',
       'genre_breakbeat', 'genre_cantopop', 'genre_chicago-house',
       'genre_chill', 'genre_classical', 'genre_club', 'genre_comedy',
       'genre_country', 'genre_dance', 'genre_dancehall', 'genre_death-metal',
       'genre_deep-house', 'genre_detroit-techno', 'genre_disco',
       'genre_drum-and-bass', 'genre_dub', 'genre_dubstep', 'genre_edm',
       'genre_electro', 'genre_electronic', 'genre_emo', 'genre_folk',
       'genre_forro', 'genre_french', 'genre_funk', 'genre_garage',
       'genre_german', 'genre_gospel', 'genre_goth', 'genre_grindcore',
       'genre_groove', 'genre_guitar', 'genre_hard-rock', 'genre_hardcore',
       'genre_hardstyle', 'genre_heavy-metal', 'genre_hip-hop', 'genre_house',
       'genre_indian', 'genre_indie-pop', 'genre_industrial', 'genre_jazz',
       'genre_k-pop', 'genre_metal', 'genre_metalcore', 'genre_minimal-techno',
       'genre_new-age', 'genre_opera', 'genre_party', 'genre_piano',
       'genre_pop', 'genre_pop-film', 'genre_power-pop',
       'genre_progressive-house', 'genre_psych-rock', 'genre_punk',
       'genre_punk-rock', 'genre_rock', 'genre_rock-n-roll', 'genre_romance',
       'genre_sad', 'genre_salsa', 'genre_samba', 'genre_sertanejo',
       'genre_show-tunes', 'genre_singer-songwriter', 'genre_ska',
       'genre_sleep', 'genre_songwriter', 'genre_soul', 'genre_spanish',
       'genre_swedish', 'genre_tango', 'genre_techno', 'genre_trance',
       'genre_trip-hop'],
      dtype='object')

Detailed Column Info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 99 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   popularity               1159764 non-null  int64  
 1   year                     1159764 non-null  int64  
 2   danceability             1159764 non-null  float64
 3   energy                   1159764 non-null  float64
 4   key                      1159764 non-null  int64  
 5   loudness                 1159764 non-null  float64
 6   mode                     1159764 non-null  int64  
 7   speechiness              1159764 non-null  float64
 8   acousticness             1159764 non-null  float64
 9   instrumentalness         1159764 non-null  float64
 10  liveness                 1159764 non-null  float64
 11  valence                  1159764 non-null  float64
 12  tempo                    1159764 non-null  float64
 13  duration_ms              1159764 non-null  int64  
 14  time_signature           1159764 non-null  int64  
 15  dance_energy             1159764 non-null  float64
 16  valence_tempo            1159764 non-null  float64
 17  artist_song_count        1159764 non-null  float64
 18  genre_afrobeat           1159764 non-null  bool   
 19  genre_alt-rock           1159764 non-null  bool   
 20  genre_ambient            1159764 non-null  bool   
 21  genre_black-metal        1159764 non-null  bool   
 22  genre_blues              1159764 non-null  bool   
 23  genre_breakbeat          1159764 non-null  bool   
 24  genre_cantopop           1159764 non-null  bool   
 25  genre_chicago-house      1159764 non-null  bool   
 26  genre_chill              1159764 non-null  bool   
 27  genre_classical          1159764 non-null  bool   
 28  genre_club               1159764 non-null  bool   
 29  genre_comedy             1159764 non-null  bool   
 30  genre_country            1159764 non-null  bool   
 31  genre_dance              1159764 non-null  bool   
 32  genre_dancehall          1159764 non-null  bool   
 33  genre_death-metal        1159764 non-null  bool   
 34  genre_deep-house         1159764 non-null  bool   
 35  genre_detroit-techno     1159764 non-null  bool   
 36  genre_disco              1159764 non-null  bool   
 37  genre_drum-and-bass      1159764 non-null  bool   
 38  genre_dub                1159764 non-null  bool   
 39  genre_dubstep            1159764 non-null  bool   
 40  genre_edm                1159764 non-null  bool   
 41  genre_electro            1159764 non-null  bool   
 42  genre_electronic         1159764 non-null  bool   
 43  genre_emo                1159764 non-null  bool   
 44  genre_folk               1159764 non-null  bool   
 45  genre_forro              1159764 non-null  bool   
 46  genre_french             1159764 non-null  bool   
 47  genre_funk               1159764 non-null  bool   
 48  genre_garage             1159764 non-null  bool   
 49  genre_german             1159764 non-null  bool   
 50  genre_gospel             1159764 non-null  bool   
 51  genre_goth               1159764 non-null  bool   
 52  genre_grindcore          1159764 non-null  bool   
 53  genre_groove             1159764 non-null  bool   
 54  genre_guitar             1159764 non-null  bool   
 55  genre_hard-rock          1159764 non-null  bool   
 56  genre_hardcore           1159764 non-null  bool   
 57  genre_hardstyle          1159764 non-null  bool   
 58  genre_heavy-metal        1159764 non-null  bool   
 59  genre_hip-hop            1159764 non-null  bool   
 60  genre_house              1159764 non-null  bool   
 61  genre_indian             1159764 non-null  bool   
 62  genre_indie-pop          1159764 non-null  bool   
 63  genre_industrial         1159764 non-null  bool   
 64  genre_jazz               1159764 non-null  bool   
 65  genre_k-pop              1159764 non-null  bool   
 66  genre_metal              1159764 non-null  bool   
 67  genre_metalcore          1159764 non-null  bool   
 68  genre_minimal-techno     1159764 non-null  bool   
 69  genre_new-age            1159764 non-null  bool   
 70  genre_opera              1159764 non-null  bool   
 71  genre_party              1159764 non-null  bool   
 72  genre_piano              1159764 non-null  bool   
 73  genre_pop                1159764 non-null  bool   
 74  genre_pop-film           1159764 non-null  bool   
 75  genre_power-pop          1159764 non-null  bool   
 76  genre_progressive-house  1159764 non-null  bool   
 77  genre_psych-rock         1159764 non-null  bool   
 78  genre_punk               1159764 non-null  bool   
 79  genre_punk-rock          1159764 non-null  bool   
 80  genre_rock               1159764 non-null  bool   
 81  genre_rock-n-roll        1159764 non-null  bool   
 82  genre_romance            1159764 non-null  bool   
 83  genre_sad                1159764 non-null  bool   
 84  genre_salsa              1159764 non-null  bool   
 85  genre_samba              1159764 non-null  bool   
 86  genre_sertanejo          1159764 non-null  bool   
 87  genre_show-tunes         1159764 non-null  bool   
 88  genre_singer-songwriter  1159764 non-null  bool   
 89  genre_ska                1159764 non-null  bool   
 90  genre_sleep              1159764 non-null  bool   
 91  genre_songwriter         1159764 non-null  bool   
 92  genre_soul               1159764 non-null  bool   
 93  genre_spanish            1159764 non-null  bool   
 94  genre_swedish            1159764 non-null  bool   
 95  genre_tango              1159764 non-null  bool   
 96  genre_techno             1159764 non-null  bool   
 97  genre_trance             1159764 non-null  bool   
 98  genre_trip-hop           1159764 non-null  bool   
dtypes: bool(81), float64(12), int64(6)
memory usage: 248.9 MB

Now, we have one-hot encoded the genre column. All of our columns are now numeric.

2.3 Modeling¶

First, we will split the data into training and testing datasets.

In [41]:
# Separate features and target (popularity is our target variable)
X = df_encoded.drop('popularity', axis=1)
y = df_encoded['popularity']
In [42]:
# Split data into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

We will now visualize the distribution of the target variable y.

In [44]:
sns.histplot(y, kde=True)
plt.title("Target Distribution")
plt.show()
[Figure: histogram of the target variable (popularity)]

Now, we will create the modeling pipeline.

In [46]:
lr = Pipeline(steps=[
    ("regressor", LinearRegression(n_jobs=-1))
])
In [47]:
lr.fit(X_train, y_train)
Out[47]:
Pipeline(steps=[('regressor', LinearRegression(n_jobs=-1))])

2.4 Evaluation¶

Next, we will evaluate the Linear Regression model using R² and RMSE, defined below.
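
For reference, where $y_i$ are the true popularity values, $\hat{y}_i$ the predictions, and $\bar{y}$ the mean of the true values:

$$ R^2 = 1 - \frac{\sum_{i}(y_i - \hat{y}_i)^2}{\sum_{i}(y_i - \bar{y})^2}, \qquad \text{RMSE} = \sqrt{\frac{1}{n}\sum_{i}(y_i - \hat{y}_i)^2} $$

An R² of 1 indicates perfect prediction, while 0 means the model does no better than predicting the mean; RMSE is expressed in the same units as popularity (0 to 100).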

In [50]:
y_pred = lr.predict(X_test)
In [51]:
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R²: {r2:.3f}, RMSE: {rmse:.2f}")
R²: 0.547, RMSE: 10.69

We see that the linear model performs rather poorly, with an RMSE of over 10 popularity points.


3. Experiment Two: Histogram-based Gradient Boosting Regression Tree¶

For the second experiment, we will implement a histogram-based gradient boosting regression tree.

Histogram-Based Gradient Boosting Decision Trees (HGBDT or HGBR for regression) work by building an ensemble of decision trees in a stage-wise fashion, where each new tree is trained to correct the errors made by the combined previous trees. What sets HGB apart is its use of histogram-based binning to speed up training and reduce memory usage. Instead of evaluating every unique value in the dataset for splits, HGB first discretizes continuous features into a fixed number of bins (histograms), and then finds the optimal split points from those bins. This significantly accelerates the split-finding process without a major loss in accuracy. Like other gradient boosting methods, HGB minimizes a loss function (in our case, Mean Squared Error) using gradient descent, and it naturally handles non-linear relationships and feature interactions.
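
To make the binning idea concrete, below is a small standalone sketch (illustrative only, not part of the modeling pipeline) that discretizes a synthetic tempo-like feature into quantile bins; HistGradientBoostingRegressor performs a similar discretization internally, using up to 255 bins by default.

# Illustrative sketch of histogram binning on a synthetic "tempo" feature
import numpy as np

rng = np.random.default_rng(42)
tempo_demo = rng.normal(120, 30, size=10_000)

n_bins = 10                                            # small bin count for readability
bin_edges = np.quantile(tempo_demo, np.linspace(0, 1, n_bins + 1))
bin_index = np.digitize(tempo_demo, bin_edges[1:-1])   # bin id for each sample (0..n_bins-1)

print(bin_edges.round(1))                              # candidate split points
print(np.bincount(bin_index))                          # roughly equal-sized bins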

In [56]:
pipeline = Pipeline(steps=[
    ('regressor', HistGradientBoostingRegressor(
        l2_regularization = 10.0,
        learning_rate = 0.2,
        max_depth = 10,
        max_iter = 1000,
        min_samples_leaf = 100
    ))
])
In [57]:
# Define the parameter grid for the HistGradientBoostingRegressor.
# param_grid = {
#     'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'regressor__max_iter': [100, 300, 500, 1000],
#     'regressor__max_depth': [None, 5, 10],
#     'regressor__min_samples_leaf': [20, 50, 100],
#     'regressor__l2_regularization': [0.0, 1.0, 10.0],
# }

# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# print("Best parameters found:", grid_search.best_params_)
# print("Best cross-validation R^2:", grid_search.best_score_)

# # You can now use the best model:
# best_model = grid_search.best_estimator_

# score = best_model.score(X_test, y_test)
# print(f"R^2 on test set: {score:.3f}")

We used the code above (commented out after running) to perform a grid search for the best model parameters. The results are detailed below:

Best parameters found: {'regressor__l2_regularization': 10.0, 'regressor__learning_rate': 0.2, 'regressor__max_depth': 10, 'regressor__max_iter': 1000, 'regressor__min_samples_leaf': 100}
Best cross-validation R²: 0.695

Now, we will fit and evaluate the model.

In [60]:
pipeline.fit(X_train, y_train)
Out[60]:
Pipeline(steps=[('regressor',
                 HistGradientBoostingRegressor(l2_regularization=10.0,
                                               learning_rate=0.2, max_depth=10,
                                               max_iter=1000,
                                               min_samples_leaf=100))])
In [61]:
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R²: {r2:.3f}, RMSE: {rmse:.2f}")
R²: 0.695, RMSE: 8.77

We see that the Histogram Gradient Boosting Regressor performs moderately better than the Linear Regression model, reaching an R² score of 0.695, an improvement of 0.148, along with a reduction of nearly 2 points in RMSE.


4. Experiment Three: XGBoost¶

For the third experiment, we will implement XGBoost, a popular algorithm that can benefit from parallel GPU compute.

XGBoost (Extreme Gradient Boosting) is a powerful and efficient implementation of gradient boosting machines, designed for speed, performance, and flexibility. Like other boosting methods, XGBoost builds an ensemble of decision trees in a sequential manner, where each tree attempts to correct the residual errors of the previous ones. What sets XGBoost apart is its use of advanced regularization (via reg_alpha and reg_lambda) to reduce overfitting, its support for parallel computation, and its ability to efficiently handle sparse data. It also supports GPU acceleration, which dramatically speeds up training on large datasets. To fine-tune the model's performance, we implemented a randomized search (RandomizedSearchCV) to optimize key hyperparameters such as the number of trees (n_estimators), learning rate, maximum tree depth, and regularization terms. This approach allowed us to systematically explore a wide range of model configurations and identify the combination that yielded the highest R² score across the cross-validation folds.
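
Roughly speaking, the objective XGBoost minimizes combines the training loss with a per-tree regularization term of the form below, where $T$ is the number of leaves in a tree, $w$ its leaf weights, and $\lambda$, $\alpha$, $\gamma$ correspond to the reg_lambda, reg_alpha, and gamma hyperparameters explored in the search:

$$ \mathcal{L} = \sum_{i} l(y_i, \hat{y}_i) + \sum_{k} \Omega(f_k), \qquad \Omega(f) = \gamma T + \frac{1}{2}\lambda \lVert w \rVert^2 + \alpha \lVert w \rVert_1 $$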

In [66]:
# # Define a hyperparameter search space
# param_dist = {
#     'n_estimators': [100, 300, 500, 700, 1000, 5000],
#     'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
#     'max_depth': [3, 4, 6, 8, 10, 12],
#     'min_child_weight': [1, 3, 5, 10, 20],
#     'subsample': [0.5, 0.7, 0.8, 0.9, 1.0],
#     'colsample_bytree': [0.5, 0.7, 0.8, 0.9, 1.0],
#     'gamma': [0, 0.1, 0.3, 0.5, 1],
#     'reg_alpha': [0, 0.01, 0.1, 1, 10],
#     'reg_lambda': [0.1, 1, 5, 10, 50],
# }

# # Set up the randomized search
# random_search = RandomizedSearchCV(
#     estimator=XGBRegressor(device='cuda', objective='reg:squarederror', random_state=42),
#     param_distributions=param_dist,
#     scoring='r2',
#     n_iter=100,               # Number of random combos to try
#     cv=3,
#     verbose=2,
#     random_state=42,
#     n_jobs=15
# )

# # Fit the model
# random_search.fit(X_train, y_train)

# # Output results
# print("Best R² Score: {:.4f}".format(random_search.best_score_))
# print("Best Parameters:", random_search.best_params_)

The optimal hyperparameters have been included in the model constructor below. Additionally, we split off a validation set for use with early stopping, ensuring that the model stops training once improvements become negligible. We enable GPU acceleration by setting device="cuda" in the XGBRegressor constructor (for NVIDIA GPUs).

In [68]:
# Split a validation set from the training data
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
In [69]:
# Define the model
xgb = XGBRegressor(
    tree_method="hist",
    device="cuda",
    n_estimators=5000,
    learning_rate=0.05,
    max_depth=15,
    min_child_weight=5,
    subsample=1,
    reg_lambda=0.1,
    reg_alpha=0.01,
    colsample_bytree=0.5,
    early_stopping_rounds=20,
    eval_metric='rmse',
    random_state=42,
)
In [70]:
# Fit with early stopping
xgb.fit(
    X_train_sub, y_train_sub,
    eval_set=[(X_val, y_val)],
    verbose=False
)
Out[70]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.5, device='cuda', early_stopping_rounds=20,
             enable_categorical=False, eval_metric='rmse', feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.05, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=15,
             max_leaves=None, min_child_weight=5, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=5000,
             n_jobs=None, num_parallel_tree=None, ...)
In [125]:
# Predict on final test set
y_pred = xgb.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R²: {r2:.3f}, RMSE: {rmse:.2f}")
R²: 0.722, RMSE: 8.38

We see a significantly improved R² score of 0.722 using the XGBoost model compared to the Linear Regression model. The RMSE has also dropped to 8.38. Unfortunately, our testing did not show any significant improvement beyond this value, indicating that there must be other factors at play that determine song popularity.

In [73]:
plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("Actual Popularity")
plt.ylabel("Predicted Popularity")
plt.title("XGBoost Predictions vs Actual")
plt.show()
[Figure: scatter plot of XGBoost predicted vs. actual popularity]
In [74]:
# plot_importance creates its own Axes unless one is passed in, so we build an
# explicit figure/axes pair and pass ax= to avoid producing an extra empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
plot_importance(xgb, ax=ax, max_num_features=20, importance_type='gain',
                height=0.6, grid=False, show_values=False)
ax.set_title("Top 20 Feature Importances by Gain")
ax.set_xlabel("Average Gain", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)
plt.show()

fig, ax = plt.subplots(figsize=(12, 6))
plot_importance(xgb, ax=ax, max_num_features=20, importance_type='weight',
                height=0.6, grid=False, show_values=False)
ax.set_title("Top 20 Feature Importances by Frequency (Weight)")
ax.set_xlabel("Number of Splits", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)
plt.show()
[Figure: top 20 feature importances by gain]
[Figure: top 20 feature importances by frequency (weight)]

5. Conclusion¶

Over the course of this project, we explored the challenge of predicting a track's popularity using a combination of regression models and a wide range of audio and metadata features. Our dataset included key musical attributes like danceability, energy, acousticness, and speechiness, along with genre flags and the release year. The target variable, popularity, was highly right-skewed, with a median of 15 and a large concentration of low values. This made modeling difficult, especially for linear models. We began with a baseline Linear Regression model, which achieved an R² score of 0.547, capturing some of the trend but falling short in modeling non-linear patterns or accounting for feature interactions.

To address these limitations, we implemented two tree-based models: Histogram-Based Gradient Boosting Regressor (HGBR) and XGBoost. The HGBR model significantly improved performance, reaching an R² of 0.695. It was more effective at capturing non-linear relationships in the data and began highlighting which features mattered most. XGBoost, however, outperformed both previous models. With early stopping enabled and model tuning, it achieved an R² of 0.722, making it the best-performing model in our pipeline. This performance boost came from both hyperparameter tuning and targeted feature engineering — including transforming year into a “track age” feature and introducing a new artist_song_count feature as a proxy for an artist’s reach.

To better understand what the XGBoost model was learning, we visualized feature importance in two ways: by gain (how much each feature improved splits) and by weight (how often a feature was used in splits). These plots revealed that the most impactful features were speechiness, artist_song_count, and key genre flags such as genre_dance, genre_pop, and genre_hip-hop. Features like danceability, instrumentalness, and acousticness played a secondary role. The gain-based plot helped us understand which features contributed the most to the model's accuracy, while the weight-based plot gave us insight into which features were used most frequently in decision-making, even if their individual impact was smaller.

In conclusion, this project demonstrated how model performance can be dramatically improved not just through algorithm selection, but through deep understanding of the data. From Linear Regression to tree-based models, each step forward was enabled by better features, better structure, and more thoughtful interpretation. While there’s room for improvement (e.g., incorporating social data, playlist features, or user interactions), our final XGBoost model provides a robust, interpretable solution for predicting track popularity based solely on intrinsic song attributes and artist-level context.

6. Impact¶

By building a model that predicts song popularity from its musical parameters, we are able to shed some light on which specific elements (genres, tempos, and styles of music) may lead to a song's success. The findings from this project could help producers, record labels, and indie artists by estimating the popularity of their songs, even before they are released.

Additionally, a popularity score could prove to be especially valuable for marketing and promotional strategies by providing a quantifiable estimate of the song's popularity. Ultimately, this may help level the playing field for all artists in the industry, and allow indie producers to get their work out to ears across the world more easily.

However, a potential negative impact of this study is that it may encourage producers to conform to a data-driven formula for success. This could potentially limit artistic creativity and diversity in music, in favor of getting the most "clicks" or "plays."

7. References¶

Dataset Used: https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks