import pandas as pd


# Load the Spotify song-attributes CSV into the working DataFrame.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# before running this anywhere else.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/juanc/Downloads/archive/Spotify_Song_Attributes.csv"
)
# Preview the first five rows.
df.head(n=5)


# Count the missing values in each column.
df.isna().sum()

trackName              0
artistName             0
msPlayed               0
genre               1500
danceability         550
energy               550
key                  550
loudness             550
mode                 550
speechiness          550
acousticness         550
instrumentalness     550
liveness             550
valence              550
tempo                550
type                 550
id                   550
uri                  550
track_href           550
analysis_url         550
duration_ms          550
time_signature       550
dtype: int64


# Count rows that exactly repeat an earlier row (first occurrences are not counted).
# NOTE(review): the recorded output reports 5040 such rows out of 10080, and
# nothing later in the visible analysis removes them -- confirm this is intended.
df.duplicated(subset=None, keep="first").sum()

5040


# Print a concise summary of the DataFrame: index range, per-column non-null
# counts, dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10080 entries, 0 to 10079
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   trackName         10080 non-null  object 
 1   artistName        10080 non-null  object 
 2   msPlayed          10080 non-null  int64  
 3   genre             8580 non-null   object 
 4   danceability      9530 non-null   float64
 5   energy            9530 non-null   float64
 6   key               9530 non-null   float64
 7   loudness          9530 non-null   float64
 8   mode              9530 non-null   float64
 9   speechiness       9530 non-null   float64
 10  acousticness      9530 non-null   float64
 11  instrumentalness  9530 non-null   float64
 12  liveness          9530 non-null   float64
 13  valence           9530 non-null   float64
 14  tempo             9530 non-null   float64
 15  type              9530 non-null   object 
 16  id                9530 non-null   object 
 17  uri               9530 non-null   object 
 18  track_href        9530 non-null   object 
 19  analysis_url      9530 non-null   object 
 20  duration_ms       9530 non-null   float64
 21  time_signature    9530 non-null   float64
dtypes: float64(13), int64(1), object(8)
memory usage: 1.7+ MB


# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_stats = df.describe()
# Bare expression so the table is shown as the cell's output.
numeric_stats


# Number of distinct values in the artistName column (missing values are not counted).
unique_artists = df["artistName"].nunique(dropna=True)
unique_artists

2312


# Import necessary libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the 64-bit float and integer columns; these are the ones plotted below.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# One histogram per numeric column, stacked vertically in a single tall figure.
# squeeze=False makes subplots() always return a 2-D array of axes; without it,
# a single numeric column would yield a bare Axes object and indexing into it
# would fail. ravel() then flattens the (n, 1) grid back to a 1-D array.
fig, axs = plt.subplots(len(numeric_columns), 1, figsize=(10, 40), squeeze=False)
axs = axs.ravel()

for hist_ax, column in zip(axs, numeric_columns):
    # Plain counts only; no kernel density estimate is overlaid.
    sns.histplot(df[column], kde=False, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {column}', fontsize=15)

# Prevent titles and tick labels of neighbouring subplots from overlapping.
plt.tight_layout()
plt.show()


# Import numpy library
import numpy as np

# Pairwise correlations between the numeric columns.
corr = df[numeric_columns].corr()

# Hide the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Figure and axes that will host the heatmap.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so negative and positive correlations get distinct hues.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Correlation coefficients lie in [-1, 1], so pin the colour scale to that full
# range, centred on 0. A tighter cap (such as 0.3) would draw every stronger
# correlation in the same saturated colour and hide the differences between them.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

# Title the heatmap axes explicitly rather than relying on the "current" axes.
ax.set_title('Correlation Matrix of Numeric Features', fontsize=15)
plt.show()


# The ten most frequent values of the genre and artistName columns.
top_genres = df['genre'].value_counts().head(10)
top_artists = df['artistName'].value_counts().head(10)

# Two stacked panels: genres on top, artists below.
fig, axs = plt.subplots(2, 1, figsize=(12, 10))

# Each panel is a horizontal bar chart of counts, drawn the same way.
bar_panels = (
    (axs[0], top_genres, 'Top 10 Genres'),
    (axs[1], top_artists, 'Top 10 Artists'),
)
for panel_ax, counts, panel_title in bar_panels:
    sns.barplot(x=counts.values, y=counts.index, ax=panel_ax, palette="Blues_d")
    panel_ax.set_title(panel_title, fontsize=15)

plt.tight_layout()
plt.show()

	trackName	artistName	msPlayed	genre	danceability	energy	key	loudness	mode	speechiness	...	liveness	valence	tempo	type	id	uri	track_href	analysis_url	duration_ms	time_signature
0	"Honest"	Nico Collins	191772	NaN	0.476	0.799	4.0	-4.939	0.0	0.2120	...	0.2570	0.577	162.139	audio_features	7dTxqsaFGHOXwtzHINjfHv	spotify:track:7dTxqsaFGHOXwtzHINjfHv	https://api.spotify.com/v1/tracks/7dTxqsaFGHOX...	https://api.spotify.com/v1/audio-analysis/7dTx...	191948.0	4.0
1	"In The Hall Of The Mountain King" from Peer G...	London Symphony Orchestra	1806234	british orchestra	0.475	0.130	7.0	-17.719	1.0	0.0510	...	0.1010	0.122	112.241	audio_features	14Qcrx6Dfjvcj0H8oV8oUW	spotify:track:14Qcrx6Dfjvcj0H8oV8oUW	https://api.spotify.com/v1/tracks/14Qcrx6Dfjvc...	https://api.spotify.com/v1/audio-analysis/14Qc...	150827.0	4.0
2	#BrooklynBloodPop!	SyKo	145610	glitchcore	0.691	0.814	1.0	-3.788	0.0	0.1170	...	0.3660	0.509	132.012	audio_features	7K9Z3yFNNLv5kwTjQYGjnu	spotify:track:7K9Z3yFNNLv5kwTjQYGjnu	https://api.spotify.com/v1/tracks/7K9Z3yFNNLv5...	https://api.spotify.com/v1/audio-analysis/7K9Z...	145611.0	4.0
3	$10	Good Morning	25058	experimental pop	0.624	0.596	4.0	-9.804	1.0	0.0314	...	0.1190	0.896	120.969	audio_features	3koAwrM1RO0TGMeQJ3qt9J	spotify:track:3koAwrM1RO0TGMeQJ3qt9J	https://api.spotify.com/v1/tracks/3koAwrM1RO0T...	https://api.spotify.com/v1/audio-analysis/3koA...	89509.0	4.0
4	(I Just) Died In Your Arms	Cutting Crew	5504949	album rock	0.625	0.726	11.0	-11.402	0.0	0.0444	...	0.0625	0.507	124.945	audio_features	4ByEFOBuLXpCqvO1kw8Wdm	spotify:track:4ByEFOBuLXpCqvO1kw8Wdm	https://api.spotify.com/v1/tracks/4ByEFOBuLXpC...	https://api.spotify.com/v1/audio-analysis/4ByE...	280400.0	4.0

	msPlayed	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	duration_ms	time_signature
count	1.008000e+04	9530.000000	9530.000000	9530.000000	9530.000000	9530.000000	9530.000000	9530.000000	9530.000000	9530.000000	9530.000000	9530.000000	9.530000e+03	9530.000000
mean	1.519657e+06	0.602469	0.563524	5.241973	-8.685077	0.612382	0.078468	0.362924	0.153215	0.174589	0.434113	119.374474	2.029311e+05	3.917524
std	5.317343e+06	0.157745	0.243548	3.570615	5.414814	0.487232	0.080101	0.334337	0.313132	0.130749	0.242761	28.993087	9.587253e+04	0.386189
min	0.000000e+00	0.000000	0.001080	0.000000	-42.044000	0.000000	0.000000	0.000002	0.000000	0.024900	0.000000	0.000000	1.002700e+04	0.000000
25%	1.367800e+05	0.509000	0.403000	2.000000	-10.189000	0.000000	0.036100	0.053800	0.000000	0.096200	0.237000	97.568000	1.616970e+05	4.000000
50%	2.662875e+05	0.623000	0.589000	5.000000	-7.218000	1.000000	0.047900	0.245000	0.000025	0.119000	0.409000	119.822000	1.942860e+05	4.000000
75%	1.186307e+06	0.714000	0.751000	8.000000	-5.336000	1.000000	0.081900	0.668000	0.027600	0.209000	0.614000	139.785000	2.295260e+05	4.000000
max	1.583671e+08	0.976000	0.999000	11.000000	3.010000	1.000000	0.966000	0.996000	0.993000	0.964000	0.986000	236.196000	4.581483e+06	5.000000

Basic data analysis with the Spotify Song Attributes dataset

Cabral Juan Andrés