import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

📌 Load and Preprocess Data

We are going to use the MovieLens (ml-latest-small) dataset.

ratings = pd.read_csv("ml-latest-small/ratings.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")
display(ratings.head())
display(movies.head())

Ratings:

userId movieId rating timestamp
0 1 1 4.0 964982703
1 1 3 4.0 964981247
2 1 6 4.0 964982224
3 1 47 5.0 964983815
4 1 50 5.0 964982931

Movies:

movieId title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
4 5 Father of the Bride Part II (1995) Comedy
# merge datasets 
data = pd.merge(ratings,movies,on="movieId")
data = data.drop(columns=["timestamp"])
data.head()
userId movieId rating title genres
0 1 1 4.0 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 1 3 4.0 Grumpier Old Men (1995) Comedy|Romance
2 1 6 4.0 Heat (1995) Action|Crime|Thriller
3 1 47 5.0 Seven (a.k.a. Se7en) (1995) Mystery|Thriller
4 1 50 5.0 Usual Suspects, The (1995) Crime|Mystery|Thriller

❓ Why do we encode?

We need to encode userId and movieId because, although they are numeric, they have no meaningful ordinal relationship. For example, movieId = 500 isn't "closer" to movieId = 501 than it is to any other ID.

Encoding also reduces memory usage and training time. For example, if userId ranges from 1 to 10000 but the dataset contains only 100 distinct users, the encoding maps them to 0-99.
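For illustration (this toy snippet is not part of the pipeline; the real encoding happens in the next cell, and the IDs here are made up), this is what the mapping does to a small list of sparse IDs, using the LabelEncoder imported above:

# sparse, non-contiguous IDs are mapped to the contiguous range 0..n_unique-1
LabelEncoder().fit_transform([3, 3, 17, 10000])   # -> array([0, 0, 1, 2])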

# encode categorical features 
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
data['userId'] = user_encoder.fit_transform(data['userId'])
data['movieId'] = movie_encoder.fit_transform(data['movieId'])
data.head()
userId movieId rating title genres
0 0 0 4.0 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 0 2 4.0 Grumpier Old Men (1995) Comedy|Romance
2 0 5 4.0 Heat (1995) Action|Crime|Thriller
3 0 43 5.0 Seven (a.k.a. Se7en) (1995) Mystery|Thriller
4 0 46 5.0 Usual Suspects, The (1995) Crime|Mystery|Thriller
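Note that each encoder remembers the mapping back to the original MovieLens IDs, which is handy later when presenting results. A minimal sketch using the encoders fitted above:

# map encoded IDs back to the original MovieLens IDs
print(user_encoder.inverse_transform([0]))    # smallest original userId (1 in this dataset)
print(movie_encoder.inverse_transform([0]))   # smallest original movieId (1, i.e. Toy Story)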
# create train-test split 
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(len(train_data))
print(len(test_data))
80668
20168

📌 Train Random Forest Model for Rating Prediction

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
# define features and target variables 
x_train, x_test = train_data[["userId","movieId"]], test_data[["userId","movieId"]]
y_train, y_test = train_data["rating"], test_data["rating"]
# train random forest regressor model 

model = RandomForestRegressor(n_estimators=50,random_state=42)
model.fit(x_train,y_train)

RandomForestRegressor(n_estimators=50, random_state=42)
# evaluate model 
import numpy as np 

y_pred = model.predict(x_test)
rmse = root_mean_squared_error(y_test,y_pred)
print('RMSE:', rmse)
RMSE: 1.0604925210240557
# create a baseline model
# baseline: predict the global mean rating (computed over the full ratings table) for every test example

baseline_pred = ratings['rating'].mean()
baseline_rmse = root_mean_squared_error(test_data['rating'],[baseline_pred]*len(test_data))
print('Baseline RMSE:', baseline_rmse)


Baseline RMSE: 1.0488361768130714

We can see that the model's RMSE is close to the baseline's, but it is actually slightly worse than simply predicting the mean rating. Let's try to improve this.

📌 Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV
import time

%%time
# set the parameters that we want to search 
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

grid_search = GridSearchCV(estimator=model,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="neg_mean_squared_error")
grid_search.fit(x_train, y_train)
Fitting 4 folds for each of 180 candidates, totalling 720 fits
CPU times: user 21.4 s, sys: 4.71 s, total: 26.1 s
Wall time: 5min 33s
GridSearchCV(cv=4,
             estimator=RandomForestRegressor(n_estimators=50, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100, 200],
                         'n_estimators': [10, 25, 30, 50, 100, 200]},
             scoring='neg_mean_squared_error', verbose=1)

The grid search gives us a better set of hyperparameters. Let's retrain the model with them and see if we can get a better score.
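For reference, the winning combination and its cross-validated score can be read directly off the fitted search object (a quick sketch; the retraining cell below simply hard-codes those values):

# best parameter combination found by the grid search
print(grid_search.best_params_)

# best_score_ is the negative MSE (because of the chosen scoring), so negate it and take the square root to get an RMSE
print((-grid_search.best_score_) ** 0.5)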

# train random forest regressor model
model = RandomForestRegressor(max_depth=20, min_samples_leaf=20, n_estimators=200,
                              random_state=42)
model.fit(x_train, y_train)
RandomForestRegressor(max_depth=20, min_samples_leaf=20, n_estimators=200, random_state=42)
# evaluate model 

y_pred = model.predict(x_test)
rmse = root_mean_squared_error(y_test,y_pred)
print('RMSE:', rmse)
RMSE: 0.9423977142884801

With the tuned hyperparameters, the RMSE drops from about 1.06 to 0.94, so the model now clearly outperforms the baseline.
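As a closing usage sketch (the specific IDs below are just illustrative), the tuned model can be asked for a predicted rating for a single user/movie pair; original MovieLens IDs have to go through the same encoders used during preprocessing:

# predict the rating user 1 would give movie 50 (original MovieLens IDs)
sample = pd.DataFrame({
    "userId": user_encoder.transform([1]),
    "movieId": movie_encoder.transform([50]),
})
print(model.predict(sample))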