-
Notifications
You must be signed in to change notification settings - Fork 4
/
Netflix_Movie_Imputation.R
134 lines (108 loc) · 6.24 KB
/
Netflix_Movie_Imputation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
library("softImpute")
library("reshape2")
library("Metrics")
# ---- Load the data and build the train/test split ----
# NOTE: the working directory is machine specific; change it to the directory
# that holds movies.csv, ratings.csv, tags.csv and links.csv.
setwd("/Users/BlackHawk/Desktop/TheBigD/Netflix_Movie_Data/Small data") # Change this to the directory where your data is
movie_titles <- read.csv("movies.csv") # movieId,title,genres
ratings <- read.csv("ratings.csv") # userId,movieId,rating,timestamp
#---------- may (un)comment to reduce datasize and improve speed --------#
#ratings = ratings[1:10000,]
#----------------------------------------------------------------------#
tags <- read.csv("tags.csv") # userId,movieId,tag,timestamp
links <- read.csv("links.csv") # movieId,imdbId,tmdbId - used in linking different sources of movie data

# Hold out a random fraction of the observed ratings as test data.
# (Call set.seed() before this line if a reproducible split is needed.)
drop_rate <- 0.01
drop_rows <- sample.int(nrow(ratings), floor(nrow(ratings) * drop_rate))
rating_test <- ratings[drop_rows, ] # create our testing data

# Training data: same ratings without the timestamp, with the held-out
# ratings blanked so that they have to be imputed. The ids are kept in their
# original column type (no detour through a numeric matrix) so that the labels
# dcast() derives from them agree with as.character() of the test ids.
rating_training <- ratings[, 1:3]
rating_training[drop_rows, 3] <- NA

# Wide user x movie matrix: one row per userId, one column per movieId.
rating_training <- dcast(rating_training, userId ~ movieId, value.var = "rating")
# dcast() leaves the row names as 1..n, so label the rows with the real user
# ids before dropping the userId column; drop = FALSE keeps a data frame even
# if there is only a single movie column.
rownames(rating_training) <- rating_training$userId
rating_training <- rating_training[, -1, drop = FALSE]

# Test coordinates are looked up by name (user id / movie id as strings), not
# by position, because a row or column position need not equal the id.
test_userids <- as.character(rating_test[, 1])
test_movieids <- as.character(rating_test[, 2])
userid_names <- rownames(rating_training)
movieid_names <- colnames(rating_training)
test_coordinates <- cbind(test_userids, test_movieids)
#---------- A function for getting the ratings given lists of user and movie ids --------#
# Returns the elements x[i[1], j[1]], x[i[2], j[2]], ... as a plain vector.
#   x : matrix (or data frame) to read from
#   i : row selectors, either positions or row names
#   j : column selectors, either positions or column names; same length as i
# Character selectors are matched against the dimnames of x and an unknown
# name is an error. Empty i and j give a zero-length result.
get_coordinates <- function(x, i, j) {
  if (length(i) != length(j)) {
    stop("i and j must have the same length", call. = FALSE)
  }
  # Translate names into positions; anything non-character is used as is.
  to_positions <- function(idx, labels, what) {
    if (!is.character(idx)) {
      return(idx)
    }
    pos <- match(idx, labels)
    if (anyNA(pos)) {
      stop("subscript out of bounds: unknown ", what, " name(s): ",
           paste(unique(idx[is.na(pos)]), collapse = ", "), call. = FALSE)
    }
    pos
  }
  rows <- to_positions(i, rownames(x), "row")
  cols <- to_positions(j, colnames(x), "column")
  # A two-column index matrix picks one element per (row, column) pair in a
  # single vectorised lookup instead of growing the result inside a loop.
  as.vector(as.matrix(x)[cbind(rows, cols)])
}
#-----------------------------------------------------------------------------------------#
# lambda0() gives the lambda at which the fit collapses to the zero matrix,
# so every lambda that is tried later has to stay below this value.
max_lambda <- lambda0(rating_training)
# Upper bound for the rank of the fit: half of the smaller matrix dimension.
matrix_rank <- floor(min(nrow(rating_training), ncol(rating_training)) / 2)
#result_parts = 0
#-------- Calculate vectors of actual (testing) data and predicted (training) data ------#
# Fits softImpute on the training matrix and pairs each held-out rating with
# the value the fit predicts for the same (user, movie) cell.
#   lam      : lambda passed to softImpute
#   method   : "svd" or "als" (softImpute's `type` argument)
#   training : user x movie table with NA for the cells to impute
#   rn, cn   : row / column labels to put on the completed matrix
#   testing  : held-out ratings; column 1 = user id, column 2 = movie id,
#              column 3 = actual rating
#   mr       : rank.max passed to softImpute
# Returns a two-column matrix with columns `actual` and `predicted`.
# Side effect: the fit (`result_parts`) and the unlabelled completed matrix
# (`imputed`) are written to global variables with <<- so that they can be
# inspected after the call.
get_actual_and_predicted <- function(lam, method, training, rn, cn, testing, mr) {
  fit <- softImpute(as.matrix(training), rank.max = mr, lambda = lam, type = method) # returns UDV
  singular_values <- fit$d
  # diag(d) on a single number would build an identity matrix of that size,
  # so the dimension is given explicitly to stay correct for a rank-one fit.
  completed <- as.matrix(fit$u) %*%
    diag(singular_values, nrow = length(singular_values)) %*%
    t(as.matrix(fit$v))
  # <<- in R means write as global variable (can be accessed outside function)
  result_parts <<- fit
  imputed <<- completed
  dimnames(completed) <- list(rn, cn)
  actual <- testing[, 3] # Taken from our testing data
  # Look the test cells up by id, taking the ids from `testing` itself rather
  # than from variables that happen to exist in the global environment.
  wanted_users <- as.character(testing[, 1])
  wanted_movies <- as.character(testing[, 2])
  predicted <- get_coordinates(completed, wanted_users, wanted_movies) # Get the values we predicted with our imputation
  cbind(actual, predicted)
}
#-----------------------------------------------------------------------------------------#
#---------- We will use the below function for testing different values of lambda -------#
# Evaluates a grid of lambda values and reports the test RMSE of each one.
#   max_lam    : reference lambda; the grid is (max_lam / 1000) * i * step
#   training, rn, cn, testing, mr : forwarded to get_actual_and_predicted()
#   type       : "soft" (svd with the lambda grid), "als" (als with the lambda
#                grid) or "hard" (svd with lambda fixed at 0). Any other value
#                behaves like "hard".
#   step       : multiplier applied to the iteration number (default 4, the
#                value that used to be hard-coded)
#   iterations : number of grid points to evaluate (default 30, the value that
#                used to be hard-coded)
# Returns a matrix with one row per iteration and the columns `lambda` and
# `soft_result` (the RMSE for that lambda).
testing_lambda <- function(max_lam, training, rn, cn, testing, mr, type,
                           step = 4, iterations = 30) {
  base_lam <- max_lam / 1000
  grid_index <- seq_len(iterations)

  # The method and the lambda grid depend only on `type`, so they are worked
  # out once instead of on every pass through the loop.
  fit_method <- switch(type, als = "als", "svd")
  lambda <- switch(type,
    soft = ,
    als = base_lam * grid_index * step,
    rep(0, iterations)
  )

  soft_result <- numeric(iterations)
  for (i in grid_index) {
    ap <- get_actual_and_predicted(lambda[i], fit_method, training, rn, cn, testing, mr)
    # Error between the values we predicted and the actual held-out values.
    soft_result[i] <- rmse(ap[, 1], ap[, 2])
    print(c("Soft RMSE for lambda = ", lambda[i], " is : ", soft_result[i], " i is: ", i))
  }
  # The lambdas that were tested and their corresponding error rates.
  cbind(lambda, soft_result)
}
#----------------------------------------------------------------------------------#
# Picks the row with the smallest error from a table whose first column holds
# the lambdas and whose second column holds their errors.
# Returns c(lambda, error) for that row.
get_best_lambda <- function(data) {
  winner <- which.min(data[, 2])
  c(data[winner, 1], data[winner, 2])
}
#--------------------------- Soft SVD -----------------------------#
# Search the lambda grid, then refit with the best lambda and score the fit.
lambdas_and_errors <- testing_lambda(
  max_lam = max_lambda, training = rating_training,
  rn = userid_names, cn = movieid_names,
  testing = rating_test, mr = matrix_rank, type = "soft"
)
soft_lambda <- get_best_lambda(lambdas_and_errors)[1]
ap <- get_actual_and_predicted(
  lam = soft_lambda, method = "svd", training = rating_training,
  rn = userid_names, cn = movieid_names,
  testing = rating_test, mr = matrix_rank
)
# Error between the values we predicted and the actual values.
soft_RMSE <- rmse(ap[, 1], ap[, 2])

#--------------------------- Hard SVD -----------------------------#
# A single svd fit with lambda fixed at 0; there is nothing to search over.
ap <- get_actual_and_predicted(
  lam = 0, method = "svd", training = rating_training,
  rn = userid_names, cn = movieid_names,
  testing = rating_test, mr = matrix_rank
)
hard_RMSE <- rmse(ap[, 1], ap[, 2])

#----------------- Alternating Least Squares (ALS) ----------------#
# Same procedure as for soft SVD, but fitted with the als method.
lambdas_and_errors <- testing_lambda(
  max_lam = max_lambda, training = rating_training,
  rn = userid_names, cn = movieid_names,
  testing = rating_test, mr = matrix_rank, type = "als"
)
als_lambda <- get_best_lambda(lambdas_and_errors)[1]
ap <- get_actual_and_predicted(
  lam = als_lambda, method = "als", training = rating_training,
  rn = userid_names, cn = movieid_names,
  testing = rating_test, mr = matrix_rank
)
ALS_RMSE <- rmse(ap[, 1], ap[, 2])
#problem is a lot of ratings with 0s