I'm trying to reduce the number of dimensions in my dataset for a movie recommender system using SVD. I'm using the 'MovieLens 1M Dataset' from GroupLens.org. I've used the MathNet library for computing SVD and it seems to work alright.
As far as I know I should be able to input my ratings matrix, compute the SVD of the matrix and then keep the top 80% of the singular values of the Sigma matrix. This should give me the best low rank approximation. I was expecting to get about 10-20 singular values for my dataset to reach 80% of the total singular values. I've tested it with 1000 movies and 1000 users and about 135.000 ratings from the MovieLens Dataset. When I compute the SVD of this and keep the top 80% of the sigma matrix i end up with 289 sigular values (dimensions). What am I doing wrong? Shouldnt I be able to reduce the data to a few dimensions (less than 30)
Can I reduce the number of dimensions more and still keep the precision? I've tried filling all 0 values in the matrix with the avg rating from that specific user, but it didnt have the effect i was looking for. Is there something i need to do with the data before or after running the SVD or is my data simply in ~ 300 dimensions? Do you know any other datasets which might be easier for me to reduce the number of dimensions of.
I'm programming in C# and i've included part of my current code here:
using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Factorization;
using System;
using System.Collections.Generic;
using System.Data;
using System.Data.SqlClient;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace RecommenderSystem
{
class Program
{
static void Main(string[] args)
{
//ConnectDatabase();
SqlConnection sqlConnection = new SqlConnection("Data Source=(localdb)\\MSSQLLocalDB;Initial Catalog=millionRatings;Integrated Security=True;Connect Timeout=30;Encrypt=False;TrustServerCertificate=True;ApplicationIntent=ReadWrite;MultiSubnetFailover=False");
SqlCommand cmd = new SqlCommand();
cmd.CommandType = CommandType.Text;
cmd.Connection = sqlConnection;
sqlConnection.Open();
cmd.CommandText = "SELECT TOP 1 [MovieID] FROM dbo.Rating ORDER BY MovieID DESC";
int MaxMovieID = (int)cmd.ExecuteScalar();
cmd.CommandText = "SELECT TOP 1 [UserID] FROM dbo.Rating ORDER BY UserID DESC";
int MaxUserID = (int)cmd.ExecuteScalar();
cmd.CommandText = "SELECT * FROM dbo.Rating ORDER BY UserID ASC";
SqlDataReader reader;
reader = cmd.ExecuteReader();
var elements = new List<Tuple<int, int, double>>();
while (reader.Read())
{
elements.Add(new Tuple<int, int, double>((int)reader["UserID"] - 1, (int)reader["MovieID"] - 1, (double)reader["Score"]));
}
Matrix<double> matrix = Matrix<double>.Build.DenseOfIndexed(MaxUserID, MaxMovieID, elements);
Console.WriteLine("Original Matrix:\n" + matrix);
var _ReducedMatrix = ReducedMatrix(matrix, 1000, 1000);
Console.WriteLine("Reduced Matrix:\n" + _ReducedMatrix);
var SVD = _ReducedMatrix.Svd(true);
Console.WriteLine(SVD.U.ToString("0.00"));
Console.WriteLine(SVD.W.ToString("0.00"));
Console.WriteLine(SVD.VT.ToString("0.00"));
CalculateEnergy(SVD);
var FilledMatrix = FillAvgUserRating(_ReducedMatrix);
Console.WriteLine("Filled Matrix:\n" + FilledMatrix);
var SVDFilled = FilledMatrix.Svd(true);
Console.WriteLine(SVDFilled.U.ToString("0.00"));
Console.WriteLine(SVDFilled.W.ToString("0.00"));
Console.WriteLine(SVDFilled.VT.ToString("0.00"));
CalculateEnergy(SVDFilled);
Console.ReadKey();
}
public static Matrix<double> ReducedMatrix(Matrix<double> matrix, int rows, int columns)
{
var elements = new List<Tuple<int, int, double>>();
for (int i = 0; i < rows; i++)
for (int j = 0; j < columns; j++)
elements.Add(new Tuple<int, int, double>(i, j, matrix[i, j]));
return Matrix<double>.Build.DenseOfIndexed(rows, columns, elements);
}
private static int CalculateEnergy(Svd<double> SVD)
{
int conceptCount = SVD.S.Count();
double TotalEnergy = SVD.S.Sum();
Console.WriteLine("Total Energy: " + TotalEnergy.ToString("0.00"));
double rho = 0.80;
double UsedEnergy = 0;
int Count = 0;
for (int i = 0; i < conceptCount; i++)
{
if (UsedEnergy <= (TotalEnergy * rho) || (SVD.S[i] / TotalEnergy * 100 >= 5.0))
{
UsedEnergy += SVD.S[Count];
Console.WriteLine("Including #" + (i + 1).ToString("00") + " (" + ((SVD.S[i] / TotalEnergy * 100).ToString("00.00") + "% / " + ((UsedEnergy / TotalEnergy * 100).ToString("00.00")) + " % )"));
Count++;
}
else
{
Console.WriteLine("Not Including #" + (i + 1).ToString("00") + " (" + ((SVD.S[i] / TotalEnergy * 100).ToString("00.00") + "% )"));
}
}
Console.WriteLine("\nReturning " + Count + " Concepts!");
return Count;
}
public static Matrix<double> FillAvgUserRating(Matrix<double> matrix)
{
var elements = new List<Tuple<int, int, double>>();
for (int i = 0; i < matrix.RowCount; i++) // each rows
{
// calculate avg of each user
double sum = 0.0;
var count = 0;
for (int j = 0; j < matrix.ColumnCount; j++)
{
if (matrix[i, j] != 0)
{
sum += matrix[i, j];
count++;
}
}
double avg = (count == 0 ? 0 : sum / count);
for (int j = 0; j < matrix.ColumnCount; j++)
{
if (matrix[i, j] == 0)
{
elements.Add(new Tuple<int, int, double>(i, j, avg));
}
else
{
elements.Add(new Tuple<int, int, double>(i, j, matrix[i, j]));
}
}
}
return Matrix<double>.Build.DenseOfIndexed(matrix.RowCount, matrix.ColumnCount, elements);
}
}
}
If anyone is interested in seeing the console output of the code above, i've uploaded it here: https://justpaste.it/15wgc