#Created By: Sarika Padmashali
"""Predict the rating a user would give a restaurant.

Usage: python <this script> USER_ID RESTAURANT_ID

Reads ``reviews.json`` (one JSON object per line with the keys ``user_id``,
``business_id`` and ``stars``), builds a user x restaurant rating matrix,
factorizes it into two low-rank matrices with gradient descent and prints
the predicted rating for the requested user/restaurant pair.
"""
import json
import csv
from collections import defaultdict
import sys
import random

import pandas
import numpy

# ``unicode`` only exists on Python 2; on Python 3 ``str`` is already text.
try:
    text_type = unicode
except NameError:
    text_type = str

# Reviews file, opened relative to the current working directory.
REVIEWS_PATH = 'reviews.json'

#Finding K features. K = 2 in our case taken randomly
K = 2


def _load_reviews(path):
    """Read the reviews file once and index its users and restaurants.

    Returns a tuple ``(user_mapping, restaurant_mapping, ratings)`` where the
    two mappings translate an ID to its row/column index (assigned in order
    of first appearance) and ``ratings`` is a list of
    ``(user_index, restaurant_index, stars)`` tuples in file order.
    """
    user_mapping = {}
    restaurant_mapping = {}
    ratings = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            review = json.loads(line)
            # ``len(mapping)`` is evaluated before insertion, so a new ID
            # receives the next free index; a known ID keeps its index.
            user = user_mapping.setdefault(
                review["user_id"], len(user_mapping))
            restaurant = restaurant_mapping.setdefault(
                review["business_id"], len(restaurant_mapping))
            ratings.append((user, restaurant, review["stars"]))
    return user_mapping, restaurant_mapping, ratings


def _build_rating_matrix(n_users, n_restaurants, ratings):
    """Return an n_users x n_restaurants list of lists; 0 marks "no rating"."""
    R = [[0] * n_restaurants for _ in range(n_users)]
    # Applied in file order, so a later review of the same pair wins.
    for user, restaurant, stars in ratings:
        R[user][restaurant] = stars
    return R


#Latent matrix factorization using gradient descent
def latent_matrix_factorization(R, P, Q, K, steps=500, alpha=0.0002, beta=0.02):
    """Factorize the rating matrix R so that ``dot(P, Q.T)`` approximates it.

    Only entries of R greater than 0 are treated as observed ratings; all
    other entries are ignored during training.

    Parameters:
        R: N x M ratings, indexable as ``R[i][j]`` (list of lists or array).
        P: N x K numpy array of initial user factors. Updated in place.
        Q: M x K numpy array of initial restaurant factors. Updated in place
            (the function works on its transposed view).
        K: number of latent features updated per rating.
        steps: maximum number of passes over the observed ratings.
        alpha: learning rate.
        beta: regularization weight.

    Returns:
        ``(P, Q)`` with shapes N x K and M x K.
    """
    Q = Q.T

    # R is never modified here, so the observed cells can be collected once
    # (row-major, the order the nested index loops would visit them) instead
    # of rescanning the whole matrix on every step.
    observed = [(i, j, rating)
                for i, row in enumerate(R)
                for j, rating in enumerate(row)
                if rating > 0]

    for step in range(steps):
        for i, j, rating in observed:
            eij = rating - numpy.dot(P[i, :], Q[:, j])
            for k in range(K):
                P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                # Uses the P[i][k] value that was just updated on the line
                # above (sequential rather than simultaneous update).
                Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        # Regularized squared error over the observed ratings.
        e = 0
        for i, j, rating in observed:
            e = e + pow(rating - numpy.dot(P[i, :], Q[:, j]), 2)
            for k in range(K):
                e = e + (beta / 2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))

        #If error goes below 0.001 then stop
        if e < 0.001:
            break

    return P, Q.T


def main(argv=None):
    """Run the command-line workflow: load, factorize, print the prediction.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` is the user ID and
    ``argv[2]`` the restaurant ID. Exits with an error message when an
    argument is missing or an ID does not occur in the reviews file.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        program = argv[0] if argv else "script"
        sys.exit("usage: %s USER_ID RESTAURANT_ID" % program)

    #Taking inputs - user ID and restaurant ID from command line
    user_id = text_type(argv[1])
    restaurant_id = text_type(argv[2])
    # "%s %s" reproduces the single separating space of a two-item print.
    print("%s %s" % ("User ID entered: ", user_id))
    print("%s %s" % ("Restaurant ID entered: ", restaurant_id))

    #Finding unique users and restaurants in the matrix
    user_mapping, restaurant_mapping, ratings = _load_reviews(REVIEWS_PATH)

    # Validate the IDs before the expensive training run.
    if user_id not in user_mapping:
        sys.exit("Unknown user ID: %s" % user_id)
    if restaurant_id not in restaurant_mapping:
        sys.exit("Unknown restaurant ID: %s" % restaurant_id)
    user_row = user_mapping[user_id]
    restaurant_col = restaurant_mapping[restaurant_id]

    N = len(user_mapping)
    M = len(restaurant_mapping)

    #Creating a matrix R of ratings and populating it from the reviews
    R = _build_rating_matrix(N, M, ratings)

    #P and Q are the factors we want to find
    P = numpy.random.rand(N, K)
    Q = numpy.random.rand(M, K)

    factorP, factorQ = latent_matrix_factorization(R, P, Q, K)

    # Only one cell of dot(factorP, factorQ.T) is needed, so compute just
    # that entry instead of materializing the full N x M approximation.
    predicted = numpy.dot(factorP[user_row, :], factorQ[restaurant_col, :])
    print("%s %s" % (
        "The predited rating for the user id and restaurant id entered above is",
        predicted))


if __name__ == "__main__":
    main()