# 單純的練手專案:電影的時間、使用者口味隨時間的變化之類的因素都沒有考慮。
import pandas as pd
import numpy as np
import copy
#這塊是自己寫的一個pearson相關係數的演算法,最好別用這個,我在學習,一堆坑
#from scipy.stats import pearsonr
# pearsonr(vector1,vector2) 建議用scipy框架的pearsonr相關係數
def get_pearsonr(b, o):
    """Return the Pearson correlation coefficient of two rating vectors.

    Positions where either vector holds NaN (a missing rating) are dropped
    from *both* vectors before anything is summed, so the means, variances
    and cross-product all describe the same set of observations.

    Args:
        b: Sequence or 1-D array of numbers (the base vector).
        o: Sequence or 1-D array of numbers with the same length as ``b``.

    Returns:
        The correlation as a float in ``[-1, 1]``, or ``0`` when it is
        undefined: no shared observations, or one of the vectors is constant
        over the shared observations.
    """
    base = np.asarray(b, dtype=float)
    other = np.asarray(o, dtype=float)

    # One mask for both vectors: masking only the cross-product (and not the
    # plain sums) would let a single NaN poison the whole result.
    valid = ~(np.isnan(base) | np.isnan(other))
    base = base[valid]
    other = other[valid]
    if base.size == 0:
        return 0

    # Centre first; this avoids the cancellation in sum(x^2) - sum(x)^2 / n
    # that can push the value under the square root slightly below zero.
    base_dev = base - base.mean()
    other_dev = other - other.mean()
    denominator = np.sqrt(np.sum(np.square(base_dev)) * np.sum(np.square(other_dev)))
    if not denominator:
        return 0
    return float(np.sum(base_dev * other_dev) / denominator)
def d_pearsonr(data_f):
    """Score every other row of ``data_f`` against its first row.

    Args:
        data_f: DataFrame whose first row is the base vector and whose
            remaining rows are the vectors to compare with it.

    Returns:
        A ``pd.Series`` of ``get_pearsonr`` scores, labelled with the index
        labels of rows 1..n-1 and sorted in ascending order, or ``-1`` when
        there is no row to compare against.
    """
    n_arr = data_f.values
    if len(n_arr) < 2:
        return -1
    base_vector = n_arr[0, :]
    # Skip the base row by position. Iterating a 2-D array creates a fresh
    # view object per row, so an identity test (`is`) against `base_vector`
    # can never match and would not exclude it.
    d_values = [get_pearsonr(base_vector, o_vector) for o_vector in n_arr[1:]]
    return pd.Series(d_values, index=data_f.index[1:]).sort_values()
# emmm 資料來源於 MovieLens 的 ml-latest-small 資料集
# 沒使用者資料的推薦評分相對較高的熱門電影.
# 毫無難度所以比較適合入門,沒區分時間所以並不准,只適合了解推薦演算法這個東西
def start_studio(data_f):
    """Return the labels of the (up to) three rows most similar to row 0.

    Args:
        data_f: DataFrame whose first row is the base vector; it is passed
            unchanged to ``d_pearsonr``.

    Returns:
        A ``pd.Index`` holding at most three labels, highest score first.
        It is empty when ``d_pearsonr`` reports nothing to compare with.
    """
    scores = d_pearsonr(data_f)
    if not isinstance(scores, pd.Series):
        # d_pearsonr signals "no candidates" with -1, which has no .index.
        return data_f.index[0:0]
    # The base row must never be its own neighbour, and the best matches are
    # the largest scores: re-sort descending rather than rely on the order
    # in which the scores were returned.
    scores = scores.drop(labels=data_f.index[0], errors="ignore")
    result = scores.sort_values(ascending=False).index[0:3]
    return result
def newuser_reeccomendation(ratings):
    """Recommend popular, well-rated movies to a user with no ratings.

    A movie qualifies when it is in both the top 100 by number of ratings
    and the top 100 by summed rating. Up to 20 qualifying movies, ordered by
    summed rating (highest first), are handed to ``fetch_movieinfo``.

    Args:
        ratings: DataFrame with user-id, movie-id and rating columns. Column
            names are matched case-insensitively (``userId`` or ``userid``).
            The frame is not modified.

    Returns:
        Whatever ``fetch_movieinfo`` returns for the selected movie ids.
    """
    # rename() returns a new frame, so the caller's data stays untouched.
    frame = ratings.rename(columns=str.lower)
    per_movie = frame.groupby('movieid')['rating'].agg(['count', 'sum'])

    most_rated = set(per_movie.sort_values(by='count', ascending=False).index[0:100])
    best_rated = per_movie.sort_values(by='sum', ascending=False).index[0:100]

    # Walk the rating ranking instead of slicing a set intersection, so the
    # 20 ids that are kept are the best ones rather than an arbitrary subset.
    picks = [m_id for m_id in best_rated if m_id in most_rated]
    return fetch_movieinfo(picks[0:20])
def fetch_movieinfo(movieids):
    """Look up display text for the first 20 of the given movie ids.

    Args:
        movieids: Sequence of movie ids; only the first 20 are used.

    Returns:
        ``-1`` when fewer than 20 ids are supplied. Otherwise a list with
        one string per id found in ``./ml-latest-small/movies.csv``, built
        from the movie's title and genres. Ids missing from the file are
        skipped.
    """
    if len(movieids) < 20:
        return -1
    movies = pd.read_csv('./ml-latest-small/movies.csv', delimiter=',')
    # Accept either header spelling (movieId / movieid) and index by id once
    # instead of scanning the whole table twice per movie.
    movies = movies.rename(columns=str.lower)
    movies = movies.drop_duplicates(subset='movieid').set_index('movieid')

    r_movies = []
    for m_id in movieids[0:20]:
        if m_id not in movies.index:
            continue
        row = movies.loc[m_id]
        name_str = '電影名稱:' + str(row['title'])
        # NOTE(review): this label reads "actors" but the value comes from
        # the genres column; the label text is left as it was.
        actor_str = '演員' + str(row['genres'])
        r_movies.append(name_str + ' ' + actor_str)
    return r_movies
def push(userid, min_common=26):
    """Recommend movies to ``userid`` from the ratings of similar users.

    Steps:
      1. Load ``./ml-latest-small/ratings.csv``.
      2. If the user has rated nothing, defer to ``newuser_reeccomendation``.
      3. Keep the users who share more than ``min_common`` rated movies
         with the target user.
      4. Build a user x movie rating matrix over the target user's movies
         (target user in the first row, NaN for missing ratings) and let
         ``start_studio`` pick the most similar users.
      5. Collect those users' movies, highest rated first, drop the ones the
         target user has already rated, and pass them to ``fetch_movieinfo``.

    Args:
        userid: Id of the user to recommend for.
        min_common: A neighbour must share strictly more than this many
            rated movies with the target user. Defaults to 26.

    Returns:
        Whatever ``newuser_reeccomendation`` or ``fetch_movieinfo`` returns.
    """
    ratings = pd.read_csv('./ml-latest-small/ratings.csv', delimiter=',')
    # Accept either header spelling (userId / userid, movieId / movieid).
    ratings = ratings.rename(columns=str.lower)

    real_columns = set(ratings[ratings['userid'] == userid]['movieid'])
    if not real_columns:
        return newuser_reeccomendation(ratings)

    # Only ratings of movies the target user has seen matter for similarity.
    shared = ratings[ratings['movieid'].isin(list(real_columns))]
    overlap = shared.groupby('userid')['movieid'].nunique()
    neighbours = [
        o_userid
        for o_userid in overlap[overlap > min_common].index
        if o_userid != userid
    ]

    s_userids = []
    if neighbours:
        real_indexs = [userid] + neighbours
        # A pivot keeps every row on the same movie columns. Building the
        # target row in file order and the other rows in set order would
        # compare ratings of different movies with each other.
        matrix = (
            shared[shared['userid'].isin(real_indexs)]
            .pivot_table(index='userid', columns='movieid',
                         values='rating', aggfunc='first')
            .reindex(real_indexs)
        )
        s_userids = list(start_studio(matrix))

    # Rank the neighbours' movies by rating and de-duplicate in that order,
    # so the ranking survives until fetch_movieinfo takes the first 20.
    liked = ratings[ratings['userid'].isin(s_userids)]
    ranked = liked.sort_values(by='rating', ascending=False, kind='stable')['movieid']
    real_movie = [m_id for m_id in dict.fromkeys(ranked) if m_id not in real_columns]
    return fetch_movieinfo(real_movie)
# Script entry: print the recommendations computed for user 345.
print(push(userid=345))
Python 相關性分析
資料來源示例 餐飲銷量資料相關性分析 import pandas as pd catering sale data catering sale all.xls 餐飲資料,含有其他屬性 data pd.read excel catering sale,index col u 日期 讀取資料,指定 日期...
spearman相關性分析 相關性分析
r語言常用函式 cor 預設結果為矩陣 cor mydat,use method use 缺失值的處理,method 處理方法 cor x,y 可以計算非方形矩陣,x y分別為2個矩陣,相同的行數 cor.test x,y,alternative method x y為檢驗相關性的變數 librar...
統計 相關性與自相關性
相關係數度量指的是兩個不同事件彼此之間的相互影響程度;而自相關係數度量的是同一事件在兩個不同時期之間的相關程度,形象地講就是度量自己過去的行為對自己現在的影響。自相關,也稱序列相關,是一個訊號於其自身在不同時間點的互相關。非正式地來說,它就是兩次觀察之間的相似度對它們之間的時間差的函式。它是找出重...