這篇文章給大家介紹怎么在python3中基于用戶實現協同過濾,內容非常詳細,感興趣的小伙伴們可以參考借鑒,希望對大家能有所幫助。
目前成都創新互聯已為上1000家的企業提供了網站建設、域名、虛擬空間、成都網站托管、企業網站設計、福安網站維護等服務,公司將堅持客戶導向、應用為本的策略,正道將秉承"和諧、參與、激情"的文化,與客戶和合作伙伴齊心協力一起成長,共同發展。python的數據類型有哪些?python的數據類型:1. 數字類型,包括int(整型)、long(長整型)和float(浮點型)。2.字符串,分別是str類型和unicode類型。3.布爾型,Python布爾類型也是用于邏輯運算,有兩個值:True(真)和False(假)。4.列表,列表是Python中使用最頻繁的數據類型,集合中可以放任何數據類型。5. 元組,元組用”()”標識,內部元素用逗號隔開。6. 字典,字典是一種鍵值對的集合。7. 集合,集合是一個無序的、不重復的數據組合。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""User-based collaborative filtering for movie recommendation (2017-09-16 draft).

Ratings are read from ``::``-separated text files, turned into per-user and
per-movie dictionaries, and used to find a user's nearest neighbours (cosine
similarity), build a recommendation list, estimate precision/recall against a
held-out test split and predict ratings for the test movies.
"""
import time
from math import sqrt


def _ratio(hits, total):
    """Return ``hits / total``, or 0.0 when ``total`` is zero."""
    return hits / total if total else 0.0


class CF:
    """User-based collaborative filter over a train/test rating split.

    Data formats used by the instance attributes:

    * ``userDict``: ``{user_id: [(movie_id, rating), ...]}``, e.g.
      ``{'3': [('3421', 4.0), ('1641', 2.0), ...]}``
    * ``ItemUser``: ``{movie_id: [user_id, ...]}``, e.g.
      ``{'1653': ['5', '8', '9'], '194': ['5'], ...}``
    * ``neighbors``: ``[[similarity, user_id], ...]`` sorted descending.
    * ``recommandList``: ``[[accumulated_similarity, movie_id], ...]``.
    * ``forecast``: ``{movie_id: [[similarity, rating, user_id], ...]}``.
    * ``score``: ``{movie_id: [predicted]}``; ``cal_Mae`` extends entries to
      ``[predicted, actual]``.
    """

    def __init__(self, movies, ratings, k=5, n=20):
        """Store the data set and reset all working state.

        Args:
            movies: rows of ``[MovieID, Title, Genres]``.
            ratings: ``[train_rows, test_rows]``; each row is
                ``[UserID, MovieID, Rating, Timestamp]``.
            k: number of nearest neighbours to keep.
            n: number of movies to recommend.
        """
        self.movies = movies
        (self.train_data, self.test_data) = (ratings[0], ratings[1])
        self.k = k
        self.n = n
        # Ratings grouped by user / users grouped by movie (see class docstring).
        self.userDict = {}
        self.ItemUser = {}
        self.neighbors = []
        # recommandList carries similarity + movie id; recommand holds the
        # movie ids counted as hits by the last precision computation.
        self.recommandList = []
        self.recommand = []
        # Movies the user rated in each split.
        self.train_user = []
        self.test_user = []
        # Recommended movie ids derived from each split.
        self.train_rec = []
        self.test_rec = []
        # Rating prediction state.
        self.forecast = {}
        self.score = {}
        # Precision (pre) and recall (z), one slot per evaluation scheme.
        self.pre = [0.0, 0.0]
        self.z = [0.0, 0.0]

    def formatRate(self, train_or_test):
        """Rebuild ``userDict`` and ``ItemUser`` from rating rows.

        Args:
            train_or_test: rows of ``[UserID, MovieID, Rating, Timestamp]``.
        """
        # Plain dicts (not defaultdict) so str(self.userDict) stays a dict literal.
        self.userDict = {}
        self.ItemUser = {}
        for row in train_or_test:
            user_id, movie_id = row[0], row[1]
            # Ratings are kept on their raw scale (no division by 5).
            self.userDict.setdefault(user_id, []).append((movie_id, float(row[2])))
            self.ItemUser.setdefault(movie_id, []).append(user_id)

    def formatuserDict(self, userId, p):
        """Align the ratings of two users on the union of their movies.

        Args:
            userId: the target user.
            p: the candidate neighbour.

        Returns:
            ``{movie_id: [rating_by_userId, rating_by_p]}`` with 0 for a
            movie the respective user has not rated.
        """
        aligned = {}
        for movie_id, rating in self.userDict[userId]:
            aligned[movie_id] = [rating, 0]
        for movie_id, rating in self.userDict[p]:
            if movie_id in aligned:
                aligned[movie_id][1] = rating  # both users rated this movie
            else:
                aligned[movie_id] = [0, rating]  # only the neighbour rated it
        return aligned

    def getCost(self, userId, p):
        """Return the cosine similarity between the ratings of two users.

        Returns 0.0 when the users share no rated movie (zero dot product).
        """
        sum_xx = 0.0
        sum_yy = 0.0
        sum_xy = 0.0
        for own, other in self.formatuserDict(userId, p).values():
            own = float(own)
            other = float(other)
            sum_xx += own * own
            sum_yy += other * other
            sum_xy += own * other
        if sum_xy == 0.0:
            return 0.0
        return sum_xy / sqrt(sum_xx * sum_yy)

    def getNearestNeighbor(self, userId):
        """Fill ``self.neighbors`` with the ``k`` users most similar to ``userId``.

        Candidates are all users who rated at least one movie that
        ``userId`` rated in the currently formatted data.
        """
        # A set gives O(1) de-duplication; the list preserves discovery order.
        seen = {userId}
        candidates = []
        for movie_id, _rating in self.userDict[userId]:
            for other in self.ItemUser[movie_id]:
                if other not in seen:
                    seen.add(other)
                    candidates.append(other)
        scored = [[self.getCost(userId, other), other] for other in candidates]
        scored.sort(reverse=True)  # highest similarity first
        self.neighbors = scored[:self.k]

    def getrecommandList(self, userId):
        """Fill ``self.recommandList`` with the top ``n`` movies of the neighbours.

        Each movie is scored by the summed similarity of the neighbours
        that rated it. ``userId`` is accepted for interface compatibility
        and is not used.
        """
        totals = {}
        for similarity, neighbor_id in self.neighbors:
            for movie_id, _rating in self.userDict[neighbor_id]:
                if movie_id in totals:
                    totals[movie_id] += similarity
                else:
                    totals[movie_id] = similarity
        ranked = [[total, movie_id] for movie_id, total in totals.items()]
        ranked.sort(reverse=True)
        self.recommandList = ranked[:self.n]

    def getPrecision(self, userId):
        """Compute precision (``self.pre``) and recall (``self.z``) for a user.

        Slot 0 compares the recommendations derived from the test split with
        those derived from the training split; slot 1 compares the training
        recommendations with the movies the user actually rated in the test
        split. A ratio with an empty denominator is reported as 0.0.

        The test split is processed first so that ``userDict``, ``ItemUser``
        and ``neighbors`` are left describing the training split afterwards
        (``foreCast`` relies on that).

        Raises:
            KeyError: if ``userId`` is missing from either split.
        """
        (self.test_user, self.test_rec) = self.getRecommand(self.test_data, userId)
        (self.train_user, self.train_rec) = self.getRecommand(self.train_data, userId)

        train_rec = set(self.train_rec)
        test_user = set(self.test_user)

        # Scheme 0: overlap of the two recommendation lists. Building a fresh
        # list here keeps hits from a previous call from leaking in.
        self.recommand = [m for m in self.test_rec if m in train_rec]
        self.pre[0] = _ratio(len(self.recommand), len(self.train_rec))
        self.z[0] = _ratio(len(self.recommand), len(self.test_rec))

        # Scheme 1: recommended movies the user really rated in the test split.
        self.recommand = [m for m in self.train_rec if m in test_user]
        self.pre[1] = _ratio(len(self.recommand), len(self.train_rec))
        self.z[1] = _ratio(len(self.recommand), len(self.test_user))

    def getRecommand(self, train_or_test, userId):
        """Run the full pipeline on one split.

        Returns:
            ``(rated_movie_ids, recommended_movie_ids)`` for ``userId``.
        """
        self.formatRate(train_or_test)
        self.getNearestNeighbor(userId)
        self.getrecommandList(userId)
        rated = [movie_id for movie_id, _rating in self.userDict[userId]]
        recommended = [movie_id for _total, movie_id in self.recommandList]
        return (rated, recommended)

    def foreCast(self):
        """Predict a rating for every movie the user rated in the test split.

        Must be called after ``getPrecision``. For a test movie that was also
        recommended, the prediction is the similarity-weighted mean of the
        neighbours' ratings (rounded to 3 decimals); every other test movie
        gets the placeholder ``[0]``. Results are stored in ``self.score``.
        """
        self.forecast = {}
        similarity_of = {uid: dist for dist, uid in self.neighbors}
        recommended = set(self.train_rec)

        shared = []  # test movies that were also recommended
        for movie_id in self.test_user:
            if movie_id not in recommended:
                continue
            shared.append(movie_id)
            for uid in self.ItemUser.get(movie_id, []):
                if uid not in similarity_of:
                    continue
                # First rating this neighbour gave to the movie.
                rating = next(r for m, r in self.userDict[uid] if m == movie_id)
                self.forecast.setdefault(movie_id, []).append(
                    [similarity_of[uid], rating, uid])

        self.score = {}
        for movie_id in shared:
            weighted = 0
            weight = 0
            for dist, rating, _uid in self.forecast.get(movie_id, []):
                weighted += dist * rating
                weight += dist
            # Guard against an all-zero weight instead of dividing by zero.
            self.score[movie_id] = [round(weighted / weight, 3)] if weight else [0]

        # Only movies without a prediction get the placeholder; predictions
        # computed above must not be overwritten.
        for movie_id in self.test_user:
            self.score.setdefault(movie_id, [0])

    def cal_Mae(self, userId):
        """Pair every predicted rating with the user's actual test rating.

        Re-formats the instance dictionaries from the test split as a side
        effect. Calling it repeatedly yields the same result.

        Returns:
            ``self.score`` with entries of the form ``[predicted, actual]``
            for each movie ``userId`` rated in the test split.
        """
        self.formatRate(self.test_data)
        for movie_id, rating in self.userDict[userId]:
            if movie_id in self.score:
                self.score[movie_id] = [self.score[movie_id][0], rating]
        return self.score


def readFile(filename):
    """Read a ``::``-separated data file into a list of field lists.

    Blank lines are skipped; the file is always closed.
    """
    data = []
    with open(filename, "r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if line:
                data.append(line.split("::"))
    return data


def load_dict_from_file(filepath):
    """Load ``key:value`` lines into a dict of strings.

    Only the first ``:`` separates key from value, so values may themselves
    contain colons. An unreadable file is reported and yields ``{}``.
    """
    _dict = {}
    try:
        with open(filepath, 'r', encoding="utf-8") as dict_file:
            for line in dict_file:
                line = line.strip()
                if not line:
                    continue
                key, _sep, value = line.partition(':')
                _dict[key] = value
    except IOError:
        print ("文件 %s 不存在" % (filepath))
    return _dict


def save_dict_to_file(_dict, filepath):
    """Write a dict as ``key:value`` lines; report a file that cannot be created."""
    try:
        with open(filepath, 'w', encoding="utf-8") as dict_file:
            for (key, value) in _dict.items():
                dict_file.write('%s:%s\n' % (key, value))
    except IOError:
        print ("文件 %s 無(wú)法創(chuàng)建" % (filepath))


def writeFile(data, filename):
    """Write the string ``data`` to ``filename`` as UTF-8."""
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data)


def _load_inputs(data_dir):
    """Return ``(movies, [train_ratings, test_ratings])`` read from ``data_dir``."""
    movies = readFile("%s/movies.dat" % data_dir)
    ratings = [readFile("%s/201709train.txt" % data_dir),
               readFile("%s/201709test.txt" % data_dir)]
    return movies, ratings


# ------------------------- entry points -------------------------------
def start3(data_dir="D:/d", userId='1000'):
    """Evaluate the recommender for one user and print the results."""
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    started = time.perf_counter()
    movies, ratings = _load_inputs(data_dir)
    demo = CF(movies, ratings, k=20)
    demo.getPrecision(userId)
    demo.foreCast()
    print(demo.cal_Mae(userId))
    print("處理的數(shù)據(jù)為%d條" % (len(ratings[0])+len(ratings[1])))
    print(demo.pre)
    print(demo.z)
    finished = time.perf_counter()
    print("耗費(fèi)時(shí)間: %f s" % (finished - started))


def start1(data_dir="D:/d", out_dir="D:/d/dd"):
    """Dump ``userDict`` and ``ItemUser`` of the training split to text files."""
    movies, ratings = _load_inputs(data_dir)
    demo = CF(movies, ratings, k=20)
    demo.formatRate(ratings[0])
    writeFile(str(demo.userDict), "%s/userDict.txt" % out_dir)
    writeFile(str(demo.ItemUser), "%s/ItemUser.txt" % out_dir)
    print("處理結(jié)束")


def start2(data_dir="D:/d", out_dir="D:/d/dd"):
    """Dump a movie-keyed dictionary of the training split to a text file."""
    movies, ratings = _load_inputs(data_dir)
    demo = CF(movies, ratings, k=20)
    # NOTE(review): CF as defined in this file has no formatRate_toMovie()
    # method or movieDict attribute, so this raises AttributeError unless
    # they are supplied elsewhere - confirm before using this entry point.
    demo.formatRate_toMovie(ratings[0])
    writeFile(str(demo.movieDict), "%s/movieDict.txt" % out_dir)
    print("處理結(jié)束")


if __name__ == '__main__':
    start1()
關于怎么在python3中基于用戶實現協同過濾就分享到這里了,希望以上內容可以對大家有一定的幫助,可以學到更多知識。如果覺得文章不錯,可以把它分享出去讓更多的人看到。