两层 for 嵌套的循环,算法优化一下?
两层for循环代码,复杂度为n^2,大家又想到什么优化的算法啊吗?
另外还有一个二维list生成式,
UBdata = [[eval(list(user_s.loc[user_s['user'] == u,'history'])[0])['book'].count(int(b)) if b in eval(list(user_s.loc[user_s['user'] == u,'history'])[0])['book'] else 0 for b in books] for u in users]
# 其实我个人不是很喜欢一行式、不是很方便阅读,下面注释代码为嵌套for的详细式子
# for u in users:
# print('用户',u,'------------------\n')
# history = eval(list(user_s.loc[user_s['user'] == u,'history'])[0])
# print('books',history['book'])
# for b in books:
# print(b,history['book'].count(int(b)))
以上代码需求,纵轴 index 为 users,横轴 columns 为 books,从用户历史记录中找到用户看过的书,并计数。
另外,之前发这个帖子想解决的那个三层循环,那个代码是我脑子秀逗了,写错了。这是更改后的代码,正确的
TBdata = np.zeros((len(books),len(tags)))
for b_i,b_v in enumerate(book_tag['book']):
t_v=book_tag['given_tag'][b_i]
TBdata[books.index(b_v),tags.index(t_v)] +=1
代码如上,books是一个装有n万个书籍id的list,tags是所有标签(千把个)的id的list。
book_tag = {'book':[1,2,3,4,4,3,2,3,4],'given_tag':[5,6,7,2,3,4,4,3,2]}
这是一个book_tag例子,格式如上,实际数据很大,而且book和given_tag中有很多重复的。
意义是,book_tag['book'][0]
和book_tag['given_tag'][0]
是相对应的一组(总共有n万组)
代码的目的是生成TBdata# TBdata: index=books,columns=tags
,然后每个点TBdata[b_i,t_i]
的数据如果出现一次,则增加一次。
单纯想降低复杂度。
如果我还没解释清楚请留个言,靴靴
整体源代码是想尝试矩阵分解
源代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
#__author__ : stray_camel
#pip_source : https://mirrors.aliyun.com/pypi/simple
import ast
import csv  # fix error : _csv.Error: field larger than field limit (131072)
import os
import sys

import numpy as np
import pandas as pd
maxInt = sys.maxsize
# Some rows carry very long serialized 'history' fields, which overflow the
# csv module's default 131072-byte field cap. Try progressively smaller
# limits until the platform's C long accepts one.
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Value too large for the underlying C long — shrink 10x and retry.
        maxInt = int(maxInt/10)
    else:
        break
class CF_CB():
    '''
    Collaborative Filtering + Content-based Filtering.

    Builds three count matrices from ``data/users.csv``:

    * ``user_books`` — rows = users, columns = books, cell = how many times
      the user read that book;
    * ``users_tags`` — rows = users, columns = tags;
    * ``TBdata``     — rows = books, columns = tags, cell = how many
      (book, tag) pairs occurred across all histories;

    then prints ``user_books · TBdata`` as a first step toward matrix
    factorization.
    '''

    def __init__(self):
        pass

    def Matrix_factorization(self):
        """Load user histories and build/print the count matrices.

        Reads ``data/users.csv`` (tab-separated, first 5 rows). Each row's
        'history' column is a dict literal string of the form
        ``{'book': [...], 'given_tag': [...]}`` with parallel lists
        (``book[i]`` was tagged with ``given_tag[i]``).
        """
        book_tag = {'book': [], 'given_tag': []}
        books = []
        tags = []
        # NOTE: the 'history' column holds serialized dict strings; pandas
        # cannot parse those via dtype (dict is not a valid dtype), so we
        # parse them explicitly below.
        user_s = pd.read_csv(os.path.dirname(__file__)+'/data/users.csv',
                             nrows=5, sep='\t', engine='python')
        # Parse each history exactly once and cache it; the original code
        # re-eval()ed the same string for every (user, book) cell, i.e.
        # O(users*books) evals. literal_eval is also safe, unlike eval(),
        # for data that did not come from trusted code.
        histories = []
        for raw in user_s['history']:
            h = raw if isinstance(raw, dict) else ast.literal_eval(raw)
            histories.append(h)
            books.extend(set(h['book']))
            tags.extend(set(h['given_tag']))
            # BUGFIX: was book_tag.append(_) — dicts have no .append();
            # extend the two parallel pair lists instead.
            book_tag['book'] += h['book']
            book_tag['given_tag'] += h['given_tag']
        # BUGFIX: dedupe first, THEN sort. The original sorted the list and
        # then passed it through set(), which discards the ordering.
        books = sorted(set(books))
        tags = sorted(set(tags))
        users = list(user_s['user'])
        # O(1) position lookups; list.index() inside the pair loop below
        # would be O(len(books)) per pair.
        book_pos = {b: i for i, b in enumerate(books)}
        tag_pos = {t: i for i, t in enumerate(tags)}
        # UBdata: index=users, columns=books. list.count() already returns 0
        # for absent items, so no membership pre-check is needed.
        UBdata = [[h['book'].count(b) for b in books] for h in histories]
        user_books = pd.DataFrame(UBdata, index=users, columns=books)
        # UTdata: index=users, columns=tags
        UTdata = [[h['given_tag'].count(t) for t in tags] for h in histories]
        users_tags = pd.DataFrame(UTdata, index=users, columns=tags)
        # TBdata: index=books, columns=tags — one linear pass over the
        # (book, tag) pairs; this replaces the O(pairs * len(books)) loop.
        TBdata = np.zeros((len(books), len(tags)))
        for b_v, t_v in zip(book_tag['book'], book_tag['given_tag']):
            TBdata[book_pos[b_v], tag_pos[t_v]] += 1
        TBdata = pd.DataFrame(TBdata, index=books, columns=tags)
        # BUGFIX: was UBdata.values — UBdata is a plain list and has no
        # .values; use the DataFrame built from it.
        print(user_books.values.dot(TBdata.values))
        print('矩阵分解~~')
if __name__ == "__main__":
    # Run the matrix-factorization demo when executed as a script.
    runner = CF_CB()
    runner.Matrix_factorization()
推荐文章: