# 两层 for 嵌套的循环，算法优化一下？

```python
UBdata = [[eval(list(user_s.loc[user_s['user'] == u,'history'])[0])['book'].count(int(b)) if b in eval(list(user_s.loc[user_s['user'] == u,'history'])[0])['book']  else 0 for b in books] for u in users]
# 其实我个人不是很喜欢一行式、不是很方便阅读，下面注释代码为嵌套for的详细式子
# for u in users:
#     print('用户',u,'------------------\n')
#     history = eval(list(user_s.loc[user_s['user'] == u,'history'])[0])
#     print('books',history['book'])
#     for b in books:
#         print(b,history['book'].count(int(b)))
```

```python
TBdata = np.zeros((len(books),len(tags)))
for b_i,b_v in enumerate(book_tag['book']):
    t_v=book_tag['given_tag'][b_i]
    TBdata[books.index(b_v),tags.index(t_v)] +=1
```

book_tag = {'book':[1,2,3,4,4,3,2,3,4],'given_tag':[5,6,7,2,3,4,4,3,2]}

``````
#!/usr/bin/python
# -*- coding: utf-8 -*-
#__author__ : stray_camel
#pip_source : https://mirrors.aliyun.com/pypi/simple
import sys,os
import pandas as pd
import numpy as np

# fix error : _csv.Error: field larger than field limit (131072)
import csv
maxInt = sys.maxsize
while True:
    # Raise the csv field size limit as high as the platform allows:
    # start from sys.maxsize and shrink the candidate tenfold each time
    # csv rejects it with OverflowError.
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        # Integer division keeps the candidate an exact int (no float round-trip).
        maxInt = maxInt // 10

class CF_CB():
    '''
    Collaborative Filtering + Content-based Filtering

    Builds user/book, user/tag and book/tag count matrices from the
    ``history`` column of a tab-separated users file.
    '''
    def __init__(self):
        pass

    def Matrix_factorization(self, path=None, nrows=5):
        '''
        Build the count matrices and print the user-book x book-tag product.

        Each ``history`` cell must be a dict, or the text of a dict literal,
        with ``'book'`` and ``'given_tag'`` lists. The two lists are paired
        position by position, so they are assumed to be the same length
        (TODO confirm against the real data file).

        Args:
            path: Users file to read. Defaults to ``data/users.csv`` in the
                directory containing this module.
            nrows: Number of rows to read from the file (default 5).

        Returns:
            The (users x tags) array ``user_books . book_tags``, which is
            also printed.
        '''
        if path is None:
            # abspath guards against dirname(__file__) being '' (script run
            # from its own directory), which would otherwise yield the
            # root-level path '/data/users.csv'.
            here = os.path.dirname(os.path.abspath(__file__))
            path = os.path.join(here, 'data', 'users.csv')

        # `object` keeps the raw cell text; `dict` is not a dtype pandas documents.
        user_s = pd.read_csv(path, nrows=nrows, sep='\t', engine='python',
                             dtype={'history': object})

        # NOTE(security): eval() executes whatever expression the file
        # contains. Only run this on trusted data; ast.literal_eval is the
        # safe alternative if every cell is a plain literal.
        # Each history is parsed once here instead of once per matrix cell.
        histories = [h if isinstance(h, dict) else eval(h)
                     for h in user_s['history']]
        users = list(user_s['user'])

        # Sorted, de-duplicated axes (sorting after set() keeps the order stable).
        books = sorted({b for h in histories for b in h['book']})
        tags = sorted({t for h in histories for t in h['given_tag']})
        # O(1) position lookups instead of list.index() inside the loops.
        book_pos = {b: i for i, b in enumerate(books)}
        tag_pos = {t: i for i, t in enumerate(tags)}

        UBdata = np.zeros((len(users), len(books)), dtype=int)
        UTdata = np.zeros((len(users), len(tags)), dtype=int)
        TBdata = np.zeros((len(books), len(tags)))
        for row, history in enumerate(histories):
            for b in history['book']:
                UBdata[row, book_pos[b]] += 1
            for t in history['given_tag']:
                UTdata[row, tag_pos[t]] += 1
            # The i-th book of a history is paired with its i-th tag.
            for b, t in zip(history['book'], history['given_tag']):
                TBdata[book_pos[b], tag_pos[t]] += 1

        # UBdata: index=users,columns=books
        user_books = pd.DataFrame(UBdata, index=users, columns=books)

        # UTdata: index=users,columns=tags (built but not used in the product below)
        users_tags = pd.DataFrame(UTdata, index=users, columns=tags)

        # TBdata: index=books,columns=tags
        TBdata = pd.DataFrame(TBdata, index=books, columns=tags)

        # The product must use the DataFrame: the raw count matrix has no
        # `.values` when it is a plain list, as in the original code.
        product = user_books.values.dot(TBdata.values)
        print(product)
        print('矩阵分解~~')
        return product
if __name__ == "__main__":
    # Run the matrix demo only when this file is executed as a script.
    test = CF_CB()
    test.Matrix_factorization()
``````

``TBdata[books.index(b_v),tags.index(t_v)] +=1``

• 如果books或tags中只有id, 那根本就不需要books及tags这两个列表, 只要记录book总数及tag总数, 那动作就变简单, 也快多了, 至少不用books.index, tags.index。如下:
```python
value = tuple(zip(*list(book_tag.values())))
unique, counts = np.unique(value, return_counts=True, axis=0)
rows, cols = zip(*unique)
TBdata[rows, cols] = counts
```
• 如果你有一串book_tags, 直接用list, 而不是dictionary
```python
book_tags = [[[1,2,3,4,4,3,2,3,4],[5,6,7,2,3,4,4,3,2]],
             [[1,2,3,4,4,3,2,3,4],[4,2,5,6,3,2,4,1,5]]]
value = []
for book_tag in book_tags:
    value += list(zip(*book_tag))
unique, counts = np.unique(value, return_counts=True, axis=0)
rows, cols = zip(*unique)
TBdata[rows, cols] = counts
```
1年前 评论

For example:

``````# Code added for testing
books = [1,2,3,4]
tags = [1,2,3,4,5,6,7]
book_tag = {'book':[1,2,3,4,4,3,2,3,4],'given_tag':[5,6,7,2,3,4,4,3,2]}

TBdata = np.zeros((len(books),len(tags)))
for b_i,b in enumerate(books):
for t_i,t in enumerate(tags):
"""
1. TBdata +1 if 'book' and 'given_tag' are the same in book_tag
2. There's no any difference for different books and different tags
3. It means all the content in TBdata will be the same, all 1 here.
"""
for i,v in enumerate(book_tag['book']):
if book_tag['given_tag'][i] == v:
TBdata[b_i,t_i] +=1
print(TBdata)``````

result:
[[1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1.]]

1年前 评论

Jason990420 （作者） 1年前

1年前 评论

``TBdata[books.index(b_v),tags.index(t_v)] +=1``

• 如果books或tags中只有id, 那根本就不需要books及tags这两个列表, 只要记录book总数及tag总数, 那动作就变简单, 也快多了, 至少不用books.index, tags.index。如下:
```python
value = tuple(zip(*list(book_tag.values())))
unique, counts = np.unique(value, return_counts=True, axis=0)
rows, cols = zip(*unique)
TBdata[rows, cols] = counts
```
• 如果你有一串book_tags, 直接用list, 而不是dictionary
```python
book_tags = [[[1,2,3,4,4,3,2,3,4],[5,6,7,2,3,4,4,3,2]],
             [[1,2,3,4,4,3,2,3,4],[4,2,5,6,3,2,4,1,5]]]
value = []
for book_tag in book_tags:
    value += list(zip(*book_tag))
unique, counts = np.unique(value, return_counts=True, axis=0)
rows, cols = zip(*unique)
TBdata[rows, cols] = counts
```
1年前 评论

:+1:

1年前 评论

1个月前 评论