Chinese keyword extraction based on the k-means algorithm. Something goes wrong during the process; could anyone point me in the right direction?
The code is as follows:
import sys,os
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import spatial
def getkeywords_kmeans(data, topK):
    words = data["word"]  # candidate words
    vecs = data.iloc[:, 1:]  # their vector representations
    i = 2
    kmeans = KMeans(n_clusters=i).fit(vecs)
    labels = kmeans.labels_  # cluster label of each word
    labels = pd.DataFrame(labels, columns=['label'])
    new_df = pd.concat([labels, vecs], axis=1)
    df_count_type = new_df.groupby('label').size()  # number of words per cluster
    # print df_count_type
    vec_center = kmeans.cluster_centers_  # cluster centers
    #vec_words = np.array(vecs)  # candidate keyword vectors, DataFrame to array
    quantity = pd.Series(kmeans.labels_).value_counts()
    for num in range(len(quantity)):  # clusters 0 to 1
        vec_center = vec_center[num]
        length = 200
        distances = []
        res0Series = pd.Series(kmeans.labels_)
        res0 = res0Series[res0Series.values == num]
        res = vecs.iloc[res0.index]
        #res = res.reset_index(drop=True)  # 71 words
        words_num = len(res)
        res_word = np.array(res)
        for word in range(words_num):  # each word in the cluster
            res = res_word[word]
            dis = 0  # accumulated squared distance
            for index in range(length):  # each of the 200 dimensions
                # Euclidean distance between the word and its cluster center, accumulated per dimension
                dis += (vec_center[index] - res[index]) * (vec_center[index] - res[index])
            dis_ture = np.sqrt(dis)
            distances.append(dis_ture)
        distances = pd.DataFrame(distances, columns=['dis'])
        result = pd.concat([words, labels, distances], axis=1)  # join each word with its distance to the cluster center
        result = result.sort_values(by="dis", ascending=True)  # sort ascending by distance
def main():
    # read the article dataset
    dataFile = 'D:/STUDY/1BIYESHEJI/keyword_extraction-master/data/data1.csv'
    articleData = pd.read_csv(dataFile, encoding='utf-8')
    ids, titles, keys = [], [], []
    rootdir = "D:/STUDY/1BIYESHEJI/keyword_extraction-master/result/vecs"  # root directory of the word-vector files
    fileList = os.listdir(rootdir)  # list all files and directories in the folder
    # iterate over the files
    for i in range(len(fileList)):
        filename = fileList[i]
        path = os.path.join(rootdir, filename)
        if os.path.isfile(path):
            data = pd.read_csv(path, encoding='utf-8')  # read the word-vector file
            #print(data)
            artile_keys = getkeywords_kmeans(data, 10)  # keywords for the current file via clustering
            print(artile_keys)
            # get the article id and title from the file name
            (shortname, extension) = os.path.splitext(filename)  # file name and extension
            t = shortname.split("_")
            article_id = int(t[len(t)-1])  # article id
            artile_tit = articleData[articleData.id == article_id]['title']  # article title
            artile_tit = list(artile_tit)[0]  # Series to string
            ids.append(article_id)
            titles.append(artile_tit)
            keys.append(artile_keys)
    # write all results to one file
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys}, columns=['id', 'title', 'key'])
    result = result.sort_values(by="id", ascending=True)  # sort by id
    result.to_csv("D:/STUDY/1BIYESHEJI/keyword_extraction-master/result/news1_keys_word2vec1.csv", encoding="gbk", index=False)

if __name__ == '__main__':
    main()
The error shown is as follows:
Traceback (most recent call last):
File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 81, in <module>
main()
File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 64, in main
artile_keys = getkeywords_kmeans(data,10) # 聚类算法得到当前文件的关键词
File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 41, in getkeywords_kmeans
dis += (vec_center[index] - res[index]) * (vec_center[index] - res[index])
IndexError: invalid index to scalar variable.
Process finished with exit code 1
Any pointers would be greatly appreciated.
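For what it's worth, the traceback points at the consequence of `vec_center = vec_center[num]` inside the `for num` loop: on the first pass this replaces the whole cluster-centers array with the single 200-dimensional center of cluster 0, so on the second pass `vec_center[1]` is just one float, and `vec_center[index]` then raises "IndexError: invalid index to scalar variable." A related issue is that the function ignores topK and returns nothing, so main() ends up appending None to keys. Below is a minimal sketch of one possible fix, not the original project's code: it keeps the centers array intact by indexing into a separate per-cluster variable, replaces the per-dimension loop with a NumPy broadcast, and assumes the intended meaning of topK is to return the topK words closest to their cluster center.

# Sketch of a possible fix (assumptions as noted above; n_clusters=2 kept from the original)
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

def getkeywords_kmeans(data, topK):
    words = data["word"]                # candidate words
    vecs = data.iloc[:, 1:]             # their vector representations
    kmeans = KMeans(n_clusters=2).fit(vecs)
    labels = pd.Series(kmeans.labels_)  # cluster label of every word
    centers = kmeans.cluster_centers_   # keep the full array; do not overwrite it inside the loop

    parts = []
    for num in range(kmeans.n_clusters):
        center = centers[num]                  # 1-D center of cluster num
        idx = labels.index[labels == num]      # rows belonging to this cluster
        cluster_vecs = vecs.iloc[idx].to_numpy()
        # Euclidean distance of every word in the cluster to its center, computed by broadcasting
        dists = np.sqrt(((cluster_vecs - center) ** 2).sum(axis=1))
        parts.append(pd.DataFrame({'word': words.iloc[idx].to_numpy(),
                                   'label': num,
                                   'dis': dists}))

    result = pd.concat(parts, ignore_index=True)
    result = result.sort_values(by="dis", ascending=True)  # closest to a center first
    return result['word'].head(topK).tolist()               # assumed meaning of topK

As a side note, `KMeans.transform(vecs)` already returns the distance from every sample to every cluster center, so the per-word distance could also be read off as `kmeans.transform(vecs)[np.arange(len(vecs)), kmeans.labels_]` instead of being recomputed by hand.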