基于kmeans算法的中文关键词提取 现在过程有点问题 麻烦大佬们指点下

代码如下

import sys,os
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import spatial

def getkeywords_kmeans(data,topK):
    words = data["word"]  # 词汇
    vecs = data.iloc[:, 1:]  # 向量表示
    i=2
    kmeans = KMeans(n_clusters=i).fit(vecs)
    labels = kmeans.labels_  # 类别结果标签
    labels = pd.DataFrame(labels, columns=['label'])
    new_df = pd.concat([labels, vecs], axis=1)
    df_count_type = new_df.groupby('label').size()  # 各类别统计个数
    # print df_count_type
    vec_center = kmeans.cluster_centers_  # 聚类中心

    #vec_words = np.array(vecs)  # 候选关键词向量,dataFrame转array
    quantity = pd.Series(kmeans.labels_).value_counts()
    for num in range(len(quantity)) : #0~1个簇类
        vec_center = vec_center[num]
        length = 200
        distances = []
        res0Series = pd.Series(kmeans.labels_)
        res0 = res0Series[res0Series.values == num]
        res = vecs.iloc[res0.index]
        #res = res.reset_index(drop=True)#71个词
        words_num=len(res)
        res_word=np.array(res)

        for word in range(words_num):#0~171个词
            res = res_word[word]
            dis = 0  # 向量距离
            for index in range(length):#0~200维
                #计算每一个簇中每一个词在每一维与聚类中心的距离(欧氏距离)
                dis += (vec_center[index] - res[index]) * (vec_center[index] - res[index])
            dis_ture=np.sqrt(dis)
            distances.append(dis_ture)
        distances = pd.DataFrame(distances, columns=['dis'])
        result = pd.concat([words, labels, distances], axis=1)  # 拼接词语与其对应中心点的距离
        result = result.sort_values(by="dis", ascending=True)  # 按照距离大小进行升序排序


def main():
    # 读取数据集
    dataFile = 'D:/STUDY/1BIYESHEJI/keyword_extraction-master/data/data1.csv'
    articleData = pd.read_csv(dataFile,encoding='utf-8')
    ids, titles, keys = [], [], []

    rootdir = "D:/STUDY/1BIYESHEJI/keyword_extraction-master/result/vecs" # 词向量文件根目录
    fileList = os.listdir(rootdir) #列出文件夹下所有的目录与文件
    # 遍历文件
    for i in range(len(fileList)):
        filename = fileList[i]
        path = os.path.join(rootdir,filename)
        if os.path.isfile(path):
            data = pd.read_csv(path, encoding='utf-8') # 读取词向量文件数据
            #print(data)
            artile_keys = getkeywords_kmeans(data,10) # 聚类算法得到当前文件的关键词
            print(artile_keys)
            # 根据文件名获得文章id以及标题
            (shortname, extension) = os.path.splitext(filename) # 得到文件名和文件扩展名
            t = shortname.split("_")
            article_id = int(t[len(t)-1]) # 获得文章id
            artile_tit = articleData[articleData.id==article_id]['title'] # 获得文章标题
            artile_tit = list(artile_tit)[0] # series转成字符串
            ids.append(article_id)
            titles.append(artile_tit)
            keys.append(artile_keys)
    # 所有结果写入文件
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys}, columns=['id', 'title', 'key'])
    result = result.sort_values(by="id",ascending=True) # 排序
    result.to_csv("D:/STUDY/1BIYESHEJI/keyword_extraction-master/result/news1_keys_word2vec1.csv", encoding="gbk",index=False)

if __name__ == '__main__':
    main()

显示错误如下

Traceback (most recent call last):
  File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 81, in <module>
    main()
  File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 64, in main
    artile_keys = getkeywords_kmeans(data,10) # 聚类算法得到当前文件的关键词
  File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 41, in getkeywords_kmeans
    dis += (vec_center[index] - res[index]) * (vec_center[index] - res[index])
IndexError: invalid index to scalar variable.

Process finished with exit code 1

还请大佬们指点感激不尽

讨论数量: 1

有大佬看看吗 :sob::sob::sob:

2年前 评论

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!