如何解決判断字串是否存在,但结果不正确的问题?

大家好,小弟目前有一份csv档案,需要抓取部分关键字的资料
csv档案共有8个栏位,五万多行,如下图:

csv档案资料
我使用了以下程式筛选含有关键字的资料,例如我需要筛选“武漢肺炎”四个字,但是输出的结果却部分成功辨识含有这关键字,部分则辨识失败。
以下是我写的程式码给你们看看问题出现在哪里:

import time
start_time = time.time()
import re
import csv
import sys
import os
import shutil
maxInt = sys.maxsize

while True:
    # decrease the maxInt value by factor 10 
    # as long as the OverflowError occurs.
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt/10)

input_file = '../Web_Crawler_Data/CA_Group_Complete_Data.csv'
output_folderpath = '../Data_Type'
output_file_1 = output_folderpath+'/Output_COVID19_Content_6W.csv'
output_file_2 = output_folderpath+'/Output_Non_COVID19_Content_6W.csv'

if os.path.isdir(output_folderpath):
    print("路徑存在,重新建立")
    shutil.rmtree(output_folderpath)
    os.mkdir(output_folderpath)

else:
    print("路徑不存在")
    os.mkdir(output_folderpath)

def _input_file_(input_file_name,line_num):
    with open(input_file_name, newline='', encoding='utf-8') as csvfile:
        lines = csv.reader(csvfile)
        columns = [line[line_num] for line in lines]
        return columns
def _output_file_(output_file_name,output_type,D0,D1,D2,D3,D4,D5,D6,D7):
    with open(output_file_name, output_type,newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([D0,D1,D2,D3,D4,D5,D6,D7])

#帖子編號
col_0 = _input_file_(input_file,0)
#帖子標題
col_1 = _input_file_(input_file,1)
#發表作者
col_2 = _input_file_(input_file,2)
#發表時間
col_3 = _input_file_(input_file,3)
#發表內容
col_4 = _input_file_(input_file,4)
#回應作者
col_5 = _input_file_(input_file,5)
#回應時間
col_6 = _input_file_(input_file,6)
#回應內容
col_7 = _input_file_(input_file,7)
#計算總行數
rows = len(col_0)

_output_file_(output_file_1,'w',col_0[0],col_1[0],col_2[0],col_3[0],col_4[0],col_5[0],col_6[0],col_7[0])
_output_file_(output_file_2,'w',col_0[0],col_1[0],col_2[0],col_3[0],col_4[0],col_5[0],col_6[0],col_7[0])


for i in range(1,rows):
    data_1 = "".join(col_4[i])
    data_4 = "".join(col_4[i])

    kw_1_1 = data_1.count('武漢肺炎')
    kw_1_4 = data_4.count('武漢肺炎')
    kw_2_1 = data_1.count('新型冠狀病毒')
    kw_2_4 = data_4.count('新型冠狀病毒')
    kw_3_1 = data_1.count('冠狀病毒')
    kw_3_4 = data_4.count('冠狀病毒')
    kw_4_1 = data_1.count('確診個案')
    kw_4_4 = data_4.count('確診個案')
    kw_5_1 = data_1.count('Corona')
    kw_5_4 = data_4.count('Corona')
    kw_6_1 = data_1.count('新冠疫情')
    kw_6_4 = data_4.count('新冠疫情')
    kw_7_1 = data_1.count('武肺')
    kw_7_4 = data_4.count('武肺')
    kw_8_1 = data_1.count('COVID-19')
    kw_8_4 = data_4.count('COVID-19')
    kw_9_1 = data_1.count('肺炎疫情')
    kw_9_4 = data_4.count('肺炎疫情')
    kw_10_1 = data_1.count('新型肺炎')
    kw_10_4 = data_4.count('新型肺炎')
    kw_11_1 = data_1.count('武漢疫情')
    kw_11_4 = data_4.count('武漢疫情')
    kw_12_1 = data_1.count('疫情指揮中心')
    kw_12_4 = data_4.count('疫情指揮中心')
    kw_13_1 = data_1.count('強制檢疫')
    kw_13_4 = data_4.count('強制檢疫')
    kw_14_1 = data_1.count('全國疫情')
    kw_14_4 = data_4.count('全國疫情')
    kw_15_1 = data_1.count('口罩')
    kw_15_4 = data_4.count('口罩')
    kw_16_1 = data_1.count('武漢疫情')
    kw_16_4 = data_4.count('武漢疫情')
    kw_17_1 = data_1.count('張建宗')
    kw_17_4 = data_4.count('張建宗')
    kw_18_1 = data_1.count('疫情')
    kw_18_4 = data_4.count('疫情')
    kw_19_1 = data_1.count('隱瞞疫情')
    kw_19_4 = data_4.count('隱瞞疫情')
    kw_20_1 = data_1.count('武漢')
    kw_20_4 = data_4.count('武漢')
    kw_21_1 = data_1.count('經濟')
    kw_21_4 = data_4.count('經濟')
    kw_22_1 = data_1.count('封關')
    kw_22_4 = data_4.count('封關')
    kw_23_1 = data_1.count('控制疫情')
    kw_23_4 = data_4.count('控制疫情')
    kw_24_1 = data_1.count('共產黨')
    kw_24_4 = data_1.count('共產黨')
    kw_25_1 = data_1.count('世衛')
    kw_25_4 = data_1.count('世衛')
    kw_26_1 = data_1.count('旅行')
    kw_26_4 = data_1.count('旅行')
    kw_27_1 = data_1.count('湖南')
    kw_27_4 = data_1.count('湖南')
    kw_28_1 = data_1.count('營業')
    kw_28_4 = data_1.count('營業')

    if(kw_1_1>0 or kw_1_4>0 or kw_2_1>0 or kw_2_4>0 or kw_3_1>0 or kw_3_4>0 or kw_4_1>0 or kw_4_4>0 or kw_5_1>0 or kw_5_4>0 or kw_6_1>0 or kw_6_4>0 or kw_7_1>0 or kw_7_4>0 or kw_8_1>0 or kw_8_4>0 or kw_9_1>0 or kw_9_4>0 or kw_10_1>0 or kw_10_4>0 or kw_11_1>0 or kw_11_4>0 or kw_12_1>0 or kw_12_4>0 or kw_13_1>0 or kw_13_4>0 or kw_14_1>0 or kw_14_4>0 or kw_15_1>0 or kw_15_4>0 or kw_16_1>0 or kw_16_4>0 or kw_19_1>0 or kw_19_4>0 or kw_20_1>0 or kw_20_4>0 or kw_23_1>0 or kw_23_4>0):
        print(i,": yes")
        _output_file_(output_file_1,'a',col_0[i],col_1[i],col_2[i],col_3[i],col_4[i],col_5[i],col_6[i],col_7[i])
    elif((kw_18_1>0 or kw_18_4>0) and (kw_28_1>0 or kw_28_4>0)):
        print(i,": yes")
        _output_file_(output_file_1,'a',col_0[i],col_1[i],col_2[i],col_3[i],col_4[i],col_5[i],col_6[i],col_7[i])
    else:
        print(i,": no")
        _output_file_(output_file_2,'a',col_0[i],col_1[i],col_2[i],col_3[i],col_4[i],col_5[i],col_6[i],col_7[i])
print('*'*20+"Finished"+'*'*20)
print("--- %s seconds ---" % (time.time() - start_time))

我所使用的方法是计算词语出现次数大于0就会输出档案A,否则就是输出档案B
发生问题的时候我就直接针对无法被辨识的行数再次测试:

kw_1_1 = data_1.count('武漢肺炎')
print('武漢肺炎出現次數:',kw_1_1)

当我使用以上方式检查后,出现次数居然是0,但明明是有这词语,例如“點睇蕭生評論武漢肺炎 ?”
有见及此,我有尝试使用以下方式测试,但错辨识误的行数依旧出现错误。

if('武漢肺炎' in data_1 or '武漢肺炎' in data_4):

我想知道问题其实出现在哪里?希望你们可以帮到我,谢谢

Jason990420
最佳答案
data_1 = "".join(col_4[i])    ## data_1 = "".join(col_1[i]) ??
3年前 评论
fd5556 (楼主) 3年前
fd5556 (楼主) 3年前
Jason990420 (作者) 3年前
讨论数量: 2
Jason990420
data_1 = "".join(col_4[i])    ## data_1 = "".join(col_1[i]) ??
3年前 评论
fd5556 (楼主) 3年前
fd5556 (楼主) 3年前
Jason990420 (作者) 3年前
pardon110

如果只是想统计词语出现的频率,使用jieba分词

3年前 评论

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!