如何解決判断字串是否存在,但结果不正确的问题?
大家好,小弟目前有一份csv档案,需要抓取部分关键字的资料
csv档案共有8个栏位,五万多行,如下图:
我使用了以下程式筛选含有关键字的资料,例如我需要筛选“武漢肺炎”四个字,但是输出的结果却部分成功辨识含有这关键字,部分则辨识失败。
以下是我写的程式码给你们看看问题出现在哪里:
import time
start_time = time.time()
import re
import csv
import sys
import os
import shutil
maxInt = sys.maxsize
while True:
# decrease the maxInt value by factor 10
# as long as the OverflowError occurs.
try:
csv.field_size_limit(maxInt)
break
except OverflowError:
maxInt = int(maxInt/10)
input_file = '../Web_Crawler_Data/CA_Group_Complete_Data.csv'
output_folderpath = '../Data_Type'
output_file_1 = output_folderpath+'/Output_COVID19_Content_6W.csv'
output_file_2 = output_folderpath+'/Output_Non_COVID19_Content_6W.csv'
if os.path.isdir(output_folderpath):
print("路徑存在,重新建立")
shutil.rmtree(output_folderpath)
os.mkdir(output_folderpath)
else:
print("路徑不存在")
os.mkdir(output_folderpath)
def _input_file_(input_file_name,line_num):
with open(input_file_name, newline='', encoding='utf-8') as csvfile:
lines = csv.reader(csvfile)
columns = [line[line_num] for line in lines]
return columns
def _output_file_(output_file_name,output_type,D0,D1,D2,D3,D4,D5,D6,D7):
with open(output_file_name, output_type,newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow([D0,D1,D2,D3,D4,D5,D6,D7])
#帖子編號
col_0 = _input_file_(input_file,0)
#帖子標題
col_1 = _input_file_(input_file,1)
#發表作者
col_2 = _input_file_(input_file,2)
#發表時間
col_3 = _input_file_(input_file,3)
#發表內容
col_4 = _input_file_(input_file,4)
#回應作者
col_5 = _input_file_(input_file,5)
#回應時間
col_6 = _input_file_(input_file,6)
#回應內容
col_7 = _input_file_(input_file,7)
#計算總行數
rows = len(col_0)
_output_file_(output_file_1,'w',col_0[0],col_1[0],col_2[0],col_3[0],col_4[0],col_5[0],col_6[0],col_7[0])
_output_file_(output_file_2,'w',col_0[0],col_1[0],col_2[0],col_3[0],col_4[0],col_5[0],col_6[0],col_7[0])
for i in range(1,rows):
data_1 = "".join(col_4[i])
data_4 = "".join(col_4[i])
kw_1_1 = data_1.count('武漢肺炎')
kw_1_4 = data_4.count('武漢肺炎')
kw_2_1 = data_1.count('新型冠狀病毒')
kw_2_4 = data_4.count('新型冠狀病毒')
kw_3_1 = data_1.count('冠狀病毒')
kw_3_4 = data_4.count('冠狀病毒')
kw_4_1 = data_1.count('確診個案')
kw_4_4 = data_4.count('確診個案')
kw_5_1 = data_1.count('Corona')
kw_5_4 = data_4.count('Corona')
kw_6_1 = data_1.count('新冠疫情')
kw_6_4 = data_4.count('新冠疫情')
kw_7_1 = data_1.count('武肺')
kw_7_4 = data_4.count('武肺')
kw_8_1 = data_1.count('COVID-19')
kw_8_4 = data_4.count('COVID-19')
kw_9_1 = data_1.count('肺炎疫情')
kw_9_4 = data_4.count('肺炎疫情')
kw_10_1 = data_1.count('新型肺炎')
kw_10_4 = data_4.count('新型肺炎')
kw_11_1 = data_1.count('武漢疫情')
kw_11_4 = data_4.count('武漢疫情')
kw_12_1 = data_1.count('疫情指揮中心')
kw_12_4 = data_4.count('疫情指揮中心')
kw_13_1 = data_1.count('強制檢疫')
kw_13_4 = data_4.count('強制檢疫')
kw_14_1 = data_1.count('全國疫情')
kw_14_4 = data_4.count('全國疫情')
kw_15_1 = data_1.count('口罩')
kw_15_4 = data_4.count('口罩')
kw_16_1 = data_1.count('武漢疫情')
kw_16_4 = data_4.count('武漢疫情')
kw_17_1 = data_1.count('張建宗')
kw_17_4 = data_4.count('張建宗')
kw_18_1 = data_1.count('疫情')
kw_18_4 = data_4.count('疫情')
kw_19_1 = data_1.count('隱瞞疫情')
kw_19_4 = data_4.count('隱瞞疫情')
kw_20_1 = data_1.count('武漢')
kw_20_4 = data_4.count('武漢')
kw_21_1 = data_1.count('經濟')
kw_21_4 = data_4.count('經濟')
kw_22_1 = data_1.count('封關')
kw_22_4 = data_4.count('封關')
kw_23_1 = data_1.count('控制疫情')
kw_23_4 = data_4.count('控制疫情')
kw_24_1 = data_1.count('共產黨')
kw_24_4 = data_1.count('共產黨')
kw_25_1 = data_1.count('世衛')
kw_25_4 = data_1.count('世衛')
kw_26_1 = data_1.count('旅行')
kw_26_4 = data_1.count('旅行')
kw_27_1 = data_1.count('湖南')
kw_27_4 = data_1.count('湖南')
kw_28_1 = data_1.count('營業')
kw_28_4 = data_1.count('營業')
if(kw_1_1>0 or kw_1_4>0 or kw_2_1>0 or kw_2_4>0 or kw_3_1>0 or kw_3_4>0 or kw_4_1>0 or kw_4_4>0 or kw_5_1>0 or kw_5_4>0 or kw_6_1>0 or kw_6_4>0 or kw_7_1>0 or kw_7_4>0 or kw_8_1>0 or kw_8_4>0 or kw_9_1>0 or kw_9_4>0 or kw_10_1>0 or kw_10_4>0 or kw_11_1>0 or kw_11_4>0 or kw_12_1>0 or kw_12_4>0 or kw_13_1>0 or kw_13_4>0 or kw_14_1>0 or kw_14_4>0 or kw_15_1>0 or kw_15_4>0 or kw_16_1>0 or kw_16_4>0 or kw_19_1>0 or kw_19_4>0 or kw_20_1>0 or kw_20_4>0 or kw_23_1>0 or kw_23_4>0):
print(i,": yes")
_output_file_(output_file_1,'a',col_0[i],col_1[i],col_2[i],col_3[i],col_4[i],col_5[i],col_6[i],col_7[i])
elif((kw_18_1>0 or kw_18_4>0) and (kw_28_1>0 or kw_28_4>0)):
print(i,": yes")
_output_file_(output_file_1,'a',col_0[i],col_1[i],col_2[i],col_3[i],col_4[i],col_5[i],col_6[i],col_7[i])
else:
print(i,": no")
_output_file_(output_file_2,'a',col_0[i],col_1[i],col_2[i],col_3[i],col_4[i],col_5[i],col_6[i],col_7[i])
print('*'*20+"Finished"+'*'*20)
print("--- %s seconds ---" % (time.time() - start_time))
我所使用的方法是计算词语出现次数大于0就会输出档案A,否则就是输出档案B
发生问题的时候我就直接针对无法被辨识的行数再次测试:
kw_1_1 = data_1.count('武漢肺炎')
print('武漢肺炎出現次數:',kw_1_1)
当我使用以上方式检查后,出现次数居然是0,但明明是有这词语,例如“點睇蕭生評論武漢肺炎 ?”
有见及此,我有尝试使用以下方式测试,但错辨识误的行数依旧出现错误。
if('武漢肺炎' in data_1 or '武漢肺炎' in data_4):
我想知道问题其实出现在哪里?希望你们可以帮到我,谢谢