Python爬取小说网站并写入txt时出现个别字乱码,这是什么原因?
import requests
from lxml import html
import os
import re
# Novel scraper: walks the category listing pages of www.qb5.tw, opens every
# book linked from a listing page, then downloads each chapter and appends it
# to one UTF-8 text file per book under OUTPUT_DIR.
#
# NOTE(review): this script was recovered from a paste that lost its
# indentation, turned the quotes into typographic quotes, dropped "__" and
# "*" characters, and stripped the HTML tags out of the regex literals.
# Syntax and the surviving parts of every literal are restored below; values
# that could not be recovered are marked as placeholders and must be checked
# against the live page markup before running.

# NOTE(review): the scheme was missing in the paste (requests rejects URLs
# without one); "https" is an assumption - confirm.
BASE_URL = 'https://www.qb5.tw'
CATEGORY_URL_TEMPLATE = BASE_URL + '/fenlei/1_{page}/'
FIRST_PAGE = 1
LAST_PAGE = 376
OUTPUT_DIR = './xs_picLibs'

# The first anchors found on a book page are skipped, as in the original
# slice [12:]; presumably they are navigation links - confirm.
SKIPPED_LEADING_LINKS = 12

# Only the capture group of this pattern family survived the paste; the
# "<a href" fragment is original.
CHAPTER_LINK_RE = re.compile(r'.*?<a href="(.*?)".*?', re.S)
# PLACEHOLDERS: the tags around the capture groups were stripped from the
# paste. Replace them with the real markup of the site - TODO confirm.
BOOK_TITLE_RE = re.compile(r'<h1>(.*?)</h1>', re.S)
CHAPTER_TITLE_RE = re.compile(r'<h1>(.*?)</h1>', re.S)
CHAPTER_BODY_RE = re.compile(r'<div id="content">(.*)</div>', re.S)


def _fetch_text(url, headers):
    """Download *url* and return the body decoded with a trustworthy charset.

    ``Response.text`` alone relies on the charset announced in the HTTP
    headers; when none is announced requests falls back to ISO-8859-1 for
    text responses, which garbles Chinese pages. In that case the charset
    detected from the body is used instead. GB2312/GBK are widened to
    GB18030, a superset of both, so that rarely used characters outside the
    narrower tables are decoded instead of being replaced.
    """
    resp = requests.get(url=url, headers=headers)
    encoding = resp.encoding
    if encoding is None or encoding.lower() == 'iso-8859-1':
        encoding = resp.apparent_encoding
    if encoding and encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    resp.encoding = encoding
    return resp.text


def _join_matches(matches):
    """Join regex matches into one string.

    The original converted the list with ``str()`` and stripped the quotes
    and brackets afterwards. ``str()`` of a list writes the *repr* of each
    item, so non-printable characters ended up in the output file as literal
    escape sequences such as ``\\xa0``, ``\\u3000`` or ``\\r\\n``. Joining
    keeps the text itself and the same ", " separator between items.
    """
    return ', '.join(matches)


def _clean_chapter_html(text):
    """Turn the raw chapter markup into plain text.

    NOTE(review): the two substrings replaced by the original were rendered
    away in the paste (one shows as a blank, one as a line break). They are
    assumed to be runs of ``&nbsp;`` and ``<br>`` tags - confirm.
    """
    text = re.sub(r'(?:&nbsp;)+', '\n ', text)
    return re.sub(r'<br\s*/?>', '', text)


if __name__ == '__main__':
    headers = {
        # "*/*" had lost its asterisks in the paste.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Was 'gzi,deflate,br': "gzi" is a typo, and advertising "br" makes
        # the server free to answer with Brotli, which requests can only
        # decompress when an optional Brotli package is installed.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': '',
        'Host': 'www.qb5.tw',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }
    etree = html.etree
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for pageNum in range(FIRST_PAGE, LAST_PAGE + 1):
        listing_url = CATEGORY_URL_TEMPLATE.format(page=pageNum)
        listing_tree = etree.HTML(_fetch_text(listing_url, headers))
        if listing_tree is None:
            # Nothing parseable came back for this listing page.
            continue

        for li in listing_tree.xpath('//ul[@class="titlelist"]/li'):
            hrefs = li.xpath('.//a/@href')
            if not hrefs:
                # List item without a link: nothing to download.
                continue
            book_url = hrefs[0]
            book_html = _fetch_text(book_url, headers)

            mz = _join_matches(BOOK_TITLE_RE.findall(book_html))
            if not mz:
                # Without a title the output would go to a file named ".txt".
                print('skipped, no title found: ' + book_url)
                continue

            # Kept from the original: a directory named after the book is
            # created, although the chapters are appended to the sibling
            # "<title>.txt" file, not written into that directory.
            os.makedirs(os.path.join(OUTPUT_DIR, mz), exist_ok=True)
            book_path = os.path.join(OUTPUT_DIR, mz + '.txt')

            chapter_hrefs = CHAPTER_LINK_RE.findall(book_html)[SKIPPED_LEADING_LINKS:]
            for chapter_href in chapter_hrefs:
                # Chapter links are appended to the book URL, as before.
                chapter_html = _fetch_text(book_url + chapter_href, headers)
                wz_data = _clean_chapter_html(
                    _join_matches(CHAPTER_BODY_RE.findall(chapter_html))
                )
                txt1_name = _join_matches(CHAPTER_TITLE_RE.findall(chapter_html))

                with open(book_path, 'a', encoding='utf-8') as fp:
                    fp.write('\n\n' + txt1_name + '\n' + wz_data)
                # The original counter was incremented once before the loop,
                # so every line reported page 1; report the real page.
                print('第' + str(pageNum) + '页—' + mz + '—' + txt1_name + '—已保存')
推荐文章: