python爬取小说网站写入txt时出现个别字乱码,这种是什么原因

import requests
from lxml import html
import os
import re
if __name__ == '__main__':
    qq = 0
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzi,deflate,br',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Connection':'keep-alive',
        'Cookie':'',
        'Host':'www.qb5.tw',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36'
    }
    url = 'https://www.qb5.tw/fenlei/1_1/'
    response = requests.get(url=url, headers=headers).text
    qq=qq+1
    etree = html.etree
    tree = etree.HTML(response)
    if not os.path.exists('./xs_picLibs'):
        os.mkdir('./xs_picLibs')
    for pageNum in range(1,377):
        new_url = 'https://www.qb5.tw/fenlei/1_' + str(pageNum) + '/'
        page_text = requests.get(url=new_url, headers=headers).text
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//ul[@class="titlelist"]/li')
        x=0
        for li in li_list:
            list_url_list = li.xpath('.//a/@href')[0]
            page_text = requests.get(url=list_url_list, headers=headers).text
            mz = '<h1>(.*?) <small>.*?</h1>'
            mz = re.findall(mz,page_text,re.S)
            mz = mz.__str__().replace("'",'').replace("]",'').replace("[",'')
            if not os.path.exists('./xs_picLibs/' + mz):
                os.mkdir('./xs_picLibs/'+ mz)
            ex = '<dd>.*?<a href="(.*?)".*?</dd>'
            url_list = re.findall(ex,page_text,re.S)[12:]
            qw =','.join(str(n) for n in url_list)
            for qw in url_list:
                n_url = list_url_list + qw
                page_text = requests.get(url=n_url, headers=headers).text
                ex = '<br><br>(.*)<br /><br />'
                wz_data = re.findall(ex,page_text,re.S)
                wz_data = wz_data.__str__().replace('&nbsp;&nbsp;&nbsp;&nbsp;','\n    ').replace('<br /><br />','').replace("'",'').replace("]",'').replace("[",'')
                ec = '<h1>(.*?)</h1>'
                txt1_name = re.findall(ec,page_text,re.S)
                txt1_name = txt1_name.__str__().replace("'",'').replace('[','').replace(']','')
                txt_name = mz + '.txt'
                img_path = 'xs_picLibs/' + txt_name
                with open(img_path, 'a',encoding='utf-8') as fp:
                    fp.write('\n\n' + txt1_name +'\n'+ (wz_data))
                print('第'+qq.__str__()+'页---'+mz+'---'+txt1_name+'---已保存')

讨论数量: 12
Jason990420

代码呈乱码, 无法执行, 你省事 ... 难找问题, 帮忙的人找的好累, 所以帮忙的人也想省事, 所以没有解释

>>> response = requests.get(url=n_url, headers=headers)
>>> response.encoding
'GB2312'
>>> response.encoding = 'gbk'
>>> page_text = response.text

in html

charset="gbk"

in response headers

Content-Type: text/html; charset=GB2312

3年前 评论
pardon110

应根据爬取页面后,响应体头部编码动态确定存入文本文件编码,而非如你这般在代码中写死成utf8

with open(img_path, 'a', encoding='utf-8') as fp:
3年前 评论
Jason990420

代码高亮

记得前面多加一个空行

3年前 评论

file 标注的地方在原网站是“屌丝”,写入时就变成这样,“篆”写入就变成“?”

3年前 评论
Jason990420

代码乱了

3年前 评论
import requests
from lxml import html
import os
import re
if __name__ == '__main__':
    # Browser-like request headers; the site expects a matching Host header.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Was 'gzi' (typo for 'gzip'); requests transparently decompresses.
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'www.qb5.tw',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36'),
    }

    def _fetch(url):
        """GET *url* and return its HTML decoded as GBK.

        The site's response headers declare charset=GB2312, so requests
        decodes with the narrower gb2312 codec and GBK-only characters
        (e.g. 屌, 篆) are mangled into '?'.  The pages are actually GBK,
        so forcing the encoding fixes the garbled characters in the
        saved txt files.
        """
        resp = requests.get(url=url, headers=headers)
        resp.encoding = 'gbk'
        return resp.text

    def _first(pattern, text):
        """Return the first match of *pattern* in *text* (DOTALL), or ''."""
        found = re.findall(pattern, text, re.S)
        return found[0] if found else ''

    etree = html.etree
    if not os.path.exists('./xs_picLibs'):
        os.mkdir('./xs_picLibs')

    # Category listing pages 1..376.
    for page_num in range(1, 377):
        list_html = _fetch('https://www.qb5.tw/fenlei/1_' + str(page_num) + '/')
        tree = etree.HTML(list_html)
        for li in tree.xpath('//ul[@class="titlelist"]/li'):
            book_url = li.xpath('.//a/@href')[0]
            book_html = _fetch(book_url)
            # Book title comes from '<h1>title <small>author</small></h1>'.
            mz = _first('<h1>(.*?) <small>.*?</h1>', book_html)
            if not os.path.exists('./xs_picLibs/' + mz):
                os.mkdir('./xs_picLibs/' + mz)
            # Chapter hrefs; the first 12 <dd> entries are the "latest
            # chapters" box and duplicate the real list, so skip them.
            chapter_refs = re.findall('<dd>.*?<a href="(.*?)".*?</dd>', book_html, re.S)[12:]
            txt_path = 'xs_picLibs/' + mz + '.txt'
            for ref in chapter_refs:
                chapter_html = _fetch(book_url + ref)
                # Chapter body sits between the first <br><br> and the
                # trailing <br /><br />; de-HTML the indentation entities.
                body = _first('<br><br>(.*)<br /><br />', chapter_html)
                body = (body.replace('&nbsp;&nbsp;&nbsp;&nbsp;', '\n    ')
                            .replace('<br /><br />', ''))
                title = _first('<h1>(.*?)</h1>', chapter_html)
                # Explicit encoding so output never depends on the platform
                # default (cp936 on Chinese Windows, which also mangles text).
                with open(txt_path, 'a', encoding='utf-8') as fp:
                    fp.write('\n\n' + title + '\n' + body)
                # Report the actual listing page (the old counter never advanced).
                print('第' + str(page_num) + '页---' + mz + '---' + title + '---已保存')
```

![file](https://cdn.learnku.com/uploads/images/202012/18/74669/gF6Q3D6FfI.png!large)
3年前 评论

这个网页编辑器是支持markdown格式的,把你的代码放在代码框里。。。这贴的代码看着太费事了。。。

3年前 评论

第一次使用,不太了解功能,见谅

3年前 评论

有些网站编码方式是不同的,建议把 `response = requests.get(url=url, headers=headers).text` 加 `print(response)` 改为 `response = requests.get(url=url, headers=headers).content` 再 `print(response.decode())`,自己指定解码方式。

3年前 评论
aa033988

爬取小说网站写入txt时出现个别字乱码,这种是什么原因 www.meiguoivf.com 有谁能帮忙解答!

2年前 评论
pardon110 2年前

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!