python爬取小说后部分内容乱码
下面是代码
import requests
import os
import lxml
from bs4 import BeautifulSoup
if __name__ == '__main__':
url = 'https://www.linovelib.com/novel/2547/catalog'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
li_list = soup.select('.volume-list ul')
li_list = li_list[0]
li_list = str(li_list).split('\n')
print(li_list)
li_list1 = soup.select('.volume-list ul > div')
li_list2 = soup.select('.volume-list li')
Number1 = -1;
Number2 = -1;
for li in li_list:
if 'v-line' in li:
print('2')
Number1 = Number1 + 1;
if Number1 < 9:
if not os.path.exists('./' + str(li_list1[Number1].text)):
os.mkdir('./' + str(li_list1[Number1].text))
elif 'href' in li:
if Number2 < 2:
print('1')
Number2 = Number2 + 1
href = 'https://www.linovelib.com' + li_list2[Number2].a['href']
href_page = requests.get(url=href, headers=headers, )
href_page.encoding = 'utf-8'
href_page_text = href_page.text
href_soup = BeautifulSoup(href_page_text, 'lxml')
href_list = href_soup.find('div', id='mlfy_main_text')
content = href_list.text
print(content)
fileName = './' + str(li_list1[Number1].text) + '/' + str(li_list2[Number2].a.string) + '.text'
fp = open(fileName, 'w', encoding='utf-8')
fp.write(content)
print("nb")
然后下面是执行的部分结果
这种情况算乱码吗
你抓下来的只是页面的源代码
事实上该内容可能有经以下 javascript 代码置换成最后显示的页面内容, 我不懂 javascript, 所以要你自己去看它们如何置换成最后的结果.
In pctheme.js
Try to get the text by selenium