北邮《Python编程与实践》——爬虫学习

视频地址 点这里

豆瓣读书列表采集代码

import requests
from requests.structures import CaseInsensitiveDict
from lxml import etree
import csv

# Scrape the first pages of Douban's "编程" (programming) tag listing and
# collect one [title, cover, price] row per book into `book_info`.

MAX_PAGES = 5          # how many listing pages to fetch
PAGE_SIZE = 20         # the `start` offset advances by this much per page
REQUEST_TIMEOUT = 10   # seconds; without a timeout requests.get() may block forever


def _parse_price(text):
    """Return the price found in a buy-link label, or -1 if none can be read.

    The label is expected to look like '<edition>版 <amount>元'. The amount
    is whatever sits between the last '版' (when present) and the first '元'.
    Labels without '元', or whose amount is not a number, yield -1 instead of
    raising, so one odd entry cannot abort the whole scrape.
    """
    amount, unit, _ = text.partition('元')
    if not unit:
        return -1
    # rpartition returns the whole string as its last element when '版' is
    # missing, so a bare '<amount>元' label is still parsed correctly.
    amount = amount.rpartition('版')[2].strip()
    try:
        return float(amount)
    except ValueError:
        return -1


# The headers never change, so build them once instead of on every page.
headers = CaseInsensitiveDict()
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"

book_info = []
for page in range(1, MAX_PAGES + 1):
    page_url = f"https://book.douban.com/tag/%E7%BC%96%E7%A8%8B?start={(page - 1) * PAGE_SIZE}&type=T"
    print(page_url)
    res = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx rather than parsing an error page into zero rows.
    res.raise_for_status()
    sel = etree.HTML(res.text)

    for block in sel.xpath("//li[@class='subject-item']"):
        # Title: all text under the <h2><a> link with newlines and spaces removed.
        title = ''
        elem_title = block.xpath('.//h2/a')
        if elem_title:
            title = ''.join(elem_title[0].itertext()).replace('\n', '').replace(' ', '')

        # Price: -1 marks "no buy link" or "label could not be parsed".
        price = -1
        elem_price = block.xpath(".//span[@class='buy-info']/a/text()")
        if elem_price:
            price = _parse_price(elem_price[0].strip())

        # Cover: src of the first <img> inside the item, if any.
        cover = ''
        elem_cover = block.xpath(".//img/@src")
        if elem_cover:
            cover = elem_cover[0]

        book_info.append([title, cover, price])

# On Windows, pass both `encoding` and `newline` explicitly:
#   - without `encoding` the default is gbk and the output comes out garbled;
#   - without `newline` the default is \n and a blank line follows every row.
with open('book.csv', 'w', encoding='utf-8', newline='') as out_file:
    csv.writer(out_file).writerows(book_info)

B站热门列表采集代码

import requests
import csv

# Page through Bilibili's "popular" API and collect one
# [bvid, pic, title, owner name] row per video into `hot_list`.

HOT_PAGES = 11      # how many result pages to request at most
HOT_TIMEOUT = 10    # seconds; without a timeout requests.get() may block forever

hot_list = []
for page in range(1, HOT_PAGES + 1):
    page_url = f"https://api.bilibili.com/x/web-interface/popular?ps=20&pn={page}"
    print(page_url)
    res = requests.get(page_url, timeout=HOT_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of trying to decode an error body.
    res.raise_for_status()
    json_content = res.json()

    # An error payload may carry no 'data' (or a null one); report it with
    # context rather than dying on an opaque KeyError/TypeError.
    data = json_content.get('data') or {}
    items = data.get('list')
    if items is None:
        raise RuntimeError(f"unexpected response for page {page}: {json_content!r}")
    if not items:
        # Empty page: nothing more to collect, so skip the remaining requests.
        break

    hot_list.extend(
        [item['bvid'], item['pic'], item['title'], item['owner']['name']]
        for item in items
    )

# Write every collected row to bilibili.csv as UTF-8; newline='' is passed so
# the csv module controls line endings itself.
with open('bilibili.csv', 'w', encoding='utf-8', newline='') as out_file:
    csv.writer(out_file).writerows(hot_list)
本作品采用《CC 协议》,转载必须注明作者和本文链接
感谢阅读,有收获的话不妨点个赞 😈
讨论数量: 0
(= ̄ω ̄=)··· 暂无内容!

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!