A First Look at Python

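A first exercise in Python: a small crawler that pulls novel categories, books, and chapters from bbiquge.net with requests and BeautifulSoup, then stores them in MySQL through pymysql.
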
import datetime
import re
import pymysql
import requests
from bs4 import BeautifulSoup


def spider():
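    # Fetch the homepage; each .titletop block inside #mainleft is one category of books.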
    url = "https://www.bbiquge.net/"
    html = requests.get(url)

    html.encoding = 'gbk'  # decode the page as GBK, the site's encoding
    text = html.text
    bs = BeautifulSoup(text, 'lxml')
    box = bs.select("#mainleft .titletop")
    db = conn()
    query = db.cursor()
    for item in box:
        category = item.select('h3')[0].string
        time = datetime.datetime.now().strftime('%Y-%m-%d  %H:%M:%S')
        sql = 'insert into category(name,created_at) values (%s,%s)'
        query.execute(sql, (category, time))
        insert_id = db.insert_id()  # auto-increment id of the category row just inserted
        handler_top(item, insert_id, query, db)
        li = item.select("ul li")
        del li[:1]  # drop the first li: the featured book is handled separately by handler_top()
        for i in li:
            book_id, link = handler_li(i, insert_id, query, db)
            handler_chapter(book_id, link, query, db)


def handler_top(content, insert_id, query, db):
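    # The first li of a category block is the featured book, with fuller markup than the rest.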
    print("-----------开始采集top--------")
    top = content.select("ul li")[0]
    title = top.select(".text strong a")
    name = title[0].string
    link = title[0]['href']
    author_str = top.select(".text p")
    category_id = insert_id
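    # The author name sits in raw HTML between "作者:" and <br/>, so extract it with a lookbehind/lookahead regex.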
    pattern = re.compile("(?<=作者:).*?(?=<br/>)")
    s = str(author_str[0])
    m = pattern.search(s)
    author = m.group()
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, category_id))
    book_id = db.insert_id()
    handler_chapter(book_id, link, query, db)


def handler_li(content, insert_id, query, db):
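    # A regular list entry: the anchor text is the book name, span.author holds the author.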
    print("-----------开始采集书本名称--------")
    name = content.select("a")[0].string
    link = content.select("a")[0]['href']
    category_id = insert_id
    author = content.select("span", class_="author")[0].string
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, category_id))
    book_id = db.insert_id()
    return book_id, link


def handler_chapter(book_id, link, query, db):
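    # A book's chapter index is paginated; walk every index page, then every chapter link on it.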
    print("-----------开始采集章节内容--------" + link)
    page_html = requests.get(link)
    page_text = page_html.text
    bs = BeautifulSoup(page_text, 'lxml')
    pages = bs.find("select", "form-control").find_all("option")  # one <option> per index page
    for page in range(1, len(pages) + 1):  # index pages are 1-based: index_1.html .. index_N.html
        url = link + "index_" + str(page) + ".html"
        print("-----------开始采集章节页码--------" + url)
        chapter_html = requests.get(url)
        chapter_text = chapter_html.text
        bs = BeautifulSoup(chapter_text, 'lxml')
        dd = bs.select("dl dd")
        for d in dd:
            href = d.select("a")[0]["href"]
            url = link + href
            print("-----------开始采集内容--------" + url)
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/112.0.0.0 Safari/537.36',
                'x-client-data': "CKK1yQEIiLbJAQiitskBCMG2yQEIqZ3KAQj5k8sBCJShywEI/KrMAQic/swBCIWgzQEIvqLNAQ=="
            }
            content_html = requests.get(url, headers=headers)
            content_html.encoding = 'gbk'
            content_text = content_html.text
            bs = BeautifulSoup(content_text, 'lxml')

            article = bs.find("div", id="content").text
            name = bs.find("h1").text
            page_size = page
            old_chapter = href.split(".", 1)[0]  # chapter id: the href filename without its ".html" suffix
            lk = url
            created_at = datetime.datetime.now().strftime('%Y-%m-%d  %H:%M:%S')
            bid = book_id

            content_sql = 'insert into chapter(name,link,old_chapter,content,page,created_at,book_id)' \
                          ' values (%s,%s,%s,%s,%s,%s,%s)'
            query.execute(content_sql, (name, lk, old_chapter, article, page_size, created_at, bid))
            db.commit()
            print("-----------采集完一条内容------------")


def conn():
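    # Open a connection to the local MySQL database "stories".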
    try:
        db = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='root',
            db='stories',
            charset='utf8'
        )
        return db
    except Exception as b:
        print(b.args)
        raise  # re-raise so the caller does not continue with a None connection


if __name__ == '__main__':
    try:
        spider()
    except Exception as e:
        print(e.args)
Discussion (1)

Next step: learn to scrape with multiple threads (a rough sketch of the idea is below); after that, build a desktop scraping tool.
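
For the multithreading step, here is a minimal sketch using Python's concurrent.futures thread pool, assuming the same GBK pages as the crawler above; fetch_page and the sample URLs are hypothetical placeholders, not part of the original script.

import concurrent.futures

import requests


def fetch_page(url):
    # Download one page and decode it as GBK, like the crawler above.
    resp = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
    resp.encoding = 'gbk'
    return url, resp.text


if __name__ == '__main__':
    # Hypothetical chapter URLs; in the crawler these would come from handler_chapter().
    urls = [
        "https://www.bbiquge.net/book_1/1.html",
        "https://www.bbiquge.net/book_1/2.html",
    ]
    # Eight worker threads download pages concurrently; map() yields results in input order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
        for url, text in pool.map(fetch_page, urls):
            print("fetched", url, len(text))

One caveat: a pymysql connection is not safe to share across threads, so keep the database writes on one thread or give each worker its own connection.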

