# 初识python — first steps with Python: a bbiquge.net novel-scraper demo
import datetime
import re
import pymysql
import requests
from bs4 import BeautifulSoup
def spider():
    """Crawl the bbiquge.net front page and store categories, books and chapters.

    Fetches the GBK-encoded index page, walks every category box on it,
    inserts one `category` row per box, then delegates the featured ("top")
    book and the remaining list items to the dedicated handlers.

    Side effects: HTTP requests and MySQL inserts. Returns None.
    """
    url = "https://www.bbiquge.net/"
    html = requests.get(url)
    html.encoding = 'gbk'  # site serves GBK; decode explicitly before parsing
    bs = BeautifulSoup(html.text, 'lxml')
    box = bs.select("#mainleft .titletop")
    db = conn()
    query = db.cursor()
    try:
        for item in box:
            category = item.select('h3')[0].string
            created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            sql = 'insert into category(name,created_at) values (%s,%s)'
            query.execute(sql, (category, created_at))
            # fix: commit the category row itself instead of relying on the
            # much-later per-chapter commits for durability
            db.commit()
            insert_id = db.insert_id()
            handler_top(item, insert_id, query, db)
            li = item.select("ul li")
            del li[:1]  # first <li> is the featured book, already handled above
            for i in li:
                book_id, link = handler_li(i, insert_id, query, db)
                handler_chapter(book_id, link, query, db)
    finally:
        # fix: the original leaked the cursor and connection; release them
        # even when a handler raises
        query.close()
        db.close()
def handler_top(content, insert_id, query, db):
    """Persist the featured ("top") book of one category box and crawl its chapters.

    content: bs4 Tag for the category box.
    insert_id: primary key of the just-inserted `category` row.
    query/db: shared pymysql cursor and connection.
    """
    print("-----------开始采集top--------")
    top = content.select("ul li")[0]
    title = top.select(".text strong a")
    name = title[0].string
    link = title[0]['href']
    author_str = top.select(".text p")
    # the author name is embedded in raw markup shaped like "作者:XXX<br/>"
    pattern = re.compile("(?<=作者:).*?(?=<br/>)")
    m = pattern.search(str(author_str[0]))
    # fix: search() returns None when the markup changes; the original then
    # crashed on m.group() — fall back to an empty author instead
    author = m.group() if m else ''
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, insert_id))
    book_id = db.insert_id()
    handler_chapter(book_id, link, query, db)
def handler_li(content, insert_id, query, db):
    """Persist one book list item.

    content: bs4 Tag for the <li>; insert_id: category primary key.
    Returns (book_id, link) — the new `books` row id and the book's
    detail-page URL, so the caller can crawl its chapters.
    """
    print("-----------开始采集书本名称--------")
    anchor = content.select("a")[0]
    name = anchor.string
    link = anchor['href']
    # fix: select() takes a CSS selector and has no class_ keyword (that is
    # find()'s API — modern bs4/soupsieve raises TypeError on it); use the
    # equivalent CSS class selector and guard an empty result
    author_tags = content.select("span.author")
    author = author_tags[0].string if author_tags else None
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, insert_id))
    book_id = db.insert_id()
    return book_id, link
def handler_chapter(book_id, link, query, db):
    """Crawl every chapter of one book and insert `chapter` rows.

    book_id: foreign key into `books`; link: the book's index-page URL.
    Commits after each chapter insert so progress survives a crash.
    """
    print("-----------开始采集章节内容--------" + link)
    # loop-invariant request headers — hoisted out of the loops (the original
    # rebuilt this dict per chapter) and now sent on every request
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/112.0.0.0 Safari/537.36',
        'x-client-data': "CKK1yQEIiLbJAQiitskBCMG2yQEIqZ3KAQj5k8sBCJShywEI/KrMAQic/swBCIWgzQEIvqLNAQ=="
    }
    page_html = requests.get(link, headers=headers)
    page_html.encoding = 'gbk'  # fix: decode consistently — the site is GBK
    bs = BeautifulSoup(page_html.text, 'lxml')
    pages = bs.find("select", "form-control").find_all("option")
    # NOTE(review): range(1, len(pages)) never fetches index_<len(pages)>.html —
    # looks like an off-by-one that drops the last page; confirm against the
    # site's pager before changing the bound.
    for page in range(1, len(pages)):
        page_url = link + "index_" + str(page) + ".html"
        print("-----------开始采集章节页码--------" + page_url)
        chapter_html = requests.get(page_url, headers=headers)
        chapter_html.encoding = 'gbk'  # fix: same encoding as the content pages
        chapter_bs = BeautifulSoup(chapter_html.text, 'lxml')
        for d in chapter_bs.select("dl dd"):
            href = d.select("a")[0]["href"]
            chapter_url = link + href
            print("-----------开始采集内容--------" + chapter_url)
            content_html = requests.get(chapter_url, headers=headers)
            content_html.encoding = 'gbk'
            content_bs = BeautifulSoup(content_html.text, 'lxml')
            article = content_bs.find("div", id="content").text
            name = content_bs.find("h1").text
            # chapter number on the old site = filename without the extension
            old_chapter = href.split(".", 1)[0]
            created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            content_sql = 'insert into chapter(name,link,old_chapter,content,page,created_at,book_id)' \
                          ' values (%s,%s,%s,%s,%s,%s,%s)'
            query.execute(content_sql, (name, chapter_url, old_chapter, article, page, created_at, book_id))
            db.commit()  # per-chapter commit keeps partial progress
            print("-----------采集完一条内容------------")
def conn():
    """Open and return a pymysql connection to the local `stories` database.

    Raises: re-raises the underlying connect error after logging its args,
    so callers fail at the real cause instead of receiving None and dying
    later on `db.cursor()` with an unrelated AttributeError.
    """
    try:
        return pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='root',
            db='stories',
            charset='utf8'
        )
    except Exception as b:
        print(b.args)
        # fix: the original swallowed the error and implicitly returned None
        raise
# Script entry point: run the crawler and surface any failure's args.
if __name__ == '__main__':
    try:
        spider()
    except Exception as err:
        print(err.args)
# 推荐文章: (recommended articles — trailing note left over from the original blog post)