Python asyncio Crawler

A small crawler built on asyncio: aiohttp fetches the list and gallery pages of mzitu.com, regular expressions extract the gallery and image URLs, and aiofiles writes the downloaded images to disk without blocking the event loop.
import asyncio
import os
import re

import aiofiles
import aiohttp
from aiohttp.client_exceptions import ClientConnectionError
# Regular expressions for pulling links out of the pages (raw strings so \d
# is a regex token rather than a Python string escape):
RE_IMG_PAGES = re.compile(r'''<li><a href=["'](https://www.mzitu.com/\d+)["']''')  # entry URL of each gallery
RE_LIST_NEXT_PAGE = re.compile(r'''next page-numbers" href=["'](https://www.mzitu.com/page/\d+/)["']>''')  # next page of the list
RE_IMG_INFO = re.compile(r'''<div class="main-image">.+?<img src=["']([^"']+?)["'] alt=["']([^"']+?)["']''')  # image URL and title
RE_IMG_NEXT_PAGE = re.compile(r'''href=["']([^"']+?/\d+/\d+)["']><span>下一页''')  # link to a gallery's next page
RE_SUB_DIRNAME = re.compile(r'[<>/\\|:*?]')  # characters not allowed in directory names
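
As a quick sanity check, the patterns can be exercised against a hand-written fragment. The markup below is invented to match what RE_IMG_PAGES expects, not copied from the live site:

sample = '<ul><li><a href="https://www.mzitu.com/123456" target="_blank">demo</a></li></ul>'
print(RE_IMG_PAGES.findall(sample))  # -> ['https://www.mzitu.com/123456']
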
async def download(url, retries=0):
    headers = {'User-Agent': 'Mozilla', 'Referer': 'https://www.mzitu.com/'}
    if retries < 3:
        async with aiohttp.request('GET', url, headers=headers,
                                   allow_redirects=False, expect100=True) as resp:
            if resp.status == 200:
                return await resp.read()
        # Non-200 response: back off, then retry recursively (the connection
        # is released before sleeping, so it is not held open for 10 seconds).
        await asyncio.sleep(10)
        return await download(url, retries + 1)
    raise ClientConnectionError(f'{url} still failing after 3 retries')
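
Note that aiohttp.request() builds and tears down a throwaway ClientSession (and its connection pool) on every call. That works, but for a crawler issuing many requests the more common aiohttp pattern is one shared session owned by the caller. A minimal sketch of the same retry logic on top of a shared session (download_with is an illustrative name, not part of the original script):

async def download_with(session, url, retries=0):
    headers = {'User-Agent': 'Mozilla', 'Referer': 'https://www.mzitu.com/'}
    if retries < 3:
        async with session.get(url, headers=headers, allow_redirects=False) as resp:
            if resp.status == 200:
                return await resp.read()
        await asyncio.sleep(10)
        return await download_with(session, url, retries + 1)
    raise ClientConnectionError(f'{url} still failing after 3 retries')

# The caller creates the session once and passes it down:
# async with aiohttp.ClientSession() as session:
#     data = await download_with(session, some_url)
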
async def save_image(img_url, save_dir=''):
    img = await download(img_url)
    save_dir = RE_SUB_DIRNAME.sub('_', save_dir)  # strip characters invalid in directory names
    save_path = os.path.join(save_dir, os.path.split(img_url)[-1])
    try:
        # Try to save directly; if the directory is missing, create it and retry.
        async with aiofiles.open(save_path, mode='wb') as img_fp:
            await img_fp.write(img)
    except FileNotFoundError:
        os.mkdir(save_dir)
        async with aiofiles.open(save_path, mode='wb') as img_fp:
            await img_fp.write(img)
    print(save_path)
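
The try/except dance can also be collapsed: creating the directory up front with os.makedirs(..., exist_ok=True) is idempotent, so the retry branch disappears. A sketch (save_image_v2 is an illustrative name):

async def save_image_v2(img_url, save_dir=''):
    img = await download(img_url)
    save_dir = RE_SUB_DIRNAME.sub('_', save_dir)
    os.makedirs(save_dir or '.', exist_ok=True)  # no-op when the directory already exists
    save_path = os.path.join(save_dir, os.path.split(img_url)[-1])
    async with aiofiles.open(save_path, mode='wb') as img_fp:
        await img_fp.write(img)
    print(save_path)
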
async def process_list_page(list_page_url):
    list_page = await download(list_page_url)
    list_page = list_page.decode('utf-8')
    img_page_list = RE_IMG_PAGES.findall(list_page)
    # Only the first gallery is crawled here as a demo; iterate over the full
    # img_page_list to crawl every gallery on the page -- enable with caution.
    for img_page in img_page_list[:1]:
        await process_img_page(img_page)
    # Follow the list's next page recursively -- enable with caution.
    # list_next_page_list = RE_LIST_NEXT_PAGE.findall(list_page)
    # for list_next_page in list_next_page_list:
    #     await process_list_page(list_next_page)
async def process_img_page(img_page_url):
    img_page = await download(img_page_url)
    img_page = img_page.decode('utf-8')
    img_info_list = RE_IMG_INFO.findall(img_page)
    for img_url, img_title in img_info_list:
        # Images are saved under a directory named after the gallery title.
        await save_image(img_url, img_title)
    img_next_page_list = RE_IMG_NEXT_PAGE.findall(img_page)
    for img_next_page in img_next_page_list:
        await process_img_page(img_next_page)  # recurse into the gallery's next page
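
Since every 下一页 link triggers another await process_img_page(...), a very long gallery means equally deep recursion. Python's default recursion limit (about 1000 frames) is usually plenty, but an iterative loop sidesteps the question; a sketch reusing the same regexes (process_img_page_iter is an illustrative name):

async def process_img_page_iter(img_page_url):
    next_url = img_page_url
    while next_url:
        img_page = (await download(next_url)).decode('utf-8')
        for img_url, img_title in RE_IMG_INFO.findall(img_page):
            await save_image(img_url, img_title)
        next_pages = RE_IMG_NEXT_PAGE.findall(img_page)
        # Follow the first "next page" link; stop when there is none.
        next_url = next_pages[0] if next_pages else None
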
base_url = 'https://www.mzitu.com/'
asyncio.run(process_list_page(base_url))  # Python 3.7+; replaces get_event_loop().run_until_complete()
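
With the "enable with caution" loops turned on, requests still run strictly one after another, because each await finishes before the next starts. To crawl several galleries concurrently while keeping the request rate bounded, asyncio.gather plus a Semaphore is the usual asyncio pattern; a sketch (the limit of 3 is an arbitrary choice, and crawl_galleries is an illustrative name):

async def crawl_galleries(urls, limit=3):
    sem = asyncio.Semaphore(limit)  # at most `limit` galleries in flight at once

    async def bounded(url):
        async with sem:
            await process_img_page(url)

    await asyncio.gather(*(bounded(u) for u in urls))

# e.g. asyncio.run(crawl_galleries(img_page_list, limit=3))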