Sharing my first crawler as a beginner
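The script below crawls image galleries from mzitu.com: requests fetches each page, lxml's XPath pulls out the gallery and image links, and every image is saved as an incrementing N.jpg inside a local pic folder.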
import requests
from lxml import etree
import os
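# global counter used to number the downloaded images (1.jpg, 2.jpg, ...)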
name = 0
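# Build per-request headers; many image hosts reject requests without a Referer, so the URL is passed along.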
def header(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Referer': url
    }
    return header
# Download one image and save it as <counter>.jpg (skipped if the file already exists)
def get_img(url):
    global name
    name += 1
    img_name = '{}.jpg'.format(name)
    if not os.path.isfile(img_name):
        img = requests.get(url, headers=header(url)).content
        with open(img_name, 'wb') as save_img:
            save_img.write(img)
# Extract the image URL from a single gallery page
def get_url(url):
    html = requests.get(url, headers=header(url)).text
    etree_html = etree.HTML(html)
    img_url = etree_html.xpath('//div[@class="main-image"]/p/a/img/@src')
    return img_url
# Extract the pagination links inside a gallery
def get_mainpic_url(url):
    html = requests.get(url, headers=header(url)).text
    etree_html = etree.HTML(html)
    page_url = etree_html.xpath('//div[@class="pagenavi"]/a/@href')
    return page_url
# Extract the gallery links from an index (list) page
def get_subset(url):
    html = requests.get(url, headers=header(url)).text
    etree_html = etree.HTML(html)
    page_url = etree_html.xpath('//div[@class="postlist"]/ul[@id="pins"]/li/a/@href')
    return page_url
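# Main crawl loop: walk the index pages, visit up to maxnumber galleries, download every image in each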
def main():
    address = os.path.join(os.path.abspath('.'), 'pic')
    # create the download folder if needed, then work inside it
    os.makedirs(address, exist_ok=True)
    os.chdir(address)
    # how many galleries to crawl
    maxnumber = int(input("How many galleries do you want to crawl? "))
    n = 0
    x = 0
    while True:
        x += 1
        # page 1 is the homepage; later index pages live under /page/<x>/
        if x == 1:
            url = 'https://www.mzitu.com'
        else:
            url = 'https://www.mzitu.com/page/{}/'.format(x)
        # gallery urls on this index page
        addrlist_main = get_subset(url)
        for subaddr in addrlist_main:
            if n < maxnumber:
                n += 1
                # pagination urls inside this gallery
                pagelist_sub = get_mainpic_url(subaddr)
                for page in pagelist_sub:
                    if page != "/hot/":
                        # main image on each gallery page
                        img_list = get_url(page)
                        for img in img_list:
                            # download the image
                            get_img(img)
            else:
                # crawled enough galleries, stop
                return
main()
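To run it, save the script as something like spider.py, start it with python spider.py, and enter the number of galleries when prompted; the images land in the pic folder next to the script. One caveat: the requests fire back-to-back, and sites like this tend to block clients that hammer them. A minimal sketch of a politer fetch helper (polite_get and its defaults are my own addition, not part of the script above):

import time
import requests

def polite_get(url, headers, delay=1.0, timeout=10):
    # hypothetical helper: pause before each request and bound the wait,
    # so the crawl is slower but less likely to be blocked or to hang
    time.sleep(delay)
    return requests.get(url, headers=headers, timeout=timeout)

Swapping this in for the bare requests.get calls above (plus a try/except around each call for network errors) makes the crawl noticeably more robust.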