小白入门分享第一个爬虫

import requests
from lxml import etree
import os

# Module-level sequential counter used by get_img() to generate the
# downloaded filenames ("1.jpg", "2.jpg", ...). Mutated via `global name`.
name = 0

def header(url):
    """Build request headers for the target site.

    Sends a desktop-browser User-Agent and uses *url* itself as the
    Referer (the image host rejects requests without a plausible Referer).

    :param url: the page/image URL being requested
    :return: dict of HTTP headers suitable for ``requests.get``
    """
    # Renamed the local dict: the original shadowed the function's own name.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Referer': url,
    }
    return headers

def get_img(url):
    """Download one image from *url* into the current working directory.

    The filename is taken from the module-level counter ``name``
    ("<n>.jpg"). The counter advances even when the file already exists,
    so numbering follows the crawl order, not the files on disk.

    :param url: direct URL of the image to fetch
    """
    global name
    name += 1

    img_name = '{}.jpg'.format(name)
    # Skip images already saved by a previous (interrupted) run.
    if not os.path.isfile(img_name):
        img = requests.get(url, headers=header(url)).content
        with open(img_name, 'wb') as save_img:
            save_img.write(img)

def get_url(url):
    """Fetch one gallery page and return the src URL(s) of its main image.

    :param url: URL of a single gallery page
    :return: list of image ``src`` attribute strings (possibly empty)
    """
    response = requests.get(url, headers=header(url))
    tree = etree.HTML(response.text)
    return tree.xpath('//div[@class="main-image"]/p/a/img/@src')

def get_mainpic_url(url):
    """Return the pagination links found on a gallery page.

    :param url: URL of a gallery (album) page
    :return: list of href strings from the page-navigation block;
             may include non-page links such as "/hot/" (filtered by caller)
    """
    response = requests.get(url, headers=header(url))
    tree = etree.HTML(response.text)
    return tree.xpath('//div[@class="pagenavi"]/a/@href')

def get_subset(url):
    """Return the album links listed on an index page.

    :param url: URL of a site index (listing) page
    :return: list of album href strings scraped from the post list
    """
    response = requests.get(url, headers=header(url))
    tree = etree.HTML(response.text)
    return tree.xpath('//div[@class="postlist"]/ul[@id="pins"]/li/a/@href')

def main():
    """Entry point: ask how many albums to crawl, then download them into ./pic.

    Walks the site's paginated index pages; for every album link found,
    visits each page of that album and downloads its main image via
    :func:`get_img`. Stops after the requested number of albums.
    """
    # Portable path join instead of the original Windows-only "\\pic" concat;
    # also create the folder — the original os.chdir crashed if it was missing.
    address = os.path.join(os.path.abspath('.'), 'pic')
    os.makedirs(address, exist_ok=True)

    maxnumber = int(input("你想爬取多少个图册:"))

    albums_done = 0
    page_index = 0
    while True:
        if page_index == 0:
            url = 'https://www.mzitu.com'
        else:
            # BUG FIX: the original had '.format(x)' INSIDE the string
            # literal, so the page number was never substituted.
            url = 'https://www.mzitu.com/page/{}'.format(page_index)
        # BUG FIX: the original only incremented its counter on the first
        # iteration, so it re-fetched the same index page forever.
        page_index += 1

        for subaddr in get_subset(url):          # album links on this index page
            if albums_done >= maxnumber:
                return                           # done — original called exit()
            albums_done += 1
            os.chdir(address)                    # save images into ./pic

            for page_url in get_mainpic_url(subaddr):
                if page_url != "/hot/":          # skip the nav link mixed into pagenavi
                    for img in get_url(page_url):
                        get_img(img)             # download the image

# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
讨论数量: 3

排版,说明,代码高亮、缩进都没有。。。。。

4周前 评论

@Bgods 第一次参与这个论坛发帖,还不会这些操作。。。

2周前 评论

是不是打算面向监狱编程

2周前 评论

请勿发布不友善或者负能量的内容。与人为善,比聪明更重要!