Python Learning Notes

The code first:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import the required libraries
from lxml import etree
import requests
import csv
import time

# Category IDs on aihuishou.com: 1 phones, 2 cameras, 3 smart gadgets, 5 laptops, 6 tablets
type_dict = {"1": "手机", "2": "摄影摄像", "3": "智能数码", "5": "笔记本", "6": "平板电脑"}

# Add request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


def get_text(url, try_num):
    # Fetch the page, retrying recursively up to try_num times on network errors
    try:
        html = requests.get(url, headers=headers, timeout=10)
        return html
    except Exception as e:
        print(e.args)
        if try_num >= 1:
            return get_text(url, try_num - 1)
        else:
            return ''


def my_import(k, t):
    # Create the CSV file (utf_8_sig writes a BOM so Excel displays Chinese correctly)
    fp = open('aihuishou' + str(k) + '.csv', 'wt', newline='', encoding='utf_8_sig')

    # Write the header row
    writer = csv.writer(fp)
    writer.writerow(('type_id', 'type', 'name_info', 'price'))

    # Build the list of paginated search URLs
    urls = ['https://www.aihuishou.com/product/Search.html?c=' + str(k) + '&p={}&all=True'.format(str(i)) for i in
            range(1, 250)]

    # Category 2 (cameras) uses a different URL/pagination scheme
    if k == "2":
        urls = ['https://www.aihuishou.com/sheying{}?all=True'.format("" if i == 1 else "-p" + str(i)) for i in
                range(1, 250)]

    for url in urls:
        try_num = 5
        html = get_text(url, try_num)
        if not html:
            continue

        selector = etree.HTML(html.text)
        # Grab the outer product <li> nodes, then loop over them
        infos = selector.xpath('//div[@class="product-list-wrapper"]/ul/li')

        # An empty product list means we are past the last page
        if not infos:
            break

        for info in infos:
            name_info = info.xpath('a/p/text()')[0]  # product name string; could .split(' ') into parts
            # brand = name_info[0]  # brand
            # model = name_info[1] if len(name_info) > 1 else ""  # model
            # version = name_info[2] if len(name_info) > 2 else ""  # version
            price = (info.xpath('a/div[@class="price"]/em/text()')[0]).replace('¥', '')  # highest trade-in price
            # Write one data row
            writer.writerow((k, t, name_info, price))
        # Requesting too fast may get the IP rate-limited
        time.sleep(2)
    # Close the CSV file
    fp.close()


for k, t in type_dict.items():
    my_import(k, t)
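
To see what the XPath expressions above pull out, here is a minimal, self-contained sketch run against a hand-written HTML fragment. The markup below is an assumption that mimics the page structure implied by the XPath, not a capture of the real page:

from lxml import etree

# Hypothetical fragment mimicking the structure the XPath above assumes
sample = '''
<div class="product-list-wrapper"><ul>
  <li><a><p>iPhone 6s 64G</p><div class="price"><em>¥1200</em></div></a></li>
  <li><a><p>iPad Air 2</p><div class="price"><em>¥900</em></div></a></li>
</ul></div>
'''

selector = etree.HTML(sample)
for info in selector.xpath('//div[@class="product-list-wrapper"]/ul/li'):
    name_info = info.xpath('a/p/text()')[0]
    price = info.xpath('a/div[@class="price"]/em/text()')[0].replace('¥', '')
    print(name_info, price)  # e.g. "iPhone 6s 64G 1200"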

Steps

  • Install lxml
sudo pip install lxml
  • Analyze the site's URL pagination scheme (see the sketch after this list)
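
A quick way to sanity-check the pagination pattern before crawling is to print a few of the generated URLs. This minimal sketch reuses the URL template from the code above; the 250-page cap is the same assumption the script makes:

# Preview the first few generated page URLs for category 1 (phones)
k = "1"
urls = ['https://www.aihuishou.com/product/Search.html?c=' + k + '&p={}&all=True'.format(i)
        for i in range(1, 250)]
print(urls[:3])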

Notes:

  • Sleep between requests to avoid stressing the site or getting your IP banned (see the sketch after this list)

  • Network errors can happen, so build in a retry count

  • This code is for learning and discussion only. Commercial use is strictly prohibited; please delete it within 24 hours.
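
On the first two points, requests can also handle retries at the session level instead of the hand-rolled recursive get_text above, and a randomized delay is gentler than a fixed one. A minimal sketch; the backoff factor and status codes here are illustrative choices, not from the original script:

import random
import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

headers = {'User-Agent': 'Mozilla/5.0'}  # same idea as the headers above

# Session-level retries: up to 5 attempts with exponential backoff,
# also retrying on common transient HTTP status codes
session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))

html = session.get('https://www.aihuishou.com/product/Search.html?c=1&p=1&all=True',
                   headers=headers, timeout=10)

# Randomized delay instead of a fixed 2 seconds
time.sleep(random.uniform(1, 3))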
