Python Learning Notes
The code first:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import the required libraries
from lxml import etree
import requests
import csv
import time

# Category id -> category name; the Chinese names match the site's own
# categories (phones, cameras, smart gadgets, laptops, tablets)
type_dict = {"1": "手机", "2": "摄影摄像", "3": "智能数码", "5": "笔记本", "6": "平板电脑"}

# Request headers, so the crawler looks like a regular browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


def get_text(url, try_num):
    """Fetch a page, retrying up to try_num extra times on network errors."""
    try:
        html = requests.get(url, headers=headers, timeout=10)
        return html
    except Exception as e:
        print(e.args)
        if try_num >= 1:
            return get_text(url, try_num - 1)
        return None


def my_import(k, t):
    # Create one CSV file per category and write the header row
    fp = open('aihuishou' + str(k) + '.csv', 'w', newline='', encoding='utf_8_sig')
    writer = csv.writer(fp)
    writer.writerow(('type_id', 'type', 'name_info', 'price'))
    # Build the list of page URLs; category 2 uses a different URL scheme
    urls = ['https://www.aihuishou.com/product/Search.html?c={}&p={}&all=True'.format(k, i)
            for i in range(1, 250)]
    if k == "2":
        urls = ['https://www.aihuishou.com/sheying{}?all=True'.format("" if i == 1 else "-p" + str(i))
                for i in range(1, 250)]
    for url in urls:
        try_num = 5
        html = get_text(url, try_num)
        if not html:
            continue
        selector = etree.HTML(html.text)
        # Grab the per-product <li> nodes and loop over them;
        # an empty page means we have run past the last real page
        infos = selector.xpath('//div[@class="product-list-wrapper"]/ul/li')
        if not infos:
            break
        for info in infos:
            name_info = info.xpath('a/p/text()')[0]  # .split(' ')  # product name info
            # brand = name_info[0]  # brand
            # model = name_info[1] if len(name_info) > 1 else ""  # model
            # version = name_info[2] if len(name_info) > 2 else ""  # version
            price = (info.xpath('a/div[@class="price"]/em/text()')[0]).replace('¥', '')  # highest buy-back price
            # Write one row of data
            writer.writerow((k, t, name_info, price))
        # Requesting too fast may get the IP rate-limited
        time.sleep(2)
    # Close the CSV file
    fp.close()


for k, t in type_dict.items():
    my_import(k, t)
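Run the script directly and it crawls every category in type_dict, writing one CSV per category (aihuishou1.csv, aihuishou2.csv, ...), each with the columns type_id, type, name_info, price.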
Steps
- Install the dependencies (lxml and requests; csv and time ship with Python)
sudo pip install lxml requests
- Work out how the site paginates its URLs (two patterns here, sketched below)
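The two pagination patterns the script handles can be seen in isolation below. page_url is a hypothetical helper written only for illustration, but the URL formats are exactly the ones built in my_import above: most categories take a ?c=<id>&p=<page> query string, while category 2 (cameras) uses a path-style "sheying[-pN]" URL.
def page_url(category_id, page):
    # Category 2 uses a path-style URL; everything else uses a query string
    if category_id == "2":
        suffix = "" if page == 1 else "-p" + str(page)
        return 'https://www.aihuishou.com/sheying{}?all=True'.format(suffix)
    return 'https://www.aihuishou.com/product/Search.html?c={}&p={}&all=True'.format(category_id, page)

print(page_url("1", 3))  # https://www.aihuishou.com/product/Search.html?c=1&p=3&all=True
print(page_url("2", 1))  # https://www.aihuishou.com/sheying?all=True
print(page_url("2", 3))  # https://www.aihuishou.com/sheying-p3?all=True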
Notes:
Sleep between requests, both to avoid putting load on the site and to keep your IP from getting banned.
Network errors can occur, so give each request a retry count (a variant with backoff is sketched below).
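The fixed retry count above works, but a variant that waits longer after each failure is gentler on the network. This is a minimal sketch (get_text_backoff is a name made up here), reusing the headers and imports from the script:
def get_text_backoff(url, tries=5, delay=1):
    # Pause 1s, 2s, 4s, ... between attempts instead of retrying at once
    for attempt in range(tries):
        try:
            return requests.get(url, headers=headers, timeout=10)
        except Exception as e:
            print(e.args)
            time.sleep(delay * (2 ** attempt))
    return None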
This code is for learning and discussion only; commercial use is strictly forbidden. Please delete it within 24 hours.