Could anyone help me with a Taobao scraping problem? I wrote a scraper, but it only ever scrapes the first page and never moves on to the next page.
# myspaceyang
# 2022-04-19 15:33
import random
import time
import csv
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
# Function that runs the product search and reads the total number of result pages
def search():
    driver.find_element(By.CSS_SELECTOR, '#q').send_keys(keyword)
    time.sleep(random.randint(1, 3))
    # Click the search button
    driver.find_element(By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button').click()
    time.sleep(random.randint(1, 10))
    # Fill in the login name and password (placeholders; there is no submit click here,
    # so the login / slider check is presumably finished by hand during the sleep below)
    driver.find_element(By.CSS_SELECTOR, '#fm-login-id').send_keys('your Taobao username')
    time.sleep(random.randint(1, 7))
    driver.find_element(By.CSS_SELECTOR, '#fm-login-password').send_keys('your password')
    time.sleep(random.randint(10, 15))
    # Read how many result pages there are
    token = driver.find_element(By.XPATH, '//div[@class="total"]').text
    token = int(re.compile(r'(\d+)').search(token).group(0))
    return token
# Page through the results
def next_page():
    token = search()
    num = 0
    while num != token - 1:
        print('*' * 100)
        print('Scraping data from page {}'.format(num + 1))
        print('*' * 100)
        driver.get('https://s.taobao.com/search?q={}&s={}'.format(keyword, 44 * num))
        driver.implicitly_wait(10)
        num += 1
        drop_down()
        time.sleep(random.randint(1, 3))
        get_products()
# Scroll the page down in steps so lazily loaded items render
def drop_down():
    for x in range(1, 11, 2):
        time.sleep(random.randint(1, 2))
        j = x / 10
        js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
        driver.execute_script(js)
def get_products():
    # Sort by sales volume
    # selector: #J_relative > div.sort-row > div > ul > li:nth-child(2) > a
    driver.find_element(By.CSS_SELECTOR, '#J_relative > div.sort-row > div > ul > li:nth-child(2) > a').click()
    time.sleep(random.randint(1, 3))
    # Grab the result items
    lis = driver.find_elements(By.XPATH, '//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    # Walk over each item
    for li in lis:
        # Title
        info = li.find_element(By.XPATH, './/div[@class="row row-2 title"]/a').text
        # Price (class="price g_price g_price-highlight")
        price = li.find_element(By.XPATH, './/div[@class="price g_price g_price-highlight"]').text
        # Sales count (class="deal-cnt")
        deal = li.find_element(By.XPATH, './/div[@class="deal-cnt"]').text
        # Shop name (class="shop")
        shop = li.find_element(By.XPATH, './/div[@class="shop"]/a').text
        # Ship-from location (class="location")
        location = li.find_element(By.XPATH, './/div[@class="location"]').text
        # Item link (taken from the title anchor)
        hrefs = li.find_element(By.XPATH, './/div[@class="row row-2 title"]/a')
        href = hrefs.get_attribute('href')
        print(info, price, deal, shop, location, href, sep='|')
        with open('e:/python/day7/data.csv', 'a', newline='', encoding='utf-8-sig') as filecsv:
            csvwriter = csv.writer(filecsv)
            csvwriter.writerow([info, price, deal, shop, location, href])
if __name__ == '__main__':
    keyword = input('Enter the product to search for: ')
    # Start the browser
    driver = webdriver.Chrome()
    # Hide the webdriver flag so Selenium is harder to detect
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument",
        {"source": """Object.defineProperty(navigator,'webdriver',{get:()=>false})"""})
    # Open Taobao
    driver.get('https://www.taobao.com/')
    # Implicit wait for element lookups
    driver.implicitly_wait(10)
    driver.maximize_window()
    time.sleep(random.randint(1, 3))
    next_page()
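
For what it's worth, one thing that would produce exactly this symptom: get_products() clicks the sort-by-sales link on every iteration, and that click submits a fresh search, which drops the browser back onto page 1 regardless of the s= offset in the URL. Below is a minimal sketch of an alternative paging loop, assuming the s= offset (44 items per page) and the sort=sale-desc query parameter still behave the way they did on s.taobao.com when this was written; both are assumptions to verify, not confirmed details.

# Sketch only: put the sort into the URL once and drop the sort click from
# get_products(); sort=sale-desc and the 44-items-per-page offset are assumptions
# about Taobao's search URL, not verified here.
def next_page():
    token = search()
    for num in range(token):
        print('Scraping page {}'.format(num + 1))
        driver.get('https://s.taobao.com/search?q={}&sort=sale-desc&s={}'.format(keyword, 44 * num))
        driver.implicitly_wait(10)
        drop_down()
        time.sleep(random.randint(1, 3))
        get_products()

Note also that the original `while num != token - 1` loop stops one page early; a plain range(token) avoids that off-by-one.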