[求助] Pixiv爬虫:通过requests爬取包含作者全部作品的ajax请求链接的时候,返回的数据不完整

尝试爬取Pixiv中一个作者的全部作品,在详情页抓取到一个ajax的数据包,它的url在浏览器打开是可以返回所有的数据,但是通过requests请求时,只能访问部分数据。
代码如下

import requests

# Local proxy (e.g. a proxy client listening on this port).
http_address = '127.0.0.1:1088'
proxies = {
    "http": http_address,
    "https": http_address,
}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "referer": "https://www.pixiv.net/users/432332/illustrations",
}
url = "https://www.pixiv.net/ajax/user/432332/profile/all?lang=zh"
# BUG FIX: the original call never sent `headers`, so Pixiv served the
# stripped-down anonymous response. Pass them explicitly (a logged-in
# session cookie may additionally be required for the full listing).
resp = requests.get(url=url, headers=headers, proxies=proxies)
print(resp.text)

拿到的数据如下:
{"error":false,"message":"","body":{"illusts":{"56725501":null,"44981873":null},...
浏览器请求的数据如下:
{"error":false,"message":"","body":{"illusts":{"71282994":null,"67511354":null,"64667165":null,"62930956":null,"56725501":null,"54229008":null,"51870535":null,"44981873":null},...
缺失了一部分的作品id
请问大佬们有什么好的处理办法吗?

Jason990420
最佳答案

这个网站要登入才能获取完整的资料

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class Browser():
    """Thin Selenium wrapper that logs into Pixiv so that AJAX endpoints
    return the complete (authenticated) payload instead of the partial
    anonymous one."""

    def __init__(self):
        # Path to a chromedriver matching the locally installed Chrome.
        driver_path = "D:/Python/Project/chromedriver.exe"
        self.browser = webdriver.Chrome(executable_path=driver_path)
        self.login()

    def close(self):
        """Close the browser window."""
        self.browser.close()

    def login(self):
        """Open the Pixiv login page, fill in the credentials and submit."""
        login_url = (
            "https://accounts.pixiv.net/login?"
            "return_to=https%3A%2F%2Fwww.pixiv.net%2F&"
            "lang=zh_cn&source=pc&view_type=page")

        username = "你的帐号名称"
        password = "你的帐号密码"

        self.browser.get(login_url)
        element = self.browser.find_element_by_xpath("// input [@ autocomplete ='username']")
        element.send_keys(username)
        element = self.browser.find_element_by_xpath("// input [@ autocomplete ='current-password']")
        element.send_keys(password)
        # ENTER submits the form. BUG FIX: the original follow-up line
        # `element.find_element_by_xpath(...).click` never actually called
        # click() (missing parentheses) and was dead code, so it is removed.
        element.send_keys(Keys.ENTER)
        # Give the post-login redirect time to finish before any further use.
        sleep(10)

    def get(self, url):
        """Navigate to *url* and return the rendered page source."""
        self.browser.get(url)
        return self.browser.page_source


url = "https://www.pixiv.net/ajax/user/432332/profile/all?lang=zh"
browser = Browser()
try:
    # Fetch the AJAX endpoint through the logged-in browser session.
    content = browser.get(url)
finally:
    # BUG FIX: ensure the driver window is closed even if the fetch fails
    # (the original leaked the browser process on any exception).
    browser.close()

print(content)
2年前 评论
lingyux (楼主) 2年前
lingyux (楼主) 2年前
讨论数量: 4

你把requests请求里加上headers试试,如下:

resp = requests.get(url=url, proxies=proxies, headers=headers)
2年前 评论
lingyux (楼主) 2年前
lingyux (楼主) 2年前
    def main(self):
        """Log in via Selenium, copy the browser cookies into a requests
        session, then fetch the author's illustration ids with it."""
        cookies = self.log_in()
        session = requests.session()
        for cookie in cookies:
            # Convert Selenium cookie dicts into requests session cookies.
            session.cookies.set(cookie['name'], cookie['value'])
        pic_ids = self.get_pic_id(session)
        # Planned async download step (not implemented yet):
        # loop = asyncio.get_event_loop()
        # loop.run_until_complete(self.download(pic_ids))

    def log_in(self):
        """Drive a Chrome window through the Pixiv login form and return
        the browser's cookies for reuse in a requests session.

        BUG FIX: the original read the cookies immediately after clicking
        the login button, before the post-login redirect could set the
        session cookies — a likely cause of the incomplete data. A short
        wait is added after the click.
        """
        print("请输入用户名密码: ")
        username = input("用户名: ")
        passwd = input("密码: ")
        web = Chrome()
        web.get('https://www.pixiv.net/')
        time.sleep(5)
        web.find_element_by_xpath('//*[@id="wrapper"]/div[3]/div[2]/a[2]').click()
        web.find_element_by_xpath('//*[@id="LoginComponent"]/form/div[1]/div[1]/input').send_keys(username)
        web.find_element_by_xpath('//*[@id="LoginComponent"]/form/div[1]/div[2]/input').send_keys(passwd)
        web.find_element_by_xpath('//*[@id="LoginComponent"]/form/button').click()
        # Wait for the login redirect to complete so the session cookies
        # actually exist before we read them.
        time.sleep(5)
        cookies = web.get_cookies()
        return cookies

    def get_pic_id(self, session):
        """Query Pixiv's profile/all AJAX endpoint for an author and
        return the list of illustration ids (strings).

        BUG FIX: the original printed the ids but returned None, even
        though main() assigns its result (`pic_ids = self.get_pic_id(...)`).
        It also mutated the shared self.headers dict when setting the
        per-author referer; a copy is used instead.
        """
        user_id = int(input("作者的id号: "))
        # Example endpoint:
        # https://www.pixiv.net/ajax/user/27024181/profile/all?lang=zh
        pic_id_url = "https://www.pixiv.net/ajax/user/%d/profile/all?lang=zh" % user_id
        headers = dict(self.headers)  # avoid mutating the shared header dict
        headers["referer"] = "https://www.pixiv.net/users/%d/illustrations" % user_id
        resp = session.get(url=pic_id_url, headers=headers,
                           proxies=self.proxies).json()
        # body.illusts is a dict keyed by illustration id.
        return list(resp['body']['illusts'].keys())

我尝试通过selenium登录拿到网站的cookie进行模拟登录,但是拿到的数据还是不对

2年前 评论
Jason990420

这个网站要登入才能获取完整的资料

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class Browser():
    """Thin Selenium wrapper that logs into Pixiv so that AJAX endpoints
    return the complete (authenticated) payload instead of the partial
    anonymous one."""

    def __init__(self):
        # Path to a chromedriver matching the locally installed Chrome.
        driver_path = "D:/Python/Project/chromedriver.exe"
        self.browser = webdriver.Chrome(executable_path=driver_path)
        self.login()

    def close(self):
        """Close the browser window."""
        self.browser.close()

    def login(self):
        """Open the Pixiv login page, fill in the credentials and submit."""
        login_url = (
            "https://accounts.pixiv.net/login?"
            "return_to=https%3A%2F%2Fwww.pixiv.net%2F&"
            "lang=zh_cn&source=pc&view_type=page")

        username = "你的帐号名称"
        password = "你的帐号密码"

        self.browser.get(login_url)
        element = self.browser.find_element_by_xpath("// input [@ autocomplete ='username']")
        element.send_keys(username)
        element = self.browser.find_element_by_xpath("// input [@ autocomplete ='current-password']")
        element.send_keys(password)
        # ENTER submits the form. BUG FIX: the original follow-up line
        # `element.find_element_by_xpath(...).click` never actually called
        # click() (missing parentheses) and was dead code, so it is removed.
        element.send_keys(Keys.ENTER)
        # Give the post-login redirect time to finish before any further use.
        sleep(10)

    def get(self, url):
        """Navigate to *url* and return the rendered page source."""
        self.browser.get(url)
        return self.browser.page_source


url = "https://www.pixiv.net/ajax/user/432332/profile/all?lang=zh"
browser = Browser()
try:
    # Fetch the AJAX endpoint through the logged-in browser session.
    content = browser.get(url)
finally:
    # BUG FIX: ensure the driver window is closed even if the fetch fails
    # (the original leaked the browser process on any exception).
    browser.close()

print(content)
2年前 评论
lingyux (楼主) 2年前
lingyux (楼主) 2年前

一模一样的问题, id数不完整, 反复用len()确认发现少了一些, 后来听楼上的加cookie, 把cookie怼进请求头(懒方法) 解决, 获得了完整work_id

2年前 评论

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!