关于class类内函数调用的问题——附上代码请求指点

代码

import requests
from lxml import etree

class Chaidanzhuanjia(object):
    def __init__(self,times):
        self.times = int(times)
        self.headers = headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'
}

# data{
#     #第一页
#     #https://movie.douban.com/subject/30171424/reviews
#     #第二页
#     #https://movie.douban.com/subject/30171424/reviews?start=20
#     #第三页
#     #https://movie.douban.com/subject/30171424/reviews?start=40
# }

    # 创建爬取函数
    def request_page(self):
        for i in range(self.times):
            #根据页面规则构造页面url
            url = 'https://movie.douban.com/subject/30171424/reviews?start={}'.format(i*20)
            response = requests.get(url=url,headers=self.headers).text
            print(response)
            return response

    #创建清洗数据函数
    def wash_data(self):
        data = request_page()
        html = etree.HTML(data)
        html = etree.tostring(html)
        print(type(html))
if __name__ == '__main__':
    page = int(input('输入爬取页数:'))
    chaidanzhuanjia = Chaidanzhuanjia(page)
    chaidanzhuanjia.request_page()
    chaidanzhuanjia.wash_data()

问题

  • Chaidanzhuanjia()类内如何将request_page(),return的html数据传到wash_data()函数里面进行清洗呢?
Jason990420
最佳答案

Revised as following,

import requests
from lxml import etree

class Chaidanzhuanjia(object):

    def __init__(self,times):
        self.times = int(times)
        self.headers = headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/'
                '537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36')
        }

    def request_page(self, url):
        """
        创建爬取函数
        """
        try:
            response = requests.get(url=url,headers=self.headers).text
            print(response)
        except:
            response = None
        return response

    def wash_data(self):
        """
        创建清洗数据函数
        根据页面规则构造页面url
        第一页 https://movie.douban.com/subject/30171424/reviews
        第二页 https://movie.douban.com/subject/30171424/reviews?start=20
        第三页 https://movie.douban.com/subject/30171424/reviews?start=40
        """
        for i in range(self.times):
            url = ('https://movie.douban.com/subject/30171424/reviews?start={}'
                .format(i*20))
            data = self.request_page(url)
            if data is not None:
                html = etree.HTML(data)
                html = etree.tostring(html).decode()
                print(type(html))

if __name__ == '__main__':
    try:
        pages = int(input('输入爬取页数:'))
    except:
        pages = 1
    chaidanzhuanjia = Chaidanzhuanjia(pages)
    chaidanzhuanjia.wash_data()
3年前 评论
Scrooge (楼主) 3年前
Scrooge (楼主) 3年前
Jason990420 (作者) 3年前
讨论数量: 1
Jason990420

Revised as following,

import requests
from lxml import etree

class Chaidanzhuanjia(object):

    def __init__(self,times):
        self.times = int(times)
        self.headers = headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/'
                '537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36')
        }

    def request_page(self, url):
        """
        创建爬取函数
        """
        try:
            response = requests.get(url=url,headers=self.headers).text
            print(response)
        except:
            response = None
        return response

    def wash_data(self):
        """
        创建清洗数据函数
        根据页面规则构造页面url
        第一页 https://movie.douban.com/subject/30171424/reviews
        第二页 https://movie.douban.com/subject/30171424/reviews?start=20
        第三页 https://movie.douban.com/subject/30171424/reviews?start=40
        """
        for i in range(self.times):
            url = ('https://movie.douban.com/subject/30171424/reviews?start={}'
                .format(i*20))
            data = self.request_page(url)
            if data is not None:
                html = etree.HTML(data)
                html = etree.tostring(html).decode()
                print(type(html))

if __name__ == '__main__':
    try:
        pages = int(input('输入爬取页数:'))
    except:
        pages = 1
    chaidanzhuanjia = Chaidanzhuanjia(pages)
    chaidanzhuanjia.wash_data()
3年前 评论
Scrooge (楼主) 3年前
Scrooge (楼主) 3年前
Jason990420 (作者) 3年前

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!