Why does a crawled page saved as an HTML file show up blank when opened?

I crawled a page from Tieba, saved it as HTML, and opened it in a browser, but only the topmost row of text is displayed; everything else is blank. What is going on?

Replies: 1
Jason990420

Crawled a page from Tieba and then saved it as HTML

It depends on how, and what, you actually fetch and save.

For example:

import zlib
import brotli                      # third-party package: pip install brotli
from base64 import b64encode
from http import client           # for client.responses used below
from urllib.parse import urlsplit
from urllib import request, error

def signal(message, show=True):
    """
    Function as an failure interface for methods defined here. If you have
    different GUI, you can redefine it.
    : Parameters
      message: object, error message generally in string
      show: print the message, or not.
    : Return - None
    """
    if not isinstance(message, str):
        raise TypeError
    if show:
        print(message)
    return

def read_URL(url, data=None, headers=None, encoding='utf-8', errors='ignore',
             user=None, password=None, byte=False):
    """
    Read text from URL
    Compress method for gzip, deflate, br dealed internally.
    :Parameter
      url     : string or a Request object.
      data    : an object specifying additional data to be sent, or None.
      headers : dictionary, header of Http request entity.
      encoding: name of encoding to convert bytes into string.
      errors  : error process, 'strict', 'ignore', 'replace', ...
      user    : string, user name.
      password: string, password.
      byte    : flag for not decoding by encoding
    :Return
      (None, None) if failed, else response, string of html content
    """
    if not headers:
        url_base = urlsplit(url).netloc
        headers = {
            'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                       'application/json,'
                       'image/webp,image/apng,*/*;q=0.8,application/signed-exc'
                       'hange;v=b3;q=0.9'),
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
            'Host': url_base,
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb'
                           'Kit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.14'
                           '9 Safari/537.36')}
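    # Apply HTTP Basic authentication only when both user and password are given.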
    if user is not None and password is not None:
        auth = str.encode("%s:%s" % (user, password))
        user_and_pass = b64encode(auth).decode("ascii")
        headers['Authorization'] = 'Basic %s' % user_and_pass
    try:
        req = request.Request(url, data=data, headers=headers)
    except ValueError:
        signal('ValueError')
        return (None, None)
    try:
        response = request.urlopen(req)
    except error.HTTPError as e:
        signal('HTTPError')
        return (None, None)
    except error.URLError as e:
        signal('URLError')
        return (None, None)
    if not str(response.status).startswith('2'):
        signal(client.responses[response.status])
        return (None, None)

    data = response.read()

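    # Decompress the body according to the Content-Encoding response header
    # before decoding it to text.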
    content_encoding = response.headers['Content-Encoding']
    if content_encoding == 'gzip':
        data = zlib.decompress(data, zlib.MAX_WBITS | 16)
    elif content_encoding == 'deflate':
        data = zlib.decompress(data, -zlib.MAX_WBITS)
    elif content_encoding == 'br':
        data = brotli.decompress(data)
    html = data if byte else data.decode(encoding=encoding, errors=errors)

    return (response, html)

filename = "d:/test.html"
url = "https://tieba.baidu.com/index.html"

response, html = read_URL(url)
if response is not None and response.status == 200:
    # Write with the same encoding the page was decoded with,
    # otherwise the browser may show garbled or missing text.
    with open(filename, "wt", encoding='utf-8') as f:
        f.write(html)
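If you would rather avoid the text decode/encode round-trip entirely, the byte flag of read_URL above returns the decompressed content as raw bytes; a minimal sketch, reusing the same url and filename as above:

response, content = read_URL(url, byte=True)
if response is not None and response.status == 200:
    with open(filename, "wb") as f:   # binary mode, write the bytes unchanged
        f.write(content)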


1 month ago
