import zlib
import brotli
from base64 import b64encode
from urllib.parse import urlsplit
from urllib import request, error
def signal(message, show=True):
    """
    Act as a failure-reporting interface for the functions defined here.
    If you have a different GUI, you can redefine it.
    : Parameters
      message: string, the error message to report.
      show: print the message, or not.
    : Return - None
    : Raises
      TypeError if message is not a string.
    """
    if not isinstance(message, str):
        raise TypeError('message must be a string')
    if show:
        print(message)


def _default_headers(url):
    """Build the browser-like default headers, with Host taken from url."""
    return {
        'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                   'application/json,'
                   'image/webp,image/apng,*/*;q=0.8,'
                   'application/signed-exchange;v=b3;q=0.9'),
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Host': urlsplit(url).netloc,
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/80.0.3987.149 Safari/537.36'),
    }


def _decompress(data, content_encoding):
    """Undo the body compression named by a Content-Encoding value."""
    if content_encoding == 'gzip':
        # MAX_WBITS | 16 tells zlib to expect a gzip header and trailer.
        return zlib.decompress(data, zlib.MAX_WBITS | 16)
    if content_encoding == 'deflate':
        # Servers disagree on "deflate": try a raw stream first (the
        # historical behaviour here), then fall back to a zlib-wrapped one.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)
    if content_encoding == 'br':
        return brotli.decompress(data)
    # No (or an unrecognised) encoding: hand the body back untouched.
    return data


def read_URL(url, data=None, headers=None, encoding='utf-8', errors='ignore',
             user=None, password=None, byte=False):
    """
    Read text from URL
    Compression by gzip, deflate and br is handled internally.
    :Parameter
      url     : string, the URL to fetch.
      data    : an object specifying additional data to be sent, or None.
      headers : dictionary, headers of the HTTP request. It is not modified;
                browser-like defaults are used when it is empty or None.
      encoding: name of encoding to convert bytes into string.
      errors  : error process, 'strict', 'ignore', 'replace', ...
      user    : string, user name.
      password: string, password.
      byte    : flag for returning the raw bytes instead of decoding them.
    :Return
      (None, None) if failed, else (response, content), where content is a
      string, or bytes when byte is true.
    """
    # Work on a copy so adding Authorization never leaks into the caller's
    # dictionary.
    headers = dict(headers) if headers else _default_headers(url)

    # HTTP Basic authentication is added only when both parts are supplied.
    if user is not None and password is not None:
        credentials = ('%s:%s' % (user, password)).encode()
        headers['Authorization'] = (
            'Basic %s' % b64encode(credentials).decode('ascii'))

    try:
        req = request.Request(url, data=data, headers=headers)
    except (TypeError, ValueError):
        # Request rejects malformed URLs (e.g. a missing scheme).
        signal('ValueError')
        return None, None

    try:
        response = request.urlopen(req)
    except error.HTTPError:
        # HTTPError is a subclass of URLError, so it must be caught first.
        signal('HTTPError')
        return (None, None)
    except error.URLError:
        signal('URLError')
        return (None, None)

    if not str(response.status).startswith('2'):
        # Imported here because the module header does not provide it.
        from http import client
        status = response.status
        signal(client.responses.get(status, str(status)))
        return (None, None)

    data = response.read()
    # The header may be absent, and its value is case-insensitive.
    content_encoding = response.headers.get('Content-Encoding') or ''
    data = _decompress(data, content_encoding.strip().lower())

    html = data if byte else data.decode(encoding=encoding, errors=errors)
    return (response, html)
# Example: fetch a page and save its text to a local file.
filename = "d:/test.html"
url = "https://tieba.baidu.com/index.html"
response, html = read_URL(url)
# read_URL returns (None, None) on failure, so check before using response.
if response is not None and response.code == 200:
    # An explicit encoding keeps the write independent of the platform
    # default, which may be unable to represent the page's characters.
    with open(filename, "wt", encoding="utf-8") as f:
        f.write(html)
It depends on how you retrieve the content and what you want to save.
For example: