urlopen 请求的实现
前言
urlopen:是 Python 官方标准库 urllib.request
中的基本方法。
urlopen
urllib.request
模块源码
# 省略若干代码...
# Module-level cache for the default opener. urlopen() fills it in when it is
# called without any HTTPS-related argument and the cache is still empty.
_opener = None


##
# Open ``url`` and return the result of the selected opener's ``open()``.
#
# @url: the URL to open; forwarded unchanged to ``opener.open``
# @data: request body data (default None)
# @timeout: timeout forwarded to ``opener.open``; defaults to
#           ``socket._GLOBAL_DEFAULT_TIMEOUT``
# @cafile, @capath, @cadefault: deprecated HTTPS options; passing any of them
#           emits a DeprecationWarning
# @context: context handed to HTTPSHandler; must not be combined with
#           cafile/capath/cadefault
#
# Raises ValueError when ``context`` is combined with the deprecated CA
# arguments, or when those arguments are used while ``_have_ssl`` is false.
#
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    # Rebind the module-level cache rather than creating a local name.
    global _opener
    if cafile or capath or cadefault:
        import warnings
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        # The deprecated CA arguments and an explicit context are mutually
        # exclusive.
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        # Caller supplied a context: build a one-off opener around it.
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Most common path, first call: build the default opener and cache it.
        _opener = opener = build_opener()
    else:
        # Most common path, later calls: reuse the cached opener instead of
        # building a new one.
        opener = _opener
    # Delegate the actual request to the opener.
    return opener.open(url, data, timeout)
# 省略若干代码...
关于 global _opener
的想法:
一段优秀的代码,应当避免让 CPU 重复执行相同的指令,并充分利用好内存资源。去重复指令:指在程序运行的生命周期内,可能重复的 CPU 指令仅执行一次,并将结果放入内存;当再次需要该结果时,直接从内存中读取即可。这里的 `_opener` 正是如此:在未传入 HTTPS 相关参数的情况下,首次调用时创建并缓存 opener,之后的调用直接复用。
URL 打开器(opener)的创建
urllib.request
模块源码
# 省略若干代码...
##
# Create an OpenerDirector populated with the built-in handlers plus any
# handlers supplied by the caller, and return it.
#
# @*handlers: custom handlers, given either as classes or as instances
#             (variadic, e.g. build_opener(ProxyHandler, HTTPHandler)).
#             A custom handler replaces every built-in class that it is an
#             instance or a subclass of.
#
def build_opener(*handlers):
    opener = OpenerDirector()
    # Built-in handler classes registered by default.
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    # HTTPSHandler is only a default when http.client exposes HTTPSConnection.
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    # Collect the built-in classes that a caller-supplied handler overrides.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isinstance(check, type):
                # Handler passed as a class.
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                # Handler passed as an instance.
                skip.add(klass)
    # Drop the overridden built-ins so only the custom versions are used.
    for klass in skip:
        default_classes.remove(klass)

    # Instantiate and register the remaining built-in handlers.
    for klass in default_classes:
        opener.add_handler(klass())

    # Register the custom handlers, instantiating those given as classes.
    for h in handlers:
        if isinstance(h, type):
            h = h()
        opener.add_handler(h)
    return opener
# 省略若干代码...
OpenerDirector
的实例化
# 省略若干代码...
class OpenerDirector:
    def __init__(self):
        # Default User-agent header, built from the module's __version__.
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # Every handler that has been registered with this opener.
        self.handlers = []
        # Handlers that provide a <protocol>_open method, keyed by protocol.
        self.handle_open = {}
        # Handlers that provide <protocol>_error_<kind> methods, keyed by
        # protocol and then by kind.
        self.handle_error = {}
        # Handlers that provide a <protocol>_response method, keyed by
        # protocol.
        self.process_response = {}
        # Handlers that provide a <protocol>_request method, keyed by
        # protocol.
        self.process_request = {}
# 省略若干代码...
关于如何注册进 opener
中
# 省略若干代码...
class OpenerDirector:
    # ... (other members omitted in this excerpt) ...

    def add_handler(self, handler):
        """Register *handler* under every operation its method names declare.

        Attribute names of the form ``<protocol>_open``,
        ``<protocol>_request``, ``<protocol>_response`` and
        ``<protocol>_error_<kind>`` are discovered via ``dir(handler)`` and
        the handler is inserted into the matching lookup table. If at least
        one name matched, the handler is also added to ``self.handlers`` and
        ``handler.add_parent(self)`` is called.

        Raises:
            TypeError: if *handler* has no ``add_parent`` attribute.
        """
        # Every handler must expose add_parent so it can be linked back to
        # this opener.
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Becomes True once the handler is registered for any operation.
        added = False
        for meth in dir(handler):
            # These names fit the <protocol>_<condition> pattern but are not
            # registered as operations.
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                continue

            # Split the name at the first underscore,
            # e.g. "http_open" -> ("http", "open").
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "<protocol>_error_<kind>": kind is whatever follows the
                # second underscore, stored as an int when it is numeric.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                # Not an operation name; nothing to register.
                continue

            # Keep each handler list sorted; insort relies on the handlers'
            # own ordering comparison.
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        # Registered for at least one operation: record the handler on the
        # opener and give the handler a reference back to it.
        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)
-
注册结果截图
执行 opener 打开操作
urllib.request
模块源码
# 省略若干代码...
class OpenerDirector:
    # ... (other members omitted in this excerpt) ...

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Send a request through the registered handlers, return the response.

        Args:
            fullurl: a URL string, which is wrapped in a new ``Request``, or
                an existing request object, which is used as-is.
            data: request body; when not None it overrides the ``data`` of a
                request object passed as *fullurl*.
            timeout: value stored on the request as ``req.timeout``.

        Returns:
            The response produced by ``self._open`` after every registered
            ``<protocol>_response`` processor has been applied to it.
        """
        # Accept either a URL string or a ready-made request object.
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        # Protocol of the request (http, https, ftp, file, ...).
        protocol = req.type

        # Pre-process: let each registered <protocol>_request processor
        # transform the request.
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        # Core step: exchange the request for a response.
        response = self._open(req, data)

        # Post-process: let each registered <protocol>_response processor
        # transform the response.
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response
关于 self._open
后续操作简要说明一下
执行 _open
方法时,opener 会从已注册的 open 操作中,找到对应的协议 open 操作进行实际的 url 请求,例如 https 会执行 https_open
方法。
而 https_open
方法,实际调用的是父类 AbstractHTTPHandler
的 do_open
方法。
https 在传入 do_open
参数时,传入的是 http.client.HTTPSConnection
。http 的请求传入的是 http.client.HTTPConnection
。
如下:
class HTTPHandler(AbstractHTTPHandler):
    """Handler for the ``http`` protocol."""

    def http_open(self, req):
        # Plain HTTP: delegate to do_open with the HTTPConnection class.
        return self.do_open(http.client.HTTPConnection, req)

    # Request pre-processing is inherited from AbstractHTTPHandler.
    http_request = AbstractHTTPHandler.do_request_
# HTTPSHandler is only defined (and exported) when http.client exposes
# HTTPSConnection.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler for the ``https`` protocol."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            # Stored here and forwarded to the connection in https_open.
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            # HTTPS: delegate to do_open with the HTTPSConnection class and
            # the stored context / check_hostname settings.
            return self.do_open(http.client.HTTPSConnection, req,
                                context=self._context,
                                check_hostname=self._check_hostname)

        # Request pre-processing is inherited from AbstractHTTPHandler.
        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
注:HTTPSConnection 继承自 HTTPConnection。二者的区别之一在于默认端口:父类 HTTPConnection 的默认端口是 80,子类 HTTPSConnection 将其重定义为 443。
之后的操作,就是初始化 HTTPSConnection 类,执行其 request 方法
def do_open(self, http_class, req, **http_conn_args):
    """Send *req* over a new *http_class* connection, return the response.

    Args:
        http_class: connection class to instantiate (HTTPHandler passes
            ``http.client.HTTPConnection``, HTTPSHandler passes
            ``http.client.HTTPSConnection``).
        req: request object providing ``host``, ``timeout``, ``headers``,
            ``unredirected_hdrs``, ``selector``, ``data`` and related methods.
        **http_conn_args: extra keyword arguments passed to *http_class*.

    Returns:
        The object returned by the connection's ``getresponse()``, with
        ``url`` set to the request's full URL and ``msg`` set to its
        ``reason``.

    Raises:
        URLError: if the request has no host, or if sending the request
            raises OSError.
    """
    host = req.host
    if not host:
        raise URLError('no host given')

    # Instantiate the connection (HTTPConnection or HTTPSConnection).
    h = http_class(host, timeout=req.timeout, **http_conn_args)
    h.set_debuglevel(self._debuglevel)

    # Unredirected headers take precedence over the regular ones.
    headers = dict(req.unredirected_hdrs)
    headers.update({k: v for k, v in req.headers.items()
                    if k not in headers})

    # Always ask for the connection to be closed after this exchange, and
    # normalise header names to Title-Case.
    headers["Connection"] = "close"
    headers = {name.title(): val for name, val in headers.items()}

    if req._tunnel_host:
        # Tunnelled request: move Proxy-Authorization from the request
        # headers to the tunnel headers.
        tunnel_headers = {}
        proxy_auth_hdr = "Proxy-Authorization"
        if proxy_auth_hdr in headers:
            tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
            del headers[proxy_auth_hdr]
        h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

    try:
        try:
            # Send the request.
            h.request(req.get_method(), req.selector, req.data, headers,
                      encode_chunked=req.has_header('Transfer-encoding'))
        except OSError as err:  # timeout error
            raise URLError(err)
        r = h.getresponse()
    except:
        # On any failure, close the connection before re-raising.
        h.close()
        raise

    # Close and drop the connection's own socket handle now that the
    # response object has been obtained.
    if h.sock:
        h.sock.close()
        h.sock = None

    r.url = req.get_full_url()
    r.msg = r.reason
    return r
而在执行请求的时候,会将 HTTP 和 HTTPS 协议的请求头报文和请求体(如果有)编码为字节串(即把 ASCII 字符编码为对应的二进制数据)后再发送
http.client
模块源码
def _send_output(self, message_body=None, encode_chunked=False):
    """Join the buffered lines with CRLF, clear the buffer and send them.

    ``message_body`` and ``encode_chunked`` are not used in this excerpt.
    """
    # Two empty items make the joined message end with "\r\n\r\n".
    self._buffer.extend((b"", b""))
    msg = b"\r\n".join(self._buffer)
    # Empty the buffer in place.
    del self._buffer[:]
    self.send(msg)
msg 内容示例
b'GET /tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: storage.googleapis.com\r\nUser-Agent: Python-urllib/3.7\r\nConnection: close\r\n\r\n'
接下来,就是调用 Python 的 socket 模块,通过系统底层的 socket 接口,打开到远程主机的 TCP 或 TLS 连接,并将报文发送到远程主机的 80 端口或者 443 端口。