urlopen 请求的实现

未匹配的标注

前言

urlopen 是 Python 官方标准库 urllib.request 模块中的基础函数。

图片

图片

官方地址

urlopen

urllib.request 模块源码


# 省略若干代码...

# Module-level cache for the default opener built by urlopen().
_opener = None
##
# @url: locator of the network resource to open
# @data: data for the request body
# @timeout: timeout; defaults to socket._GLOBAL_DEFAULT_TIMEOUT
# @cafile,@capath,@cadefault,@context: HTTPS-related parameters (rarely used)
#
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):

    # The default opener is cached in the module-level _opener variable.
    global _opener

    if cafile or capath or cadefault:
        # Deprecated CA arguments: warn, then turn them into an SSL context.
        import warnings
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        # The CA arguments cannot be combined with an explicit context.
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        dedicated = True
    else:
        # A caller-supplied context also needs its own HTTPS opener.
        dedicated = bool(context)

    if dedicated:
        # Build a one-off opener around the context; it is not cached.
        opener = build_opener(HTTPSHandler(context=context))
    else:
        # Common path: build the default opener once, then reuse it so it is
        # not instantiated again on later calls.
        if _opener is None:
            _opener = build_opener()
        opener = _opener

    # Delegate the actual URL request to the opener's open method.
    return opener.open(url, data, timeout)

# 省略若干代码...

关于 global _opener 的想法:

一段优秀的代码,一定会避免让 CPU 重复执行相同的指令,并充分利用好内存资源。去重复指令:指在程序运行的生命周期内,可能重复的 CPU 指令仅执行一次,并将结果放入内存;当再次需要该结果时,直接从内存中取就可以了。

URL 开启人(opener)的创建

urllib.request 模块源码

# 省略若干代码...

##
# @*handlers: custom handlers, given as classes or instances (variadic,
#             e.g. build_opener(ProxyHandler, HTTPHandler))
#
def build_opener(*handlers):

    def supersedes(custom, default_cls):
        # A custom handler replaces a default one when it is that class or a
        # subclass of it (class given), or an instance of it (instance given).
        if isinstance(custom, type):
            return issubclass(custom, default_cls)
        return isinstance(custom, default_cls)

    # The opener that will collect all handlers.
    director = OpenerDirector()

    # Built-in handlers registered by default.
    builtin_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]

    # HTTPSHandler is only a default when http.client exposes HTTPSConnection.
    if hasattr(http.client, "HTTPSConnection"):
        builtin_classes.append(HTTPSHandler)

    # Drop every built-in that a caller-supplied handler overrides.
    kept_classes = [
        default_cls for default_cls in builtin_classes
        if not any(supersedes(custom, default_cls) for custom in handlers)
    ]

    # Instantiate and register the remaining built-ins first ...
    for default_cls in kept_classes:
        director.add_handler(default_cls())

    # ... then the custom handlers, instantiating those passed as classes.
    for custom in handlers:
        instance = custom() if isinstance(custom, type) else custom
        director.add_handler(instance)

    return director

# 省略若干代码...

OpenerDirector 的实例化

# 省略若干代码...

class OpenerDirector:
    def __init__(self):
        """Set the default request header and create the empty registries."""
        # User-Agent header identifying this client, built from __version__.
        user_agent = ('User-agent', "Python-urllib/%s" % __version__)
        self.addheaders = [user_agent]
        # List of registered handlers.
        self.handlers = []
        # Registry of handlers that perform the "open" operation.
        self.handle_open = {}
        # Registry of handlers for errors (e.g. HTTP redirects and failures).
        self.handle_error = {}
        # Registries of handlers that post-process responses and
        # pre-process requests.
        self.process_response, self.process_request = {}, {}

# 省略若干代码...

关于如何注册进 opener

# 省略若干代码...

class OpenerDirector:

    # (some code omitted)...

    def add_handler(self, handler):
        """Register *handler* under every protocol hook it implements.

        Each attribute name of the handler is split at its first underscore
        into ``<protocol>_<condition>`` and dispatched on the condition:

        * ``error...``  -> ``self.handle_error[protocol][kind]``, where
          ``kind`` is the text after the underscore following ``error``
          (converted to ``int`` when possible, e.g. ``http_error_404``
          registers under ``404``)
        * ``open``      -> ``self.handle_open[protocol]``
        * ``response``  -> ``self.process_response[protocol]``
        * ``request``   -> ``self.process_request[protocol]``

        Names without a non-empty ``<protocol>_`` prefix are ignored, so a
        method literally called ``open`` or ``request`` is not registered
        under a truncated, meaningless protocol key.

        If at least one hook was registered, the handler is inserted into
        ``self.handlers`` and ``handler.add_parent(self)`` is called.

        Raises:
            TypeError: if *handler* has no ``add_parent`` attribute.
        """
        # Every handler must offer add_parent so it can be linked back to
        # this opener.
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Nothing registered yet.
        added = False

        # Walk over all attribute and method names of the handler.
        for meth in dir(handler):

            # These names match the pattern by coincidence; never register.
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                continue

            # Split the name into protocol and condition,
            # e.g. http_open --> http, open.
            i = meth.find("_")
            if i < 1:
                # No underscore (find() == -1) or a leading underscore: there
                # is no protocol prefix.  Slicing with -1 would turn "open"
                # into protocol "ope" / condition "open" and register junk.
                continue
            protocol = meth[:i]
            condition = meth[i+1:]

            # Conditions starting with "error" go into handle_error, keyed by
            # protocol and then by error kind.
            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    # Non-numeric kinds stay as strings.
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            # "open" hooks go into handle_open.
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            # "response" hooks go into process_response.
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            # "request" hooks go into process_request.
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            # Not a recognised hook; skip.
            else:
                continue

            # Perform the registration; insort keeps a non-empty list sorted
            # by the handlers' own ordering.
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        # If any hook was registered, record the handler on the opener and
        # give the handler a reference back to this opener.
        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)
  • 注册结果截图

    TensorFlow

执行 opener 打开操作

urllib.request 模块源码

# 省略若干代码...

class OpenerDirector:

    # (some code omitted)...

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* and return the response after all processors ran.

        *fullurl* is either a URL string, which is wrapped in a Request, or
        an already-built request object, whose ``data`` is overwritten only
        when *data* is not None.
        """
        # Normalise the input into a request object.
        if not isinstance(fullurl, str):
            req = fullurl
            if data is not None:
                req.data = data
        else:
            req = Request(fullurl, data)

        # Attach the timeout to the request.
        req.timeout = timeout

        # Request protocol (http, https, ftp, file, ...); read once and used
        # for both the request and the response processor lookup.
        scheme = req.type

        # Let every registered <scheme>_request processor refine the request.
        pre_hook = scheme + "_request"
        for handler in self.process_request.get(scheme, []):
            req = getattr(handler, pre_hook)(req)

        # Core step: exchange the request for a response.
        response = self._open(req, data)

        # Let every registered <scheme>_response processor refine the
        # response.
        post_hook = scheme + "_response"
        for handler in self.process_response.get(scheme, []):
            response = getattr(handler, post_hook)(req, response)

        return response

关于 self._open 后续操作简要说明一下

执行 _open 方法时,opener 会从已注册的 open 操作中,找到对应的协议 open 操作进行实际的 url 请求,例如 https 会执行 https_open 方法。

https_open 方法,实际调用的是父类 AbstractHTTPHandler 的 do_open 方法。

https 在传入 do_open 参数时,传入的是 http.client.HTTPSConnection。http 的请求传入的是 http.client.HTTPConnection

如下:

class HTTPHandler(AbstractHTTPHandler):
    """Handler for the http protocol, built on AbstractHTTPHandler."""

    # Request pre-processing is inherited unchanged from the base class.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        # Plain HTTP: do_open is given the HTTPConnection class.
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)

# HTTPSHandler is only defined (and exported) when http.client provides
# HTTPSConnection.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler for the https protocol, built on AbstractHTTPHandler."""

        # Request pre-processing is inherited unchanged from the base class.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            # Stored as given and forwarded to the connection in https_open.
            self._check_hostname = check_hostname
            self._context = context

        def https_open(self, req):
            # HTTPS: do_open is given the HTTPSConnection class together
            # with the stored TLS settings.
            tls_options = {
                "context": self._context,
                "check_hostname": self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **tls_options)

    __all__.append('HTTPSHandler')

注:HTTPSConnection 继承自 HTTPConnection。父类 HTTPConnection 的默认端口是 80,子类 HTTPSConnection 将默认端口重定义为 443。

之后的操作,就是初始化 HTTPSConnection 类,执行其 request 方法

    def do_open(self, http_class, req, **http_conn_args):
        """Send *req* over a new *http_class* connection; return the response.

        *http_class* is the connection class to instantiate and
        *http_conn_args* are extra keyword arguments forwarded to it.

        Raises URLError when the request has no host, or when sending the
        request raises OSError.  Any exception raised while sending or while
        reading the response closes the connection before propagating.
        """
        target_host = req.host
        if not target_host:
            raise URLError('no host given')

        # Instantiate the connection class supplied by the caller.
        conn = http_class(target_host, timeout=req.timeout, **http_conn_args)
        conn.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence; regular headers only fill in
        # names that are not present yet.
        outgoing = dict(req.unredirected_hdrs)
        for key, value in req.headers.items():
            outgoing.setdefault(key, value)

        # Always ask for the connection to be closed, then normalise every
        # header name to title case.
        outgoing["Connection"] = "close"
        outgoing = {key.title(): value for key, value in outgoing.items()}

        if req._tunnel_host:
            # When tunnelling, Proxy-Authorization is moved out of the
            # request headers and passed to set_tunnel instead.
            proxy_auth_hdr = "Proxy-Authorization"
            tunnel_headers = {}
            if proxy_auth_hdr in outgoing:
                tunnel_headers[proxy_auth_hdr] = outgoing.pop(proxy_auth_hdr)
            conn.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                # Perform the request.
                conn.request(
                    req.get_method(), req.selector, req.data, outgoing,
                    encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err:  # timeout error
                raise URLError(err)
            response = conn.getresponse()
        except BaseException:
            # Whatever went wrong, release the connection and re-raise.
            conn.close()
            raise

        # Close and drop the connection's socket if it is still set.
        if conn.sock:
            conn.sock.close()
            conn.sock = None

        # Expose the final URL and the reason phrase on the response.
        response.url = req.get_full_url()
        response.msg = response.reason
        return response

而在执行请求的时候,会将 HTTPS 和 HTTP 协议的请求头报文和请求体(如果有)编码为字节串(bytes,即把文本字符转换为对应的二进制数据)后再发送。

http.client 模块源码

    def _send_output(self, message_body=None, encode_chunked=False):
        """Join the buffered lines with CRLF, empty the buffer, and send them.

        Two empty entries are appended first, so the joined message ends with
        a blank line.  In this excerpt *message_body* and *encode_chunked*
        are not used.
        """
        pending = self._buffer
        pending.extend((b"", b""))
        payload = b"\r\n".join(pending)
        # Empty the shared buffer in place before sending.
        pending.clear()
        self.send(payload)

msg 内容示例

b'GET /tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: storage.googleapis.com\r\nUser-Agent: Python-urllib/3.7\r\nConnection: close\r\n\r\n'

接下来,就是调用 Python 的 socket 模块,切合系统底层 socket 接口,打开远程 TCP 或 TLS 连接。将报文发送给远程的 80 端口或者 443 端口。

本文章首发在 LearnKu.com 网站上。

上一篇 下一篇
讨论数量: 0
发起讨论 查看所有版本


暂无话题~