# Source code for longling.spider.lib.get_html

# coding: utf-8
# 2020/3/23 @ tongshiwei

import urllib.request
from .utils import get_http_header, get_opener
from longling.spider.conf import logger

__all__ = ["get_html_code"]


def get_html_code(url: str) -> str:
    """Fetch ``url`` with an HTTP GET request and return its body as text.

    The request is built with the headers returned by ``get_http_header()``
    and sent through the opener returned by ``get_opener()``.

    Parameters
    ----------
    url: str
        Address of the page to download.

    Returns
    -------
    str
        The response body decoded as UTF-8. Byte sequences that are not
        valid UTF-8 are dropped rather than raising.

    Raises
    ------
    AssertionError
        If the response status is not 200, or if the response body is empty.
    """
    # Lazy logger arguments: the message is only formatted when DEBUG is on.
    logger.debug("request %s", url)

    req = urllib.request.Request(url=url, headers=get_http_header(), method='GET')
    ret = get_opener().open(req)
    try:
        # Raise explicitly instead of using ``assert`` so the check survives
        # ``python -O``; AssertionError is kept for backward compatibility
        # with callers that already catch it.
        if ret.status != 200:
            raise AssertionError(
                "error when request %s, error code %s" % (url, ret.status)
            )
        content = ret.read()
    finally:
        # Always release the underlying connection.
        # NOTE(review): assumes the object returned by ``opener.open`` exposes
        # ``close()``, as standard urllib responses do -- confirm for
        # custom openers.
        ret.close()

    if not content:
        raise AssertionError("error when request %s, null content" % url)

    # 'ignore' gives the same result as strict decoding for valid UTF-8 and
    # silently skips undecodable bytes otherwise, so one call covers both.
    return content.decode('utf-8', 'ignore')