关于urllib.request.urlopen的编码问题
python3.5.2,环境是pycharm
去掉了spider_main中的try-except后发现报错如下:
Traceback (most recent call last): File "S:/Python Learning/baike_spider/spider_main.py", line 43, in <module> obj_spider.craw(root_url) # 启动爬虫 File "S:/Python Learning/baike_spider/spider_main.py", line 26, in craw html_cont = self.downloader.download(new_url) File "S:\Python Learning\baike_spider\html_downloader.py", line 10, in download resp = request.urlopen(url) File "E:\Tools\Python\Python3.5\lib\urllib\request.py", line 163, in urlopen return opener.open(url, data, timeout) File "E:\Tools\Python\Python3.5\lib\urllib\request.py", line 466, in open response = self._open(req, data) File "E:\Tools\Python\Python3.5\lib\urllib\request.py", line 484, in _open '_open', req) File "E:\Tools\Python\Python3.5\lib\urllib\request.py", line 444, in _call_chain result = func(*args) File "E:\Tools\Python\Python3.5\lib\urllib\request.py", line 1297, in https_open context=self._context, check_hostname=self._check_hostname) File "E:\Tools\Python\Python3.5\lib\urllib\request.py", line 1254, in do_open h.request(req.get_method(), req.selector, req.data, headers) File "E:\Tools\Python\Python3.5\lib\http\client.py", line 1107, in request self._send_request(method, url, body, headers) File "E:\Tools\Python\Python3.5\lib\http\client.py", line 1142, in _send_request self.putrequest(method, url, **skips) File "E:\Tools\Python\Python3.5\lib\http\client.py", line 984, in putrequest self._output(request.encode('ascii')) UnicodeEncodeError: 'ascii' codec can't encode characters in position 10-12: ordinal not in range(128)
html_downloader模块代码:
# coding:utf-8 from urllib import request class HtmlDownloader(object): def download(self, url): if url is None: return None resp = request.urlopen(url) if resp.getcode() != 200: return None return resp.read()
查了很多但越改越乱,求解谢谢!