
Crawler: You provided Unicode markup but also provided a value for from_encoding

活在篮子里的人 2017-02-21 11:33:29
#coding:utf-8
"""@author 篮子里"""

# spider_main.py
from baike_spider import html_downloader, html_outputer, html_parser, url_manager


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw {0}:{1}".format(count, new_url))
                html_cont = self.downloader.download(new_url)
                # parse() returns a set of URLs plus the page data,
                # so queue them with add_new_urls (plural)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count += 1
            except Exception:
                print("Failed!")
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/subview/16030/16030.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)


# url_manager.py
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


# html_downloader.py
import requests


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        r = requests.get(url)
        if r.status_code != 200:
            return None
        # In Python 3, r.text is already a decoded str;
        # calling .decode('UTF-8') on it raises AttributeError
        return r.text


# html_parser.py
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin  # Python 3 replacement for urlparse2


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        links = soup.find_all("a", href=re.compile(r"/subview/\d+/\d+\.htm"))
        for link in links:
            new_url = link["href"]
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls  # return the whole set, not just the last URL

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data["url"] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>科比·布莱恩特</h1>
        title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data["title"] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find("div", class_="lemma-summary")
        # store under the "summary" key instead of overwriting res_data
        res_data["summary"] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data


# html_outputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open("output.html", "w", encoding="utf-8")
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>{0}</td>".format(data["url"]))
            # the file is opened with encoding="utf-8", so write str directly;
            # .encode("UTF-8") would write a bytes repr like b'...'
            fout.write("<td>{0}</td>".format(data["title"]))
            fout.write("<td>{0}</td>".format(data["summary"]))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
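For reference, the warning in the title is raised by BeautifulSoup whenever it receives an already-decoded str together with a from_encoding argument. A minimal sketch (not taken from the post above; it assumes the page is fetched with requests, as in the downloader):

import requests
from bs4 import BeautifulSoup

r = requests.get("http://baike.baidu.com/subview/16030/16030.htm")
# r.text is already a decoded str in Python 3, so also passing
# from_encoding makes BeautifulSoup emit the UserWarning
soup = BeautifulSoup(r.text, "html.parser", from_encoding="utf-8")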

1 Answer

二毛毛



The error:

UserWarning: You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.

The fix:
soup = BeautifulSoup(html_doc,"html.parser")

That is, delete from_encoding="utf-8" from this call.

The reason:

In Python 3, str is Unicode by default, so the markup you pass to BeautifulSoup is already decoded; any from_encoding value is therefore ignored, which is exactly what the warning says. Just remove the from_encoding="utf-8" argument.
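A minimal sketch of both variants that avoid the warning (illustrative names; r is a requests response, as in the question's downloader):

from bs4 import BeautifulSoup
import requests

r = requests.get("http://baike.baidu.com/subview/16030/16030.htm")

# Fix from the answer: pass the already-decoded str, omit from_encoding
soup = BeautifulSoup(r.text, "html.parser")

# from_encoding only applies to raw bytes, where nothing has been
# decoded yet; this form does not trigger the warning either
soup = BeautifulSoup(r.content, "html.parser", from_encoding="utf-8")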

2017-03-16