title_node=soup.find('dd',class_="lemmaWgt-lemmaTitle-title").find("h1") 有什么错?
title_node=soup.find('dd',class_="lemmaWgt-lemmaTitle-title").find("h1")
res_data['title']=title_node.get_text()
title_node=soup.find('dd',class_="lemmaWgt-lemmaTitle-title").find("h1")
res_data['title']=title_node.get_text()
2018-07-23
#错误信息已经很明确地告诉你了:你这个 soup 是 NoneType,没有实例化(也可能是 soup.find(...) 没有找到匹配的节点而返回了 None,再对它调用 .find("h1") 就会报错)。
下面是我的代码,希望对你有帮助
import re
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extract follow-up entry links and title/summary data from an HTML page."""

    def parse(self, page_url, html_cont):
        """Parse one downloaded page.

        Args:
            page_url: URL the page came from; relative links are resolved
                against it.
            html_cont: HTML content of the page, handed to BeautifulSoup with
                ``from_encoding='utf-8'``.

        Returns:
            A ``(new_urls, new_data)`` tuple, or ``None`` when either
            argument is ``None``.
        """
        if page_url is None or html_cont is None:
            return None

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs of all ``/item/...`` links in *soup*."""
        # Older pages used links such as /view/%a1%e7%ae%97%e6%9c%ba%
        # (the percent-escapes stand for Chinese characters).
        # NOTE(review): \w does not match '%', so an href that is
        # percent-encoded right after "/item/" is not picked up by this
        # pattern -- confirm whether that is intended.
        links = soup.find_all('a', href=re.compile(r'/item/\w+'))
        return {urllib.parse.urljoin(page_url, link['href']) for link in links}

    def _get_new_data(self, page_url, soup):
        """Collect the page URL, title and summary into a dict.

        ``find`` returns ``None`` when nothing matches, so every lookup is
        guarded: a page that lacks the title or summary node yields ``None``
        for that field instead of raising ``AttributeError`` on ``NoneType``.

        Returns:
            A dict with the keys ``'url'``, ``'title'`` and ``'summary'``.
        """
        res_data = {'url': page_url}

        # <dl class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"><h1>Python</h1>
        title_box = soup.find('dl', class_='lemmaWgt-lemmaTitle-')
        title_node = title_box.find("h1") if title_box is not None else None
        res_data['title'] = self._text_or_none(title_node)

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = self._text_or_none(summary_node)

        return res_data

    @staticmethod
    def _text_or_none(node):
        """Return ``node.get_text()``, or ``None`` when *node* is ``None``."""
        return node.get_text() if node is not None else None
举报