The crawl fails right away. What could be the reason? This is the output:

craw 1 : http://baike.baidu.com/view/21087.htm
craw failed

Process finished with exit code 0
2016-05-14
def _get_new_urls(self, page_url, soup):
    Movie_urls = set()
    # ./nowplaying/
    # "div#screening div.screening-hd h2 span a[abs:href]"
    MovieNode = soup.find_all('div', class_='screening-hd').find('h2').find_all('a')
    for link in MovieNode:
        new_url = link['href']
        # join with page_url to turn the relative url into a complete one
        new_full_url = urlparse.urljoin(page_url, new_url)
        Movie_urls.add(new_full_url)
    print Movie_urls
    return Movie_urls
I now want to change the code slightly to crawl the Douban Movies front page, following the two links for "now playing" and "coming soon", and the same problem comes up again. So far I have only changed this part of the code. Could you take a look: is the way MovieNode is obtained wrong? Why does it get nothing? Thanks.
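For what it's worth, find_all() returns a ResultSet (a list of tags), not a single tag, so chaining .find('h2') onto it raises an AttributeError before any links are read. Below is a minimal sketch of the lookup using find() for the single container, assuming the page really has a div with class "screening-hd" wrapping an h2 that holds the links (the class names come from the snippet above and are not verified against the live Douban page):

# -*- coding: utf-8 -*-
import urlparse

def get_movie_urls(page_url, soup):
    movie_urls = set()
    # find() returns one tag or None, so it can be chained; find_all() cannot
    screening_hd = soup.find('div', class_='screening-hd')
    if screening_hd is None:
        return movie_urls
    h2_node = screening_hd.find('h2')
    if h2_node is None:
        return movie_urls
    for link in h2_node.find_all('a'):
        href = link.get('href')
        if href:
            # turn a relative href such as ./nowplaying/ into an absolute URL
            movie_urls.add(urlparse.urljoin(page_url, href))
    return movie_urls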
''' Outputer: writes the collected data out as an HTML table '''
class HtmlOutputer(object):
    def __init__(self):
        # holds the collected data
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        # close the file so the content is actually flushed to disk
        fout.close()
html_outputer.py
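A quick smoke test for the outputer on its own (the record below is made up purely for illustration):

# -*- coding: utf-8 -*-
from baike_spider import html_outputer

outputer = html_outputer.HtmlOutputer()
# one fake record shaped like the dict the parser returns
outputer.collect_data({
    'url': u'http://baike.baidu.com/view/21087.htm',
    'title': u'Python',
    'summary': u'example summary text',
})
outputer.output_html()   # writes output.html into the current directory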
''' URL manager: keeps two collections, the URLs waiting to be crawled and the URLs already crawled '''
class UrlManager(object):
    # initialize both sets in the constructor
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # add a single new url to the manager
    def add_new_url(self, url):
        # the url is empty
        if url is None:
            return
        # the url is in neither the to-crawl set nor the already-crawled set
        if url not in self.new_urls and url not in self.old_urls:
            # so it is a new URL that still needs to be crawled
            self.new_urls.add(url)

    # add a batch of urls to the manager
    def add_new_urls(self, urls):
        # the batch is empty or has length 0
        if urls is None or len(urls) == 0:
            return
        # take each url and reuse the single-url method
        for url in urls:
            self.add_new_url(url)

    # does the manager still hold urls waiting to be crawled?
    def has_new_url(self):
        # a non-empty to-crawl set means there is a new URL to crawl
        return len(self.new_urls) != 0

    # take one url that has not been crawled yet
    def get_new_url(self):
        # fetch a URL and remove it from the to-crawl set
        new_url = self.new_urls.pop()
        # move it into the already-crawled set
        self.old_urls.add(new_url)
        # return it
        return new_url
url_manager.py
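A short check of the manager's behaviour (the second URL is just a placeholder in the /view/123.htm pattern):

from baike_spider import url_manager

urls = url_manager.UrlManager()
urls.add_new_url('http://baike.baidu.com/view/21087.htm')
urls.add_new_urls(['http://baike.baidu.com/view/21087.htm',    # duplicate, silently ignored
                   'http://baike.baidu.com/view/123.htm'])
while urls.has_new_url():
    print urls.get_new_url()    # each URL comes out exactly once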
from baike_spider import url_manager, html_downloader, html_parser,\
    html_outputer

class SpiderMain(object):
    # constructor
    def __init__(self):
        # URL manager
        self.urls = url_manager.UrlManager()
        # HTML downloader
        self.downloader = html_downloader.HtmlDownloader()
        # parser
        self.parser = html_parser.HtmlParser()
        # outputer
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        # counts which URL is currently being crawled
        count = 1
        # add the entry url
        self.urls.add_new_url(root_url)
        # keep going while the url manager has a new url
        while self.urls.has_new_url():
            # some links may be unreachable or otherwise cause problems
            try:
                # get the url of the page to crawl
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                # let the downloader fetch the page
                html_cont = self.downloader.download(new_url)
                # the parser takes the current url and the downloaded content
                # and returns the list of new urls plus the extracted data
                new_urls, new_data = self.parser.parser(new_url, html_cont)
                # queue the new urls
                self.urls.add_new_urls(new_urls)
                # collect the data
                self.outputer.collect_data(new_data)
                # only crawl 1000 pages
                if count == 1000:
                    break
                count = count + 1
            except:
                print 'craw error'
        # write out the collected data
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    # start the spider
    obj_spider.craw(root_url)
spider_main.py
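One thing about spider_main.py that makes this hard to debug: the bare except swallows the real exception, so all you ever see is the one-line failure message after the first URL. A standalone illustration (not part of the course code) of why printing the exception and traceback helps:

# -*- coding: utf-8 -*-
import traceback

def crawl_one(url):
    # stand-in for the download/parse step; always fails for demonstration
    raise ValueError('simulated parse error for %s' % url)

url = 'http://baike.baidu.com/view/21087.htm'

try:
    crawl_one(url)
except:                    # bare except: we only learn that *something* failed
    print 'craw failed'

try:
    crawl_one(url)
except Exception as e:     # keep the exception and show where it came from
    print 'craw failed: %s' % e
    traceback.print_exc()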
from bs4 import BeautifulSoup
import re
import urlparse

''' Parser '''
class HtmlParser(object):
    # collect all related urls found on the page
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # look for urls of the form /view/123.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # join new_url with page_url to build a complete url
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # title: <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # summary: <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    # parse html_cont into two results: the list of new urls and the data
    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
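To narrow down where the failure happens, the downloader and parser can be exercised on their own, outside the try/except, so any exception surfaces with a full traceback. This sketch reuses HtmlDownloader.download() and HtmlParser.parser() exactly as spider_main.py calls them:

# -*- coding: utf-8 -*-
from baike_spider import html_downloader, html_parser

downloader = html_downloader.HtmlDownloader()
parser = html_parser.HtmlParser()

root_url = 'http://baike.baidu.com/view/21087.htm'
html_cont = downloader.download(root_url)
print 'downloaded %s bytes' % (len(html_cont) if html_cont else 0)

# no try/except here, so a parsing error shows exactly which line raised it
new_urls, new_data = parser.parser(root_url, html_cont)
print len(new_urls), new_data['title']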
class HtmlParser(object):

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # /view/123.htm
        links = soup.find_all('a', href=re.compiler(r"/view/\d+.html"))
        for link in links:
            new_url = link['href']
            # join with page_url to complete the relative url
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # lemma-summary
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
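Looking at this version of the parser, a few things would make the very first crawl fail: re.compiler does not exist in the re module (it should be re.compile, so this line raises AttributeError on the first page); the pattern "/view/\d+.html" looks for .html while the Baike lemma links end in .htm, and the unescaped dot matches any character; and the method is named parse here while spider_main.py calls self.parser.parser(new_url, html_cont), so the two names have to agree. A sketch of the corrected _get_new_urls, assuming the imports and the rest of the class stay as in the course version above:

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # re.compile (not re.compiler); lemma links look like /view/123.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # complete the relative url against page_url
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls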