运行spider_miam,报错,实在找不出错误了
from baike_spider import url_manager, html_downloader , html_parser, html_output class SpiderMain(object): def __init__(self): self.urls = url_manager.UrlManager() self.downloader = html_downloader.HtmlDownloader() self.parser = html_parser.HtmlParser() self.output = html_output.HtmlOutput() def craw(self, root_url): count = 1 self.urls.add_new_url(root_url) while self.urls.has_new_url(): try: new_url = self.urls.get_new_url() print "craw %d:%s"%(count,new_url) html_cont = self.downloader.download(new_url) new_urls,new_data = self.parser.parse(new_url,html_cont) self.urls.add_new_urls(new_urls) self.output.collect_data(new_data) if count == 1000: break count=count+1 except: print 'ceaw failed' self.output.ouput_html() #if _name_=="_main_": if __name__== "__main__": root_url = 'http://baike.baidu.com/item/Python' obj_spider = SpiderMain() obj_spider.craw(root_url)
接下来是url_manager
# coding:utf8 class UrlManager(object): def _init_(self): self.new_urls = set() self.old_urls = set() def add_new_url(self,url): if url is None: return if url not in self.new_urls and url not in self.old_urls: self.new_urls.add(url) def has_new_url(self): return len(self.new_urls)!=0 def get_new_url(self): new_url = self.new_urls.pop() self.old_urls.add(new_url) return new_url def add_new_urls(self,urls): if urls is None or len(urls)==0: return for url in urls: self.add_new_url(url)
报错提示是
Traceback (most recent call last):
File "C:\Users\whc\workspace\imooc����test\baike_spider\spider_main.py", line 43, in <module>
obj_spider.craw(root_url)
File "C:\Users\whc\workspace\imooc����test\baike_spider\spider_main.py", line 14, in craw
self.urls.add_new_url(root_url)
File "C:\Users\whc\workspace\imooc����test\baike_spider\url_manager.py", line 14, in add_new_url
if url not in self.new_urls and url not in self.old_urls:
AttributeError: 'UrlManager' object has no attribute 'new_urls'