runfile('C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py', wdir='C:/Users/Administrator/Desktop/新建文件夹')
Reloaded modules: html_downloader, html_outputer, html_parser, url_manager
Traceback (most recent call last):
File "<ipython-input-21-acc2b5e5b102>", line 1, in <module>
runfile('C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py', wdir='C:/Users/Administrator/Desktop/新建文件夹')
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
execfile(filename, namespace)
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py", line 56, in <module>
obj_spider.craw(root_url)
File "C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py", line 27, in craw
self.urls.add_new_url(root_url)
AttributeError: 'SpiderMain' object has no attribute 'urls'
上面是报错信息,下面是代码。
import html_downloader
import html_outputer
import html_parser
import url_manager
class SpiderMain(object):
    """Crawler driver: wires together the URL manager, downloader, parser and outputer.

    Typical use:
        spider = SpiderMain()
        spider.craw(seed_url)
    """

    def __init__(self):
        # BUG FIX: the original method was named `_init_` (single underscores),
        # which Python never calls automatically. As a result none of these
        # attributes were ever created and craw() raised
        # "AttributeError: 'SpiderMain' object has no attribute 'urls'"
        # (exactly the traceback above). The constructor must be `__init__`.

        # URL manager: tracks new vs. already-crawled URLs.
        self.urls = url_manager.UrlManager()
        # Page downloader.
        self.downloader = html_downloader.HtmlDownloader()
        # Page parser: extracts follow-up URLs and page data.
        self.parser = html_parser.HtmlParser()
        # Collects parsed data and writes the final HTML report.
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Crawl starting from root_url, up to 1000 pages, then write the report.

        root_url: seed URL added to the URL manager before the loop starts.
        """
        count = 1
        # Seed the URL manager with the starting page.
        self.urls.add_new_url(root_url)
        # Keep crawling while the manager still has unvisited URLs.
        while self.urls.has_new_url():
            try:
                # Fetch the next unvisited URL.
                new_url = self.urls.get_new_url()
                print('craw %d:%s' % (count, new_url))
                # Download the page content.
                html_cont = self.downloader.download(new_url)
                # Parse out followed URLs and the page's data.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Feed newly discovered URLs back into the manager.
                self.urls.add_new_urls(new_urls)
                # Accumulate the parsed data for the final report.
                self.outputer.collect_data(new_data)
                if count == 1000:
                    print("finished")
                    break
                count = count + 1
                print(count)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate; message typo fixed ("ceaw" -> "craw").
                print("craw failed!")
        # Write out everything collected, even after a partial crawl.
        self.outputer.output_html()
if __name__ == "__main__":
    # Seed page for the crawl (a Baidu Baike entry).
    seed_url = "http://baike.baidu.com/view/21087.htm"
    spider = SpiderMain()
    spider.craw(seed_url)
添加回答
举报
0/150
提交
取消