Running a Python 2 tutorial in a Python 3 environment. After changing a few things I finally got it to run, but it still fails in the end. Sincerely asking for help.

#coding:utf8
from baike_spider import url_manager, html_outputer, html_downloader, \
    html_parser

print('http://baike.baidu.com/view/21087.htm')

class SpiderMain(object):
    def __int__(self):
        self.urls = url_manager.ulrmanager()            # initialize the component objects
        self.dowload = html_downloader.htmldownloader()
        self.parser = html_parser.htmlpaser()
        self.outputer = html_outputer.htmloutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_urls():
            try:
                new_url = self.urls.get_new_url()       # get a new url
                print('craw %d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)   # download the page
                new_urls, new_data = self.parser.paser(new_url, html_cont)
                self.urls.add_new_urls(new_urls)        # feed the new urls back in
                self.outputer.collect_data(new_data)
                if count == 100:
                    break
                count = count + 1
            except:
                print('erro 1')
        self.outputer.output_html()

# set the target url to crawl
if __name__ == "__main__":
    url = 'http://baike.baidu.com/view/21087.htm'
    obj_spider = SpiderMain()
    obj_spider.craw(url)
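From the code as posted, the most likely culprits are typos rather than Python 2 vs 3 differences: `__int__` is never called as a constructor (it should be `__init__`), so `self.urls` is never created and `self.urls.add_new_url(url)` raises AttributeError before the loop even starts; `self.dowload` does not match the `self.downloader` used in `craw`; and `has_new_urls` / `paser` may not match the method names defined in your url_manager and html_parser modules. The bare `except:` also hides the real traceback behind "erro 1". Below is a minimal corrected sketch of the main class; the class and method names inside baike_spider (UrlManager, HtmlDownloader, HtmlParser, HtmlOutputer, has_new_url, parse) are assumptions based on the usual version of this tutorial, so adjust them to whatever your own modules actually define.

# coding: utf8
# Minimal corrected sketch of SpiderMain. The class/method names imported
# from baike_spider are ASSUMPTIONS -- rename them to match your modules.
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):                                  # __init__, not __int__
        self.urls = url_manager.UrlManager()             # assumed class name
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():                   # assumed method name
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 100:
                    break
                count += 1
            except Exception as e:                       # show the real error instead of 'erro 1'
                print('craw failed:', e)
        self.outputer.output_html()


if __name__ == '__main__':
    root_url = 'http://baike.baidu.com/view/21087.htm'
    SpiderMain().craw(root_url)

If it still fails after these renames, the printed exception message should point to the remaining mismatch between this driver and your baike_spider modules.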