Crawler crawls the URLs but outputs no HTML file and doesn't work properly?
craw 1:http://baike.baidu.com/view/21087.htm
craw failed collect_data() missing 1 required positional argument: 'data'
craw 1:http://baike.baidu.com/view/5078.htm
craw failed collect_data() missing 1 required positional argument: 'data'
craw 1:http://baike.baidu.com/view/113188.htm
craw failed collect_data() missing 1 required positional argument: 'data'
craw 1:http://baike.baidu.com/view/1020193.htm
craw failed collect_data() missing 1 required positional argument: 'data'
craw 1:http://baike.baidu.com/view/1483082.htm
craw failed collect_data() missing 1 required positional argument: 'data'
craw 1:http://baike.baidu.com/view/2753125.htm
craw failed collect_data() missing 1 required positional argument: 'data'
craw 1:http://baike.baidu.com/view/309208.htm
craw failed collect_data() missing 1 required positional argument: 'data'

Please help! I've been through the code once already and I can't see anything wrong with it.
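This TypeError almost always means collect_data is being called on the HtmlOutputer class itself rather than on an instance, so the data argument gets consumed by the self slot. The spider's scheduler code isn't shown here, so the assignment below is an assumption, but this minimal sketch reproduces the exact message:

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []
    def collect_data(self, data):
        self.datas.append(data)

outputer = HtmlOutputer       # bug: missing (), this binds the class, not an instance
outputer.collect_data({})     # TypeError: collect_data() missing 1 required positional argument: 'data'

outputer = HtmlOutputer()     # fix: create an instance first
outputer.collect_data({})     # works; {} is appended to outputer.datas

If the scheduler does something like self.outputer = HtmlOutputer with the parentheses missing, every collect_data call fails this way, output_html is never reached, and no output.html is written, which would explain both symptoms.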
Parser code:
def _get_new_data(self, page_url, soup):
    res_data = {}
    res_data['url'] = page_url
    # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
    title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
    if title_node is None:
        res_data['title'] = ''
        res_data['summary'] = ''
        return res_data
    else:
        title_node = title_node.find("h1")
        res_data['title'] = title_node.get_text()
    # <div class="lemma-summary" label-module="lemmaSummary">
    summary_node = soup.find('div', class_="lemma-summary")
    if summary_node is None:
        res_data['summary'] = ''
    else:
        res_data['summary'] = summary_node.get_text()
    return res_data
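For reference, the parser can be checked on its own, assuming _get_new_data sits in a class named HtmlParser as in the usual version of this tutorial (the class name and the sample HTML below are assumptions for illustration):

from bs4 import BeautifulSoup

html = ('<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
        '<div class="lemma-summary">An interpreted, high-level language.</div>')
soup = BeautifulSoup(html, 'html.parser')
data = HtmlParser()._get_new_data('http://baike.baidu.com/view/21087.htm', soup)
print(data)   # expect title 'Python' and the summary text in the dict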
Outputer code:
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<head><meta http-equiv="content-type" content="text/html;charset=utf-8"></head>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            # don't .encode('utf-8') here: the file is already opened with
            # encoding='utf-8', and "%s" on a bytes object would write b'...' literals
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
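With the encode calls removed, the outputer itself can be sanity-checked in isolation (the sample dict below is made up):

outputer = HtmlOutputer()   # note the (): an instance, not the class object
outputer.collect_data({'url': 'http://baike.baidu.com/view/21087.htm',
                       'title': 'Python',
                       'summary': 'test summary'})
outputer.output_html()      # writes output.html in the working directory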