-
The workflow is simple.
-
I consolidated the instructor's several files into a single .py, to make it easier to adapt for crawling other sites.
#!/usr/bin/python
# coding=utf-8
import urllib2
import re
import urlparse
from bs4 import BeautifulSoup

# The results are written out as a Vue.js data file (js/spider-vue.js).
start_string = """var spider_vue = new Vue({
    el: '#spider',
    data: {
        items: [
"""
end_string = """]}})"""

root_url = "http://baike.baidu.com/item/Python"


def parse(url):
    """Download one page; return (outgoing /item/ links, page data)."""
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=re.compile(r'(.*)/item/(.*)'))
    p_urls = [urlparse.urljoin(root_url, a['href']) for a in links]
    data = {'url': url}
    # The title lives in <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
    data['title'] = soup.select("dd.lemmaWgt-lemmaTitle-title h1")[0].get_text()
    data['summary'] = soup.select("div.lemma-summary")[0].get_text()
    return p_urls, data


def craw(root_url):
    new_urls = set()  # URLs waiting to be crawled
    old_urls = set()  # URLs already crawled
    count = 0
    fout = open('js/spider-vue.js', 'w+')
    fout.write(start_string)
    new_urls.add(root_url)
    while new_urls and count < 5:  # stop after 5 pages
        count += 1
        url = new_urls.pop()
        old_urls.add(url)
        urls, new_data = parse(url)
        for u in urls:  # avoid shadowing the url being written out
            if u not in new_urls and u not in old_urls:
                new_urls.add(u)
        # Drop the trailing line and keep literal "\n" escapes inside
        # the generated JavaScript string.
        summary_lines = new_data['summary'].encode('utf-8').split('\n')[0:-1]
        fout.write("""{
    title: '%s',
    url: '%s',
    summary: '%s'
}""" % (new_data['title'].encode('utf-8'),
        new_data['url'].encode('utf-8'),
        '\\n'.join(summary_lines)))
        print '\n'.join(summary_lines)
        if count < 5:
            fout.write(",\n")
    fout.write(end_string)
    fout.close()  # the original never closed the file


def main():
    craw(root_url)


if __name__ == '__main__':
    main()
-
fout = open('output.html', 'w', encoding='utf-8')
# Good habit: always specify the encoding when opening a file for output,
# otherwise you may hit encoding errors. (Note: the encoding argument of the
# built-in open() is Python 3; in Python 2, use io.open or codecs.open.)
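If you are on Python 2 like the crawler script above, a minimal sketch of the same habit using io.open, which accepts an encoding argument on both Python 2 and 3 (the sample content is made up for illustration):

# -*- coding: utf-8 -*-
import io

# io.open accepts encoding= on Python 2 and 3; in text mode it
# expects unicode strings, hence the u'' literal.
fout = io.open('output.html', 'w', encoding='utf-8')
fout.write(u'<html><body>爬虫输出 (crawler output)</body></html>\n')
fout.close()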
-
URL manager
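The script above tracks URLs with two bare sets; a minimal sketch of the same URL-manager idea wrapped in a class (the class and method names here are illustrative, not necessarily the instructor's exact code):

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Skip URLs we have already queued or crawled.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url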
-
Target analysis
-
Steps
-
class_ is written with a trailing underscore to avoid clashing with the Python keyword class.
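For example, BeautifulSoup spells its CSS-class filter class_ for exactly this reason (the HTML snippet below is made up for illustration):

from bs4 import BeautifulSoup

html = '<div class="lemma-summary">Python is a language.</div>'
soup = BeautifulSoup(html, 'html.parser')
# class is a Python keyword, so the keyword argument is class_
node = soup.find('div', class_='lemma-summary')
print(node.get_text())  # -> Python is a language.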
-
Hahaha