# Simple link crawler: download a page, extract the links it contains and
# follow the ones that match a pattern.
#
# Reported problem (summary of the original question):
#   Running
#       link_crawler('http://example.webscraping.com/', '/(index|view)')
#   printed "Downloading: http://example.webscraping.com/" and then failed in
#   get_links() at `webpage_regex.findall(html)` with
#       TypeError: expected string or buffer
#   The asker (a beginner) wanted to know what was wrong.
#
# Diagnosis: `html` was None when it reached `findall`.  download() returns
# None whenever it cannot fetch a page, and the crawler passed that value on
# unchecked.  NOTE(review): no "Download error:" line was printed before the
# traceback, so most likely `return html` was indented inside the `except`
# branch in the asker's file, making even successful downloads return None --
# the indentation cannot be confirmed from the collapsed source.  Below,
# `return html` sits at function level and None is handled explicitly.
import re

try:  # Python 2.7 module names (what the original script used)
    import urllib2
    import urlparse
    from urllib2 import URLError
except ImportError:  # Python 3: same functionality, bound to the same names
    import urllib.parse as urlparse
    import urllib.request as urllib2
    from urllib.error import URLError

# Compiled once at import time instead of on every get_links() call.
_WEBPAGE_REGEX = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)


def download(url, num_retries=2):
    """Fetch *url* and return the response body, or None on failure.

    Args:
        url: address of the page to download.
        num_retries: how many more times to retry when the server answers
            with a 5xx HTTP status.

    Returns:
        Whatever ``urlopen(url).read()`` returns, or None when the download
        failed and no retry succeeded.
    """
    print('Downloading: %s' % (url,))
    try:
        html = urllib2.urlopen(url).read()
    except URLError as e:
        print('Download error: %s' % (e.reason,))
        html = None
        # Only 5xx responses are retried; other failures are returned as None.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, num_retries - 1)
    # Must stay at function level: if this line is indented into the `except`
    # branch, every successful download returns None.
    return html


def get_links(html):
    """Return the ``href`` values of all ``<a>`` tags found in *html*.

    Args:
        html: page source as text or bytes; None or an empty value yields an
            empty list instead of raising TypeError.

    Returns:
        A list of the raw href strings, in document order.
    """
    if not html:
        return []
    # On Python 3 the downloaded body is bytes, which a text pattern cannot
    # search; on Python 2 `bytes is str`, so the value is used unchanged.
    if bytes is not str and isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')
    return _WEBPAGE_REGEX.findall(html)


def link_crawler(seed_url, link_regex):
    """Crawl from *seed_url*, following links that match *link_regex*.

    Args:
        seed_url: URL to start from; also the base for resolving links.
        link_regex: pattern applied with ``re.match`` to each raw href.
    """
    crawl_queue = [seed_url]
    # Remember every URL already queued so pages that link to each other do
    # not keep the crawl running forever.
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            # Download failed; there is nothing to parse for this URL.
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
添加回答
举报
0/150
提交
取消