-
# Demo script: three ways to fetch the same page with urllib2.
#   1. plain urlopen(url)
#   2. a Request object carrying a custom header
#   3. an installed opener that keeps cookies in a CookieJar
try:  # Python 2 module names, as used by the original snippet
    import urllib2
    import cookielib
except ImportError:  # Python 3 moved both modules; alias them to the old names
    import urllib.request as urllib2
    import http.cookiejar as cookielib
from contextlib import closing

url = "http://www.imooc.com/video/10683/0"

# Method 1: the simplest call; print the status code and the body length.
print('one')
with closing(urllib2.urlopen(url)) as response1:
    print(response1.getcode())
    print(len(response1.read()))

# Method 2: build a Request first so a header can be attached to it.
print('two')
request = urllib2.Request(url)
request.add_header("user-agent", "mozilla/5.0")
with closing(urllib2.urlopen(request)) as response2:
    print(response2.getcode())
    print(len(response2.read()))

# Method 3: install a global opener with cookie handling, then fetch again.
# (The original label read 'there', a typo for 'three'.)
print('three')
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
with closing(urllib2.urlopen(url)) as response3:
    print(response3.getcode())
    print(cj)  # cookies collected by the jar during the request
    print(response3.read())
-
# Demo script: parse a small HTML document with BeautifulSoup, list every
# <a> tag, then look one up by a regular expression on its href.
import re

import bs4
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

print(BeautifulSoup)
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')

# All <a> tags in document order.
links = soup.find_all('a')
print('get all link')
for link in links:
    # get_text must be *called*; the original printed the bound method itself.
    print('%s %s %s' % (link.name, link['href'], link.get_text()))

# First <a> tag whose href matches the pattern "ill".
print('find by regexp')
link_node = soup.find('a', href=re.compile(r"ill"))
print('%s %s %s' % (link_node.name, link_node['href'], link_node.get_text()))
-
try:  # Python 2 module name, as used by the original snippet
    import urllib2
except ImportError:  # Python 3 moved urlopen to urllib.request
    import urllib.request as urllib2
from contextlib import closing


class HtmlDownloader(object):
    """Fetch the raw content of a page over HTTP."""

    def download(self, url):
        """Return the body of *url*, or None when it cannot be used.

        Args:
            url: Address to fetch. ``None`` is accepted and yields ``None``.

        Returns:
            The response body as returned by ``read()``, or ``None`` when
            *url* is ``None`` or the status code is not 200.

        Errors raised by ``urlopen`` itself are not caught here.
        """
        if url is None:
            return None

        # closing() releases the connection even if read() raises.
        with closing(urllib2.urlopen(url)) as response:
            # The original was missing the dot: "response getcode()".
            if response.getcode() != 200:
                return None
            return response.read()
-
html解析器查看全部
-
爬虫分析目标查看全部
-
urlparse模块,按照旧URL的模式拼接新URL查看全部
-
提取信息查看全部
-
名称,属性,内容,都可以查找,当做参数名传入查看全部
-
大型公司存在redis,高性能查看全部
-
soup = BeautifulSoup(html_doc):BeautifulSoup 后面括号里传入的是字符串变量名,它可以解析该字符串查看全部
-
特殊情景的处理器查看全部
-
网页解析器-结构化解析查看全部
-
Python的网页解析器查看全部
-
Python的网页解析器查看全部
-
URL管理器查看全部
举报
0/150
提交
取消