首页手记 ---简单图片爬虫------

---简单图片爬虫------

标签：

Python

# -*- coding:UTF-8 -*-

import urllib2
import urllib
import re
from bs4 import BeautifulSoup

def http_web(url_1):
    """Fetch a web page and return its raw body.

    Args:
        url_1: URL of the page to download.

    Returns:
        The response body exactly as returned by ``read()``.

    Errors raised by ``urllib2.urlopen`` (for example ``urllib2.URLError``)
    are not caught here and propagate to the caller.
    """
    request_1 = urllib2.Request(url_1)
    # Send a browser-like User-Agent header with the request.
    request_1.add_header("user-agent", "Mozilla/5.0")
    opendz = urllib2.urlopen(request_1)
    try:
        return opendz.read()
    finally:
        # Release the connection even when read() fails.
        opendz.close()

def screen_link_1(readdz):
    """Extract ``http://....html`` links from page text and save them.

    Every distinct link is printed together with its index and written on
    its own line to ``http.txt`` (the file is overwritten), in order of
    first appearance.

    Args:
        readdz: Page source text to scan.

    Returns:
        None.
    """
    # Non-greedy, and stops at quotes, whitespace and angle brackets, so
    # several links on the same line are not merged into a single match.
    found = re.findall(r'http://[^\s"\'<>]+?\.html', readdz)

    # Keep only the first occurrence of each link, preserving order.
    seen = set()
    fgzl_1 = []
    for link in found:
        if link not in seen:
            seen.add(link)
            fgzl_1.append(link)

    print(fgzl_1)
    with open('http.txt', 'w') as wj:
        for i, a in enumerate(fgzl_1):
            print('%s : %s' % (i, a))
            wj.write(a + '\n')  # one link per line

def screen_link_2(start=2, stop=50):
    """Probe the numbered follow-up pages of every link in ``http.txt``.

    For each line of ``http.txt`` the text from ``http://`` up to the last
    digit is taken as a base, and ``<base>_<page>.html`` is requested for
    every page in ``range(start, stop)``. URLs that open without error are
    written, one per line, to ``http_web.txt`` (the file is overwritten).

    Args:
        start: First page number to try.
        stop: Page number at which to stop (exclusive).

    Returns:
        None.
    """
    with open('http.txt', 'r') as wj, open('http_web.txt', 'w') as wj_web:
        for wjh in wj:
            tupian_http = re.findall(r'http://.+\d', wjh)
            if not tupian_http:
                # Blank line, or a link without any digit: there is no
                # base to extend, so skip it instead of failing on [0].
                continue
            base = tupian_http[0]
            print(base)

            for page in range(start, stop):
                url_2 = '%s_%s.html' % (base, page)
                print(url_2)
                req = urllib2.Request(url_2)
                try:
                    response = urllib2.urlopen(req)
                except urllib2.HTTPError as e:
                    print('网页错误')
                    print(e.code)
                    print(e.reason)
                except urllib2.URLError as e:
                    # Connection-level failure: report it and keep going,
                    # the same way HTTP errors are handled.
                    print('网页错误')
                    print(e.reason)
                else:
                    # The page opened normally, so record its address.
                    response.close()
                    wj_web.write(url_2 + '\n')

# 获取网页上的数据名称

def s_s(html_string):
    """Parse HTML and return its first ``<img>`` element.

    Args:
        html_string: HTML source of a page.

    Returns:
        The result of ``find('img')`` on the parsed document, which is
        ``None`` when the document contains no ``<img>`` element.
    """
    soup_1 = BeautifulSoup(html_string, 'html.parser', from_encoding='utf-8')
    # The earlier find_all('img') result was overwritten before it was
    # ever used, so only the find() lookup is kept.
    return soup_1.find('img')

# 根据数据名称保存文件

def p_f(string_1, index=None):
    """Download the image referenced by an ``<img>`` tag to ``<index>.jpg``.

    Args:
        string_1: Tag object for an ``<img>`` element, or ``None`` when the
            page had no image.
        index: Number used as the output file name. When omitted, the
            module-level counter ``n`` is used.

    Returns:
        None. Nothing is downloaded when ``string_1`` is ``None`` or has no
        ``src`` attribute.
    """
    if string_1 is None:
        # No <img> element was found, so there is nothing to download.
        return
    src = string_1.get('src')
    if not src:
        # An <img> without a source address cannot be fetched.
        return

    i = n if index is None else index
    print('--下载图片中--')
    print('%s %s %s' % (string_1.name, src, string_1.get_text()))
    urllib.urlretrieve(src, '%s.jpg' % i)

# 对以上函数整体调用

def dywj(http_):
    """Run the whole pipeline for one page: fetch it, parse it, save its image.

    Args:
        http_: URL of the page. Surrounding whitespace, such as the newline
            left on a line read from a file, is stripped before use.

    Returns:
        None.
    """
    # The original called the undefined names ``httpweb`` and ``http``.
    http_string = http_web(http_.strip())
    soup_string = s_s(http_string)
    p_f(soup_string)

# ----------执行------------

# Entry script: ask for a start page, collect its links, expand them into
# numbered follow-up pages, then download one image per reachable page.
url_http = raw_input('#请输入官网网址：')
a_1 = http_web(url_http)
b_1 = screen_link_1(a_1)
screen_link_2()  # expand each saved link into its numbered follow-up pages
print('----开始抓取图片---')
n = 0  # read by p_f as the number used for the output file name
with open('http_web.txt', 'r') as wj:  # file holding the page addresses
    for wjh in wj:
        page_url = wjh.strip()
        if not page_url:
            # Skip blank lines instead of requesting an empty address.
            continue
        dywj(page_url)
        n += 50

print('#-----抓取图片完成------')

点击查看更多内容