
How to crawl Zhihu topics with Python?

Tags:
Python

I'm building 观点 (guandn.com), whose "rooms" are much like Zhihu's topics, so I needed a way to crawl them. After quite a bit of fiddling I finally got it working reliably. The code is Python; if you don't know the language, please pick up the basics on your own first. If you do, go straight to the code — it works as-is.
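Before running it you'll need Python 2 with MySQLdb and BeautifulSoup (bs4) installed, plus a local MySQL database named zhihu. The script reads and writes three tables: classify_new, classify_new_copy (a copy of classify_new with a status flag), and rooms (the older work() path also expects a legacy classify table). Here is a minimal schema sketch inferred from the SELECT/INSERT statements in the code — the column names come from the queries, but the types and sizes are my assumptions, not the author's original DDL:

#coding:utf-8
# hypothetical schema bootstrap, inferred from the crawler's queries
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()
curr.execute("""CREATE TABLE IF NOT EXISTS classify_new (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    data_id INT,
    name    VARCHAR(255),
    status  TINYINT DEFAULT 1
)""")
curr.execute("CREATE TABLE IF NOT EXISTS classify_new_copy LIKE classify_new")
curr.execute("""CREATE TABLE IF NOT EXISTS rooms (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    father_id   INT,
    name        VARCHAR(255),
    friends_num INT,
    description TEXT,
    create_time DATETIME,
    creater_id  INT,
    room_avatar VARCHAR(255),
    is_pass     TINYINT,
    has_index   TINYINT,
    reason_id   INT
)""")
conn.commit()
conn.close()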


#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true division, so / never floors

from Queue import Queue
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time

import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack: avoid UnicodeDecodeError when mixing str and unicode

headers = {
    'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With':'XMLHttpRequest',
    'Referer':'https://www.zhihu.com/topics',
    'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'  # replace with your own logged-in Zhihu cookie
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue = Queue()  # work queue of (topic token, topic name, parent name) tuples
nodeSet = set()
keywordSet = set()
stop = 0
offset = -20
level = 0
maxLevel = 7
counter = 0
base = ""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be plugged in here
        html = response.read()
        return html
    except Exception:
        pass
    return None
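# The comment above says a proxy should be plugged in here. A minimal
# sketch with urllib2 (the proxy address is a placeholder, not something
# from the original post):
#
#   proxy = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8080'})
#   opener = urllib2.build_opener(proxy)
#   urllib2.install_opener(opener)  # every later urlopen() then goes through the proxy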

def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be plugged in here
        html = response.read().decode('utf-8')
        #print html  # debug: dump the raw page
        soup = BeautifulSoup(html, 'html.parser')
        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})
        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
        conn.commit()
    except Exception as e:
        print "get topic error", e

def get_extension(name):
    where = name.rfind('.')
    if where != -1:
        return name[where:]  # extension including the dot, e.g. ".jpg"
    return None

def which_platform():
    return platform.system()

def GetDateString():
    return time.strftime('%Y-%m-%d', time.localtime())  # e.g. "2015-08-05", used as a folder name

def makeDateFolder(par, classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "makeDateFolder error", e
    return None

def download_img(url, classify):
    try:
        extension = get_extension(url)
        if extension is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extension
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        if folder is None:
            return None
        filename = folder + "//" + name
        try:
            if "e82bab09c_m" in str(url):  # this image is skipped on purpose; the caller stores no avatar for it
                return True
            if not os.path.exists(filename):
                file_object = open(filename, 'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/' + GetDateString() + '/' + str(classify) + "/" + name
            else:
                print "file exist"
                return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "download_img error", e
    return None  # if the download failed, fall back to the original site's link

def getChildren(node, name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html, 'html.parser')
        p_ch = '父话题'  # default; matched against the page text below
        node_name = soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
        topic_cla = soup.find('div', {'class' : 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all('a', {'class' : 'zm-item-tag'})  # all child-topic links
                if u'子话题' in p_ch:  # only descend if the block really lists child topics
                    for a in aList:
                        token = a.get('data-token')
                        a = str(a).replace('\n', '').replace('\t', '').replace('\r', '')
                        start = str(a).find('>')
                        end = str(a).rfind('</a>')
                        new_node = str(str(a)[start + 1:end])
                        curr.execute('select id from rooms where name=%s', (new_node,))  # make sure the name is not already stored
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e

def getContent(n, name, p, top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s', (name,))  # make sure the name is not already stored
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html, 'html.parser')
            title = soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
            pic_path = soup.find('a', {'id' : 'zh-avartar-edit-form'}).find('img').get('src')
            description = soup.find('div', {'class' : 'zm-editable-content'})
            if description is not None:
                description = description.text
            if u"未归类" in title or u"根话题" in title:  # still stored, but without a description, to avoid an infinite loop
                description = None
            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if tag_path is not None:
                if tag_path == True:  # placeholder image was skipped: store no avatar
                    tag_path = None
                father_id = 2  # default parent: the 杂谈 (general chat) room
                curr.execute('select id from rooms where name=%s', (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute('select id from rooms where name=%s', (name,))  # check again under the final name
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))  # now
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                    # the row now qualifies for insertion
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit immediately, otherwise later lookups cannot find the parent row
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e

# older entry point that walks the legacy classify table; __main__ uses new_work() instead
def work():
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        node = r[1]
        parent = r[2]
        name = r[3]
        try:
            queue.put((node, name, parent))  # seed the queue
            while queue.qsize() > 0:
                n, name, p = queue.get()  # pop the head node
                getContent(n, name, p, top_id)
                getChildren(n, name)  # enqueue the popped node's children
            conn.commit()
        except Exception as e:
            print "what's wrong", e

def new_work():
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except Exception:
            pass

def get_topis(data_id, name, top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode('utf-8')
                json_str = json.loads(html)
                ms = json_str['msg']
                if len(ms) < 5:
                    break  # (almost) empty page: pagination is done
                msg = ms[0]
            except Exception as e:
                print "fetch page error", e
            #print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg), 'html.parser')
                blks = soup.find_all('div', {'class' : 'blk'})
                for blk in blks:
                    page = blk.find('a').get('href')
                    if page is not None:
                        node = page.replace("/topic/", "")  # store more seed topics
                        parent = name
                        ne = blk.find('strong').text
                        try:
                            queue.put((node, ne, parent))  # seed the queue
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # pop the head node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the popped node's children
                            conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e

if __name__ == '__main__':
    i = 0
    while i < 400:  # sweep the seed table repeatedly; already-stored rooms are skipped, so reruns only add new topics
        new_work()
        i = i + 1
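A note on the order of operations, as far as it can be reconstructed from the code: getTopics() seeds classify_new from the topics overview page, those rows then get copied into classify_new_copy with status=1, and new_work() drains that table, paging through each category's topic plaza via the TopicsPlazzaListV2 endpoint. So a first run would look roughly like this — the copy step is my assumption based on the table name, not something stated in the post:

# hypothetical bootstrap, run once before the __main__ loop above
getTopics()  # seed classify_new from https://www.zhihu.com/topics
curr.execute("INSERT INTO classify_new_copy SELECT * FROM classify_new")
conn.commit()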
