To practice Python, I wrote a script that fetches the latest posts from the Tencent Weibo broadcast hall (广播大厅, the public timeline) in real time; each query result is saved to a file as one line of JSON.
This builds on my previous post:
http://loma1990.blog.51cto.com/6082839/1308205
The only setup needed is adding your own appkey, open_id and access_token in the createApiCaller() function.
Tested in practice: it has collected over 2.8 million weibo posts.
Results:
[screenshot: 091821881.png]
The code:
#! /usr/bin/env python
#coding=utf-8
#Author:loma
#Continuously fetch weibo data from the Tencent Weibo broadcast hall (public timeline)
'''
* @author loma
* qq:124172231 mail:lwsbox@qq.com
* Copyright (c) 2013, loma All Rights Reserved.
'''
import urllib2
import urllib
import webbrowser
import urlparse
import os
import time
import json
import sys
# Python 2 hack: make UTF-8 the default encoding for implicit str/unicode conversion
reload(sys)
sys.setdefaultencoding('utf-8')
class ApiManager:
    # Build the public parameters required by every API call
    def getPublicParams(self, appKey, access_token, open_id):
        params = {}
        params['oauth_consumer_key'] = appKey
        params['access_token'] = access_token
        params['openid'] = open_id
        params['oauth_version'] = '2.a'
        params['scope'] = 'all'
        return params

    # Open the OAuth2 authorization page in the default browser
    def OAuth2(self, appKey, redirect_url, response_type='token'):
        urlFormat = 'https://open.t.qq.com/cgi-bin/oauth2/authorize?client_id=%s&response_type=%s&redirect_uri=%s'
        url = urlFormat % (appKey, response_type, redirect_url)
        webbrowser.open_new_tab(url)

    # Parse access_token and openid out of the redirect URL into a params dict
    def decodeUrl(self, urlStr):
        # The token comes back in the URL fragment; rewrite it as a query string
        urlStr = urlStr.replace('#', '?')
        result = urlparse.urlparse(urlStr)
        params = urlparse.parse_qs(result.query, True)
        for a in params:
            print params[a]
        return params

    # Call an API of the Tencent Weibo open platform and return the response
    def doRequest(self, appKey, open_id, access_token, apiStr, params):
        apiHead = 'http://open.t.qq.com/api/'
        requestParams = self.getPublicParams(appKey, access_token, open_id)
        requestParams.update(params)
        url = apiHead + apiStr + "?" + urllib.urlencode(requestParams)
        data = urllib2.urlopen(url)
        return data
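# Example (sketch, not part of the original flow): with valid credentials, a
# single timeline request through ApiManager looks like this; appkey, open_id
# and access_token are placeholders you must supply yourself:
#   manager = ApiManager()
#   resp = manager.doRequest(appkey, open_id, access_token,
#                            'statuses/public_timeline',
#                            {'format': 'json', 'pos': 0, 'reqnum': 100})
#   print resp.read()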
# ApiCaller owns a pool of credentials and performs the actual API calls
class ApiCaller:
    def __init__(self):
        self.count = 0
        self.apiManager = ApiManager()
        self.Callers = {}

    # Register a set of credentials (keyed 1..n in registration order)
    def addCaller(self, appKey, open_id, access_token):
        self.Callers[len(self.Callers) + 1] = {"appKey": appKey, "open_id": open_id, "access_token": access_token}
        self.count = self.count + 1

    # Call the API with the current credentials; on a network error,
    # retry every 10 seconds
    def callAPI(self, apiStr, params):
        if self.count <= 0 or self.count > len(self.Callers):
            return False, None
        caller = self.Callers[self.count]
        # Advance to the next caller for the following call (round-robin)
        self.count = self.count % len(self.Callers) + 1
        while True:
            try:
                data = self.apiManager.doRequest(caller["appKey"], caller["open_id"], caller["access_token"], apiStr, params)
                break
            except:
                print "Retrying connection in 10s"
                time.sleep(10)
                continue
        return True, data
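# Example (sketch, placeholder values): registering more than one account makes
# callAPI rotate between them, spreading requests across accounts:
#   callers = ApiCaller()
#   callers.addCaller('APPKEY_1', 'OPENID_1', 'ACCESS_TOKEN_1')
#   callers.addCaller('APPKEY_2', 'OPENID_2', 'ACCESS_TOKEN_2')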
class Spider:
    def __init__(self, callers):
        self.totalNum = 0    # total number of weibo collected so far
        self.caller = callers
        self.idSet = set()   # ids of every weibo seen so far
        self.idList = []     # ids contained in the current JSON response
        self.isStop = False  # flag used to break out of the spiding loop
        self.tag = 0

    # Crawl weibo data at a fixed polling interval
    def spiding(self, dataFile, sleepTime, apiStr, apiParams):
        self.initIdSet(dataFile)
        self.isStop = False
        weiboSum = 0
        while True:
            print "Crawling data....%d" % (self.tag)
            self.tag = self.tag + 1
            # Call the API to fetch weibo data
            ok, data = self.caller.callAPI(apiStr, apiParams)
            if not ok:
                print "No caller registered, stopping"
                break
            time.sleep(sleepTime)
            data = data.read()
            try:
                js = json.loads(data)
            except:
                print 'loads failed'
                continue
            if js['data'] is None:
                apiParams['pos'] = 0
                continue
            info = js['data']['info']
            pos = js['data']['pos']
            apiParams['pos'] = pos
            # Collect the ids in this response so in-batch duplicates can be spotted
            self.idList = []
            for weibo in info:
                self.idList.insert(0, weibo['id'])
            # Keep only weibo that are new and appear exactly once in this batch
            js['data']['info'] = filter(lambda x: x['id'] not in self.idSet and self.idList.count(x['id']) == 1, info)
            info = js['data']['info']
            print "Weibo fetched this round: %d" % (len(info))
            weiboSum = weiboSum + len(info)
            self.totalNum = self.totalNum + len(info)
            print "Fetched this run: %d  Total weibo: %d" % (weiboSum, self.totalNum)
            for weibo in info:
                self.idSet.add(weibo['id'])
            if self.isStop:
                break
            # Append this batch to the data file, one JSON object per line
            jsStr = json.dumps(js, ensure_ascii=False, encoding='utf8')
            f = open(dataFile, 'a')
            f.write(jsStr + '\n')
            f.close()
    def stopSpiding(self):
        self.isStop = True

    # Rebuild the id set and total count from an existing data file, so that
    # restarting the spider does not record duplicates
    def initIdSet(self, dataFile):
        try:
            f = open(dataFile, "r")
        except:
            return
        for line in f:
            js = json.loads(line)
            info = js['data']['info']
            for weibo in info:
                id = weibo['id']
                self.idSet.add(id)
                self.totalNum = self.totalNum + 1
        f.close()
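# Example (sketch): each line of the data file is one JSON object, so the
# collected weibo can be read back like this (path as configured below):
#   for line in open('d:\\weiboBase.txt'):
#       js = json.loads(line)
#       for weibo in js['data']['info']:
#           print weibo['id'], weibo['text']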
# One-off authorization flow: authorize in the browser, then test an API call
def doOAuth():
    # Settings you need to change
    appkey = '801348303'
    redirect_url = 'http://loma1990.blog.51cto.com'
    # Instantiate the manager
    manager = ApiManager()
    # Open the authorization page
    manager.OAuth2(appkey, redirect_url)
    # Wait for the user to paste the URL of the page reached after authorizing
    url = raw_input('Input the url')
    # Extract access_token and openid
    params = manager.decodeUrl(url)
    print params['openid']
    print "\"%s\",\"%s\",\"%s\"" % (appkey, params['openid'], params['access_token'])
    # API parameters
    apiParams = {'format': 'json', 'pos': '0', 'reqnum': '100'}
    # Call the API to fetch data
    data = manager.doRequest(appkey, params['openid'], params['access_token'], 'statuses/public_timeline', apiParams)
    # Save the response to the given file, then print it
    content = data.read()
    f = open("d:\\weibo.txt", "wb")
    f.write(content)
    f.close()
    print content
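# Note (sketch): after the user authorizes, the browser is redirected to a URL
# whose fragment carries the token, roughly of the form
#   http://loma1990.blog.51cto.com/#access_token=...&openid=...&expires_in=...
# which is why decodeUrl() rewrites '#' to '?' before parsing it.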
# Create the ApiCaller that holds your credentials
def createApiCaller():
    callers = ApiCaller()
    # Call addCaller here with your appKey, open_id and access_token, then just run the script
    return callers
#doOAuth()
callers = createApiCaller()
# API parameters
apiParams = {'format': 'json', 'pos': 0, 'reqnum': 100}
spider = Spider(callers)
spider.spiding('d:\\weiboBase.txt', 5, 'statuses/public_timeline', apiParams)
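For completeness, here is a minimal sketch of createApiCaller() once filled in; the openid and access_token values are placeholders for the ones printed by doOAuth():

def createApiCaller():
    callers = ApiCaller()
    # Placeholder credentials; substitute the values printed by doOAuth()
    callers.addCaller('801348303', 'YOUR_OPENID', 'YOUR_ACCESS_TOKEN')
    return callers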