import requests
import bs4
import re
import openpyxl
def open_url(url, timeout=10):
    """Fetch *url* with a desktop-browser User-Agent and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The ``requests.Response`` object for the GET request.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout elapses.
    """
    # Send a browser-like User-Agent instead of the default requests one.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/67.0.3393.4 Safari/537.36'
        ),
    }
    return requests.get(url, headers=headers, timeout=timeout)
def find_data(res):
    """Extract the ranking rows from the downloaded article page.

    The article body is the element with id ``Cnt-Main-Article-QQ``. Inside
    it, a record starts at a ``<p style="TEXT-INDENT: 2em">`` paragraph whose
    text is purely numeric, and the four paragraphs that follow carry the
    fields: a name inside square brackets, then three values that start with
    a digit. (The spreadsheet header in this file labels them city, average
    house price, average wage and price-to-wage ratio.)

    Args:
        res: Response-like object exposing the page HTML as ``res.text``.

    Returns:
        A list of ``[name, value, value, value]`` string lists, one per
        record. Empty when the article container is missing from the page.
        Records whose fields do not match the expected patterns are skipped,
        and a record cut short by the end of the page ends the scan.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    content = soup.find(id='Cnt-Main-Article-QQ')
    if content is None:
        # Page layout changed or an error page was returned: nothing to parse.
        return []

    data = []
    # A single shared iterator lets the loop body consume the field
    # paragraphs that follow each numeric marker paragraph.
    paragraphs = iter(content.find_all('p', style='TEXT-INDENT: 2em'))
    for each in paragraphs:
        if not each.text.isnumeric():
            continue

        # Pull the next four paragraphs; a default avoids leaking a bare
        # StopIteration to the caller when the page ends mid-record.
        fields = []
        for _ in range(4):
            following = next(paragraphs, None)
            if following is None:
                break
            fields.append(following.text)
        if len(fields) < 4:
            break

        name = re.search(r'\[(.+)\]', fields[0])
        values = [re.search(r'\d.*', text) for text in fields[1:]]
        if name is None or any(match is None for match in values):
            # Malformed record: skip it instead of crashing on None.group().
            continue

        data.append([name.group(1)] + [match.group() for match in values])
    return data
def to_excel(data, filename="2017年中国主要城市房价工资比排行榜.xlsx"):
    """Write the scraped rows to an Excel workbook.

    Args:
        data: Iterable of rows; each row is a sequence of cell values that is
            appended below the header row.
        filename: Path of the ``.xlsx`` file to create. Defaults to the name
            the script has always used, so existing callers are unaffected.
    """
    wb = openpyxl.Workbook()
    # NOTE(review): this flag asked older openpyxl releases to infer cell
    # types from strings; newer releases may ignore it -- confirm against the
    # installed version.
    wb.guess_types = True
    ws = wb.active
    # Header row: city, average house price, average wage, price/wage ratio.
    ws.append(['城市', '平均房价', '平均工资', '房价工资比'])
    for row in data:
        ws.append(row)
    wb.save(filename)
def main():
    """Download the ranking article, parse its rows and save them to Excel."""
    article_url = 'http://news.house.qq.com/a/20170702/003985.htm'
    # Fetch -> parse -> export, each stage feeding the next.
    to_excel(find_data(open_url(article_url)))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
点击查看更多内容
为 TA 点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦