1 回答
TA贡献1844条经验 获得超8个赞
这些链接是通过 JavaScript 从外部 URL 动态加载的。您可以使用此示例来打印所有链接:
import json
import requests
from bs4 import BeautifulSoup
data = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'
page = 1
while True:
data['page'] = page
data = requests.post(api_url, data=data).json()
# uncomment next line to print all data:
# print(json.dumps(data, indent=4))
for p in data['postflair']:
print(p)
if data['lastbatch']:
break
page += 1
印刷:
https://uynaa.wordpress.com/2014/01/02/2013-in-review/
https://uynaa.wordpress.com/2013/10/07/%d0%b0%d1%84%d0%b3%d0%b0%d0%bd%d0%b8%d1%81%d1%82%d0%b0%d0%bd-%d0%b0%d0%bd%d1%85%d0%b4%d0%b0%d0%b3%d1%87-%d1%88%d0%b0%d0%bb%d1%82%d0%b3%d0%b0%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d0%b5-%d0%ba%d0%b0%d1%81%d0%bf%d0%b5%d1%80%d1%81%d0%ba%d0%b8%d0%b9-%d0%b1%d0%b8-%d0%b4%d0%b0%d1%80%d0%b0%d0%bd%d0%b3%d1%83%d0%b9%d0%bb%d0%b0%d0%bb-%d1%82%d0%be%d0%b3%d1%82%d0%be%d0%be-%d0%b3%d1%8d/
https://uynaa.wordpress.com/2013/10/07/%d1%88%d0%b0%d0%bd%d1%85%d0%b0%d0%b9-%d0%bd%d0%be%d0%b3%d0%be%d0%be%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d1%8d%d0%bd%d1%8d-%d0%b3%d0%b0%d0%b7%d0%b0%d1%80-%d0%bc%d0%b0%d0%bd%d0%b0%d0%b9%d1%85-%d0%b1%d0%b0%d0%b9%d1%81%d0%b0%d0%bd-%d1%8e%d0%bc/
https://uynaa.wordpress.com/2013/10/07/500-%d0%b6%d0%b8%d0%bb-%d0%b0%d1%80%d1%87%d0%bb%d1%83%d1%83%d0%bb%d0%b0%d0%b0%d0%b3%d2%af%d0%b9-%d0%b4%d1%8d%d0%bb%d1%85%d0%b8%d0%b9%d0%bd-%d1%86%d0%be%d1%80%d1%8b%d0%bd-%d0%b3%d0%b0%d0%bd%d1%86/
https://uynaa.wordpress.com/2013/02/01/%d1%83%d0%bb%d0%b7-%d0%bd%d1%83%d1%82%d0%b3%d0%b8%d0%b9%d0%bd-%d0%bf%d0%b8%d1%84%d0%b0%d0%b3%d0%be%d1%80/
https://uynaa.wordpress.com/2013/01/21/%d1%82%d0%b5%d0%bb%d0%b5%d0%b2%d0%b8%d0%b7%d0%b8%d0%b9%d0%bd-%d1%82%d2%af%d2%af%d1%85%d1%8d%d0%bd-%d0%b4%d1%8d%d1%85-%d1%85%d0%b0%d0%bc%d0%b3%d0%b8%d0%b9%d0%bd-%d0%b3%d0%b0%d0%b6%d0%b8%d0%b3-%d1%88/
https://uynaa.wordpress.com/2013/01/18/%d0%b0%d0%bf%d0%be%d1%84%d0%b8%d1%81-%d0%be%d0%be%d1%81-%d2%af%d2%af%d0%b4%d1%8d%d0%bd-%d3%a9%d1%80%d0%bd%d3%a9%d1%85-%d0%b6%d2%af%d0%b6%d0%b8%d0%b3/
https://uynaa.wordpress.com/2013/01/17/%d0%b0%d1%80%d0%b8%d1%83%d0%bd%d1%82%d0%bd%d1%8b-%d0%bd%d1%83%d1%82%d0%b0%d0%b3-%d0%b8%d0%b9%d0%b3-%d1%8d%d0%b7%d1%8d%d0%b3%d0%bd%d1%8d%d1%85-%d1%85%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/01/15/%d1%81%d0%b0%d1%83%d0%b4%d1%8b%d0%bd-%d1%82%d0%b0%d0%b3%d0%bd%d1%83%d1%83%d0%bb%d1%87%d0%b8%d0%b4-%d0%b0%d1%81%d0%b0%d0%b4%d1%8b%d0%b3-%d0%be%d0%bb%d0%b6%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/15/%d0%bc%d0%b0%d0%bb%d0%b8%d0%b3%d1%8d%d1%8d%d1%81-%d1%81%d0%be%d0%bc%d0%b0%d0%bb%d0%b8-%d1%85%d2%af%d1%80%d1%82%d1%8d%d0%bb/
https://uynaa.wordpress.com/2013/01/10/%d1%85%d0%be%d1%80%d0%b2%d0%be%d0%be-%d0%b5%d1%80%d1%82%d3%a9%d0%bd%d1%86-%d1%85%d0%b0%d0%bb%d0%b0%d0%b0%d1%81%d0%b0%d0%bd%d0%b4-%d0%b1%d0%b0%d0%b3%d1%82%d0%b0%d0%bd%d0%b0/
https://uynaa.wordpress.com/2013/01/10/%d1%82%d0%b0%d0%bd%d0%b3%d0%b0%d1%80%d0%b0%d0%b3-%d3%a9%d1%80%d0%b3%d3%a9%d1%85-%d1%91%d1%81%d0%bb%d0%be%d0%bb-%d1%85%d2%af%d0%bb%d1%8d%d1%8d%d0%b6-%d0%b1%d0%b0%d0%b9%d0%b3-%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/09/%d0%b1%d0%be%d0%bb%d0%bb%d0%b8%d0%b2%d1%83%d0%b4%d1%8b%d0%bd-%d0%ba%d0%b8%d0%bd%d0%be%d0%bd%d0%be%d0%be%d1%81-%d1%87-%d0%b0%d0%b9%d0%bc%d0%b0%d0%b0%d1%80/
https://uynaa.wordpress.com/2013/01/08/%d0%bf%d0%b5%d0%bd%d1%82%d0%b0%d0%b3%d0%be%d0%bd-%d0%b1%d0%be%d0%bb%d0%be%d0%bd-%d1%82%d1%82%d0%b3-%d1%8b%d0%b3-%d1%83%d0%b4%d0%b8%d1%80%d0%b4%d0%b0%d1%85-%d0%bc%d0%b0%d0%b3%d0%b0%d0%b4%d0%bb%d0%b0/
https://uynaa.wordpress.com/2013/01/07/%d0%b7%d0%b8%d0%b0%d0%b4-%d1%82%d0%b0%d0%ba%d0%b8%d0%b5%d0%b4%d0%b4%d0%b8%d0%bd/
...and so on.
编辑:要仅过滤指定类别的链接,您可以使用以下脚本:
import json
import requests
from bs4 import BeautifulSoup
data = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'
all_links = []
page = 1
while True:
data['page'] = page
data = requests.post(api_url, data=data).json()
# uncomment next line to print all data:
# print(json.dumps(data, indent=4))
soup = BeautifulSoup(data['html'], 'html.parser')
for p in soup.select('.post'):
if any('%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb' in cat['href'] for cat in p.select('[rel="category tag"]')):
if p.h2.a['href'] not in all_links:
print(p.h2.a['href'])
all_links.append(p.h2.a['href'])
if data['lastbatch']:
break
page += 1
print(len(all_links))
打印 135 个链接:
...
https://uynaa.wordpress.com/2011/05/13/%e2%80%9c%d1%83%d1%85%d0%b0%d0%b0%d0%bd-%d0%bc%d1%83%d1%83%d1%82%d0%bd%d1%83%d1%83%d0%b4%d1%8b%d0%bd-%d2%af%d0%b5%e2%80%9d/
https://uynaa.wordpress.com/2011/05/04/%d2%af%d1%85%d0%bb%d0%b8%d0%b9%d0%bd-%d1%82%d0%be%d0%b3%d0%bb%d0%be%d0%be%d0%bc/
https://uynaa.wordpress.com/2011/05/04/%d0%be%d1%81%d0%b0%d0%bc%d0%b0-%d0%b1%d0%b8%d0%bd-%d0%bb%d0%b0%d0%b4%d0%b5%d0%bd%d0%b8%d0%b9%d0%b3-%d1%8f%d0%b0%d0%b6-%d0%b8%d0%bb%d1%80%d2%af%d2%af%d0%bb%d1%81%d1%8d%d0%bd-%d0%b1%d1%8d/
135
TA贡献1858条经验 获得超8个赞
不确定为什么你的代码不起作用。对我来说,我使用下面的代码首先获取所有链接。
list_href = []
a_tags = soup.find_all('a')
for tag in a_tags:
list_href.append(tag.get('href'))
文章的链接位于 list_href[5:26] 中。
添加回答
举报