First, the simple case of looping over the RSS feeds:
import requests
import feedparser
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
import pandas as pd

# get some RSS feeds....
resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
soup = BeautifulSoup(resp.content.decode(), "html.parser")
rawfeeds = soup.find_all("h2")
feeds = {}
for rf in rawfeeds:
    a = rf.find("a")
    if a is not None:
        # the href carries the feed URL in its q= parameter, prefixed with "site:"
        feeds[a.string.replace("RSS Feed", "").strip()] = urllib.parse.parse_qs(a["href"])["q"][0].replace("site:", "")

# now source them all into a dataframe
df = pd.DataFrame()
for k, url in feeds.items():
    try:
        df = pd.concat([df, pd.json_normalize(feedparser.parse(url)["entries"]).assign(Source=k)])
    except (Exception, xml.sax.SAXParseException):
        print(f"invalid xml: {url}")
To make this re-entrant:

- use the etag and modified capabilities of feedparser
- persist the dataframes, so that when the script runs again it picks up from where it left off
- I would use threading so that it is not purely sequential. Clearly, with threading, you need to think about synchronising your save points (a sketch of this appears after the script below). Then you just run it from a scheduler to periodically pick up new items in the RSS feeds and fetch the associated articles.
import feedparser, requests, newspaper
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
from pathlib import Path
import pandas as pd

# working directory for the persisted pickles
if not Path.cwd().joinpath("news").is_dir(): Path.cwd().joinpath("news").mkdir()
p = Path.cwd().joinpath("news")

# get some RSS feeds....
if p.joinpath("rss.pickle").is_file():
    dfrss = pd.read_pickle(p.joinpath("rss.pickle"))
else:
    resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
    soup = BeautifulSoup(resp.content.decode(), "html.parser")
    rawfeeds = soup.find_all("h2")
    feeds = []
    for rf in rawfeeds:
        a = rf.find("a")
        if a is not None:
            feeds.append({"name": a.string.replace("RSS Feed", "").strip(),
                          "url": urllib.parse.parse_qs(a["href"])["q"][0].replace("site:", ""),
                          "etag": "", "status": 0, "debug_msg": "", "modified": ""})
    dfrss = pd.DataFrame(feeds).set_index("url")

if p.joinpath("rssdata.pickle").is_file():
    df = pd.read_pickle(p.joinpath("rssdata.pickle"))
else:
    df = pd.DataFrame({"id": [], "link": []})

# now source them all into a dataframe. head() is there for testing purposes
for r in dfrss.head(5).itertuples():
    try:
        # etag / modified make this a conditional fetch: the server can answer 304 (nothing new)
        fp = feedparser.parse(r.Index, etag=r.etag, modified=r.modified)
        if fp.bozo == 1: raise Exception(fp.bozo_exception)
    except Exception as e:
        fp = feedparser.FeedParserDict(**{"etag": r.etag, "entries": [], "status": 500, "debug_message": str(e)})
    # keep meta information of what has already been sourced from an RSS feed
    if "etag" in fp.keys(): dfrss.loc[r.Index, "etag"] = fp.etag
    dfrss.loc[r.Index, "status"] = fp.status
    if "debug_message" in fp.keys(): dfrss.loc[r.Index, "debug_msg"] = fp.debug_message
    # 304 means up to date... getting 301 plus entries, hence test len...
    if len(fp["entries"]) > 0:
        dft = pd.json_normalize(fp["entries"]).assign(Source=r.Index)
        # don't capture items that have already been captured...
        df = pd.concat([df, dft[~dft["link"].isin(df["link"])]])

# save to make re-entrant...
dfrss.to_pickle(p.joinpath("rss.pickle"))
df.to_pickle(p.joinpath("rssdata.pickle"))

# finally get the text...
if p.joinpath("text.pickle").is_file():
    dftext = pd.read_pickle(p.joinpath("text.pickle"))
else:
    dftext = pd.DataFrame({"link": [], "text": []})

# head() is there for testing purposes; only fetch articles not already downloaded
for r in df[~df["link"].isin(dftext["link"])].head(5).itertuples():
    a = newspaper.Article(r.link)
    a.download()
    a.parse()
    # DataFrame.append() was removed in pandas 2.0, so use concat instead
    dftext = pd.concat([dftext, pd.DataFrame([{"link": r.link, "text": a.text}])], ignore_index=True)

dftext.to_pickle(p.joinpath("text.pickle"))
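The threading and scheduler mentioned earlier are not implemented in the script above. The sketch below is one way they could be wired up, not part of the original script: it assumes the per-feed body of the loop has been refactored into a hypothetical fetch_feed(url) function that returns a dataframe of new entries, and it uses the third-party schedule package (a cron job would work just as well). The lock is there to synchronise the save point for the shared dataframes.

# minimal sketch, assuming fetch_feed(url) exists and df, dfrss, p are as defined above
import threading, time
from concurrent.futures import ThreadPoolExecutor
import schedule  # third-party "schedule" package

save_lock = threading.Lock()

def fetch_all():
    global df
    # fetch the feeds in parallel rather than purely sequentially
    with ThreadPoolExecutor(max_workers=8) as ex:
        results = list(ex.map(fetch_feed, dfrss.index))
    # synchronise the save point so only one thread touches the pickles at a time
    with save_lock:
        df = pd.concat([df] + [r for r in results if r is not None])
        df.to_pickle(p.joinpath("rssdata.pickle"))
        dfrss.to_pickle(p.joinpath("rss.pickle"))

# run periodically to pick up new items in the RSS feeds
schedule.every(30).minutes.do(fetch_all)
while True:
    schedule.run_pending()
    time.sleep(60)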
Then run your analysis on the retrieved data, for example:
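A minimal example of what that analysis could look like, assuming the pickles produced above; the word-count column is just an illustration:

# rough example: article counts and average text length per source
dfa = df.merge(dftext, on="link", how="inner")
dfa["words"] = dfa["text"].str.split().str.len()
print(dfa.groupby("Source")["words"].agg(["count", "mean"]))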