2 Answers
The following implementation achieves this in under 16 seconds.
To speed up execution, I did the following:
- Removed Selenium entirely (no clicking is needed).
- For the abstract, took the text BeautifulSoup already returns and processed it afterwards.
- Added multiprocessing to significantly speed up the process.
from multiprocessing import Process, Manager
import requests
from bs4 import BeautifulSoup
import re
import time

start_time = time.time()

def get_no_of_pages(showing_text):
    # parse the total result count from the "Showing ... results for all ..." heading
    no_of_results = int(re.findall(r"(\d+,*\d+) results for all", showing_text)[0].replace(',', ''))
    pages = no_of_results // 200 + 1
    print("total pages:", pages)
    return pages

def clean(text):
    # drop newlines and the doubled spaces left over from the page's HTML formatting
    return text.replace("\n", '').replace("  ", '')

def get_data_from_page(url, page_number, data):
    print("getting page", page_number)
    response = requests.get(url + "start=" + str(page_number * 200))
    soup = BeautifulSoup(response.content, "lxml")
    arxiv_results = soup.find_all("li", {"class": "arxiv-result"})
    for arxiv_result in arxiv_results:
        paper = {}
        paper["titles"] = clean(arxiv_result.find("p", {"class": "title is-5 mathjax"}).text)
        links = arxiv_result.find_all("a")
        paper["arxiv_ids"] = links[0].text.replace('arXiv:', '')
        paper["arxiv_links"] = links[0].get('href')
        paper["pdf_link"] = links[1].get('href')
        paper["authors"] = clean(arxiv_result.find("p", {"class": "authors"}).text.replace('Authors:', ''))
        # the full abstract sits after the "▽ More" marker; fall back to the short version
        split_abstract = arxiv_result.find("p", {"class": "abstract mathjax"}).text.split("▽ More\n\n\n", 1)
        if len(split_abstract) == 2:
            paper["abstract"] = clean(split_abstract[1].replace("△ Less", ''))
        else:
            paper["abstract"] = clean(split_abstract[0].replace("△ Less", ''))
        paper["date"] = re.split(r"Submitted|;", arxiv_result.find("p", {"class": "is-size-7"}).text)[1]
        paper["tag"] = clean(arxiv_result.find("div", {"class": "tags is-inline-block"}).text)
        doi = arxiv_result.find("div", {"class": "tags has-addons"})
        if doi is None:
            paper["doi"] = "None"
        else:
            paper["doi"] = re.split(r'\s', doi.text)[1]
        data.append(paper)
    print(f"page {page_number} done")

if __name__ == "__main__":
    url = 'https://arxiv.org/search/?searchtype=all&query=healthcare&abstracts=show&size=200&order=-announced_date_first&'
    response = requests.get(url + "start=0")
    soup = BeautifulSoup(response.content, "lxml")
    with Manager() as manager:
        data = manager.list()  # list shared across worker processes
        processes = []
        get_data_from_page(url, 0, data)
        showing_text = soup.find("h1", {"class": "title is-clearfix"}).text
        for i in range(1, get_no_of_pages(showing_text)):
            p = Process(target=get_data_from_page, args=(url, i, data))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        print("Number of entries scraped:", len(data))
    stop_time = time.time()
    print("Time taken:", stop_time - start_time, "seconds")
Output:
>>> python test.py
getting page 0
page 0 done
total pages: 10
getting page 1
getting page 4
getting page 2
getting page 6
getting page 5
getting page 3
getting page 7
getting page 9
getting page 8
page 9 done
page 4 done
page 1 done
page 6 done
page 2 done
page 7 done
page 3 done
page 5 done
page 8 done
Number of entries scraped: 1890
Time taken: 15.911492586135864 seconds
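If you also want to keep what was scraped, a minimal follow-up sketch (my addition, not part of the original answer) dumps the shared list to JSON right after the p.join() loop, while the Manager context is still open; the filename arxiv_healthcare.json is purely illustrative:

import json

# the Manager list is a proxy object, so copy it into a plain list before serializing
with open("arxiv_healthcare.json", "w", encoding="utf-8") as f:
    json.dump(list(data), f, ensure_ascii=False, indent=2)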
You can try the Beautiful Soup approach below, as requested. There is no need to click the "More" link.
from requests import get
from bs4 import BeautifulSoup

# you can raise size (arXiv's search UI accepts at most 200 per page) to retrieve more results per request
url = 'https://arxiv.org/search/?query=healthcare&searchtype=all&abstracts=show&order=-announced_date_first&size=50&start=0'
response = get(url, verify=False)  # verify=False disables TLS certificate checks
soup = BeautifulSoup(response.content, "lxml")
#print(soup)

queryresults = soup.find_all("li", attrs={"class": "arxiv-result"})

for result in queryresults:
    title = result.find("p", attrs={"class": "title is-5 mathjax"})
    print(title.text)

# If you need the full abstract content, try this (you do not need to click the "More" button)
for result in queryresults:
    abstractFullContent = result.find("span", attrs={"class": "abstract-full has-text-grey-dark mathjax"})
    print(abstractFullContent.text)
Output:
Interpretable Deep Learning for Automatic Diagnosis of 12-lead Electrocardiogram
Leveraging Technology for Healthcare and Retaining Access to Personal Health Data to Enhance Personal Health and Well-being
Towards new forms of particle sensing and manipulation and 3D imaging on a smartphone for healthcare applications
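Note that arXiv's search pages accept at most size=200, so truly fetching everything in one shot is not possible for large result sets. A minimal paging sketch for this same approach (assuming the arxiv-result markup is unchanged and that an empty page marks the end of the results):

from requests import get
from bs4 import BeautifulSoup

page_size = 200  # the largest page size arXiv's search UI accepts
base = ('https://arxiv.org/search/?query=healthcare&searchtype=all'
        '&abstracts=show&order=-announced_date_first&size={}&start={}')
start = 0
while True:
    soup = BeautifulSoup(get(base.format(page_size, start)).content, "lxml")
    results = soup.find_all("li", attrs={"class": "arxiv-result"})
    if not results:
        break  # an empty page means there are no further results
    for result in results:
        print(result.find("p", attrs={"class": "title is-5 mathjax"}).text.strip())
    start += page_size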