1 回答
TA贡献1840条经验 获得超5个赞
下面的代码能够遍历所有类别并提取数据。该代码肯定需要更多的测试和一些增强的错误处理。
PS：祝你在这个编码项目中好运。
import requests
import time
from random import randint
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from newspaper.utils import BeautifulSoup
from newspaper import Article
# Build the Chrome launch configuration by applying each command-line flag
# in turn.
chrome_options = Options()
for chrome_flag in (
    "--test-type",
    '--ignore-certificate-errors',
    '--disable-extensions',
    'disable-infobars',
    "--incognito",
):
    chrome_options.add_argument(chrome_flag)
# Headless mode is left disabled. To enable it, also add the '--headless'
# flag together with 'window-size=1920x1080' (a window size argument is
# required in headless mode).

# Single browser session shared by the whole script.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

# Module-level accumulators for the harvest.
papers, urls_set = [], set()
def get_articles(link):
    """Harvest every article of the category currently open in ``driver``.

    Starting from the page the browser is showing, each listing page is
    parsed for article links, every not-yet-seen article is downloaded and
    parsed with newspaper, and one record per article is appended to the
    module-level ``papers`` list. The "Suivant" link is then followed and
    the process repeats until the page has no such link.

    Args:
        link: Not used for navigation; retained so existing callers keep
            working. The caller must already have loaded the category
            page in ``driver``.

    Returns:
        None. Records are accumulated in ``papers`` as dicts with the keys
        ``url``, ``title``, ``publish_date`` (``'YYYY-MM-DD'`` or ``None``)
        and ``text``; harvested URLs are remembered in ``urls_set``.
    """
    while True:
        # Parse what the browser is displaying right now, so every
        # pagination step yields that page's own articles (including the
        # last page, which has no "Suivant" link).
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for articles_tags in soup.find_all('div', {'class': 'articles'}):
            for article_href in articles_tags.find_all('a', href=True):
                article_url = str(article_href['href'])
                # Skip links ending in '#commentaires' and URLs that were
                # already harvested.
                if article_url.endswith('#commentaires') or article_url in urls_set:
                    continue
                urls_set.add(article_url)

                article = Article(article_url)
                article.download()
                article.parse()

                # The publish date is only formatted when it is an actual
                # datetime; anything else (e.g. None) is stored as None
                # instead of raising while parsing its string form.
                published = article.publish_date
                if isinstance(published, datetime):
                    publish_date = published.strftime('%Y-%m-%d')
                else:
                    publish_date = None

                papers.append({
                    'url': article_url,
                    'title': article.title,
                    'publish_date': publish_date,
                    'text': article.text.replace('\n', ''),
                })

        try:
            next_link = driver.find_element_by_link_text("Suivant")
        except NoSuchElementException:
            # No "Suivant" link: this was the last page of the category.
            return

        driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
        next_link.click()
        # Initiates a random wait to prevent the harvesting operation
        # from starting before the page has completely loaded.
        time.sleep(randint(2, 4))
# Category name -> category listing URL to crawl.
legorafi_urls = {
    'monde-libre': 'http://www.legorafi.fr/category/monde-libre',
    'politique': 'http://www.legorafi.fr/category/france/politique',
    'societe': 'http://www.legorafi.fr/category/france/societe',
    'economie': 'http://www.legorafi.fr/category/france/economie',
    'culture': 'http://www.legorafi.fr/category/culture',
    'people': 'http://www.legorafi.fr/category/people',
    'sports': 'http://www.legorafi.fr/category/sports',
    'hi-tech': 'http://www.legorafi.fr/category/hi-tech',
    'sciences': 'http://www.legorafi.fr/category/sciences',
    'ledito': 'http://www.legorafi.fr/category/ledito/',
}

# Crawl each category in turn. The browser is closed in ``finally`` so the
# Chrome session is released even when a page load or a scrape raises.
try:
    # The implicit wait applies to the whole session, so it is set once
    # rather than on every iteration.
    driver.implicitly_wait(30)
    for url in legorafi_urls.values():
        if not url:
            # Nothing to load for an empty entry.
            continue
        driver.get(url)
        # driver.get() returns None, so the URL itself is handed over.
        get_articles(url)
finally:
    driver.quit()
添加回答
举报