2 回答
TA贡献1825条经验 获得超4个赞
所以这里使用 Selenium。基本上,它先获取第一页,然后点击“Next”(下一页)链接,并一直重复,直到没有更多页面可以访问为止。
我遇到的问题是它运行得太快,所以在某些时候它没有找到“Next”并崩溃。我设置了 1 秒的延迟(但有更好的方法可以做到这一点,比如隐式等待……我仍在学习如何正确使用它。)
但这会让你继续前进。
import bs4 as bs
from selenium import webdriver
import time
import pandas as pd
# Scrape every page of the etfdb.com screener into a DataFrame of
# (ticker, name, asset class) rows by repeatedly clicking the "Next" link.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Absolute XPath of the pagination "Next" link on the screener page.
NEXT_LINK_XPATH = '//*[@id="mobile_table_pills"]/div[1]/div/div[2]/div/ul/li[8]/a'
# Exact class attribute of the results table.
TABLE_CLASS = 'table table-bordered table-hover table-striped mm-mobile-table'
# The <td> cells are walked as one flat list, one record every 8 cells.
CELLS_PER_ROW = 8

driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
parsed = []
url = 'https://etfdb.com/screener/'
try:
    driver.get(url)
    while True:
        soup = bs.BeautifulSoup(driver.page_source, 'lxml')
        table = soup.find('table', {'class': TABLE_CLASS})
        if table is None:
            # No results table on this page: nothing more to collect.
            break

        # Collect the cells once per page instead of re-running find_all()
        # three times for every row.
        cells = table.find_all('td')
        # Stop before any trailing partial group so that ticker, name and
        # asset class (offsets 0..2) are always present.
        for start in range(0, len(cells) - 2, CELLS_PER_ROW):
            ticker = cells[start].text
            name = cells[start + 1].text
            asset_class = cells[start + 2].text
            parsed.append([ticker, name, asset_class])

        # The current page is parsed *before* looking for "Next", so the last
        # page is still collected when the link is missing or not clickable.
        try:
            driver.find_element(By.XPATH, NEXT_LINK_XPATH).click()
        except WebDriverException:
            break
        print ('Aquired page: %s' %(driver.current_url.split('page=')[-1]))
        # Give the table time to refresh before reading page_source again;
        # without a pause the next iteration can run ahead of the page.
        time.sleep(1)
finally:
    # Always release the browser, even if scraping fails part-way through.
    driver.quit()

df = pd.DataFrame(parsed, columns=['Ticker','Name','Asset Class'])
输出:
print (df)
Ticker ... Asset Class
0 SPY ... Equity
1 IVV ... Equity
2 VTI ... Equity
3 VOO ... Equity
4 VEA ... Equity
5 QQQ ... Equity
6 EFA ... Equity
7 VWO ... Equity
8 IEMG ... Equity
9 AGG ... Bond
10 IEFA ... Equity
11 IJH ... Equity
12 VTV ... Equity
13 IJR ... Equity
14 IWM ... Equity
15 IWF ... Equity
16 IWD ... Equity
17 BND ... Bond
18 VUG ... Equity
19 EEM ... Equity
20 GLD ... Commodity
21 VNQ ... Real Estate
22 VIG ... Equity
23 LQD ... Bond
24 VB ... Equity
25 VO ... Equity
26 XLF ... Equity
27 VCSH ... Bond
28 USMV ... Equity
29 VEU ... Equity
... ... ...
2219 BDD ... Commodity
2220 WDRW ... Equity
2221 LACK ... Equity
2222 HONR ... Equity
2223 PEXL ... Equity
2224 FOANC ... Equity
2225 DYY ... Commodity
2226 HAUD ... Equity
2227 SCC ... Equity
2228 PASS ... Equity
2229 CHEP ... Alternatives
2230 EKAR ... Equity
2231 LTL ... Equity
2232 INR ... Currency
2233 BUYN ... Equity
2234 PETZC ... Equity
2235 SBM ... Equity
2236 RPUT ... Alternatives
2237 SZO ... Commodity
2238 EEH ... Equity
2239 HEWW ... Equity
2240 FUE ... Commodity
2241 AGF ... Commodity
2242 GRBIC ... Equity
2243 VSL ... Equity
2244 DLBL ... Bond
2245 BOS ... Commodity
2246 LD ... Commodity
2247 BOM ... Commodity
2248 DDP ... Commodity
[2249 rows x 3 columns]
TA贡献1820条经验 获得超9个赞
该站点的分页操作是动态的,因此,您需要使用selenium:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import re
def get_content(_d):
    """Return the table rows of a parsed page as a list of dictionaries.

    Column names are taken from the ``<th>`` cells of the first ``<tr>`` of
    the first ``<table>``. Every ``<tr>`` in the document that contains
    ``<td>`` cells is then mapped onto those names, with newlines stripped
    from all cell text.

    Args:
        _d: A parsed document exposing BeautifulSoup-style ``find`` and
            ``find_all`` methods.

    Returns:
        One ``{header: value}`` dict per data row, in document order. An
        empty list is returned when the page has no table or no header row.
    """
    table = _d.find('table')
    if table is None:
        return []
    header_row = table.find('tr')
    if header_row is None:
        return []
    headers = [cell.text.replace('\n', '') for cell in header_row.find_all('th')]

    records = []
    for row in _d.find_all('tr'):
        values = [cell.text.replace('\n', '') for cell in row.find_all('td')]
        # Rows without <td> cells (such as the header row) carry no data, so
        # no further row needs to be discarded: the first data row is kept.
        if not values:
            continue
        # Rows whose cell count differs from the header (e.g. a single-cell
        # promotional row) would yield truncated, misleading records.
        if len(values) != len(headers):
            continue
        records.append(dict(zip(headers, values)))
    return records
# Walk every page of the etfdb.com screener, collecting the rows of each page
# (one list of dicts per page) into all_data.
import time

from selenium.webdriver.common.by import By


def _find_next_links(driver):
    """Return the <a> elements whose text starts with "next" (any case)."""
    return [a for a in driver.find_elements(By.TAG_NAME, 'a') if re.findall('^next', a.text, re.I)]


d = webdriver.Chrome('/Users/jamespetullo/Downloads/chromedriver')
try:
    d.get('https://etfdb.com/screener/')
    all_data = [get_content(soup(d.page_source, 'html.parser'))]
    _next = _find_next_links(d)
    while _next:
        # Sending a newline to the link activates it like pressing Enter.
        _next[0].send_keys('\n')
        # Pagination is dynamic: pause so the table can refresh, otherwise
        # page_source may still hold the previous page's rows.
        time.sleep(1)
        all_data.append(get_content(soup(d.page_source, 'html.parser')))
        _next = _find_next_links(d)
finally:
    # Always release the browser, even if scraping fails part-way through.
    d.quit()
示例输出(仅限第一页,由于 SO 的回答字符限制):
[{'Symbol': 'IVV', 'ETF Name': 'iShares Core S&P 500 ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$159,613.22', 'YTD': '11.41%', 'Avg. Volume': '6,062,749', 'Previous Closing Price': '$280.32', 'Overall Rating': ''}, {'Symbol': 'VTI', 'ETF Name': 'Vanguard Total Stock Market ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$107,909.21', 'YTD': '12.40%', 'Avg. Volume': '4,901,940', 'Previous Closing Price': '$143.46', 'Overall Rating': ''}, {'Symbol': 'VOO', 'ETF Name': 'Vanguard S&P 500 ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$102,814.11', 'YTD': '11.43%', 'Avg. Volume': '4,474,837', 'Previous Closing Price': '$256.07', 'Overall Rating': ''}, {'Symbol': 'VEA', 'ETF Name': 'Vanguard FTSE Developed Markets ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$70,063.68', 'YTD': '9.97%', 'Avg. Volume': '22,225,420', 'Previous Closing Price': '$40.80', 'Overall Rating': ''}, {'Symbol': 'QQQ', 'ETF Name': 'Invesco QQQ', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$68,001.87', 'YTD': '12.27%', 'Avg. Volume': '48,660,348', 'Previous Closing Price': '$173.19', 'Overall Rating': ''}, {'Symbol': 'EFA', 'ETF Name': 'iShares MSCI EAFE ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$65,100.01', 'YTD': '9.34%', 'Avg. Volume': '39,227,020', 'Previous Closing Price': '$64.27', 'Overall Rating': ''}, {'Symbol': 'VWO', 'ETF Name': 'Vanguard FTSE Emerging Markets ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$62,464.73', 'YTD': '9.24%', 'Avg. Volume': '21,504,412', 'Previous Closing Price': '$41.62', 'Overall Rating': ''}, {'Symbol': 'IEMG', 'ETF Name': 'iShares Core MSCI Emerging Markets ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$58,650.02', 'YTD': '8.42%', 'Avg. Volume': '23,205,799', 'Previous Closing Price': '$51.12', 'Overall Rating': ''}, {'Symbol': 'AGG', 'ETF Name': 'iShares Core U.S. Aggregate Bond ETF', 'Asset ClassNew': 'Bond', 'Total Assets ($MM)': '$58,023.69', 'YTD': '0.79%', 'Avg. 
Volume': '6,496,300', 'Previous Closing Price': '$107.07', 'Overall Rating': ''}, {'Symbol': 'IEFA', 'ETF Name': 'iShares Core MSCI EAFE ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$57,206.30', 'YTD': '9.62%', 'Avg. Volume': '15,197,579', 'Previous Closing Price': '$60.29', 'Overall Rating': ''}, {'Symbol': 'IJH', 'ETF Name': 'iShares Core S&P Mid-Cap ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$48,274.19', 'YTD': '15.12%', 'Avg. Volume': '2,089,207', 'Previous Closing Price': '$191.16', 'Overall Rating': ''}, {'Symbol': 'VTV', 'ETF Name': 'Vanguard Value ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$46,173.50', 'YTD': '10.02%', 'Avg. Volume': '2,821,418', 'Previous Closing Price': '$107.76', 'Overall Rating': ''}, {'Symbol': 'IJR', 'ETF Name': 'iShares Core S&P Small-Cap ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$44,864.19', 'YTD': '15.44%', 'Avg. Volume': '5,460,632', 'Previous Closing Price': '$80.02', 'Overall Rating': ''}, {'Symbol': 'IWM', 'ETF Name': 'iShares Russell 2000 ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$43,733.85', 'YTD': '17.09%', 'Avg. Volume': '26,073,227', 'Previous Closing Price': '$156.78', 'Overall Rating': ''}, {'Symbol': 'IWF', 'ETF Name': 'iShares Russell 1000 Growth ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$42,024.32', 'YTD': '12.77%', 'Avg. Volume': '2,928,795', 'Previous Closing Price': '$147.63', 'Overall Rating': ''}, {'Symbol': 'IWD', 'ETF Name': 'iShares Russell 1000 Value ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$38,059.34', 'YTD': '11.08%', 'Avg. Volume': '4,668,100', 'Previous Closing Price': '$123.35', 'Overall Rating': ''}, {'Symbol': 'BND', 'ETF Name': 'Vanguard Total Bond Market ETF', 'Asset ClassNew': 'Bond', 'Total Assets ($MM)': '$37,358.63', 'YTD': '1.02%', 'Avg. 
Volume': '2,981,882', 'Previous Closing Price': '$79.82', 'Overall Rating': ''}, {'Symbol': 'VUG', 'ETF Name': 'Vanguard Growth ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$36,989.43', 'YTD': '13.23%', 'Avg. Volume': '1,319,368', 'Previous Closing Price': '$152.10', 'Overall Rating': ''}, {'Symbol': 'EEM', 'ETF Name': 'iShares MSCI Emerging Markets ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$34,442.08', 'YTD': '8.65%', 'Avg. Volume': '89,719,367', 'Previous Closing Price': '$42.44', 'Overall Rating': ''}, {'Symbol': 'GLD', 'ETF Name': 'SPDR Gold Trust', 'Asset ClassNew': 'Commodity', 'Total Assets ($MM)': '$33,249.82', 'YTD': '2.26%', 'Avg. Volume': '8,782,053', 'Previous Closing Price': '$123.99', 'Overall Rating': ''}, {'Symbol': 'VNQ', 'ETF Name': 'Vanguard Real Estate Index Fund', 'Asset ClassNew': 'Real Estate', 'Total Assets ($MM)': '$32,604.41', 'YTD': '12.63%', 'Avg. Volume': '8,538,158', 'Previous Closing Price': '$83.99', 'Overall Rating': ''}, {'Symbol': 'VIG', 'ETF Name': 'Vanguard Dividend Appreciation ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$32,584.13', 'YTD': '11.19%', 'Avg. Volume': '1,610,603', 'Previous Closing Price': '$108.91', 'Overall Rating': ''}, {'Symbol': 'LQD', 'ETF Name': 'iShares iBoxx $ Investment Grade Corporate Bond ETF', 'Asset ClassNew': 'Bond', 'Total Assets ($MM)': '$32,520.19', 'YTD': '3.16%', 'Avg. Volume': '9,748,131', 'Previous Closing Price': '$116.03', 'Overall Rating': ''}, {'Symbol': 'VB', 'ETF Name': 'Vanguard Small Cap ETF', 'Asset ClassNew': 'Equity', 'Total Assets ($MM)': '$24,678.09', 'YTD': '17.14%', 'Avg. 
Volume': '1,025,374', 'Previous Closing Price': '$154.61', 'Overall Rating': ''}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}, {'Symbol': 'Export to CSV with ETFdb.com Pro'}]
此解决方案利用 while 循环不断点击“Next”(下一页)链接来遍历各个页面。这样可以处理站点提供的所有页面,而不必局限于只抓取前五个结果。
添加回答
举报