我有一个 JSON 文件,里面保存的是从网站抓取的数据。其中经常会出现重复的数据,例如,我提供了该 JSON 文件内容的一个片段。是否可以删除重复项,只保留第一次出现的那一条?我已把问题更新为我的完整代码,以防这对解答有影响。

# grabs all the trending quotes for that day
def getTrendingQuotes(browser):
    # wait until trending links appear, not really needed only for example
    all_trendingQuotes = WebDriverWait(browser, 10).until(
        lambda d: d.find_elements_by_css_selector('#trendingQuotes a')
    )
    return [link.get_attribute('href') for link in all_trendingQuotes]


def getStockDetails(url, browser):
    print(url)
    browser.get(url)
    quote_wrapper = browser.find_element_by_css_selector('div.quote-wrapper')
    quote_name = quote_wrapper.find_element_by_class_name(
        "quote-name").find_element_by_tag_name('h2').text
    quote_price = quote_wrapper.find_element_by_class_name("quote-price").text
    quote_volume = quote_wrapper.find_element_by_class_name(
        "quote-volume").text
    print("\n")
    print("Quote Name: " + quote_name)
    print("Quote Price: " + quote_price)
    print("Quote Volume: " + quote_volume)
    print("\n")
    convertToJson(quote_name, quote_price, quote_volume, url)


quotesArr = []


# Convert to a JSON file
def convertToJson(quote_name, quote_price, quote_volume, url):
    quoteObject = {
        "url": url,
        "Name": quote_name,
        "Price": quote_price,
        "Volume": quote_volume
    }
    quotesArr.append(quoteObject)


def trendingBot(url, browser):
    browser.get(url)
    trending = getTrendingQuotes(browser)
    for trend in trending:
        getStockDetails(trend, browser)
    # requests finished, write json to file
    with open('trendingQuoteData.json', 'w') as outfile:
        json.dump(quotesArr, outfile)


def Main():
    scheduler = BlockingScheduler()
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # applicable to windows os only
    chrome_options.add_argument('--disable-gpu')
添加回答
举报
0/150
提交
取消