1 Answer
This is quite simple:
Collect every category link into all_categories, but instead of crawling all of them at once, crawl only the first category link; once every page of that category has been scraped, send a request to the next category link.
Here is the code. I haven't run it, so there may be a few syntax errors, but what matters is the logic:
import scrapy

from ..items import ScrapperPccomItem  # adjust to wherever your project defines its items


class PccomSpider(scrapy.Spider):
    name = 'pccom'
    allowed_domains = ['pccomponentes.com']
    start_urls = ['https://www.pccomponentes.com/componentes']

    all_categories = []

    # Pop the next category off the queue and build a request for it;
    # returns None once every category has been scraped
    def yield_category(self):
        if self.all_categories:
            url = self.all_categories.pop()
            self.logger.info("Scraping category %s", url)
            return scrapy.Request(url, self.parse_item_list)
        self.logger.info("all done")
        return None

    # Scrapes the links of every category from the main page,
    # then starts crawling the first category only
    def parse(self, response):
        categories = response.xpath('//a[contains(@class,"enlace-secundario")]/@href')
        self.all_categories = [response.urljoin(category.extract()) for category in categories]
        request = self.yield_category()
        if request:
            yield request

    # Scrapes the products from every page of one category
    def parse_item_list(self, response):
        products = response.xpath('//article[contains(@class,"tarjeta-articulo")]')
        for product in products:
            item = ScrapperPccomItem()
            item['name'] = product.xpath('@data-name').extract()
            item['price'] = product.xpath('@data-price').extract()
            yield item

        # URL of the next page of the current category
        next_page = response.xpath('//div[@id="pager"]//li[contains(@class,"c-paginator__next")]//a/@href').extract_first()
        if next_page:
            next_url = response.urljoin(next_page)
            yield scrapy.Request(next_url, self.parse_item_list)
        else:
            self.logger.info("All pages of this category scraped, now scraping next category")
            request = self.yield_category()
            if request:
                yield request
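For completeness, here is a minimal sketch of the ScrapperPccomItem referenced above, assuming a standard Scrapy items.py; the field names simply mirror the keys filled in parse_item_list:

# items.py -- a minimal sketch, adjust the class/module names to your project
import scrapy

class ScrapperPccomItem(scrapy.Item):
    name = scrapy.Field()   # filled from the article's data-name attribute
    price = scrapy.Field()  # filled from the article's data-price attribute

With that in place, running the spider from the project root with scrapy crawl pccom -o products.json should walk the categories one at a time, exhausting each category's pagination before moving on to the next.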