-
开发环境。
查看全部 -
scrapy
查看全部 -
可以通过 rz 命令打开文件选择窗口,将已下载好的文件导入(上传)到服务器
查看全部 -
在指定路径下编译安装 Python3 及各种依赖包
查看全部 -
安装openssl-devel
查看全部 -
scrapy爬虫框架的课程大纲
查看全部 -
class DoubanSpiderSpider(scrapy.Spider):
    """Spider that walks the movie.douban.com Top 250 listing page by page.

    For every ``<li>`` in the ``grid_view`` list it yields one ``DoubanItem``,
    then follows the "next" link of the page, if there is one.
    """

    name = 'douban_spider'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Default callback: extract the items on this page, then paginate.

        Args:
            response: the downloaded listing page.

        Yields:
            A ``DoubanItem`` per list entry, followed by a ``scrapy.Request``
            for the next page when the page exposes a next link.
        """
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = i_item.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()

            # The first <p> holds several text nodes. Strip all whitespace
            # inside each node and keep every non-empty one; the previous
            # loop overwrote the field on each pass and kept only the last
            # node. With no text nodes the field is now an empty string
            # instead of being left unset.
            content = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            fragments = (''.join(i_content.split()) for i_content in content)
            douban_item['introduce'] = ' '.join(part for part in fragments if part)

            douban_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            douban_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']/span[4]/text()").extract_first()
            douban_item['describe'] = i_item.xpath(
                ".//p[@class='quote']//span/text()").extract_first()
            yield douban_item

        # Pagination: take the href of the "next" element and resolve it
        # against the current page URL rather than a hard-coded base, so
        # relative and absolute hrefs are both handled.
        next_link = response.xpath("//span[@class='next']/link/@href").extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse)
查看全部 -
class DoubanSpiderSpider(scrapy.Spider):
    """Spider that walks the movie.douban.com Top 250 listing page by page.

    For every ``<li>`` in the ``grid_view`` list it yields one ``DoubanItem``,
    then follows the "next" link of the page, if there is one.
    """

    name = 'douban_spider'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Default callback: extract the items on this page, then paginate.

        Args:
            response: the downloaded listing page.

        Yields:
            A ``DoubanItem`` per list entry, followed by a ``scrapy.Request``
            for the next page when the page exposes a next link.
        """
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = i_item.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()

            # The first <p> holds several text nodes. Strip all whitespace
            # inside each node and keep every non-empty one; the previous
            # loop overwrote the field on each pass and kept only the last
            # node. With no text nodes the field is now an empty string
            # instead of being left unset.
            content = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            fragments = (''.join(i_content.split()) for i_content in content)
            douban_item['introduce'] = ' '.join(part for part in fragments if part)

            douban_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            douban_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']/span[4]/text()").extract_first()
            douban_item['describe'] = i_item.xpath(
                ".//p[@class='quote']//span/text()").extract_first()
            yield douban_item

        # Pagination: take the href of the "next" element and resolve it
        # against the current page URL rather than a hard-coded base, so
        # relative and absolute hrefs are both handled.
        next_link = response.xpath("//span[@class='next']/link/@href").extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse)
查看全部
举报
0/150
提交
取消