
scrapy-redis distributed crawling problem

呼如林 2019-02-17 18:22:36
#!/usr/bin/env python3
__author__ = 'Stephen'

import scrapy
import json
import execjs

from Espider.tools.get_cookies import get_cookies
from Espider.items.jingzhunitem import jingzhun_item
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str


class jingzhun(RedisSpider):
    name = "jingzhun"
    # start_urls = ['https://rong.36kr.com/']
    redis_key = 'edcToSpider'
    custom_settings = {
        "RANDOM_DELAY": 5,
        "SCHEDULER": "scrapy_redis.scheduler.Scheduler",
        "DUPEFILTER_CLASS": 'scrapy_redis.dupefilter.RFPDupeFilter',
        "SCHEDULER_QUEUE_CLASS": 'scrapy_redis.queue.SpiderPriorityQueue',
        "SCHEDULER_PERSIST": True,
        "ITEM_PIPELINES": {
            'scrapy_redis.pipelines.RedisPipeline': 400
        },
        "REDIS_URL": 'redis://685e545f59634200:WutongaMINUS1968@685e545f59634200.m.cnqda.kvstore.aliyuncs.com:6379/27',
        "REDIS_HOST": '685e545f59634200.m.cnqda.kvstore.aliyuncs.com',
        "REDIS_PORT": 6379,
        "REDIS_PARAMS": {
            'password': '685e545f59634200:WutongaMINUS1968'
        }
    }

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(jingzhun, self).__init__(*args, **kwargs)
        self.cookie_str = "acw_tc=b65cfd2515395760831792797e7a30fed7278a95d7c68d0dcad0b9cbc4ac1b; kwlo_iv=1h; kr_stat_uuid=TRRfp25694452; Hm_lvt_e8ec47088ed7458ec32cde3617b23ee3=1541062621,1541150329,1541661241; Hm_lpvt_e8ec47088ed7458ec32cde3617b23ee3=1541667148; download_animation=1; _kr_p_se=9867c144-9614-4298-96f7-0e46ed5efefe; krid_user_id=2014445492; krid_user_version=2; kr_plus_id=2014445492; kr_plus_token=8dnyAhS2t87wW1PU1p91L_jUAHPFmepeJJ75____; kr_plus_utype=0; device-uid=5fa2cef0-e334-11e8-978f-67115035d613"
        self.headers = {
            "Referer": "https://rong.36kr.com/list/detail&?sortField=HOT_SCORE",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
            "Host": "rong.36kr.com"
        }
        self.co_headers = {
            "Referer": "https://rong.36kr.com/list/detail&?sortField=HOT_SCORE",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
            "Host": "rong.36kr.com",
            "cookie": self.cookie_str
        }
        self.js_read = open('./js/jingzhun.js', 'r').read()

    def make_request_from_data(self, data):
        url = bytes_to_str(self.redis_key, self.redis_encoding)
        return self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        yield scrapy.Request(
            url=url,
            callback=self.get_all_info,
            headers=self.headers,
            # dont_filter=True,
            cookies=get_cookies(self.cookie_str),
        )

The error output is as follows:

2018-11-18 01:24:35 [scrapy.middleware] INFO: Enabled item pipelines: ['scrapy_redis.pipelines.RedisPipeline']
2018-11-18 01:24:35 [scrapy.core.engine] INFO: Spider opened
2018-11-18 01:24:35 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-11-18 01:24:35 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-11-18 01:24:35 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method RefererMiddleware.request_scheduled of <scrapy.spidermiddlewares.referer.RefererMiddleware object at 0x7fa0e88d19b0>>
Traceback (most recent call last):
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
    *arguments, **named)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/pydispatch/robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/spidermiddlewares/referer.py", line 343, in request_scheduled
    redirected_urls = request.meta.get('redirect_urls', [])
AttributeError: 'generator' object has no attribute 'meta'

Unhandled Error
Traceback (most recent call last):
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/commands/crawl.py", line 58, in run
    self.crawler_process.start()
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/crawler.py", line 291, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 1261, in run
    self.mainLoop()
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 1270, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 896, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 135, in _next_request
    self.crawl(request, spider)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 210, in crawl
    self.schedule(request, spider)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 216, in schedule
    if not self.slot.scheduler.enqueue_request(request):
  File "/usr/lib/python3.4/site-packages/scrapy_redis/scheduler.py", line 162, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
builtins.AttributeError: 'generator' object has no attribute 'dont_filter'

2018-11-18 01:24:35 [twisted] CRITICAL: Unhandled Error
(the same traceback is logged again here)

2018-11-18 01:24:40 [jingzhun] DEBUG: Read 1 requests from 'edcToSpider'
2018-11-18 01:25:35 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

1 Answer

慕斯王


Try changing the yield in your code to return. Because make_requests_from_url contains a yield, calling it only creates a generator object; make_request_from_data then returns that generator instead of a Request, and the scheduler fails the moment it touches request.dont_filter on it, which is exactly the 'generator' object has no attribute 'dont_filter' error above.
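A minimal sketch of that change, rewriting only the last two methods of the jingzhun spider from the question (get_all_info and get_cookies come from the original code). One extra assumption: in scrapy_redis, make_request_from_data receives the raw bytes popped from Redis as data, so decoding data rather than self.redis_key is presumably what was intended here.

    def make_request_from_data(self, data):
        # `data` is the raw value popped from the Redis key,
        # so decode it instead of the key name itself.
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        # Return a single Request: scrapy_redis hands this return value
        # straight to the engine, which expects a Request object (with
        # .meta, .dont_filter, ...), not a generator.
        return scrapy.Request(
            url=url,
            callback=self.get_all_info,
            headers=self.headers,
            cookies=get_cookies(self.cookie_str),
        )

With return in place of yield, the scheduler receives a real Request, so both AttributeError tracebacks above should disappear.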

2019-03-01