
Only two entries are output, and even after removing the try block no error is raised... I'm about to cry. Could some kind expert please take a look?

(1)spider_main

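(The code originally pasted under this heading was a second copy of url_manager from (5), so spider_main itself is missing from the post. A minimal driver consistent with the other four modules might look like the sketch below; the module layout, seed URL, crawl limit, and the try/except mentioned in the question are all assumptions, not the original code.)

#coding:utf8
import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        # wire up the four collaborating modules
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # assumed crawl limit
                    break
                count = count + 1
            except Exception as e:
                # printing the reason beats a bare "except: print 'craw failed'";
                # note that a pure logic bug raises nothing at all, so removing
                # the try will not surface it either
                print 'craw failed: %s' % e
        self.outputer.output_html()


if __name__ == "__main__":
    obj_spider = SpiderMain()
    obj_spider.craw("https://baike.baidu.com/item/Python")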

(2)html_downloader

#coding:utf8
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = urllib2.urlopen(url)

        # treat anything other than HTTP 200 as a failed download
        if response.getcode() != 200:
            return None

        return response.read()
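When the output is mysteriously short, it helps to exercise each module on its own. A quick check of the downloader in isolation (assuming the file is named html_downloader.py; the URL is just an illustrative seed):

#coding:utf8
from html_downloader import HtmlDownloader

html = HtmlDownloader().download("https://baike.baidu.com/item/Python")
if html:
    print 'downloaded %d bytes' % len(html)
else:
    print 'download failed'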

(3)html_parser

#coding:utf8
from bs4 import BeautifulSoup
import re
import urlparse
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # links look like /item/%E6%95%99%E5%AD%A6 and resolve to
        # https://baike.baidu.com/item/%E6%95%99%E5%AD%A6
        links = soup.find_all('a', href=re.compile(r"/item/."))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        # return only after the loop finishes; in the original paste this
        # return sat inside the loop, so just the first link of each page
        # was collected, the likely cause of only two entries being output
        return new_urls

    def _get_new_data(self, page_url, soup):
        # collect the extracted fields in res_data
        res_data = {}

        # record the page url in the final data
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
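As the comment in _get_new_urls above notes, a return indented inside a for loop exits on the very first iteration. Python raises nothing, so stripping the try/except shows no traceback, which matches the symptom exactly. A minimal standalone illustration:

def first_only(items):
    found = set()
    for item in items:
        found.add(item)
        return found    # leaves the loop on the first pass, no error raised


def whole_loop(items):
    found = set()
    for item in items:
        found.add(item)
    return found        # runs the loop to completion


print first_only(['a', 'b', 'c'])   # set(['a'])
print whole_loop(['a', 'b', 'c'])   # set(['a', 'b', 'c'])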

(4)html_outputer

#coding:utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')

        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")

        # Python 2's default encoding is ascii, so utf-8 output needs an
        # explicit .encode('utf-8')
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")

        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")

        fout.close()

(5)url_manager

#coding:utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # urls waiting to be crawled
        self.old_urls = set()   # urls already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # set.pop() removes and returns an arbitrary url
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
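A quick interactive check of the manager's de-duplication and pop behaviour (assuming the file is named url_manager.py; the urls are placeholders):

#coding:utf8
from url_manager import UrlManager

manager = UrlManager()
manager.add_new_urls(['u1', 'u2', 'u1'])  # the duplicate 'u1' is dropped
print manager.has_new_url()               # True
print manager.get_new_url()               # 'u1' or 'u2', set.pop() order is arbitrary
print manager.has_new_url()               # True, one url still waiting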

    

 
