为了账号安全,请及时绑定邮箱和手机立即绑定

PyQt5加载网页内容时返回None值

PyQt5加载网页内容时返回None值

慕的地8271018 2023-08-15 18:44:54
我正在尝试获取网页某一部分的内容。该部分中的数据由 JavaScript 动态加载。我在这里找到了一些代码,并对其进行了修改,但是运行脚本时得到的结果是 None。

这是代码:

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from pprint import pprint

class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()

def main():
    page = Page('https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all')
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    section = soup.find('table', {'id' : 'DataTables_Table_0'})
    pprint (section)

if __name__ == '__main__': main()

这是输出:

Load finished
None
查看完整描述

1 回答

?
蝴蝶不菲

TA贡献1810条经验 获得超4个赞

loadFinished 信号仅表示页面本身已加载完成,但此后仍可能由 JavaScript 创建更多 DOM 元素。id 为“DataTables_Table_0”的元素正是这种情况:它是在页面加载完成之后才被创建的。


一个可能的解决方案是向页面注入一段脚本,由它反复检查该元素是否存在,并在元素出现后通知 Python 端去获取 HTML。


import sys

from functools import cached_property


from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel


from pprint import pprint

import bs4 as bs



def get_webchannel_source():
    """Return the text of the ``qwebchannel.js`` Qt resource.

    Opens ``:/qtwebchannel/qwebchannel.js`` read-only, reads the whole file
    and decodes the bytes using ``decode()``'s default encoding.

    Returns:
        The script source as a string, or ``""`` when the resource cannot be
        opened.
    """
    resource = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
    if resource.open(QtCore.QIODevice.ReadOnly):
        raw = resource.readAll()
        resource.close()
        return raw.data().decode()
    # Opening failed: callers get an empty script instead of an exception.
    return ""



class Manager(QtCore.QObject):
    """Drive a web view until injected JavaScript asks for the page HTML.

    The instance registers itself on a ``QWebChannel`` under the name
    ``"manager"`` so that a script running in the page can call
    :meth:`save_html`. That call captures the page's HTML and quits the
    application event loop started by :meth:`start`.

    Args:
        offline: When true, a newly constructed ``QWebEngineProfile`` is
            used; otherwise ``QWebEngineProfile.defaultProfile()``.
        visible: When false, the view gets ``WA_DontShowOnScreen`` before it
            is shown.
        parent: Optional ``QObject`` parent.
    """

    def __init__(self, *, offline=True, visible=False, parent=None):
        super().__init__(parent)
        self._html = ""
        self._is_finished = False

        # Evaluate the cached property now so the QApplication exists
        # before the profile and the view widget are created.
        self.app

        if offline:
            self._profile = QtWebEngineWidgets.QWebEngineProfile()
        else:
            self._profile = QtWebEngineWidgets.QWebEngineProfile.defaultProfile()

        browser = self.view
        browser.resize(640, 480)
        if not visible:
            browser.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
        browser.show()

        # Expose this object to page scripts as ``channel.objects.manager``.
        channel = self.webchannel
        channel.registerObject("manager", self)
        browser.page().setWebChannel(channel)

    @cached_property
    def app(self):
        """The ``QApplication``, created from ``sys.argv`` on first access."""
        return QtWidgets.QApplication(sys.argv)

    @property
    def profile(self):
        """The web engine profile selected in ``__init__``."""
        return self._profile

    @cached_property
    def view(self):
        """The ``QWebEngineView``, built on first access.

        Its page is a ``QWebEnginePage`` created with :attr:`profile` and
        this object as parent.
        """
        web_view = QtWebEngineWidgets.QWebEngineView()
        web_page = QtWebEngineWidgets.QWebEnginePage(self.profile, self)
        web_view.setPage(web_page)
        return web_view

    @cached_property
    def webchannel(self):
        """The ``QWebChannel`` (parented to this object), built on first access."""
        return QtWebChannel.QWebChannel(self)

    @property
    def html(self):
        """The HTML captured so far; ``""`` until :meth:`html_callable` runs."""
        return self._html

    def set_script(self, script):
        """Register ``script`` on the profile as a user script.

        The source of ``qwebchannel.js`` is prepended (separated by a
        newline). The script is named ``"qscript"``, injected at
        ``DocumentReady`` and run in ``MainWorld``.

        Args:
            script: JavaScript source to append after ``qwebchannel.js``.
        """
        script_type = QtWebEngineWidgets.QWebEngineScript
        injected = script_type()
        injected.setName("qscript")
        injected.setSourceCode("\n".join((get_webchannel_source(), script)))
        injected.setInjectionPoint(script_type.DocumentReady)
        injected.setWorldId(script_type.MainWorld)
        self.profile.scripts().insert(injected)

    def start(self, url):
        """Load ``url`` in the view and run the application event loop.

        Blocks until the event loop exits (``html_callable`` quits it).

        Args:
            url: Address passed through ``QUrl.fromUserInput``.
        """
        target = QtCore.QUrl.fromUserInput(url)
        self.view.load(target)
        self.app.exec_()

    @QtCore.pyqtSlot()
    def save_html(self):
        """Request the page HTML once; later calls are ignored.

        Declared as a Qt slot; the result is delivered to
        :meth:`html_callable`.
        """
        if self._is_finished:
            return
        self.view.page().toHtml(self.html_callable)
        self._is_finished = True

    def html_callable(self, html):
        """Store the received HTML and quit the application.

        Args:
            html: The page HTML passed in by ``toHtml``.
        """
        self._html = html
        self.app.quit()



# JavaScript source handed to Manager.set_script(), which prepends the
# qwebchannel.js source before installing it as a user script.
#
# wait_qt() re-schedules itself every 100 ms until the global `qt` is
# defined, then opens a QWebChannel over qt.webChannelTransport and stores
# channel.objects.manager in `manager`.
# find_element() re-schedules itself every 100 ms until both `manager` is set
# and an element with id 'DataTables_Table_0' exists, then calls
# manager.save_html() and stops polling.
JS = """

var manager = null;


function find_element() {

  var e = document.getElementById('DataTables_Table_0');

  console.log("try verify", e, manager);

  if (e != null && manager != null) {

    console.log(e)

    manager.save_html()

  } else {

    setTimeout(find_element, 100);

  }

}


(function wait_qt() {

  if (typeof qt != 'undefined') {

    console.log("Qt loaded");

    new QWebChannel(qt.webChannelTransport, function (channel) {

      manager = channel.objects.manager;

      find_element();

    });

  } else {

    setTimeout(wait_qt, 100);

  }

})();

"""



def main():
    """Fetch the Fix Central page and pretty-print its results table.

    Builds a ``Manager`` with default options, installs the ``JS`` script,
    and blocks in ``start()`` until the HTML has been captured. The HTML is
    then parsed with BeautifulSoup and the ``<table>`` whose id is
    ``DataTables_Table_0`` is pretty-printed (``None`` is printed when no
    such table is found).
    """
    target_url = (
        "https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all"
    )

    scraper = Manager()
    scraper.set_script(JS)
    scraper.start(target_url)

    document = bs.BeautifulSoup(scraper.html, "html.parser")
    pprint(document.find("table", {"id": "DataTables_Table_0"}))



# Call main() only when this file is executed as a script, not on import.
if __name__ == "__main__":

    main()



查看完整回答
反对 回复 2023-08-15
  • 1 回答
  • 0 关注
  • 145 浏览
慕课专栏
更多

添加回答

举报

0/150
提交
取消
意见反馈 帮助中心 APP下载
官方微信