1 回答
![?](http://img1.sycdn.imooc.com/5333a0780001a6e702200220-100-100.jpg)
TA贡献1810条经验 获得超4个赞
loadFinished 信号仅表示页面已加载完成,但此后页面仍可能动态创建更多 DOM 元素。id 为“DataTables_Table_0”的元素正是这种情况:它是在页面加载完成片刻之后才被创建的,因此在 loadFinished 触发时获取的 HTML 中还不包含它。
一个可能的解决方案是注入一个脚本来检查该元素是否存在,并发出通知以便获取 HTML。
import sys
from functools import cached_property
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel
from pprint import pprint
import bs4 as bs
def get_webchannel_source():
    """Return the text of the ``qwebchannel.js`` file from the Qt resource system.

    Returns:
        The decoded contents of ``:/qtwebchannel/qwebchannel.js``, or an
        empty string when the resource cannot be opened for reading.
    """
    resource = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
    if resource.open(QtCore.QIODevice.ReadOnly):
        raw = resource.readAll()
        resource.close()
        # readAll() yields a QByteArray; .data() exposes it as Python bytes.
        return raw.data().decode()
    return ""
class Manager(QtCore.QObject):
    """Load a URL in a QWebEngineView and capture the page HTML on demand.

    The object registers itself on a QWebChannel under the name
    ``"manager"`` so that JavaScript injected via :meth:`set_script` can call
    :meth:`save_html` once the page has reached the state of interest.
    :meth:`start` blocks in the Qt event loop until that happens; the
    captured markup is then available through :attr:`html`.

    Args:
        offline: When true, use a freshly constructed ``QWebEngineProfile``
            (off-the-record according to the Qt documentation); otherwise
            use ``QWebEngineProfile.defaultProfile()``.
        visible: When false, the view is shown with
            ``WA_DontShowOnScreen`` set so no window appears on screen.
        parent: Optional QObject parent.
    """

    def __init__(self, *, offline=True, visible=False, parent=None):
        super().__init__(parent)
        self._html = ""
        self._is_finished = False

        # Touch the cached property so the application object exists before
        # any profile or widget is created.
        self.app

        self._profile = (
            QtWebEngineWidgets.QWebEngineProfile()
            if offline
            else QtWebEngineWidgets.QWebEngineProfile.defaultProfile()
        )

        self.view.resize(640, 480)
        if not visible:
            self.view.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
        self.view.show()

        # Expose this object to page scripts as ``channel.objects.manager``.
        self.webchannel.registerObject("manager", self)
        self.view.page().setWebChannel(self.webchannel)

    @cached_property
    def app(self):
        """Return the application object, creating one only if none exists.

        Reusing ``QApplication.instance()`` avoids constructing a second
        application when the host process (or an earlier ``Manager``) has
        already created one.
        """
        existing = QtWidgets.QApplication.instance()
        if existing is not None:
            return existing
        return QtWidgets.QApplication(sys.argv)

    @property
    def profile(self):
        """Return the QWebEngineProfile selected in ``__init__``."""
        return self._profile

    @cached_property
    def view(self):
        """Return the QWebEngineView, built once with a page on :attr:`profile`."""
        view = QtWebEngineWidgets.QWebEngineView()
        page = QtWebEngineWidgets.QWebEnginePage(self.profile, self)
        view.setPage(page)
        return view

    @cached_property
    def webchannel(self):
        """Return the QWebChannel (parented to this object), built once."""
        return QtWebChannel.QWebChannel(self)

    @property
    def html(self):
        """Return the HTML captured by the last run, or ``""`` if none yet."""
        return self._html

    def set_script(self, script):
        """Register ``script`` for injection into pages of :attr:`profile`.

        The source of ``qwebchannel.js`` is prepended so that ``QWebChannel``
        is defined when ``script`` runs. The script is injected at
        ``DocumentReady`` in the main world.

        Args:
            script: JavaScript source text.
        """
        qscript = QtWebEngineWidgets.QWebEngineScript()
        qscript.setName("qscript")
        qscript.setSourceCode(get_webchannel_source() + "\n" + script)
        qscript.setInjectionPoint(QtWebEngineWidgets.QWebEngineScript.DocumentReady)
        qscript.setWorldId(QtWebEngineWidgets.QWebEngineScript.MainWorld)
        self.profile.scripts().insert(qscript)

    def start(self, url):
        """Load ``url`` and block in the event loop until the HTML is captured.

        Args:
            url: Address to load; parsed with ``QUrl.fromUserInput``.
        """
        # Reset per-run state. Without this, a second start() on the same
        # instance would leave _is_finished set, save_html() would do
        # nothing, quit() would never be called and exec_() would not return.
        self._html = ""
        self._is_finished = False
        self.view.load(QtCore.QUrl.fromUserInput(url))
        self.app.exec_()

    @QtCore.pyqtSlot()
    def save_html(self):
        """Request the page HTML once per run; later calls are ignored.

        Declared as a slot so it can be invoked from JavaScript through the
        web channel.
        """
        if not self._is_finished:
            # toHtml() delivers its result to the callback rather than
            # returning it.
            self.view.page().toHtml(self.html_callable)
            self._is_finished = True

    def html_callable(self, html):
        """Store the delivered markup and quit the event loop.

        Args:
            html: Page HTML passed by ``QWebEnginePage.toHtml``.
        """
        self._html = html
        self.app.quit()
# JavaScript injected into the page (see Manager.set_script).
#
# wait_qt: retries every 100 ms until the global `qt` is defined, then opens
#   a QWebChannel over qt.webChannelTransport and stores the exported
#   `manager` object before starting find_element.
# find_element: retries every 100 ms until both the element with id
#   'DataTables_Table_0' and `manager` are non-null, then calls
#   manager.save_html().
JS = """
var manager = null;
function find_element() {
var e = document.getElementById('DataTables_Table_0');
console.log("try verify", e, manager);
if (e != null && manager != null) {
console.log(e)
manager.save_html()
} else {
setTimeout(find_element, 100);
}
}
(function wait_qt() {
if (typeof qt != 'undefined') {
console.log("Qt loaded");
new QWebChannel(qt.webChannelTransport, function (channel) {
manager = channel.objects.manager;
find_element();
});
} else {
setTimeout(wait_qt, 100);
}
})();
"""
def main():
    """Fetch the Fix Central page and pretty-print its ``DataTables_Table_0`` table."""
    target_url = "https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all"

    scraper = Manager()
    scraper.set_script(JS)
    scraper.start(target_url)

    # Parse the markup captured by the manager and pick out the table by id.
    document = bs.BeautifulSoup(scraper.html, "html.parser")
    table = document.find("table", attrs={"id": "DataTables_Table_0"})
    pprint(table)


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
添加回答
举报