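The numbers you are after are not in the page's static HTML; they are filled in by JS after the page loads. To scrape them, the JS has to be executed first, and only then can the resulting markup be parsed.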
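One way to do this is with PyQt4. Following the approach described on webscraping.com, render the page to HTML and then parse it with BeautifulSoup, like this: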
(See also the webscraping library for Python, which packages this rendering technique.)
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """Load ``url`` in a WebKit page and block until loading has finished.

    Constructing an instance starts a Qt event loop and only returns once
    the ``loadFinished`` signal has fired. Afterwards the loaded frame is
    available as ``self.frame`` (e.g. ``self.frame.toHtml()``).
    """

    def __init__(self, url):
        """Create the Qt application, start loading ``url`` and wait for it."""
        # The application object is created before the page is initialised;
        # keep this order.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)

        # Hook the completion signal up before the load is kicked off.
        self.loadFinished.connect(self._loadFinished)

        target = QUrl(url)
        main_frame = self.mainFrame()
        main_frame.load(target)

        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame and stop the event loop.

        ``result`` is the flag passed by the ``loadFinished`` signal; it is
        not inspected here.
        """
        self.frame = self.mainFrame()
        self.app.quit()
# Render the page (executing its JS) and read the numbers out of the
# resulting HTML.
url = 'http://hcavirginia.com/home/'
r = Render(url)
soup = BeautifulSoup(unicode(r.frame.toHtml()))
# find_all() yields Tag objects, which int() cannot convert directly;
# convert the text inside each <span class="ehc-er-digits"> instead.
nums = [
    int(span.get_text())
    for span in soup.find_all('span', class_='ehc-er-digits')
]
# Parenthesised form prints the same on Python 2 for a single value.
print(nums)
Output:
[21, 23, 47, 11, 10, 8, 68, 56, 19, 15, 7]
Alternatively, using ghost.py:
Here is the same scrape done with ghost.py. (Tested on Python 2.7 with ghost.py 0.1b3 and PyQt4-4, 32-bit.)
# Scrape the 'ehc-er-digits' values from the page with ghost.py: open the
# page, wait for the target elements to exist, read their contents via JS,
# then convert them to ints and print them.
from ghost import Ghost
from time import sleep
# wait_timeout=50 is presumably in seconds -- confirm against the ghost.py
# version in use. Image downloads are turned off.
ghost = Ghost(wait_timeout=50, download_images=False)
# Open the page, sending an explicit User-Agent header.
page, extra_resources = ghost.open('http://hcavirginia.com/home/',
headers={'User-Agent': 'Mozilla/4.0'})
# Wait until at least one element matching span.ehc-er-digits is present.
page, resources = ghost.wait_for_selector("span.ehc-er-digits")
# Run JS in the page: copy the innerHTML of every 'ehc-er-digits' element
# into an array and make that array the value of the script.
nums, resources = ghost.evaluate(
"""
elems = document.getElementsByClassName('ehc-er-digits');
nums = []
for (i = 0; i < elems.length; ++i) {
nums[i] = elems[i].innerHTML;
}
nums;
""")
# Each collected innerHTML value is converted with int().
wt_data = [int(x) for x in nums]
print wt_data
# Pause for 30 seconds before the script ends.
sleep(30)
Notes:
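Note that calling Ghost.evaluate("document.getElementsByClassName('ehc-er-digits');") directly gives back a dict rather than a plain list of values, so the script instead copies each element's innerHTML into an array inside the JS and returns that array.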
At the end, the script sleeps for 30 seconds before exiting.