Slow page scrolling with Selenium

I am trying to scrape some data from a flight search page.

This page works as follows:

You fill out the form and then click the search button - so far, so good. When you click the button, you are redirected to the results page, and that is where the problem starts. This page keeps adding results for some time (for example, for about a minute); the important part - and the problem - is getting all of these results. In a real browser you have to scroll down the page for these results to appear, so I tried to scroll down using Selenium. It does reach the bottom of the page, but probably so fast - or as a single jump rather than a scroll - that the page does not load any new results.

When you scroll slowly, the page loads more results, but if you scroll very quickly, it stops loading them.

I'm not sure if my code helps to figure this out, so I am attaching it.

# URL template for the search request. "URL" is only a placeholder here:
# prepare_get fills the template with six values using the % operator, so the
# real string needs six format slots (e.g. %s).
SEARCH_STRING = """URL"""

class spider(object):
    """Selenium-driven scraper for a flight-search results page.

    An instance owns one Firefox session. ``find_flights_html`` loads the
    results page and returns its HTML; the static helpers parse that HTML and
    print the departure/arrival details of every flight box they find.
    """

    def __init__(self):
        # Starts a real Firefox session; nothing in this class closes it.
        self.driver = webdriver.Firefox()

    @staticmethod
    def prepare_get(dep_airport, arr_airport, dep_date, arr_date):
        """Return the search URL built from the SEARCH_STRING template.

        The template receives six values, in this order: departure airport,
        arrival airport, arrival airport, departure airport, departure date,
        arrival date.
        """
        return SEARCH_STRING % (dep_airport, arr_airport, arr_airport,
                                dep_airport, dep_date, arr_date)

    def find_flights_html(self, dep_airport, arr_airport, dep_date, arr_date):
        """Load the results page for one search and return its HTML source.

        ``dep_airport`` may be a single code or a list of codes; a list is
        joined with a URL-encoded space (``%20``) before the URL is built.

        After the page is requested, each of the two "loading" indicators is
        given up to 60 seconds to disappear. The page is then moved to the
        bottom in one step (``window.scrollTo`` followed by Ctrl+End) and the
        current page source is returned. Whatever ``WebDriverWait.until``
        raises on timeout propagates to the caller.
        """
        if isinstance(dep_airport, list):
            dep_airport = '%20'.join(dep_airport)

        wait = WebDriverWait(self.driver, 60)  # wait for results
        self.driver.get(spider.prepare_get(dep_airport, arr_airport, dep_date, arr_date))
        wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
        wait.until(EC.invisibility_of_element_located((By.XPATH, u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))

        # Both calls jump straight to the end of the document in one step.
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        # find_element(By, value) works across Selenium releases; the
        # find_element_by_* helpers were removed in Selenium 4.
        self.driver.find_element(By.XPATH, '//body').send_keys(Keys.CONTROL + Keys.END)
        return self.driver.page_source

    @staticmethod
    def get_info_from_borderbox(div):
        """Print the departure and arrival date/airport of one flight box.

        ``div`` must offer ``find_all(name, class_=...)`` returning elements
        with a ``contents`` list whose items have a ``text`` attribute
        (presumably a BeautifulSoup tag - verify against the caller).

        The second ``departure`` div and the first ``arrival`` div are read;
        the fixed indices into ``contents`` mirror the markup of the page.
        Raises IndexError when the expected elements are missing.
        """
        # NOTE: the earlier version also looked up the price box and the first
        # ``departure`` div but never used either value; those lookups could
        # only fail, so they are gone.
        departure = div.find_all('div', class_='departure')[1].contents
        date_departure = departure[1].text
        airport_departure = departure[5].text

        arrival = div.find_all('div', class_='arrival')[0].contents
        date_arrival = arrival[1].text
        # The first character of the arrival airport text is dropped.
        airport_arrival = arrival[3].text[1:]

        # Single-argument print() calls behave the same on Python 2 and 3.
        print('DEPARTURE: ')
        print('%s %s' % (date_departure, airport_departure))
        print('ARRIVAL: ')
        print('%s %s' % (date_arrival, airport_arrival))

    @staticmethod
    def get_flights_from_result_page(html):
        """Print the details of every flight box in ``html``, then the count.

        A flight box is a ``div`` carrying all three classes ``borderbox``,
        ``flightbox`` and ``p2``. Returns None.
        """

        def match_tag(tag, classes):
            # True for a <div> whose class attribute contains every name.
            return (tag.name == 'div'
                    and 'class' in tag.attrs
                    and all(c in tag['class'] for c in classes))

        # mLib.getSoup_html is a project helper; presumably it returns a
        # BeautifulSoup document for the given HTML - verify.
        soup = mLib.getSoup_html(html)
        divs = soup.find_all(lambda t: match_tag(t, ['borderbox', 'flightbox', 'p2']))

        for div in divs:
            spider.get_info_from_borderbox(div)

        print(len(divs))


spider_inst = spider()

try:
    # get_flights_from_result_page prints what it finds and returns nothing,
    # so wrapping the call in print would only add a stray "None".
    spider.get_flights_from_result_page(
        spider_inst.find_flights_html(['BTS', 'BRU', 'PAR'], 'MAD', '2015-07-15', '2015-08-15'))
finally:
    # Close the Firefox session even when the scrape fails.
    spider_inst.driver.quit()

So the main problem, in my opinion, is that it scrolls too fast to trigger the loading of new results.

Do you have any ideas how to make it work?

+3
source share
3 answers

Here is another approach that worked for me: scroll the last search result into view and wait for additional elements to load before scrolling again:

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements(object):
    """Expected condition: more than ``count`` elements match ``locator``.

    Intended for ``WebDriverWait.until``: the wait calls the instance with the
    driver until it returns True.

    ``locator`` is a ``(by, value)`` tuple such as
    ``(By.CSS_SELECTOR, 'div.flightbox')``; ``count`` is the number of
    matching elements that must be exceeded.
    """

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Public locator API; avoids the private EC._find_elements helper.
            found = len(driver.find_elements(*self.locator))
        except StaleElementReferenceException:
            # The DOM changed while it was being queried; let the wait retry.
            return False
        # Strictly greater: with ">=" the condition is already satisfied when
        # ``count`` is the current number of elements, so nothing is waited for.
        return found > self.count


# Browser session used for the whole scrape.
driver = webdriver.Firefox()

# Search parameters: several departure airports, one arrival airport.
dep_airport = ['BTS', 'BRU', 'PAR']
arr_airport = 'MAD'
dep_date = '2015-07-15'
arr_date = '2015-08-15'

# Several airport codes share one URL parameter, separated by a URL-encoded
# space.
airports_string = '%20'.join(dep_airport)
dep_airport = airports_string

url = "https://www.pelikan.sk/sk/flights/list?dfc=C%s&dtc=C%s&rfc=C%s&rtc=C%s&dd=%s&rd=%s&px=1000&ns=0&prc=&rng=1&rbd=0&ct=0" % (dep_airport, arr_airport, arr_airport, dep_airport, dep_date, arr_date)
driver.maximize_window()
driver.get(url)

# Give each of the two "loading" indicators up to 60 seconds to disappear
# before touching the results.
wait = WebDriverWait(driver, 60)
wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
wait.until(EC.invisibility_of_element_located((By.XPATH,
                                               u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))

while True:  # TODO: make the endless loop end
    # find_elements(By, value) works across Selenium releases; the
    # find_elements_by_* helpers were removed in Selenium 4.
    results = driver.find_elements(By.CSS_SELECTOR, "div.flightbox")
    print("Results count: %d" % len(results))

    # Nothing to scroll to: results[-1] would raise IndexError.
    if not results:
        break

    # scroll to the last element
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])

    # wait for more results to load
    wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, 'div.flightbox'), len(results)))

Notes:

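  • you need to decide when to stop the loop - for example, once len(results) reaches a certain value
  • wait_for_more_than_n_elements is a custom expected condition that tells the wait when additional results have been loaded, so that it is time to scroll again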
+1

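Alternatively, scroll down in fixed steps, pausing after each step so that the page has time to load more results: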

import time

# Scroll down in 50 steps of 1000 px, pausing one second after each step so
# the page has time to load further results.
for y in range(1000, 51000, 1000):
    driver.execute_script("window.scrollTo(0, " + str(y) + ")")
    time.sleep(1)

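Here 1000 is the scroll step in pixels; adjust the step, the number of iterations and the pause to suit the page.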

+2

Another option is to scroll down smoothly in small increments until the bottom of the page is reached:

    def __scroll_down_page(self, speed=8):
        """Scroll the page down gradually until the bottom is reached.

        ``speed`` is the number of pixels added to the scroll position on each
        step. The document height is read again after every step, so content
        appended while scrolling extends the loop.

        Raises ValueError when ``speed`` is not positive, because the scroll
        position would never reach the page height.
        """
        if speed <= 0:
            raise ValueError("speed must be a positive number of pixels")

        # Start with a height above the position so the loop runs at least once.
        current_scroll_position, new_height = 0, 1
        while current_scroll_position <= new_height:
            current_scroll_position += speed
            self.__driver.execute_script("window.scrollTo(0, {});".format(current_scroll_position))
            new_height = self.__driver.execute_script("return document.body.scrollHeight")
0

Source: https://habr.com/ru/post/1624136/


All Articles