Python + requests + splinter: what is the fastest / best way to make multiple simultaneous GET requests?

I'm currently taking a web scraping class with other students, and we are supposed to make GET requests to a dummy site, parse it, and then visit another site.

The problem is that the content on the dummy site is only up for a few minutes before it disappears, and then comes back at a certain interval. While the content is up, everyone is trying to make GET requests at the same time, so mine just hangs until the load clears, and by then the content has disappeared again. As a result, I can't successfully complete the GET request:

import requests
from splinter import Browser    

browser = Browser('chrome')

# Hangs here
requests.get('http://dummysite.ca').text
# Even if the get succeeds, this hangs as well
browser.visit(parsed_url)

So my question is: what is the fastest / best way to keep making concurrent GET requests until I get a response?

+6
5

If you just need to keep retrying until the request succeeds, something like this will do:

import time
import requests

while True:
    try:
        response = requests.get('http://dummysite.ca')
    except requests.exceptions.RequestException:
        response = None  # connection failed - treat as unsuccessful
    if response is not None and response.status_code == 200:
        break
    time.sleep(1)
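
If you would rather not hand-roll that loop, requests can also retry automatically through urllib3's Retry class. This sketch is an addition, with illustrative retry numbers and the dummy URL from the question:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 10 times on connection errors and on 502/503/504 responses,
# with an increasing backoff between attempts
retries = Retry(total=10, backoff_factor=1, status_forcelist=[502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

response = session.get('http://dummysite.ca', timeout=10)
print(response.status_code)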
+1

Gevent can help here.

It is pure Python and, because it monkey-patches the standard library, blocking libraries such as requests and splinter work with it without modification.

Here is a short example of how to make 10 concurrent requests:

from gevent import monkey
monkey.patch_all()
import gevent.pool
import requests

pool = gevent.pool.Pool(size=10)
greenlets = [pool.spawn(requests.get, 'http://dummysite.ca')
             for _ in range(10)]
# Wait for all requests to complete
pool.join()
for greenlet in greenlets:
    # This will raise any exceptions raised by the request
    # Need to catch errors, or check if an exception was
    # thrown by checking `greenlet.exception`
    response = greenlet.get()
    text_response = response.text

You can also use map with a response-handling function instead of calling get on each greenlet, as in the sketch below.
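
A minimal sketch of that map variant, with an illustrative fetch helper and the same dummy URL:

from gevent import monkey
monkey.patch_all()
import gevent.pool
import requests

def fetch(url):
    # Runs inside a greenlet; exceptions propagate out of map()
    return requests.get(url).text

pool = gevent.pool.Pool(size=10)
# map() blocks until all greenlets finish and returns the results in order
texts = pool.map(fetch, ['http://dummysite.ca'] * 10)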

See the gevent documentation for more details.

+1

Concurrency will not really help here: firing many simultaneous requests at an already overloaded server only makes things worse, for you and for everyone else. What you want is to retry a single request until it succeeds, with a timeout so a hung request does not block you forever, and with a growing pause between attempts so you do not hammer the site. Something like this:

import time
import requests

def get_content(url, timeout):
    # raises a Timeout exception if more than `timeout` seconds have passed
    resp = requests.get(url, timeout=timeout)
    # raise generic exception if request is unsuccessful
    if resp.status_code != 200:
        raise LookupError('status is not 200')
    return resp.content


timeout = 5 # seconds
retry_interval = 0
max_retry_interval = 120
while True:
    try:
        response = get_content('https://example.com', timeout=timeout)
        retry_interval = 0        # reset retry interval after success
        break
    except (LookupError, requests.exceptions.Timeout):
        retry_interval += 10
        if retry_interval > max_retry_interval:
            retry_interval = max_retry_interval
        time.sleep(retry_interval)

# process response

If you do need concurrency, take a look at Scrapy, which is built on Twisted. In Scrapy code, replace time.sleep with reactor.callLater(fn, *args, **kw) so the sleep does not block the event loop.
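
As an illustrative sketch of that idea (the spider name, retry condition and item format are mine, not from the original answer), a Scrapy spider would re-schedule the request rather than sleep:

import scrapy

class DummySpider(scrapy.Spider):
    name = 'dummy'
    start_urls = ['http://dummysite.ca']
    handle_httpstatus_list = [503]  # let parse() see 503 responses too

    def parse(self, response):
        if response.status != 200:
            # Re-queue the same URL; dont_filter bypasses the duplicate filter.
            # Scrapy's scheduler and download delays take the place of time.sleep.
            yield scrapy.Request(response.url, callback=self.parse,
                                 dont_filter=True)
        else:
            yield {'content': response.text}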

+1

A couple of options:

With timeout=None, the request will simply wait, potentially forever, until the server finally sends a response:

import requests

#Wait potentially forever
r = requests.get('http://dummysite.ca', timeout=None)

#Check the status code to see how the server is handling the request
print(r.status_code)

That wait could take a while (a couple of minutes or more), but the status code tells you how the server handled the request: a 200 means it succeeded and you have the content, while a 503 means the server was overloaded and you will have to try again.

If you would rather send the requests asynchronously, use grequests and keep going until every URL has come back with a 200:

import grequests

urls = [
    'http://python-requests.org',  # Just include one url if you want
    'http://httpbin.org',
    'http://python-guide.org',
    'http://kennethreitz.com'
]

def keep_going():
    rs = (grequests.get(u) for u in urls)  # Make a set of unsent Requests
    out = grequests.map(rs)                # Send them all at the same time
    for url, response in zip(list(urls), out):
        # grequests returns None for a request that failed outright
        if response is not None and response.status_code == 200:
            print(response.text)
            urls.remove(url)  # If we have the content, remove its URL
            return

while urls:
    keep_going()
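
One caveat worth adding: grequests.map returns None for any request that raised an exception, so either check for None (as in the loop above) or pass an exception_handler. A minimal sketch with an illustrative handler name:

import grequests

def on_error(request, exception):
    # Called instead of silently returning None when a request fails outright
    print('request failed:', request.url, exception)

rs = (grequests.get(u, timeout=5) for u in urls)
responses = grequests.map(rs, exception_handler=on_error)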
0

Another option is to fetch the page with urllib and parse it with the lxml parser:

from urllib.request import urlopen, Request
from lxml.html import fromstring, tostring
from lxml import etree

def scrape(url):
    try:
        html = urlopen(url).read()
    except Exception:
        # Retry with a browser-like User-Agent if the plain request is rejected
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req).read()
    dom = fromstring(html)
    dom.make_links_absolute(url)
    return dom

def modify_html(dom):
    redirect_url = "https://www.google.com"
    # Insert a <meta http-equiv="refresh"> tag so the saved page
    # redirects to redirect_url after 5 seconds
    head = dom.find('.//head')
    el = etree.Element('meta')
    el.attrib['http-equiv'] = "refresh"
    el.attrib['content'] = "5;URL=%s" % redirect_url
    head.append(el)
    html_content = tostring(dom)
    return html_content


dom = scrape("https://www.gmail.com")

html_data = modify_html(dom)

filename = "scrape.html"
# tostring() returns bytes, so write the file in binary mode
with open(filename, 'wb') as sc:
    sc.write(html_data)

import webbrowser
import os
webbrowser.open('file://' + os.path.realpath(filename))

This scrapes the Gmail HTML, saves it to a local file, and opens it in the browser; the injected meta-refresh tag then redirects to www.google.com after 5 seconds.

-1

Source: https://habr.com/ru/post/1017024/

