Logging in with an Eventlet page scraper?

I am trying to scrape a site that requires authentication (rather than HTTP auth). The script I am using is based on this example. Essentially:

urls = ["https://mysecuresite.com/data.aspx?itemid=blah1",
     "https://mysecuresite.com/data.aspx?itemid=blah2",
     "https://mysecuresite.com/data.aspx?itemid=blah3"]

import eventlet
from eventlet.green import urllib2  

def fetch(url):
  print "opening", url
  body = urllib2.urlopen(url).read()
  print "done with", url
  return url, body

pool = eventlet.GreenPool(10)
for url, body in pool.imap(fetch, urls):
  print "got body from", url, "of length", len(body)

Creating the session is not trivial: I need to load the login page, extract some variables from the login form, and then send a POST request with my credentials and those variables. Once the session is established, the rest of the requests are simple GETs.
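Roughly, the serial version of that flow would be something like this (a sketch; the field names __VIEWSTATE, user and pass are invented placeholders for whatever the real form uses):

import urllib
import urllib2
import re

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
login_page = opener.open("https://mysecuresite.com/login.aspx").read()
# pull the hidden form variables back out of the login page
viewstate = re.search(r'name="__VIEWSTATE" value="([^"]*)"', login_page).group(1)
post_data = urllib.urlencode({"__VIEWSTATE": viewstate,
                              "user": "me", "pass": "secret"})
# POSTing the form logs in; the opener now holds the session cookie
opener.open("https://mysecuresite.com/login.aspx", post_data).read()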

Using the eventlet example as a starting point, how do I create a session that the rest of the pool will use? (Subsequent requests need to run in parallel.)


If the login is managed by cookies (which it almost certainly is), all you need is a cookie-aware urllib2 opener:

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())

Use this opener object, rather than the urllib2 module itself, to make your requests; the cookies collected during login will then be sent automatically on every subsequent request.
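In the serial code from the question, that amounts to routing the fetch through the shared opener, e.g. (a sketch; the pool call would then pass the opener along, as the full script below does):

def fetch(opener, url):
    # the same request as before, but the login cookies ride along
    return url, opener.open(url).read()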

For fun, I put together a complete script that logs into secondlife.com. It extracts the CSRF token from the login form so that it can be POSTed back, much like the site you describe. Note that it logs in as two users in parallel, each with its own opener (i.e. its own session); if you only need a single login, it gets simpler still.

import eventlet
from eventlet.green import urllib2
import re

login_url = 'https://secure-web28.secondlife.com/my/account/login.php?lang=en&type=second-life-member&nextpage=/my/index.php?lang=en'

pool = eventlet.GreenPool(10)

def fetch_title(opener, url):
    match = re.search(r'<title>(.*)</title>', opener.open(url).read())
    if match:
        return match.group(1)
    else:
        return "no title"

def login(login_url, fullname, password):
    # each user gets their own opener, i.e. their own cookie jar / session
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    login_page = opener.open(login_url).read()
    # the hidden CSRF token in the form must be echoed back in the POST
    csrf_token = re.search(r'<input type="hidden" name="CSRFToken" value="(.*)"/>', login_page).group(1)
    username, lastname = fullname.split()
    # (in a real script the values should be URL-encoded, e.g. with urllib.quote)
    auth = "CSRFToken=%s&form[type]=second-life-member&form[nextpage]=/my/index.php?lang=en&form[persistent]=Y&form[form_action]=Log%%20In&form[form_lang]=en&form[username]=%s&form[lastname]=%s&form[password]=%s&submit=Submit" % (
        csrf_token, username, lastname, password)
    # supplying a data argument makes this a POST; on success the opener holds the session cookies
    logged_in = opener.open(login_url, auth).read()
    return opener


def login_and_fetch(login_url, fullname, password, page_urls):
    opener = login(login_url, fullname, password)
    # note that this deliberately uses the global pool, so the total number
    # of concurrent requests across all users stays capped at the pool size
    pile = eventlet.GreenPile(pool)
    for url in page_urls:
        pile.spawn(fetch_title, opener, url)

    return pile

login_urls = [login_url] * 2
usernames = [...]  # two "Firstname Lastname" strings
passwords = [...]  # the two matching passwords
page_urls = [['https://secure-web28.secondlife.com/my/account/?lang=en-US',
              'https://secure-web28.secondlife.com/my/community/events/index.php?lang=en-US']] * 2

for user_iter in pool.imap(login_and_fetch, login_urls, usernames, passwords, page_urls):
    for title in user_iter:
        print "got title", title

As others have suggested, use mechanize: it takes care of cookie management for you.

One caveat about eventlet: since your URLs are https, the requests only run concurrently if the ssl layer is green too; the ssl module from the stdlib blocks, so you have to use eventlet's green replacements throughout.

Instead of eventlet I would suggest gevent, where a single call,

from gevent import monkey; monkey.patch_all()

is enough to make unmodified blocking libraries (urllib2, mechanize) cooperative. Here is an example of the idea:
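(A minimal sketch; the URLs are the placeholders from the question, and the shared cookie-aware opener mirrors the accepted answer.)

from gevent import monkey; monkey.patch_all()  # patch sockets/ssl before anything imports them

import gevent
import urllib2

# one cookie-aware opener shared by every greenlet = one logged-in session
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())

def fetch(url):
    # looks blocking, but the patched socket yields to other greenlets
    return url, opener.open(url).read()

urls = ["https://mysecuresite.com/data.aspx?itemid=blah1",
        "https://mysecuresite.com/data.aspx?itemid=blah2"]

jobs = [gevent.spawn(fetch, url) for url in urls]
gevent.joinall(jobs)
for job in jobs:
    url, body = job.value
    print "got body from", url, "of length", len(body)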


Another vote for mechanize: it deals with the login form and the cookies for you (I have used it to log into Google, fill in forms, and scrape from there).
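For illustration, a form login with mechanize looks roughly like this; the URL and field names below are invented, so inspect the real form first (e.g. via br.forms()):

import mechanize

br = mechanize.Browser()
br.open("https://mysecuresite.com/login.aspx")
br.select_form(nr=0)     # assume the login form is the first form on the page
br["username"] = "me"    # field names are placeholders
br["password"] = "secret"
br.submit()              # the Browser keeps the session cookies from here on

# later requests automatically carry the session
page = br.open("https://mysecuresite.com/data.aspx?itemid=blah1").read()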


Source: https://habr.com/ru/post/1733487/

