I wrote a crawler to extract information from the Q & A website. Since not all fields are presented on the page all the time, I used several trial exceptions to handle the situation.
def answerContentExtractor( loginSession, questionLinkQueue , answerContentList) :
while True:
URL = questionLinkQueue.get()
try:
response = loginSession.get(URL,timeout = MAX_WAIT_TIME)
raw_data = response.text
questionId = re.findall(REGEX,raw_data)[0]
answerId = re.findall(REGEX,raw_data)[0]
title = re.findall(REGEX,raw_data)[0]
except requests.exceptions.Timeout ,IndexError:
print >> sys.stderr, URL + " extraction error..."
questionLinkQueue.task_done()
continue
try:
questionInfo = re.findall(REGEX,raw_data)[0]
except IndexError:
questionInfo = ""
try:
answerContent = re.findall(REGEX,raw_data)[0]
except IndexError:
answerContent = ""
result = {
'questionId' : questionId,
'answerId' : answerId,
'title' : title,
'questionInfo' : questionInfo,
'answerContent': answerContent
}
answerContentList.append(result)
questionLinkQueue.task_done()
And this code, sometimes, may or may not, gives the following exception at runtime:
UnboundLocalError: local variable 'IndexError' referenced before assignment
The line number indicates that an error occurs in the second except IndexError:
Thank you all for your suggestions, I would like to give the brands you deserve, too bad, I can only point out one correct answer ...
source
share