I'm currently working with Scrapy. I have a list of URLs stored in a MySQL database. The spider visits each of these URLs and captures two target values (score and count). My goal is that when a page has been scraped, Scrapy automatically writes the two values into the corresponding columns of that URL's row before moving on to the next URL.
I am a newbie and I can't get the saving part to work correctly. The score and count values are successfully transferred to the database, but they are saved as new rows instead of being bound to the original URL.
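For context, the scraped table has one row per URL, and the score and count columns start out empty. Simplified (my real column types may differ), it looks like this:

    -- simplified sketch of my table; exact types may differ
    CREATE TABLE scraped (
        url   VARCHAR(255) PRIMARY KEY,
        score VARCHAR(16),
        count VARCHAR(16)
    );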
Here is my code:
amazon_spider.py
import scrapy
from whatoplaybot.items import crawledScore
import MySQLdb


class amazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = []

    def start_requests(self):
        # Read the URLs to crawl from the scraped table.
        conn = MySQLdb.connect(
            user='root',
            passwd='',
            db='scraper',
            host='127.0.0.1',
            charset="utf8",
            use_unicode=True
        )
        cursor = conn.cursor()
        cursor.execute('SELECT url FROM scraped;')
        rows = cursor.fetchall()
        for row in rows:
            yield self.make_requests_from_url(row[0])
        conn.close()

    def parse(self, response):
        item = crawledScore()
        # Take the first regex match in both cases.
        item['reviewScore'] = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")[0]
        item['reviewCount'] = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")[0]
        yield item
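My current guess is that I need to carry the URL along with the scraped values so the pipeline knows which row it belongs to. Something like this, assuming I add a url field to crawledScore (that field does not exist in my items yet):

    def parse(self, response):
        item = crawledScore()
        # Hypothetical: crawledScore would need an extra 'url' field for this.
        item['url'] = response.url  # bind the values to their source URL
        item['reviewScore'] = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")[0]
        item['reviewCount'] = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")[0]
        yield item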
pipelines.py
import MySQLdb


class storeScore(object):
    def __init__(self):
        self.conn = MySQLdb.connect(
            user='root',
            passwd='',
            db='scraper',
            host='127.0.0.1',
            charset="utf8",
            use_unicode=True
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # This inserts a brand-new row, which is where the problem seems to be.
            self.cursor.execute(
                """INSERT INTO scraped(score, count) VALUES (%s, %s)""",
                (item['reviewScore'], item['reviewCount'])
            )
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
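What I suspect I need instead is an UPDATE keyed on the URL rather than an INSERT. A rough sketch of what I mean, assuming the item carries the url field from the spider sketch above:

    def process_item(self, item, spider):
        try:
            # Update the existing row for this URL instead of inserting a new one.
            self.cursor.execute(
                """UPDATE scraped SET score = %s, count = %s WHERE url = %s""",
                (item['reviewScore'], item['reviewCount'], item['url'])
            )
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item

I'm not sure whether this is the right approach, or whether there is a more idiomatic Scrapy way to link each item back to its source row.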
Any help and guidance would be greatly appreciated.
Thanks guys.