Trying to dive into Scrapy, but I've hit a few dead ends.
I have two tables per page and I want to extract the data from each, then move on to the next page.
The tables look like this (the first is called Y1, the second Y2) and their structures are identical.
<div id="Y1" style="margin-bottom: 0px; margin-top: 15px;">
  <h2>First information</h2><hr style="margin-top: 5px; margin-bottom: 10px;">
  <table class="table table-striped table-hover table-curved">
    <thead>
      <tr>
        <th class="tCol1" style="padding: 10px;">First Col Head</th>
        <th class="tCol2" style="padding: 10px;">Second Col Head</th>
        <th class="tCol3" style="padding: 10px;">Third Col Head</th>
      </tr>
    </thead>
    <tbody>
      <tr>
        <td>Info 1</td>
        <td>Monday 5 September, 2016</td>
        <td>Friday 21 October, 2016</td>
      </tr>
      <tr class="vevent">
        <td class="summary"><b>Info 2</b></td>
        <td class="dtstart" timestamp="1477094400"><b></b></td>
        <td class="dtend" timestamp="1477785600">
          <b>Sunday 30 October, 2016</b></td>
      </tr>
      <tr>
        <td>Info 3</td>
        <td>Monday 31 October, 2016</td>
        <td>Tuesday 20 December, 2016</td>
      </tr>
      <tr class="vevent">
        <td class="summary"><b>Info 4</b></td>
        <td class="dtstart" timestamp="1482278400"><b>Wednesday 21 December, 2016</b></td>
        <td class="dtend" timestamp="1483315200">
          <b>Monday 2 January, 2017</b></td>
      </tr>
    </tbody>
  </table>
</div>
As you can see, the structure is a bit inconsistent, but if I can just get each td out to a CSV for now, I'll be a happy guy.
I tried using XPath, but it only confused me more.
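In hindsight, the thing that was tripping me up: in the vevent rows the dates sit inside <b> tags, so td/text() sees only whitespace there, while td//text() descends into the <b>. A minimal reproduction with Scrapy's Selector (a toy snippet, not from my spider):

from scrapy import Selector

# One "dtend" cell from the sample above: the date is inside <b>,
# and the <td>'s only direct text node is the leading newline.
cell = '<td class="dtend" timestamp="1477785600">\n<b>Sunday 30 October, 2016</b></td>'

sel = Selector(text=cell)
print(sel.xpath('//td/text()').extract())   # ['\n'] -- misses the bold date
print(sel.xpath('//td//text()').extract())  # ['\n', 'Sunday 30 October, 2016']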
My last attempt:
import scrapy

from SchoolDates_1.items import Schooldates1Item


class myScraperSpider(scrapy.Spider):
    name = "myScraper"
    allowed_domains = ["mysite.co.uk"]
    start_urls = (
        'https://mysite.co.uk/page1/',
    )

    def parse_products(self, response):
        products = response.xpath('//*[@id="Y1"]/table')
        for product in products[1:]:
            item = Schooldates1Item()
            item['hol'] = product.xpath('//*[@id="Y1"]/table/tbody/tr[1]/td[1]').extract()[0]
            item['first'] = product.xpath('//*[@id="Y1"]/table/tbody/tr[1]/td[2]').extract()[0]
            item['last'] = product.xpath('//*[@id="Y1"]/table/tbody/tr[1]/td[3]').extract()[0]
            yield item
This throws no errors, but it just prints a lot of information about the crawl and no actual results.
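One sanity check I only thought of later: dropping a log line into the callback shows whether Scrapy ever calls it at all (it turned out it doesn't; see the traceback further down). A hypothetical two-line check, not part of my original code:

    def parse_products(self, response):
        # If this line never appears in the log, the callback is never invoked.
        self.logger.info('parse_products reached: %s', response.url)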
Update:
import scrapy


class SchoolSpider(scrapy.Spider):
    name = "school"
    allowed_domains = ["termdates.co.uk"]
    start_urls = (
        'https://termdates.co.uk/school-holidays-16-19-abingdon/',
    )

    def parse_products(self, response):
        products = sel.xpath('//*[@id="Year1"]/table//tr')
        for p in products[1:]:
            item = dict()
            item['hol'] = p.xpath('td[1]/text()').extract_first()
            item['first'] = p.xpath('td[1]/text()').extract_first()
            item['last'] = p.xpath('td[1]/text()').extract_first()
            yield item
This gives me: IndentationError: unexpected indent
If I run this script (suggested by @Granitosaurus) with CSV output (-o schoolDates.csv):
import scrapy


class SchoolSpider(scrapy.Spider):
    name = "school"
    allowed_domains = ["termdates.co.uk"]
    start_urls = ('https://termdates.co.uk/school-holidays-16-19-abingdon/',)

    def parse_products(self, response):
        products = sel.xpath('//*[@id="Year1"]/table//tr')
        for p in products[1:]:
            item = dict()
            item['hol'] = p.xpath('td[1]/text()').extract_first()
            item['first'] = p.xpath('td[1]/text()').extract_first()
            item['last'] = p.xpath('td[1]/text()').extract_first()
            yield item
I get an empty file and the following in the log:
2017-03-23 12:04:08 [scrapy.core.engine] INFO: Spider opened
2017-03-23 12:04:08 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-03-23 12:04:08 [scrapy.extensions.telnet] DEBUG: Telnet console listening on ...
2017-03-23 12:04:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://termdates.co.uk/robots.txt> (referer: None)
2017-03-23 12:04:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://termdates.co.uk/school-holidays-16-19-abingdon/> (referer: None)
2017-03-23 12:04:08 [scrapy.core.scraper] ERROR: Spider error processing <GET https://termdates.co.uk/school-holidays-16-19-abingdon/> (referer: None)
Traceback (most recent call last):
  File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "c:\python27\lib\site-packages\scrapy-1.3.3-py2.7.egg\scrapy\spiders\__init__.py", line 76, in parse
    raise NotImplementedError
NotImplementedError
2017-03-23 12:04:08 [scrapy.core.engine] INFO: Closing spider (finished)
2017-03-23 12:04:08 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 467,
 'downloader/request_count': 2,
 'downloader/request_method_count/GET': 2,
 'downloader/response_bytes': 11311,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 2,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2017, 3, 23, 12, 4, 8, 845000),
 'log_count/DEBUG': 3,
 'log_count/ERROR': 1,
 'log_count/INFO': 7,
 'response_received_count': 2,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'spider_exceptions/NotImplementedError': 1,
 'start_time': datetime.datetime(2017, 3, 23, 12, 4, 8, 356000)}
2017-03-23 12:04:08 [scrapy.core.engine] INFO: Spider closed (finished)
Update 2: After renaming parse_products to parse, the spider actually runs and writes to the CSV, but every row comes out like this (see the note after the code):
{'hol': None, 'last': u'\r\n\t\t\t\t\t\t\t\t', 'first': None}
import scrapy


class SchoolSpider(scrapy.Spider):
    name = "school"
    allowed_domains = ["termdates.co.uk"]
    start_urls = ('https://termdates.co.uk/school-holidays-16-19-abingdon/',)

    def parse(self, response):
        products = response.xpath('//*[@id="Year1"]/table//tr')
        for p in products[1:]:
            item = dict()
            item['hol'] = p.xpath('td[1]/text()').extract_first()
            item['first'] = p.xpath('td[2]/text()').extract_first()
            item['last'] = p.xpath('td[3]/text()').extract_first()
            yield item
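The whitespace made sense once I re-read the vevent rows in the sample HTML: the dates sit inside <b> tags, so td[3]/text() only returns the bare text node in front of them (and it returns nothing at all when the cell is just <b></b>). Besides joining and stripping all the text nodes, which is what the final version below does, XPath's normalize-space() would be a drop-in alternative for those lines (untested sketch, replacing the td[3] line in the loop above):

    item['last'] = p.xpath('normalize-space(td[3])').extract_first()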
Update 3: Thanks to @vold, this now works and crawls every page in start_urls:
import scrapy

from SchoolDates_1.items import Schooldates1Item


class SchoolSpider(scrapy.Spider):
    name = "school"
    allowed_domains = ["termdates.co.uk"]
    start_urls = ('https://termdates.co.uk/school-holidays-16-19-abingdon/',
                  'https://termdates.co.uk/school-holidays-3-dimensions',)

    def parse(self, response):
        products = response.xpath('//*[@id="Year1"]/table//tr')
        for product in products[1:]:
            item = Schooldates1Item()
            item['hol'] = product.xpath('td[1]//text()').extract_first()
            item['first'] = product.xpath('td[2]//text()').extract_first()
            item['last'] = ''.join(product.xpath('td[3]//text()').extract()).strip()
            item['url'] = response.url
            yield item
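For completeness, the item class the spider imports is just the four fields used above; my items.py, reconstructed here from those field names:

import scrapy


class Schooldates1Item(scrapy.Item):
    # One field per CSV column the spider fills in.
    hol = scrapy.Field()
    first = scrapy.Field()
    last = scrapy.Field()
    url = scrapy.Field()

Run it with scrapy crawl school -o schoolDates.csv to get the CSV out.

Since the original goal was both tables per page, the remaining tweak is to loop over both ids. An untested sketch; 'Year2' is my guess at the second div's id, by analogy with the Y1/Y2 naming in the sample HTML:

    def parse(self, response):
        # Both tables share the same column structure, so reuse one loop.
        for table_id in ('Year1', 'Year2'):
            for product in response.xpath('//*[@id="%s"]/table//tr' % table_id)[1:]:
                item = Schooldates1Item()
                item['hol'] = product.xpath('td[1]//text()').extract_first()
                item['first'] = product.xpath('td[2]//text()').extract_first()
                item['last'] = ''.join(product.xpath('td[3]//text()').extract()).strip()
                item['url'] = response.url
                yield item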