Get data from <script> tag in HTML using Scrapy

I am trying to extract data from a script tag in an HTML page using Scrapy (XPath). My main problem is identifying the correct div and script tags. I am new to XPath and would appreciate any help!

HTML ( http://www.kbb.com/nissan/altima/2014/25-s-sedan-4d/?vehicleid=392396&intent=buy-used&mileage=10000&condition=fair&pricetype=retail ):

<script type="text/javascript" src="http://s1.kbb.com/combine/IncentivesPilotJs/949332058"></script>
        <input type="hidden" id="ResaleValueUrl" value="/ymmt/resalevalue/?vehicleid=392396" />
        <input type="hidden" id="Intent" value="buy-used" />
        <!--[if lt IE 9]>
            <script>
            window.FlashCanvasOptions = {
               swfPath: "/js/canvas/FlashCanvas/UCMarketMeter/"
            };
            </script>
            <script type="text/javascript" src="http://s1.kbb.com/combine/YmmtMarketMeterFlashCanvasJs/795892638"></script>
        <![endif]-->
        <script type="text/javascript" src="http://s1.kbb.com/combine/YMMTOverview/1527402533"></script>
        <script type="text/javascript" src="http://s1.kbb.com/combine/YmmtPricingOverviewBuyUsedJs/-1416499456"></script>

        <script language="javascript" type="text/javascript">
            $(document).ready(function() {
                KBB.Vehicle.Pages.PricingOverview.Buyers.setup({
                    //Workaround until we get cross domain working for Flash
                    imageDir: window.FlashCanvasOptions ? "/Content/images" : "http://file.kelleybluebookimages.com/kbb/images/marketmeter",
                    vehicleId: "392396",
                    zipCode: "78701",
                    mileage: "10000",
                    intent: "buy-used",
                    priceType: "retail",
                    condition: "good",
                    options: "392396|53635|78701|100|10|",
                    price: "17074",
                    manufacturer: "Nissan",
                    model: "Altima",
                    year: "2014",
                    style: "2.5 S Sedan 4D",
                    category: "",
                    hasCpo: true,
                    meetsCpoReq: true,
                    showOthersPaid: false,
                    data: {
    "values": {
     "cpo": {
       "priceMin": 17335.0,
        "price": 18275.0,
        "priceMax": 19214.0
    },
    "fpp": {
      "priceMin": 15286.0,
      "price": 17074.0,
      "priceMax": 18861.0
    },
    "privatepartyexcellent": {
      "priceMin": 0.0,
      "price": 16064.0,
      "priceMax": 0.0
    },
    "privatepartyfair": {
      "priceMin": 0.0,
      "price": 14081.0,
      "priceMax": 0.0
    },
    "privatepartygood": {
      "priceMin": 0.0,
      "price": 15454.0,
      "priceMax": 0.0
    },
    "privatepartyverygood": {
      "priceMin": 0.0,
      "price": 15715.0,
      "priceMax": 0.0
    },
    "retail": {
      "priceMin": 0.0,
      "price": 17875.0,
      "priceMax": 0.0
    }
  },
     "timAmount": 0.0,
    "monthlyPayments": {
    "cpo": {
      "vehiclePrice": 18275.0,
      "rate": 2.9,
      "terms": 60.0,
      "taxAndTitle": 6.5,
      "downPay": 0.0,
      "amount": 348.0
    },
    "fpp": {
      "vehiclePrice": 17074.0,
      "rate": 4.9,
      "terms": 60.0,
      "taxAndTitle": 6.5,
      "downPay": 0.0,
      "amount": 342.0
    },
    "privatepartyexcellent": {
      "vehiclePrice": 16064.0,
      "rate": 4.9,
      "terms": 60.0,
      "taxAndTitle": 6.5,
      "downPay": 0.0,
      "amount": 322.0
    },
    "privatepartyfair": {
      "vehiclePrice": 14081.0,
      "rate": 4.9,
      "terms": 60.0,
      "taxAndTitle": 6.5,
      "downPay": 0.0,
      "amount": 282.0
    },
    "privatepartygood": {
      "vehiclePrice": 15454.0,
      "rate": 4.9,
      "terms": 60.0,
      "taxAndTitle": 6.5,
      "downPay": 0.0,
      "amount": 309.0
    },
    "privatepartyverygood": {
      "vehiclePrice": 15715.0,
      "rate": 4.9,
      "terms": 60.0,
      "taxAndTitle": 6.5,
      "downPay": 0.0,
      "amount": 315.0
    },
    "retail": {
      "vehiclePrice": 17875.0,
      "rate": 4.9,
      "terms": 60.0,
      "taxAndTitle": 6.5,
      "downPay": 0.0,
      "amount": 358.0
    }
  },
  "scale": {
    "scaleLow": 14081.0,
    "scaleHigh": 19214.0
  },
  "transactions": {
    "below": 7,
    "between": 17,
    "above": 3
  }
},
                    adPriceRanges: {"AdPriceRange":[{"PriceMin":0,"PriceMax":8499,"AdPRValue":1},{"PriceMin":8500,"PriceMax":18499,"AdPRValue":2},{"PriceMin":18500,"PriceMax":23499,"AdPRValue":3},{"PriceMin":23500,"PriceMax":28499,"AdPRValue":4},{"PriceMin":28500,"PriceMax":33499,"AdPRValue":5},{"PriceMin":33500,"PriceMax":38499,"AdPRValue":6},{"PriceMin":38500,"PriceMax":43499,"AdPRValue":7},{"PriceMin":43500,"PriceMax":48499,"AdPRValue":8},{"PriceMin":48500,"PriceMax":53499,"AdPRValue":9},{"PriceMin":53500,"PriceMax":63499,"AdPRValue":10},{"PriceMin":63500,"PriceMax":73499,"AdPRValue":11},{"PriceMin":73500,"PriceMax":1000000,"AdPRValue":12}]}});
            });
            $('.foot-note').hide();
            $(window).on('popstate', function() {
                KBB.Vehicle.Pages.PricingOverview.Buyers.stateChangeHandler();
            });
        </script>


Scrapy Code:

import json
import re

import scrapy
from scrapy.selector import Selector
from scrapy.spider import BaseSpider

from kbb.items import kbbItem

class kbbSpider(scrapy.Spider):
    """Spider that reads pricing figures from an inline <script> on a KBB page.

    The figures are not in the HTML markup; they are passed as a JSON-like
    ``data`` object to ``KBB.Vehicle.Pages.PricingOverview.Buyers.setup(...)``
    inside a script tag. The spider finds that script by its content, cuts
    the ``data`` object out with a regular expression and decodes it with
    ``json`` instead of relying on positional indexes into the page.
    """

    name = "kbb"
    allowed_domains = ["kbb.com"]
    start_urls = [
        # Mileage and condition need explicit keys; the bare "&10000&good"
        # fragments would not be read as those parameters.
        "http://www.kbb.com/nissan/altima/2014/25-s-sedan-4d/?vehicleid=392396&intent=buy-used&mileage=10000&condition=good&pricetype=retail"
    ]

    # JavaScript call whose argument carries the pricing data.
    _SETUP_CALL = "KBB.Vehicle.Pages.PricingOverview.Buyers.setup"

    # Captures the object literal assigned to ``data:``. It is the only part
    # of the setup() argument that is strict JSON, and it ends right before
    # the ``adPriceRanges`` key.
    _DATA_PATTERN = re.compile(
        r"KBB\.Vehicle\.Pages\.PricingOverview\.Buyers\.setup\("
        r".*?data: (\{.*?\}),\W+adPriceRanges",
        re.DOTALL,
    )

    def parse(self, response):
        """Build one ``kbbItem`` from the pricing data embedded in the page.

        Args:
            response: the downloaded page for one of ``start_urls``.

        Returns:
            A list holding a single populated item, or an empty list when
            the pricing script is missing or its data cannot be decoded.
        """
        # Select the script by what it contains rather than by its position
        # among all scripts on the page.
        script_query = "//script[contains(., '%s')]/text()" % self._SETUP_CALL
        matches = response.xpath(script_query).re(self._DATA_PATTERN)
        if not matches:
            self.log("Pricing script not found on %s" % response.url)
            return []

        try:
            data = json.loads(matches[0])
        except ValueError:
            # json raises ValueError (or a subclass) on malformed input.
            self.log("Could not decode pricing data on %s" % response.url)
            return []

        values = data.get("values", {})
        fpp = values.get("fpp", {})

        item = kbbItem()
        item["priceMin"] = fpp.get("priceMin")
        # The remaining figures are available the same way, e.g.
        # fpp.get("price"), fpp.get("priceMax") and
        # values.get("retail", {}).get("price"); assign them once the
        # matching fields are declared on kbbItem.
        return [item]

Finally, I want to fill my item's fields with priceMin, price, and priceMax from fpp, and with the price from retail. I am currently using indexes to get these values, but wondered if there is an easier way.

+4
1

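The values you need are inside JavaScript code, so they cannot be selected with XPath alone; the script text has to be extracted and parsed.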

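The idea is to locate the right script tag, extract the data object from its text with a regular expression, and load it with the python json module.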

Scrapy Shell:

In [1]: import re
In [2]: import json

In [3]: pattern = re.compile(r"KBB\.Vehicle\.Pages\.PricingOverview\.Buyers\.setup\(.*?data: ({.*?}),\W+adPriceRanges", re.MULTILINE | re.DOTALL)
In [4]: data = response.xpath("//script[contains(., 'KBB.Vehicle.Pages.PricingOverview.Buyers.setup')]/text()").re(pattern)[0]

In [5]: data = data.replace("//Workaround until we get cross domain working for Flash", "")

In [6]: data_obj = json.loads(data)

In [7]: data_obj['values']['fpp']
Out[7]: {u'price': 15569.0, u'priceMax': 17356.0, u'priceMin': 13781.0}

In [8]: data_obj['values']['retail']
Out[8]: {u'price': 16370.0, u'priceMax': 0.0, u'priceMin': 0.0}
+5

Source: https://habr.com/ru/post/1614306/


All Articles