I am trying to scrape the following site: www.firstcry.com. The website uses AJAX (XHR requests) to display its search results.
In my code below, the jsonresponse variable holds the JSON returned by the site. When I print it, the output contains many \ (backslashes).
Just below the jsonresponse variable I have left several commented-out lines. Those were my attempts (based on several similar questions here on Stack Overflow) to remove all the backslashes, as well as the u' prefixes that also appear in the output.
However, none of those attempts removes ALL of the backslashes and u' prefixes.
As long as they are there, I am not able to access jsonresponse by its keys, so it is essential for me to remove ALL of them.
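To make it concrete, this is the kind of key access I am after (a minimal sketch with a made-up sample string, not my real response):

import json

# Made-up, simplified body; in my spider the text comes from response.body_as_unicode().
sample = '{"hits": {"hit": [{"title": "Baby Bag", "price": "499"}]}}'

data = json.loads(sample)                # a plain Python dict
print(data["hits"]["hit"][0]["title"])   # key access like this is what I need to do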
Please help me resolve this issue. If possible, please provide code specific to my case rather than a general solution.
My code is here:
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner
import scrapy
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
import datetime
from multiprocessing import Process, Queue
import os
from scrapy.http import Request
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
import json, simplejson, ujson

# query = raw_input("Enter a product to search for= ")
query = 'bag'
query1 = query.replace(" ", "+")
class DmozItem(scrapy.Item):
    productname = scrapy.Field()
    product_link = scrapy.Field()
    current_price = scrapy.Field()
    mrp = scrapy.Field()
    offer = scrapy.Field()
    imageurl = scrapy.Field()
    outofstock_status = scrapy.Field()
class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["firstcry.com"]  # domain only, no scheme

    def start_requests(self):
        task_urls = []
        for i in range(1, 2):
            temp = "http://www.firstcry.com/svcs/search.svc/GetSearchPagingProducts_new?PageNo=" + str(i) + "&PageSize=20&SortExpression=Relevance&SubCatId=&BrandId=&Price=&OUTOFSTOCK=&DISCOUNT=&Q=" + query1 + "&rating="
            task_urls.append(temp)
        return [Request(url=start_url) for start_url in task_urls]
    def parse(self, response):
        print response
        items = []
        jsonresponse = dict(ujson.loads(response.body_as_unicode()))
        # jsonresponse = jsonresponse.replace("\\", "")
        # jsonresponse = jsonresponse.decode('string_escape')
        # jsonresponse = ("%r" % json.loads(response.body_as_unicode()))
        # d = jsonresponse.json()
        # jsonresponse = jsonresponse.strip("/")
        # print jsonresponse
        # print d
        # print json.dumps("%r" % jsonresponse, indent=4, sort_keys=True)
        # a = simplejson.dumps(simplejson.loads(response.body_as_unicode()).replace("u\'", "\'"), indent=4, sort_keys=True)
        # a = json.dumps(json.JSONDecoder().decode(jsonresponse))
        # a = ujson.dumps(ujson.loads(response.body_as_unicode()), indent=4)
        a = json.dumps(jsonresponse, indent=4)
        a = a.decode('string_escape')
        a = a.decode('string_escape')
        # a.gsub('\\', '')
        # a = a.strip('/')
        # print jsonresponse
        print a
        # print "%r" % a
        # print "%r" % json.loads(response.body_as_unicode())
        p = jsonresponse["hits"]["hit"]
        # print p
        # raw_input()
        for x in p:
            item = DmozItem()
            item['productname'] = str(x['title'])
            item['product_link'] = "http://www.yepme.com/Deals1.aspx?CampId=" + str(x["uniqueId"])
            item['current_price'] = 'Rs. ' + str(x["price"])
            try:
                p = x["marketprice"]
                item['mrp'] = 'Rs. ' + str(p)
            except:
                item['mrp'] = item['current_price']
            try:
                item['offer'] = str(x["promotionalMsg"])
            except:
                item['offer'] = str('No additional offer available')
            item['imageurl'] = "http://staticaky.yepme.com/newcampaign/" + str(x["uniqueId"])[:-1] + "/" + str(x["smallimage"])
            item['outofstock_status'] = str('In Stock')
            items.append(item)
        print items
spider1 = DmozSpider()
settings = Settings()
settings.set("PROJECT", "dmoz")
settings.set("CONCURRENT_REQUESTS", 100)
settings.set("DEPTH_PRIORITY", 1)
settings.set("SCHEDULER_DISK_QUEUE", "scrapy.squeues.PickleFifoDiskQueue")
settings.set("SCHEDULER_MEMORY_QUEUE", "scrapy.squeues.FifoMemoryQueue")
crawler = CrawlerProcess(settings)
crawler.crawl(spider1)
crawler.start()
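For reference, here is a tiny standalone snippet (independent of my spider; the dictionary contents are made up) that produces the same kind of u' prefixes and backslashes I am describing. I am not sure this is exactly what happens with the firstcry response, but the printed output looks similar to what I see:

import json

# Made-up data, only to show where the u' and the \ can come from.
inner = json.dumps({"title": "Baby Bag", "price": "499"})  # a JSON-encoded *string*
outer = {u"hits": inner}                                   # a dict whose value is itself JSON text

print(repr(outer))                  # on Python 2 this shows the u'...' prefixes
print(json.dumps(outer, indent=4))  # the inner quotes come out escaped as \"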