I'm having a nightmare with data scraped with Scrapy. Currently I encode it using UTF-8, i.e. detail_content.select('p/text()[1]').extract()[0].encode('utf-8'),
save it into a JSON file, and then the captured text is displayed again using Django and a mobile app.
In the JSON file, the non-ASCII characters of the scraped HTML text get escaped as Unicode escape sequences, e.g. 'blah blah \u00a34,000 blah'
Now my problem is that when I try to display the text in a Django template or the mobile app, the literal escape sequence is displayed: \u00a3
instead of £
Should I not be storing escaped Unicode in JSON? Would it be better to store ASCII in the JSON file using the JSON escaping? If so, how do you go about doing this with Scrapy?
Scrapy code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.item import Item, Field
import datetime
import unicodedata
import re
class ScrapedItem(Item):
    """Item declaring every field the spider below assigns.

    Scrapy's base ``Item`` declares no fields and raises ``KeyError`` when an
    undeclared key is assigned, so each key used by the spider is listed here.
    """

    _id = Field()
    header = Field()
    dateAdded = Field()
    description = Field()
    itemUrl = Field()
    image_urls = Field()


class Spider(BaseSpider):
    """Scrape the category listing, then follow each entry to its detail page.

    All scraped text is kept as unicode. Encoding to bytes (or wrapping in
    ``str()``) inside the spider is what breaks on characters such as the
    pound sign; serialisation is left to whatever writes the items out.
    """
    #spider stuff

    @staticmethod
    def _first(values, default=u''):
        """Return the first element of *values*, or *default* when it is empty."""
        return values[0] if values else default

    def parse(self, response):
        """Yield one detail-page ``Request`` per listing row.

        The partially filled item travels with the request in ``meta['item']``
        and is completed in :meth:`parse_item`. Rows without a link are
        skipped instead of raising ``IndexError`` and aborting the whole page.
        """
        hxs = HtmlXPathSelector(response)
        for row in hxs.select('//ul[@class = "category3"]/li'):
            # Evaluate the title XPath once; it feeds both 'header' and '_id'.
            header = self._first(row.select('div[2]/a/text()').extract())
            links = row.select('div[2]/a/@href').extract()
            if not links:
                # Nothing to follow for this row.
                continue

            item = ScrapedItem()
            item['header'] = header
            # NOTE(review): slugify is defined outside the visible code; it
            # now receives unicode text rather than UTF-8 bytes -- confirm it
            # accepts that.
            item['_id'] = self.slugify(header)[0:20]
            # isoformat() already returns a string; no extra str() needed.
            item['dateAdded'] = datetime.datetime.now().isoformat()
            yield Request(links[0], meta={'item': item},
                          callback=self.parse_item)

    def parse_item(self, response):
        """Complete the item started in :meth:`parse` from the detail page.

        Returns the item with 'description', 'itemUrl' and 'image_urls' set.
        Missing text or link values fall back to an empty string.
        """
        hxs = HtmlXPathSelector(response)
        detail_content = hxs.select('//*[@id="content-area"]')
        item = response.request.meta['item']

        # Keep the extracted values as unicode: str() on non-ASCII text raises
        # UnicodeEncodeError under Python 2.
        item['description'] = self._first(
            detail_content.select('p/text()[1]').extract())
        item['itemUrl'] = self._first(
            detail_content.select('//a[@title="Blah"]/@href').extract())
        # Parenthesised so the trailing .extract() is part of the expression.
        item['image_urls'] = (
            detail_content.select('//img[@width="418"]/../@href').extract())
        print(item)
        return item