I have scrapy pulling data from a web page. An issue Ive run across is it pulls alot of whitespace and Ive elected to use .strip() as suggested by others. Ive run into an issue though
if a.strip():
print a
if b.strip():
print b
Returns:
a1
b1
.
.
.
But this:
if a.strip():
aList.append(a)
if b.strip():
bList.append(b)
print aList, bList
Returns this:
a1
b1
Im trying to simulate the whitespace that I remove with .strip() here, but you get the point. For whatever reason it adds the whitespace to the list even though I told it not to. I can even print the list in the if statement and it also shows correctly, but for whatever reason, when I decide to print outside the if statements it doesnt work as I intended.
Here is my entire code:
# coding: utf-8
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import CsvItemExporter
import re
import csv
import urlparse
from stockscrape.items import EPSItem
from itertools import izip
class epsScrape(BaseSpider):
name = "eps"
allowed_domains = ["investors.com"]
ifile = open('test.txt', "r")
reader = csv.reader(ifile)
start_urls = []
for row in ifile:
url = row.replace("\n","")
if url == "symbol":
continue
else:
start_urls.append("http://research.investors.com/quotes/nyse-" + url + ".htm")
ifile.close()
def parse(self, response):
f = open("eps.txt", "a+")
sel = HtmlXPathSelector(response)
sites = sel.select("//div")
# items = []
for site in sites:
symbolList = []
epsList = []
item = EPSItem()
item['symbol'] = site.select("h2/span[contains(@id, 'qteSymb')]/text()").extract()
item['eps'] = site.select("table/tbody/tr/td[contains(@class, 'rating')]/span/text()").extract()
strSymb = str(item['symbol'])
newSymb = strSymb.replace("[]","").replace("[u'","").replace("']","")
strEps = str(item['eps'])
newEps = strEps.replace("[]","").replace(" ","").replace("[u'\\r\\n","").replace("']","")
if newSymb.strip():
symbolList.append(newSymb)
# print symbolList
if newEps.strip():
epsList.append(newEps)
# print epsList
print symbolList, epsList
for symb, eps in izip(symbolList, epsList):
f.write("%s\t%s\n", (symb, eps))
f.close()