0

I'm trying to create one DataFrame with data from multiple URLs I'm scraping. The code works; however, I'm unable to accumulate the data in one DataFrame as the loop runs. The DataFrame (called frame) is replaced with the new URL's data on each iteration, rather than having the new data concatenated onto the same frame. Thank you, I deeply appreciate your help!

import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3

# Professor rating pages to scrape. Note that tid=1176131 appears twice.
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

# Index of the URL currently being processed by the while loop below.
i=0
# One regular expression per field to extract from a page's HTML; each has a
# single non-greedy capture group, so findall returns the captured text only.
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'

# Compile every pattern once, before the loop, so each page reuses them.
pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)

# Download each page in turn and collect every match of every pattern.
while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern,htmltext)
    Easiness = re.findall(easiness,htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME=re.findall(prof_name, htmltext)
    i+=1

    # NOTE: `frame` is rebound to a brand-new DataFrame on every pass, so after
    # the loop it holds only the last page's data; nothing is accumulated.
    # The first Date match is skipped, and only the first prof_name match is
    # used (a single string rather than a list like the other columns).
    frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness, 
    'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
    'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})

    print frame
user2687863
  • 13
  • 1
  • 6

2 Answers

0

Use pd.concat: collect one DataFrame per URL in a list, then combine them once after the loop:

# Drop-in replacement for the question's while loop. It reuses `urls`,
# `urllib` and the compiled patterns (`pattern`, `easiness`, ...) defined
# earlier in the question's script.
import pandas as pd  # the question's imports never bind the name `pd`

# One DataFrame per page; they are combined once, after the loop.
frames = []

# Iterate over the URLs directly so the loop does not depend on a counter
# `i` that may already have been advanced by an earlier loop.
for url in urls:
    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Release the connection even if reading the page fails.
        htmlfile.close()

    content = pattern.findall(htmltext)
    Easiness = easiness.findall(htmltext)
    Helpfulness = helpfulness.findall(htmltext)
    Clarity = clarity.findall(htmltext)
    Interest = interest.findall(htmltext)
    Date = date.findall(htmltext)
    Class = mathclass.findall(htmltext)
    PROFNAME = prof_name.findall(htmltext)

    # As in the question: skip the first date match and use the first
    # professor-name match as a scalar repeated for every row of this page.
    frames.append(pd.DataFrame({
        'Comments': content,
        'Easiness': Easiness,
        'Helpfulness': Helpfulness,
        'Clarity': Clarity,
        'Rater Interest': Interest,
        'Class': Class,
        'Date': Date[1:],
        'Professor': PROFNAME[0],
    }))

# concat returns a new DataFrame, so the result must be kept.
# ignore_index=True renumbers the rows 0..n-1 instead of repeating each
# page's own 0-based labels. concat raises on an empty list, hence the guard.
frame = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
Phillip Cloud
  • 24,919
  • 11
  • 68
  • 88
0

You are overwriting your frame on each iteration of the loop. As Phillip Cloud suggested, you can keep a list of frames and append to it on every iteration. I simplified your code in a different way, but I think this gives you what you want.

import urllib
import re
import pandas as pd

# Professor rating pages to scrape.
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

# Compiled pattern per output column; each has one capture group, so findall
# returns a list of the captured strings.
regex = {'pattern' : re.compile('<p class="commentText">(.+?)</p>'),
        'easiness' : re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
        'helpfulness' : re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
        'clarity' : re.compile('Clarity</strong><span>(.+?)</span></p>'),
        'interest' : re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
        'date' : re.compile('<div class="date">(.+?)</div>'),
        'mathclass' : re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
        'prof_name' : re.compile('<meta name="prof_name" content="(.+?)"/>')}

# Make a dictionary with empty lists using the same keys
d = {k: [] for k in regex}

# Now fill those lists, one entry per rating (not one list per page)
for url in urls:
    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Release the connection even if reading the page fails.
        htmlfile.close()

    found = {k: v.findall(htmltext) for k, v in regex.items()}

    # The question's code drops the first date match (Date[1:]); do the same
    # here so the date column lines up with the other per-rating columns.
    found['date'] = found['date'][1:]

    # The question uses only the first professor-name match for the whole
    # page; repeat it once per comment so this column has the same length.
    names = found['prof_name']
    name = names[0] if names else None
    found['prof_name'] = [name] * len(found['pattern'])

    # extend (not append) keeps every column a flat list of strings.
    for k, values in found.items():
        d[k].extend(values)

frame = pd.DataFrame(d) # Dump the dict into a DataFrame
print(frame)
user394430
  • 2,805
  • 2
  • 28
  • 27