Concatenate Data From URLS Recursively Inside one DataFrame

Question

I'm trying to create one DataFrame with data from multiple URLs I'm scraping. The code works; however, I'm unable to accumulate the data in a single DataFrame. The DataFrame (called `frame`) is replaced with the new URL's data on each iteration rather than having the new data concatenated to the same frame. Thank you, I deeply appreciate your help!

import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3

# RateMyProfessors rating pages to scrape. Note that tid=1176131 is listed twice.
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

# Index of the next URL to fetch; advanced once per loop pass below.
i=0
# One pattern per field extracted from the page HTML. Each has a single
# non-greedy capture group, so re.findall returns a list of captured strings.
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'

# Compile every pattern once, before the loop.
pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)

# Fetch each page in turn and collect all matches for every field.
while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern,htmltext)
    Easiness = re.findall(easiness,htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME=re.findall(prof_name, htmltext)
    i+=1

    # NOTE: `frame` is rebound on every pass, so after each iteration it holds
    # only the current page's rows -- this is the behavior the question asks
    # about. Nothing from earlier pages is kept.
    # The first date match is skipped (Date[1:]); presumably it is not a rating
    # date -- TODO confirm against the page markup.
    # PROFNAME[0] is the first prof_name match; it raises IndexError if the
    # page has no such <meta> tag.
    frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness, 
    'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
    'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})

    print frame

I knew I recognized those course numbers! I went to City College! — Phillip Cloud, Aug 16 '13 at 05:49

score 0 · Answer 1 · answered Aug 16 '13 at 04:23

Use pd.concat:

# This fragment replaces the `while` loop in the question's script. It relies
# on `urls` and the compiled patterns (`pattern`, `easiness`, ...) defined there.
# The question only does `import pandas` / `from pandas import *`, so bind the
# `pd` alias explicitly; without this, `pd.concat` raises NameError.
import pandas as pd

# One DataFrame per scraped page; combined after the loop.
frames = []

# Iterate over the URLs directly so the loop does not depend on a counter
# `i` having been reset to 0 by earlier code.
for url in urls:
    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Release the connection even if the read fails.
        htmlfile.close()

    content = pattern.findall(htmltext)
    Easiness = easiness.findall(htmltext)
    Helpfulness = helpfulness.findall(htmltext)
    Clarity = clarity.findall(htmltext)
    Interest = interest.findall(htmltext)
    Date = date.findall(htmltext)
    Class = mathclass.findall(htmltext)
    PROFNAME = prof_name.findall(htmltext)

    # As in the question's code: the first date match is skipped and the
    # first professor-name match is used for every row of this page.
    frames.append(pd.DataFrame({
        'Comments': content,
        'Easiness': Easiness,
        'Helpfulness': Helpfulness,
        'Clarity': Clarity,
        'Rater Interest': Interest,
        'Class': Class,
        'Date': Date[1:],
        'Professor': PROFNAME[0],
    }))

# pd.concat returns a new DataFrame, so the result must be kept.
# ignore_index=True renumbers the rows 0..n-1 instead of repeating each
# page's own 0-based index.
frame = pd.concat(frames, ignore_index=True)

score 0 · Answer 2 · answered Aug 16 '13 at 04:43

You are overwriting `frame` on each iteration of the loop. As Phillip Cloud suggested, you can build a list of frames, appending to it on each iteration. I simplified your code differently, but I think this gives you what you want.

import urllib
import re
import pandas as pd

# RateMyProfessors rating pages to scrape.
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

# Compiled patterns, keyed by the DataFrame column each one fills.
regex = {'pattern' : re.compile('<p class="commentText">(.+?)</p>'),
        'easiness' : re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
        'helpfulness' : re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
        'clarity' : re.compile('Clarity</strong><span>(.+?)</span></p>'),
        'interest' : re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
        'date' : re.compile('<div class="date">(.+?)</div>'),
        'mathclass' : re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
        'prof_name' : re.compile('<meta name="prof_name" content="(.+?)"/>')}

# Make a dictionary with empty lists using the same keys
d = {}
for k in regex.keys():
    d[k] = []

# Now fill those lists with one entry per rating. Appending the whole
# findall() result would instead store one list per page, giving a frame with
# one row per URL and list-valued cells.
for url in urls:
    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Release the connection even if the read fails.
        htmlfile.close()

    found = {}
    for k, v in regex.items():
        found[k] = v.findall(htmltext)

    # Mirror the question's code so every column of a page has the same
    # length: it skips the first date match (presumably not a rating date --
    # TODO confirm) and uses the first professor name for every row.
    found['date'] = found['date'][1:]
    found['prof_name'] = found['prof_name'][:1] * len(found['pattern'])

    for k, values in found.items():
        d[k].extend(values)
frame = pd.DataFrame(d) # Dump the dict into a DataFrame
print(frame)

You could even do `d = defaultdict(list)` and avoid that first loop. :) — Phillip Cloud, Aug 16 '13 at 06:11

Concatenate Data From URLS Recursively Inside one DataFrame

2 Answers