2

I am learning the book programming collective intelligence. Below is my code:

import feedparser
import re

# Returns title and dictionary of word counts for an RSS feed
def getwordcounts(url):
    # Parse the feed
    d = feedparser.parse(url)
    wc={}

    # Loop over all the entries
    for e in d.entries:
        if 'summary' in e:
            summary = e.summary
        else:
            summary = e.description

        # Extract a list of words
        words = getwords(e.title + '' + summary)
        for word in words:
            wc.setdefault(word, 0)
            wc[word] += 1
    return d.feed.title, wc

def getwords(html):
    # Remove all the HTML tags
    txt = re.compile(r'[^>]+>').sub('',html)

    # Split words by all non-alpha characters
    words = re.compile(r'[^A-Z^a-z]+').split(txt)

    # Convert to lowercase
    return [word.lower() for word in words if word!='']


apcount = {}
wordcounts = {}
for feedurl in file('feedlist.txt'):
    title, wc = getwordcounts(feedurl)
    wordcounts[title] = wc
    for word, count in wc.items():
        apcount.setdefault(word, 0)
        if count>1:
            apcount[word] += 1

wordlist = []
for w, bc in apcount.items():
    frac = float(bc)/len(feedlist)
    if frac>0.1 and frac<0.5:
        wordlist.append(w)

out = file('blogdata.txt', 'w')
out.write('Blog')
for word in wordlist:
    out.write('\t%s' % word)
out.write('\n')
for blog, wc in wordcounts.items(): 
    out.write(blog)
    for word in wordlist:
        if word in wc:
            out.wirte('\t%d' % wc[word])
        else:
            out.write('\t0')
    out.write('\n')

When I run this script, I got the message:

Traceback (most recent call last):
  File "generatefeedvector.py", line 38, in <module>
    title, wc = getwordcounts(feedurl)
  File "generatefeedvector.py", line 22, in getwordcounts
    return d.feed.title, wc
  File "build/bdist.linux-x86_64/egg/feedparser.py", line 416, in __getattr__
AttributeError: object has no attribute 'title'

I have checked the version of feedparser is 5.1.3.

So how to fix this problem? Thanks

alecxe
  • 462,703
  • 120
  • 1,088
  • 1,195
epx
  • 571
  • 4
  • 16
  • 27

1 Answers1

2

The URL you are trying to parse with feedparser is either not a valid feed (check it with feedvalidator), but a web page, or the feed is empty, or the title is empty.

As a workaround, use getattr():

return getattr(d.feed, 'title', 'Unknown title'), wc

Also see:

Community
  • 1
  • 1
alecxe
  • 462,703
  • 120
  • 1,088
  • 1,195