How do I extract the text part from part of an HTML tag while using a web-crawler

Question

import requests
from bs4 import BeautifulSoup
import operator
from collections import Counter

def start(url):
  """Fetch `url` and try to feed the words of each 'entry-content' div to clean_wordlist.

  As posted, this version fails inside the loop: see the notes on `.strings` below.
  """
  wordlist=[]
  source_code=requests.get(url).text
  soup=BeautifulSoup(source_code,'html.parser')
  for each_text in soup.findAll('div',{'class':'entry-content'}):
    # NOTE: .strings yields the tag's text pieces as a generator, not as a str.
    content=each_text.strings
    # This is the line that raises the reported AttributeError: a generator
    # object has no .lower() method.
    words=content.lower().split()
    for each_word in words:
        wordlist.append(each_word)
    # Called once per matching div, with everything accumulated so far.
    clean_wordlist(wordlist)

def clean_wordlist(wordlist):
  """Strip punctuation symbols from every word and pass the survivors on.

  Each character listed in `symbols` is removed from each word; words that
  end up empty are dropped. The cleaned list is handed to create_dictionary.
  """
  # '\\;' spells the backslash explicitly; a bare '\;' is an invalid escape
  # sequence and triggers a warning on newer Python versions.
  symbols='!@#$%^&*()_-+={[}]|\\;:"<>?/.,'
  clean_list=[]
  for word in wordlist:
    for symbol in symbols:
        word=word.replace(symbol,'')
    if len(word)>0:
        clean_list.append(word)
  # This call belongs inside the function: at module level it would run at
  # import time, before create_dictionary or clean_list exist.
  create_dictionary(clean_list)

def create_dictionary(clean_list):
  """Print each word with its count, least frequent first, then the top three words."""
  tally={}
  for token in clean_list:
    tally[token]=tally.get(token,0)+1
  # sorted() is stable, so words with equal counts keep first-seen order.
  ranked=sorted(tally.items(),key=operator.itemgetter(1))
  for entry in ranked:
    print ("%s : %s " % entry)
  # Counter is used only for its most_common() ranking of the finished tally.
  print(Counter(tally).most_common(3))

start("https://www.geeksforgeeks.org/programming-language-choose/")

The program above fails with `AttributeError: 'generator' object has no attribute 'lower'`. I printed the type of what `each_text.strings` returns, and it printed `<class 'generator'>`. How do I move forward from here and get the text out of the given page?

As documented, `.strings` returns a generator object. You'll have to iterate over it somehow to get the contents — juanpa.arrivillaga, Mar 14 '18 at 08:33

score 0 · Answer 1 · answered Mar 14 '18 at 09:13

Instead of `.strings`, which returns a generator object, just use `.text`, which returns the text as a single string. If you really want to use `.strings`, you can unpack the generator, e.g. `print(*strings_object)`.

As you can see, we put an asterisk before the object to unpack it. I won't go into the details, but you can find more about it HERE.

import requests
from bs4 import BeautifulSoup
import operator
from collections import Counter


def start(url):
    """Fetch a page and report word frequencies for its article body.

    Downloads ``url``, gathers the lowercased, whitespace-separated words from
    every ``<div class="entry-content">`` element, and hands the combined list
    to ``clean_wordlist`` exactly once.

    Args:
        url: Address of the page to crawl.
    """
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, 'html.parser')

    wordlist = []
    for each_text in soup.find_all('div', {'class': 'entry-content'}):
        # .text is the tag's text as one str; .strings would be a generator,
        # which has no .lower().
        wordlist.extend(each_text.text.lower().split())

    # Process the combined list once, after the loop. Calling this per div
    # would re-count and re-print the words of every earlier div each time.
    clean_wordlist(wordlist)


def clean_wordlist(wordlist):
    """Strip punctuation from each word and forward the non-empty results.

    Every character in the symbol set is deleted from each word; words that
    become empty are dropped. The cleaned list is passed to
    ``create_dictionary``.

    Args:
        wordlist: Iterable of words (strings) to clean.
    """
    # '\\;' spells the backslash explicitly; a bare '\;' is an invalid escape
    # sequence and triggers a warning on newer Python versions.
    symbols = '!@#$%^&*()_-+={[}]|\\;:"<>?/.,'
    # One translation table deletes all symbols in a single pass per word,
    # instead of one .replace() call per symbol.
    strip_symbols = str.maketrans('', '', symbols)
    stripped = (word.translate(strip_symbols) for word in wordlist)
    clean_list = [word for word in stripped if word]
    create_dictionary(clean_list)


def create_dictionary(clean_list):
    """Print word frequencies and the three most common words.

    First prints one ``word : count`` line per distinct word, ordered from
    least to most frequent, then prints a list of the three most frequent
    ``(word, count)`` pairs.

    Args:
        clean_list: Iterable of already-cleaned words.
    """
    # Counter does the tallying directly and keeps first-seen order, so ties
    # come out in the order the words first appeared.
    word_count = Counter(clean_list)
    for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
        print(f"{key} : {value} ")
    print(word_count.most_common(3))

# Run the word-frequency crawl against the GeeksforGeeks article.
start("https://www.geeksforgeeks.org/programming-language-choose/")

Thanks a ton!! Your answer worked pitch perfect! ;) — Pradhan29, Mar 14 '18 at 21:42

How do I extract the text part from part of an HTML tag while using a web-crawler

1 Answer