I'm learning how to program and I want to scrape a webpage minus the JavaScript code. I'm following an example from a book. The code below should return just the HTML code from the website; however, it only returns the title of the site and some JavaScript code at the bottom. Can someone please let me know where I went wrong? Cheers.
import urllib2
from bs4 import BeautifulSoup
# Fetch the page, remove every <script> element from the parsed tree, and
# print the text that remains.
url = "http://www.theurl.com/"
page = urllib2.urlopen(url)
try:
    # The response is parsed while it is still open; it is closed right
    # after so the connection is not left dangling for the rest of the run.
    soup = BeautifulSoup(page, "html.parser")
finally:
    page.close()

# Use a plain loop: extract() is called for its side effect (detaching the
# tag from the tree), so collecting the results in a list is wasted work.
for script_tag in soup.find_all('script'):
    script_tag.extract()

# Parenthesised single-argument form prints the same text on Python 2 and
# also parses on Python 3.
print(soup.get_text())
This is what it returns after the title:
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-11092338-1']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();