I'm using PyQT4 (for the first time) to scrape some pages. Since I try to scrape multiple pages I use QEventloop. However I could not add loadFinished signal to code. Here is how my code looks like this:
# Imports
import requests
from bs4 import BeautifulSoup
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import QNetworkRequest
from PyQt4.QtGui import *
from lxml import html
import csv
import win_unicode_console
import time
# Main setting
DIR = "data"
URL = "https://addons.mozilla.org"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
def Render(url):
page = QWebPage()
loop = QEventLoop() # Create event loop
page.mainFrame().loadFinished.connect(loop.quit) # Connect loadFinished to loop quit
page.mainFrame().load(QUrl(url))
loop.exec_() # Run event loop, it will end on loadFinished
return page.mainFrame().toHtml()
app = QApplication(sys.argv)
def pagination(page):
page_url = "https://addons.mozilla.org/en-US/firefox/extensions/?sort=users&page=" + str(page)
response = requests.get(page_url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")
items = soup.findAll("div", class_="item addon")
for item in items:
time.sleep(2)
item = URL + item.h3.select('a')[0].get('href')
print(item)
addon_scraper(item)
def addon_scraper(url):
time.sleep(7)
result = Render(url)
print(result)
soup = BeautifulSoup(result, "lxml")
addon_name = soup.select("#addon > hgroup > h1 > span")[0].get_text()
print(addon_name)
addon_author = soup.select("#addon > hgroup > h4 > a")[0].get_text()
category = soup.select("#related > ul")[0].get_text().strip()
with open("category_list.csv", "a", newline="", encoding="utf-16") as f:
writer = csv.writer(f, dialect="excel-tab")
writer.writerow([addon_name, addon_author, category])
# Run the scraper
if __name__ == "__main__":
win_unicode_console.enable() # Enable unicode support in command line interface
for i in range(1, 100):
print(i)
pagination(i)
app.exit()
At the end it just restarts the script and does nothing. I was trying to implement solution provided by user Mip here:Web Scraping Multiple Links with PyQt / QtWebkit I think adding user agent to above app and implicit sleep (similar to selenium case) would solve my problem. But I couldn't manage to do it. Now I get the following error. I think it is because PyQt4 exits the loop before source content is loaded:
Traceback (most recent call last): File "main.py", line 56, in <module> pagination(i) File "mozilla_file.py", line 36, in pagination addon_scraper(item) File "mozilla_file.py", line 46, in addon_scraper category = soup.select("#related > ul")[0].get_text().strip() IndexError: list index out of range