import sys
import requests
import csv
import os
from bs4 import BeautifulSoup



def main():
    url = input_arguments()[0]
    filename = input_arguments()[1]
    header = summary_info(url)
    data = data_scraping(url)
    save_to_csv(filename, header, data)
    check_file(filename)


def url_list():
    url = "https://volby.cz/pls/ps2017nss/ps3?xjazyk=CZ"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    list_href = []
    for index in range(1, 15):
        find = soup.find_all(("td", {"headers": f"{index}sa3"}))
        for href in find:
            url_part = "https://volby.cz/pls/ps2017nss/"
            all_url = url_part + href.a["href"]
            list_href.append(all_url)
    return list_href

def input_arguments():
    """
    Kontroluje a přijímá argumet URL a název výstupního souboru.
    """
    if len(sys.argv) != 3:
        print(
            "+--------------------------------------------------------------------------+",
            "| 1) As the first argument, pass the URL address in quotation marks:       |",
            '|    e.g. "https://volby.cz/pls/ps2017"                                    |',
            "| 2) As the second argument, pass the output file name in quotation marks: |",
            '|    e.g. "vysledky_lipno.csv"                                             |',
            "| 3) Full pattern:                                                         |",
            "+--------------------------------------------------------------------------+",
            '| election_scraper.py "https://volby.cz/pls/ps2017" "vysledky_lipno.csv"   |',
            "+--------------------------------------------------------------------------+",
            sep="\n"
        )
        quit()
    elif sys.argv[2] in url_list():
        print(
            "+--------------------------------------------------------------------------+",
            "| Pass the URL as the first argument and the file name as the second.     |",
            "| Put both arguments in quotation marks, separated by a space.            |",
            "| See the pattern:                                                         |",
            "+--------------------------------------------------------------------------+",
            '| election_scraper.py "https://volby.cz/pls/ps2017" "vysledky_lipno.csv"   |',
            "+--------------------------------------------------------------------------+",
            sep="\n"
        )
        quit()
    elif sys.argv[1] not in url_list():
        print(
            "+--------------------------------------------------------------------------+",
            "| This URL address is not supported.                                       |",
            "| Pass a supported URL address as shown in the pattern:                    |",
            "+--------------------------------------------------------------------------+",
            '| election_scraper.py "https://volby.cz/pls/ps2017" "vysledky_lipno.csv"   |',
            "+--------------------------------------------------------------------------+",
            sep="\n"
        )
        quit()
    elif not sys.argv[2].endswith('.csv'):
        print(
            "+--------------------------------------------------------------------------+",
            "| The file name must end with the .csv extension.                          |",
            "| See the pattern:                                                         |",
            "+--------------------------------------------------------------------------+",
            '| election_scraper.py "https://volby.cz/pls/ps2017" "vysledky_lipno.csv"   |',
            "+--------------------------------------------------------------------------+",
            sep="\n"
        )
        quit()
    else:
        url = sys.argv[1]
        filename = sys.argv[2]
    return url, filename

def summary_info(url):
    """
    Vypíše názvy stran.
    """
    header = [
        "kód obce,"
        "název obce",
        "voliči v seznamu",
        "vydané obálky",
        "platé hlasy",
    ]

    first_page = requests.get(url)

    print(
        f"Stahuji informace z vybraného URL:",
        f"{url}",
        sep="\n"
    )

    soup = BeautifulSoup(first_page.text, "html.parser")
    first_page_href = soup.find("td", {"class": "cislo"}).a["href"]

    part_url = "https://volby.cz/pls/ps2017nss/"
    header_url = part_url + first_page_href

    second_page = requests.get(header_url)
    soup = BeautifulSoup(second_page.text, "html.parser")

    for name in (soup.find_all("td", {"class": "overflow_name"})):
        header.append(name.text)

    return header

def data_scraping(url):
    """
    ( This function allows download data)Funkce bere kód obce a název obce a přidává data do listu ze stránky vybrané
    uživatelem. Následně přistupuje přes kód obce ke zbylým datům, která přidává do listu.
    Jakmile má funkce všechna data pro danou obec/řádek, tak přidává list do listu data.
    Tento postup se opakuje pro všechny obce.
    """

    first_page = requests.get(url)

    print(
        "Downloading data from the selected URL:",
        f"{url}",
        sep="\n"
    )

    soup = BeautifulSoup(first_page.text, "html.parser")

    first_page_code = soup.find_all("td", {"class": "cislo"})
    first_page_names = soup.find_all("td", {"class": "overflow_name"})

    if len(first_page_names) == 0:
        first_page_names = soup.find_all("td", {"headers": "t1sa1 t1sb2"})

    first_page_hrefs = [href.a["href"] for href in first_page_code]

    data = []

    part_url = "https://volby.cz/pls/ps2017nss/"

    for index, result in enumerate(first_page_hrefs):
        row_list = []

        second_url = part_url + result

        second_page = requests.get(second_url)
        soup = BeautifulSoup(second_page.text, "html.parser")

        row_list.append(first_page_code[index].text)
        row_list.append(first_page_names[index].text)

        row_list.append((soup.find("td", {"headers": "sa2"}).text).replace('\xa0', ''))
        row_list.append((soup.find("td", {"headers": "sa3"}).text).replace('\xa0', ''))
        row_list.append((soup.find("td", {"headers": "sa6"}).text).replace('\xa0', ''))

        first_candidate_parties = (soup.find_all("td", {"headers": "t1sa2 t1sb3"}))
        for data_candidate in first_candidate_parties:
            row_list.append(data_candidate.text.replace('\xa0', ''))

        second_candidate_parties = (soup.find_all("td", {"headers": "t2sa2 t2sb3"}))
        for data_candidate in second_candidate_parties:
            numeric = (data_candidate.text.replace('\xa0', ''))
            if numeric.isnumeric():
                row_list.append(numeric)

        data.append(row_list)

    return data


def check_file(filename):
    dash = "-" * len(filename)

    if filename in os.listdir():
        print(
            f"+--------------------------{dash}+",
            f"| Data saved to the file: {filename} |",
            f"+--------------------------{dash}+",
            "Exiting program...",
            sep="\n", end=""
        )
    else:
        print(
            f"+------------------{dash}+",
            f"| File not found: {filename} |",
            f"+------------------{dash}+",
            "Exiting program...",
            sep="\n", end=""
        )


def save_to_csv(filename, header, data):
    """
    Uloží data do csv.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as data_csv:
        writer = csv.writer(data_csv)
        print(
            f"Ukládám data do vybraného souboru:",
            f"{filename}",
            sep="\n"
        )
        writer.writerow(header)
        for row in data:
            writer.writerow(row)


if __name__ == "__main__":
    main()

To start the program, you must enter three things in the PyCharm terminal, and it looks like this (the full command is shown below the list):

  • py election_scraper.py

  • The URL in quotation marks: "https://volby.cz/pls/ps2017nss/ps311?xjazyk=CZ&xkraj=6&xobec=566403&xvyber=4204"

  • The name of the result file in CSV format, like "vysledky_lipno.csv"
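
So the complete command I run is:

py election_scraper.py "https://volby.cz/pls/ps2017nss/ps311?xjazyk=CZ&xkraj=6&xobec=566403&xvyber=4204" "vysledky_lipno.csv"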

But when I try to do this, I still get an error like this:

Traceback (most recent call last):
  File "C:\Users\Admin\PycharmProjects\pythonProject3ElectionScraper\election_scraper.py", line 226, in <module>      
    main()
  File "C:\Users\Admin\PycharmProjects\pythonProject3ElectionScraper\election_scraper.py", line 16, in main
    url = input_arguments()[0]
  File "C:\Users\Admin\PycharmProjects\pythonProject3ElectionScraper\election_scraper.py", line 55, in input_arguments
    elif sys.argv[2] in url_list():
  File "C:\Users\Admin\PycharmProjects\pythonProject3ElectionScraper\election_scraper.py", line 33, in url_list       
    all_url = url_part + href.a["href"]
TypeError: 'NoneType' object is not subscriptable

I am new to this, so sorry if this question is stupid, but I have been trying to handle this for so long that I decided to ask here for help. Thank you.


1 Answer


sys.argv has nothing to do with your problem

You try to get the item href.a, but some elements don't have an <a> inside href, so href.a gives None, and href.a["href"] then means None["href"] - and this raises the error.

You should access ["href"] only when href.a is not None:

url_part = "https://volby.cz/pls/ps2017nss/"

for href in find:
    if href.a is not None:  # skip <td> cells that contain no link
        all_url = url_part + href.a["href"]
        list_href.append(all_url)
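
By the way, the reason some cells have no <a> at all is probably the extra parentheses in find_all() - you pass a single tuple ("td", {...}) as the tag name, so BeautifulSoup never applies the headers filter and matches every <td>. Passing the attributes dict as a separate argument (my assumption about what you meant) restores the filter:

find = soup.find_all("td", {"headers": f"{index}sa3"})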

Now it works without error, but you have another problem with url_list(), because if I run the code

election_scraper.py "https://volby.cz/pls/ps2017" "vysledky_lipno.csv"

then it always shows the message

This URL address is not supported.

You have the wrong logic in this code, but I don't know what you expect.
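
If you meant to validate the URL (first argument) against url_list() and the file name (second argument) by its extension, then a minimal sketch of the checks inside input_arguments() - only my guess at your intent - could be:

if len(sys.argv) != 3:
    print('Usage: election_scraper.py "<url>" "<filename>.csv"')
    quit()
elif sys.argv[1] not in url_list():        # validate the URL, not the file name
    print("This URL address is not supported.")
    quit()
elif not sys.argv[2].endswith(".csv"):     # validate the output file extension
    print("The file name must end with the .csv extension.")
    quit()
else:
    return sys.argv[1], sys.argv[2]

But even then, "https://volby.cz/pls/ps2017" will not be in url_list(), because url_list() only collects the links scraped from the overview page - which URLs you want to accept is something only you can decide.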
