The script extracts data from a list of URLs using BeautifulSoup and converts the data into a DataFrame in order to export it as an Excel file.
The problem is that when I try to convert the data into a DataFrame, it displays the error below:
Traceback (most recent call last):
File "f:\AIenv\web_scrapping\job_desc_email.py", line 144, in <module>
scrap_website()
File "f:\AIenv\web_scrapping\job_desc_email.py", line 88, in scrap_website
convert_to_dataFrame(joineddd)
File "f:\AIenv\web_scrapping\job_desc_email.py", line 98, in convert_to_dataFrame
df = pd.DataFrame(joineddd,columns=["link","location","Company_Industry","Company_Type","Job_Role","Employment_Type","Monthly_Salary_Range","Number_of_Vacancies","Career_Level","Years_of_Experience","Residence_Location","Gender","Nationality","Degree","Age"])
File "F:\AIenv\lib\site-packages\pandas\core\frame.py", line 509, in __init__
arrays, columns = to_arrays(data, columns, dtype=dtype)
File "F:\AIenv\lib\site-packages\pandas\core\internals\construction.py", line 524, in to_arrays
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
File "F:\AIenv\lib\site-packages\pandas\core\internals\construction.py", line 567, in _list_to_arrays
raise ValueError(e) from e
ValueError: 15 columns passed, passed data had 13 columns
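From what I understand, pandas raises this whenever a row in the data has fewer values than there are column names. A minimal example (hypothetical values, not from my scraper) produces a similar error:
import pandas as pd
pd.DataFrame([[1, 2]], columns=["a", "b", "c"])
# ValueError: 3 columns passed, passed data had 2 columns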
My question is: how do I fix this error?
I think that when an extracted field is empty, I could still keep its place in the row and display NaN or null instead.
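One idea (just a rough sketch; pad_row is a hypothetical helper I have not added to my script yet) would be to pad every scraped row with None up to the expected number of columns before calling pd.DataFrame, so the missing fields show up as NaN:
def pad_row(row, size):
    # fill missing fields with None so pandas displays them as NaN
    return row + [None] * (size - len(row))

# e.g. with the 15 expected columns:
# padded = [pad_row(row, 15) for row in joineddd]
# df = pd.DataFrame(padded, columns=[...])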
CODE:
import time
import pandas as pd
from datetime import date
import requests
from bs4 import BeautifulSoup
def scrap_website():
    url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]
    for url in url_list:
        soup = BeautifulSoup(requests.get(url).content, "lxml")
        links = []
        for a in soup.select("h2.m0.t-regular a"):
            if a['href'] not in links:
                links.append("https://www.bayt.com" + a['href'])
        joineddd = []
        for link in links:
            s = BeautifulSoup(requests.get(link).content, "lxml")
            alldd = [dd.text for dd in s.select("div[class='card-content is-spaced'] dd")]
            alldd.insert(0, link)
            joineddd.append(alldd)
        print("-" * 80)
        print("Web Crawling is Done for {}".format(url))
        convert_to_dataFrame(joineddd)

def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

def convert_to_dataFrame(joineddd):
    df = pd.DataFrame(joineddd, columns=["link","location","Company_Industry","Company_Type",
        "Job_Role","Employment_Type","Monthly_Salary_Range","Number_of_Vacancies","Career_Level",
        "Years_of_Experience","Residence_Location","Gender","Nationality","Degree","Age"])
    df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])
    df_to_excel = df.to_excel(r"F:\\AIenv\web_scrapping\\jobDesc.xlsx", index=False, header=True)

if __name__ == "__main__":
    scrap_website()