
I am new to programming. I have created a web scraper in Python using Beautiful Soup, but when I run the program it opens the Python command line, the cursor just blinks, and nothing happens. Eventually I receive these errors:

TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

Please don't mind the indentation; it got mangled when pasting here.

Below is my code:

import urllib.request
import urllib.parse
import json
import xml.etree.ElementTree as ET
import csv
from bs4 import BeautifulSoup

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"

alldata = []

links = {}
certificatedata = []

def getData(url, values):
    # POST the form values to the given URL and return the decoded response body
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    response = urllib.request.urlopen(req)
    data = response.read()
    data = data.decode("utf-8")
    return data


def getDivsion():
    ## for now we are taking 6 districts; it needs to be updated when the data gets updated
    return range(1, 7)

def getDistrict(divId):
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data

def parseJson(data):
    parsed = json.loads(data)
    return parsed

def getTaluka(disId):
    global talukaLink
    values = {'DisID': disId}
    data = getData(talukaLink, values)
    return data

def getProjects(divId, disId):
    global prjLink
    values = {'DisID': disId, 'DivID': divId}
    #print(values)
    data = getData(prjLink, values)
    if len(data) < 10:
        return "{}"
    return data

def getProjectsList():
    divList = getDivsion()
    flag = 0
    for divId in divList:
        disData = getDistrict(divId)
        disList = parseJson(disData)
        for disObj in disList:
            disId = disObj["ID"]
            prjData = getProjects(divId, disId)
            #print(" >>>> "+str(disId)+" >> "+str(divId))
            #print(prjData)
            prjJson = parseJson(prjData)
            for prjObj in prjJson:
                flag += 1
                prjId = prjObj["ID"]
                values = {'ID': 0, 'pageTraverse': 1, 'Division': divId, 'hdnDistrict': '', 'hdnProject': '', 'District': disId, 'Taluka': '', 'Village': '', 'Project': prjId, 'CertiNo': '', 'btnSearch': 'Search'}
                finalPrjData = getData(link, values)
                parseXMLData(finalPrjData)
                #if len(alldata)>100:
                #    break

def parseXMLData(htmldata):
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        print(len(alldata))
        attr = table.attrs
        if "table" in attr['class']:
            tbody = table.find_all("tbody")
            if len(tbody) > 0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    sublist = []
                    td_lst = tr.find_all("td")
                    if len(td_lst) > 6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        certNo = td_lst[3].text
                        sublist.append(prjname)
                        sublist.append(proname)
                        sublist.append(certNo)
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        if len(a_lst) > 0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            link = "https://maharerait.mahaonline.gov.in/" + href
                            links[certNo] = link
                            sublist.append(link)
                    if len(sublist) > 0:
                        alldata.append(sublist)
    return alldata


def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    with open("./" + filename, 'w', newline='') as csvfile:  # newline='' avoids blank rows on Windows
        writer = csv.writer(csvfile, delimiter=',')
        #writer.writerow(titleRow)
        writer.writerow("")
        for i in range(0, len(alldata1)):
            #print(alldata1[i])
            writer.writerow(alldata1[i])


def processlinksforcert():
    global links, certificatedata
    print(">> Came in fetching certificates data >>> ")
    for certno in links.keys():
        link = links[certno]
        htmldata = getData(link, {})
        soup = BeautifulSoup(htmldata, "html.parser")
        divs = soup.find_all("div")
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivProfessional" in attr['id']:
                table = div.find_all("table")
                if len(table) <= 0:
                    continue
                t_attr = table[0].attrs
                if "table" in t_attr["class"]:
                    print(len(certificatedata))
                    table = table[0]
                    tr_lst = table.find_all("tr")
                    index = 1  # start at 1 to skip the header row
                    while index < len(tr_lst):
                        #import pdb; pdb.set_trace()
                        tr = tr_lst[index]
                        index += 1
                        sublist = []
                        td_lst = tr.find_all("td")
                        if len(td_lst) > 2:
                            sublist.append(certno)
                            pername = formattext(td_lst[0].text)
                            cerno = formattext(td_lst[1].text)
                            proftype = formattext(td_lst[2].text)
                            sublist.append(pername)
                            sublist.append(cerno)
                            sublist.append(proftype)
                            certificatedata.append(sublist)
    return certificatedata

def formattext(text):
    while text.find("\r\n") >= 0:
        text = text.replace("\r\n", "")
    while text.find("   ") >= 0:
        text = text.replace("   ", "")
    return text

def main():
    global alldata, certificatedata
    #data = getData(url, {})
    getProjectsList()
    print("Before write the projects data to the file. Count >> " + str(len(alldata)))
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> " + str(len(data)))
    writedata(data, "certificates.csv")


main()

Can someone please suggest what I am doing wrong? I have everything installed, including pip and Beautiful Soup via pip. Again, please don't mind the indentation; it is just how it pasted here.

  • I think the problem is that I can't even access some of your URLs from my browser, e.g. `https://maharerait.mahaonline.gov.in/SearchList/GetTaluka`. You probably have to pass cookies or login info to the destination site to let your code get the data. As it is, the script can't reach the source, which is why it fails once the timeout expires (a hedged sketch of that idea follows these comments). – Grynets Aug 03 '17 at 08:48
  • But I had created a scraper for the same website in Excel VBA and it worked fine. I also noticed that when I ping maharerait.mahaonline.gov.in it returns 100% loss. – Prince Bhatia Aug 03 '17 at 17:13
  • You asked for suggestions, and as far as I can see the problem is not with your code but with this particular site: as you said, you get 100% packet loss, and that is why the script is not working. You need to investigate the main difference between your code and the Excel VBA code. I can't help there because I have no experience with Excel VBA, but I suspect the problem is close to cookies or authentication. – Grynets Aug 03 '17 at 20:13
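For reference, here is a minimal sketch of what the comments suggest: send the POST with a browser-like User-Agent header and an explicit timeout, so the request either succeeds or fails quickly instead of hanging until Windows reports WinError 10060. The helper name getDataWithHeaders, the header value, and the 30-second timeout are illustrative assumptions; the site may additionally require real session cookies from a logged-in browser.

import urllib.parse
import urllib.request

def getDataWithHeaders(url, values):
    # Sketch only, not verified against this site.
    # Encode the form fields exactly as the original getData() does
    data = urllib.parse.urlencode(values).encode('utf-8')
    # A browser-like User-Agent; some servers drop requests without one (assumption)
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.request.Request(url, data, headers)
    # The timeout makes the call fail fast instead of blinking forever
    with urllib.request.urlopen(req, timeout=30) as response:
        return response.read().decode('utf-8')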

1 Answer


I solved it by using Selenium. Thank you so much, everyone.
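The answer does not include code, but a minimal sketch of the Selenium approach could look like the following. Using the Chrome driver, a fixed sleep, and feeding the rendered page into the same BeautifulSoup parsing are assumptions for illustration, not the poster's actual solution.

from selenium import webdriver
from bs4 import BeautifulSoup
import time

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'

driver = webdriver.Chrome()    # assumes a Chrome driver is installed; any browser driver would do
driver.get(link)               # a real browser session avoids the raw urllib timeouts
time.sleep(10)                 # crude wait for the page and its scripts to finish loading
soup = BeautifulSoup(driver.page_source, "html.parser")
print(len(soup.find_all("table")))   # the rendered HTML can now go through the same table parsing
driver.quit()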