0

I'm scraping using httpx and trio, after few iteration on the link, it return an error - ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine

and this is my script:

allin = []

async def worker(channel):
    async with channel:
        async for key_ in channel:
            async with httpx.AsyncClient(timeout=None) as client:
                client.headers.update(h)
                params = {
                    "e": key_
                }
                r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
                
                soup =bs(r.text,'html.parser')
                
                try :
                    all = soup.find('td',class_='FootNote').text
                except :
                    all = np.nan
                
                name = pd.read_html(r.text, match='Name')[0].set_index([0, pd.read_html(r.text, match='Name')[0].groupby(0).cumcount()])[1].unstack(0)
                comname = pd.read_html(r.text, match='Company Name')[0].set_index([0, pd.read_html(r.text, match='Company Name')[0].groupby(0).cumcount()])[1].unstack(0)
        
                try :
                    adm = pd.read_html(r.text, match='Admission Sponsor', index_col=0)[0].T
                except :
                    adm = pd.DataFrame({'Admission Sponsor':np.nan,'Sponsor':np.nan},index=[0])

                df = name.join(comname).join(adm)
                
                df['remark']= all
                allin.append(df)
                
                finaldf = pd.concat(allin, ignore_index=True)
#                 print(finaldf )
                finaldf.to_excel(exportpath, index=False,sheet_name='Change_of_Company_secretary', engine='xlsxwriter')



async def main():
    async with trio.open_nursery() as nurse:

        sender, receiver = trio.open_memory_channel(0)

        async with receiver:
            for _ in range(1000):
                nurse.start_soon(worker, receiver.clone())
            count=0
            async with sender:
                
                for k in titlelink:
                    await sender.send(k)
                    count +=1
                    print(count,'ID-',k,'|', end=' ')
                    
                    

    

if __name__ == "__main__":
    start = datetime.datetime.now()
    trio.run(main)
    
    finish = datetime.datetime.now() - start 
    print("Time Taken:",finish)

the total link to be iterate is 9731, it break before even hit 2000 link, im not sure which part of my script needed to be fix to encounter this issues.

Full error message as below :

---------------------------------------------------------------------------
ReadError                                 Traceback (most recent call last)
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
     60     try:
---> 61         yield
     62     except Exception as exc:

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
    282                 extensions,
--> 283             ) = await self._pool.handle_async_request(
    284                 method=method,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection_pool.py in handle_async_request(self, method, url, headers, stream, extensions)
    236             try:
--> 237                 response = await connection.handle_async_request(
    238                     method, url, headers=headers, stream=stream, extensions=extensions

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection.py in handle_async_request(self, method, url, headers, stream, extensions)
    147         )
--> 148         return await self.connection.handle_async_request(
    149             method, url, headers, stream, extensions

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in handle_async_request(self, method, url, headers, stream, extensions)
    127             headers,
--> 128         ) = await self._receive_response(timeout)
    129         response_stream = AsyncIteratorByteStream(

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_response(self, timeout)
    188         while True:
--> 189             event = await self._receive_event(timeout)
    190             if isinstance(event, h11.Response):

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_event(self, timeout)
    224             if event is h11.NEED_DATA:
--> 225                 data = await self.socket.read(self.READ_NUM_BYTES, timeout)
    226 

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_backends\trio.py in read(self, n, timeout)
     65                     await self.stream.aclose()
---> 66                     raise exc
     67 

c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
    130             try:
--> 131                 self.gen.throw(type, value, traceback)
    132             except StopIteration as exc:

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_exceptions.py in map_exceptions(map)
     11             if isinstance(exc, from_exc):
---> 12                 raise to_exc(exc) from None
     13         raise

ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine

The above exception was the direct cause of the following exception:

ReadError                                 Traceback (most recent call last)
<ipython-input-14-59a009208fba> in <module>
     59 if __name__ == "__main__":
     60     start = datetime.datetime.now()
---> 61     trio.run(main)
     62     asyncio.sleep(1)
     63     finish = datetime.datetime.now() - start

    [... skipping hidden 1 frame]

<ipython-input-14-59a009208fba> in main()
     51                     await sender.send(k)
     52                     count +=1
---> 53                     print(count,'ID-',k,'|', end=' ')
     54 
     55 

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\trio\_core\_run.py in __aexit__(self, etype, exc, tb)
    813             old_context = combined_error_from_nursery.__context__
    814             try:
--> 815                 raise combined_error_from_nursery
    816             finally:
    817                 _, value, _ = sys.exc_info()

<ipython-input-14-59a009208fba> in worker(channel)
      9                     "e": key_
     10                 }
---> 11                 r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
     12 
     13                 soup =bs(r.text,'html.parser')

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in get(self, url, params, headers, cookies, auth, allow_redirects, timeout)
   1720         **Parameters**: See `httpx.request`.
   1721         """
-> 1722         return await self.request(
   1723             "GET",
   1724             url,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in request(self, method, url, content, data, files, json, params, headers, cookies, auth, allow_redirects, timeout)
   1479             cookies=cookies,
   1480         )
-> 1481         response = await self.send(
   1482             request, auth=auth, allow_redirects=allow_redirects, timeout=timeout
   1483         )

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in send(self, request, stream, auth, allow_redirects, timeout)
   1566         auth = self._build_request_auth(request, auth)
   1567 
-> 1568         response = await self._send_handling_auth(
   1569             request,
   1570             auth=auth,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_auth(self, request, auth, timeout, allow_redirects, history)
   1602 
   1603             while True:
-> 1604                 response = await self._send_handling_redirects(
   1605                     request,
   1606                     timeout=timeout,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_redirects(self, request, timeout, allow_redirects, history)
   1638                 )
   1639 
-> 1640             response = await self._send_single_request(request, timeout)
   1641             try:
   1642                 response.history = list(history)

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_single_request(self, request, timeout)
   1679                 stream,
   1680                 extensions,
-> 1681             ) = await transport.handle_async_request(
   1682                 request.method.encode(),
   1683                 request.url.raw,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
    276     ]:
    277         with map_httpcore_exceptions():
--> 278             (
    279                 status_code,
    280                 headers,

c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
    129                 value = type()
    130             try:
--> 131                 self.gen.throw(type, value, traceback)
    132             except StopIteration as exc:
    133                 # Suppress StopIteration *unless* it's the same exception that

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
     76 
     77         message = str(exc)
---> 78         raise mapped_exc(message) from exc
     79 
     80 

ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine
  • always put full error message (starting at word "Traceback") in question (not comment) as text (not screenshot, not link to external portal). There are other useful information. – furas Jul 27 '21 at 08:18
  • maybe you send too many requests to server and it think you are hacker or spamer and it disconected you. – furas Jul 27 '21 at 08:22
  • @furas i updated the question with full error message – Yazid Yaakub Jul 27 '21 at 08:26
  • Network connections break. It's a fact of life. Some servers shut down the worker threads after serving so many requests, if there is a proxy in the way, those tend to do it too, or some connection-tracking firewall runs out of space and drops the connection…. Just implement proper re-connecting. – Jan Hudec Jul 27 '21 at 08:39

0 Answers0