I'm scraping using httpx and trio, after few iteration on the link, it return an error - ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine
and this is my script:
allin = []
async def worker(channel):
async with channel:
async for key_ in channel:
async with httpx.AsyncClient(timeout=None) as client:
client.headers.update(h)
params = {
"e": key_
}
r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
soup =bs(r.text,'html.parser')
try :
all = soup.find('td',class_='FootNote').text
except :
all = np.nan
name = pd.read_html(r.text, match='Name')[0].set_index([0, pd.read_html(r.text, match='Name')[0].groupby(0).cumcount()])[1].unstack(0)
comname = pd.read_html(r.text, match='Company Name')[0].set_index([0, pd.read_html(r.text, match='Company Name')[0].groupby(0).cumcount()])[1].unstack(0)
try :
adm = pd.read_html(r.text, match='Admission Sponsor', index_col=0)[0].T
except :
adm = pd.DataFrame({'Admission Sponsor':np.nan,'Sponsor':np.nan},index=[0])
df = name.join(comname).join(adm)
df['remark']= all
allin.append(df)
finaldf = pd.concat(allin, ignore_index=True)
# print(finaldf )
finaldf.to_excel(exportpath, index=False,sheet_name='Change_of_Company_secretary', engine='xlsxwriter')
async def main():
async with trio.open_nursery() as nurse:
sender, receiver = trio.open_memory_channel(0)
async with receiver:
for _ in range(1000):
nurse.start_soon(worker, receiver.clone())
count=0
async with sender:
for k in titlelink:
await sender.send(k)
count +=1
print(count,'ID-',k,'|', end=' ')
if __name__ == "__main__":
start = datetime.datetime.now()
trio.run(main)
finish = datetime.datetime.now() - start
print("Time Taken:",finish)
the total link to be iterate is 9731, it break before even hit 2000 link, im not sure which part of my script needed to be fix to encounter this issues.
Full error message as below :
---------------------------------------------------------------------------
ReadError Traceback (most recent call last)
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
60 try:
---> 61 yield
62 except Exception as exc:
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
282 extensions,
--> 283 ) = await self._pool.handle_async_request(
284 method=method,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection_pool.py in handle_async_request(self, method, url, headers, stream, extensions)
236 try:
--> 237 response = await connection.handle_async_request(
238 method, url, headers=headers, stream=stream, extensions=extensions
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection.py in handle_async_request(self, method, url, headers, stream, extensions)
147 )
--> 148 return await self.connection.handle_async_request(
149 method, url, headers, stream, extensions
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in handle_async_request(self, method, url, headers, stream, extensions)
127 headers,
--> 128 ) = await self._receive_response(timeout)
129 response_stream = AsyncIteratorByteStream(
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_response(self, timeout)
188 while True:
--> 189 event = await self._receive_event(timeout)
190 if isinstance(event, h11.Response):
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_event(self, timeout)
224 if event is h11.NEED_DATA:
--> 225 data = await self.socket.read(self.READ_NUM_BYTES, timeout)
226
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_backends\trio.py in read(self, n, timeout)
65 await self.stream.aclose()
---> 66 raise exc
67
c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
130 try:
--> 131 self.gen.throw(type, value, traceback)
132 except StopIteration as exc:
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_exceptions.py in map_exceptions(map)
11 if isinstance(exc, from_exc):
---> 12 raise to_exc(exc) from None
13 raise
ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine
The above exception was the direct cause of the following exception:
ReadError Traceback (most recent call last)
<ipython-input-14-59a009208fba> in <module>
59 if __name__ == "__main__":
60 start = datetime.datetime.now()
---> 61 trio.run(main)
62 asyncio.sleep(1)
63 finish = datetime.datetime.now() - start
[... skipping hidden 1 frame]
<ipython-input-14-59a009208fba> in main()
51 await sender.send(k)
52 count +=1
---> 53 print(count,'ID-',k,'|', end=' ')
54
55
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\trio\_core\_run.py in __aexit__(self, etype, exc, tb)
813 old_context = combined_error_from_nursery.__context__
814 try:
--> 815 raise combined_error_from_nursery
816 finally:
817 _, value, _ = sys.exc_info()
<ipython-input-14-59a009208fba> in worker(channel)
9 "e": key_
10 }
---> 11 r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
12
13 soup =bs(r.text,'html.parser')
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in get(self, url, params, headers, cookies, auth, allow_redirects, timeout)
1720 **Parameters**: See `httpx.request`.
1721 """
-> 1722 return await self.request(
1723 "GET",
1724 url,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in request(self, method, url, content, data, files, json, params, headers, cookies, auth, allow_redirects, timeout)
1479 cookies=cookies,
1480 )
-> 1481 response = await self.send(
1482 request, auth=auth, allow_redirects=allow_redirects, timeout=timeout
1483 )
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in send(self, request, stream, auth, allow_redirects, timeout)
1566 auth = self._build_request_auth(request, auth)
1567
-> 1568 response = await self._send_handling_auth(
1569 request,
1570 auth=auth,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_auth(self, request, auth, timeout, allow_redirects, history)
1602
1603 while True:
-> 1604 response = await self._send_handling_redirects(
1605 request,
1606 timeout=timeout,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_redirects(self, request, timeout, allow_redirects, history)
1638 )
1639
-> 1640 response = await self._send_single_request(request, timeout)
1641 try:
1642 response.history = list(history)
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_single_request(self, request, timeout)
1679 stream,
1680 extensions,
-> 1681 ) = await transport.handle_async_request(
1682 request.method.encode(),
1683 request.url.raw,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
276 ]:
277 with map_httpcore_exceptions():
--> 278 (
279 status_code,
280 headers,
c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
129 value = type()
130 try:
--> 131 self.gen.throw(type, value, traceback)
132 except StopIteration as exc:
133 # Suppress StopIteration *unless* it's the same exception that
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
76
77 message = str(exc)
---> 78 raise mapped_exc(message) from exc
79
80
ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine