import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
from functools import partial

async def main(url):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        # pick the download link whose text contains "Table"
        tfile = soup.select_one('.file-link:-soup-contains(Table)').a['href']
        async with client.stream('GET', tfile) as r:
            # derive the filename from the Content-Disposition header
            fname = r.headers.get('content-disposition').split('=')[-1]
            # stream the download to disk chunk by chunk
            async with await trio.open_file(fname, 'wb') as f:
                async for chunk in r.aiter_bytes():
                    await f.write(chunk)
        # read_excel is blocking, so run it in a worker thread
        df = await trio.to_thread.run_sync(partial(pd.read_excel, fname, sheet_name=3, engine="pyxlsb"))
        print(df)

if __name__ == "__main__":
    trio.run(main, 'https://rigcount.bakerhughes.com/na-rig-count')
Output:
Country County Basin DrillFor ... Week RigCount State/Province PublishDate
0 UNITED STATES SABINE Haynesville Gas ... 13 1 LOUISIANA 40634
1 UNITED STATES TERREBONNE Other Oil ... 13 1 LOUISIANA 40634
2 UNITED STATES VERMILION Other Gas ... 13 1 LOUISIANA 40634
3 UNITED STATES VERMILION Other Gas ... 13 1 LOUISIANA 40634
4 UNITED STATES EDDY Permian Oil ... 13 1 NEW MEXICO 40634
... ... ... ... ... ... ... ... ... ...
769390 UNITED STATES KERN Other Oil ... 29 1 CALIFORNIA 44393
769391 UNITED STATES KERN Other Oil ... 29 1 CALIFORNIA 44393
769392 UNITED STATES KERN Other Oil ... 29 1 CALIFORNIA 44393
769393 UNITED STATES KERN Other Oil ... 29 1 CALIFORNIA 44393
769394 UNITED STATES KERN Other Oil ... 29 1 CALIFORNIA 44393
[769395 rows x 13 columns]
> Note: it seems you've hit a bug in the `pyxlsb` reader. Selecting the sheet by index is what triggers it, but using `sheet_name='Master Data'` works fine.
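In other words, selecting the sheet by name sidesteps the issue; a minimal sketch, assuming `fname` points at the workbook downloaded by the code above:

# select the sheet by name instead of by index
df = pd.read_excel(fname, sheet_name='Master Data', engine="pyxlsb")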
Update:

The problem is that the Excel file has 2 hidden sheets, and the 2nd sheet really does have 1457 rows. The Master Data is actually the 4th sheet, so `sheet_name=3` will work.
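To verify this yourself, you can enumerate the workbook's sheets, hidden ones included, via `pd.ExcelFile`; a minimal sketch, again assuming the file was saved as `fname`:

import pandas as pd

# ExcelFile lists every sheet, including hidden ones
xl = pd.ExcelFile(fname, engine="pyxlsb")
print(xl.sheet_names)  # 'Master Data' should show up at index 3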
Last update:

In order to follow the Python DRY principle, I noticed that we don't need to save the file locally, or even create an in-memory file object and load that into pandas. The response content is already stored in memory, so we can load it all at once by passing `r.content` directly to pandas!

Use the code below:
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
from functools import partial

async def main(url):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        # pick the download link whose text contains "Table"
        tfile = soup.select_one('.file-link:-soup-contains(Table)').a['href']
        # download the workbook straight into memory
        r = await client.get(tfile)
        # pass the raw bytes to pandas; read_excel is blocking, so run it in a thread
        df = await trio.to_thread.run_sync(partial(pd.read_excel, r.content, sheet_name=3, engine="pyxlsb"))
        print(df)

if __name__ == "__main__":
    trio.run(main, 'https://rigcount.bakerhughes.com/na-rig-count')
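Note that `read_excel` accepts a file-like object as well, so if your pandas version warns about passing raw bytes directly, wrapping the content in a `BytesIO` works the same way; a minimal sketch:

from io import BytesIO

# wrap the raw bytes in an in-memory file object for read_excel
df = await trio.to_thread.run_sync(partial(pd.read_excel, BytesIO(r.content), sheet_name=3, engine="pyxlsb"))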