Is there any way to get a list of the models available on Hugging Face, e.g. for Automatic Speech Recognition (ASR)?
Asked
Active
Viewed 431 times
2 Answers
3
You can use huggingface_hub with list_models and a ModelFilter:
from huggingface_hub import HfApi, ModelFilter
# Client object used for all Hub queries below.
# NOTE(review): newer huggingface_hub releases may have deprecated ModelFilter in
# favour of passing the task directly to list_models() - confirm against the
# installed version.
api = HfApi()

# Ask the Hub for every model tagged with the ASR task and materialise the
# result as a list so it can be counted and indexed.
models = list(
    api.list_models(filter=ModelFilter(task="automatic-speech-recognition"))
)

# Number of matching models, then the identifier of the first one returned.
print(len(models))
print(models[0].modelId)
Output:
7195
13048909972/wav2vec2-common_voice-tr-demo
0
Try something like:
import re
import requests
from bs4 import BeautifulSoup
def _parse_count(text):
    """Convert a count as rendered on the page ("275", "1.2k", "13.7M") to an int."""
    cleaned = text.strip().replace(",", "")
    multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}
    suffix = cleaned[-1:].lower()
    if suffix in multipliers:
        return round(float(cleaned[:-1]) * multipliers[suffix])
    return int(cleaned)


def list_models(task, sort_by, *, timeout=30):
    """Yield a summary of each model on the first results page of huggingface.co.

    The function scrapes the HTML of ``https://huggingface.co/models``; it is
    a generator, so the HTTP request is only sent once iteration starts.

    Args:
        task: Value for the ``pipeline_tag`` query parameter,
            e.g. ``"automatic-speech-recognition"``.
        sort_by: Value for the ``sort`` query parameter, e.g. ``"downloads"``.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Yields:
        dict with the keys ``model_name`` (link target without the leading
        slash), ``last_updated`` (``datetime`` attribute of the card's
        ``<time>`` element, or ``None`` if absent), ``downloaded`` (download
        count as displayed, e.g. ``"13.7M"``, or ``None`` if absent) and
        ``liked`` (like count as an int, ``0`` if absent).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If a like count is present but cannot be parsed.
    """
    # Let requests build the query string so both values are URL-encoded.
    response = requests.get(
        "https://huggingface.co/models",
        params={"pipeline_tag": task, "sort": sort_by},
        timeout=timeout,
    )
    # Fail loudly instead of silently parsing an error page into zero results.
    response.raise_for_status()

    # Name the parser explicitly: otherwise bs4 picks whichever one happens to
    # be installed and emits a warning.
    soup = BeautifulSoup(response.content.decode("utf8"), "html.parser")

    for card in soup.find_all("article"):
        link = card.find("a")
        if link is None or not link.get("href"):
            # An <article> without a link cannot be identified as a model.
            continue

        # Flatten the card text to one line, then split on the bullet separators.
        # Assumed field order: name, "Updated ...", downloads, likes.
        # TODO confirm against the current page layout.
        flattened = re.sub(" +", " ", card.text.replace("\n", " ").replace("\t", " "))
        fields = [field.strip() for field in flattened.strip().split("•")]

        # Cards may omit trailing fields, so index defensively.
        downloaded = fields[2] if len(fields) > 2 else None
        liked = _parse_count(fields[3]) if len(fields) > 3 and fields[3] else 0

        stamp = card.find("time")
        yield {
            "model_name": link["href"][1:],  # drop the leading "/"
            "last_updated": stamp.get("datetime") if stamp is not None else None,
            "downloaded": downloaded,
            "liked": liked,
        }
# Query parameters: the pipeline tag to filter on and the sort column.
task, sort_by = "automatic-speech-recognition", "downloads"

# list_models() is a generator; wrap it in list() to collect every yielded row.
list(list_models(task, sort_by))
[out]:
[{'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-english',
'last_updated': '2022-12-14T02:02:32',
'downloaded': '13.7M',
'liked': 31},
{'model_name': 'pyannote/voice-activity-detection',
'last_updated': '2022-10-28T13:46:55',
'downloaded': '1.06M',
'liked': 37},
{'model_name': 'pyannote/speaker-diarization',
'last_updated': '2022-11-17T13:45:04',
'downloaded': '855k',
'liked': 171},
{'model_name': 'facebook/wav2vec2-large-960h-lv60-self',
'last_updated': '2022-05-23T16:13:42',
'downloaded': '636k',
'liked': 71},
{'model_name': 'yongjian/wav2vec2-large-a',
'last_updated': '2022-10-22T07:21:15',
'downloaded': '272k',
'liked': 5},
{'model_name': 'jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli',
'last_updated': '2022-02-25T19:07:57',
'downloaded': '248k',
'liked': 4},
{'model_name': 'facebook/wav2vec2-base-960h',
'last_updated': '2022-11-14T21:37:23',
'downloaded': '222k',
'liked': 109},
{'model_name': 'openai/whisper-tiny.en',
'last_updated': '2023-02-23T15:31:39',
'downloaded': '66.4k',
'liked': 35},
{'model_name': 'facebook/wav2vec2-xlsr-53-espeak-cv-ft',
'last_updated': '2021-12-10T17:18:39',
'downloaded': '62.1k',
'liked': 11},
{'model_name': 'maxidl/wav2vec2-large-xlsr-german',
'last_updated': '2021-07-06T12:32:21',
'downloaded': '60k',
'liked': 0},
{'model_name': 'pyannote/overlapped-speech-detection',
'last_updated': '2022-10-28T13:46:33',
'downloaded': '45.9k',
'liked': 4},
{'model_name': 'openai/whisper-tiny',
'last_updated': '2023-03-10T17:15:01',
'downloaded': '43.6k',
'liked': 41},
{'model_name': 'ceyda/wav2vec2-base-760-turkish',
'last_updated': '2021-07-06T00:16:04',
'downloaded': '42.1k',
'liked': 2},
{'model_name': 'openai/whisper-large-v2',
'last_updated': '2023-03-10T17:15:07',
'downloaded': '41.7k',
'liked': 275},
{'model_name': 'openai/whisper-small',
'last_updated': '2023-03-10T17:15:13',
'downloaded': '39.7k',
'liked': 37},
{'model_name': 'techiaith/wav2vec2-xlsr-ft-en-cy',
'last_updated': '2023-03-02T06:30:13',
'downloaded': '32.5k',
'liked': 1},
{'model_name': 'facebook/wav2vec2-xlsr-53-phon-cv-babel-ft',
'last_updated': '2021-11-10T12:02:20',
'downloaded': '31.2k',
'liked': 1},
{'model_name': 'comodoro/wav2vec2-xls-r-300m-cs-250',
'last_updated': '2022-03-23T18:26:50',
'downloaded': '31.1k',
'liked': 0},
{'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-russian',
'last_updated': '2022-12-14T01:58:43',
'downloaded': '29.3k',
'liked': 12},
{'model_name': 'facebook/hubert-large-ls960-ft',
'last_updated': '2022-05-24T10:43:42',
'downloaded': '23.8k',
'liked': 27},
{'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-japanese',
'last_updated': '2022-12-14T01:58:09',
'downloaded': '21.7k',
'liked': 9},
{'model_name': 'openai/whisper-base',
'last_updated': '2023-03-10T17:13:49',
'downloaded': '20.5k',
'liked': 52},
{'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-dutch',
'last_updated': '2022-12-14T01:58:20',
'downloaded': '20.1k',
'liked': 0},
{'model_name': 'openai/whisper-medium',
'last_updated': '2023-03-10T17:15:08',
'downloaded': '20k',
'liked': 40},
{'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-portuguese',
'last_updated': '2022-12-14T01:59:47',
'downloaded': '19.2k',
'liked': 11},
{'model_name': 'facebook/wav2vec2-large-960h',
'last_updated': '2022-04-05T16:40:42',
'downloaded': '15.5k',
'liked': 14},
{'model_name': 'facebook/data2vec-audio-base-960h',
'last_updated': '2022-05-24T10:41:22',
'downloaded': '15.4k',
'liked': 7},
{'model_name': 'theainerd/Wav2Vec2-large-xlsr-hindi',
'last_updated': '2023-03-20T05:28:11',
'downloaded': '15.3k',
'liked': 2},
{'model_name': 'tyoc213/wav2vec2-large-xlsr-nahuatl',
'last_updated': '2021-04-07T02:59:04',
'downloaded': '12.5k',
'liked': 1},
{'model_name': 'openai/whisper-large',
'last_updated': '2023-03-10T17:15:11',
'downloaded': '12k',
'liked': 228}]
Also posted a feature request on https://discuss.huggingface.co/t/feature-request-listing-available-models-datasets-and-metrics/34389

alvas
- 115,346
- 109
- 446
- 738
-
IMO it should be noted that this solution is web scraping, which may not play well with OP's use case; e.g. if OP plans to ship an app, it may get rejected for web scraping – Matt Pengelly Jul 31 '23 at 00:20