2

Is there any way to get a list of the models available on Hugging Face, e.g. for Automatic Speech Recognition (ASR)?

alvas
  • 115,346
  • 109
  • 446
  • 738

2 Answers

3

You can use huggingface_hub with list_models and a ModelFilter:

from huggingface_hub import HfApi, ModelFilter
# Client used to query the Hugging Face Hub.
api = HfApi()

# Restrict the listing to models tagged for automatic speech recognition.
# NOTE(review): ModelFilter appears to have been removed from newer
# huggingface_hub releases -- confirm against the installed version.
asr_filter = ModelFilter(task="automatic-speech-recognition")

# Materialise the results so they can be counted and indexed.
models = list(api.list_models(filter=asr_filter))

print(len(models))
print(models[0].modelId)

Output:

7195
13048909972/wav2vec2-common_voice-tr-demo
dfrankow
  • 20,191
  • 41
  • 152
  • 214
cronoik
  • 15,434
  • 3
  • 40
  • 78
0

Try something like:

import re

import requests
from bs4 import BeautifulSoup


# Multipliers for the abbreviated counts shown on model cards
# (e.g. "855k", "13.7M").
_COUNT_SUFFIXES = {"k": 1_000, "m": 1_000_000}


def _parse_count(text):
    """Convert a displayed count such as "275", "1.2k" or "13.7M" to an int.

    Empty text counts as 0; text that is not a number raises ValueError.
    """
    cleaned = text.strip().replace(",", "")
    if not cleaned:
        return 0
    multiplier = _COUNT_SUFFIXES.get(cleaned[-1].lower(), 1)
    if multiplier != 1:
        cleaned = cleaned[:-1]
    return round(float(cleaned) * multiplier)


def list_models(task, sort_by):
    """Yield the first page of models listed on huggingface.co for a task.

    This scrapes the public model-listing page, so it depends on the page's
    current HTML layout: one ``<article>`` per model whose text is a
    "•"-separated sequence of name, last-updated text, download count and
    (optionally) like count.

    Args:
        task: Pipeline tag to filter by, e.g. "automatic-speech-recognition".
        sort_by: Sort key passed to the site, e.g. "downloads".

    Yields:
        dict: ``model_name`` (the card link's href without its leading
        character), ``last_updated`` (the ``datetime`` attribute of the
        card's ``<time>`` tag, or None when absent), ``downloaded`` (the
        download count as displayed, or "" when the card has no third
        field) and ``liked`` (int, 0 when the card shows no like count).

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an error status.
        ValueError: if a like count is present but is not a number.
    """
    # Let requests build the query string so the values are URL-encoded.
    response = requests.get(
        "https://huggingface.co/models",
        params={"pipeline_tag": task, "sort": sort_by},
        timeout=30,
    )
    # Fail loudly instead of scraping an error page.
    response.raise_for_status()
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, which makes results environment-dependent.
    soup = BeautifulSoup(response.content.decode("utf8"), "html.parser")

    for model in soup.find_all("article"):
        link = model.find("a")
        if link is None or "href" not in link.attrs:
            # Not a model card we can identify; skip it rather than crash.
            continue

        # Split the card text on the bullet separators and collapse the
        # whitespace inside each field.
        fields = [" ".join(part.split()) for part in model.text.split("•")]
        # NOTE(review): assumes the third field is always the download
        # count when present -- confirm against the live page layout.
        downloaded = fields[2] if len(fields) > 2 else ""
        liked = _parse_count(fields[3]) if len(fields) > 3 else 0

        time_tag = model.find("time")
        timestamp = None
        if time_tag is not None:
            timestamp = time_tag.attrs.get("datetime")

        yield {
            "model_name": link.attrs["href"][1:],
            "last_updated": timestamp,
            "downloaded": downloaded,
            "liked": liked,
        }



# Query parameters: which pipeline tag to list and how to order the results.
task, sort_by = "automatic-speech-recognition", "downloads"

# Consume the generator so the whole first page is collected into a list.
list(list_models(task=task, sort_by=sort_by))

[out]:

[{'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-english',
  'last_updated': '2022-12-14T02:02:32',
  'downloaded': '13.7M',
  'liked': 31},
 {'model_name': 'pyannote/voice-activity-detection',
  'last_updated': '2022-10-28T13:46:55',
  'downloaded': '1.06M',
  'liked': 37},
 {'model_name': 'pyannote/speaker-diarization',
  'last_updated': '2022-11-17T13:45:04',
  'downloaded': '855k',
  'liked': 171},
 {'model_name': 'facebook/wav2vec2-large-960h-lv60-self',
  'last_updated': '2022-05-23T16:13:42',
  'downloaded': '636k',
  'liked': 71},
 {'model_name': 'yongjian/wav2vec2-large-a',
  'last_updated': '2022-10-22T07:21:15',
  'downloaded': '272k',
  'liked': 5},
 {'model_name': 'jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli',
  'last_updated': '2022-02-25T19:07:57',
  'downloaded': '248k',
  'liked': 4},
 {'model_name': 'facebook/wav2vec2-base-960h',
  'last_updated': '2022-11-14T21:37:23',
  'downloaded': '222k',
  'liked': 109},
 {'model_name': 'openai/whisper-tiny.en',
  'last_updated': '2023-02-23T15:31:39',
  'downloaded': '66.4k',
  'liked': 35},
 {'model_name': 'facebook/wav2vec2-xlsr-53-espeak-cv-ft',
  'last_updated': '2021-12-10T17:18:39',
  'downloaded': '62.1k',
  'liked': 11},
 {'model_name': 'maxidl/wav2vec2-large-xlsr-german',
  'last_updated': '2021-07-06T12:32:21',
  'downloaded': '60k',
  'liked': 0},
 {'model_name': 'pyannote/overlapped-speech-detection',
  'last_updated': '2022-10-28T13:46:33',
  'downloaded': '45.9k',
  'liked': 4},
 {'model_name': 'openai/whisper-tiny',
  'last_updated': '2023-03-10T17:15:01',
  'downloaded': '43.6k',
  'liked': 41},
 {'model_name': 'ceyda/wav2vec2-base-760-turkish',
  'last_updated': '2021-07-06T00:16:04',
  'downloaded': '42.1k',
  'liked': 2},
 {'model_name': 'openai/whisper-large-v2',
  'last_updated': '2023-03-10T17:15:07',
  'downloaded': '41.7k',
  'liked': 275},
 {'model_name': 'openai/whisper-small',
  'last_updated': '2023-03-10T17:15:13',
  'downloaded': '39.7k',
  'liked': 37},
 {'model_name': 'techiaith/wav2vec2-xlsr-ft-en-cy',
  'last_updated': '2023-03-02T06:30:13',
  'downloaded': '32.5k',
  'liked': 1},
 {'model_name': 'facebook/wav2vec2-xlsr-53-phon-cv-babel-ft',
  'last_updated': '2021-11-10T12:02:20',
  'downloaded': '31.2k',
  'liked': 1},
 {'model_name': 'comodoro/wav2vec2-xls-r-300m-cs-250',
  'last_updated': '2022-03-23T18:26:50',
  'downloaded': '31.1k',
  'liked': 0},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-russian',
  'last_updated': '2022-12-14T01:58:43',
  'downloaded': '29.3k',
  'liked': 12},
 {'model_name': 'facebook/hubert-large-ls960-ft',
  'last_updated': '2022-05-24T10:43:42',
  'downloaded': '23.8k',
  'liked': 27},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-japanese',
  'last_updated': '2022-12-14T01:58:09',
  'downloaded': '21.7k',
  'liked': 9},
 {'model_name': 'openai/whisper-base',
  'last_updated': '2023-03-10T17:13:49',
  'downloaded': '20.5k',
  'liked': 52},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-dutch',
  'last_updated': '2022-12-14T01:58:20',
  'downloaded': '20.1k',
  'liked': 0},
 {'model_name': 'openai/whisper-medium',
  'last_updated': '2023-03-10T17:15:08',
  'downloaded': '20k',
  'liked': 40},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-portuguese',
  'last_updated': '2022-12-14T01:59:47',
  'downloaded': '19.2k',
  'liked': 11},
 {'model_name': 'facebook/wav2vec2-large-960h',
  'last_updated': '2022-04-05T16:40:42',
  'downloaded': '15.5k',
  'liked': 14},
 {'model_name': 'facebook/data2vec-audio-base-960h',
  'last_updated': '2022-05-24T10:41:22',
  'downloaded': '15.4k',
  'liked': 7},
 {'model_name': 'theainerd/Wav2Vec2-large-xlsr-hindi',
  'last_updated': '2023-03-20T05:28:11',
  'downloaded': '15.3k',
  'liked': 2},
 {'model_name': 'tyoc213/wav2vec2-large-xlsr-nahuatl',
  'last_updated': '2021-04-07T02:59:04',
  'downloaded': '12.5k',
  'liked': 1},
 {'model_name': 'openai/whisper-large',
  'last_updated': '2023-03-10T17:15:11',
  'downloaded': '12k',
  'liked': 228}]

I also posted a feature request at https://discuss.huggingface.co/t/feature-request-listing-available-models-datasets-and-metrics/34389

alvas
  • 115,346
  • 109
  • 446
  • 738
  • IMO it should be noted that this solution is web scraping, which may not play well with the OP's use case, i.e., if the OP plans to ship an app, they may get rejected for web scraping – Matt Pengelly Jul 31 '23 at 00:20