I have written an image-to-speech Python application but want to build a Gradio frontend for the app. Below is my Python code:
from transformers import pipeline
from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio
from PIL import Image
import gradio as gr
import numpy as np
def image_to_speech(image):
# load text-to-speech bark models
preload_models()
captioner = pipeline("image-to-text",model = "Salesforce/blip-image-captioning-base")
# Open the image using PIL
image_pil = Image.open(image)
result = captioner(image_pil)
result = result[0]
result_text = result["generated_text"]
audio_array = generate_audio(result_text)
return Audio(audio_array,rate = SAMPLE_RATE)
app = gr.Interface(image_to_speech, inputs = "image", outputs = "audio")
app.launch(debug = True)
However, when I pass an image to the app and run it to get an audio output, I get the error message below in my notebook:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/PIL/Image.py", line 2979, in open
fp.seek(0)
AttributeError: 'numpy.ndarray' object has no attribute 'seek'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/gradio/routes.py", line 439, in run_predict
output = await app.get_blocks().process_api(
File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1389, in process_api
result = await self.call_function(
File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1094, in call_function
prediction = await anyio.to_thread.run_sync(
File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
return await get_asynclib().run_sync_in_worker_thread(
File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
return await future
File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 807, in run
result = context.run(func, *args)
File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 704, in wrapper
response = f(*args, **kwargs)
File "<ipython-input-2-a32bc59fff0d>", line 17, in image_to_speech
image_pil = Image.open(image)
File "/usr/local/lib/python3.10/dist-packages/PIL/Image.py", line 2981, in open
fp = io.BytesIO(fp.read())
AttributeError: 'numpy.ndarray' object has no attribute 'read'
I have exhausted all the tricks I know in the books and yet the problem persists. I do not know where to go from here. ChatGPT couldn't help either. Please, I need your help. Thank you!