Something like this:
from moviepy.editor import *
# Build a 50-second clip from img.jpg, overlay timed text captions, attach an
# audio track and write the result to TEXT.mp4.
# NOTE(review): img.jpg is a still image; moviepy's ImageClip may be the more
# appropriate loader than VideoFileClip here -- confirm before changing.
picture = VideoFileClip("img.jpg", audio=False).set_duration(50)

textOne = "First Line!"
textTwo = "Second Caption!!!!"
textThree = "Third one!!!"
texts = [textOne, textTwo, textThree]

step = 15  # seconds between caption start times: 0, 15, 30
duration = 10  # seconds each caption stays on screen

# One TextClip per caption; `t` is the start time of the current caption and
# advances by `step` after each one.
t = 0
txt_clips = []
for text in texts:
    txt_clip = TextClip(text, fontsize=40, color='white')
    txt_clip = txt_clip.set_start(t)
    txt_clip = txt_clip.set_pos('center').set_duration(duration)
    txt_clips.append(txt_clip)
    t += step

# Take the first 50 seconds of the audio file and attach it to the picture.
audio = AudioFileClip(r"C:\Users\Public\Music\Sample Music\Kalimba.mp3").subclip(0, 50)
video_with_new_audio = picture.set_audio(audio)

# The base clip goes first, followed by every caption clip, so the number of
# captions is not hard-coded.
final_video = CompositeVideoClip([video_with_new_audio] + txt_clips)
final_video.write_videofile("TEXT.mp4")
For more flexibility, instead of a fixed step and a single duration you can use lists of start times and durations, just like the list of captions; for example:
# Per-caption timing: each caption gets its own start time and duration.
# `texts` and `txt_clips` are not defined in this snippet; they are expected
# to exist already (a list of caption strings and a list to collect clips).
starts = [0, 15, 30]  # or whatever
durations = [10, 5, 7]

# zip() pairs the lists element by element and stops at the shortest one, so
# keep `texts`, `starts` and `durations` the same length.
for text, t, duration in zip(texts, starts, durations):
    txt_clip = TextClip(text, fontsize=40, color='white')
    txt_clip = txt_clip.set_start(t)
    txt_clip = txt_clip.set_pos('center').set_duration(duration)
    txt_clips.append(txt_clip)