Generative AI-Powered Audio/Video Processing: Whisper’s Python Adventure
What Is the Purpose of This Application?
This application demonstrates how generative AI can process and analyze audio/video files: it downloads the audio track of a YouTube video, transcribes it with OpenAI's Whisper speech-recognition model, and outputs the lyrics for that audio or video file.
Video of execution:
Code:
import streamlit as st
from pytube import YouTube
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
def get_mp3(url):
    """Download the audio-only stream of a YouTube video into the current directory.

    The downloaded file is only renamed to carry a ``.mp3`` extension; the
    audio data itself is not transcoded.

    Args:
        url: Address of the YouTube video (converted with ``str`` before use).

    Returns:
        Path of the renamed ``.mp3`` file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    video = YouTube(str(url))
    audio_stream = video.streams.filter(only_audio=True).first()
    if audio_stream is None:
        # .first() yields None when the filter matches nothing; fail with a
        # clear message instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream is available for {url!r}")

    downloaded_path = audio_stream.download(output_path=".")
    base, _ext = os.path.splitext(downloaded_path)
    mp3_path = base + ".mp3"
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises FileExistsError on Windows if the same video was
    # already downloaded once.
    os.replace(downloaded_path, mp3_path)
    return mp3_path
def get_transcript(audio_file, *, model_id="openai/whisper-tiny", language="english"):
    """Transcribe an audio file with a Hugging Face speech-recognition pipeline.

    Args:
        audio_file: Input handed to the pipeline; the caller in this file
            passes the path of the downloaded audio file.
        model_id: Hugging Face model identifier to load. Defaults to
            ``"openai/whisper-tiny"``.
        language: Language forwarded to generation via ``generate_kwargs``.
            Defaults to ``"english"``.

    Returns:
        The ``"chunks"`` entry of the pipeline result (timestamped segments;
        each one is read elsewhere in this file through its ``"text"`` key).
    """
    # Use the GPU with half precision when CUDA is available, otherwise the
    # CPU with full precision.
    use_gpu = torch.cuda.is_available()
    device = "cuda:0" if use_gpu else "cpu"
    torch_dtype = torch.float16 if use_gpu else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    # Create the pipeline; audio is processed in 30-second chunks.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": language},
    )

    # The call-time arguments intentionally mirror the original invocation;
    # a fresh dict is passed so the pipeline never shares state between the
    # constructor and the call.
    result = pipe(
        audio_file,
        return_timestamps=True,
        generate_kwargs={"language": language},
    )
    return result["chunks"]
def format_lyrics(lyrics):
    """Join transcript chunks into paragraphs separated by blank lines.

    Args:
        lyrics: Iterable of mappings, each with a ``"text"`` entry.

    Returns:
        The chunk texts, each stripped of surrounding whitespace, joined by
        a blank line. Chunks that are empty after stripping are skipped.
        An empty input yields ``""``.
    """
    # Strip every chunk (not just the ends of the final string) so that no
    # paragraph keeps stray leading/trailing whitespace.
    paragraphs = (chunk["text"].strip() for chunk in lyrics)
    return "\n\n".join(paragraph for paragraph in paragraphs if paragraph)
def fetch_lyrics(url):
    """Download a video's audio, transcribe it, and return formatted lyrics.

    A status message is shown while the work is in progress. The message is
    cleared and the downloaded audio file is deleted whether or not the
    download/transcription succeeds.

    Args:
        url: Address of the video to process.

    Returns:
        The formatted lyrics as a single string.
    """
    status_placeholder = st.empty()
    # Show the notice before the download starts: fetching the audio can
    # take as long as the transcription itself.
    status_placeholder.subheader("Please wait for few seconds. Preparing the lyrics for you...")

    mp3 = None
    try:
        mp3 = get_mp3(url)
        chunks = get_transcript(mp3)
    finally:
        status_placeholder.empty()
        # The page promises that no data is kept, so remove the temporary
        # audio file once it is no longer needed.
        if mp3 is not None and os.path.exists(mp3):
            os.remove(mp3)

    return format_lyrics(chunks)
def main():
    """Render the LyricsBox page: header, URL input, lyrics output and help text.

    When a URL has been entered and the button is pressed, the lyrics are
    fetched and displayed. Any failure while fetching is reported on the
    page with ``st.error`` so the remaining sections still render.
    """
    # Page title and tagline.
    st.markdown("""<p style="color: #d15000;font-size: 70px;font-family: sans-serif; text-align:center;margin-bottom:0px;"><b>Lyrics</b><span style="color: #E94B3CFF;font-size: 70px;font-family: sans-serif;"><b>Box</b></span></p>""", unsafe_allow_html=True)
    st.markdown('<p style="font-family: sans-serif; text-align:center; font-size: 20px; margin-bottom:60px;">Get the Lyrics of your Favorite Song for Free</p>', unsafe_allow_html=True)

    # Input field for the user to enter the URL of the song.
    st.markdown('<p style="font-family: sans-serif; text-align:left; font-size: 20px; margin-bottom:0px;">Enter the audio/video link below:</p>', unsafe_allow_html=True)
    # NOTE(review): newer Streamlit releases discourage empty labels; consider
    # a real label with label_visibility="collapsed" if the installed version
    # supports it.
    url = st.text_input("", "").strip()

    if url:
        # Button to trigger fetching and displaying the lyrics.
        if st.button("Get Lyrics of this A/V"):
            try:
                lyrics = fetch_lyrics(url)
            except Exception as exc:
                # UI boundary: show the problem to the user instead of a raw
                # traceback that would also stop the rest of the page.
                st.error(f"Could not get the lyrics for this link: {exc}")
            else:
                st.subheader("Lyrics:")
                st.write(lyrics)

    # "How to" section.
    st.markdown('<p style="font-size: 35px;font-family: sans-serif; text-align:left; margin-top: 100px;"><b>How to Get the Lyrics of your Video?</b></p>', unsafe_allow_html=True)
    st.markdown(
        '<p style="font-family: sans-serif; text-align:left; font-size: 20px">To extract the lyrics of your favorite video using this tool follow the steps. <br /> <br />   1. Copy the link of the video from Youtube. '
        '<br />   2. Paste the link in the box above. <br />   3. Hit the "Get Lyrics of this A/V" button. </p>',
        unsafe_allow_html=True,
    )

    # Feature list.
    st.markdown('<p style="font-size: 35px;font-family: sans-serif; text-align:left; margin-top: 40px;"><b>Why Should you use this tool?</b></p>', unsafe_allow_html=True)
    st.markdown(
        '<p style="font-family: sans-serif; text-align:left; font-size: 20px">Features of this tool are given below: <br /> <br /> '
        '  1. This Tool uses <a href="https://huggingface.co/openai/whisper-tiny/">OpenAI Whisper</a> to extract transcript from audio file.'
        '<br />   2. We do not save your data or video.'
        '<br />   3. Easy and Free-to-use.</p>',
        unsafe_allow_html=True,
    )
# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()