If you’ve ever tackled the challenge of crafting lengthy YouTube videos packed with dialogue, you’re likely familiar with the tedious task of clipping, trimming, and slicing segments of video or audio to assemble the final product. It’s a laborious process, but I’ve stumbled upon a not-so-sophisticated yet quick solution to this problem.
Go through the script below or try it out directly in Google Colab (remember to connect your Colab to a local runtime if your local resources are better): Link to Google Colab
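If your local machine is stronger than the hosted runtime, Colab's documented local-runtime flow uses jupyter_http_over_ws; a sketch of the setup, run in your local Python environment (the port number is just the default from Google's guide, change it if you like):
!pip install jupyter_http_over_ws
!jupyter serverextension enable --py jupyter_http_over_ws
# Then start Jupyter from a local terminal so Colab can reach it:
# jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0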
Step 1: Load your input video file and set chunklength
# Set the video file path
video_file_path = r"path\to\your\video\file.mp4"
# Set the chunk length of the split portions in seconds - chunking is necessary to keep the openai-whisper model from hallucinating
chunklength = 30
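Since the transcription step is slow, a quick sanity check that the path actually resolves can save a wasted run; a minimal sketch using only the standard library:
import os
assert os.path.isfile(video_file_path), f"Video not found: {video_file_path}"
print("Input video located:", video_file_path)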
Step 2: Install necessary libraries if not already installed
# Install the required libraries if not already installed
# (datetime is part of the standard library; openpyxl is what pandas uses to write .xlsx files)
!pip install moviepy pydub pandas openpyxl openai-whisper
# ffmpeg should be downloaded, installed, and added to the system PATH in your environment variables
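Both moviepy and pydub shell out to ffmpeg, so it's worth confirming it really is on the PATH before starting the long steps; a small check, again standard library only:
import shutil
if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found on PATH - install it and add it to your environment variables")
print("ffmpeg found at:", shutil.which("ffmpeg"))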
Step 3: Loading the necessary functions
import os
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from pydub.silence import detect_silence
import pandas as pd
import whisper
import datetime
# Function to split audio into chunks ending at word boundaries
def split_audio_at_words(audio_file_path, chunk_size_ms=chunklength*1000, min_chunk_size_ms=chunklength*1000*0.8, min_silence_len=1000, silence_thresh=-30):
    audio = AudioSegment.from_wav(audio_file_path)
    chunks = []
    start = 0
    while start < len(audio):
        end = start + chunk_size_ms
        # Pull the end index back to a silence (word boundary) that occurs before chunk_size_ms but after min_chunk_size_ms
        if end < len(audio):
            audio_slice = audio[start:end]
            silence = detect_silence(audio_slice, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
            silence = [s for s in silence if s[0] > min_chunk_size_ms]
            if silence:
                end = start + silence[0][1]
        chunks.append(audio[start:end])
        start = end
    return chunks
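# To watch the splitter behave before committing a real video, you can feed it
# synthetic audio; a hedged sketch using pydub's tone generator (the 440 Hz tone,
# durations, and file name are arbitrary demo choices):
from pydub.generators import Sine
tone = Sine(440).to_audio_segment(duration=27000)        # 27 s of tone
demo = tone + AudioSegment.silent(duration=1500) + tone  # 1.5 s of silence inside the first 30 s window
demo.export("demo_split.wav", format="wav")
for i, c in enumerate(split_audio_at_words("demo_split.wav")):
    print(f"chunk {i}: {len(c) / 1000:.1f} s")
# With chunklength = 30, chunk 0 should end at the silence (~28.5 s) instead of a hard 30 s cut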
# Function to process each chunk
def process_chunk(chunk, chunk_index, total_duration_so_far):
    # Save the chunk to a temporary audio file
    chunk_file_path = f"temp_audio_{chunk_index}.wav"
    chunk.export(chunk_file_path, format="wav")
    # Detect silent parts
    silent_ranges = detect_silence(chunk, min_silence_len=500, silence_thresh=-30)
    silent_ranges = [(start / 1000, end / 1000) for start, end in silent_ranges]
    # Create a list of dictionaries for the silent sections
    data = []
    for start, end in silent_ranges:
        data.append({"transcript": "#silent section", "start_time": start, "end_time": end})
    # Create a DataFrame from the list of dictionaries
    SilentDf = pd.DataFrame(data)
    # Transcribe the chunk
    result = model.transcribe(chunk_file_path, language="ne", temperature=0.2, word_timestamps=True)
    # Prepare the transcriptions list and create the DataFrame directly
    data = []
    for segment in result['segments']:
        for word_info in segment['words']:
            word = word_info['word']
            start_word_time = word_info['start']
            end_word_time = word_info['end']
            data.append([word, start_word_time, end_word_time])
    TranscriptDf = pd.DataFrame(data, columns=["transcript", "start_time", "end_time"])
    # Add source columns
    SilentDf["source"] = "SilentDf"
    TranscriptDf["source"] = "TranscriptDf"
    # Combine dataframes
    combined_df = pd.concat([SilentDf, TranscriptDf]).reset_index(drop=True)
    combined_df.insert(0, "serial_number", range(1, len(combined_df) + 1))
    # Sort by start_time
    combined_df = combined_df.sort_values(by="start_time").reset_index(drop=True)
    # Calculate duration
    combined_df["duration"] = combined_df["start_time"].shift(-1) - combined_df["start_time"]
    combined_df.at[len(combined_df) - 1, "duration"] = combined_df.at[len(combined_df) - 1, "end_time"] - combined_df.at[len(combined_df) - 1, "start_time"]
    # Initialize starttimeR and endtimeR columns
    combined_df["starttimeR"] = 0
    combined_df["endtimeR"] = 0
    # Calculate starttimeR and endtimeR: running timestamps that place this chunk after all previous chunks
    for i in range(len(combined_df)):
        if i == 0:
            combined_df.at[i, "starttimeR"] = total_duration_so_far
        else:
            combined_df.at[i, "starttimeR"] = combined_df.at[i-1, "endtimeR"]
        combined_df.at[i, "endtimeR"] = combined_df.at[i, "starttimeR"] + combined_df.at[i, "duration"]
    # Sum of the duration column, and the running end time that becomes the next chunk's offset
    total_duration = combined_df["duration"].sum()
    endtimeR_last_item = combined_df.at[len(combined_df) - 1, "endtimeR"]
    # Export to Excel
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    excel_file_name = f"combined_data_chunk_{chunk_index}_{current_time}.xlsx"
    combined_df.to_excel(excel_file_name, index=False)
    os.remove(chunk_file_path)  # Remove the temporary audio file
    return excel_file_name, endtimeR_last_item
print("Functions loaded successfully")
Step 4: Converting the video to audio and chunking the audio file
# Create a VideoFileClip object
video = VideoFileClip(video_file_path)
print("Video located")
# Extract the audio from the video
audio_file_path = "temp_audio.wav"
video.audio.write_audiofile(audio_file_path)
print("Video converted into audio")
# Split audio into chunks ending at word boundaries
audio_chunks = split_audio_at_words(audio_file_path)
print("Index prepared for chunking audio file")
# Load the Whisper model
model = whisper.load_model("large-v3")
print("Whisper Model Loaded")
# Initialize variables
total_duration_so_far = 0
excel_files = []
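Note that large-v3 needs roughly 10 GB of VRAM; if loading it fails on your hardware, the smaller checkpoints slot straight in, at some cost in accuracy. A sketch of a fallback loop, assuming an out-of-memory error surfaces as a RuntimeError (the usual CUDA behaviour):
for model_name in ["large-v3", "medium", "small"]:
    try:
        model = whisper.load_model(model_name)
        print(f"Loaded Whisper model: {model_name}")
        break
    except (RuntimeError, MemoryError):
        print(f"{model_name} did not fit, trying a smaller checkpoint")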
Step 5: Transcribing each chunk - this takes a toll on CPU/GPU and time
# Process each chunk
print("Starting the process of transcribing each chunk")
for i, chunk in enumerate(audio_chunks):
    excel_file, total_duration_so_far = process_chunk(chunk, i, total_duration_so_far)
    excel_files.append(excel_file)
    print(f"Chunk {i} processed successfully")
Step 6: Compiling the transcriptions and generating the final Excel file
# Combine all Excel files into a single file
combined_data = pd.DataFrame()
for file in excel_files:
    df = pd.read_excel(file)
    df['source_file'] = file
    combined_data = pd.concat([combined_data, df], ignore_index=True)
# Save the combined data to a single Excel file
video_file_name = os.path.splitext(os.path.basename(video_file_path))[0]
combined_file_name = f"{video_file_name}.xlsx"
combined_data.to_excel(combined_file_name, index=False)
# Read the combined Excel file back into a DataFrame
combined_data = pd.read_excel(combined_file_name)
# Remove rows where the 'duration' column has the value 0 (.copy() avoids pandas' SettingWithCopyWarning)
filtered_df = combined_data[combined_data['duration'] != 0].copy()
# Insert a new column 'clipmark' at index 9 (the tenth column)
filtered_df.insert(9, 'clipmark', '')
# Save the filtered data back to the Excel file
filtered_df.to_excel(combined_file_name, index=False)
# Delete individual Excel files
for file in excel_files:
    os.remove(file)
# Close the video file
video.close()
# Remove the temporary audio file
os.remove(audio_file_path)
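Before moving on, it's worth eyeballing the combined sheet; a quick look at its shape and columns:
preview = pd.read_excel(combined_file_name)
print(preview.shape)
print(preview.columns.tolist())
print(preview.head(10))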
Step 7: Now revisit the Excel file and mark your clips
'''
In this step:
1. Open the Excel file in the same folder
2. In the tenth column, called "clipmark", type "clip" in the rows that you want to clip out of the video
3. Once that is done, the code below will clip out the timestamps that have been marked as "clip"
4. A new clipped video will be generated
'''
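Marking by hand is the intended workflow, but if most of what you cut is dead air you can pre-mark long silences programmatically and only review the result afterwards; a sketch (the 2-second threshold is an arbitrary assumption, tune it to your speaking style):
df = pd.read_excel(combined_file_name)
long_silence = (df["source"] == "SilentDf") & (df["duration"] > 2.0)
df.loc[long_silence, "clipmark"] = "clip"
df.to_excel(combined_file_name, index=False)
print(f"Pre-marked {long_silence.sum()} silent sections for clipping")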
Step 8: Finally render your project
from moviepy.editor import VideoFileClip, concatenate_videoclips
import os
import pandas as pd
# Load the Excel file into a DataFrame
video_file_name = os.path.splitext(os.path.basename(video_file_path))[0]
video = VideoFileClip(video_file_path)
combined_file_name = f"{video_file_name}.xlsx"
df = pd.read_excel(combined_file_name)
# Collect the segments to keep
segments_to_keep = []
start_time = 0
for _, row in df.iterrows():
    if row['clipmark'] == 'clip':
        # Add the segment before the clip
        end_time = row['starttimeR']
        if start_time < end_time:
            segments_to_keep.append(video.subclip(start_time, end_time))
        # Move start_time to the end of the clipped segment
        start_time = row['endtimeR']
# Add the final segment after the last clip
if start_time < video.duration:
    segments_to_keep.append(video.subclip(start_time, video.duration))
# Concatenate the segments to create the final video
if segments_to_keep:
    final_video = concatenate_videoclips(segments_to_keep)
    final_video.write_videofile(f"{video_file_name}_clipped.mp4", codec='libx264')
    print(f"Clipped video saved as: {video_file_name}_clipped.mp4")
else:
    print("No segments to keep. Output video not created.")
For easy use, you can copy this code into a Jupyter Notebook, Google Colab, or the Python IDE of your choice.
Haha, this is awesome. You redid the entire process of Descript locally. Looking forward to the YouTube videos.
Guilty as charged : )
Thank you Ankur Dai – halfway there. And pls gift me a GPU.
Could you create a report for the same?
Oops, sorry, what I meant was: could you create a GitHub repo for the same project, so that others (and we) can also contribute to it?