By using pylab to generate a spectrogram (example) for each frame of a video and for each same-length frame of a target audio file, we can match the video frames that sound most similar to the input audio (using nearest neighbors), approximating a re-creation of the original audio.
I used two separate scripts so that I could try different audio with each video without having to regenerate a specgram for every frame of video each time; instead, the first script (create_clips.py) stores the specgrams in a NumPy data file. Then I ran NN-search.py to search and render a final video using MoviePy.
# create_clips.py
#
# Pre-computes one power spectrogram per video frame from the video's own
# audio track and stores them, one row per frame, in "<FILENAME>.npy".
# NN-search.py loads that file, so the expensive per-frame analysis only has
# to be done once per video.

# mlab.specgram is the computation that pylab.specgram wraps; it returns the
# same power matrix but does not draw an image on the current figure for every
# frame (which would accumulate memory and require a plotting backend).
from matplotlib.mlab import specgram, window_hanning
from moviepy.editor import VideoFileClip
import numpy as np

# Path of the source video.
# NOTE(review): the base name appears to be missing -- set it before running.
FILENAME = ".mp4"

# Spectrogram settings. NN-search.py must use exactly the same values,
# otherwise its feature vectors are not comparable with the rows saved here.
SAMPLE_RATE = 16000
NFFT = 512
NOVERLAP = 440

oClip = VideoFileClip(FILENAME)
if oClip.audio is None:
    raise SystemExit("%s has no audio track to analyse" % FILENAME)

# Length, in seconds, of the audio slice that belongs to one video frame.
FRAME_DURATION = 1.0 / oClip.fps


def frame_spectrum(audio, start):
    """Return the flattened power spectrogram of one frame-long audio slice.

    audio -- a moviepy audio clip
    start -- start time of the slice in seconds; the slice is
             FRAME_DURATION seconds long
    """
    samples = audio.subclip(start, start + FRAME_DURATION).to_soundarray(
        fps=SAMPLE_RATE, nbytes=2)
    # Stereo to mono by averaging the channels.
    mono = np.mean(samples, axis=1).astype(np.float64)
    Pxx, _freqs, _bins = specgram(mono, NFFT=NFFT, Fs=SAMPLE_RATE,
                                  window=window_hanning, noverlap=NOVERLAP,
                                  detrend="mean")
    return Pxx.flatten()


# clip_ffts holds the specgram of each frame of audio; row k describes the
# slice starting at k * FRAME_DURATION, which is how NN-search.py maps a row
# index back to a position in the video.
clip_ffts = np.array([
    frame_spectrum(oClip.audio, start)
    for start in np.arange(0, oClip.duration - FRAME_DURATION, FRAME_DURATION)
])

# Save numpy array for future uses with NN-search.py
np.save(FILENAME + ".npy", clip_ffts)
print(clip_ffts)
# NN-search.py
#
# Re-creates a target audio file out of video frames: for every frame-long
# slice of the target audio, find the video frame whose audio spectrogram is
# the nearest neighbor, then concatenate the matching video frames into
# "out.mp4". Requires the "<FILENAME>.npy" file written by create_clips.py.

# mlab.specgram is the computation that pylab.specgram wraps; it returns the
# same power matrix without drawing an image on the current figure each call.
from matplotlib.mlab import specgram, window_hanning
from moviepy.editor import AudioFileClip, VideoFileClip, concatenate_videoclips
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Video file we will use to try and approximate the audio file.
# NOTE(review): the base name appears to be missing -- set it before running.
FILENAME = ".mp4"
oClip = VideoFileClip(FILENAME)
FRAME_DURATION = 1.0 / oClip.fps

# The "target" audio file.
# NOTE(review): the base name appears to be missing here as well.
tClip = AudioFileClip(".wav")

# Array of specgram data generated from the video by create_clips.py;
# row k belongs to the video slice starting at k * FRAME_DURATION.
X = np.load(FILENAME + ".npy")

# Fit the nearest neighbors model to the per-frame specgram data.
nbrs = NearestNeighbors(n_neighbors=1).fit(X)

# moviepy clips of the nearest neighbor for each target audio frame, in order.
out_clips = []

# Loop over each frame in the target clip (tClip).
for i in np.arange(0, tClip.duration - FRAME_DURATION, FRAME_DURATION):
    samples = tClip.subclip(i, i + FRAME_DURATION).to_soundarray(
        fps=16000, nbytes=2)
    # Stereo to mono by averaging the channels.
    test_clip = np.mean(samples, axis=1).astype(np.float64)

    # Generate the specgram from the target audio with the same settings
    # create_clips.py used, so the vectors are comparable with the rows of X.
    Pxx, freqs, bins = specgram(test_clip, NFFT=512, Fs=16000,
                                window=window_hanning, noverlap=440,
                                detrend="mean")

    # kneighbors expects a 2-D (n_queries, n_features) array, so the single
    # feature vector is passed as one row.
    distances, indices = nbrs.kneighbors(Pxx.reshape(1, -1))
    print(distances)

    # indices has shape (1, 1); take the scalar row index of the best match.
    index = int(indices[0][0])
    print(index)

    # Push the matching video slice, located from its row index and the
    # frame duration, onto the list to be concatenated.
    start = index * FRAME_DURATION
    out_clips.append(oClip.subclip(start, start + FRAME_DURATION))

if not out_clips:
    raise SystemExit("target audio is shorter than one video frame; "
                     "nothing to render")

out_vid = concatenate_videoclips(out_clips)
out_vid.write_videofile("out.mp4")
print("done!")
I'm no DSP expert, so the FFT settings used to generate the specgram were chosen by trial and error (with some hints from other people's Stack Exchange questions).
I might try and work on a script that can use multiple video files at some point.
I should also normalize the audio levels between video and audio inputs so the NN matching works better.