 import numpy as np
 import matplotlib.pyplot as plt
 import scipy.io as sio
+import os
 import librosa
+import librosa.display
 import argparse
 from CSMSSMTools import getCSM, getCSMCosine
 from SimilarityFusion import doSimilarityFusion
 from SongStructureGUI import saveResultsJSON
+import subprocess
+
+MANUAL_AUDIO_LOAD = False
+FFMPEG_BINARY = "ffmpeg"

 def plotFusionResults(Ws, vs, alllabels, times):
     """
@@ -83,7 +89,8 @@ def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K, reg
     wins_per_block: int
         Number of aggregated windows per sliding window block
     K: int
-        Number of nearest neighbors in SNF
+        Number of nearest neighbors in SNF. If -1, then autotuned to sqrt(N)
+        for an NxN similarity matrix
     reg_diag: float
         Regularization for self-similarity promotion
     reg_neighbs: float
@@ -99,11 +106,18 @@ def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K, reg
     -------
     {'Ws': A dictionary of weighted adjacency matrices for individual features
            and the fused adjacency matrix,
-     'times': Time in seconds of each row in the similarity matrices}
+     'times': Time in seconds of each row in the similarity matrices,
+     'K': The number of nearest neighbors actually used}
     """
     ## Step 1: Load audio
     print("Loading %s..." % filename)
-    y, sr = librosa.load(filename, sr=sr)
+    if MANUAL_AUDIO_LOAD:
+        subprocess.call([FFMPEG_BINARY, "-i", filename, "-ar", "%i" % sr, "-ac", "1", "%s.wav" % filename])
+        sr, y = sio.wavfile.read("%s.wav" % filename)
+        y = y / 2.0**15
+        os.remove("%s.wav" % filename)
+    else:
+        y, sr = librosa.load(filename, sr=sr)

     ## Step 2: Figure out intervals to which to sync features
     if win_fac > 0:
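(Aside, not part of the commit: the new MANUAL_AUDIO_LOAD branch shells out to ffmpeg to decode the file into a temporary mono WAV at the target sample rate, reads it back with scipy.io.wavfile, and divides by 2**15 to map 16-bit PCM samples into [-1, 1]. Below is a minimal standalone sketch of that decode path; it assumes ffmpeg is on the PATH and writes 16-bit PCM, and the helper name ffmpeg_load_mono is illustrative only.)

import os
import subprocess
import scipy.io.wavfile as wavfile

def ffmpeg_load_mono(filename, sr=22050, ffmpeg_binary="ffmpeg"):
    # Decode to a temporary mono WAV at the requested sample rate
    tmpwav = "%s.wav" % filename
    subprocess.call([ffmpeg_binary, "-i", filename, "-ar", "%i" % sr, "-ac", "1", tmpwav])
    fs, y = wavfile.read(tmpwav)
    os.remove(tmpwav)
    # Assuming 16-bit PCM, rescale samples into [-1, 1]; return (samples, rate)
    # in the same order as librosa.load
    return y / 2.0**15, fs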
@@ -119,7 +133,6 @@ def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K, reg
         intervals = librosa.util.fix_frames(beats, x_max=C.shape[1])
         intervals = librosa.segment.subsegment(C, intervals, n_segments=abs(win_fac))

-
     ## Step 3: Compute features
     # 1) CQT chroma with 3x oversampling in pitch
     chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length, bins_per_octave=12*3)
@@ -134,39 +147,47 @@ def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K, reg
     mfcc = coeffs[:, None]*mfcc

     # 3) Tempograms
-
+    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
+    tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length)


     ## Step 4: Synchronize features to intervals
-    n_frames = min(chroma.shape[1], mfcc.shape[1])
+    n_frames = min(min(chroma.shape[1], mfcc.shape[1]), tempogram.shape[1])
     # median-aggregate chroma to suppress transients and passing tones
     intervals = librosa.util.fix_frames(intervals, x_min=0, x_max=n_frames)
     chroma = librosa.util.sync(chroma, intervals, aggregate=np.median)
     mfcc = librosa.util.sync(mfcc, intervals)
+    tempogram = librosa.util.sync(tempogram, intervals)
     times = intervals*float(hop_length)/float(sr)



     chroma = chroma[:, :n_frames]
     mfcc = mfcc[:, :n_frames]
+    tempogram = tempogram[:, :n_frames]

     #Do a delay embedding and compute SSMs
     XChroma = librosa.feature.stack_memory(chroma, n_steps=wins_per_block, mode='edge').T
     XMFCC = librosa.feature.stack_memory(mfcc, n_steps=wins_per_block, mode='edge').T
+    XTempogram = librosa.feature.stack_memory(tempogram, n_steps=wins_per_block, mode='edge').T
     DChroma = getCSMCosine(XChroma, XChroma) #Cosine distance
     DMFCC = getCSM(XMFCC, XMFCC) #Euclidean distance
+    DTempogram = getCSM(XTempogram, XTempogram)

     #Run similarity network fusion
-    FeatureNames = ['MFCCs', 'Chromas']
-    Ds = [DMFCC, DChroma]
+    FeatureNames = ['MFCCs', 'Chromas', 'Tempogram']
+    Ds = [DMFCC, DChroma, DTempogram]
     # Edge case: zeropad if it's too small
     for i, Di in enumerate(Ds):
         if Di.shape[0] < 2*K:
             D = np.zeros((2*K, 2*K))
             D[0:Di.shape[0], 0:Di.shape[1]] = Di
             Ds[i] = D
-
-    (Ws, WFused) = doSimilarityFusion(Ds, K=K, niters=niters, \
+
+    pK = K
+    if K == -1:
+        pK = int(np.round(np.sqrt(Ds[0].shape[0])))
+    (Ws, WFused) = doSimilarityFusion(Ds, K=pK, niters=niters, \
         reg_diag=reg_diag, reg_neighbs=reg_neighbs, \
         do_animation=do_animation, PlotNames=FeatureNames, \
         PlotExtents=[times[0], times[-1]])
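(Aside, not part of the commit: with K == -1, the SNF neighborhood size pK is autotuned to round(sqrt(N)) for an NxN similarity matrix, so the neighborhood grows with the square root of the number of synchronized frames. A quick illustration of the rule:)

import numpy as np

# Autotuned neighborhood size pK = round(sqrt(N)) for a few matrix sizes N
for N in [100, 400, 1000]:
    print(N, int(np.round(np.sqrt(N))))  # 100 -> 10, 400 -> 20, 1000 -> 32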
@@ -177,7 +198,7 @@ def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K, reg
     if plot_result:
         plotFusionResults(WsDict, {}, {}, times)
         plt.savefig("%s_Plot.png" % filename, bbox_inches='tight')
-    return {'Ws':WsDict, 'times':times}
+    return {'Ws':WsDict, 'times':times, 'K':pK}

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -188,7 +209,7 @@ def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K, reg
     parser.add_argument('--hop_length', type=int, default=512, help="Hop Size in samples")
     parser.add_argument('--win_fac', type=int, default=10, help="Number of windows to average in a frame. If negative, then do beat tracking, and subdivide by |win_fac| times within each beat")
     parser.add_argument('--wins_per_block', type=int, default=20, help="Number of frames to stack in sliding window for every feature")
-    parser.add_argument('--K', type=int, default=10, help="Number of nearest neighbors in similarity network fusion")
+    parser.add_argument('--K', type=int, default=10, help="Number of nearest neighbors in similarity network fusion. If -1, then autotune to sqrt(N) for an NxN similarity matrix")
     parser.add_argument('--reg_diag', type=float, default=1.0, help="Regularization for self-similarity promotion")
     parser.add_argument('--reg_neighbs', type=float, default=0.5, help="Regularization for direct neighbor similarity promotion")
     parser.add_argument('--niters', type=int, default=10, help="Number of iterations in similarity network fusion")
@@ -204,4 +225,4 @@ def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K, reg
         K=opt.K, reg_diag=opt.reg_diag, reg_neighbs=opt.reg_neighbs, niters=opt.niters, \
         do_animation=opt.do_animation, plot_result=opt.plot_result)
     sio.savemat(opt.matfilename, res)
-    saveResultsJSON(opt.filename, res['times'], res['Ws'], opt.K, opt.neigs, opt.jsonfilename, opt.diffusion_znormalize)
+    saveResultsJSON(opt.filename, res['times'], res['Ws'], res['K'], opt.neigs, opt.jsonfilename, opt.diffusion_znormalize)
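(Aside, not part of the commit: a usage sketch of the updated function. The module name SongStructure and the audio path are assumptions for illustration; the returned 'K' entry reports the neighbor count actually used, which is what now gets forwarded to saveResultsJSON.)

from SongStructure import getFusedSimilarity  # assumed module name

res = getFusedSimilarity("example.mp3", sr=22050, hop_length=512, win_fac=10,
                         wins_per_block=20, K=-1, reg_diag=1.0, reg_neighbs=0.5,
                         niters=10, do_animation=False, plot_result=False)
print(res['K'])                # neighbors actually used (round(sqrt(N)) since K == -1)
print(res['times'][:5])        # start time in seconds of each row of the matrices
print(list(res['Ws'].keys()))  # per-feature and fused adjacency matrices

From the command line, the same behavior corresponds to passing --K -1.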