Skip to content

Commit

Permalink
added downsampling step to feature extraction and feature learning pr…
Browse files Browse the repository at this point in the history
…ocess
  • Loading branch information
mkholghi committed Oct 31, 2018
1 parent 7c915e6 commit e9be1d4
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 59 deletions.
5 changes: 3 additions & 2 deletions src/AnalysisConfigFiles/FeatureLearningConfig.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ FrequencyScaleType: Mel
FrameSize: 1024
FinalBinCount: 128


# The default values for minFreqBin and maxFreqBin are 1 and FinalBinCount
# For any other arbitrary frequency bin bounds, these two parameters need to be manually set.
MinFreqBin: 24
Expand All @@ -44,9 +43,11 @@ NumRandomPatches: 1000
# the number of clusters to be generated from the selected patch set
NumClusters: 256


# Applying noise reduction and whitening if these options are set to 'true'
DoNoiseReduction: true
DoWhitening: true

# The downsampling factor applied along the time axis using max pooling (1 = no downsampling)
MaxPoolingFactor: 6
...

26 changes: 11 additions & 15 deletions src/AnalysisPrograms/MahnooshSandpit.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,15 @@ namespace AnalysisPrograms
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Accord.MachineLearning;
using Accord.Math;
using Accord.Statistics;
using Acoustics.Shared;
using Acoustics.Shared.ConfigFile;
using Acoustics.Shared.Csv;
using AudioAnalysisTools.DSP;
using AudioAnalysisTools.StandardSpectrograms;
using AudioAnalysisTools.WavTools;
using McMaster.Extensions.CommandLineUtils;
using NeuralNets;
using Production.Arguments;
using TowseyLibrary;

Expand All @@ -34,17 +30,17 @@ public void Execute(Arguments arguments)
{
LoggedConsole.WriteLine("feature learning process...");

var inputDir = @"D:\Mahnoosh\Liz\"; //@"C:\Users\kholghim\Mahnoosh\Liz\"; // @"C:\Users\kholghim\Mahnoosh\UnsupervisedFeatureLearning\"; // @"M:\Postdoc\Liz\"; //
var inputDir = @"M:\Postdoc\Liz\"; //@"C:\Users\kholghim\Mahnoosh\Liz\"; //@"D:\Mahnoosh\Liz\"; // @"C:\Users\kholghim\Mahnoosh\UnsupervisedFeatureLearning\"; //
var resultDir = Path.Combine(inputDir, "FeatureLearning");
var inputPath = Path.Combine(inputDir, "TrainSet"); //PatchSamplingSegments //PatchSampling
var trainSetPath = Path.Combine(inputDir, "TrainSet");
var testSetPath = Path.Combine(inputDir, "TestSet");
var configPath = @"D:\Mahnoosh\Liz\AnalysisConfigFiles\FeatureLearningConfig.yml"; //@"C:\Work\GitHub\audio-analysis\src\AnalysisConfigFiles\FeatureLearningConfig.yml"; //
var outputMelImagePath = Path.Combine(resultDir, "MelScaleSpectrogram.png");
var outputNormMelImagePath = Path.Combine(resultDir, "NormalizedMelScaleSpectrogram.png");
var outputNoiseReducedMelImagePath = Path.Combine(resultDir, "NoiseReducedMelSpectrogram.png");
var outputReSpecImagePath = Path.Combine(resultDir, "ReconstrcutedSpectrogram.png");
var outputClusterImagePath = Path.Combine(resultDir, "Clusters.bmp");
// var testSetPath = Path.Combine(inputDir, "TestSet");
var configPath = @"C:\Users\kholghim\Mahnoosh\Liz\FeatureLearningConfig.yml"; //@"D:\Mahnoosh\Liz\AnalysisConfigFiles\FeatureLearningConfig.yml"; //@"C:\Work\GitHub\audio-analysis\src\AnalysisConfigFiles\FeatureLearningConfig.yml"; //
// var outputMelImagePath = Path.Combine(resultDir, "MelScaleSpectrogram.png");
// var outputNormMelImagePath = Path.Combine(resultDir, "NormalizedMelScaleSpectrogram.png");
// var outputNoiseReducedMelImagePath = Path.Combine(resultDir, "NoiseReducedMelSpectrogram.png");
// var outputReSpecImagePath = Path.Combine(resultDir, "ReconstrcutedSpectrogram.png");
// var outputClusterImagePath = Path.Combine(resultDir, "Clusters.bmp");

// +++++++++++++++++++++++++++++++++++++++++++++++++patch sampling from 1-min recordings

Expand Down Expand Up @@ -946,11 +942,11 @@ public override Task<int> Execute(CommandLineApplication app)
public static void ExtractClusteringFeatures()
{
LoggedConsole.WriteLine("feature extraction process...");
var inputDir = @"D:\Mahnoosh\Liz\"; //@"C:\Users\kholghim\Mahnoosh\UnsupervisedFeatureLearning\"; //@"M:\Postdoc\Liz\"; //
var inputDir = @"M:\Postdoc\Liz\"; //@"D:\Mahnoosh\Liz\"; //@"C:\Users\kholghim\Mahnoosh\UnsupervisedFeatureLearning\"; //@"M:\Postdoc\Liz\"; //
var resultDir = Path.Combine(inputDir, "FeatureLearning");
//var trainSetPath = Path.Combine(inputDir, "TrainSet");
var testSetPath = Path.Combine(inputDir, "TestSet");
var configPath = @"D:\Mahnoosh\Liz\AnalysisConfigFiles\FeatureLearningConfig.yml"; //@"C:\Work\GitHub\audio-analysis\src\AnalysisConfigFiles\FeatureLearningConfig.yml"; //
var configPath = @"C:\Users\kholghim\Mahnoosh\Liz\FeatureLearningConfig.yml"; //@"D:\Mahnoosh\Liz\AnalysisConfigFiles\FeatureLearningConfig.yml"; //@"C:\Work\GitHub\audio-analysis\src\AnalysisConfigFiles\FeatureLearningConfig.yml"; //
var centroidsPath = Path.Combine(resultDir, "ClusterCentroids0.csv");

var configFile = configPath.ToFileInfo();
Expand Down Expand Up @@ -985,7 +981,7 @@ public static void GenerateSpectrograms()

int frameSize = 1024;
int finalBinCount = 256;
int hertzInterval = 1000;
// int hertzInterval = 1000;
FreqScaleType scaleType = FreqScaleType.Mel;
var settings = new SpectrogramSettings()
{
Expand Down
33 changes: 22 additions & 11 deletions src/AudioAnalysisTools/DSP/FeatureExtraction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,12 @@ public static void UnsupervisedFeatureExtraction(FeatureLearningSettings config,
// the number of frames that their feature vectors will be concatenated in order to preserve temporal information.
int frameWindowLength = config.FrameWindowLength;

// he step size to make a window of frames
// the step size to make a window of frames
int stepSize = config.StepSize;

// the factor of downsampling
int maxPoolingFactor = config.MaxPoolingFactor;

// check whether there is any file in the folder/subfolders
if (Directory.GetFiles(inputPath, "*", SearchOption.AllDirectories).Length == 0)
{
Expand All @@ -88,7 +91,7 @@ public static void UnsupervisedFeatureExtraction(FeatureLearningSettings config,
// process the wav file if it is not empty
if (fileInfo.Length != 0)
{
string pathToSimilrityVectorsFile = Path.Combine(simVecDir.FullName, fileInfo.Name + ".csv");
string pathToSimilarityVectorsFile = Path.Combine(simVecDir.FullName, fileInfo.Name + ".csv");
var recording = new AudioRecording(filePath);
settings.SourceFileName = recording.BaseName;

Expand Down Expand Up @@ -124,9 +127,13 @@ public static void UnsupervisedFeatureExtraction(FeatureLearningSettings config,
List<double[,]> allSequentialPatchMatrix = new List<double[,]>();
for (int i = 0; i < matrices2.GetLength(0); i++)
{
int rows = matrices2[i].GetLength(0);
int columns = matrices2[i].GetLength(1);
var sequentialPatches = PatchSampling.GetPatches(matrices2[i], patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth), PatchSampling.SamplingMethod.Sequential);
// downsampling the input matrix by a factor of n (MaxPoolingFactor) using max pooling
double[,] downsampledMatrix = FeatureLearning.MaxPooling(matrices2[i], config.MaxPoolingFactor);

int rows = downsampledMatrix.GetLength(0); // matrices2[i].GetLength(0);
int columns = downsampledMatrix.GetLength(1); // matrices2[i].GetLength(1);
var sequentialPatches = PatchSampling.GetPatches(downsampledMatrix, patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth), PatchSampling.SamplingMethod.Sequential);
//var sequentialPatches = PatchSampling.GetPatches(matrices2[i], patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth), PatchSampling.SamplingMethod.Sequential);
allSequentialPatchMatrix.Add(sequentialPatches.ToMatrix());
}

Expand Down Expand Up @@ -186,16 +193,16 @@ public static void UnsupervisedFeatureExtraction(FeatureLearningSettings config,
similarityVectors[j] = allNormCentroids.ToArray()[i].ToMatrix().Dot(normVector);
}

Csv.WriteMatrixToCsv(pathToSimilrityVectorsFile.ToFileInfo(), similarityVectors.ToMatrix());
Csv.WriteMatrixToCsv(pathToSimilarityVectorsFile.ToFileInfo(), similarityVectors.ToMatrix());

// To preserve the temporal information, we can concatenate the similarity vectors of a group of frames using
// FrameWindowLength
// To preserve the temporal information, we can concatenate the similarity vectors of a group of frames
// using FrameWindowLength

// patchId refers to the patch id that has been processed so far according to the step size.
// if we want no overlap between different frame windows, then stepSize = frameWindowLength
int patchId = 0;

// patchCounter refers to the number of patches that has been processed so far accroding to FrameWindowLength.
// patchCounter refers to the number of patches that has been processed so far according to FrameWindowLength.
//int patchCounter = 0;

while (patchId + frameWindowLength < similarityVectors.GetLength(0))
Expand Down Expand Up @@ -229,8 +236,12 @@ public static void UnsupervisedFeatureExtraction(FeatureLearningSettings config,
List<double[,]> allStdFeatureVectors = new List<double[,]>();
List<double[,]> allSkewnessFeatureVectors = new List<double[,]>();

// number of frames needs to be concatenated to form 1 second. Each 24 frames make 1 second.
int numFrames = (24 / patchHeight) * 60; //24 // patchHeight; //
// The number of frames that need to be concatenated to form 1 minute in the case of Black Rail.
// Every 24 frames form 1 second.
// Factors such as stepSize and maxPoolingFactor must be taken into account in the temporal summarization.
//int numFrames = (24 / patchHeight) * 60; //24 // patchHeight;

int numFrames = (24 * 60) / (patchHeight * stepSize * maxPoolingFactor);

foreach (var freqBandFeature in allFeatureTransVectors)
{
Expand Down
63 changes: 49 additions & 14 deletions src/AudioAnalysisTools/DSP/FeatureLearning.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ namespace AudioAnalysisTools.DSP
using System.IO;
using Accord.Math;
using StandardSpectrograms;
using TowseyLibrary;
using WavTools;

/// <summary>
Expand Down Expand Up @@ -117,22 +118,28 @@ public static class FeatureLearning

while (count < allSubmatrices.Count)
{
// downsampling the input matrix by a factor of n (MaxPoolingFactor) using max pooling
double[,] downsampledMatrix = MaxPooling(allSubmatrices.ToArray()[count], config.MaxPoolingFactor);

randomPatchLists[$"randomPatch{count.ToString()}"].Add(PatchSampling
.GetPatches(allSubmatrices.ToArray()[count], patchWidth, patchHeight, numRandomPatches,
.GetPatches(downsampledMatrix, patchWidth, patchHeight, numRandomPatches,
PatchSampling.SamplingMethod.Random).ToMatrix());

/*
randomPatchLists[$"randomPatch{count.ToString()}"].Add(PatchSampling.
GetPatches(allSubmatrices.ToArray()[count], patchWidth, patchHeight, numRandomPatches,
PatchSampling.SamplingMethod.Random).ToMatrix());
/*
// take the total number of frames out of each second minute paper
if (no / 2 == 0)
{
int rows = allSubmatrices.ToArray()[count].GetLength(0);
int columns = allSubmatrices.ToArray()[count].GetLength(1);
randomPatchLists[$"randomPatch{count.ToString()}"].Add(PatchSampling
.GetPatches(allSubmatrices.ToArray()[count], patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth),
PatchSampling.SamplingMethod.Sequential).ToMatrix());
no++;
}
*/
// take the total number of frames out of each second minute paper
if (no / 2 == 0)
{
int rows = allSubmatrices.ToArray()[count].GetLength(0);
int columns = allSubmatrices.ToArray()[count].GetLength(1);
randomPatchLists[$"randomPatch{count.ToString()}"].Add(PatchSampling
.GetPatches(allSubmatrices.ToArray()[count], patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth),
PatchSampling.SamplingMethod.Sequential).ToMatrix());
no++;
}
*/
count++;
}
}
Expand Down Expand Up @@ -163,6 +170,34 @@ public static class FeatureLearning
return allClusteringOutput;
}

/// <summary>
/// This method downsamples the input matrix (x,y) by a factor of n on the temporal scale (x) using max pooling.
/// Consecutive, non-overlapping groups of <paramref name="factor"/> rows (first dimension) are collapsed
/// into a single row that holds, for every column, the largest value found in that group.
/// Trailing rows that do not fill a complete group are discarded.
/// </summary>
/// <param name="matrix">The matrix to downsample; the first dimension is the one being pooled.</param>
/// <param name="factor">The number of consecutive rows merged into one output row. Must be at least 1.</param>
/// <returns>
/// A new matrix with <c>matrix.GetLength(0) / factor</c> rows (integer division) and the same number of
/// columns as <paramref name="matrix"/>. When the input has fewer rows than <paramref name="factor"/>,
/// the result has zero rows.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="matrix"/> is null.</exception>
/// <exception cref="System.ArgumentOutOfRangeException"><paramref name="factor"/> is less than 1.</exception>
public static double[,] MaxPooling(double[,] matrix, int factor)
{
    if (matrix == null)
    {
        throw new System.ArgumentNullException(nameof(matrix));
    }

    // A factor of zero or less can never advance through the rows, so reject it up front
    // instead of looping forever.
    if (factor < 1)
    {
        throw new System.ArgumentOutOfRangeException(nameof(factor), factor, "The max pooling factor must be at least 1.");
    }

    int columnCount = matrix.GetLength(1);

    // Integer division drops an incomplete trailing group of rows.
    int pooledRowCount = matrix.GetLength(0) / factor;
    var pooled = new double[pooledRowCount, columnCount];

    for (int pooledRow = 0; pooledRow < pooledRowCount; pooledRow++)
    {
        int firstRow = pooledRow * factor;
        int endRow = firstRow + factor;

        for (int column = 0; column < columnCount; column++)
        {
            // Seed with the first value of the group, then scan the remaining rows of the group.
            double max = matrix[firstRow, column];
            for (int row = firstRow + 1; row < endRow; row++)
            {
                if (matrix[row, column] > max)
                {
                    max = matrix[row, column];
                }
            }

            pooled[pooledRow, column] = max;
        }
    }

    return pooled;
}

/// <summary>
/// This method is called supervised feature learning because the frames to form a cluster
/// have been manually selected from 1-min recordings.
Expand Down
40 changes: 23 additions & 17 deletions src/AudioAnalysisTools/DSP/FeatureLearningSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,55 +38,61 @@ public class FeatureLearningSettings : Config

public const bool DefaultDoWhitening = true;

public const int DefaultMaxPoolingFactor = 1;

/// <summary>
/// Initializes a new instance of the <see cref="FeatureLearningSettings"/> class.
/// CONSTRUCTOR
/// </summary>

/*
public FeatureLearningSettings()
{
this.FrequencyScaleType = DefaultFrequencyScaleType;
//this.HertzInterval = DefaultHertzInterval;
this.FrameSize = DefaultFrameSize;
this.FinalBinCount = DefaultFinalBinCount;
this.MinFreqBin = DefaultMinFreqBin;
this.MaxFreqBin = DefaultMaxFreqBin;
this.NumFreqBand = DefaultNumFreqBand;
//this.PatchWidth = DefaultPatchWidth;
this.PatchHeight = DefaultPatchHeight;
this.NumRandomPatches = DefaultNumRandomPatches;
this.NumClusters = DefaultNumClusters;
this.DoNoiseReduction = DefaultDoNoiseReduction;
this.DoWhitening = DefaultDoWhitening;
this.MaxPoolingFactor = DefaultMaxPoolingFactor;
}
*/

public FreqScaleType FrequencyScaleType { get; set; } = DefaultFrequencyScaleType;

public FreqScaleType FrequencyScaleType { get; set; }
//public int HertzInterval { get; set; } = DefaultHertzInterval;

//public int HertzInterval { get; set; }
public int FrameSize { get; set; } = DefaultFrameSize;

public int FrameSize { get; set; }
public int FinalBinCount { get; set; } = DefaultFinalBinCount;

public int FinalBinCount { get; set; }
public int MinFreqBin { get; set; } = DefaultMinFreqBin;

public int MinFreqBin { get; set; }
public int MaxFreqBin { get; set; } = DefaultMaxFreqBin;

public int MaxFreqBin { get; set; }
public int NumFreqBand { get; set; } = DefaultNumFreqBand;

public int NumFreqBand { get; set; }
//public int PatchWidth { get; set; } = DefaultPatchWidth;

//public int PatchWidth { get; set; }
public int PatchHeight { get; set; } = DefaultPatchHeight;

public int PatchHeight { get; set; }
public int FrameWindowLength { get; set; } = DefaultFrameWindowLength;

public int FrameWindowLength { get; set; }
public int StepSize { get; set; } = DefaultStepSize;

public int StepSize { get; set; }
public int NumRandomPatches { get; set; } = DefaultNumRandomPatches;

public int NumRandomPatches { get; set; }
public int NumClusters { get; set; } = DefaultNumClusters;

public int NumClusters { get; set; }
public bool DoNoiseReduction { get; set; } = DefaultDoNoiseReduction;

public bool DoNoiseReduction { get; set; }
public bool DoWhitening { get; set; } = DefaultDoWhitening;

public bool DoWhitening { get; set; }
public int MaxPoolingFactor { get; set; } = DefaultMaxPoolingFactor;
}
}

0 comments on commit e9be1d4

Please sign in to comment.