I am implementing a K-nearest-neighbors classification algorithm in C# for training and test sets of about 20,000 samples each, with 25 features per sample.
In my implementation there are only two classes, represented by "0" and "1". At the moment I have the following simple implementation:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

static int[] TestKnnCase(IList<double[]> trainSamples, IList<double[]> testSamples, IList<int> trainClasses, int K)
{
    Console.WriteLine("Performing KNN with K = " + K);
    var testNumber = testSamples.Count;
    var trainNumber = trainSamples.Count;
    var testResults = new int[testNumber];

    // Reusable buffer of (distance, training index) pairs, one row per training sample.
    var distances = new double[trainNumber][];
    for (var i = 0; i < trainNumber; i++)
    {
        distances[i] = new double[2];
    }

    for (var tst = 0; tst < testNumber; tst++)
    {
        // Distance from the current test sample to every training sample.
        Parallel.For(0, trainNumber, trn =>
        {
            var dist = GetDistance(testSamples[tst], trainSamples[trn]);
            distances[trn][0] = dist;
            distances[trn][1] = trn;
        });

        // Take the K nearest training samples and let them vote.
        var votingDistances = distances.AsParallel().OrderBy(t => t[0]).Take(K);
        var yea = 0.0;
        var nay = 0.0;
        foreach (var voter in votingDistances)
        {
            if (trainClasses[(int)voter[1]] == 1)
                yea++;
            else
                nay++;
        }
        testResults[tst] = yea > nay ? 1 : 0;
    }
    return testResults;
}

// Squared Euclidean distance; the square root is omitted because it does not
// change the ordering of the neighbours.
static double GetDistance(IList<double> sample1, IList<double> sample2)
{
    var distance = 0.0;
    for (var i = 0; i < sample1.Count; i++)
    {
        var temp = sample1[i] - sample2[i];
        distance += temp * temp;
    }
    return distance;
}
It takes quite a while to complete: about 80 seconds on my system. How can I optimize this, and make sure it scales to larger sample sets? As you can see, I have tried PLINQ and parallel for loops, which helped (without them it took about 120 seconds). What else can I do?
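For comparison, here is a sketch of one variation I am considering (untested, and the method name is my own): parallelise over the test samples instead of the training samples, so each test point owns a local distance buffer and there is no shared distances array, and sort that local array directly instead of going through PLINQ. It reuses GetDistance and the using directives above.

// Untested sketch: flip the parallelism to the outer loop over test samples.
// Each parallel iteration works on its own (distance, index) array, so no
// state is shared between threads.
static int[] TestKnnParallelOverTests(IList<double[]> trainSamples,
                                      IList<double[]> testSamples,
                                      IList<int> trainClasses, int K)
{
    var results = new int[testSamples.Count];
    Parallel.For(0, testSamples.Count, tst =>
    {
        var test = testSamples[tst];
        var distances = new (double Dist, int Index)[trainSamples.Count];
        for (var trn = 0; trn < trainSamples.Count; trn++)
            distances[trn] = (GetDistance(test, trainSamples[trn]), trn);

        // A full sort is still O(n log n); a bounded max-heap of size K
        // would reduce this to roughly O(n log K).
        Array.Sort(distances, (a, b) => a.Dist.CompareTo(b.Dist));

        var yea = 0;
        for (var k = 0; k < K; k++)
            if (trainClasses[distances[k].Index] == 1) yea++;
        results[tst] = yea > K - yea ? 1 : 0;
    });
    return results;
}

This sets up the parallel machinery once instead of once per test sample and drops the AsParallel ordering, but I have not benchmarked it against the version above.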
One suggestion: use a KD-tree for the KNN search instead of comparing every test point against every training point (a rough sketch of the idea follows after these comments).
See qaru.site/questions/394106/... for a related question.
There are existing KNN implementations in both R and C# that you could compare against instead of rolling your own.
Another option is PCA to reduce the dimensionality before the search; you currently have 25 features.
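To make the KD-tree suggestion concrete, here is a minimal sketch of the idea (my own illustration, not code from this thread, using the same squared Euclidean distance as GetDistance above): build a tree over the training points once, then answer each K-nearest query by descending the tree and pruning branches that cannot contain a closer point.

// One node of the tree: a training point, its 0/1 class label, and the axis
// this node splits on.
class KdNode
{
    public double[] Point;
    public int Label;
    public int SplitAxis;
    public KdNode Left, Right;
}

static class KdTree
{
    // Recursively build the tree by splitting on the median along one axis,
    // cycling through the axes with depth.
    public static KdNode Build(List<(double[] Point, int Label)> items, int depth = 0)
    {
        if (items.Count == 0) return null;
        int axis = depth % items[0].Point.Length;
        var ordered = items.OrderBy(it => it.Point[axis]).ToList();
        int mid = ordered.Count / 2;
        return new KdNode
        {
            Point = ordered[mid].Point,
            Label = ordered[mid].Label,
            SplitAxis = axis,
            Left = Build(ordered.Take(mid).ToList(), depth + 1),
            Right = Build(ordered.Skip(mid + 1).ToList(), depth + 1)
        };
    }

    // Labels of the K training points nearest to the query
    // (squared Euclidean distance, matching GetDistance above).
    public static List<int> Nearest(KdNode root, double[] query, int k)
    {
        var best = new List<(double Dist, int Label)>();

        void Search(KdNode node)
        {
            if (node == null) return;

            double d = 0;
            for (int i = 0; i < query.Length; i++)
            {
                var t = query[i] - node.Point[i];
                d += t * t;
            }
            // Keep only the K closest candidates seen so far; for small K a
            // sorted list is fine, a bounded max-heap is the usual choice.
            best.Add((d, node.Label));
            best = best.OrderBy(b => b.Dist).Take(k).ToList();

            double diff = query[node.SplitAxis] - node.Point[node.SplitAxis];
            var near = diff <= 0 ? node.Left : node.Right;
            var far = diff <= 0 ? node.Right : node.Left;
            Search(near);
            // Visit the far side only if the splitting plane is closer than
            // the current K-th best distance (or fewer than K have been seen).
            if (best.Count < k || diff * diff < best[best.Count - 1].Dist)
                Search(far);
        }

        Search(root);
        return best.Select(b => b.Label).ToList();
    }
}

The tree is built once, and each query then visits only a subset of the 20,000 training points; with 25 dimensions the pruning is less effective than in low dimensions, so it is worth benchmarking against the brute-force version.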