From 511978befd88d548e341e86d8c03a49feea8d7b4 Mon Sep 17 00:00:00 2001 From: tuidan <40883104+tuidan@users.noreply.github.com> Date: Wed, 25 Aug 2021 16:24:36 +0800 Subject: [PATCH] Birch wx (#48) * modify data sink part in StreamKM * fix bug for birch, complete task #10, modify store functions --- benchmark/datasets/{new.txt => CoverType.txt} | 0 benchmark/datasets/Mock.txt | 150 +++++++ benchmark/src/Benchmark.cpp | 14 +- include/Algorithm/Algorithm.hpp | 2 +- include/Algorithm/AlgorithmFactory.hpp | 16 +- include/Algorithm/Birch.hpp | 56 +++ include/Algorithm/DataStructure/CFTree.hpp | 67 ++++ .../DataStructure/DataStructureFactory.hpp | 5 + .../Algorithm/DataStructure/FeatureVector.hpp | 34 ++ include/Algorithm/DataStructure/Point.hpp | 1 + include/Utils/BenchmarkUtils.hpp | 3 + src/Algorithm/Algorithm.cpp | 16 +- src/Algorithm/AlgorithmFactory.cpp | 58 ++- src/Algorithm/Birch.cpp | 372 ++++++++++++++++++ src/Algorithm/CMakeLists.txt | 1 + src/Algorithm/DataStructure/CFTree.cpp | 102 +++++ src/Algorithm/DataStructure/CMakeLists.txt | 2 + .../DataStructure/DataStructureFactory.cpp | 6 + src/Algorithm/DataStructure/FeatureVector.cpp | 72 ++++ src/Algorithm/DataStructure/Point.cpp | 3 + src/Algorithm/StreamKM.cpp | 49 +-- src/Utils/BenchmarkUtils.cpp | 9 +- test/CMakeLists.txt | 4 +- test/SystemTest.cpp | 50 --- test/SystemTest/BirchTest.cpp | 47 +++ test/SystemTest/CluStreamTest.cpp | 52 +++ test/SystemTest/StreamKMTest.cpp | 48 +++ test/datasets/{new.txt => CoverType.txt} | 0 test/datasets/Mock.txt | 150 +++++++ 29 files changed, 1241 insertions(+), 148 deletions(-) rename benchmark/datasets/{new.txt => CoverType.txt} (100%) create mode 100644 benchmark/datasets/Mock.txt create mode 100644 include/Algorithm/Birch.hpp create mode 100644 include/Algorithm/DataStructure/CFTree.hpp create mode 100644 include/Algorithm/DataStructure/FeatureVector.hpp create mode 100644 src/Algorithm/Birch.cpp create mode 100644 src/Algorithm/DataStructure/CFTree.cpp create mode 
100644 src/Algorithm/DataStructure/FeatureVector.cpp delete mode 100644 test/SystemTest.cpp create mode 100644 test/SystemTest/BirchTest.cpp create mode 100644 test/SystemTest/CluStreamTest.cpp create mode 100644 test/SystemTest/StreamKMTest.cpp rename test/datasets/{new.txt => CoverType.txt} (100%) create mode 100644 test/datasets/Mock.txt diff --git a/benchmark/datasets/new.txt b/benchmark/datasets/CoverType.txt similarity index 100% rename from benchmark/datasets/new.txt rename to benchmark/datasets/CoverType.txt diff --git a/benchmark/datasets/Mock.txt b/benchmark/datasets/Mock.txt new file mode 100644 index 00000000..de6ed589 --- /dev/null +++ b/benchmark/datasets/Mock.txt @@ -0,0 +1,150 @@ +1 5.1 3.5 1.4 0.2 0 +2 4.9 3 1.4 0.2 0 +3 4.7 3.2 1.3 0.2 0 +4 4.6 3.1 1.5 0.2 0 +5 5 3.6 1.4 0.2 0 +6 5.4 3.9 1.7 0.4 0 +7 4.6 3.4 1.4 0.3 0 +8 5 3.4 1.5 0.2 0 +9 4.4 2.9 1.4 0.2 0 +10 4.9 3.1 1.5 0.1 0 +11 5.4 3.7 1.5 0.2 0 +12 4.8 3.4 1.6 0.2 0 +13 4.8 3 1.4 0.1 0 +14 4.3 3 1.1 0.1 0 +15 5.8 4 1.2 0.2 0 +16 5.7 4.4 1.5 0.4 0 +17 5.4 3.9 1.3 0.4 0 +18 5.1 3.5 1.4 0.3 0 +19 5.7 3.8 1.7 0.3 0 +20 5.1 3.8 1.5 0.3 0 +21 5.4 3.4 1.7 0.2 0 +22 5.1 3.7 1.5 0.4 0 +23 4.6 3.6 1 0.2 0 +24 5.1 3.3 1.7 0.5 0 +25 4.8 3.4 1.9 0.2 0 +26 5 3 1.6 0.2 0 +27 5 3.4 1.6 0.4 0 +28 5.2 3.5 1.5 0.2 0 +29 5.2 3.4 1.4 0.2 0 +30 4.7 3.2 1.6 0.2 0 +31 4.8 3.1 1.6 0.2 0 +32 5.4 3.4 1.5 0.4 0 +33 5.2 4.1 1.5 0.1 0 +34 5.5 4.2 1.4 0.2 0 +35 4.9 3.1 1.5 0.1 0 +36 5 3.2 1.2 0.2 0 +37 5.5 3.5 1.3 0.2 0 +38 4.9 3.1 1.5 0.1 0 +39 4.4 3 1.3 0.2 0 +40 5.1 3.4 1.5 0.2 0 +41 5 3.5 1.3 0.3 0 +42 4.5 2.3 1.3 0.3 0 +43 4.4 3.2 1.3 0.2 0 +44 5 3.5 1.6 0.6 0 +45 5.1 3.8 1.9 0.4 0 +46 4.8 3 1.4 0.3 0 +47 5.1 3.8 1.6 0.2 0 +48 4.6 3.2 1.4 0.2 0 +49 5.3 3.7 1.5 0.2 0 +50 5 3.3 1.4 0.2 0 +51 7 3.2 4.7 1.4 1 +52 6.4 3.2 4.5 1.5 1 +53 6.9 3.1 4.9 1.5 1 +54 5.5 2.3 4 1.3 1 +55 6.5 2.8 4.6 1.5 1 +56 5.7 2.8 4.5 1.3 1 +57 6.3 3.3 4.7 1.6 1 +58 4.9 2.4 3.3 1 1 +59 6.6 2.9 4.6 1.3 1 +60 5.2 2.7 3.9 1.4 1 +61 5 2 3.5 1 1 +62 
5.9 3 4.2 1.5 1 +63 6 2.2 4 1 1 +64 6.1 2.9 4.7 1.4 1 +65 5.6 2.9 3.6 1.3 1 +66 6.7 3.1 4.4 1.4 1 +67 5.6 3 4.5 1.5 1 +68 5.8 2.7 4.1 1 1 +69 6.2 2.2 4.5 1.5 1 +70 5.6 2.5 3.9 1.1 1 +71 5.9 3.2 4.8 1.8 1 +72 6.1 2.8 4 1.3 1 +73 6.3 2.5 4.9 1.5 1 +74 6.1 2.8 4.7 1.2 1 +75 6.4 2.9 4.3 1.3 1 +76 6.6 3 4.4 1.4 1 +77 6.8 2.8 4.8 1.4 1 +78 6.7 3 5 1.7 1 +79 6 2.9 4.5 1.5 1 +80 5.7 2.6 3.5 1 1 +81 5.5 2.4 3.8 1.1 1 +82 5.5 2.4 3.7 1 1 +83 5.8 2.7 3.9 1.2 1 +84 6 2.7 5.1 1.6 1 +85 5.4 3 4.5 1.5 1 +86 6 3.4 4.5 1.6 1 +87 6.7 3.1 4.7 1.5 1 +88 6.3 2.3 4.4 1.3 1 +89 5.6 3 4.1 1.3 1 +90 5.5 2.5 4 1.3 1 +91 5.5 2.6 4.4 1.2 1 +92 6.1 3 4.6 1.4 1 +93 5.8 2.6 4 1.2 1 +94 5 2.3 3.3 1 1 +95 5.6 2.7 4.2 1.3 1 +96 5.7 3 4.2 1.2 1 +97 5.7 2.9 4.2 1.3 1 +98 6.2 2.9 4.3 1.3 1 +99 5.1 2.5 3 1.1 1 +100 5.7 2.8 4.1 1.3 1 +101 6.3 3.3 6 2.5 2 +102 5.8 2.7 5.1 1.9 2 +103 7.1 3 5.9 2.1 2 +104 6.3 2.9 5.6 1.8 2 +105 6.5 3 5.8 2.2 2 +106 7.6 3 6.6 2.1 2 +107 4.9 2.5 4.5 1.7 2 +108 7.3 2.9 6.3 1.8 2 +109 6.7 2.5 5.8 1.8 2 +110 7.2 3.6 6.1 2.5 2 +111 6.5 3.2 5.1 2 2 +112 6.4 2.7 5.3 1.9 2 +113 6.8 3 5.5 2.1 2 +114 5.7 2.5 5 2 2 +115 5.8 2.8 5.1 2.4 2 +116 6.4 3.2 5.3 2.3 2 +117 6.5 3 5.5 1.8 2 +118 7.7 3.8 6.7 2.2 2 +119 7.7 2.6 6.9 2.3 2 +120 6 2.2 5 1.5 2 +121 6.9 3.2 5.7 2.3 2 +122 5.6 2.8 4.9 2 2 +123 7.7 2.8 6.7 2 2 +124 6.3 2.7 4.9 1.8 2 +125 6.7 3.3 5.7 2.1 2 +126 7.2 3.2 6 1.8 2 +127 6.2 2.8 4.8 1.8 2 +128 6.1 3 4.9 1.8 2 +129 6.4 2.8 5.6 2.1 2 +130 7.2 3 5.8 1.6 2 +131 7.4 2.8 6.1 1.9 2 +132 7.9 3.8 6.4 2 2 +133 6.4 2.8 5.6 2.2 2 +134 6.3 2.8 5.1 1.5 2 +135 6.1 2.6 5.6 1.4 2 +136 7.7 3 6.1 2.3 2 +137 6.3 3.4 5.6 2.4 2 +138 6.4 3.1 5.5 1.8 2 +139 6 3 4.8 1.8 2 +140 6.9 3.1 5.4 2.1 2 +141 6.7 3.1 5.6 2.4 2 +142 6.9 3.1 5.1 2.3 2 +143 5.8 2.7 5.1 1.9 2 +144 6.8 3.2 5.9 2.3 2 +145 6.7 3.3 5.7 2.5 2 +146 6.7 3 5.2 2.3 2 +147 6.3 2.5 5 1.9 2 +148 6.5 3 5.2 2 2 +149 6.2 3.4 5.4 2.3 2 +150 5.9 3 5.1 1.8 2 diff --git a/benchmark/src/Benchmark.cpp b/benchmark/src/Benchmark.cpp index 
ed1e72fe..b7c8f09d 100644 --- a/benchmark/src/Benchmark.cpp +++ b/benchmark/src/Benchmark.cpp @@ -35,19 +35,7 @@ int main(int argc, char **argv) { SESAME::DataSinkPtr sinkPtr = SESAME::DataSinkFactory::create(); //Create Algorithm. - SESAME::AlgorithmPtr algoPtr = SESAME::AlgorithmFactory::create(cmd_params.algoName, - cmd_params.pointNumber, - cmd_params.clusterNumber, - cmd_params.dimension, - cmd_params.coresetSize, - cmd_params.seed, - cmd_params.lastArrivingNum, - cmd_params.timeWindow, - cmd_params.timeInterval, - cmd_params.onlineClusterNumber, - cmd_params.radiusFactor, - cmd_params.initBuffer, - cmd_params.offlineTimeWindow); + SESAME::AlgorithmPtr algoPtr = SESAME::AlgorithmFactory::create(cmd_params); //Run algorithm producing results. BenchmarkUtils::runBenchmark(cmd_params, sourcePtr, sinkPtr, algoPtr); diff --git a/include/Algorithm/Algorithm.hpp b/include/Algorithm/Algorithm.hpp index 83267217..b095e893 100644 --- a/include/Algorithm/Algorithm.hpp +++ b/include/Algorithm/Algorithm.hpp @@ -30,7 +30,7 @@ class Algorithm { virtual void Initilize() = 0; virtual void runOnlineClustering(SESAME::PointPtr input) = 0; virtual void runOfflineClustering(SESAME::DataSinkPtr ptr) = 0; - void store(std::string outputPath, int numberOfCenters, int dimension, std::vector results); + void store(std::string outputPath, int dimension, std::vector results); }; } diff --git a/include/Algorithm/AlgorithmFactory.hpp b/include/Algorithm/AlgorithmFactory.hpp index 97c26ab3..81bcb119 100644 --- a/include/Algorithm/AlgorithmFactory.hpp +++ b/include/Algorithm/AlgorithmFactory.hpp @@ -8,23 +8,13 @@ #define SESAME_SRC_ALGORITHM_ALGORITHMFACTORY_HPP_ #include +#include + namespace SESAME { class AlgorithmFactory { public: - static SESAME::AlgorithmPtr create(std::string algoName, - int pointNumber, - int clusterNumber, - int dimension, - int coresetSize, - int seed, - int lastArrivingNum, - int timeWindow, - unsigned int timeInterval, - int onlineClusterNumber, - double 
radiusFactor, - int initBuffer, - int offlineTimeWindow); + static SESAME::AlgorithmPtr create(param_t &cmd_params); }; } #endif //SESAME_SRC_ALGORITHM_ALGORITHMFACTORY_HPP_ diff --git a/include/Algorithm/Birch.hpp b/include/Algorithm/Birch.hpp new file mode 100644 index 00000000..35b9a7cd --- /dev/null +++ b/include/Algorithm/Birch.hpp @@ -0,0 +1,56 @@ +// +// Created by tuidan on 2021/8/24. +// + +#ifndef SESAME_INCLUDE_ALGORITHM_BIRCH_HPP_ +#define SESAME_INCLUDE_ALGORITHM_BIRCH_HPP_ +#include +#include +#include +#include +#include +namespace SESAME { + +class BirchParameter : public AlgorithmParameters { + public: + int maxInternalNodes; // B + int maxLeafNodes; // L + double thresholdDistance; // T +}; + +class Birch : public Algorithm { + + public: + BirchParameter BirchParam; + std::shared_ptr kmeans; //used for offline initialization + int leafMask = 0; + NodePtr root; + vector leafNodes; + CFTreePtr cfTree; + Birch(); + + ~Birch(); + + void Initilize() override; + + void runOnlineClustering(PointPtr input) override; + + void runOfflineClustering(DataSinkPtr sinkPtr) override; + private: + + void forwardInsert(PointPtr point); + void backwardEvolution(NodePtr &curNode, PointPtr &point); + void calculateCorDistance(vector> &distance, vector &nodes); + double calculateRadius(PointPtr &point, PointPtr ¢roid); + void selectChild(vector &children, PointPtr &insertPoint, NodePtr &node); + double clusterToClusterDist(NodePtr &nodeA, NodePtr &nodeB); + void pointToClusterDist(PointPtr &insertPoint, NodePtr &node, double &dist); + void calculateCentroid(CFPtr &cf, PointPtr ¢roid); + void updateNLS(NodePtr &node, PointPtr &point, bool updateAll); + void initializeCF(CFPtr &cf, int dimension); + void setCFToBlankNode(SESAME::NodePtr &curNode, SESAME::PointPtr &point); + void addNodeNLSToNode(SESAME::NodePtr &child, SESAME::NodePtr &parent); + void clearChildParents(vector &children); +}; +} +#endif //SESAME_INCLUDE_ALGORITHM_BIRCH_HPP_ diff --git 
a/include/Algorithm/DataStructure/CFTree.hpp b/include/Algorithm/DataStructure/CFTree.hpp new file mode 100644 index 00000000..cdf27d4a --- /dev/null +++ b/include/Algorithm/DataStructure/CFTree.hpp @@ -0,0 +1,67 @@ +// +// Created by tuidan on 2021/8/24. +// + +#ifndef SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CFTREE_HPP_ +#define SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CFTREE_HPP_ +#include +#include +#include +#include +#include +namespace SESAME { + +// define the share point of the class object +class CFNode; +class CFTree; +typedef std::shared_ptr NodePtr; +typedef std::shared_ptr CFTreePtr; + +class CFTree { + private: + int maxInternalNodes; // max CF number of each internal node + int maxLeafNodes; // max CF number of each leaf node + double thresholdDistance; // threshold radius of each sub cluster in leaf nodes + public: + CFTree(int b, int l, double t); + void initialTree(int b, int l, double t); + ~CFTree(); + int getB() const; + int getL() const; + double getT() const; + void setB(int b); + void setL(int l); + void setT(double t); +}; + + +class CFNode { + private: + CFPtr curCF; + bool isLeaf; + std::vector children; + std::vector parent; + std:: vector clusterPoints; + int index; + public: + CFNode(); + ~CFNode(); + SESAME::CFPtr getCF(); + void setCF(SESAME::CFPtr &cf); + void insertPoint(PointPtr &p); + std::vector getPoints(); + std::vector getParents(); + int getIndex() const; + std::vector getChildren(); + void removeChild(SESAME::NodePtr &child); + SESAME::NodePtr copy(); + bool getIsLeaf(); + void setIsLeaf(bool leaf); + void setNode(SESAME::CFPtr &Node); + void setIndex(int Index); + void setParent(SESAME::NodePtr &Parent); + void setChild(SESAME::NodePtr &child); + void clearParents(); +}; +} +#endif //SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CFTREE_HPP_ diff --git a/include/Algorithm/DataStructure/DataStructureFactory.hpp b/include/Algorithm/DataStructure/DataStructureFactory.hpp index 378c6823..6ffea88c 100644 --- 
a/include/Algorithm/DataStructure/DataStructureFactory.hpp +++ b/include/Algorithm/DataStructure/DataStructureFactory.hpp @@ -11,6 +11,9 @@ #include #include #include +#include +#include + namespace SESAME { class DataStructureFactory { @@ -28,6 +31,8 @@ class DataStructureFactory { static void clearMicroCluster(MicroClusterPtr microCluster); static SnapshotPtr createSnapshot(MicroClusters & otherMicroClusters,int elapsedTime); static void clearSnapshot(SnapshotPtr snapshot); + static CFTreePtr createCFTree(); + static NodePtr createNode(); }; } diff --git a/include/Algorithm/DataStructure/FeatureVector.hpp b/include/Algorithm/DataStructure/FeatureVector.hpp new file mode 100644 index 00000000..08530df1 --- /dev/null +++ b/include/Algorithm/DataStructure/FeatureVector.hpp @@ -0,0 +1,34 @@ +// +// Created by tuidan on 2021/8/24. +// + +#ifndef SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_FEATUREVECTOR_H_ +#define SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_FEATUREVECTOR_H_ +#include +#include +#include + +namespace SESAME { +class CF; +typedef std::shared_ptr CFPtr; +class CF { + private: + // N是子类中节点的数目,LS是N个节点的线性和,SS是N个节点的平方和 + int NumberOfNodes; + std::vector LS; + std::vector SS; + public: + CF(); + ~CF(); + int getN() const; + void setN(int n); + std::vector getLS() const; + std::vector getSS() const; + double getLSItem(int index) const; + double getSSItem(int index) const; + void setLS(std::vector &newLs); + void setSS(std::vector &newSs); + SESAME::CFPtr copy(); +}; +} +#endif //SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_FEATUREVECTOR_H_ diff --git a/include/Algorithm/DataStructure/Point.hpp b/include/Algorithm/DataStructure/Point.hpp index aea24284..736def28 100644 --- a/include/Algorithm/DataStructure/Point.hpp +++ b/include/Algorithm/DataStructure/Point.hpp @@ -36,6 +36,7 @@ class Point { void setClusteringCenter(int index); int getDimension() const; void setDimension(int d); + int getFeatureLength(); SESAME::PointPtr copy(); }; } diff --git 
a/include/Utils/BenchmarkUtils.hpp b/include/Utils/BenchmarkUtils.hpp index 1b4d0705..46167936 100644 --- a/include/Utils/BenchmarkUtils.hpp +++ b/include/Utils/BenchmarkUtils.hpp @@ -33,6 +33,9 @@ struct param_t { double radiusFactor; int initBuffer; int offlineTimeWindow; + int maxInternalNodes; + int maxLeafNodes; + double thresholdDistance; std::string inputPath; std::string outputPath; std::string algoName; diff --git a/src/Algorithm/Algorithm.cpp b/src/Algorithm/Algorithm.cpp index ab4b0bde..e1d414d3 100644 --- a/src/Algorithm/Algorithm.cpp +++ b/src/Algorithm/Algorithm.cpp @@ -7,25 +7,15 @@ #include #include void SESAME::Algorithm::store(std::string outputPath, - int numberOfCenters, int dimension, std::vector result) { - + int numberOfCenters = (int)result.size(); FILE *out = fopen(outputPath.c_str(), "w"); for (int i = 0; i < numberOfCenters; i++) { int l; fprintf(out, "%f ", result[i]->getWeight()); - for (l = 0; l < dimension - 1; l++) { - if (result[i]->getWeight() != 0.0) { - fprintf(out, "%f ", result[i]->getFeatureItem(l) / result[i]->getWeight()); - } else { - fprintf(out, "%f ", result[i]->getFeatureItem(l)); - } - } - if (result[i]->getWeight() != 0.0) { - fprintf(out, "%f", result[i]->getFeatureItem(dimension - 1) / result[i]->getWeight()); - } else { - fprintf(out, "%f", result[i]->getFeatureItem(dimension - 1)); + for (l = 0; l < dimension; l++) { + fprintf(out, "%f ", result[i]->getFeatureItem(l) / result[i]->getWeight()); } fprintf(out, "\n"); } diff --git a/src/Algorithm/AlgorithmFactory.cpp b/src/Algorithm/AlgorithmFactory.cpp index 59776149..6aa01bbf 100644 --- a/src/Algorithm/AlgorithmFactory.cpp +++ b/src/Algorithm/AlgorithmFactory.cpp @@ -6,44 +6,42 @@ #include #include +#include #include -SESAME::AlgorithmPtr SESAME::AlgorithmFactory::create(std::string algoName, - int pointNumber, - int clusterNumber, - int dimension, - int coresetSize, - int seed, - int lastArrivingNum, - int timeWindow, - unsigned int timeInterval, - int 
onlineClusterNumber, - double radiusFactor, - int initBuffer, - int offlineTimeWindow) { - if (algoName == "StreamKMeans") { +SESAME::AlgorithmPtr SESAME::AlgorithmFactory::create(param_t &cmd_params) { + if (cmd_params.algoName == "StreamKMeans") { shared_ptr streamkm = std::make_shared(); - streamkm->StreamKMParam.pointNumber = pointNumber; - streamkm->StreamKMParam.clusterNumber = clusterNumber; - streamkm->StreamKMParam.windowSize = coresetSize; - streamkm->StreamKMParam.seed = seed; - streamkm->StreamKMParam.dimension = dimension; + streamkm->StreamKMParam.pointNumber = cmd_params.pointNumber; + streamkm->StreamKMParam.clusterNumber = cmd_params.clusterNumber; + streamkm->StreamKMParam.windowSize = cmd_params.coresetSize; + streamkm->StreamKMParam.seed = cmd_params.seed; + streamkm->StreamKMParam.dimension = cmd_params.dimension; return (SESAME::AlgorithmPtr) streamkm; } - if (algoName == "CluStream") { + if (cmd_params.algoName == "CluStream") { shared_ptr cluStream = std::make_shared(); - cluStream->CluStreamParam.pointNumber = pointNumber; - cluStream->CluStreamParam.clusterNumber = onlineClusterNumber; - cluStream->CluStreamParam.dimension = dimension; - cluStream->CluStreamParam.lastArrivingNum = lastArrivingNum; - cluStream->CluStreamParam.timeWindow = timeWindow; - cluStream->CluStreamParam.timeInterval = timeInterval; - cluStream->CluStreamParam.offlineClusterNumber = clusterNumber; - cluStream->CluStreamParam.radiusFactor = radiusFactor; - cluStream->CluStreamParam.initBuffer = initBuffer; - cluStream->CluStreamParam.offlineTimeWindow = offlineTimeWindow; + cluStream->CluStreamParam.pointNumber = cmd_params.pointNumber; + cluStream->CluStreamParam.clusterNumber = cmd_params.onlineClusterNumber; + cluStream->CluStreamParam.dimension = cmd_params.dimension; + cluStream->CluStreamParam.lastArrivingNum = cmd_params.lastArrivingNum; + cluStream->CluStreamParam.timeWindow = cmd_params.timeWindow; + cluStream->CluStreamParam.timeInterval = 
cmd_params.timeInterval; + cluStream->CluStreamParam.offlineClusterNumber = cmd_params.clusterNumber; + cluStream->CluStreamParam.radiusFactor = cmd_params.radiusFactor; + cluStream->CluStreamParam.initBuffer = cmd_params.initBuffer; + cluStream->CluStreamParam.offlineTimeWindow = cmd_params.offlineTimeWindow; return (SESAME::AlgorithmPtr) cluStream; } + if (cmd_params.algoName == "Birch") { + shared_ptr birch = std::make_shared(); + birch->BirchParam.pointNumber = cmd_params.pointNumber; + birch->BirchParam.dimension = cmd_params.dimension; + birch->BirchParam.maxInternalNodes = cmd_params.maxInternalNodes; + birch->BirchParam.maxLeafNodes = cmd_params.maxLeafNodes; + birch->BirchParam.thresholdDistance = cmd_params.thresholdDistance; + return (SESAME::AlgorithmPtr) birch; + } throw std::invalid_argument("Unsupported"); } diff --git a/src/Algorithm/Birch.cpp b/src/Algorithm/Birch.cpp new file mode 100644 index 00000000..8dee115e --- /dev/null +++ b/src/Algorithm/Birch.cpp @@ -0,0 +1,372 @@ +// +// Created by tuidan on 2021/8/24. 
+// +#include +#include + +void SESAME::Birch::Initilize() { + this->cfTree = DataStructureFactory::createCFTree(); + this->cfTree->setB(BirchParam.maxInternalNodes); + this->cfTree->setL(BirchParam.maxLeafNodes); + this->cfTree->setT(BirchParam.thresholdDistance); + this->root = DataStructureFactory::createNode(); + this->root->setIsLeaf(true); +} + + +void SESAME::Birch::runOnlineClustering(const SESAME::PointPtr input) { + // insert the root + forwardInsert(input); +} + + +void SESAME::Birch::runOfflineClustering(DataSinkPtr sinkPtr) { + for(int i = 0; i < this->leafNodes.size(); i++) { + PointPtr centroid = DataStructureFactory::createPoint(i, 1, BirchParam.dimension, 0); + for(int j = 0; j < BirchParam.dimension; j++) { + centroid->setFeatureItem(this->leafNodes[i]->getCF()->getLS().at(j) / this->leafNodes[i]->getCF()->getN(), j); + } + sinkPtr->put(centroid->copy()); + } + SESAME_DEBUG( "The size of the centroid is :" << sinkPtr->getResults().size()); +// std::vector> oldGroups, newGroups; +// this->kmeans->runKMeans((int)middleCentroids.size() / 2, (int)middleCentroids.size(), +// middleCentroids,oldGroups,newGroups, true); +// this->kmeans->produceResult(oldGroups, sinkPtr); +} + +SESAME::Birch::Birch() { + +} +SESAME::Birch::~Birch() { + +} +// when a new point insert into the CF, update the CF N, LS and SS +void SESAME::Birch::updateNLS(SESAME::NodePtr &node, SESAME::PointPtr &point, bool updateAll){ + SESAME::NodePtr nodeSearch = node; + while(true) { + SESAME::CFPtr cf = nodeSearch->getCF(); + vector tmpLS = cf->getLS(); + vector tmpSS = cf->getSS(); + if(tmpLS.empty()) { + for(int i = 0; i < point->getDimension(); i++){ + tmpLS.push_back(0); + tmpSS.push_back(0); + } + } + cf->setN(cf->getN() + 1); + if(nodeSearch->getIsLeaf()) { + nodeSearch->insertPoint(point); + } + for(int i = 0; i < point->getDimension(); i++){ + tmpLS[i] += point->getFeatureItem(i); + tmpSS[i] += pow(point->getFeatureItem(i), 2); + } + cf->setLS(tmpLS); + cf->setSS(tmpSS); + 
if(!nodeSearch->getParents().empty() && updateAll) { + nodeSearch = nodeSearch->getParents().at(0); + } else break; + } +} + +// centroid index: -1(virtual) +// centroid feature: mean of the feature of cluster points +// centroid cluster: -1 +void SESAME::Birch::calculateCentroid(SESAME::CFPtr &cf, SESAME::PointPtr ¢roid) { + centroid->setIndex(-1); + centroid->setClusteringCenter(-1); + vector ls = cf->getLS(); + for(int i = 0; i < ls.size(); i++) centroid->setFeatureItem(ls.at(i) / (double)ls.size(), i); +} + +// use Manhattan Distance +void SESAME::Birch::pointToClusterDist(SESAME::PointPtr &insertPoint, SESAME::NodePtr &node, double & dist) { + dist = 0; + SESAME::PointPtr centroid = make_shared(); + SESAME::CFPtr curCF = node->getCF(); + calculateCentroid(curCF, centroid); + for(int i = 0; i < insertPoint->getDimension(); i++) { + dist += abs(centroid->getFeatureItem(i) - insertPoint->getFeatureItem(i)); + } +} + +// use Manhattan Distance +double SESAME::Birch::clusterToClusterDist(SESAME::NodePtr &nodeA, SESAME::NodePtr &nodeB) { + double dist = 0; + SESAME::PointPtr centroidA = make_shared(); + SESAME::PointPtr centroidB = make_shared(); + SESAME::CFPtr curCFA = nodeA->getCF(); + SESAME::CFPtr curCFB = nodeB->getCF(); + calculateCentroid(curCFA, centroidA); + calculateCentroid(curCFB, centroidB);; + for(int i = 0; i < centroidA->getDimension(); i++) { + dist += abs(centroidA->getFeatureItem(i) - centroidB->getFeatureItem(i)); + } + return dist; +} + +// select the closest child cluster according to Manhattan Distance +void SESAME::Birch::selectChild(vector &children, SESAME::PointPtr &insertPoint, SESAME::NodePtr &node) { + double dist = 0; + double temp = 0; + pointToClusterDist(insertPoint, children.at(0), dist); + node = children.at(0); + for(int i = 1; i < children.size(); i++) { + pointToClusterDist(insertPoint, children.at(i), temp); + if(temp < dist) { + dist = temp; + node = children.at(i); + } + } +} + +// calculate the radius of a cluster +double 
SESAME::Birch::calculateRadius(SESAME::PointPtr &point, SESAME::PointPtr ¢roid) { + double denominator = 0; + double radius = 0; + for(int i = 0; i < point->getDimension(); i++) { + denominator += pow(centroid->getFeatureItem(i) - point->getFeatureItem(i), 2); + } + radius = sqrt(denominator); + return radius; +} + +void SESAME::Birch::calculateCorDistance(vector> &distance, vector &nodes) { + // initialization: create a metrics with nxn + for(int i = 0; i < nodes.size(); i++) { + vector row; + for(int j = 0; j < nodes.size(); j++) { + row.push_back(0); + } + distance.push_back(row); + } + + // calculate the correlate distance + for(int i = 0; i < nodes.size(); i++) { + for(int j = i; j < nodes.size(); j++) { + double dist = clusterToClusterDist(nodes[i], nodes[j]); + distance[i][j] = dist; + distance[j][i] = dist; + } + } +} + +void SESAME::Birch::setCFToBlankNode(SESAME::NodePtr &curNode, SESAME::PointPtr &point) { + SESAME::CFPtr curCF = curNode->getCF(); + curCF->setN(curCF->getN() + 1); + vector newLs; + vector newSs; + for(int i = 0; i < point->getDimension(); i++) { + newLs.push_back(point->getFeatureItem(i)); + newSs.push_back(pow(point->getFeatureItem(i), 2)); + } + curCF->setSS(newSs); + curCF->setLS(newLs); +} + +void SESAME::Birch::addNodeNLSToNode(SESAME::NodePtr &child, SESAME::NodePtr &parent) { + SESAME::CFPtr childCF = child->getCF(); + SESAME::CFPtr parCF = parent->getCF(); + parCF->setN(parCF->getN() + childCF->getN()); + vector newLs; + vector newSs; + for(int i = 0; i < childCF->getLS().size(); i++) { + newLs.push_back(childCF->getLS().at(i) + parCF->getLS().at(i)); + newSs.push_back(childCF->getSS().at(i) + parCF->getSS().at(i)); + } + parCF->setLS(newLs); + parCF->setSS(newSs); +} + +void SESAME::Birch::initializeCF(SESAME::CFPtr &cf, int dimension) { + vector ls = cf->getLS(); + vector ss = cf->getSS(); + for(int i = 0; i < dimension; i++) { + ls.push_back(0); + ss.push_back(0); + } + cf->setLS(ls); + cf->setSS(ss); +} + +void 
SESAME::Birch::clearChildParents(vector &children) { + for(auto child : children) { + child->clearParents(); + } +} + +void SESAME::Birch::forwardInsert(SESAME::PointPtr point){ + NodePtr curNode = this->root; + if(curNode->getCF()->getN() == 0) { + updateNLS(curNode, point, true); + } else{ + while(1) { + vector childrenNode = curNode->getChildren(); + if(curNode->getIsLeaf()) { + CFPtr curCF = curNode->getCF(); + if(curCF->getN() == 0) { + initializeCF(curCF, point->getDimension()); + } + PointPtr centroid = make_shared(); + calculateCentroid(curCF, centroid); + if(calculateRadius(point, centroid) <= this->cfTree->getT()) { // concept drift detection + // whether the new radius is lower than threshold T + updateNLS(curNode, point, true); + // means this point could get included in this cluster + SESAME_DEBUG("No concept drift occurs(t <= T), insert tha point into the leaf node..."); + break; + // Normally insert the data point into the tree leafNode without concept drift + } else { + // concept drift adaption + SESAME_DEBUG("Concept drift occurs(t > T), the current leaf node capacity reaches the threshold T"); + backwardEvolution(curNode, point); + break; + } + } else{ + selectChild(childrenNode, point, curNode); + } + } + } +} + +// concept drift adaption +void SESAME::Birch::backwardEvolution(SESAME::NodePtr &curNode, SESAME::PointPtr &point) { + if(curNode->getParents().empty()) { // means current node is root node + SESAME_DEBUG("l <= L, create a new leaf node and insert the point into it(root change)"); + NodePtr newRoot = make_shared(); + newRoot->setIsLeaf(false); + newRoot->setChild(curNode); + curNode->setParent(newRoot); + NodePtr newNode = make_shared(); + newNode->setIsLeaf(true); + newNode->setParent(curNode->getParents().at(0)); + vector curLS = curNode->getCF()->getLS(); + vector curSS = curNode->getCF()->getSS(); + int curN = curNode->getCF()->getN(); + newRoot->getCF()->setLS(curLS); + newRoot->getCF()->setSS(curSS); + 
newRoot->getCF()->setN(curN); + newRoot->setIndex(this->leafMask++); + this->leafNodes.push_back(newRoot); + + + // update the parent node + newRoot->setChild(newNode); + updateNLS(newNode, point, true); + this->root = newRoot; + + } else{ + NodePtr parent = curNode->getParents().at(0); + NodePtr newNode = make_shared(); + newNode->setIsLeaf(true); + updateNLS(newNode, point, false); + newNode->setParent(parent); + parent->setChild(newNode); + if(parent->getChildren().size() <= this->cfTree->getL()){ + // whether the number of CFs(clusters) in the current leaf node is lower thant threshold L + SESAME_DEBUG("l <= L, create a new leaf node and insert the point into it"); + + // update the parent node + updateNLS(parent, point, true); + } else{ + SESAME_DEBUG("l > L, parent node of the current leaf node capacity reaches the threshold L"); + SESAME_DEBUG("split a new parent node from the old one "); + bool CurNodeIsLeaf = true; + while(true) { + NodePtr parParent; + if(parent->getParents().empty()) { + parParent = make_shared(); + parParent->setIsLeaf(false); + this->root = parParent; + CFPtr parCF = parent->getCF(); + parParent->setCF(parCF); + } else{ + parParent = parent->getParents().at(0); + parParent->removeChild(parent); + } + NodePtr newParentA = make_shared(); + NodePtr newParentB = make_shared(); + if(parent->getChildren().at(0)->getIsLeaf()) { + for(int i = 0; i < this->leafNodes.size(); i++) { + if(this->leafNodes.at(i)->getIndex() == parent->getIndex()) { + this->leafNodes.erase(this->leafNodes.begin() + i); + } + } + newParentA->setIndex(++this->leafMask); + newParentB->setIndex(++this->leafMask); + this->leafNodes.push_back(newParentA); + this->leafNodes.push_back(newParentB); + } + + + newParentB->setIsLeaf(false); + newParentA->setIsLeaf(false); + newParentB->setParent(parParent); + parParent->setChild(newParentB); + parParent->setChild(newParentA); + newParentA->setParent(parParent); + CFPtr cfA = newParentA->getCF(); + CFPtr cfB = 
newParentB->getCF(); + initializeCF(cfA, point->getDimension()); + initializeCF(cfB, point->getDimension()); + + vector broNodes = parent->getChildren(); + vector> corCFDistance; + calculateCorDistance(corCFDistance, broNodes); + + // choose two farthest CFs as seedA and seedB + int seedA = 0; + int seedB = 0; + double max = 0; + for(int i = 0; i < broNodes.size(); i++) { + for(int j = i; j < broNodes.size(); j++) { + if(max < corCFDistance[i][j]) { + seedA = i; + seedB = j; + max = corCFDistance[i][j]; + } + } + } + + // insert the child node into the nearest seed(A / B) + clearChildParents(broNodes); + newParentA->setChild(broNodes[seedA]); + addNodeNLSToNode(broNodes[seedA], newParentA); + broNodes[seedA]->setParent(newParentA); + newParentB->setChild(broNodes[seedB]); + broNodes[seedB]->setParent(newParentB); + addNodeNLSToNode(broNodes[seedB], newParentB); + for(int i = 0; i < broNodes.size(); i++) { + if(i != seedA and i != seedB){ + if(corCFDistance[i][seedA] < corCFDistance[i][seedB]) { + newParentA->setChild(broNodes[i]); + addNodeNLSToNode(broNodes[i], newParentA); + broNodes[i]->clearParents(); + broNodes[i]->setParent(newParentA); + }else { + newParentB->setChild(broNodes[i]); + addNodeNLSToNode(broNodes[i], newParentB); + broNodes[i]->clearParents(); + broNodes[i]->setParent(newParentB); + } + } + } + if(CurNodeIsLeaf){ + updateNLS(parParent, point, true); + } + + if(parParent->getChildren().size() <= this->cfTree->getB()) { + SESAME_DEBUG("b < B, remove the old node and insert the new nodeA and nodeB into the parent node"); + break; + }else { + SESAME_DEBUG("b >= B, parent node of the current interior node capacity reaches the threshold B"); + curNode = curNode->getParents()[0]; + parent = parParent; + CurNodeIsLeaf = false; + } + } + } + } +} + diff --git a/src/Algorithm/CMakeLists.txt b/src/Algorithm/CMakeLists.txt index eb2b9485..11395010 100644 --- a/src/Algorithm/CMakeLists.txt +++ b/src/Algorithm/CMakeLists.txt @@ -1,6 +1,7 @@ add_source_sesame( 
StreamKM.cpp
         CluStream.cpp
+        Birch.cpp
         Algorithm.cpp
         AlgorithmFactory.cpp
 )
diff --git a/src/Algorithm/DataStructure/CFTree.cpp b/src/Algorithm/DataStructure/CFTree.cpp
new file mode 100644
index 00000000..3751819f
--- /dev/null
+++ b/src/Algorithm/DataStructure/CFTree.cpp
@@ -0,0 +1,128 @@
+//
+// Created by tuidan on 2021/8/24.
+//
+
+#include <Algorithm/DataStructure/CFTree.hpp>
+
+// Stores the three tree parameters: b -> maxInternalNodes, l -> maxLeafNodes, t -> thresholdDistance.
+SESAME::CFTree::CFTree(int b, int l, double t) {
+  this->maxInternalNodes = b;
+  this->maxLeafNodes = l;
+  this->thresholdDistance = t;
+}
+SESAME::CFTree::~CFTree() {
+
+}
+// Plain accessors for the three parameters.
+int SESAME::CFTree::getB() const {
+  return this->maxInternalNodes;
+}
+int SESAME::CFTree::getL() const {
+  return this->maxLeafNodes;
+}
+double SESAME::CFTree::getT() const {
+  return this->thresholdDistance;
+}
+void SESAME::CFTree::setB(int b) {
+  this->maxInternalNodes = b;
+}
+void SESAME::CFTree::setT(double t) {
+  this->thresholdDistance = t;
+}
+void SESAME::CFTree::setL(int l) {
+  this->maxLeafNodes = l;
+}
+// Overwrites all three parameters at once (same assignments as the constructor).
+void SESAME::CFTree::initialTree(int b, int l, double t) {
+  this->maxInternalNodes = b;
+  this->maxLeafNodes = l;
+  this->thresholdDistance = t;
+}
+
+SESAME::CFPtr SESAME::CFNode::getCF() {
+  return this->curCF;
+}
+// Returns a copy of the parent list.
+std::vector<SESAME::NodePtr> SESAME::CFNode::getParents() {
+  return this->parent;
+}
+int SESAME::CFNode::getIndex() const {
+  return this->index;
+}
+// Returns a copy of the child list.
+std::vector<SESAME::NodePtr> SESAME::CFNode::getChildren() {
+  return this->children;
+}
+
+// Rebinds this node to the CF object Node refers to; no values are copied.
+void SESAME::CFNode::setNode(CFPtr &Node) {
+  this->curCF = Node;
+}
+// Appends to the parent list; existing entries stay until clearParents() is called.
+void SESAME::CFNode::setParent(NodePtr &Parent) {
+  this->parent.push_back(Parent);
+}
+void SESAME::CFNode::setIndex(int Index) {
+  this->index = Index;
+}
+// Appends to the child list.
+void SESAME::CFNode::setChild(NodePtr &child) {
+  this->children.push_back(child);
+}
+// A new node gets its own freshly constructed CF and starts out as a leaf.
+SESAME::CFNode::CFNode() {
+  this->curCF = std::make_shared<CF>();
+  this->isLeaf = true;
+}
+SESAME::CFNode::~CFNode() {
+
+}
+// Stores a copy of p (made with Point::copy) in this node's point list.
+void SESAME::CFNode::insertPoint(PointPtr &p) {
+  this->clusterPoints.push_back(p->copy());
+}
+std::vector<SESAME::PointPtr> SESAME::CFNode::getPoints() {
+  return this->clusterPoints;
+}
+bool SESAME::CFNode::getIsLeaf() {
+  return this->isLeaf;
+}
+void SESAME::CFNode::setIsLeaf(bool leaf) {
+  this->isLeaf = leaf;
+}
+// Returns a new node copy-constructed from *this.
+// NOTE(review): with an implicit copy constructor the copy shares curCF, parent and
+// children with the original - confirm a shallow copy is what callers expect.
+SESAME::NodePtr SESAME::CFNode::copy() {
+  return std::make_shared<CFNode>(*this);
+}
+// Copies N, LS and SS from cf into the CF this node already holds; unlike setNode,
+// the curCF pointer itself is not rebound.
+void SESAME::CFNode::setCF(CFPtr &cf) {
+  this->curCF->setN(cf->getN());
+  // setLS/setSS take non-const references, so the copies need to be named locals.
+  std::vector<double> ls = cf->getLS();
+  std::vector<double> ss = cf->getSS();
+  this->curCF->setLS(ls);
+  this->curCF->setSS(ss);
+}
+// Empties the parent list by swapping it with an empty temporary.
+void SESAME::CFNode::clearParents() {
+  std::vector<NodePtr>().swap(this->parent);
+}
+// Removes every child whose index equals child's index.
+void SESAME::CFNode::removeChild(NodePtr &child) {
+  // Read the index once up front: child may refer to an element of children,
+  // and that element is about to be erased.
+  const int target = child->getIndex();
+  for(auto it = this->children.begin(); it != this->children.end(); ) {
+    if((*it)->getIndex() == target) {
+      // erase() returns the element that moved into the freed slot; resuming from it
+      // (rather than incrementing) keeps an adjacent match from being skipped.
+      it = this->children.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
diff --git a/src/Algorithm/DataStructure/CMakeLists.txt b/src/Algorithm/DataStructure/CMakeLists.txt
index d18493f3..120bafaa 100644
--- a/src/Algorithm/DataStructure/CMakeLists.txt
+++ b/src/Algorithm/DataStructure/CMakeLists.txt
@@ -5,4 +5,6 @@ add_source_sesame(
         MicroCluster.cpp
         Snapshot.cpp
         DataStructureFactory.cpp
+        CFTree.cpp
+        FeatureVector.cpp
 )
\ No newline at end of file
diff --git a/src/Algorithm/DataStructure/DataStructureFactory.cpp b/src/Algorithm/DataStructure/DataStructureFactory.cpp
index b145f216..b0397a81 100644
--- a/src/Algorithm/DataStructure/DataStructureFactory.cpp
+++ b/src/Algorithm/DataStructure/DataStructureFactory.cpp
@@ -45,3 +45,11 @@ SESAME::SnapshotPtr SESAME::DataStructureFactory::createSnapshot(SESAME::MicroCl
 void SESAME::DataStructureFactory::clearSnapshot(SESAME::SnapshotPtr Snapshot){
   Snapshot.reset();
 }
+// Creates a CFTree with all three parameters set to 0; they can be changed later via setB/setL/setT or initialTree.
+SESAME::CFTreePtr SESAME::DataStructureFactory::createCFTree() {
+  return std::make_shared<SESAME::CFTree>(0,0,0);
+}
+// Creates a default-constructed CFNode.
+SESAME::NodePtr SESAME::DataStructureFactory::createNode() {
+  return std::make_shared<SESAME::CFNode>();
+}
diff --git a/src/Algorithm/DataStructure/FeatureVector.cpp b/src/Algorithm/DataStructure/FeatureVector.cpp
new file mode 100644
index 00000000..d19493a9
--- /dev/null
+++ b/src/Algorithm/DataStructure/FeatureVector.cpp
@@ -0,0 +1,66 @@
+//
+// Created by tuidan on 2021/8/24.
+//
+
+#include <Algorithm/DataStructure/FeatureVector.hpp>
+
+// A new CF starts with a node count of zero.
+SESAME::CF::CF() {
+  this->NumberOfNodes = 0;
+}
+SESAME::CF::~CF() {
+
+}
+
+int SESAME::CF::getN() const {
+  return this->NumberOfNodes;
+}
+
+void SESAME::CF::setN(int n) {
+  this->NumberOfNodes = n;
+}
+// Returns a copy of the LS vector.
+std::vector<double> SESAME::CF::getLS() const {
+  return this->LS;
+}
+
+// Returns a copy of the SS vector.
+std::vector<double> SESAME::CF::getSS() const {
+  return this->SS;
+}
+
+// Bounds-checked read of LS[index]. Reads the member directly, because going
+// through getLS() would copy the whole vector for a single element.
+double SESAME::CF::getLSItem(int index) const {
+  return this->LS.at(index);
+}
+
+// Bounds-checked read of SS[index]; reads the member directly for the same reason.
+double SESAME::CF::getSSItem(int index) const {
+  return this->SS.at(index);
+}
+
+// Replaces LS with newLs when LS is still empty or both have the same length;
+// on a length mismatch a size error is printed and LS is left unchanged.
+void SESAME::CF::setLS(std::vector<double> & newLs) {
+  if(this->LS.empty() || this->LS.size() == newLs.size()) {
+    this->LS = newLs;
+  } else {
+    std::cout << "Size Error: CF's LS size: " << this->LS.size() << ", newLS's size: " << newLs.size() << '\n';
+  }
+}
+
+// Replaces SS with newSs when SS is still empty or both have the same length;
+// on a length mismatch a size error is printed and SS is left unchanged.
+void SESAME::CF::setSS(std::vector<double> & newSs) {
+  if(this->SS.empty() || this->SS.size() == newSs.size()) {
+    this->SS = newSs;
+  } else {
+    std::cout << "Size Error: CF's SS size: " << this->SS.size() << ", newSs's size: " << newSs.size() << '\n';
+  }
+}
+
+// Returns a new CF copy-constructed from *this.
+SESAME::CFPtr SESAME::CF::copy() {
+  return std::make_shared<CF>(*this);
+}
diff --git a/src/Algorithm/DataStructure/Point.cpp b/src/Algorithm/DataStructure/Point.cpp
index 5a9247a6..6a558074 100644
--- a/src/Algorithm/DataStructure/Point.cpp
+++ b/src/Algorithm/DataStructure/Point.cpp
@@ -73,4 +73,8 @@ SESAME::PointPtr SESAME::Point::copy() {
 int SESAME::Point::getDimension() const {
   return this->dimension;
 }
+// Number of entries in the feature container, narrowed to int.
+int SESAME::Point::getFeatureLength() {
+  return static_cast<int>(this->feature->size());
+}
 
diff --git a/src/Algorithm/StreamKM.cpp b/src/Algorithm/StreamKM.cpp
index 7d365ff7..3cb109c6 100644
--- a/src/Algorithm/StreamKM.cpp
+++ 
b/src/Algorithm/StreamKM.cpp
@@ -59,31 +59,8 @@ void SESAME::StreamKM::runOfflineClustering(DataSinkPtr sinkPtr) {
                                              newGroups, true);
 
   // store the result input output
-  this->km.storeResult(oldGroups, centers);
-  this->km.groupPointsByCenters((int) centers.size(), (int) this->inputs.size(),
-                                const_cast<vector<PointPtr> &>(this->inputs), centers, groups);
-  // print the clustering information
-  dumpResults(centers, groups, sinkPtr);
-  cout << endl;
-}
-void SESAME::StreamKM::dumpResults(vector<PointPtr> &centers,
-                                   vector<vector<PointPtr> > groups,
-                                   DataSinkPtr sinkPtr) const {
-  int cluster = 0;
-  cout << cluster << " cluster: ";
-  for (int i = 0; i < groups.size(); i++) {
-    if (cluster != centers.at(i)->getClusteringCenter()) {
-      cluster = centers.at(i)->getClusteringCenter();
-      cout << endl << cluster << " cluster: ";
-    }
-    for (int j = 0; j < groups[i].size(); j++) {
-      groups[i][j]->setClusteringCenter(centers[i]->getClusteringCenter());
-      cout << groups[i][j]->getIndex() << " ";
-      sinkPtr->put(groups[i][j]);
-    }
-  }
-  cout << endl;
+  this->km.produceResult(oldGroups, sinkPtr);
 }
 
 SESAME::StreamKM::StreamKM() {
 
diff --git 
a/src/Utils/BenchmarkUtils.cpp b/src/Utils/BenchmarkUtils.cpp index 1e481a8d..533afae7 100644 --- a/src/Utils/BenchmarkUtils.cpp +++ b/src/Utils/BenchmarkUtils.cpp @@ -137,10 +137,13 @@ void BenchmarkUtils::defaultParam(param_t &cmd_params) { cmd_params.radiusFactor = 70; cmd_params.initBuffer = 500; cmd_params.offlineTimeWindow = 2; - cmd_params.inputPath = std::filesystem::current_path().generic_string() + "/datasets/new.txt"; + cmd_params.maxLeafNodes = 3; + cmd_params.maxInternalNodes = 3; + cmd_params.thresholdDistance = 6550; + cmd_params.inputPath = std::filesystem::current_path().generic_string() + "/datasets/CoverType.txt"; SESAME_INFO("Default Input Data Directory: " + cmd_params.inputPath); cmd_params.outputPath = "results.txt"; - cmd_params.algoName = "StreamKMeans";//StreamKMeans CluStream + cmd_params.algoName = "Birch";//StreamKMeans CluStream Birch } /* command line handling functions */ @@ -190,7 +193,7 @@ void BenchmarkUtils::runBenchmark(param_t &cmd_params, while (!sinkPtr->isFinished());//wait for sink to stop. //Store results. 
- algoPtr->store(cmd_params.outputPath, cmd_params.clusterNumber, cmd_params.dimension, sinkPtr->getResults()); + algoPtr->store(cmd_params.outputPath, cmd_params.dimension, sinkPtr->getResults()); SESAME_INFO("Finished store results: "<getResults().size()); engine.stop(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ddbc6ba0..b57fdc7e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,6 +1,8 @@ # adding the Google_Tests_run target add_executable(Google_Tests_run - SystemTest.cpp + SystemTest/CluStreamTest.cpp + SystemTest/StreamKMTest.cpp + SystemTest/BirchTest.cpp ) # linking Google_Tests_run with sesame_lib which will be tested diff --git a/test/SystemTest.cpp b/test/SystemTest.cpp deleted file mode 100644 index 77144225..00000000 --- a/test/SystemTest.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) - -// -// Created by shuhao zhang on 8/8/2021. -// - -#include -#include -#include -#include -#include -#include - -TEST(SystemTest, SampleRun) { - //Setup Logs. - setupLogging("benchmark.log", LOG_DEBUG); - - //Parse parameters. - param_t cmd_params; - BenchmarkUtils::defaultParam(cmd_params); - - std::vector input; - std::vector results; - - //Create Spout. - SESAME::DataSourcePtr sourcePtr = SESAME::DataSourceFactory::create(); - //Directly load data from file. TODO: configure it to load from external sensors, e.g., HTTP. - BenchmarkUtils::loadData(cmd_params, sourcePtr); - - //Create Sink. - SESAME::DataSinkPtr sinkPtr = SESAME::DataSinkFactory::create(); - - //Create Algorithm. 
-  SESAME::AlgorithmPtr algoPtr = SESAME::AlgorithmFactory::create(cmd_params.algoName,
-                                                                  cmd_params.pointNumber,
-                                                                  cmd_params.clusterNumber,
-                                                                  cmd_params.dimension,
-                                                                  cmd_params.coresetSize,
-                                                                  cmd_params.seed,
-                                                                  cmd_params.lastArrivingNum,
-                                                                  cmd_params.timeWindow,
-                                                                  cmd_params.timeInterval,
-                                                                  cmd_params.onlineClusterNumber,
-                                                                  cmd_params.radiusFactor,
-                                                                  cmd_params.initBuffer,
-                                                                  cmd_params.offlineTimeWindow);
-
-  //Run algorithm producing results.
-  BenchmarkUtils::runBenchmark(cmd_params, sourcePtr, sinkPtr, algoPtr);
-}
\ No newline at end of file
diff --git a/test/SystemTest/BirchTest.cpp b/test/SystemTest/BirchTest.cpp
new file mode 100644
index 00000000..950919b9
--- /dev/null
+++ b/test/SystemTest/BirchTest.cpp
@@ -0,0 +1,46 @@
+//
+// Created by tuidan on 2021/8/25.
+//
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Smoke test: runs Birch on the Mock.txt dataset (150 points, 4 dimensions).
+// Nothing is asserted about the clustering output; the pipeline only has to run through.
+TEST(SystemTest, BirchTest) {
+  //Setup Logs.
+  setupLogging("benchmark.log", LOG_DEBUG);
+
+  //Parse parameters.
+  // Value-initialise so fields this test never assigns hold a defined value.
+  param_t cmd_params{};
+  cmd_params.pointNumber = 150;
+  cmd_params.thresholdDistance = 7;
+  cmd_params.maxInternalNodes = 3;
+  cmd_params.maxLeafNodes = 3;
+  cmd_params.dimension = 4;
+
+  cmd_params.inputPath = std::filesystem::current_path().generic_string() + "/datasets/Mock.txt";
+  cmd_params.outputPath = "results.txt";
+  cmd_params.algoName = "Birch";
+
+  //Create Spout.
+  SESAME::DataSourcePtr sourcePtr = SESAME::DataSourceFactory::create();
+  //Directly load data from file. TODO: configure it to load from external sensors, e.g., HTTP.
+  BenchmarkUtils::loadData(cmd_params, sourcePtr);
+
+  //Create Sink.
+  SESAME::DataSinkPtr sinkPtr = SESAME::DataSinkFactory::create();
+
+  //Create Algorithm.
+  SESAME::AlgorithmPtr algoPtr = SESAME::AlgorithmFactory::create(cmd_params);
+
+  //Run algorithm producing results.
+  BenchmarkUtils::runBenchmark(cmd_params, sourcePtr, sinkPtr, algoPtr);
+}
\ No newline at end of file
diff --git a/test/SystemTest/CluStreamTest.cpp b/test/SystemTest/CluStreamTest.cpp
new file mode 100644
index 00000000..e68343e8
--- /dev/null
+++ b/test/SystemTest/CluStreamTest.cpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream)
+
+//
+// Created by shuhao zhang on 8/8/2021.
+//
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Smoke test: runs CluStream on the CoverType.txt dataset (15120 points, 54 dimensions).
+// Nothing is asserted about the clustering output; the pipeline only has to run through.
+TEST(SystemTest, CluStreamTest) {
+  //Setup Logs.
+  setupLogging("benchmark.log", LOG_DEBUG);
+
+  //Parse parameters.
+  // Value-initialise so fields this test never assigns hold a defined value.
+  param_t cmd_params{};
+  cmd_params.pointNumber = 15120;
+  cmd_params.seed = 10;
+  cmd_params.clusterNumber = 10;
+  cmd_params.dimension = 54;
+  cmd_params.coresetSize = 100;
+  cmd_params.lastArrivingNum = 60;
+  cmd_params.timeWindow = 6;
+  cmd_params.timeInterval = 4;
+  cmd_params.onlineClusterNumber = 15;
+  cmd_params.radiusFactor = 70;
+  cmd_params.initBuffer = 500;
+  cmd_params.offlineTimeWindow = 2;
+  cmd_params.inputPath = std::filesystem::current_path().generic_string() + "/datasets/CoverType.txt";
+  cmd_params.outputPath = "results.txt";
+  cmd_params.algoName = "CluStream";//StreamKMeans CluStream
+
+  //Create Spout.
+  SESAME::DataSourcePtr sourcePtr = SESAME::DataSourceFactory::create();
+  //Directly load data from file. TODO: configure it to load from external sensors, e.g., HTTP.
+  BenchmarkUtils::loadData(cmd_params, sourcePtr);
+
+  //Create Sink.
+  SESAME::DataSinkPtr sinkPtr = SESAME::DataSinkFactory::create();
+
+  //Create Algorithm.
+  SESAME::AlgorithmPtr algoPtr = SESAME::AlgorithmFactory::create(cmd_params);
+
+  //Run algorithm producing results.
+  BenchmarkUtils::runBenchmark(cmd_params, sourcePtr, sinkPtr, algoPtr);
+}
\ No newline at end of file
diff --git a/test/SystemTest/StreamKMTest.cpp b/test/SystemTest/StreamKMTest.cpp
new file mode 100644
index 00000000..fa3314ea
--- /dev/null
+++ b/test/SystemTest/StreamKMTest.cpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream)
+
+//
+// Created by shuhao zhang on 8/8/2021.
+//
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Smoke test: runs StreamKMeans on the Mock.txt dataset (150 points, 4 dimensions).
+// Nothing is asserted about the clustering output; the pipeline only has to run through.
+TEST(SystemTest, StreamKMTest) {
+  //Setup Logs.
+  setupLogging("benchmark.log", LOG_DEBUG);
+
+  //Parse parameters.
+  // Value-initialise so fields this test never assigns hold a defined value.
+  param_t cmd_params{};
+  cmd_params.pointNumber = 150;
+  cmd_params.seed = 10;
+  cmd_params.clusterNumber = 10;
+  cmd_params.dimension = 4;
+  cmd_params.coresetSize = 30;
+
+  cmd_params.inputPath = std::filesystem::current_path().generic_string() + "/datasets/Mock.txt";
+  SESAME_INFO("Default Input Data Directory: " + cmd_params.inputPath);
+  cmd_params.outputPath = "results.txt";
+  cmd_params.algoName = "StreamKMeans";
+
+  //Create Spout.
+  SESAME::DataSourcePtr sourcePtr = SESAME::DataSourceFactory::create();
+  //Directly load data from file. TODO: configure it to load from external sensors, e.g., HTTP.
+  BenchmarkUtils::loadData(cmd_params, sourcePtr);
+
+  //Create Sink.
+  SESAME::DataSinkPtr sinkPtr = SESAME::DataSinkFactory::create();
+
+  //Create Algorithm.
+  SESAME::AlgorithmPtr algoPtr = SESAME::AlgorithmFactory::create(cmd_params);
+
+  //Run algorithm producing results.
+ BenchmarkUtils::runBenchmark(cmd_params, sourcePtr, sinkPtr, algoPtr); +} \ No newline at end of file diff --git a/test/datasets/new.txt b/test/datasets/CoverType.txt similarity index 100% rename from test/datasets/new.txt rename to test/datasets/CoverType.txt diff --git a/test/datasets/Mock.txt b/test/datasets/Mock.txt new file mode 100644 index 00000000..de6ed589 --- /dev/null +++ b/test/datasets/Mock.txt @@ -0,0 +1,150 @@ +1 5.1 3.5 1.4 0.2 0 +2 4.9 3 1.4 0.2 0 +3 4.7 3.2 1.3 0.2 0 +4 4.6 3.1 1.5 0.2 0 +5 5 3.6 1.4 0.2 0 +6 5.4 3.9 1.7 0.4 0 +7 4.6 3.4 1.4 0.3 0 +8 5 3.4 1.5 0.2 0 +9 4.4 2.9 1.4 0.2 0 +10 4.9 3.1 1.5 0.1 0 +11 5.4 3.7 1.5 0.2 0 +12 4.8 3.4 1.6 0.2 0 +13 4.8 3 1.4 0.1 0 +14 4.3 3 1.1 0.1 0 +15 5.8 4 1.2 0.2 0 +16 5.7 4.4 1.5 0.4 0 +17 5.4 3.9 1.3 0.4 0 +18 5.1 3.5 1.4 0.3 0 +19 5.7 3.8 1.7 0.3 0 +20 5.1 3.8 1.5 0.3 0 +21 5.4 3.4 1.7 0.2 0 +22 5.1 3.7 1.5 0.4 0 +23 4.6 3.6 1 0.2 0 +24 5.1 3.3 1.7 0.5 0 +25 4.8 3.4 1.9 0.2 0 +26 5 3 1.6 0.2 0 +27 5 3.4 1.6 0.4 0 +28 5.2 3.5 1.5 0.2 0 +29 5.2 3.4 1.4 0.2 0 +30 4.7 3.2 1.6 0.2 0 +31 4.8 3.1 1.6 0.2 0 +32 5.4 3.4 1.5 0.4 0 +33 5.2 4.1 1.5 0.1 0 +34 5.5 4.2 1.4 0.2 0 +35 4.9 3.1 1.5 0.1 0 +36 5 3.2 1.2 0.2 0 +37 5.5 3.5 1.3 0.2 0 +38 4.9 3.1 1.5 0.1 0 +39 4.4 3 1.3 0.2 0 +40 5.1 3.4 1.5 0.2 0 +41 5 3.5 1.3 0.3 0 +42 4.5 2.3 1.3 0.3 0 +43 4.4 3.2 1.3 0.2 0 +44 5 3.5 1.6 0.6 0 +45 5.1 3.8 1.9 0.4 0 +46 4.8 3 1.4 0.3 0 +47 5.1 3.8 1.6 0.2 0 +48 4.6 3.2 1.4 0.2 0 +49 5.3 3.7 1.5 0.2 0 +50 5 3.3 1.4 0.2 0 +51 7 3.2 4.7 1.4 1 +52 6.4 3.2 4.5 1.5 1 +53 6.9 3.1 4.9 1.5 1 +54 5.5 2.3 4 1.3 1 +55 6.5 2.8 4.6 1.5 1 +56 5.7 2.8 4.5 1.3 1 +57 6.3 3.3 4.7 1.6 1 +58 4.9 2.4 3.3 1 1 +59 6.6 2.9 4.6 1.3 1 +60 5.2 2.7 3.9 1.4 1 +61 5 2 3.5 1 1 +62 5.9 3 4.2 1.5 1 +63 6 2.2 4 1 1 +64 6.1 2.9 4.7 1.4 1 +65 5.6 2.9 3.6 1.3 1 +66 6.7 3.1 4.4 1.4 1 +67 5.6 3 4.5 1.5 1 +68 5.8 2.7 4.1 1 1 +69 6.2 2.2 4.5 1.5 1 +70 5.6 2.5 3.9 1.1 1 +71 5.9 3.2 4.8 1.8 1 +72 6.1 2.8 4 1.3 1 +73 6.3 2.5 4.9 1.5 1 +74 6.1 2.8 4.7 1.2 1 +75 6.4 
2.9 4.3 1.3 1 +76 6.6 3 4.4 1.4 1 +77 6.8 2.8 4.8 1.4 1 +78 6.7 3 5 1.7 1 +79 6 2.9 4.5 1.5 1 +80 5.7 2.6 3.5 1 1 +81 5.5 2.4 3.8 1.1 1 +82 5.5 2.4 3.7 1 1 +83 5.8 2.7 3.9 1.2 1 +84 6 2.7 5.1 1.6 1 +85 5.4 3 4.5 1.5 1 +86 6 3.4 4.5 1.6 1 +87 6.7 3.1 4.7 1.5 1 +88 6.3 2.3 4.4 1.3 1 +89 5.6 3 4.1 1.3 1 +90 5.5 2.5 4 1.3 1 +91 5.5 2.6 4.4 1.2 1 +92 6.1 3 4.6 1.4 1 +93 5.8 2.6 4 1.2 1 +94 5 2.3 3.3 1 1 +95 5.6 2.7 4.2 1.3 1 +96 5.7 3 4.2 1.2 1 +97 5.7 2.9 4.2 1.3 1 +98 6.2 2.9 4.3 1.3 1 +99 5.1 2.5 3 1.1 1 +100 5.7 2.8 4.1 1.3 1 +101 6.3 3.3 6 2.5 2 +102 5.8 2.7 5.1 1.9 2 +103 7.1 3 5.9 2.1 2 +104 6.3 2.9 5.6 1.8 2 +105 6.5 3 5.8 2.2 2 +106 7.6 3 6.6 2.1 2 +107 4.9 2.5 4.5 1.7 2 +108 7.3 2.9 6.3 1.8 2 +109 6.7 2.5 5.8 1.8 2 +110 7.2 3.6 6.1 2.5 2 +111 6.5 3.2 5.1 2 2 +112 6.4 2.7 5.3 1.9 2 +113 6.8 3 5.5 2.1 2 +114 5.7 2.5 5 2 2 +115 5.8 2.8 5.1 2.4 2 +116 6.4 3.2 5.3 2.3 2 +117 6.5 3 5.5 1.8 2 +118 7.7 3.8 6.7 2.2 2 +119 7.7 2.6 6.9 2.3 2 +120 6 2.2 5 1.5 2 +121 6.9 3.2 5.7 2.3 2 +122 5.6 2.8 4.9 2 2 +123 7.7 2.8 6.7 2 2 +124 6.3 2.7 4.9 1.8 2 +125 6.7 3.3 5.7 2.1 2 +126 7.2 3.2 6 1.8 2 +127 6.2 2.8 4.8 1.8 2 +128 6.1 3 4.9 1.8 2 +129 6.4 2.8 5.6 2.1 2 +130 7.2 3 5.8 1.6 2 +131 7.4 2.8 6.1 1.9 2 +132 7.9 3.8 6.4 2 2 +133 6.4 2.8 5.6 2.2 2 +134 6.3 2.8 5.1 1.5 2 +135 6.1 2.6 5.6 1.4 2 +136 7.7 3 6.1 2.3 2 +137 6.3 3.4 5.6 2.4 2 +138 6.4 3.1 5.5 1.8 2 +139 6 3 4.8 1.8 2 +140 6.9 3.1 5.4 2.1 2 +141 6.7 3.1 5.6 2.4 2 +142 6.9 3.1 5.1 2.3 2 +143 5.8 2.7 5.1 1.9 2 +144 6.8 3.2 5.9 2.3 2 +145 6.7 3.3 5.7 2.5 2 +146 6.7 3 5.2 2.3 2 +147 6.3 2.5 5 1.9 2 +148 6.5 3 5.2 2 2 +149 6.2 3.4 5.4 2.3 2 +150 5.9 3 5.1 1.8 2